diff --git a/fda-analyse-service/app.py b/fda-analyse-service/app.py index 137c09fc8e91dd93c240508b0f394650c4140582..12213a9b4fc7eef683e03c1e247335dcad816e53 100644 --- a/fda-analyse-service/app.py +++ b/fda-analyse-service/app.py @@ -123,8 +123,8 @@ def determinepk(): res = {"success": False, "message": str(e)} return Response(res, mimetype="application/json"), 500 -@app.route('/api/analyse/update_mdb_col', methods=["POST"], endpoint='mdb_basicstatistic') -@swag_from('/as-yml/updatecol.yml') +@app.route('/api/mdb/update_mdb_col', methods=["POST"], endpoint='mdb_basicstatistic') +@swag_from('as-yml/bstatistic.yml') def mdb_basicstatistic(): input_json = request.get_json() try: diff --git a/fda-analyse-service/as-yml/bstatistic.yml b/fda-analyse-service/as-yml/bstatistic.yml new file mode 100644 index 0000000000000000000000000000000000000000..a65ca91b16e3c240ee2b6b71291c8ba2c7afcbab --- /dev/null +++ b/fda-analyse-service/as-yml/bstatistic.yml @@ -0,0 +1,28 @@ +summary: "Add basic statistics, i.e., determine min, max, values of numerical columns..., and save to metadatabase" +description: "Updates entity mdb_columns and mdb_columns_num (columns with numerical values), mdb_columns_nom (columns +nominal values) and mdb_columns_cat (columns with categorical values, e.g. ENUM datatypes) in metadatabase" +consumes: +- "application/json" +produces: +- "application/json" +parameters: +- in: "body" + name: "body" + description: "Updates entity mdb_columns, mdb_columns_nom (attribute max_length), mdb_columns_num (min, max, mean, sd, histogram) and mdb_columns_cat (num_cat, cat_array). The attribute 'histogram' describes a equi-width histogram with a fix number of 10 buckets. The last value in this numeric array is the width of one bucket. The attribute cat_array contains an array with the names of the categories." + required: true + schema: + type: "object" + properties: + dbid: + type: "integer" + example: 1 + tid: + type: "integer" + example: 1 +responses: + 200: + description: "OK" + 405: + description: "Invalid input" + 409: + description: "Conflict" \ No newline at end of file diff --git a/fda-analyse-service/basicstatistics.py b/fda-analyse-service/basicstatistics.py index 3fa1f0bdd018b3350e88d2c6ab3315eeeb39ad11..56ee31d739cc08ed7341fa3153bfda02a5d87d10 100644 --- a/fda-analyse-service/basicstatistics.py +++ b/fda-analyse-service/basicstatistics.py @@ -5,84 +5,54 @@ import requests import json def update_bstatistic(dbid, tid): - logging.info("called update_bstatistic") - # Get database info - try: - s = requests.get( - "http://fda-database-service:9092/api/database/", - params = {"id":dbid}, - headers=headers - ).json() - - logging.info("s", s) - except Exception as e: - logging.error("Error while trying to get database info",e) - - # Get tablename by dbid and tid - try: - tbl_info = requests.get( - "http://fda-table-service:9094/api/database/{0}/table/{1}/".format(dbid,tid), headers=headers).json() - logging.info(tbl_info) - tbl_name = tbl_info['internalName'] - except Exception as e: - logging.error("Error:", e) - - logging.info("tbl name: " + tbl_name) - nomdtlist = ['text', 'character varying', 'varchar', 'char'] - numdtlist = ['number','decimal', 'numeric','bigint', 'integer', 'smallint', 'real', 'double precision', 'timestamp'] + numdtlist = ['number', 'decimal', 'numeric', 'bigint', 'integer', 'smallint', 'real', 'double precision', + 'timestamp'] catdtlist = ['boolean', 'enum', 'date', 'user-defined'] - - # Get columnname + logging.info("called update_bstatistic") + logging.info("update nominal columns") + # Get database info try: - conn=connect( - dbname="fda", - user = "postgres", - host = "fda-metadata-db", - password = "postgres" + # Connecting to metadatabase + conn = connect( + dbname="fda", + user="postgres", + host="fda-metadata-db", + password="postgres" ) + # Compare columns datatype cursor = conn.cursor() - cursor.execute("SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in %s ", dbid, tid, lst2mariadbarr(nomdtlist)) - cursor.fetchall() - logging.info(cursor.rowcount()) - for row in cursor: - logging.info("insert mdb_columns_nom: " + row) + cursor.execute("SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in (%s, %s, %s, %s) ", + (dbid, tid, 'text', 'character varying', 'varchar', 'char')) + nom = cursor.fetchall() + logging.info(list(enumerate(nom))) + for num, row in enumerate(nom): + logging.info("insert mdb_columns_nom: "+ '('+str(row[0])+','+str(row[1])+','+str(row[2])+')') insert_mdb_nomcol(row[0], row[1], row[2]) - cursor.close() - cursor = conn.cursor() - cursor.execute("SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in %s ", dbid, tid, lst2mariadbarr(numdtlist)) - for row in cursor.fetchall(): - logging.info("insert mdb_columns_num: " + row) + conn.commit() + logging.info("update numerical columns") + cursor.execute("""SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in (%s,%s,%s,%s,%s,%s,%s) + and (lower(cname)!='id')""", + (dbid, tid, 'number', 'decimal', 'numeric', 'bigint', 'integer', 'smallint', 'real')) + numc = cursor.fetchall() + for num, row in enumerate(numc): + logging.info("insert mdb_columns_num: "+ '('+str(row[0])+','+str(row[1])+','+str(row[2])+')') insert_mdb_numcol(row[0], row[1], row[2]) - cursor.close() - cursor = conn.cursor() - cursor.execute("SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in %s ", dbid, tid, - lst2mariadbarr(catdtlist)) + conn.commit() + logging.info("update nominal columns") + cursor.execute("SELECT cDBID, tID, ID FROM mdb_columns where cDBID=%s and tID=%s and lower(datatype) in (%s,%s,%s,%s) ", + (dbid, tid, 'boolean', 'enum', 'date', 'user-defined')) for row in cursor.fetchall(): - logging.info("insert mdb_columns_nom_cat: " + row) - insert_mdb_catcol(row[0], row[1], row[2]) + logging.info("insert mdb_columns_nom_cat: " + '('+str(row[0])+','+str(row[1])+','+str(row[2])+')') + insert_mdb_catcol(row[0], row[1], row[2]) + conn.commit() cursor.close() - cursor = conn.cursor() - cursor.execute("SELECT value FROM mdb_images_environment_item where key=%s",'ROOT') - value=cursor.fetchone() - except Exception as e: - print("Error while trying to get cname from mdb",e) - - # Conneting to database - try: - engine = create_engine('mysql+pymysql://root:'+str(value)+'@dbrepo-userdb-'+s[0]['internalName'].replace('_', '-')+'/'+s[0]['internalName']) - - sql = text("""SELECT column_name, columns.data_type, columns.ordinal_position, is_nullable - from information_schema.columns - where columns.table_name= :tblname and column_name=:colname""") - - with engine.begin() as conn: - res = conn.execute(sql, tblname=tbl_name,colname=cname).fetchone() except Exception as e: - print("Error while connecting to database.", e) + logging.error("error while trying to update_bstatistics",e) def insert_mdb_nomcol(dbid, tid, cid): - # Connecting to metadatabase - to obtain column name + # Connecting to metadatabase - to obtain column name"" + logging.info("get nominal columns") try: conn = connect( dbname="fda", @@ -92,37 +62,39 @@ def insert_mdb_nomcol(dbid, tid, cid): ) cursor = conn.cursor() - cursor.execute("select cDBID,tID,ID,internal_name from mdb_columns where cdbid = %s and tid = %s and cid =%s", + cursor.execute( + "SELECT internal_name FROM mdb_databases where ID=%s", (dbid,)) + s = cursor.fetchone() + conn.commit() + cursor.execute("SELECT internal_name FROM mdb_tables where tDBID=%s and ID=%s",(dbid, tid)) + t = cursor.fetchone() + conn.commit() + cursor.execute("select cDBID,tID,ID,internal_name from mdb_columns where cDBID = %s and tID = %s and ID =%s", (dbid, tid, cid)) res = cursor.fetchall() + conn.commit() cname = res[0][3] + cursor.execute("SELECT value FROM mdb_images_environment_item where key=%s", ('MARIADB_ROOT_PASSWORD',)) + value = cursor.fetchone() + conn.commit() + cursor.close() + logging.info("nominal column: " + str(cname)) except Exception as e: - print("Error while connecting to metadatabase.", e) + print("error while inserting into mdb_columns_nom.", e) + logging.info(res) # Connect to database - to obtain max_length try: - s = requests.get( - "http://fda-database-service:9092/api/database/", - params={"id": dbid} - ).json() - except Exception as e: - print("Error while trying to get database info", e) - try: - tbl_info = requests.get( - "http://fda-table-service:9094/api/database/{0}/table/{1}/".format(dbid, tid)).json() - tbl_name = tbl_info['internalName'] - except Exception as e: - print("Error:", e) - try: - engine = create_engine('mysql+pymysql://root:'+str(value)+'@dbrepo-userdb-'+s[0]['internalName'].replace('_', '-')+'/'+s[0]['internalName']) + logging.info("determine max_length in :"+'userdb-'+str(s[0])) + engine = create_engine('mysql+pymysql://root:'+str(value[0])+'@dbrepo-userdb-'+s[0].replace('_', '-')+'/'+s[0]) - sql = text("select max(char_length(" + cname + ")) from " + tbl_name) + sql = text("select max(char_length(" + cname + ")) from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() maxlen = res[0] except Exception as e: - print("Error while connecting to database", e) + logging.error("error while connecting to userdb", e) try: conn = connect( dbname="fda", @@ -139,13 +111,14 @@ def insert_mdb_nomcol(dbid, tid, cid): ret = cursor.statusmessage conn.commit() + cursor.close() except Exception as e: - print("Error while inserting into metadatabase", e) + print("error while inserting into fda-metadata-db", e) return json.dumps(ret) - def insert_mdb_numcol(dbid, tid, cid): # Connecting to metadatabase to obtain columnname + logging.info("get numerical columns") try: conn = connect( dbname="fda", @@ -155,25 +128,24 @@ def insert_mdb_numcol(dbid, tid, cid): ) cursor = conn.cursor() - cursor.execute("select cDBID,tID,ID,cName from mdb_columns where cdbid = %s and tid = %s and cid =%s", + cursor.execute( + "SELECT internal_name FROM mdb_databases where ID=%s", (dbid,)) + s = cursor.fetchone() + conn.commit() + cursor.execute("SELECT internal_name FROM mdb_tables where tDBID=%s and ID=%s", (dbid, tid)) + t = cursor.fetchone() + conn.commit() + cursor.execute("select cDBID,tID,ID,internal_name from mdb_columns where cdbid = %s and tid = %s and id =%s", (dbid, tid, cid)) res = cursor.fetchall() cname = res[0][3] + cursor.execute("SELECT value FROM mdb_images_environment_item where key=%s", ('MARIADB_ROOT_PASSWORD',)) + value = cursor.fetchone() + conn.commit() + cursor.close() except Exception as e: - print("Error while connecting to metadatabase.", e) - try: - s = requests.get( - "http://fda-database-service:9092/api/database/", - params={"id": dbid} - ).json() - except Exception as e: - print("Error while trying to get database info", e) - try: - tbl_info = requests.get( - "http://fda-table-service:9094/api/database/{0}/table/{1}/".format(dbid, tid)).json() - tbl_name = tbl_info['internalName'] - except Exception as e: - print("Error:", e) + print("error while connecting to fda-metadata-db.", e) + # Determine min, max, ... try: # Postgres engine @@ -181,30 +153,29 @@ def insert_mdb_numcol(dbid, tid, cid): # 'postgresql+psycopg2://postgres:postgres@fda-userdb-' + s[0]['internalName'].replace('_', '-') + '/' + s[0][ # 'internalName']) # Mariadb engine - engine = create_engine( - 'mysql+pymysql://root:'+str(value)+'@dbrepo-userdb-' + s[0]['internalName'].replace('_', '-') + '/' + s[0][ - 'internalName']) + engine = create_engine('mysql+pymysql://root:'+str(value[0])+'@dbrepo-userdb-'+s[0].replace('_', '-')+'/'+s[0]) + logging.info("determine min, max, mean, ... in :" + 'userdb-' + str(s[0])) # min - sql = text("select min(" + cname + ") from " + tbl_name) + sql = text("select min(" + cname + ") from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() minval = res[0] # max - sql = text("select max(" + cname + ") from " + tbl_name) + sql = text("select max(" + cname + ") from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() maxval = res[0] # mean - sql = text("select avg(" + cname + ") from " + tbl_name) + sql = text("select avg(" + cname + ") from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() meanval = res[0] # sd - sql = text("select stddev(" + cname + ") from " + tbl_name) + sql = text("select stddev(" + cname + ") from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() sdval = float(res[0]) @@ -215,7 +186,7 @@ def insert_mdb_numcol(dbid, tid, cid): # sql = text("select " + cname + "from " + tbl_name + "where rand() <= 0.3") width_bucket = (maxval - minval + 1) / num_buckets for i in range(0, num_buckets): - sql = text("select count(*) from " + tbl_name + " where " + cname + " >= " + str( + sql = text("select count(*) from " + t[0] + " where " + cname + " >= " + str( minval + i * width_bucket) + " and " + cname + " < " + str(minval + (i + 1) * width_bucket)) with engine.begin() as conn: res = conn.execute(sql).fetchone() @@ -225,7 +196,7 @@ def insert_mdb_numcol(dbid, tid, cid): histpgarr = lst2pgarr(hist_lst) except Exception as e: - print("Error while connecting to database", e) + print("error while connecting to userdb", e) # Insert / update values in metadata-db try: conn = connect( @@ -239,21 +210,21 @@ def insert_mdb_numcol(dbid, tid, cid): cursor = conn.cursor() cursor.execute("""Insert into mdb_columns_num (cdbid,tid,cid,minval,maxval,mean,sd,histogram,last_modified) - values (%s,%s,%s,%s,%s,%s,%s,%s,%s,current_timestamp) - ON CONFLICT (cdbid,tid,cid) do update set - (minval,maxval,mean,sd,histogram,last_modified) = (%s,%s,%s,%s,%s,%s,current_timestamp)""", + values (%s,%s,%s,%s,%s,%s,%s,%s,current_timestamp) + ON CONFLICT (cdbid,tid,cid) do update set + (minval,maxval,mean,sd,histogram,last_modified) = (%s,%s,%s,%s,%s,current_timestamp)""", (dbid, tid, cid, minval, maxval, meanval, sdval, histpgarr, minval, maxval, meanval, sdval, - histpgarr,)) - + histpgarr)) ret = cursor.statusmessage conn.commit() except Exception as e: - print("Error while inserting into metadatabase", e) + print("error while inserting into fda-metadata-db", e) return json.dumps(ret) def insert_mdb_catcol(dbid, tid, cid): # Connecting to metadatabase to obtain columnname + logging.info("get categorical columns") try: conn = connect( dbname="fda", @@ -263,25 +234,24 @@ def insert_mdb_catcol(dbid, tid, cid): ) cursor = conn.cursor() - cursor.execute("select cDBID,tID,ID,cname from mdb_columns where cdbid = %s and tid = %s and id =%s", + cursor.execute( + "SELECT internal_name FROM mdb_databases where ID=%s", (dbid,)) + s = cursor.fetchone() + conn.commit() + cursor.execute("SELECT internal_name FROM mdb_tables where tDBID=%s and ID=%s", (dbid, tid)) + t = cursor.fetchone() + conn.commit() + cursor.execute("select cDBID,tID,ID,internal_name from mdb_columns where cdbid = %s and tid = %s and id =%s", (dbid, tid, cid)) res = cursor.fetchall() cname = res[0][3] + cursor.execute("SELECT value FROM mdb_images_environment_item where key=%s", ('MARIADB_ROOT_PASSWORD',)) + value = cursor.fetchone() + conn.commit() + cursor.close() except Exception as e: - print("Error while connecting to metadatabase.", e) - try: - s = requests.get( - "http://fda-database-service:9092/api/database/", - params={"id": dbid} - ).json() - except Exception as e: - print("Error while trying to get database info", e) - try: - tbl_info = requests.get( - "http://fda-table-service:9094/api/database/{0}/table/{1}/".format(dbid, tid)).json() - tbl_name = tbl_info['internalName'] - except Exception as e: - print("Error:", e) + print("error while connecting to fda-metadata-db.", e) + # Determine number of categories, categories array try: # Postgres engine @@ -289,18 +259,17 @@ def insert_mdb_catcol(dbid, tid, cid): # 'postgresql+psycopg2://postgres:postgres@fda-userdb-' + s[0]['internalName'].replace('_', '-') + '/' + s[0][ # 'internalName']) # Mariadb engine - engine = create_engine( - 'mysql+pymysql://root:'+str(value)+'@dbrepo-userdb-' + s[0]['internalName'].replace('_', '-') + '/' + s[0][ - 'internalName']) + engine = create_engine('mysql+pymysql://root:'+str(value[0])+'@dbrepo-userdb-'+s[0].replace('_', '-')+'/'+s[0]) + logging.info("determine categories in :" + 'userdb-' + str(s[0])) # num_categories - sql = text("select count( distinct " + cname + ") from " + tbl_name) + sql = text("select count( distinct " + cname + ") from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchone() num_cat = int(res[0]) # cat_array - sql = text("select distinct " + cname + " from " + tbl_name) + sql = text("select distinct " + cname + " from " + t[0]) with engine.begin() as conn: res = conn.execute(sql).fetchall() cat_arr = lst2pgarr(lstflat(res)) @@ -319,19 +288,17 @@ def insert_mdb_catcol(dbid, tid, cid): cursor = conn.cursor() cursor.execute("""Insert into mdb_columns_cat (cdbid,tid,cid,num_cat,cat_array,last_modified) values (%s,%s,%s,%s,%s,current_timestamp) - ON CONFLICT (cdbid,tid,cid) do update set + ON CONFLICT (cdbid,tid,cid) do update set (num_cat,cat_array,last_modified) = (%s,%s,current_timestamp)""", (dbid, tid, cid, num_cat, cat_arr, num_cat, cat_arr)) ret = cursor.statusmessage conn.commit() + cursor.close() except Exception as e: - print("Error while inserting into metadatabase", e) + print("error while inserting into fda-metadata-db", e) return json.dumps(ret) - # Useful helper functions -def lst2mariadbarr(lst): - return '(' + ','.join(list(map(lambda str: "'" + str + "'", lst)))+')' lstflat = lambda x: [item for sublst in x for item in sublst] -lst2pgarr = lambda lst: '{' + ','.join(lst) + '}' \ No newline at end of file +lst2pgarr = lambda lst: '{' + ','.join(lst) + '}' diff --git a/fda-metadata-db/setup-schema.sql b/fda-metadata-db/setup-schema.sql index 89947b142127325f4df480836a3ca81dd2a2bd46..1ed1445f337af3d5858deb4b8afd00fd8aca4063 100644 --- a/fda-metadata-db/setup-schema.sql +++ b/fda-metadata-db/setup-schema.sql @@ -403,7 +403,7 @@ CREATE TABLE IF NOT EXISTS mdb_COLUMNS_num Mean NUMERIC, Median NUMERIC, Sd Numeric, - Histogram INTEGER[], + Histogram NUMERIC[], last_modified timestamp without time zone, created timestamp without time zone NOT NULL DEFAULT NOW(), FOREIGN KEY (cDBID, tID, cID) REFERENCES mdb_COLUMNS (cDBID, tID, ID),