diff --git a/dbrepo-analyse-service/app.py b/dbrepo-analyse-service/app.py
index 35beb86015fdccfac5d19627700d7885e9dee5f3..2dc9161746fbc52fb523b2bcfe934a2dac8ffbb4 100644
--- a/dbrepo-analyse-service/app.py
+++ b/dbrepo-analyse-service/app.py
@@ -288,7 +288,7 @@ def analyse_datatypes():
     try:
         res = determine_datatypes(filename, enum, enum_tol, separator)
         logging.debug("determine datatype resulted in datatypes %s", res)
-        return Response(res, mimetype="application/json"), 202
+        return Response(res.model_dump_json(), mimetype="application/json"), 202
     except OSError as e:
         logging.error(f"Failed to determine data types: {e}")
         return ApiError(status='BAD_REQUEST', message=str(e), code='error.analyse.invalid').model_dump_json(), 400
diff --git a/dbrepo-analyse-service/determine_dt.py b/dbrepo-analyse-service/determine_dt.py
index d78959f3c4c51d62bd88fcb2c837e8a9d0f9f201..a0890c2b7a9cd5a9e53649464cfa19ec47f0e45d 100644
--- a/dbrepo-analyse-service/determine_dt.py
+++ b/dbrepo-analyse-service/determine_dt.py
@@ -16,7 +16,7 @@ from api.dto import ColumnAnalysisDto, DataTypeDto, AnalysisDto
 from clients.s3_client import S3Client
 
 
-def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> {}:
+def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> AnalysisDto:
     # Use option enum=True for searching Postgres ENUM Types in CSV file. Remark
     # Enum is not SQL standard, hence, it might not be supported by all db-engines.
     # However, it can be used in Postgres and MySQL.
@@ -26,7 +26,7 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
     stream = response['Body']
     if response['ContentLength'] == 0:
         logging.warning(f'Failed to determine data types: file {filename} has empty body')
-        return json.dumps({'columns': [], 'separator': ','})
+        return AnalysisDto(columns=dict(), separator=",", line_termination="\n")
 
     with io.BytesIO(stream.read()) as fh:
         line_terminator = None
@@ -66,6 +66,7 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
         for name, dataType in df.dtypes.items():
             col = ColumnAnalysisDto(type=DataTypeDto.TEXT,
                                     null_allowed=contains_null(df[name]))
+            r[name] = col
             if dataType == dtype('float64'):
                 if pandas.to_numeric(df[name], errors='coerce').notnull().all():
                     logging.debug(f"mapped column {name} from float64 to decimal")
@@ -113,10 +114,9 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
                 col.type = DataTypeDto.DATETIME
             else:
                 logging.warning(f'default to \'text\' for column {name} and type {dtype}')
-            r[name] = col
     s = AnalysisDto(columns=r, separator=separator, line_termination=line_terminator)
     logging.info("Determined data types %s", s)
-    return s.model_dump_json()
+    return s
 
 
 def peek_line(f) -> bytes:
diff --git a/dbrepo-analyse-service/determine_pk.py b/dbrepo-analyse-service/determine_pk.py
index 141d90b78e43b05cce8b2f6c10700a18d14c6073..b0ad8cbf769bb87b814bd2d22261b349ce9bd303 100644
--- a/dbrepo-analyse-service/determine_pk.py
+++ b/dbrepo-analyse-service/determine_pk.py
@@ -9,8 +9,8 @@ from clients.s3_client import S3Client
 
 
 def determine_pk(filename: str, separator: str = ','):
-    dt = json.loads(determine_datatypes(filename=filename, separator=separator))
-    dt = {k.lower(): v for k, v in dt["columns"].items()}
+    dt = determine_datatypes(filename=filename, separator=separator)
+    dt = {k.lower(): v for k, v in dt.columns.items()}
     # {k.lower(): v for k, v in dt['columns'].items() if v != 'Numeric'}
     colnames = dt.keys()
     colindex = list(range(0, len(colnames)))
diff --git a/dbrepo-analyse-service/test/test_determine_dt.py b/dbrepo-analyse-service/test/test_determine_dt.py
index 3d7e4f8d3bee3f60d593572d420b3243fea179a2..73c443b7280e45295bb66a5ee4b4519daf50627c 100644
--- a/dbrepo-analyse-service/test/test_determine_dt.py
+++ b/dbrepo-analyse-service/test/test_determine_dt.py
@@ -1,6 +1,6 @@
-import json
 import unittest
 
+from api.dto import AnalysisDto
 from clients.s3_client import S3Client
 from botocore.exceptions import ClientError
 from determine_dt import determine_datatypes
@@ -9,96 +9,196 @@ from determine_dt import determine_datatypes
 class DetermineDatatypesTest(unittest.TestCase):
     # @Test
     def test_determine_datatypesDateTime_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
 
         # mock
         S3Client().upload_file("datetime.csv", './data/test_dt/', 'dbrepo')
 
         # test
         response = determine_datatypes(filename="datetime.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
+
+    # @Test
 
-    # @Test
     def test_determine_datatypesDateTimeWithTimezone_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
 
         # mock
         S3Client().upload_file("datetime_tz.csv", './data/test_dt/', 'dbrepo')
 
         # test
         response = determine_datatypes(filename="datetime_tz.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
+
+    # @Test
 
-    # @Test
     def test_determine_datatypesDateTimeWithT_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
 
         # mock
         S3Client().upload_file("datetime_t.csv", './data/test_dt/', 'dbrepo')
 
         # test
         response = determine_datatypes(filename="datetime_t.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
 
     # @Test
     def test_determine_datatypes_succeeds(self):
-        exp = {
-            "columns": {
-                "int": "bigint",
-                "float": "decimal",
-                "string": "varchar",
-                "boolean": "bool",
-                "bool": "bool",
-                "date": "timestamp",
-                "time": "timestamp",
-                "enum": "varchar",  # currently not used
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "int": {
+                "type": "bigint",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "float": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "string": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "boolean": {
+                "type": "bool",
+                "size": None,
+                "null_allowed": False,
+            },
+            "bool": {
+                "type": "bool",
+                "null_allowed": False,
+            },
+            "date": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "time": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "enum": {  # currently not used
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
 
         # mock
         S3Client().upload_file("datatypes.csv", './data/test_dt/', 'dbrepo')
 
         # test
         response = determine_datatypes(filename="datatypes.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
 
     # @Test
     def test_determine_datatypes_fileDoesNotExist_fails(self):
@@ -121,9 +221,8 @@ class DetermineDatatypesTest(unittest.TestCase):
 
         # test
         response = determine_datatypes("empty.csv")
-        data = json.loads(response)
-        self.assertEqual([], data["columns"])
-        self.assertEqual(",", data["separator"])
+        self.assertEqual({}, response.columns)
+        self.assertEqual(",", response.separator)
 
     # @Test
     def test_determine_datatypes_separatorSemicolon_succeeds(self):
@@ -133,8 +232,7 @@ class DetermineDatatypesTest(unittest.TestCase):
 
         # test
         response = determine_datatypes(filename="separator.csv", separator=";")
-        data = json.loads(response)
-        self.assertEqual(";", data["separator"])
+        self.assertEqual(";", response.separator)
 
     # @Test
     def test_determine_datatypes_separatorGuess_succeeds(self):
@@ -144,8 +242,7 @@ class DetermineDatatypesTest(unittest.TestCase):
 
         # test
         response = determine_datatypes(filename="separator.csv")
-        data = json.loads(response)
-        self.assertEqual(";", data["separator"])
+        self.assertEqual(";", response.separator)
 
     # @Test
     def test_determine_datatypes_separatorGuessLargeDataset_succeeds(self):
@@ -155,27 +252,33 @@ class DetermineDatatypesTest(unittest.TestCase):
 
         # test
         response = determine_datatypes(filename="large.csv")
-        data = json.loads(response)
-        self.assertEqual(",", data["separator"])
+        self.assertEqual(",", response.separator)
 
     # @Test
     def test_determine_datatypes_separatorGuessText_succeeds(self):
-        exp = {
-            "columns": {
-                "id": "bigint",
-                "author": "varchar",
-                "abstract": "text"
+        exp = AnalysisDto(separator=";", line_termination="\n", columns={
+            "id": {
+                "type": "bigint",
+                "size": 255,
+                "null_allowed": False
+            },
+            "author": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False
+            },
+            "abstract": {
+                "type": "text",
+                "null_allowed": False
             },
-            "separator": ";",
-            "line_termination": "\n"
-        }
+        })
 
         # mock
         S3Client().upload_file("novel.csv", './data/test_dt/', 'dbrepo')
 
         # test
         response = determine_datatypes(filename="novel.csv", separator=";")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
 
 
 if __name__ == "__main__":