Verified commit da83c4cb authored by Martin Weise

Updated tests

parent 61ad0c0e
@@ -288,7 +288,7 @@ def analyse_datatypes():
     try:
         res = determine_datatypes(filename, enum, enum_tol, separator)
         logging.debug("determine datatype resulted in datatypes %s", res)
-        return Response(res, mimetype="application/json"), 202
+        return Response(res.model_dump_json(), mimetype="application/json"), 202
     except OSError as e:
         logging.error(f"Failed to determine data types: {e}")
         return ApiError(status='BAD_REQUEST', message=str(e), code='error.analyse.invalid').model_dump_json(), 400
@@ -16,7 +16,7 @@ from api.dto import ColumnAnalysisDto, DataTypeDto, AnalysisDto
 from clients.s3_client import S3Client
-def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> {}:
+def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> AnalysisDto:
     # Use option enum=True for searching Postgres ENUM Types in CSV file. Remark
     # Enum is not SQL standard, hence, it might not be supported by all db-engines.
     # However, it can be used in Postgres and MySQL.
@@ -26,7 +26,7 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
     stream = response['Body']
     if response['ContentLength'] == 0:
         logging.warning(f'Failed to determine data types: file {filename} has empty body')
-        return json.dumps({'columns': [], 'separator': ','})
+        return AnalysisDto(columns=dict(), separator=",", line_termination="\n")
     with io.BytesIO(stream.read()) as fh:
         line_terminator = None
@@ -66,6 +66,7 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
     for name, dataType in df.dtypes.items():
         col = ColumnAnalysisDto(type=DataTypeDto.TEXT, null_allowed=contains_null(df[name]))
+        r[name] = col
         if dataType == dtype('float64'):
             if pandas.to_numeric(df[name], errors='coerce').notnull().all():
                 logging.debug(f"mapped column {name} from float64 to decimal")
@@ -113,10 +114,9 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
                 col.type = DataTypeDto.DATETIME
             else:
                 logging.warning(f'default to \'text\' for column {name} and type {dtype}')
-        r[name] = col
     s = AnalysisDto(columns=r, separator=separator, line_termination=line_terminator)
     logging.info("Determined data types %s", s)
-    return s.model_dump_json()
+    return s
 def peek_line(f) -> bytes:
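In short, determine_datatypes() now returns the typed AnalysisDto instead of a pre-serialized JSON string, and serialization moves to the HTTP endpoint. A minimal caller sketch, assuming the Pydantic v2 models from api.dto that the diff implies; the print_analysis helper is illustrative only and not part of this commit:

# Illustrative only -- not part of this commit. Assumes the Pydantic v2
# models from api.dto shown in the diff (AnalysisDto, ColumnAnalysisDto).
from api.dto import AnalysisDto
from determine_dt import determine_datatypes

def print_analysis(filename: str) -> str:
    # Callers now receive a typed DTO instead of a JSON string ...
    analysis: AnalysisDto = determine_datatypes(filename=filename, separator=",")
    for name, column in analysis.columns.items():
        # ... so each column is a ColumnAnalysisDto with typed fields
        print(name, column.type, column.null_allowed)
    # ... and serialization happens once, at the API boundary
    return analysis.model_dump_json()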
@@ -9,8 +9,8 @@ from clients.s3_client import S3Client
 def determine_pk(filename: str, separator: str = ','):
-    dt = json.loads(determine_datatypes(filename=filename, separator=separator))
-    dt = {k.lower(): v for k, v in dt["columns"].items()}
+    dt = determine_datatypes(filename=filename, separator=separator)
+    dt = {k.lower(): v for k, v in dt.columns.items()}
     # {k.lower(): v for k, v in dt['columns'].items() if v != 'Numeric'}
     colnames = dt.keys()
     colindex = list(range(0, len(colnames)))
-import json
 import unittest
+from api.dto import AnalysisDto
 from clients.s3_client import S3Client
 from botocore.exceptions import ClientError
 from determine_dt import determine_datatypes
@@ -9,96 +9,196 @@ from determine_dt import determine_datatypes
 class DetermineDatatypesTest(unittest.TestCase):
     # @Test
     def test_determine_datatypesDateTime_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
         # mock
         S3Client().upload_file("datetime.csv", './data/test_dt/', 'dbrepo')
         # test
         response = determine_datatypes(filename="datetime.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
     # @Test
     def test_determine_datatypesDateTimeWithTimezone_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
         # mock
         S3Client().upload_file("datetime_tz.csv", './data/test_dt/', 'dbrepo')
         # test
         response = determine_datatypes(filename="datetime_tz.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
     # @Test
     def test_determine_datatypesDateTimeWithT_succeeds(self):
-        exp = {
-            "columns": {
-                "Datum": "timestamp",
-                "Standort": "varchar",
-                "Parameter": "varchar",
-                "Intervall": "varchar",
-                "Einheit": "varchar",
-                "Wert": "decimal",
-                "Status": "varchar",
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "Datum": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "Standort": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Parameter": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Intervall": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Einheit": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "Wert": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "Status": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
         # mock
         S3Client().upload_file("datetime_t.csv", './data/test_dt/', 'dbrepo')
         # test
         response = determine_datatypes(filename="datetime_t.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
     # @Test
     def test_determine_datatypes_succeeds(self):
-        exp = {
-            "columns": {
-                "int": "bigint",
-                "float": "decimal",
-                "string": "varchar",
-                "boolean": "bool",
-                "bool": "bool",
-                "date": "timestamp",
-                "time": "timestamp",
-                "enum": "varchar",  # currently not used
-            },
-            "separator": ",",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=",", line_termination="\n", columns={
+            "int": {
+                "type": "bigint",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "float": {
+                "type": "decimal",
+                "size": 10,
+                "d": 4,
+                "null_allowed": False,
+            },
+            "string": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+            "boolean": {
+                "type": "bool",
+                "size": None,
+                "null_allowed": False,
+            },
+            "bool": {
+                "type": "bool",
+                "null_allowed": False,
+            },
+            "date": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "time": {
+                "type": "timestamp",
+                "null_allowed": False,
+            },
+            "enum": {  # currently not used
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False,
+            },
+        })
         # mock
         S3Client().upload_file("datatypes.csv", './data/test_dt/', 'dbrepo')
         # test
         response = determine_datatypes(filename="datatypes.csv", separator=",")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
     # @Test
     def test_determine_datatypes_fileDoesNotExist_fails(self):
@@ -121,9 +221,8 @@ class DetermineDatatypesTest(unittest.TestCase):
         # test
         response = determine_datatypes("empty.csv")
-        data = json.loads(response)
-        self.assertEqual([], data["columns"])
-        self.assertEqual(",", data["separator"])
+        self.assertEqual({}, response.columns)
+        self.assertEqual(",", response.separator)
     # @Test
     def test_determine_datatypes_separatorSemicolon_succeeds(self):
@@ -133,8 +232,7 @@ class DetermineDatatypesTest(unittest.TestCase):
         # test
         response = determine_datatypes(filename="separator.csv", separator=";")
-        data = json.loads(response)
-        self.assertEqual(";", data["separator"])
+        self.assertEqual(";", response.separator)
     # @Test
     def test_determine_datatypes_separatorGuess_succeeds(self):
@@ -144,8 +242,7 @@ class DetermineDatatypesTest(unittest.TestCase):
         # test
         response = determine_datatypes(filename="separator.csv")
-        data = json.loads(response)
-        self.assertEqual(";", data["separator"])
+        self.assertEqual(";", response.separator)
     # @Test
     def test_determine_datatypes_separatorGuessLargeDataset_succeeds(self):
@@ -155,27 +252,33 @@ class DetermineDatatypesTest(unittest.TestCase):
         # test
         response = determine_datatypes(filename="large.csv")
-        data = json.loads(response)
-        self.assertEqual(",", data["separator"])
+        self.assertEqual(",", response.separator)
     # @Test
     def test_determine_datatypes_separatorGuessText_succeeds(self):
-        exp = {
-            "columns": {
-                "id": "bigint",
-                "author": "varchar",
-                "abstract": "text"
-            },
-            "separator": ";",
-            "line_termination": "\n"
-        }
+        exp = AnalysisDto(separator=";", line_termination="\n", columns={
+            "id": {
+                "type": "bigint",
+                "size": 255,
+                "null_allowed": False
+            },
+            "author": {
+                "type": "varchar",
+                "size": 255,
+                "null_allowed": False
+            },
+            "abstract": {
+                "type": "text",
+                "null_allowed": False
+            },
+        })
         # mock
         S3Client().upload_file("novel.csv", './data/test_dt/', 'dbrepo')
         # test
         response = determine_datatypes(filename="novel.csv", separator=";")
-        self.assertEqual(response, json.dumps(exp))
+        self.assertEqual(exp, response)
 if __name__ == "__main__":
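For readers without the repository at hand, the expected values in the updated tests imply roughly the following DTO shapes. This is an inferred sketch, not the actual api.dto definitions; field defaults and the enum member names other than TEXT and DATETIME are guesses.

# Approximate shape inferred from this diff -- the real models live in
# api/dto.py; member names and defaults below are assumptions.
from enum import Enum
from typing import Dict, Optional
from pydantic import BaseModel

class DataTypeDto(str, Enum):
    BIGINT = "bigint"
    BOOL = "bool"
    DATETIME = "timestamp"
    DECIMAL = "decimal"
    TEXT = "text"
    VARCHAR = "varchar"

class ColumnAnalysisDto(BaseModel):
    type: DataTypeDto
    size: Optional[int] = None   # e.g. 255 for varchar, 10 for decimal
    d: Optional[int] = None      # decimal digits, e.g. 4
    null_allowed: bool = False

class AnalysisDto(BaseModel):
    columns: Dict[str, ColumnAnalysisDto]
    separator: str
    line_termination: Optional[str] = None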