From 61ad0c0efa4b8cde9296f6431d3dc470a474fee3 Mon Sep 17 00:00:00 2001 From: Martin Weise <martin.weise@tuwien.ac.at> Date: Wed, 31 Jul 2024 19:24:32 +0200 Subject: [PATCH] Hotfix the data type sizes --- dbrepo-analyse-service/api/dto.py | 69 ++++++++++++++++--- dbrepo-analyse-service/app.py | 47 +++++++++++-- .../as-yml/analyse_datatypes.yml | 2 +- dbrepo-analyse-service/determine_dt.py | 40 +++++++---- .../tuwien/mapper/MariaDbMapperUnitTest.java | 43 ++++++++++++ .../service/SchemaServiceIntegrationTest.java | 5 +- dbrepo-ui/components/table/TableImport.vue | 18 +++-- dbrepo-ui/components/table/TableSchema.vue | 6 -- .../[database_id]/table/create/dataset.vue | 4 ++ 9 files changed, 189 insertions(+), 45 deletions(-) create mode 100644 dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java diff --git a/dbrepo-analyse-service/api/dto.py b/dbrepo-analyse-service/api/dto.py index 66eed5ee5b..c3c6a22c04 100644 --- a/dbrepo-analyse-service/api/dto.py +++ b/dbrepo-analyse-service/api/dto.py @@ -1,15 +1,66 @@ -from typing import Optional +from enum import Enum +from typing import Optional, List from pydantic import BaseModel -class ColumnStat(BaseModel): - val_min: Optional[float] - val_max: Optional[float] - mean: Optional[float] - median: Optional[float] - std_dev: Optional[float] +class DataTypeDto(str, Enum): + """ + Enumeration of data types. 
+ """ + BIGINT = "bigint" + BINARY = "binary" + BIT = "bit" + BLOB = "blob" + BOOL = "bool" + CHAR = "char" + DATE = "date" + DATETIME = "datetime" + DECIMAL = "decimal" + DOUBLE = "double" + ENUM = "enum" + FLOAT = "float" + INT = "int" + LONGBLOB = "longblob" + LONGTEXT = "longtext" + MEDIUMBLOB = "mediumblob" + MEDIUMINT = "mediumint" + MEDIUMTEXT = "mediumtext" + SET = "set" + SMALLINT = "smallint" + TEXT = "text" + TIMESTAMP = "timestamp" + TINYBLOB = "tinyblob" + TINYINT = "tinyint" + TINYTEXT = "tinytext" + YEAR = "year" + VARBINARY = "varbinary" + VARCHAR = "varchar" -class TableStat(BaseModel): - columns: dict[str, ColumnStat] +class ColumnAnalysisDto(BaseModel): + type: DataTypeDto + null_allowed: bool + size: Optional[int] = None + d: Optional[int] = None + dfid: Optional[int] = None + enums: Optional[list] = None + sets: Optional[list] = None + + +class AnalysisDto(BaseModel): + columns: dict[str, ColumnAnalysisDto] + separator: str + line_termination: str + + +class ColumnStatDto(BaseModel): + val_min: Optional[float] = None + val_max: Optional[float] = None + mean: Optional[float] = None + median: Optional[float] = None + std_dev: Optional[float] = None + + +class TableStatDto(BaseModel): + columns: dict[str, ColumnStatDto] diff --git a/dbrepo-analyse-service/app.py b/dbrepo-analyse-service/app.py index bbce751508..35beb86015 100644 --- a/dbrepo-analyse-service/app.py +++ b/dbrepo-analyse-service/app.py @@ -77,10 +77,17 @@ template = { "openapi": "3.0.0", "components": { "schemas": { - "DataTypesDto": { + "AnalysisDto": { "properties": { "columns": { - "$ref": "#/components/schemas/SuggestedColumnDto" + "type": "array", + "items": { + "properties": { + "column_name": { + "$ref": "#/components/schemas/ColumnAnalysisDto" + } + } + } }, "line_termination": { "example": "\r\n", @@ -125,10 +132,40 @@ template = { ], "type": "object" }, - "SuggestedColumnDto": { + "ColumnAnalysisDto": { "properties": { - "column_name": { - "type": "string" + "type": { + 
"type": "string", + "example": "decimal" + }, + "null_allowed": { + "type": "boolean" + }, + "size": { + "type": "integer", + "example": 10 + }, + "d": { + "type": "integer", + "example": 4 + }, + "dfid": { + "type": "integer", + "example": None + }, + "enums": { + "type": "array", + "example": None, + "properties": { + "type": "string" + } + }, + "sets": { + "type": "array", + "example": None, + "properties": { + "type": "string" + } } }, "type": "object" diff --git a/dbrepo-analyse-service/as-yml/analyse_datatypes.yml b/dbrepo-analyse-service/as-yml/analyse_datatypes.yml index 14529bb34b..78f84f9e27 100644 --- a/dbrepo-analyse-service/as-yml/analyse_datatypes.yml +++ b/dbrepo-analyse-service/as-yml/analyse_datatypes.yml @@ -38,7 +38,7 @@ responses: content: application/json: schema: - $ref: '#/components/schemas/DataTypesDto' + $ref: '#/components/schemas/AnalysisDto' 400: description: "Failed to determine data types" content: diff --git a/dbrepo-analyse-service/determine_dt.py b/dbrepo-analyse-service/determine_dt.py index 6a22401866..d78959f3c4 100644 --- a/dbrepo-analyse-service/determine_dt.py +++ b/dbrepo-analyse-service/determine_dt.py @@ -9,8 +9,10 @@ import pandas from numpy import dtype, max, min from flask import current_app +from pandas import DataFrame from pandas.errors import EmptyDataError +from api.dto import ColumnAnalysisDto, DataTypeDto, AnalysisDto from clients.s3_client import S3Client @@ -63,52 +65,58 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> r = {} for name, dataType in df.dtypes.items(): + col = ColumnAnalysisDto(type=DataTypeDto.TEXT, null_allowed=contains_null(df[name])) if dataType == dtype('float64'): if pandas.to_numeric(df[name], errors='coerce').notnull().all(): logging.debug(f"mapped column {name} from float64 to decimal") - r[name] = 'decimal' + col.type = DataTypeDto.DECIMAL + col.size = 10 + col.d = 4 else: logging.debug(f"mapped column {name} from float64 to text") - r[name] = 'text' + 
col.type = DataTypeDto.TEXT elif dataType == dtype('int64'): min_val = min(df[name]) max_val = max(df[name]) if 0 <= min_val <= 1 and 0 <= max_val <= 1: logging.debug(f"mapped column {name} from int64 to bool") - r[name] = 'bool' + col.type = DataTypeDto.BOOL continue logging.debug(f"mapped column {name} from int64 to bigint") - r[name] = 'bigint' + col.type = DataTypeDto.BIGINT + col.size = 255 elif dataType == dtype('O'): try: pandas.to_datetime(df[name], format='mixed') logging.debug(f"mapped column {name} from O to timestamp") - r[name] = 'timestamp' + col.type = DataTypeDto.TIMESTAMP continue except ValueError: pass max_size = max(df[name].astype(str).map(len)) if max_size <= 1: logging.debug(f"mapped column {name} from O to char") - r[name] = 'char' + col.type = DataTypeDto.CHAR + col.size = 1 if 0 <= max_size <= 255: logging.debug(f"mapped column {name} from O to varchar") - r[name] = 'varchar' + col.type = DataTypeDto.VARCHAR + col.size = 255 else: logging.debug(f"mapped column {name} from O to text") - r[name] = 'text' + col.type = DataTypeDto.TEXT elif dataType == dtype('bool'): logging.debug(f"mapped column {name} from bool to bool") - r[name] = 'bool' + col.type = DataTypeDto.BOOL elif dataType == dtype('datetime64'): logging.debug(f"mapped column {name} from datetime64 to datetime") - r[name] = 'datetime' + col.type = DataTypeDto.DATETIME else: logging.warning(f'default to \'text\' for column {name} and type {dtype}') - r[name] = 'text' - s = {"columns": r, "separator": separator, "line_termination": line_terminator} + r[name] = col + s = AnalysisDto(columns=r, separator=separator, line_termination=line_terminator) logging.info("Determined data types %s", s) - return json.dumps(s) + return s.model_dump_json() def peek_line(f) -> bytes: @@ -116,3 +124,9 @@ def peek_line(f) -> bytes: line: bytes = f.readline() f.seek(pos) return line + + +def contains_null(df: DataFrame) -> bool: + if '\\N' in df.values: + return True + return df.isnull().values.any() 
+ diff --git a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java new file mode 100644 index 0000000000..a1a3ef4dad --- /dev/null +++ b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java @@ -0,0 +1,43 @@ +package at.tuwien.mapper; + +import at.tuwien.test.AbstractUnitTest; +import lombok.extern.log4j.Log4j2; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; + +@Log4j2 +@SpringBootTest +@ExtendWith(SpringExtension.class) +public class MariaDbMapperUnitTest extends AbstractUnitTest { + + @Autowired + private MariaDbMapper mariaDbMapper; + + public static Stream<Arguments> nameToInternalName_parameters() { + return Stream.of( + Arguments.arguments("dash_minus", "OE/NO-027", "oe_no_027"), + Arguments.arguments("percent", "OE%NO-027", "oe_no_027"), + Arguments.arguments("umlaut", "OE/NÖ-027", "oe_no__027"), + Arguments.arguments("dot", "OE.NO-027", "oe_no_027"), + Arguments.arguments("double_dot", "OE:NO-027", "oe_no_027") + ); + } + + @ParameterizedTest + @MethodSource("nameToInternalName_parameters") + public void nameToInternalName_succeeds(String name, String input, String expected) { + + /* test */ + assertEquals(expected, mariaDbMapper.nameToInternalName(input)); + } + +} diff --git a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java 
b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java index cc64476927..be1f6b5dae 100644 --- a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java +++ b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java @@ -59,7 +59,10 @@ public class SchemaServiceIntegrationTest extends AbstractUnitTest { } @Test - public void inspectTable_succeeds() throws TableNotFoundException, SQLException { + public void inspectTable_sameNameDifferentDb_succeeds() throws TableNotFoundException, SQLException { + + /* mock */ + MariaDbConfig.execute(DATABASE_2_PRIVILEGED_DTO, "CREATE TABLE not_in_metadata_db (wrong_id BIGINT NOT NULL PRIMARY KEY, given_name VARCHAR(255) NOT NULL, middle_name VARCHAR(255), family_name VARCHAR(255) NOT NULL, age INT NOT NULL) WITH SYSTEM VERSIONING;"); /* test */ final TableDto response = schemaService.inspectTable(DATABASE_1_PRIVILEGED_DTO, "not_in_metadata_db"); diff --git a/dbrepo-ui/components/table/TableImport.vue b/dbrepo-ui/components/table/TableImport.vue index 65fdd4930d..e89c920d32 100644 --- a/dbrepo-ui/components/table/TableImport.vue +++ b/dbrepo-ui/components/table/TableImport.vue @@ -509,19 +509,17 @@ export default { analyseService.suggest(payload) .then((analysis) => { const {columns, separator, line_termination} = analysis - const queryService = useQueryService() - const dataTypes = queryService.mySql8DataTypes() this.columns = Object.entries(columns) - .map(([key, val]) => { + .map(([name, analyse]) => { return { - name: key, - type: val, - null_allowed: true, + name: name, + type: analyse.type, + null_allowed: analyse.null_allowed, primary_key: false, - size: dataTypes.filter(d => d.value === val).length > 0 ? dataTypes.filter(d => d.value === val)[0].defaultSize : null, - d: dataTypes.filter(d => d.value === val).length > 0 ? 
dataTypes.filter(d => d.value === val)[0].defaultD : null, - enums: [], - sets: [] + size: analyse.size, + d: analyse.d, + enums: analyse.enums, + sets: analyse.sets } }) this.suggestedAnalyseSeparator = separator diff --git a/dbrepo-ui/components/table/TableSchema.vue b/dbrepo-ui/components/table/TableSchema.vue index e820d0aea2..25c4f66cb5 100644 --- a/dbrepo-ui/components/table/TableSchema.vue +++ b/dbrepo-ui/components/table/TableSchema.vue @@ -227,9 +227,6 @@ export default { database () { return this.cacheStore.getDatabase }, - needsSequence () { - return this.columns.filter(c => c.primary_key).length === 0 - }, dateFormats () { if (!this.database || !('container' in this.database) || !('image' in this.database.container) || !('date_formats' in this.database.container.image)) { return [] @@ -287,9 +284,6 @@ export default { if (idx > 0) { return true } - if (this.needsSequence) { - return true - } if (this.columns[0].primary_key) { return false } diff --git a/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue b/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue index 045c1932c2..c3b5a38c7a 100644 --- a/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue +++ b/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue @@ -373,10 +373,14 @@ export default { }, async onImport () { this.loadingImport = true + const cacheStore = useCacheStore() + cacheStore.reloadDatabase() await this.$router.push({ path: `/database/${this.$route.params.database_id}/table/${this.table.id}/import`, query: this.tableImport }) }, async onContinue () { this.loadingContinue = true + const cacheStore = useCacheStore() + cacheStore.reloadDatabase() await this.$router.push(`/database/${this.$route.params.database_id}/table/${this.table.id}/data`) } } -- GitLab