From 61ad0c0efa4b8cde9296f6431d3dc470a474fee3 Mon Sep 17 00:00:00 2001
From: Martin Weise <martin.weise@tuwien.ac.at>
Date: Wed, 31 Jul 2024 19:24:32 +0200
Subject: [PATCH] Hotfix the data type sizes

---
 dbrepo-analyse-service/api/dto.py             | 69 ++++++++++++++++---
 dbrepo-analyse-service/app.py                 | 47 +++++++++++--
 .../as-yml/analyse_datatypes.yml              |  2 +-
 dbrepo-analyse-service/determine_dt.py        | 40 +++++++----
 .../tuwien/mapper/MariaDbMapperUnitTest.java  | 43 ++++++++++++
 .../service/SchemaServiceIntegrationTest.java |  5 +-
 dbrepo-ui/components/table/TableImport.vue    | 18 +++--
 dbrepo-ui/components/table/TableSchema.vue    |  6 --
 .../[database_id]/table/create/dataset.vue    |  4 ++
 9 files changed, 189 insertions(+), 45 deletions(-)
 create mode 100644 dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java

diff --git a/dbrepo-analyse-service/api/dto.py b/dbrepo-analyse-service/api/dto.py
index 66eed5ee5b..c3c6a22c04 100644
--- a/dbrepo-analyse-service/api/dto.py
+++ b/dbrepo-analyse-service/api/dto.py
@@ -1,15 +1,66 @@
-from typing import Optional
+from enum import Enum
+from typing import Optional, List
 
 from pydantic import BaseModel
 
 
-class ColumnStat(BaseModel):
-    val_min: Optional[float]
-    val_max: Optional[float]
-    mean: Optional[float]
-    median: Optional[float]
-    std_dev: Optional[float]
+class DataTypeDto(str, Enum):
+    """
+    Enumeration of column data types; each member's value is the lower-case type name.
+    """
+    BIGINT = "bigint"
+    BINARY = "binary"
+    BIT = "bit"
+    BLOB = "blob"
+    BOOL = "bool"
+    CHAR = "char"
+    DATE = "date"
+    DATETIME = "datetime"
+    DECIMAL = "decimal"
+    DOUBLE = "double"
+    ENUM = "enum"
+    FLOAT = "float"
+    INT = "int"
+    LONGBLOB = "longblob"
+    LONGTEXT = "longtext"
+    MEDIUMBLOB = "mediumblob"
+    MEDIUMINT = "mediumint"
+    MEDIUMTEXT = "mediumtext"
+    SET = "set"
+    SMALLINT = "smallint"
+    TEXT = "text"
+    TIMESTAMP = "timestamp"
+    TINYBLOB = "tinyblob"
+    TINYINT = "tinyint"
+    TINYTEXT = "tinytext"
+    YEAR = "year"
+    VARBINARY = "varbinary"
+    VARCHAR = "varchar"
 
 
-class TableStat(BaseModel):
-    columns: dict[str, ColumnStat]
+class ColumnAnalysisDto(BaseModel):  # analysis result for a single column
+    type: DataTypeDto  # data type determined for the column
+    null_allowed: bool  # filled from contains_null(...) for the column
+    size: Optional[int] = None  # type size, e.g. 255 for varchar or 10 for decimal; None when not set
+    d: Optional[int] = None  # set to 4 for decimal; presumably the number of decimal places - confirm
+    dfid: Optional[int] = None  # NOTE(review): presumably a date format id; meaning not visible here - confirm
+    enums: Optional[list] = None  # presumably the allowed values of an enum column; defaults to None
+    sets: Optional[list] = None  # presumably the allowed values of a set column; defaults to None
+
+
+class AnalysisDto(BaseModel):  # overall result of the data type analysis, dumped to JSON for the response
+    columns: dict[str, ColumnAnalysisDto]  # per-column analysis, keyed by the column name
+    separator: str  # field separator reported together with the analysis
+    line_termination: str  # line terminator reported together with the analysis
+
+
+class ColumnStatDto(BaseModel):  # statistics of a single column; every field is optional and defaults to None
+    val_min: Optional[float] = None  # presumably the smallest value of the column
+    val_max: Optional[float] = None  # presumably the largest value of the column
+    mean: Optional[float] = None  # presumably the arithmetic mean
+    median: Optional[float] = None  # presumably the median
+    std_dev: Optional[float] = None  # presumably the standard deviation
+
+
+class TableStatDto(BaseModel):  # statistics of a whole table
+    columns: dict[str, ColumnStatDto]  # one ColumnStatDto per key; presumably keyed by column name - confirm
diff --git a/dbrepo-analyse-service/app.py b/dbrepo-analyse-service/app.py
index bbce751508..35beb86015 100644
--- a/dbrepo-analyse-service/app.py
+++ b/dbrepo-analyse-service/app.py
@@ -77,10 +77,17 @@ template = {
     "openapi": "3.0.0",
     "components": {
         "schemas": {
-            "DataTypesDto": {
+            "AnalysisDto": {
                 "properties": {
                     "columns": {
-                        "$ref": "#/components/schemas/SuggestedColumnDto"
+                        "type": "object",
+                        "description": "Analysis result per column, keyed by the column name",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ColumnAnalysisDto"
+                        },
+                        "example": {
+                            "id": {"type": "bigint", "null_allowed": False, "size": 255}
+                        }
                     },
                     "line_termination": {
                         "example": "\r\n",
@@ -125,10 +132,40 @@ template = {
                 ],
                 "type": "object"
             },
-            "SuggestedColumnDto": {
+            "ColumnAnalysisDto": {
                 "properties": {
-                    "column_name": {
-                        "type": "string"
+                    "type": {
+                        "type": "string",
+                        "example": "decimal"
+                    },
+                    "null_allowed": {
+                        "type": "boolean"
+                    },
+                    "size": {
+                        "type": "integer",
+                        "example": 10
+                    },
+                    "d": {
+                        "type": "integer",
+                        "example": 4
+                    },
+                    "dfid": {
+                        "type": "integer",
+                        "example": None
+                    },
+                    "enums": {
+                        "type": "array",
+                        "example": None,
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "sets": {
+                        "type": "array",
+                        "example": None,
+                        "items": {
+                            "type": "string"
+                        }
                     }
                 },
                 "type": "object"
diff --git a/dbrepo-analyse-service/as-yml/analyse_datatypes.yml b/dbrepo-analyse-service/as-yml/analyse_datatypes.yml
index 14529bb34b..78f84f9e27 100644
--- a/dbrepo-analyse-service/as-yml/analyse_datatypes.yml
+++ b/dbrepo-analyse-service/as-yml/analyse_datatypes.yml
@@ -38,7 +38,7 @@ responses:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/DataTypesDto'
+          $ref: '#/components/schemas/AnalysisDto'
   400:
     description: "Failed to determine data types"
     content:
diff --git a/dbrepo-analyse-service/determine_dt.py b/dbrepo-analyse-service/determine_dt.py
index 6a22401866..d78959f3c4 100644
--- a/dbrepo-analyse-service/determine_dt.py
+++ b/dbrepo-analyse-service/determine_dt.py
@@ -9,8 +9,10 @@ import pandas
 
 from numpy import dtype, max, min
 from flask import current_app
+from pandas import DataFrame
 from pandas.errors import EmptyDataError
 
+from api.dto import ColumnAnalysisDto, DataTypeDto, AnalysisDto
 from clients.s3_client import S3Client
 
 
@@ -63,52 +65,58 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') ->
         r = {}
 
         for name, dataType in df.dtypes.items():
+            col = ColumnAnalysisDto(type=DataTypeDto.TEXT, null_allowed=contains_null(df[name]))
+            r[name] = col  # register now: the early 'continue' branches below must not drop the column
             if dataType == dtype('float64'):
                 if pandas.to_numeric(df[name], errors='coerce').notnull().all():
                     logging.debug(f"mapped column {name} from float64 to decimal")
-                    r[name] = 'decimal'
+                    col.type = DataTypeDto.DECIMAL
+                    col.size = 10
+                    col.d = 4
                 else:
                     logging.debug(f"mapped column {name} from float64 to text")
-                    r[name] = 'text'
+                    col.type = DataTypeDto.TEXT
             elif dataType == dtype('int64'):
                 min_val = min(df[name])
                 max_val = max(df[name])
                 if 0 <= min_val <= 1 and 0 <= max_val <= 1:
                     logging.debug(f"mapped column {name} from int64 to bool")
-                    r[name] = 'bool'
+                    col.type = DataTypeDto.BOOL
                     continue
                 logging.debug(f"mapped column {name} from int64 to bigint")
-                r[name] = 'bigint'
+                col.type = DataTypeDto.BIGINT
+                col.size = 255
             elif dataType == dtype('O'):
                 try:
                     pandas.to_datetime(df[name], format='mixed')
                     logging.debug(f"mapped column {name} from O to timestamp")
-                    r[name] = 'timestamp'
+                    col.type = DataTypeDto.TIMESTAMP
                     continue
                 except ValueError:
                     pass
                 max_size = max(df[name].astype(str).map(len))
                 if max_size <= 1:
                     logging.debug(f"mapped column {name} from O to char")
-                    r[name] = 'char'
-                if 0 <= max_size <= 255:
+                    col.type = DataTypeDto.CHAR
+                    col.size = 1
+                elif 0 <= max_size <= 255:
                     logging.debug(f"mapped column {name} from O to varchar")
-                    r[name] = 'varchar'
+                    col.type = DataTypeDto.VARCHAR
+                    col.size = 255
                 else:
                     logging.debug(f"mapped column {name} from O to text")
-                    r[name] = 'text'
+                    col.type = DataTypeDto.TEXT
             elif dataType == dtype('bool'):
                 logging.debug(f"mapped column {name} from bool to bool")
-                r[name] = 'bool'
+                col.type = DataTypeDto.BOOL
             elif dataType == dtype('datetime64'):
                 logging.debug(f"mapped column {name} from datetime64 to datetime")
-                r[name] = 'datetime'
+                col.type = DataTypeDto.DATETIME
             else:
-                logging.warning(f'default to \'text\' for column {name} and type {dtype}')
-                r[name] = 'text'
-        s = {"columns": r, "separator": separator, "line_termination": line_terminator}
+                logging.warning(f'default to \'text\' for column {name} and type {dataType}')
+        s = AnalysisDto(columns=r, separator=separator, line_termination=line_terminator)
         logging.info("Determined data types %s", s)
-    return json.dumps(s)
+    return s.model_dump_json()
 
 
 def peek_line(f) -> bytes:
@@ -116,3 +124,9 @@ def peek_line(f) -> bytes:
     line: bytes = f.readline()
     f.seek(pos)
     return line
+
+
+def contains_null(df: DataFrame) -> bool:
+    """Return True when the given data holds a literal ``\\N`` marker or a missing (NaN/None) value."""
+    # bool(...) turns the numpy.bool_ produced by .any() into a plain bool, matching the annotation
+    return bool('\\N' in df.values or df.isnull().values.any())
diff --git a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java
new file mode 100644
index 0000000000..a1a3ef4dad
--- /dev/null
+++ b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/mapper/MariaDbMapperUnitTest.java
@@ -0,0 +1,43 @@
+package at.tuwien.mapper;
+
+import at.tuwien.test.AbstractUnitTest;
+import lombok.extern.log4j.Log4j2;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.junit.jupiter.SpringExtension;
+
+import java.util.stream.Stream;
+
+import static org.junit.Assert.assertEquals;
+
+@Log4j2
+@SpringBootTest
+@ExtendWith(SpringExtension.class)
+public class MariaDbMapperUnitTest extends AbstractUnitTest {
+
+    @Autowired
+    private MariaDbMapper mariaDbMapper;
+
+    public static Stream<Arguments> nameToInternalName_parameters() {
+        return Stream.of(
+                Arguments.arguments("dash_minus", "OE/NO-027", "oe_no_027"),
+                Arguments.arguments("percent", "OE%NO-027", "oe_no_027"),
+                Arguments.arguments("umlaut", "OE/NÖ-027", "oe_no__027"),
+                Arguments.arguments("dot", "OE.NO-027", "oe_no_027"),
+                Arguments.arguments("double_dot", "OE:NO-027", "oe_no_027")
+        );
+    }
+
+    @ParameterizedTest
+    @MethodSource("nameToInternalName_parameters")
+    public void nameToInternalName_succeeds(String name, String input, String expected) {
+
+        /* test */
+        assertEquals(expected, mariaDbMapper.nameToInternalName(input));
+    }
+
+}
diff --git a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java
index cc64476927..be1f6b5dae 100644
--- a/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java
+++ b/dbrepo-data-service/rest-service/src/test/java/at/tuwien/service/SchemaServiceIntegrationTest.java
@@ -59,7 +59,10 @@ public class SchemaServiceIntegrationTest extends AbstractUnitTest {
     }
 
     @Test
-    public void inspectTable_succeeds() throws TableNotFoundException, SQLException {
+    public void inspectTable_sameNameDifferentDb_succeeds() throws TableNotFoundException, SQLException {
+
+        /* mock */
+        MariaDbConfig.execute(DATABASE_2_PRIVILEGED_DTO, "CREATE TABLE not_in_metadata_db (wrong_id BIGINT NOT NULL PRIMARY KEY, given_name VARCHAR(255) NOT NULL, middle_name VARCHAR(255), family_name VARCHAR(255) NOT NULL, age INT NOT NULL) WITH SYSTEM VERSIONING;");
 
         /* test */
         final TableDto response = schemaService.inspectTable(DATABASE_1_PRIVILEGED_DTO, "not_in_metadata_db");
diff --git a/dbrepo-ui/components/table/TableImport.vue b/dbrepo-ui/components/table/TableImport.vue
index 65fdd4930d..e89c920d32 100644
--- a/dbrepo-ui/components/table/TableImport.vue
+++ b/dbrepo-ui/components/table/TableImport.vue
@@ -509,19 +509,17 @@ export default {
       analyseService.suggest(payload)
         .then((analysis) => {
           const {columns, separator, line_termination} = analysis
-          const queryService = useQueryService()
-          const dataTypes = queryService.mySql8DataTypes()
           this.columns = Object.entries(columns)
-            .map(([key, val]) => {
+            .map(([name, analyse]) => {
               return {
-                name: key,
-                type: val,
-                null_allowed: true,
+                name: name,
+                type: analyse.type,
+                null_allowed: analyse.null_allowed,
                 primary_key: false,
-                size: dataTypes.filter(d => d.value === val).length > 0 ? dataTypes.filter(d => d.value === val)[0].defaultSize : null,
-                d: dataTypes.filter(d => d.value === val).length > 0 ? dataTypes.filter(d => d.value === val)[0].defaultD : null,
-                enums: [],
-                sets: []
+                size: analyse.size,
+                d: analyse.d,
+                enums: analyse.enums || [], // the service sends null when unset; keep the former array shape
+                sets: analyse.sets || [] // same fallback as for enums
               }
             })
           this.suggestedAnalyseSeparator = separator
diff --git a/dbrepo-ui/components/table/TableSchema.vue b/dbrepo-ui/components/table/TableSchema.vue
index e820d0aea2..25c4f66cb5 100644
--- a/dbrepo-ui/components/table/TableSchema.vue
+++ b/dbrepo-ui/components/table/TableSchema.vue
@@ -227,9 +227,6 @@ export default {
     database () {
       return this.cacheStore.getDatabase
     },
-    needsSequence () {
-      return this.columns.filter(c => c.primary_key).length === 0
-    },
     dateFormats () {
       if (!this.database || !('container' in this.database) || !('image' in this.database.container) || !('date_formats' in this.database.container.image)) {
         return []
@@ -287,9 +284,6 @@ export default {
       if (idx > 0) {
         return true
       }
-      if (this.needsSequence) {
-        return true
-      }
       if (this.columns[0].primary_key) {
         return false
       }
diff --git a/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue b/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue
index 045c1932c2..c3b5a38c7a 100644
--- a/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue
+++ b/dbrepo-ui/pages/database/[database_id]/table/create/dataset.vue
@@ -373,10 +373,14 @@ export default {
     },
     async onImport () {
       this.loadingImport = true
+      const cacheStore = useCacheStore()
+      cacheStore.reloadDatabase()
       await this.$router.push({ path: `/database/${this.$route.params.database_id}/table/${this.table.id}/import`, query: this.tableImport })
     },
     async onContinue () {
       this.loadingContinue = true
+      const cacheStore = useCacheStore()
+      cacheStore.reloadDatabase()
       await this.$router.push(`/database/${this.$route.params.database_id}/table/${this.table.id}/data`)
     }
   }
-- 
GitLab