Verified Commit 61ad0c0e authored by Martin Weise

Hotfix the data type sizes

parent 2426c0cf
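
In short: the analyse service used to suggest a bare type string per column; after this hotfix it returns a per-column object that also carries the suggested size and precision, so the UI no longer has to look up defaults itself. Illustrative payloads (example values, not from the commit):

    # before: {"columns": {"price": "decimal"}, "separator": ",", "line_termination": "\r\n"}
    # after:  {"columns": {"price": {"type": "decimal", "null_allowed": true,
    #           "size": 10, "d": 4, "dfid": null, "enums": null, "sets": null}},
    #           "separator": ",", "line_termination": "\r\n"}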

-from typing import Optional
+from enum import Enum
+from typing import Optional, List
 from pydantic import BaseModel


-class ColumnStat(BaseModel):
-    val_min: Optional[float]
-    val_max: Optional[float]
-    mean: Optional[float]
-    median: Optional[float]
-    std_dev: Optional[float]
+class DataTypeDto(str, Enum):
+    """
+    Enumeration of supported data types.
+    """
+    BIGINT = "bigint"
+    BINARY = "binary"
+    BIT = "bit"
+    BLOB = "blob"
+    BOOL = "bool"
+    CHAR = "char"
+    DATE = "date"
+    DATETIME = "datetime"
+    DECIMAL = "decimal"
+    DOUBLE = "double"
+    ENUM = "enum"
+    FLOAT = "float"
+    INT = "int"
+    LONGBLOB = "longblob"
+    LONGTEXT = "longtext"
+    MEDIUMBLOB = "mediumblob"
+    MEDIUMINT = "mediumint"
+    MEDIUMTEXT = "mediumtext"
+    SET = "set"
+    SMALLINT = "smallint"
+    TEXT = "text"
+    TIMESTAMP = "timestamp"
+    TINYBLOB = "tinyblob"
+    TINYINT = "tinyint"
+    TINYTEXT = "tinytext"
+    YEAR = "year"
+    VARBINARY = "varbinary"
+    VARCHAR = "varchar"


-class TableStat(BaseModel):
-    columns: dict[str, ColumnStat]
+class ColumnAnalysisDto(BaseModel):
+    type: DataTypeDto
+    null_allowed: bool
+    size: Optional[int] = None
+    d: Optional[int] = None
+    dfid: Optional[int] = None
+    enums: Optional[list] = None
+    sets: Optional[list] = None
+
+
+class AnalysisDto(BaseModel):
+    columns: dict[str, ColumnAnalysisDto]
+    separator: str
+    line_termination: str
+
+
+class ColumnStatDto(BaseModel):
+    val_min: Optional[float] = None
+    val_max: Optional[float] = None
+    mean: Optional[float] = None
+    median: Optional[float] = None
+    std_dev: Optional[float] = None
+
+
+class TableStatDto(BaseModel):
+    columns: dict[str, ColumnStatDto]
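
A minimal usage sketch (not part of the commit) showing how the new DTOs serialize, assuming Pydantic v2, whose model_dump_json() the service code further below relies on:

    from api.dto import AnalysisDto, ColumnAnalysisDto, DataTypeDto

    analysis = AnalysisDto(
        columns={
            "price": ColumnAnalysisDto(type=DataTypeDto.DECIMAL, null_allowed=True, size=10, d=4),
        },
        separator=",",
        line_termination="\r\n",
    )
    # Optional fields left unset (dfid, enums, sets) serialize as null.
    print(analysis.model_dump_json())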

@@ -77,10 +77,17 @@ template = {
     "openapi": "3.0.0",
     "components": {
         "schemas": {
-            "DataTypesDto": {
+            "AnalysisDto": {
                 "properties": {
                     "columns": {
-                        "$ref": "#/components/schemas/SuggestedColumnDto"
+                        "type": "array",
+                        "items": {
+                            "properties": {
+                                "column_name": {
+                                    "$ref": "#/components/schemas/ColumnAnalysisDto"
+                                }
+                            }
+                        }
                     },
                     "line_termination": {
                         "example": "\r\n",
@@ -125,12 +132,42 @@ template = {
                 ],
                 "type": "object"
            },
-            "SuggestedColumnDto": {
+            "ColumnAnalysisDto": {
                "properties": {
-                    "column_name": {
-                        "type": "string"
-                    }
+                    "type": {
+                        "type": "string",
+                        "example": "decimal"
+                    },
+                    "null_allowed": {
+                        "type": "boolean"
+                    },
+                    "size": {
+                        "type": "integer",
+                        "example": 10
+                    },
+                    "d": {
+                        "type": "integer",
+                        "example": 4
+                    },
+                    "dfid": {
+                        "type": "integer",
+                        "example": None
+                    },
+                    "enums": {
+                        "type": "array",
+                        "example": None,
+                        "properties": {
+                            "type": "string"
+                        }
+                    },
+                    "sets": {
+                        "type": "array",
+                        "example": None,
+                        "properties": {
+                            "type": "string"
+                        }
+                    }
                },
                "type": "object"
            }
        },

@@ -38,7 +38,7 @@ responses:
     content:
       application/json:
         schema:
-          $ref: '#/components/schemas/DataTypesDto'
+          $ref: '#/components/schemas/AnalysisDto'
   400:
     description: "Failed to determine data types"
     content:

@@ -9,8 +9,10 @@ import pandas
 from numpy import dtype, max, min
 from flask import current_app
+from pandas import DataFrame
 from pandas.errors import EmptyDataError
+from api.dto import ColumnAnalysisDto, DataTypeDto, AnalysisDto
 from clients.s3_client import S3Client
@@ -63,52 +65,58 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=',') -> str:
     r = {}
     for name, dataType in df.dtypes.items():
+        col = ColumnAnalysisDto(type=DataTypeDto.TEXT, null_allowed=contains_null(df[name]))
+        r[name] = col
         if dataType == dtype('float64'):
             if pandas.to_numeric(df[name], errors='coerce').notnull().all():
                 logging.debug(f"mapped column {name} from float64 to decimal")
-                r[name] = 'decimal'
+                col.type = DataTypeDto.DECIMAL
+                col.size = 10
+                col.d = 4
             else:
                 logging.debug(f"mapped column {name} from float64 to text")
-                r[name] = 'text'
+                col.type = DataTypeDto.TEXT
         elif dataType == dtype('int64'):
             min_val = min(df[name])
             max_val = max(df[name])
             if 0 <= min_val <= 1 and 0 <= max_val <= 1:
                 logging.debug(f"mapped column {name} from int64 to bool")
-                r[name] = 'bool'
+                col.type = DataTypeDto.BOOL
                 continue
             logging.debug(f"mapped column {name} from int64 to bigint")
-            r[name] = 'bigint'
+            col.type = DataTypeDto.BIGINT
+            col.size = 255
         elif dataType == dtype('O'):
             try:
                 pandas.to_datetime(df[name], format='mixed')
                 logging.debug(f"mapped column {name} from O to timestamp")
-                r[name] = 'timestamp'
+                col.type = DataTypeDto.TIMESTAMP
                 continue
             except ValueError:
                 pass
             max_size = max(df[name].astype(str).map(len))
             if max_size <= 1:
                 logging.debug(f"mapped column {name} from O to char")
-                r[name] = 'char'
-            if 0 <= max_size <= 255:
+                col.type = DataTypeDto.CHAR
+                col.size = 1
+            elif 0 <= max_size <= 255:
                 logging.debug(f"mapped column {name} from O to varchar")
-                r[name] = 'varchar'
+                col.type = DataTypeDto.VARCHAR
+                col.size = 255
             else:
                 logging.debug(f"mapped column {name} from O to text")
-                r[name] = 'text'
+                col.type = DataTypeDto.TEXT
         elif dataType == dtype('bool'):
             logging.debug(f"mapped column {name} from bool to bool")
-            r[name] = 'bool'
+            col.type = DataTypeDto.BOOL
         elif dataType == dtype('datetime64'):
             logging.debug(f"mapped column {name} from datetime64 to datetime")
-            r[name] = 'datetime'
+            col.type = DataTypeDto.DATETIME
         else:
-            logging.warning(f'default to \'text\' for column {name} and type {dtype}')
-            r[name] = 'text'
+            logging.warning(f'default to \'text\' for column {name} and type {dataType}')
-    s = {"columns": r, "separator": separator, "line_termination": line_terminator}
+    s = AnalysisDto(columns=r, separator=separator, line_termination=line_terminator)
     logging.info("Determined data types %s", s)
-    return json.dumps(s)
+    return s.model_dump_json()
@@ -116,3 +124,9 @@ def peek_line(f) -> bytes:
     line: bytes = f.readline()
     f.seek(pos)
     return line
+
+
+def contains_null(df: DataFrame) -> bool:
+    if '\\N' in df.values:
+        return True
+    return df.isnull().values.any()
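
contains_null is invoked per column (with a Series, despite the DataFrame annotation) and treats the literal \N sentinel that MySQL/MariaDB dumps use for NULL like a genuine missing value. A standalone check of that behavior (illustrative, not from the commit):

    import pandas as pd

    def contains_null(col: pd.Series) -> bool:
        if '\\N' in col.values:
            return True
        return col.isnull().values.any()

    print(contains_null(pd.Series(['a', '\\N'])))  # True: literal \N sentinel
    print(contains_null(pd.Series([1.0, None])))   # True: genuine NaN
    print(contains_null(pd.Series([1, 2])))        # False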

package at.tuwien.mapper;

import at.tuwien.test.AbstractUnitTest;
import lombok.extern.log4j.Log4j2;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;

import java.util.stream.Stream;

import static org.junit.Assert.assertEquals;

@Log4j2
@SpringBootTest
@ExtendWith(SpringExtension.class)
public class MariaDbMapperUnitTest extends AbstractUnitTest {

    @Autowired
    private MariaDbMapper mariaDbMapper;

    public static Stream<Arguments> nameToInternalName_parameters() {
        return Stream.of(
                Arguments.arguments("dash_minus", "OE/NO-027", "oe_no_027"),
                Arguments.arguments("percent", "OE%NO-027", "oe_no_027"),
                Arguments.arguments("umlaut", "OE/NÖ-027", "oe_no__027"),
                Arguments.arguments("dot", "OE.NO-027", "oe_no_027"),
                Arguments.arguments("colon", "OE:NO-027", "oe_no_027")
        );
    }

    @ParameterizedTest
    @MethodSource("nameToInternalName_parameters")
    public void nameToInternalName_succeeds(String name, String input, String expected) {

        /* test */
        assertEquals(expected, mariaDbMapper.nameToInternalName(input));
    }
}
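
The mapper implementation itself is not part of this diff; judging from the five test cases alone, nameToInternalName appears to lowercase the name, decompose accented characters, and map every non-alphanumeric character to an underscore. A hypothetical Python equivalent that reproduces all five expectations:

    import re
    import unicodedata

    def name_to_internal_name(name: str) -> str:
        # NFKD splits 'Ö' into 'O' plus a combining diaeresis; the combining
        # mark then counts as one more non-alphanumeric character, which would
        # explain the double underscore in "OE/NÖ-027" -> "oe_no__027".
        decomposed = unicodedata.normalize('NFKD', name)
        return re.sub(r'[^a-z0-9]', '_', decomposed.lower())

    assert name_to_internal_name("OE/NO-027") == "oe_no_027"
    assert name_to_internal_name("OE%NO-027") == "oe_no_027"
    assert name_to_internal_name("OE/NÖ-027") == "oe_no__027"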

@@ -59,7 +59,10 @@ public class SchemaServiceIntegrationTest extends AbstractUnitTest {
     }

     @Test
-    public void inspectTable_succeeds() throws TableNotFoundException, SQLException {
+    public void inspectTable_sameNameDifferentDb_succeeds() throws TableNotFoundException, SQLException {
+
+        /* mock */
+        MariaDbConfig.execute(DATABASE_2_PRIVILEGED_DTO, "CREATE TABLE not_in_metadata_db (wrong_id BIGINT NOT NULL PRIMARY KEY, given_name VARCHAR(255) NOT NULL, middle_name VARCHAR(255), family_name VARCHAR(255) NOT NULL, age INT NOT NULL) WITH SYSTEM VERSIONING;");

         /* test */
         final TableDto response = schemaService.inspectTable(DATABASE_1_PRIVILEGED_DTO, "not_in_metadata_db");

@@ -509,19 +509,17 @@ export default {
       analyseService.suggest(payload)
         .then((analysis) => {
           const {columns, separator, line_termination} = analysis
-          const queryService = useQueryService()
-          const dataTypes = queryService.mySql8DataTypes()
           this.columns = Object.entries(columns)
-            .map(([key, val]) => {
+            .map(([name, analyse]) => {
               return {
-                name: key,
-                type: val,
-                null_allowed: true,
+                name: name,
+                type: analyse.type,
+                null_allowed: analyse.null_allowed,
                 primary_key: false,
-                size: dataTypes.filter(d => d.value === val).length > 0 ? dataTypes.filter(d => d.value === val)[0].defaultSize : null,
-                d: dataTypes.filter(d => d.value === val).length > 0 ? dataTypes.filter(d => d.value === val)[0].defaultD : null,
-                enums: [],
-                sets: []
+                size: analyse.size,
+                d: analyse.d,
+                enums: analyse.enums,
+                sets: analyse.sets
               }
             })
           this.suggestedAnalyseSeparator = separator

@@ -227,9 +227,6 @@ export default {
     database () {
       return this.cacheStore.getDatabase
     },
-    needsSequence () {
-      return this.columns.filter(c => c.primary_key).length === 0
-    },
     dateFormats () {
       if (!this.database || !('container' in this.database) || !('image' in this.database.container) || !('date_formats' in this.database.container.image)) {
         return []
@@ -287,9 +284,6 @@ export default {
       if (idx > 0) {
         return true
       }
-      if (this.needsSequence) {
-        return true
-      }
       if (this.columns[0].primary_key) {
         return false
       }

@@ -373,10 +373,14 @@ export default {
     },
     async onImport () {
       this.loadingImport = true
+      const cacheStore = useCacheStore()
+      cacheStore.reloadDatabase()
       await this.$router.push({ path: `/database/${this.$route.params.database_id}/table/${this.table.id}/import`, query: this.tableImport })
     },
     async onContinue () {
       this.loadingContinue = true
+      const cacheStore = useCacheStore()
+      cacheStore.reloadDatabase()
       await this.$router.push(`/database/${this.$route.params.database_id}/table/${this.table.id}/data`)
     }
   }