Skip to content
Snippets Groups Projects

WIP

Merged Martin Weise requested to merge 496-create-table-from-dataframe-in-the-python-library into dev
2 files
+ 99
12
Compare changes
  • Side-by-side
  • Inline
Files
2
from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order
import logging
import pandas
from numpy import dtype
from pandas import DataFrame, Series
from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order, CreateTableColumn, \
CreateTableConstraints, ColumnType
from dbrepo.api.exceptions import MalformedError
@@ -38,3 +45,78 @@ def query_to_subset(database: Database, image: Image, query: QueryDefinition) ->
raise MalformedError(f'Failed to create view: order column name not found in database')
orders.append(Order(column_id=order_column_ids[0], direction=order.direction))
return Subset(table_id=tables[0].id, columns=filtered_column_ids, filter=filters, order=orders)
def dataframe_to_table_definition(dataframe: DataFrame) -> ([CreateTableColumn], CreateTableConstraints):
if dataframe.index.name is None:
raise MalformedError(f'Failed to map dataframe: index not set')
constraints = CreateTableConstraints(uniques=[],
checks=[],
foreign_keys=[],
primary_key=dataframe.index.names)
dataframe = dataframe.reset_index()
columns = []
for name, series in dataframe.items():
column = CreateTableColumn(name=str(name),
type=ColumnType.TEXT,
null_allowed=contains_null(dataframe[name]))
if series.dtype == dtype('float64'):
if pandas.to_numeric(dataframe[name], errors='coerce').notnull().all():
logging.debug(f"mapped column {name} from float64 to decimal")
column.type = ColumnType.DECIMAL
column.size = 40
column.d = 20
else:
logging.debug(f"mapped column {name} from float64 to text")
column.type = ColumnType.TEXT
elif series.dtype == dtype('int64'):
min_val = min(dataframe[name])
max_val = max(dataframe[name])
if 0 <= min_val <= 1 and 0 <= max_val <= 1 and 'id' not in name:
logging.debug(f"mapped column {name} from int64 to bool")
column.type = ColumnType.BOOL
columns.append(column)
continue
logging.debug(f"mapped column {name} from int64 to bigint")
column.type = ColumnType.BIGINT
elif series.dtype == dtype('O'):
try:
pandas.to_datetime(dataframe[name], format='mixed')
if dataframe[name].str.contains(':').any():
logging.debug(f"mapped column {name} from O to timestamp")
column.type = ColumnType.TIMESTAMP
columns.append(column)
continue
logging.debug(f"mapped column {name} from O to date")
column.type = ColumnType.DATE
columns.append(column)
continue
except ValueError:
pass
max_size = max(dataframe[name].astype(str).map(len))
if max_size <= 1:
logging.debug(f"mapped column {name} from O to char")
column.type = ColumnType.CHAR
column.size = 1
if 0 <= max_size <= 255:
logging.debug(f"mapped column {name} from O to varchar")
column.type = ColumnType.VARCHAR
column.size = 255
else:
logging.debug(f"mapped column {name} from O to text")
column.type = ColumnType.TEXT
elif series.dtype == dtype('bool'):
logging.debug(f"mapped column {name} from bool to bool")
column.type = ColumnType.BOOL
elif series.dtype == dtype('datetime64'):
logging.debug(f"mapped column {name} from datetime64 to datetime")
column.type = ColumnType.DATETIME
else:
logging.warning(f'default to \'text\' for column {name} and type {dtype}')
columns.append(column)
return columns, constraints
def contains_null(dataframe: DataFrame) -> bool:
if '\\N' in dataframe.values:
return True
return dataframe.isnull().values.any()
Loading