From ac252a16e8fc7d67050973a32ae59e8834d1a375 Mon Sep 17 00:00:00 2001 From: Martin Weise <martin.weise@tuwien.ac.at> Date: Wed, 5 Mar 2025 09:30:30 +0100 Subject: [PATCH] Updated lib Signed-off-by: Martin Weise <martin.weise@tuwien.ac.at> --- lib/python/dbrepo/RestClient.py | 27 ++++++----- lib/python/dbrepo/api/mapper.py | 84 ++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 12 deletions(-) diff --git a/lib/python/dbrepo/RestClient.py b/lib/python/dbrepo/RestClient.py index 2f2467d960..f2a93151ec 100644 --- a/lib/python/dbrepo/RestClient.py +++ b/lib/python/dbrepo/RestClient.py @@ -11,7 +11,7 @@ from dbrepo.api.dto import * from dbrepo.api.exceptions import ResponseCodeError, NotExistsError, \ ForbiddenError, MalformedError, NameExistsError, QueryStoreError, ExternalSystemError, \ AuthenticationError, FormatNotAvailable, RequestError, ServiceError, ServiceConnectionError -from dbrepo.api.mapper import query_to_subset +from dbrepo.api.mapper import query_to_subset, dataframe_to_table_definition logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-6s %(message)s', level=logging.INFO, stream=sys.stdout) @@ -463,9 +463,8 @@ class RestClient: raise ResponseCodeError( f'Failed to update database schema: response code: {response.status_code} is not 200 (OK)') - def create_table(self, database_id: str, name: str, is_public: bool, is_schema_public: bool, - columns: List[CreateTableColumn], constraints: CreateTableConstraints, - description: str = None) -> TableBrief: + def create_table(self, database_id: str, name: str, is_public: bool, is_schema_public: bool, dataframe: DataFrame, + description: str = None, with_data: bool = True) -> TableBrief: """ Updates the database owner of a database with given database id. @@ -473,9 +472,9 @@ class RestClient: :param name: The name of the created table. :param is_public: The visibility of the data. If set to true the data will be publicly visible. :param is_schema_public: The visibility of the schema metadata. If set to true the schema metadata will be publicly visible. - :param constraints: The constraints of the created table. - :param columns: The columns of the created table. + :param dataframe: The `pandas` dataframe. :param description: The description of the created table. Optional. + :param with_data: If set to `True`, the data will be included in the new table. Optional. Default: `True`. :returns: The table, if successful. @@ -488,12 +487,18 @@ class RestClient: :raises ResponseCodeError: If something went wrong with the creation. """ url = f'/api/database/{database_id}/table' + columns, constraints = dataframe_to_table_definition(dataframe) response = self._wrapper(method="post", url=url, force_auth=True, payload=CreateTable(name=name, is_public=is_public, is_schema_public=is_schema_public, description=description, columns=columns, constraints=constraints)) if response.status_code == 201: body = response.json() - return TableBrief.model_validate(body) + table = TableBrief.model_validate(body) + if with_data: + self.import_table_data(database_id=database_id, + table_id=table.id, + dataframe=dataframe.reset_index()) + return table if response.status_code == 400: raise MalformedError(f'Failed to create table: {response.text}') if response.status_code == 403: @@ -919,9 +924,9 @@ class RestClient: :raises ResponseCodeError: If something went wrong with the insert. """ - url = f'/api/upload' buffer = BytesIO() - dataframe.to_csv(path_or_buf=buffer, header=False, index=False) + dataframe.to_csv(path_or_buf=buffer, header=True, index=False) + url = f'/api/upload' response = self._wrapper(method="post", url=url, force_auth=True, files={'file': ('dataframe.csv', buffer.getvalue())}) if response.status_code == 201: @@ -949,8 +954,8 @@ class RestClient: url = f'/api/database/{database_id}/table/{table_id}/data/import' response = self._wrapper(method="post", url=url, force_auth=True, - payload=Import(location=self._upload(dataframe), separator=',', quote='"', - header=True, line_termination='\n')) + payload=Import(location=self._upload(dataframe), separator=',', quote='"', header=True, + line_termination='\n')) if response.status_code == 202: return if response.status_code == 400: diff --git a/lib/python/dbrepo/api/mapper.py b/lib/python/dbrepo/api/mapper.py index aca29be040..bede7b3838 100644 --- a/lib/python/dbrepo/api/mapper.py +++ b/lib/python/dbrepo/api/mapper.py @@ -1,4 +1,11 @@ -from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order +import logging + +import pandas +from numpy import dtype +from pandas import DataFrame, Series + +from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order, CreateTableColumn, \ + CreateTableConstraints, ColumnType from dbrepo.api.exceptions import MalformedError @@ -38,3 +45,78 @@ def query_to_subset(database: Database, image: Image, query: QueryDefinition) -> raise MalformedError(f'Failed to create view: order column name not found in database') orders.append(Order(column_id=order_column_ids[0], direction=order.direction)) return Subset(table_id=tables[0].id, columns=filtered_column_ids, filter=filters, order=orders) + + +def dataframe_to_table_definition(dataframe: DataFrame) -> ([CreateTableColumn], CreateTableConstraints): + if dataframe.index.name is None: + raise MalformedError(f'Failed to map dataframe: index not set') + constraints = CreateTableConstraints(uniques=[], + checks=[], + foreign_keys=[], + primary_key=dataframe.index.names) + dataframe = dataframe.reset_index() + columns = [] + for name, series in dataframe.items(): + column = CreateTableColumn(name=str(name), + type=ColumnType.TEXT, + null_allowed=contains_null(dataframe[name])) + if series.dtype == dtype('float64'): + if pandas.to_numeric(dataframe[name], errors='coerce').notnull().all(): + logging.debug(f"mapped column {name} from float64 to decimal") + column.type = ColumnType.DECIMAL + column.size = 40 + column.d = 20 + else: + logging.debug(f"mapped column {name} from float64 to text") + column.type = ColumnType.TEXT + elif series.dtype == dtype('int64'): + min_val = min(dataframe[name]) + max_val = max(dataframe[name]) + if 0 <= min_val <= 1 and 0 <= max_val <= 1 and 'id' not in name: + logging.debug(f"mapped column {name} from int64 to bool") + column.type = ColumnType.BOOL + columns.append(column) + continue + logging.debug(f"mapped column {name} from int64 to bigint") + column.type = ColumnType.BIGINT + elif series.dtype == dtype('O'): + try: + pandas.to_datetime(dataframe[name], format='mixed') + if dataframe[name].str.contains(':').any(): + logging.debug(f"mapped column {name} from O to timestamp") + column.type = ColumnType.TIMESTAMP + columns.append(column) + continue + logging.debug(f"mapped column {name} from O to date") + column.type = ColumnType.DATE + columns.append(column) + continue + except ValueError: + pass + max_size = max(dataframe[name].astype(str).map(len)) + if max_size <= 1: + logging.debug(f"mapped column {name} from O to char") + column.type = ColumnType.CHAR + column.size = 1 + if 0 <= max_size <= 255: + logging.debug(f"mapped column {name} from O to varchar") + column.type = ColumnType.VARCHAR + column.size = 255 + else: + logging.debug(f"mapped column {name} from O to text") + column.type = ColumnType.TEXT + elif series.dtype == dtype('bool'): + logging.debug(f"mapped column {name} from bool to bool") + column.type = ColumnType.BOOL + elif series.dtype == dtype('datetime64'): + logging.debug(f"mapped column {name} from datetime64 to datetime") + column.type = ColumnType.DATETIME + else: + logging.warning(f'default to \'text\' for column {name} and type {dtype}') + columns.append(column) + return columns, constraints + +def contains_null(dataframe: DataFrame) -> bool: + if '\\N' in dataframe.values: + return True + return dataframe.isnull().values.any() -- GitLab