Updated lib

Signed-off-by: Martin Weise <martin.weise@tuwien.ac.at>

Updated lib
ac252a16 · Martin Weise · 932bdc48 · ac252a16 · ac252a16
Verified Commit ac252a16 authored 4 months ago by Martin Weise
--- a/lib/python/dbrepo/RestClient.py
+++ b/lib/python/dbrepo/RestClient.py
@@ -11,7 +11,7 @@ from dbrepo.api.dto import *
 from dbrepo.api.exceptions import ResponseCodeError, NotExistsError, \
    ForbiddenError, MalformedError, NameExistsError, QueryStoreError, ExternalSystemError, \
    AuthenticationError, FormatNotAvailable, RequestError, ServiceError, ServiceConnectionError
-from dbrepo.api.mapper import query_to_subset
+from dbrepo.api.mapper import query_to_subset, dataframe_to_table_definition

 logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-6s %(message)s', level=logging.INFO,
                    stream=sys.stdout)
@@ -463,9 +463,8 @@ class RestClient:
        raise ResponseCodeError(
            f'Failed to update database schema: response code: {response.status_code} is not 200 (OK)')

-    def create_table(self, database_id: str, name: str, is_public: bool, is_schema_public: bool,
-                     columns: List[CreateTableColumn], constraints: CreateTableConstraints,
-                     description: str = None) -> TableBrief:
+    def create_table(self, database_id: str, name: str, is_public: bool, is_schema_public: bool, dataframe: DataFrame,
+                     description: str = None, with_data: bool = True) -> TableBrief:
        """
        Updates the database owner of a database with given database id.

@@ -473,9 +472,9 @@ class RestClient:
        :param name: The name of the created table.
        :param is_public: The visibility of the data. If set to true the data will be publicly visible.
        :param is_schema_public: The visibility of the schema metadata. If set to true the schema metadata will be publicly visible.
-        :param constraints: The constraints of the created table.
-        :param columns: The columns of the created table.
+        :param dataframe: The `pandas` dataframe.
        :param description: The description of the created table. Optional.
+        :param with_data: If set to `True`, the data will be included in the new table. Optional. Default: `True`.

        :returns: The table, if successful.

@@ -488,12 +487,18 @@ class RestClient:
        :raises ResponseCodeError: If something went wrong with the creation.
        """
        url = f'/api/database/{database_id}/table'
+        columns, constraints = dataframe_to_table_definition(dataframe)
        response = self._wrapper(method="post", url=url, force_auth=True,
                                 payload=CreateTable(name=name, is_public=is_public, is_schema_public=is_schema_public,
                                                     description=description, columns=columns, constraints=constraints))
        if response.status_code == 201:
            body = response.json()
-            return TableBrief.model_validate(body)
+            table = TableBrief.model_validate(body)
+            if with_data:
+                self.import_table_data(database_id=database_id,
+                                       table_id=table.id,
+                                       dataframe=dataframe.reset_index())
+            return table
        if response.status_code == 400:
            raise MalformedError(f'Failed to create table: {response.text}')
        if response.status_code == 403:
@@ -919,9 +924,9 @@ class RestClient:
        
        :raises ResponseCodeError: If something went wrong with the insert.
        """
-        url = f'/api/upload'
        buffer = BytesIO()
-        dataframe.to_csv(path_or_buf=buffer, header=False, index=False)
+        dataframe.to_csv(path_or_buf=buffer, header=True, index=False)
+        url = f'/api/upload'
        response = self._wrapper(method="post", url=url, force_auth=True,
                                 files={'file': ('dataframe.csv', buffer.getvalue())})
        if response.status_code == 201:
@@ -949,8 +954,8 @@ class RestClient:

        url = f'/api/database/{database_id}/table/{table_id}/data/import'
        response = self._wrapper(method="post", url=url, force_auth=True,
-                                 payload=Import(location=self._upload(dataframe), separator=',', quote='"',
-                                                header=True, line_termination='\n'))
+                                 payload=Import(location=self._upload(dataframe), separator=',', quote='"', header=True,
+                                                line_termination='\n'))
        if response.status_code == 202:
            return
        if response.status_code == 400:

--- a/lib/python/dbrepo/api/mapper.py
+++ b/lib/python/dbrepo/api/mapper.py
-from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order
+import logging
+
+import pandas
+from numpy import dtype
+from pandas import DataFrame, Series
+
+from dbrepo.api.dto import Subset, QueryDefinition, Database, Table, Image, Filter, Order, CreateTableColumn, \
+    CreateTableConstraints, ColumnType
 from dbrepo.api.exceptions import MalformedError


@@ -38,3 +45,78 @@ def query_to_subset(database: Database, image: Image, query: QueryDefinition) ->
                raise MalformedError(f'Failed to create view: order column name not found in database')
            orders.append(Order(column_id=order_column_ids[0], direction=order.direction))
    return Subset(table_id=tables[0].id, columns=filtered_column_ids, filter=filters, order=orders)
+
+
+def dataframe_to_table_definition(dataframe: DataFrame) -> ([CreateTableColumn], CreateTableConstraints):
+    """
+    Derives the column definitions and table constraints of a new table from a `pandas` dataframe.
+
+    The index of the dataframe becomes the primary key, therefore every index level must be named. The index is
+    then turned into regular column(s) and each column is mapped to a column type based on its dtype:
+
+    * `float64` is mapped to decimal (size 40, 20 decimals) when no value is missing, text otherwise.
+    * `int64` is mapped to bool when all values are 0 or 1 and the name does not contain `id`, bigint otherwise.
+    * `object` is mapped to timestamp or date when all values parse as dates, otherwise to char, varchar or
+      text depending on the longest value.
+    * `bool` is mapped to bool and any `datetime64` dtype is mapped to datetime.
+    * Everything else defaults to text.
+
+    :param dataframe: The `pandas` dataframe. The index (all levels of it) must be named.
+
+    :returns: The tuple of column definitions and table constraints.
+
+    :raises MalformedError: If the index or one of its levels has no name.
+    """
+    index_names = list(dataframe.index.names)
+    # inspect the level names: a MultiIndex does not expose its names through `index.name`
+    if any(index_name is None for index_name in index_names):
+        raise MalformedError('Failed to map dataframe: index not set')
+    constraints = CreateTableConstraints(uniques=[],
+                                         checks=[],
+                                         foreign_keys=[],
+                                         primary_key=[str(index_name) for index_name in index_names])
+    # the primary key column(s) must be part of the table, so move the index into regular columns
+    frame = dataframe.reset_index()
+    columns = []
+    for name, series in frame.items():
+        # every column starts as text and is narrowed below when the dtype allows it
+        column = CreateTableColumn(name=str(name),
+                                   type=ColumnType.TEXT,
+                                   null_allowed=contains_null(series))
+        if series.dtype == dtype('float64'):
+            # a float column with missing values stays text
+            if pandas.to_numeric(series, errors='coerce').notnull().all():
+                logging.debug(f"mapped column {name} from float64 to decimal")
+                column.type = ColumnType.DECIMAL
+                column.size = 40
+                column.d = 20
+            else:
+                logging.debug(f"mapped column {name} from float64 to text")
+                column.type = ColumnType.TEXT
+        elif series.dtype == dtype('int64'):
+            # heuristic: only 0/1 values means boolean flag, unless the name hints at an identifier;
+            # Series.min/max (unlike the builtins) do not fail on an empty column
+            if 0 <= series.min() <= 1 and 0 <= series.max() <= 1 and 'id' not in str(name):
+                logging.debug(f"mapped column {name} from int64 to bool")
+                column.type = ColumnType.BOOL
+            else:
+                logging.debug(f"mapped column {name} from int64 to bigint")
+                column.type = ColumnType.BIGINT
+        elif series.dtype == dtype('O'):
+            temporal_type = None
+            try:
+                pandas.to_datetime(series, format='mixed')
+                # values with a colon carry a time component
+                temporal_type = ColumnType.TIMESTAMP if series.str.contains(':').any() else ColumnType.DATE
+            except ValueError:
+                # not parseable as date/time, fall through to the string types
+                pass
+            if temporal_type == ColumnType.TIMESTAMP:
+                logging.debug(f"mapped column {name} from O to timestamp")
+                column.type = temporal_type
+            elif temporal_type == ColumnType.DATE:
+                logging.debug(f"mapped column {name} from O to date")
+                column.type = temporal_type
+            else:
+                max_size = series.astype(str).map(len).max()
+                if max_size <= 1:
+                    logging.debug(f"mapped column {name} from O to char")
+                    column.type = ColumnType.CHAR
+                    column.size = 1
+                elif max_size <= 255:
+                    # must be `elif`, otherwise the char mapping above is always overwritten
+                    logging.debug(f"mapped column {name} from O to varchar")
+                    column.type = ColumnType.VARCHAR
+                    column.size = 255
+                else:
+                    logging.debug(f"mapped column {name} from O to text")
+                    column.type = ColumnType.TEXT
+        elif series.dtype == dtype('bool'):
+            logging.debug(f"mapped column {name} from bool to bool")
+            column.type = ColumnType.BOOL
+        elif pandas.api.types.is_datetime64_any_dtype(series.dtype):
+            # comparing with the generic dtype('datetime64') never matches the unit-specific datetime64[ns]
+            logging.debug(f"mapped column {name} from datetime64 to datetime")
+            column.type = ColumnType.DATETIME
+        else:
+            logging.warning(f'default to \'text\' for column {name} and type {series.dtype}')
+        columns.append(column)
+    return columns, constraints
+
+def contains_null(dataframe: DataFrame) -> bool:
+    """
+    Checks if the given data contains at least one missing value.
+
+    A value counts as missing when it is the literal marker `\\N` or when `pandas` reports it as null.
+
+    :param dataframe: The `pandas` data to inspect. Within this module it is called with a single column.
+
+    :returns: True if at least one value is missing, false otherwise.
+    """
+    has_null_marker = '\\N' in dataframe.values
+    return True if has_null_marker else dataframe.isnull().values.any()