Verified Commit a30d5fb4 authored by Martin Weise

Migrated to Pipfile to enforce Python 3.9

parent 4d348850
3 merge requests: !231 CI: Remove build for log-service, !228 Better error message handling in the frontend, !223 Release of version 1.4.0
@@ -374,27 +374,6 @@ scan-ui:
    reports:
      container_scanning: ./.trivy/trivy-ui-report.json
-scan-log-service:
-  image: bitnami/trivy:latest
-  stage: scan
-  only:
-    refs:
-      - dev
-      - master
-  allow_failure: true
-  script:
-    - trivy image --insecure --exit-code 0 --format template --template "@.trivy/gitlab.tpl" -o ./.trivy/trivy-log-service-report.json docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 0 docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 1 --severity CRITICAL docker.io/dbrepo/log-service:latest
-  cache:
-    paths:
-      - .trivycache/
-  artifacts:
-    when: always
-    expire_in: 1 days
-    reports:
-      container_scanning: ./.trivy/trivy-log-service-report.json
scan-storage-service:
  image: bitnami/trivy:latest
  stage: scan
...
Pipfile 0 → 100644
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
[dev-packages]
[requires]
python_version = "3.11"
@@ -5,8 +5,11 @@ RUN apt update && apt install -y curl
WORKDIR /app
-COPY ./requirements.txt ./requirements.txt
-RUN pip install -r requirements.txt
+COPY Pipfile Pipfile.lock ./
+RUN pip install pipenv && \
+    pipenv install gunicorn && \
+    pipenv install --system --deploy
ENV FLASK_APP=app.py
ENV FLASK_RUN_HOST=0.0.0.0
...
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
boto3 = "*"
exceptiongroup = "*"
flask = "*"
flasgger = "*"
gevent = "*"
prometheus-flask-exporter = "*"
numpy = "*"
pandas = "*"
messytables = "*"
minio = "*"
[dev-packages]
coverage = "*"
pytest = "*"
testcontainers-minio = "*"
[requires]
python_version = "3.9"
@@ -9,3 +9,27 @@ data provenance, db description ... to the metadata database. Remark: if you use
* Prometheus metrics [`/metrics`](http://localhost:5000/metrics)
* Health check [`/health`](http://localhost:5000/health)
* API
+
+## Development
+
+Install all dev dependencies from the `Pipfile`:
+
+```shell
+pipenv install --dev
+```
+
+## Test
+
+Run all tests in `test/`:
+
+```shell
+coverage run -m pytest test/test_determine_dt.py test/test_determine_pk.py test/test_s3_client.py --junitxml=report.xml
+coverage html --omit="test/*"  # optional HTML report
+```
+
+## Other
+
+There are potential issues when upgrading to Python 3.10+: `messytables` relies on the old `collections` interface,
+whose abstract base classes moved to `collections.abc` in Python 3.10, see
+the [StackOverflow](https://stackoverflow.com/questions/69381312/importerror-cannot-import-name-mapping-from-collections-using-python-3-10)
+post.
\ No newline at end of file
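For reference, a minimal sketch of the `collections` breakage the README note above describes (standard-library behaviour on Python 3.10+, independent of `messytables` internals):

```python
import sys

# Since Python 3.3 the abstract base classes live in collections.abc; the old
# aliases in the collections top-level namespace were removed in Python 3.10,
# which is what breaks messytables there.
from collections.abc import Mapping

if sys.version_info < (3, 10):
    from collections import Mapping  # raises ImportError on Python 3.10+
```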
import json

import numpy as np
import pandas as pd
import psycopg2
from messytables import CSVTableSet, headers_guess
from psycopg2 import connect

from determine_dt import determine_datatypes


def analysecsv(path, seper, internaldbname, dbhost, dbid, tname, header=True):
    # Connect to Meta database
    try:
        conn = connect(dbname="fda", user="postgres", host="fda-metadata-db", password="postgres")
        cursor = conn.cursor()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        raise
    r = {}
    # Check if csv format is suitable
    if header:
        df = pd.read_csv(path, sep=seper)
    else:
        df = pd.read_csv(path, sep=seper, header=None)
    csvcol = df.shape[1]
    # note: fetches the first row of mdb_tables and assumes it describes the table at hand
    cursor.execute("select numcols from mdb_tables;")
    mdcol = cursor.fetchone()[0]
    if csvcol != mdcol:
        r["dim"] = "Dimension mismatch. Specify which columns should be filled."
    # Check if the determined datatypes match
    dt = json.loads(determine_datatypes(path, separator=seper))
    if header:
        with open(path, 'rb') as fh:
            # a table set is a collection of tables
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            # guess header names and the offset of the header
            offset, headers = headers_guess(row_set.sample)
        # compare the guessed datatype of each column with the metadata database
        for i in dt["columns"].keys():
            cursor.execute("select datatype from mdb_columns where cdbid = %s and tname = %s and cname = %s;",
                           (dbid, tname, i,))
            res = cursor.fetchone()
            if res is None or res[0] != dt["columns"][i]:
                r["dt"] = "Datatype mismatch in {}. {} vs {}".format(i, res, dt["columns"][i])
        conn.close()
    else:
        # without a header row, compare datatypes by ordinal position instead
        conn = connect(dbname=internaldbname, user="postgres", host=dbhost, password="postgres")
        cursor = conn.cursor()
        cursor.execute("select ordinal_position, data_type from information_schema.columns where table_name = %s;",
                       (tname,))
        pos_dt = cursor.fetchall()
        ldt = list(dt["columns"].values())
        for i in range(0, len(ldt)):
            if pos_dt[i][1].lower() != ldt[i].lower():
                r["dt"] = "Datatype mismatch at ordinal position {}".format(i + 1)
        conn.close()
    # Check constraints (Primary key, Foreign key, nullable, other constraints?)
    conn = connect(dbname=internaldbname, user="postgres", host=dbhost, password="postgres")
    cursor = conn.cursor()
    # Get ordinal position of primary key attributes
    cursor.execute("""SELECT c.ordinal_position
                      FROM information_schema.table_constraints tc
                      JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_name)
                      JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema
                          AND tc.table_name = c.table_name AND ccu.column_name = c.column_name
                      WHERE constraint_type = 'PRIMARY KEY' and tc.table_name = %s;""", (tname,))
    pk = cursor.fetchall()
    pk_flattened = [item for items in pk for item in items]
    pk_aditer = list(map(lambda x: x - 1, pk_flattened))
    # rows that duplicate the primary key attribute combination violate the key
    tmp = df[df.iloc[:, np.r_[pk_aditer]].duplicated()]
    if not tmp.empty:
        r["pk"] = "Rows {} violate primary key".format(tmp)
    # detect enum values
    return json.dumps(r)
\ No newline at end of file
#!/bin/bash
python3 -m venv ./dbrepo-analyse-service/venv
source ./dbrepo-analyse-service/venv/bin/activate
-pip install -r ./dbrepo-analyse-service/requirements.txt
+PIPENV_PIPFILE=./dbrepo-analyse-service/Pipfile pipenv install --dev
\ No newline at end of file
@@ -2,6 +2,7 @@ import os
import boto3
import logging
+from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ClientError
@@ -34,31 +35,24 @@ class S3Client:
            logging.error(f'Failed to find .csv at {filepath}')
            raise FileNotFoundError(f'Failed to find .csv at {filepath}')
        try:
-            if self.client.upload_file(filepath, bucket, filename) is False:
-                logging.warning(f"Failed to upload file with key {filename}")
-                raise ConnectionRefusedError(f"Failed to upload file with key {filename}")
+            self.client.upload_file(filepath, bucket, filename)
            logging.info(f"Uploaded .csv {filepath} with key {filename}")
            return True
-        except ClientError as e:
-            logging.error(e)
-            return False
+        except (ClientError, S3UploadFailedError) as e:
+            logging.warning(f"Failed to upload file with key {filename}")
+            raise ConnectionRefusedError(f"Failed to upload file with key {filename}", e)
-    def download_file(self, filename) -> bool:
+    def download_file(self, filename, bucket="dbrepo-upload"):
        """
        Downloads a file from the blob storage.
        Follows the official API https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-download-file.html
        :param filename: The filename.
-        :return: True if the file was downloaded and saved.
+        :param bucket: The bucket to download the file from.
        """
-        self.file_exists("dbrepo-upload", filename)
+        self.file_exists(bucket, filename)
        filepath = os.path.join("/tmp/", filename)
-        try:
-            self.client.download_file("dbrepo-upload", filename, filepath)
-            logging.info(f"Downloaded .csv with key {filename} into {filepath}")
-            return True
-        except ClientError:
-            logging.error(f"Failed to download file with key {filename} into {filepath}")
-            return False
+        self.client.download_file(bucket, filename, filepath)
+        logging.info(f"Downloaded .csv with key {filename} into {filepath}")
    def file_exists(self, bucket, filename):
        try:
...
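With this change, callers get exceptions instead of boolean return values. A hedged usage sketch of the new call sites (the import path is an assumption; bucket and file names are illustrative, mirroring the tests below):

```python
from s3_client import S3Client  # assumed import path

client = S3Client()
try:
    # upload_file() now raises ConnectionRefusedError on upload failure
    # instead of returning False
    client.upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")
    # download_file() now takes the bucket as a parameter (default
    # "dbrepo-upload") and stores the file under /tmp/<filename>
    client.download_file(filename="testdt01.csv", bucket="dbrepo-upload")
except (FileNotFoundError, ConnectionRefusedError) as e:
    print(f"transfer failed: {e}")
```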
@@ -52,11 +52,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
    r = {}
-    # list of rows
-    if enum == True:
-        rows = pd.read_csv(fh, sep=separator, header=offset)
-        n = len(rows)
    for i in range(0, (len(types))):
        if type(types[i]) == messytables.types.BoolType:
            r[headers[i]] = "bool"
@@ -72,29 +67,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
            r[headers[i]] = "decimal"
        elif type(types[i]) == messytables.types.StringType:
            r[headers[i]] = "varchar"
-        elif type(types[i]) == messytables.types.PercentageType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.CurrencyType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.TimeType:
-            r[headers[i]] = "time"
-        else:
-            if enum == True:
-                enum_set = set()
-                m = 0
-                is_enum = True
-                for elem in range(0, n):
-                    if (m < enum_tol * n):
-                        enum_set.add(rows.iloc[elem, i])
-                    else:
-                        is_enum = False
-                        break
-                    m = len(enum_set)
-                if is_enum:
-                    enum_set.discard(None)
-                    r[headers[i]] = {"enums": list(enum_set)}
-                else:
-                    r[headers[i]] = "text"
        else:
            r[headers[i]] = "text"
    fh.close()
...
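For context, a short sketch of calling `determine_datatypes` after this change (the file name is illustrative, and the exact JSON payload shape is an assumption based on the type names visible in this hunk):

```python
import json

from determine_dt import determine_datatypes  # import as used by the deleted analysecsv module

# determine_datatypes() guesses a SQL-ish type per CSV column via messytables
# and returns a JSON string, e.g. mapping headers to "bool", "decimal",
# "varchar" or "text" (exact shape assumed). With the enum branch removed,
# the enum/enum_tol parameters no longer have an effect and unknown types
# always fall back to "text".
types = json.loads(determine_datatypes("/tmp/testdt01.csv", separator=","))
print(types)
```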
attrs==23.1.0
certifi==2023.5.7
chardet==5.1.0
charset-normalizer==2.0.12
click==8.1.3
coverage==7.1.0
docker==5.0.0
exceptiongroup==1.1.1
flasgger==0.9.5
Flask==2.2.2
gevent==21.8.0
greenlet==1.1.3.post0
html5lib==1.1
idna==3.4
importlib-metadata==6.6.0
iniconfig==2.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
json-table-schema==0.2.1
jsonschema==4.17.3
lxml==4.9.2
MarkupSafe==2.1.2
messytables==0.15.2
mistune==2.0.5
numpy==1.24.3
packaging==23.1
pandas==1.2.3
pluggy==1.0.0
prometheus-client==0.16.0
prometheus-flask-exporter==0.21.0
psycopg2-binary==2.8.6
pyrsistent==0.19.3
pytest==7.2.1
python-dateutil==2.8.2
python-magic==0.4.27
pytz==2023.3
PyYAML==6.0
requests==2.26.0
six==1.16.0
SQLAlchemy==1.4.15
tomli==2.0.1
urllib3==1.26.15
webencodings==0.5.1
websocket-client==1.5.1
Werkzeug==2.3.3
xlrd==2.0.1
zipp==3.15.0
zope.event==4.6
zope.interface==6.0
boto3==1.28.82
testcontainers-minio==0.0.1rc1
@@ -21,6 +21,19 @@ class S3ClientTest(unittest.TestCase):
        response = S3Client().upload_file(filename="testdt01.csv", path="./data/")
        self.assertTrue(response)
+    # @Test
+    def test_upload_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="invalidbucket")
+        except ConnectionRefusedError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ConnectionRefusedError not raised')
    # @Test
    def test_upload_file_notFound_fails(self):
@@ -41,8 +54,7 @@ class S3ClientTest(unittest.TestCase):
        S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")
        # test
-        response = S3Client().download_file(filename="testdt01.csv")
-        self.assertTrue(response)
+        S3Client().download_file(filename="testdt01.csv")
    # @Test
    def test_download_file_notFound_fails(self):
@@ -57,6 +69,19 @@ class S3ClientTest(unittest.TestCase):
        else:
            self.fail('ClientError not raised')
+    # @Test
+    def test_download_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().download_file(filename="testdt01.csv", bucket="invalidbucket")
+        except ClientError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ClientError not raised')
    # @Test
    def test_get_file_succeeds(self):
...