Verified commit a30d5fb4 authored by Martin Weise

Migrated to Pipfile to enforce Python 3.9

parent 4d348850
Part of 3 merge requests: !231 CI: Remove build for log-service, !228 Better error message handling in the frontend, !223 Release of version 1.4.0
.gitlab-ci.yml
@@ -374,27 +374,6 @@ scan-ui:
     reports:
       container_scanning: ./.trivy/trivy-ui-report.json
-scan-log-service:
-  image: bitnami/trivy:latest
-  stage: scan
-  only:
-    refs:
-      - dev
-      - master
-  allow_failure: true
-  script:
-    - trivy image --insecure --exit-code 0 --format template --template "@.trivy/gitlab.tpl" -o ./.trivy/trivy-log-service-report.json docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 0 docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 1 --severity CRITICAL docker.io/dbrepo/log-service:latest
-  cache:
-    paths:
-      - .trivycache/
-  artifacts:
-    when: always
-    expire_in: 1 days
-    reports:
-      container_scanning: ./.trivy/trivy-log-service-report.json
 scan-storage-service:
   image: bitnami/trivy:latest
   stage: scan
...
Pipfile 0 → 100644
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]

[dev-packages]

[requires]
python_version = "3.11"
Dockerfile
@@ -5,8 +5,11 @@ RUN apt update && apt install -y curl
 WORKDIR /app
-COPY ./requirements.txt ./requirements.txt
-RUN pip install -r requirements.txt
+COPY Pipfile Pipfile.lock ./
+RUN pip install pipenv && \
+    pipenv install gunicorn && \
+    pipenv install --system --deploy
 ENV FLASK_APP=app.py
 ENV FLASK_RUN_HOST=0.0.0.0
...
dbrepo-analyse-service/Pipfile 0 → 100644
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
boto3 = "*"
exceptiongroup = "*"
flask = "*"
flasgger = "*"
gevent = "*"
prometheus-flask-exporter = "*"
numpy = "*"
pandas = "*"
messytables = "*"
minio = "*"

[dev-packages]
coverage = "*"
pytest = "*"
testcontainers-minio = "*"

[requires]
python_version = "3.9"
This diff is collapsed.
README.md
@@ -9,3 +9,27 @@ data provenance, db description ... to the metadata database. Remark: if you use
 * Prometheus metrics [`/metrics`](http://localhost:5000/metrics)
 * Health check [`/health`](http://localhost:5000/health)
 * API
+## Development
+Install all dev dependencies from the `Pipfile`:
+```shell
+pipenv install --dev
+```
+## Test
+Run all tests in `test/`:
+```shell
+coverage run -m pytest test/test_determine_dt.py test/test_determine_pk.py test/test_s3_client.py --junitxml=report.xml
+coverage html --omit="test/*"  # optional HTML report
+```
+## Other
+Upgrading to Python 3.10+ can break `messytables`: it imports names such as `Mapping` directly from `collections`,
+which moved to `collections.abc` in Python 3.10; see the
+[StackOverflow](https://stackoverflow.com/questions/69381312/importerror-cannot-import-name-mapping-from-collections-using-python-3-10)
+post.
\ No newline at end of file
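Should an upgrade past Python 3.9 ever become necessary, one commonly cited workaround for this pitfall is to alias the moved ABCs back onto `collections` before `messytables` is first imported. This is a sketch of that workaround, not part of this commit:

```python
# Compatibility shim sketch for Python 3.10+ (assumption: messytables still
# imports Mapping and friends from `collections`, which moved to
# `collections.abc` in 3.10). Run this before the first messytables import.
import collections
import collections.abc

for _name in ("Mapping", "MutableMapping", "Sequence", "Iterable"):
    if not hasattr(collections, _name):
        setattr(collections, _name, getattr(collections.abc, _name))

import messytables  # now resolves without the ImportError
```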
import json
import numpy as np
import pandas as pd
import psycopg2
from messytables import CSVTableSet, headers_guess
from determine_dt import determine_datatypes
from psycopg2 import connect


def analysecsv(path, seper, internaldbname, dbhost, dbid, tname, header=True):
    # Connect to the metadata database
    try:
        conn = connect(dbname="fda",
                       user="postgres",
                       host="fda-metadata-db",
                       password="postgres")
        cursor = conn.cursor()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        raise
    r = {}
    # Check if the csv dimensions match the metadata
    if header:
        df = pd.read_csv(path, sep=seper)
    else:
        df = pd.read_csv(path, sep=seper, header=None)
    csvcol = df.shape[1]
    cursor.execute("select numcols from mdb_tables;")
    mdcol = cursor.fetchone()[0]  # fetchone() returns a tuple; unpack the count
    if csvcol != mdcol:
        r["dim"] = "Dimension mismatch. Specify which columns should be filled."
    # Check if the determined datatypes match
    dt = json.loads(determine_datatypes(path, separator=seper))
    if header:
        fh = open(path, 'rb')
        # Load a file object:
        table_set = CSVTableSet(fh)
        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)
        for i in dt["columns"].keys():
            cursor.execute("select datatype from mdb_columns where cdbid = %s and tname = %s and cname = %s;",
                           (dbid, tname, i,))
            res = cursor.fetchone()[0]
            if res != dt["columns"][i]:
                r["dt"] = "Datatype mismatch in {}. {} vs {}".format(i, res, dt["columns"][i])
        conn.close()
    else:
        # Without a header row, compare by ordinal position instead of name
        conn = connect(dbname=internaldbname,
                       user="postgres",
                       host=dbhost,
                       password="postgres")
        cursor = conn.cursor()
        cursor.execute("select ordinal_position, data_type from information_schema.columns where table_name = %s;",
                       (tname,))
        pos_dt = cursor.fetchall()
        ldt = list(dt["columns"].values())
        for i in range(0, len(ldt)):
            if pos_dt[i][1].lower() != ldt[i].lower():
                r["dt"] = "Datatype mismatch at ordinal position {}".format(i + 1)
        conn.close()
    # Check constraints (primary key, foreign key, nullable, other constraints?)
    conn = connect(dbname=internaldbname,
                   user="postgres",
                   host=dbhost,
                   password="postgres")
    cursor = conn.cursor()
    # Get the ordinal positions of the primary key attributes
    cursor.execute("""SELECT c.ordinal_position
        FROM information_schema.table_constraints tc
        JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_name)
        JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema
          AND tc.table_name = c.table_name AND ccu.column_name = c.column_name
        WHERE constraint_type = 'PRIMARY KEY' and tc.table_name = %s;""", (tname,))
    pk = cursor.fetchall()
    pk_flattened = [item for items in pk for item in items]
    pk_aditer = list(map(lambda x: x - 1, pk_flattened))  # ordinal positions are 1-based
    tmp = df[df.iloc[:, np.r_[pk_aditer]].duplicated()]
    if not tmp.empty:
        r["pk"] = "Rows {} violate primary key".format(tmp)
    # detect enum values
    return json.dumps(r)
\ No newline at end of file
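For illustration only, a hypothetical call to this function (all argument values below are made up, not taken from the repository):

```python
# Hypothetical invocation: validate /tmp/data.csv (comma-separated, with a
# header row) against table "weather" of database id 1; the returned JSON
# lists any dimension, datatype, or primary-key mismatches.
result = analysecsv("/tmp/data.csv", ",", "userdb", "fda-userdb", 1, "weather")
print(result)
```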
 #!/bin/bash
 python3 -m venv ./dbrepo-analyse-service/venv
 source ./dbrepo-analyse-service/venv/bin/activate
-pip install -r ./dbrepo-analyse-service/requirements.txt
+PIPENV_PIPFILE=./dbrepo-analyse-service/Pipfile pipenv install --dev
\ No newline at end of file
s3_client.py
@@ -2,6 +2,7 @@ import os
 import boto3
 import logging
+from boto3.exceptions import S3UploadFailedError
 from botocore.exceptions import ClientError
@@ -34,31 +35,24 @@ class S3Client:
             logging.error(f'Failed to find .csv at {filepath}')
             raise FileNotFoundError(f'Failed to find .csv at {filepath}')
         try:
-            if self.client.upload_file(filepath, bucket, filename) is False:
-                logging.warning(f"Failed to upload file with key {filename}")
-                raise ConnectionRefusedError(f"Failed to upload file with key {filename}")
+            self.client.upload_file(filepath, bucket, filename)
             logging.info(f"Uploaded .csv {filepath} with key {filename}")
             return True
-        except ClientError as e:
-            logging.error(e)
-            return False
+        except (ClientError, S3UploadFailedError) as e:
+            logging.warning(f"Failed to upload file with key {filename}")
+            raise ConnectionRefusedError(f"Failed to upload file with key {filename}", e)

-    def download_file(self, filename) -> bool:
+    def download_file(self, filename, bucket="dbrepo-upload"):
         """
         Downloads a file from the blob storage.
         Follows the official API https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-download-file.html
         :param filename: The filename.
-        :return: True if the file was downloaded and saved.
+        :param bucket: The bucket to download the file from.
         """
-        self.file_exists("dbrepo-upload", filename)
+        self.file_exists(bucket, filename)
         filepath = os.path.join("/tmp/", filename)
-        try:
-            self.client.download_file("dbrepo-upload", filename, filepath)
-            logging.info(f"Downloaded .csv with key {filename} into {filepath}")
-            return True
-        except ClientError:
-            logging.error(f"Failed to download file with key {filename} into {filepath}")
-            return False
+        self.client.download_file(bucket, filename, filepath)
+        logging.info(f"Downloaded .csv with key {filename} into {filepath}")

     def file_exists(self, bucket, filename):
         try:
...
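After this change, `upload_file` signals failure by raising instead of returning `False`, and `download_file` lets `ClientError` propagate. A minimal caller sketch; the import path, bucket, and file names are assumptions, not taken from the repository:

```python
# Caller sketch for the new exception-based contract. The module path
# `s3_client` and the file/bucket names below are illustrative assumptions.
from botocore.exceptions import ClientError

from s3_client import S3Client

client = S3Client()
try:
    client.upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")
    client.download_file(filename="testdt01.csv", bucket="dbrepo-upload")
except FileNotFoundError:
    print("local .csv not found")    # raised before the upload starts
except ConnectionRefusedError as e:
    print(f"upload failed: {e}")     # wraps ClientError / S3UploadFailedError
except ClientError as e:
    print(f"download failed: {e}")   # download errors now propagate
```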
determine_dt.py
@@ -52,11 +52,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
     r = {}
-    # list of rows
-    if enum == True:
-        rows = pd.read_csv(fh, sep=separator, header=offset)
-        n = len(rows)
     for i in range(0, (len(types))):
         if type(types[i]) == messytables.types.BoolType:
             r[headers[i]] = "bool"
@@ -72,29 +67,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
             r[headers[i]] = "decimal"
         elif type(types[i]) == messytables.types.StringType:
             r[headers[i]] = "varchar"
-        elif type(types[i]) == messytables.types.PercentageType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.CurrencyType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.TimeType:
-            r[headers[i]] = "time"
-        else:
-            if enum == True:
-                enum_set = set()
-                m = 0
-                is_enum = True
-                for elem in range(0, n):
-                    if (m < enum_tol * n):
-                        enum_set.add(rows.iloc[elem, i])
-                    else:
-                        is_enum = False
-                        break
-                    m = len(enum_set)
-                if is_enum:
-                    enum_set.discard(None)
-                    r[headers[i]] = {"enums": list(enum_set)}
-                else:
-                    r[headers[i]] = "text"
         else:
             r[headers[i]] = "text"
     fh.close()
...
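For reference, a usage sketch of the function's JSON contract as the rest of the codebase relies on it; the file name `data.csv` is illustrative:

```python
# determine_datatypes returns a JSON string whose "columns" object maps each
# CSV header to a database type such as "bool", "decimal", "varchar" or "text".
import json

from determine_dt import determine_datatypes

result = json.loads(determine_datatypes("data.csv", separator=","))
for column, datatype in result["columns"].items():
    print(f"{column} -> {datatype}")
```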
requirements.txt 100644 → 0
attrs==23.1.0
certifi==2023.5.7
chardet==5.1.0
charset-normalizer==2.0.12
click==8.1.3
coverage==7.1.0
docker==5.0.0
exceptiongroup==1.1.1
flasgger==0.9.5
Flask==2.2.2
gevent==21.8.0
greenlet==1.1.3.post0
html5lib==1.1
idna==3.4
importlib-metadata==6.6.0
iniconfig==2.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
json-table-schema==0.2.1
jsonschema==4.17.3
lxml==4.9.2
MarkupSafe==2.1.2
messytables==0.15.2
mistune==2.0.5
numpy==1.24.3
packaging==23.1
pandas==1.2.3
pluggy==1.0.0
prometheus-client==0.16.0
prometheus-flask-exporter==0.21.0
psycopg2-binary==2.8.6
pyrsistent==0.19.3
pytest==7.2.1
python-dateutil==2.8.2
python-magic==0.4.27
pytz==2023.3
PyYAML==6.0
requests==2.26.0
six==1.16.0
SQLAlchemy==1.4.15
tomli==2.0.1
urllib3==1.26.15
webencodings==0.5.1
websocket-client==1.5.1
Werkzeug==2.3.3
xlrd==2.0.1
zipp==3.15.0
zope.event==4.6
zope.interface==6.0
boto3==1.28.82
testcontainers-minio==0.0.1rc1
test/test_s3_client.py
@@ -21,6 +21,19 @@ class S3ClientTest(unittest.TestCase):
         response = S3Client().upload_file(filename="testdt01.csv", path="./data/")
         self.assertTrue(response)

+    # @Test
+    def test_upload_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="invalidbucket")
+        except ConnectionRefusedError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ConnectionRefusedError not raised')
+
     # @Test
     def test_upload_file_notFound_fails(self):
@@ -41,8 +54,7 @@ class S3ClientTest(unittest.TestCase):
         S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")

         # test
-        response = S3Client().download_file(filename="testdt01.csv")
-        self.assertTrue(response)
+        S3Client().download_file(filename="testdt01.csv")

     # @Test
     def test_download_file_notFound_fails(self):
@@ -57,6 +69,19 @@ class S3ClientTest(unittest.TestCase):
         else:
             self.fail('ClientError not raised')

+    # @Test
+    def test_download_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().download_file(filename="testdt01.csv", bucket="invalidbucket")
+        except ClientError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ClientError not raised')
+
     # @Test
     def test_get_file_succeeds(self):
...
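As a side note, the same expectation can be written more compactly with `unittest`'s context manager; a sketch of an equivalent form, not part of this commit:

```python
# Equivalent assertion style using assertRaises instead of try/except/else:
def test_download_bucket_notFound_fails(self):
    with self.assertRaises(ClientError):
        S3Client().download_file(filename="testdt01.csv", bucket="invalidbucket")
```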