Verified Commit a30d5fb4 authored by Martin Weise

Migrated to Pipfile to enforce Python 3.9

parent 4d348850
3 merge requests: !231 CI: Remove build for log-service, !228 Better error message handling in the frontend, !223 Release of version 1.4.0
@@ -374,27 +374,6 @@ scan-ui:
    reports:
      container_scanning: ./.trivy/trivy-ui-report.json
-scan-log-service:
-  image: bitnami/trivy:latest
-  stage: scan
-  only:
-    refs:
-      - dev
-      - master
-  allow_failure: true
-  script:
-    - trivy image --insecure --exit-code 0 --format template --template "@.trivy/gitlab.tpl" -o ./.trivy/trivy-log-service-report.json docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 0 docker.io/dbrepo/log-service:latest
-    - trivy image --insecure --exit-code 1 --severity CRITICAL docker.io/dbrepo/log-service:latest
-  cache:
-    paths:
-      - .trivycache/
-  artifacts:
-    when: always
-    expire_in: 1 days
-    reports:
-      container_scanning: ./.trivy/trivy-log-service-report.json
scan-storage-service:
  image: bitnami/trivy:latest
  stage: scan
...
Pipfile 0 → 100644
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
[dev-packages]
[requires]
python_version = "3.11"
@@ -5,8 +5,11 @@ RUN apt update && apt install -y curl
WORKDIR /app
-COPY ./requirements.txt ./requirements.txt
-RUN pip install -r requirements.txt
+COPY Pipfile Pipfile.lock ./
+RUN pip install pipenv && \
+    pipenv install gunicorn && \
+    pipenv install --system --deploy
ENV FLASK_APP=app.py
ENV FLASK_RUN_HOST=0.0.0.0
...
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
boto3 = "*"
exceptiongroup = "*"
flask = "*"
flasgger = "*"
gevent = "*"
prometheus-flask-exporter = "*"
numpy = "*"
pandas = "*"
messytables = "*"
minio = "*"
[dev-packages]
coverage = "*"
pytest = "*"
testcontainers-minio = "*"
[requires]
python_version = "3.9"
@@ -9,3 +9,27 @@ data provenance, db description ... to the metadata database. Remark: if you use
* Prometheus metrics [`/metrics`](http://localhost:5000/metrics)
* Health check [`/health`](http://localhost:5000/health)
* API
+
+## Development
+
+Install all dev dependencies from the `Pipfile`:
+
+```shell
+pipenv install --dev
+```
+
+## Test
+
+Run all tests in `test/`:
+
+```shell
+coverage run -m pytest test/test_determine_dt.py test/test_determine_pk.py test/test_s3_client.py --junitxml=report.xml
+coverage html --omit="test/*"  # optional HTML report
+```
+
+## Other
+
+There are potential issues when upgrading to Python 3.10+: `messytables` relies on the old `collections` interface,
+whose abstract base classes moved to `collections.abc` in Python 3.10, see
+the [StackOverflow](https://stackoverflow.com/questions/69381312/importerror-cannot-import-name-mapping-from-collections-using-python-3-10)
+post.
\ No newline at end of file
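For reference, a minimal sketch of the `collections` breakage the README note above describes (standard-library behaviour on Python 3.10+, independent of `messytables` internals):

```python
import sys

# Since Python 3.3 the abstract base classes live in collections.abc; the old
# aliases in the collections top-level namespace were removed in Python 3.10,
# which is what breaks messytables there.
from collections.abc import Mapping

if sys.version_info < (3, 10):
    from collections import Mapping  # raises ImportError on Python 3.10+
```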
import json

import numpy as np
import pandas as pd
import psycopg2
from messytables import CSVTableSet, headers_guess
from psycopg2 import connect

from determine_dt import determine_datatypes


def analysecsv(path, seper, internaldbname, dbhost, dbid, tname, header=True):
    # Connect to Meta database
    try:
        conn = connect(dbname="fda", user="postgres", host="fda-metadata-db", password="postgres")
        cursor = conn.cursor()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        raise
    r = {}
    # Check if csv format is suitable
    if header:
        df = pd.read_csv(path, sep=seper)
    else:
        df = pd.read_csv(path, sep=seper, header=None)
    csvcol = df.shape[1]
    # note: fetches the first row of mdb_tables and assumes it describes the table at hand
    cursor.execute("select numcols from mdb_tables;")
    mdcol = cursor.fetchone()[0]
    if csvcol != mdcol:
        r["dim"] = "Dimension mismatch. Specify which columns should be filled."
    # Check if the determined datatypes match
    dt = json.loads(determine_datatypes(path, separator=seper))
    if header:
        with open(path, 'rb') as fh:
            # a table set is a collection of tables
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            # guess header names and the offset of the header
            offset, headers = headers_guess(row_set.sample)
        # compare the guessed datatype of each column with the metadata database
        for i in dt["columns"].keys():
            cursor.execute("select datatype from mdb_columns where cdbid = %s and tname = %s and cname = %s;",
                           (dbid, tname, i,))
            res = cursor.fetchone()
            if res is None or res[0] != dt["columns"][i]:
                r["dt"] = "Datatype mismatch in {}. {} vs {}".format(i, res, dt["columns"][i])
        conn.close()
    else:
        # without a header row, compare datatypes by ordinal position instead
        conn = connect(dbname=internaldbname, user="postgres", host=dbhost, password="postgres")
        cursor = conn.cursor()
        cursor.execute("select ordinal_position, data_type from information_schema.columns where table_name = %s;",
                       (tname,))
        pos_dt = cursor.fetchall()
        ldt = list(dt["columns"].values())
        for i in range(0, len(ldt)):
            if pos_dt[i][1].lower() != ldt[i].lower():
                r["dt"] = "Datatype mismatch at ordinal position {}".format(i + 1)
        conn.close()
    # Check constraints (Primary key, Foreign key, nullable, other constraints?)
    conn = connect(dbname=internaldbname, user="postgres", host=dbhost, password="postgres")
    cursor = conn.cursor()
    # Get ordinal position of primary key attributes
    cursor.execute("""SELECT c.ordinal_position
                      FROM information_schema.table_constraints tc
                      JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_name)
                      JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema
                          AND tc.table_name = c.table_name AND ccu.column_name = c.column_name
                      WHERE constraint_type = 'PRIMARY KEY' and tc.table_name = %s;""", (tname,))
    pk = cursor.fetchall()
    pk_flattened = [item for items in pk for item in items]
    pk_aditer = list(map(lambda x: x - 1, pk_flattened))
    # rows that duplicate the primary key attribute combination violate the key
    tmp = df[df.iloc[:, np.r_[pk_aditer]].duplicated()]
    if not tmp.empty:
        r["pk"] = "Rows {} violate primary key".format(tmp)
    # detect enum values
    return json.dumps(r)
\ No newline at end of file
#!/bin/bash
python3 -m venv ./dbrepo-analyse-service/venv
source ./dbrepo-analyse-service/venv/bin/activate
-pip install -r ./dbrepo-analyse-service/requirements.txt
+PIPENV_PIPFILE=./dbrepo-analyse-service/Pipfile pipenv install --dev
\ No newline at end of file
@@ -2,6 +2,7 @@ import os
import boto3
import logging
+from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ClientError
@@ -34,31 +35,24 @@ class S3Client:
            logging.error(f'Failed to find .csv at {filepath}')
            raise FileNotFoundError(f'Failed to find .csv at {filepath}')
        try:
-            if self.client.upload_file(filepath, bucket, filename) is False:
-                logging.warning(f"Failed to upload file with key {filename}")
-                raise ConnectionRefusedError(f"Failed to upload file with key {filename}")
+            self.client.upload_file(filepath, bucket, filename)
            logging.info(f"Uploaded .csv {filepath} with key {filename}")
            return True
-        except ClientError as e:
-            logging.error(e)
-            return False
+        except (ClientError, S3UploadFailedError) as e:
+            logging.warning(f"Failed to upload file with key {filename}")
+            raise ConnectionRefusedError(f"Failed to upload file with key {filename}", e)
-    def download_file(self, filename) -> bool:
+    def download_file(self, filename, bucket="dbrepo-upload"):
        """
        Downloads a file from the blob storage.
        Follows the official API https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-download-file.html
        :param filename: The filename.
-        :return: True if the file was downloaded and saved.
+        :param bucket: The bucket to download the file from.
        """
-        self.file_exists("dbrepo-upload", filename)
+        self.file_exists(bucket, filename)
        filepath = os.path.join("/tmp/", filename)
-        try:
-            self.client.download_file("dbrepo-upload", filename, filepath)
-            logging.info(f"Downloaded .csv with key {filename} into {filepath}")
-            return True
-        except ClientError:
-            logging.error(f"Failed to download file with key {filename} into {filepath}")
-            return False
+        self.client.download_file(bucket, filename, filepath)
+        logging.info(f"Downloaded .csv with key {filename} into {filepath}")
    def file_exists(self, bucket, filename):
        try:
...
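With this change, callers get exceptions instead of boolean return values. A hedged usage sketch of the new call sites (the import path is an assumption; bucket and file names are illustrative, mirroring the tests below):

```python
from s3_client import S3Client  # assumed import path

client = S3Client()
try:
    # upload_file() now raises ConnectionRefusedError on upload failure
    # instead of returning False
    client.upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")
    # download_file() now takes the bucket as a parameter (default
    # "dbrepo-upload") and stores the file under /tmp/<filename>
    client.download_file(filename="testdt01.csv", bucket="dbrepo-upload")
except (FileNotFoundError, ConnectionRefusedError) as e:
    print(f"transfer failed: {e}")
```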
@@ -52,11 +52,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
    r = {}
-    # list of rows
-    if enum == True:
-        rows = pd.read_csv(fh, sep=separator, header=offset)
-        n = len(rows)
    for i in range(0, (len(types))):
        if type(types[i]) == messytables.types.BoolType:
            r[headers[i]] = "bool"
@@ -72,29 +67,6 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
            r[headers[i]] = "decimal"
        elif type(types[i]) == messytables.types.StringType:
            r[headers[i]] = "varchar"
-        elif type(types[i]) == messytables.types.PercentageType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.CurrencyType:
-            r[headers[i]] = "double"
-        elif type(types[i]) == messytables.types.TimeType:
-            r[headers[i]] = "time"
-        else:
-            if enum == True:
-                enum_set = set()
-                m = 0
-                is_enum = True
-                for elem in range(0, n):
-                    if (m < enum_tol * n):
-                        enum_set.add(rows.iloc[elem, i])
-                    else:
-                        is_enum = False
-                        break
-                    m = len(enum_set)
-                if is_enum:
-                    enum_set.discard(None)
-                    r[headers[i]] = {"enums": list(enum_set)}
-                else:
-                    r[headers[i]] = "text"
        else:
            r[headers[i]] = "text"
    fh.close()
...
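For context, a short sketch of calling `determine_datatypes` after this change (the file name is illustrative, and the exact JSON payload shape is an assumption based on the type names visible in this hunk):

```python
import json

from determine_dt import determine_datatypes  # import as used by the deleted analysecsv module

# determine_datatypes() guesses a SQL-ish type per CSV column via messytables
# and returns a JSON string, e.g. mapping headers to "bool", "decimal",
# "varchar" or "text" (exact shape assumed). With the enum branch removed,
# the enum/enum_tol parameters no longer have an effect and unknown types
# always fall back to "text".
types = json.loads(determine_datatypes("/tmp/testdt01.csv", separator=","))
print(types)
```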
attrs==23.1.0
certifi==2023.5.7
chardet==5.1.0
charset-normalizer==2.0.12
click==8.1.3
coverage==7.1.0
docker==5.0.0
exceptiongroup==1.1.1
flasgger==0.9.5
Flask==2.2.2
gevent==21.8.0
greenlet==1.1.3.post0
html5lib==1.1
idna==3.4
importlib-metadata==6.6.0
iniconfig==2.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
json-table-schema==0.2.1
jsonschema==4.17.3
lxml==4.9.2
MarkupSafe==2.1.2
messytables==0.15.2
mistune==2.0.5
numpy==1.24.3
packaging==23.1
pandas==1.2.3
pluggy==1.0.0
prometheus-client==0.16.0
prometheus-flask-exporter==0.21.0
psycopg2-binary==2.8.6
pyrsistent==0.19.3
pytest==7.2.1
python-dateutil==2.8.2
python-magic==0.4.27
pytz==2023.3
PyYAML==6.0
requests==2.26.0
six==1.16.0
SQLAlchemy==1.4.15
tomli==2.0.1
urllib3==1.26.15
webencodings==0.5.1
websocket-client==1.5.1
Werkzeug==2.3.3
xlrd==2.0.1
zipp==3.15.0
zope.event==4.6
zope.interface==6.0
boto3==1.28.82
testcontainers-minio==0.0.1rc1
@@ -21,6 +21,19 @@ class S3ClientTest(unittest.TestCase):
        response = S3Client().upload_file(filename="testdt01.csv", path="./data/")
        self.assertTrue(response)
+    # @Test
+    def test_upload_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="invalidbucket")
+        except ConnectionRefusedError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ConnectionRefusedError not raised')
    # @Test
    def test_upload_file_notFound_fails(self):
@@ -41,8 +54,7 @@ class S3ClientTest(unittest.TestCase):
        S3Client().upload_file(filename="testdt01.csv", path="./data/", bucket="dbrepo-upload")
        # test
-        response = S3Client().download_file(filename="testdt01.csv")
-        self.assertTrue(response)
+        S3Client().download_file(filename="testdt01.csv")
    # @Test
    def test_download_file_notFound_fails(self):
@@ -57,6 +69,19 @@ class S3ClientTest(unittest.TestCase):
        else:
            self.fail('ClientError not raised')
+    # @Test
+    def test_download_bucket_notFound_fails(self):
+        # test
+        try:
+            S3Client().download_file(filename="testdt01.csv", bucket="invalidbucket")
+        except ClientError:
+            pass
+        except Exception:
+            self.fail('unexpected exception raised')
+        else:
+            self.fail('ClientError not raised')
    # @Test
    def test_get_file_succeeds(self):
...