Verified Commit ee0fb83f authored by Martin Weise

Partly implemented; still needs the data db to import them

parent d0cc05bf
@@ -13,11 +13,13 @@ ENV FLASK_RUN_HOST=0.0.0.0
 ENV PORT_APP=5000
 ENV FLASK_ENV=production
 ENV HOSTNAME=analyse-service
-ENV SHARED_FILESYSTEM=/tmp
+ENV UPLOAD_ENDPOINT=http://upload-service:1080/api/upload/files
 COPY ./as-yml/ ./as-yml/
 COPY ./*.py ./
+RUN mkdir -p /data
 EXPOSE $PORT_APP
 ENTRYPOINT [ "python", "./pywsgi.py" ]
@@ -13,6 +13,7 @@ import json
 import csv
 import logging
 import os
+import urllib.request
 import messytables, pandas as pd
 from messytables import CSVTableSet, type_guess, \
@@ -23,7 +24,10 @@ def determine_datatypes(filename, enum=False, enum_tol=0.0001, separator=None) -
     # Use option enum=True for searching Postgres ENUM Types in CSV file. Remark
     # Enum is not SQL standard, hence, it might not be supported by all db-engines.
     # However, it can be used in Postgres and MySQL.
-    path = os.path.join(os.getenv('SHARED_FILESYSTEM', '/tmp'), filename)
+    path = "/data/" + filename
+    api_path = os.getenv('UPLOAD_ENDPOINT', 'http://127.0.0.1:1080/api/upload/files') + "/" + filename
+    logging.info('retrieve api_path: %s and save it to path: %s', api_path, path)
+    urllib.request.urlretrieve(api_path, path)
     if separator is None:
         with open(path) as csvfile:
             dialect = csv.Sniffer().sniff(csvfile.readline())
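In effect, the analyse service no longer reads uploads from a shared filesystem: it now retrieves the file from the Upload Service over HTTP before sniffing the CSV dialect. A minimal sketch of the new flow, assuming an upload named `sample.csv` and the endpoint default from the hunk above (both illustrative, not part of the commit):

```python
import csv
import os
import urllib.request

# Mirror of the new retrieval step in determine_datatypes(): download the
# uploaded file into /data before analysing it.
filename = "sample.csv"  # hypothetical upload
api_path = os.getenv("UPLOAD_ENDPOINT",
                     "http://127.0.0.1:1080/api/upload/files") + "/" + filename
path = "/data/" + filename
urllib.request.urlretrieve(api_path, path)

# As in the hunk above: guess the CSV dialect from the first line only.
with open(path) as csvfile:
    dialect = csv.Sniffer().sniff(csvfile.readline())
```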
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# SQLite db
*.db
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Data Database
\ No newline at end of file
FROM python:3.10-alpine
RUN apk add bash curl jq && adduser -D alpine
WORKDIR /home/alpine
COPY Pipfile Pipfile.lock ./
RUN pip install pipenv && \
pipenv install gunicorn && \
pipenv install --system --deploy
COPY ./ds-yml ./ds-yml
COPY ./app.py ./app.py
ENV UPLOAD_ENDPOINT="http://upload-service:1080/api/upload/files"
RUN chown -R alpine:alpine ./
USER alpine
EXPOSE 5000
ENTRYPOINT [ "gunicorn", "-w", "4", "-b", ":5000", "app:app" ]
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
flasgger = "*"
flask = "~=2.0"
flask-cors = "~=4.0"
flask-jwt-extended = "~=4.5"
flask-sqlalchemy = "~=3.0"
prometheus-flask-exporter = "*"
python-dotenv = "~=1.0"
sqlalchemy-utils = "*"
gunicorn = "*"
[dev-packages]
[requires]
python_version = "3.10"
# Data Database Sidecar
Sidecar that downloads the .csv from the Upload Service and deposits it on the same pod as the data database.
## Endpoints
* Prometheus metrics [`/metrics`](http://localhost:5000/metrics)
* Health check [`/health`](http://localhost:5000/health)
* Swagger API [`/swagger-ui/`](http://localhost:5000/swagger-ui/)
\ No newline at end of file
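For illustration, the import endpoint accepts the payload defined in `#/components/schemas/Import` (see `ds-yml/import.yml` below) and answers `202` once the file has been fetched from the Upload Service. A minimal sketch, assuming the sidecar is reachable on `localhost:5000` (the compose file maps it to host port `3600`):

```python
import json
import urllib.request

# Ask the sidecar to pull sample.csv (hypothetical file) from the Upload Service.
req = urllib.request.Request(
    "http://localhost:5000/sidecar/import",
    data=json.dumps({"filepath": "sample.csv"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)  # 202 on success; a 503 raises urllib.error.HTTPError
```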
import json
import logging
import os
from urllib.error import URLError, ContentTooShortError, HTTPError
from flasgger import LazyJSONEncoder, Swagger
from flask import Flask, request, Response
from flasgger.utils import swag_from
import urllib.request
from prometheus_flask_exporter import PrometheusMetrics
logging.basicConfig(level=logging.DEBUG)
from logging.config import dictConfig
# logging configuration
dictConfig({
'version': 1,
'formatters': {
'default': {
'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
},
'simple': {
'format': '[%(asctime)s] %(levelname)s: %(message)s',
},
},
'handlers': {'wsgi': {
'class': 'logging.StreamHandler',
'stream': 'ext://flask.logging.wsgi_errors_stream',
'formatter': 'simple' # default
}},
'root': {
'level': 'DEBUG',
'handlers': ['wsgi']
}
})
# create app object
app = Flask(__name__)
metrics = PrometheusMetrics(app)
metrics.info("app_info", "Application info", version="0.0.1")
app.config["SWAGGER"] = {"openapi": "3.0.1", "title": "Swagger UI", "uiversion": 3}
swagger_config = {
"headers": [],
"specs": [
{
"endpoint": "api-sidecar",
"route": "/api-sidecar.json",
"rule_filter": lambda rule: rule.endpoint.startswith('actuator') or rule.endpoint.startswith('sidecar'),
"model_filter": lambda tag: True, # all in
}
],
"static_url_path": "/flasgger_static",
"swagger_ui": True,
"specs_route": "/swagger-ui/",
}
template = {
"openapi": "3.0.0",
"info": {
"title": "Database Repository Data Database sidecar API",
"description": "Sidecar that downloads the import .csv file",
"version": "1.3.0",
"contact": {
"name": "Prof. Andreas Rauber",
"email": "andreas.rauber@tuwien.ac.at"
},
"license": {
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
},
"externalDocs": {
"description": "Sourcecode Documentation",
"url": "https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services"
},
"servers": [
{
"url": "http://localhost:5000",
"description": "Generated server url"
},
{
"url": "https://test.dbrepo.tuwien.ac.at",
"description": "Sandbox"
}
]
}
swagger = Swagger(app, config=swagger_config, template=template)
# https://flask-jwt-extended.readthedocs.io/en/stable/options/
app.config["JWT_ALGORITHM"] = "HS256"
app.config["JWT_DECODE_ISSUER"] = os.getenv("JWT_ISSUER")
app.config["JWT_PUBLIC_KEY"] = os.getenv("JWT_PUBKEY")
app.json_encoder = LazyJSONEncoder
@app.route("/health", methods=["GET"], endpoint="actuator_health")
@swag_from("ds-yml/health.yml")
def health():
return Response({"status": "UP"}, mimetype="application/json"), 200
@app.route("/sidecar/import", methods=["POST"], endpoint="sidecar_import")
@swag_from("ds-yml/import.yml")
def import_csv():
logging.debug('endpoint import csv, body=%s', request)
input_json = request.get_json()
filepath = str(input_json['filepath'])
api = os.getenv("UPLOAD_ENDPOINT", "http://localhost:1080/api/upload/files")
try:
urllib.request.urlretrieve(api + "/" + filepath, "/tmp/" + filepath)
except URLError as e:
logging.error('Failed to import .csv: %s', e)
return Response(), 503
return Response(), 202
summary: Return a healthcheck
description: |
Return UP if the instance is ready to serve connections.
consumes:
- application/json
produces:
- application/json
parameters: [ ]
responses:
200:
description: OK, service is up and running
content:
application/json:
schema:
$ref: "#/components/schemas/Health"
404:
description: Service is not yet ready
tags:
- actuator
components:
schemas:
Health:
title: Status object
type: object
properties:
status:
type: string
example: UP
required:
- status
\ No newline at end of file
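This is the same check that the docker-compose healthcheck further below performs with curl and jq; an equivalent probe in Python (host and port assumed) would be:

```python
import json
import urllib.request

# Equivalent of: curl -sSL 127.0.0.1:5000/health | jq .status | grep "UP"
with urllib.request.urlopen("http://127.0.0.1:5000/health") as resp:
    assert json.load(resp)["status"] == "UP"
```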
summary: Imports a .csv from the Upload Service
description: |
Imports a specific .csv file from the Upload Service via HTTP
consumes:
- application/json
produces:
- application/json
parameters:
- in: "body"
name: "body"
description: "Payload to import the .csv"
required: true
schema:
$ref: "#/components/schemas/Import"
responses:
202:
description: Imported the .csv
content: { }
503:
description: The Upload Service could not be contacted or .csv was not found.
tags:
- sidecar
components:
schemas:
Import:
type: "object"
properties:
filepath:
type: "string"
example: "sample.csv"
\ No newline at end of file
@@ -87,7 +87,7 @@ services:
       - "--base-path=/api/upload/files/"
     volumes:
       - upload-service-data:/data
-      - "${SHARED_FILESYSTEM:-/tmp}:/srv/tusd-data/data"
+      # - "${SHARED_FILESYSTEM:-/tmp}:/srv/tusd-data/data"
     logging:
       driver: json-file
@@ -196,7 +196,7 @@ services:
     ports:
       - "5000:5000"
     environment:
-      SHARED_FILESYSTEM: "${SHARED_FILESYSTEM:-/tmp}"
+      UPLOAD_ENDPOINT: "${UPLOAD_ENDPOINT:-http://upload-service:1080/api/upload/files}"
     volumes:
       - "${SHARED_FILESYSTEM:-/tmp}:/tmp"
     healthcheck:
@@ -267,6 +267,25 @@ services:
       FLASK_DEBUG: ${SEARCH_DEBUG_MODE:-true}
       OPENSEARCH_HOST: ${OPENSEARCH_HOST:-dbrepo-search-db}
+  dbrepo-data-db-sidecar:
+    restart: "no"
+    container_name: dbrepo-data-db-sidecar
+    hostname: data-db-sidecar
+    build: ./dbrepo-data-db/sidecar
+    image: dbrepo-data-db-sidecar:latest
+    ports:
+      - "3600:5000"
+    environment:
+      FLASK_DEBUG: ${SEARCH_DEBUG_MODE:-true}
+      UPLOAD_ENDPOINT: "${UPLOAD_ENDPOINT:-http://upload-service:1080/api/upload/files}"
+    volumes:
+      - "${SHARED_FILESYSTEM:-/tmp}:/tmp"
+    healthcheck:
+      test: curl -sSL 127.0.0.1:5000/health | jq .status | grep "UP" || exit 1
+      interval: 10s
+      timeout: 5s
+      retries: 12

   dbrepo-ui:
     restart: "no"
     container_name: dbrepo-ui