Commit de647574 authored by Martin Weise

Merge branch 'dev' of gitlab.phaidra.org:fair-data-austria-db-repository/fda-services into dev

parents 52160ac2 7610c0b4
3 merge requests: !231 CI: Remove build for log-service, !228 Better error message handling in the frontend, !223 Release of version 1.4.0
Showing with 9638 additions and 2 deletions
{"definitions":{},"externalDocs":{"description":"Sourcecode Documentation","url":"https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services"},"info":{"contact":{"email":"andreas.rauber@tuwien.ac.at","name":"Prof. Andreas Rauber"},"description":"Service that analyses data structures","license":{"name":"Apache 2.0","url":"https://www.apache.org/licenses/LICENSE-2.0"},"title":"Database Repository Analyse Service API","version":"1.3.0"},"openapi":"3.0.0","paths":{"/api/analyse/determinedt":{"post":{"consumes":["application/json"],"description":"This is a simple API which returns the datatypes of a (path) csv file","parameters":[{"description":"to-do description","in":"body","name":"body","required":true,"schema":{"properties":{"enum":{"example":true,"type":"boolean"},"enum_tol":{"example":0.1},"filepath":{"example":"/data/testdt08.csv","type":"string"},"separator":{"example":",","type":"string"}},"type":"object"}}],"produces":["application/json"],"responses":{"200":{"description":"OK"},"405":{"description":"Invalid input"}},"summary":"Determine datatypes"}},"/api/analyse/determinepk":{"post":{"consumes":["application/json"],"description":"This is a simple API which returns the primary keys + ranking of a (path) csv file","parameters":[{"description":"to-do description","in":"body","name":"body","required":true,"schema":{"properties":{"filepath":{"example":"/data/testdt08.csv","type":"string"},"seperator":{"example":",","type":"string"}},"type":"object"}}],"produces":["application/json"],"responses":{"200":{"description":"OK"},"405":{"description":"Invalid input"}},"summary":"Determine primary keys"}},"/health":{"get":{"consumes":["application/json"],"description":"This is a simple API which checks if the application is healthy","parameters":[{"description":"to-do description","in":"body","name":"body","required":true,"schema":{"properties":{"status":{"example":"UP","type":"string"}},"type":"object"}}],"produces":["application/json"],"responses":{"200":{"description":"OK"}},"summary":"Check if application is running"}}},"servers":[{"description":"Generated server url","url":"http://localhost:5000"},{"description":"Sandbox","url":"https://dbrepo2.tuwien.ac.at"}]}
openapi: 3.0.1
info:
  title: Database Repository Data Service API
  description: Service that manages the data
  contact:
    name: Prof. Andreas Rauber
    email: andreas.rauber@tuwien.ac.at
  license:
    name: Apache 2.0
    url: https://www.apache.org/licenses/LICENSE-2.0
  version: 1.3.0
externalDocs:
  description: Sourcecode Documentation
  url: https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services
servers:
  - url: http://localhost:9093
    description: Generated server url
  - url: https://test.dbrepo.tuwien.ac.at
    description: Sandbox
paths: {}
components:
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: JWT
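
This spec protects its (not yet listed) endpoints with the bearerAuth scheme above; a client would send a JWT like this (sketch with a placeholder token and a hypothetical path, since paths is still empty):

```python
import requests

token = "<JWT>"  # placeholder; issued by the authentication service, not shown in this diff
r = requests.get(
    "http://localhost:9093/api/example",  # hypothetical path; this spec lists none yet
    headers={"Authorization": f"Bearer {token}"},
)
```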
openapi: 3.0.1
info:
  title: OpenAPI definition
  version: v0
servers:
  - url: http://localhost:9050
    description: Generated server url
paths: {}
components:
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: JWT
openapi: 3.0.1
info:
  title: Database Repository Upload Service API
  description: Service that manages the uploads
  contact:
    name: Prof. Andreas Rauber
    email: andreas.rauber@tuwien.ac.at
  license:
    name: Apache 2.0
    url: https://www.apache.org/licenses/LICENSE-2.0
  version: 1.3.0
externalDocs:
  description: Sourcecode Documentation
  url: https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services
servers:
  - url: http://localhost:1080
    description: Generated server url
  - url: https://test.dbrepo.tuwien.ac.at
    description: Sandbox
paths:
  /api/upload/files:
    post:
      tags:
        - upload-endpoint
      summary: Uploads a file
      operationId: upload
      responses:
        "201":
          description: "Successfully uploaded a file"
      security: {}
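
A sketch of a call against the upload endpoint above (the multipart field name `file` is an assumption; the spec does not define a request body):

```python
import requests

with open("data.csv", "rb") as f:
    r = requests.post("http://localhost:1080/api/upload/files", files={"file": f})
assert r.status_code == 201  # "Successfully uploaded a file"
```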
#!/bin/bash

# map of service port -> service name
declare -A services
services[5000]=analyse
services[9050]=mirror
services[9093]=data
services[9099]=metadata

# fetch the API description of a service: the analyse service only exposes JSON,
# all other services expose YAML under /v3/api-docs.yaml
function retrieve () {
  if [[ "$2" == analyse ]]; then
    echo "... retrieve json api from localhost:$1"
    wget "http://localhost:$1/api-$2.json" -O "./api-$2.yaml" -q
  else
    echo "... retrieve yaml api from localhost:$1"
    wget "http://localhost:$1/v3/api-docs.yaml" -O "./api-$2.yaml" -q
  fi
}

for key in "${!services[@]}"; do
  echo "Generating ${services[$key]} API"
  retrieve "$key" "${services[$key]}"
done
@@ -12,7 +12,6 @@ ready
.docs/.swagger/site/
site/
# temporary files
.$*
# Notebooks
@@ -58,6 +58,13 @@ build-frontend:
    - "yarn --cwd ./dbrepo-ui install --legacy-peer-deps"
    - "yarn --cwd ./dbrepo-ui run build"

build-search-service:
  image: python:3.10-alpine
  stage: build
  script:
    - "pip install pipenv"
    - "cd dbrepo-search-service && pipenv install --system --deploy"

build-docker:
  image: docker.io/docker:24-dind
  stage: build
@@ -132,6 +132,15 @@ server {
        proxy_read_timeout 90;
    }

    location /api/search {
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_pass http://search-service:4000;
        proxy_read_timeout 90;
    }

    location / {
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# SQLite db
*.db
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
FROM python:3.10-alpine

# run the service as an unprivileged user
RUN adduser -D alpine

WORKDIR /home/alpine

# install dependencies first so the layer is cached across code changes
COPY Pipfile Pipfile.lock ./
RUN pip install pipenv && \
    pipenv install gunicorn && \
    pipenv install --system --deploy

COPY ./app ./app
COPY ./scripts ./scripts
COPY config.py wsgi.py ./

ENV FLASK_APP=wsgi.py

RUN chown -R alpine:alpine ./
USER alpine

ENTRYPOINT ["sh", "./scripts/docker-entrypoint.sh"]
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
elasticsearch = "~=8.0"
flasgger = "*"
flask = "~=2.0"
flask-cors = "~=4.0"
flask-jwt-extended = "~=4.5"
flask-sqlalchemy = "~=3.0"
opensearch-py = "~=2.2"
prometheus-flask-exporter = "~=0.22"
python-dotenv = "~=1.0"
sqlalchemy-utils = "*"
testcontainers-opensearch = "*"

[dev-packages]

[requires]
python_version = "3.10"
# Search service

🚧 WIP 🚧

The dbrepo search service is used to enable searching for entries in the OpenSearch database.

## Running the app

Test the app locally:

```shell
pipenv install && pipenv run flask run --debug --port 4000
```

## Overview

Here's an overview of the different endpoints available at this service
(`<index>` has to be one of the following indices:
table, user, database, column, identifier, concept, unit, view):

---

`/api/search/<index>`:
returns all entries for a given index

---

`/api/search/<index>/fields`:
returns all the fields that are saved in a given entry

---

`/api/search`:
this is the main endpoint for searching entries in the OpenSearch db.
You can specify a search term, a time period,
and certain fields that should match certain values.

ToDo: Continue
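
A minimal client sketch for the search endpoint (assumes the service runs locally on port 4000 and that the `requests` package is installed; it is not a dependency of this service):

```python
import requests

# all fields are optional and are AND-combined by the service
response = requests.post(
    "http://localhost:4000/api/search",
    json={
        "search_term": "weather",                 # fuzzy full-text search
        "t1": "2023-01-01",                       # start of the time period (YYYY-MM-DD)
        "t2": "2023-12-31",                       # end of the time period (YYYY-MM-DD)
        "fieldValuePairs": {"type": "database"},  # restrict the search to one index
    },
)
print(response.json())
```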
from gevent.pywsgi import WSGIServer

from app import create_app

app = create_app()

if __name__ == '__main__':
    http_server = WSGIServer(('', 5050), app)
    http_server.serve_forever()
"""Search App Initialization."""
import os
import logging
from flasgger import LazyJSONEncoder
from flask import Flask
from opensearchpy import OpenSearch
from config import Config
from prometheus_flask_exporter import PrometheusMetrics
logging.basicConfig(level=logging.DEBUG)
from logging.config import dictConfig
def create_app(config_class=Config):
# logging configuration
dictConfig({
'version': 1,
'formatters': {
'default': {
'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
},
'simple': {
'format': '[%(asctime)s] %(levelname)s: %(message)s',
},
},
'handlers': {'wsgi': {
'class': 'logging.StreamHandler',
'stream': 'ext://flask.logging.wsgi_errors_stream',
'formatter': 'simple' # default
}},
'root': {
'level': 'DEBUG',
'handlers': ['wsgi']
}
})
# create app object
app = Flask(__name__)
metrics = PrometheusMetrics(app)
metrics.info("app_info", "Application info", version="0.0.1")
app.config["SWAGGER"] = {"openapi": "3.0.0", "title": "Swagger UI", "uiversion": 3}
# https://flask-jwt-extended.readthedocs.io/en/stable/options/
app.config["JWT_ALGORITHM"] = "HS256"
app.config["JWT_DECODE_ISSUER"] = os.getenv("JWT_ISSUER")
app.config["JWT_PUBLIC_KEY"] = os.getenv("JWT_PUBKEY")
app.json_encoder = LazyJSONEncoder
# load configuration
app.config.from_object(config_class)
logging.info('opensearch endpoint 1: %s:%d', app.config["SEARCH_HOST"], app.config["SEARCH_PORT"])
app.opensearch_client = (
OpenSearch(hosts=[{"host": app.config["SEARCH_HOST"], "port": app.config["SEARCH_PORT"]}],
http_compress=True,
http_auth=(app.config["SEARCH_USERNAME"], app.config["SEARCH_PASSWORD"]),
)
if app.config["SEARCH_HOST"]
else None
)
# register blueprints
from app.api import api_bp
app.register_blueprint(api_bp)
return app
from flask import Blueprint

api_bp = Blueprint("api", __name__, url_prefix="/api/search")

# imported at the bottom to avoid a circular import (routes needs api_bp)
from app.api import routes
# -*- coding: utf-8 -*-
"""
This file defines the endpoints for the dbrepo-search-service.
"""
import logging
import math

from flask import request
from flasgger.utils import swag_from
from opensearchpy import OpenSearch

from app.api import api_bp
# ToDo: make import recognisable by PyCharm
from app.opensearch_client import *

host = "localhost"
port = 9200
auth = ("admin", "admin")  # local development credentials

# note: the endpoints below go through current_app.opensearch_client (built in
# app/__init__.py); this module-level client appears unused
client = OpenSearch(
    hosts=[{"host": host, "port": port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
)
def general_filter(index, results):
    """
    Applies filtering to the result of opensearch queries.

    We only want to return specific entries of the result dict to the user, depending on
    the queried index. The keys of the entries that should be kept per index are specified
    in the important_keys dict.
    :param index: the search index the query results are about
    :param results: the raw response of the query_index_by_term_opensearch function
    :return: the filtered results
    """
    important_keys = {
        "column": ["id", "name", "column_type"],
        "table": ["id", "name", "description"],
        "identifier": ["id", "title", "type"],
        "user": ["id", "username"],
        "database": ["id", "name", "is_public", "details"],
        "concept": ["uri", "name"],
        "unit": [],
        "view": ["id", "name", "creator", "created"],
    }
    if index not in important_keys:
        error_msg = "the keys to be returned to the user for your index aren't specified in the important_keys dict"
        raise KeyError(error_msg)
    for result in results:
        result_keys_copy = tuple(result.keys())
        for key in result_keys_copy:
            if key not in important_keys[index]:
                del result[key]
    logging.debug('general filter results: %s', results)
    return results
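

# Illustration (invented sample data): for the "user" index only "id" and "username"
# are listed in important_keys, so every other key is dropped:
#   general_filter("user", [{"id": 1, "username": "jdoe", "email": "x@y.z"}])
#   -> [{"id": 1, "username": "jdoe"}]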
@api_bp.route("<string:index>", methods=["GET"], endpoint="get_index")
@swag_from("us-yml")  # ToDo: get the SWAG right
def get_index(index):
    """
    Returns all entries in a specific index.

    :param index: desired index
    :return: list of the results
    """
    logging.info('Searching for index: %s', index)
    available_indices = [
        "table",
        "user",
        "database",
        "column",
        "identifier",
        "concept",
        "unit",
        "view",
    ]
    if index not in available_indices:
        return {
            "results": {},
            "status": 404,
        }, 404  # ToDo: replace with better error handling
    results = query_index_by_term_opensearch(index, "*", "contains")
    results = general_filter(index, results)
    total_number_of_results = len(results)
    results_per_page = min(request.args.get("results_per_page", 50, type=int), 500)
    max_pages = max(math.ceil(len(results) / results_per_page), 1)  # at least one (possibly empty) page
    page = min(request.args.get("page", 1, type=int), max_pages)
    results = results[(results_per_page * (page - 1)):(results_per_page * page)]
    return {"results": results, "total": total_number_of_results, "status": 200}
@api_bp.route("<string:index>/fields", methods=["GET"], endpoint="get_fields")
def get_fields(index):
    """
    Returns a list of attributes of the data for a specific index.

    :param index: desired index
    :return: list of fields
    """
    logging.info('Getting fields for index: %s', index)
    available_indices = [
        "table",
        "user",
        "database",
        "column",
        "identifier",
        "concept",
        "unit",
        "view",
    ]
    if index not in available_indices:
        return {
            "results": {},
            "status": 404,
        }, 404  # ToDo: replace with better error handling
    fields = get_fields_for_index(index)
    logging.debug('get fields for index %s resulted in fields: %s', index, fields)
    return {"fields": fields, "status": 200}
@api_bp.route("", methods=["POST"], endpoint="endpoint2")
def search():
"""
Main endpoint for general searching.
There are three ways of searching:
* if you specify 'search_term' in the request json, all entries that have relevant fields matching the 'search_term' are returned.
No wildcards are allowed, although fuzzy search is enabled (meaning, there are also matches when 1 or two characters differ)
* if you specify 't1' and/or 't2' entries that are newer than timestamp 't1' and entries that are younger than timestamp 't2' are returned.
the timestamp has to have the format YYYY-MM-DD
* if 'field' and 'value' are specified, only entries where the 'field' matches the 'value' are returned.
For example, if the 'field' is 'creator.orcid' and the 'value' is '0000-0002-6778-0887',
only entries created by the person with this specific orcid id are returned.
If there are multiple parameters specified, they are combined via an AND-conjunction, so you can e.g. search for entries that match a certain keyword,
were created in a certain time period, by a specific person.
:return:
"""
if request.content_type != "application/json":
return {
"status": 415,
"message": "Unsupported Media Type",
"suggested_content_types": ["application/json"],
}, 415
req_body = request.json
logging.debug('search request body: %s', req_body)
search_term = req_body.get("search_term")
t1 = req_body.get("t1")
t2 = req_body.get("t2")
field = req_body.get("field")
value = req_body.get("value")
fieldValuePairs = req_body.get("fieldValuePairs")
response = general_search(search_term, t1, t2, fieldValuePairs)
return response, 200
"""
The opensearch_client.py is used by the different API endpoints in routes.py to handle requests to the opensearch db
"""
import json
import logging
from flask import current_app
from collections.abc import MutableMapping
def flatten_dict(
d: MutableMapping, parent_key: str = "", sep: str = "."
) -> MutableMapping:
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, MutableMapping):
items.extend(flatten_dict(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
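

# Example: the nested mapping
#   flatten_dict({"creator": {"username": {"type": "text"}}})
# yields
#   {"creator.username.type": "text"}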
def create_friendly_name(attribute_name):
    """
    Replaces an attribute name with a more human-readable version for the frontend.

    :todo: extend special_attribute_names
    :param attribute_name: the raw (possibly nested) attribute name
    :return: the human-readable name
    """
    special_attribute_names = {
        "creator.properties.username": "Username (creator)",
        "owner.properties.username": "Username (owner)",
        "owner.properties.id": "Id (owner)",
        "creator.properties.id": "Id (creator)",
    }
    if attribute_name not in special_attribute_names:
        friendly_name = attribute_name.split(".")[-1]
        friendly_name = friendly_name.replace("_", " ").strip()
        friendly_name = friendly_name.capitalize()
        return friendly_name
    else:
        return special_attribute_names[attribute_name]
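

# Examples:
#   create_friendly_name("column_type")               -> "Column type"
#   create_friendly_name("owner.properties.username") -> "Username (owner)"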
def get_keys(d, parent_key=""):
    # currently not in use, probably obsolete?
    keys = []
    for key, value in d.items():
        new_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            if "type" in value.keys():
                keys.append(
                    {
                        "attribute_name": new_key,
                        "data_type": value["type"],
                    }
                )
            else:
                keys.extend(get_keys(value, new_key))
    return keys
def query_index_by_term_opensearch(index, term, mode):
    """
    Sends an opensearch query (old code, effectively replaced by general_search() now).

    :param index: the index to query
    :param term: the search term
    :param mode: either "exact" or "contains"
    :return: list of dicts
    """
    query_str = ""
    if mode == "exact":
        query_str = f"{term}"
    elif mode == "contains":
        query_str = f"*{term}*"
    response = current_app.opensearch_client.search(
        index=index,
        body={
            "query": {
                "query_string": {
                    "query": query_str,
                    "allow_leading_wildcard": "true",  # default true
                }
            },
        },
    )
    results = [hit["_source"] for hit in response["hits"]["hits"]]
    return results
def get_fields_for_index(index):
    """
    Returns a list of attributes of the data for a specific index.

    :param index: the index of interest
    :return: list of fields
    """
    logging.debug('request fields for index: %s', index)
    fields = current_app.opensearch_client.indices.get_mapping(index)
    fields = fields[index]["mappings"]["properties"]
    logging.debug('fields: %s', fields)
    fields_list = []
    fd = flatten_dict(fields)
    for key in fd.keys():
        if key.split(".")[-1] == "type":
            entry = {
                "attribute_name": ".".join(key.split(".")[:-1]),
                "type": fd[key],
            }
            fields_list.append(entry)
    return fields_list
def general_search(search_term=None, t1=None, t2=None, field_value_pairs=None):
    """
    Main method for searching in the opensearch db. All parameters are optional.

    :param search_term: the term you want to search for (no wildcards are allowed)
    :param t1: begin of the time period
    :param t2: end of the time period
    :param field_value_pairs: dict mapping field names to the values they should match
    :return: the raw opensearch response with a "status" key added
    """
    searchable_indices = ["database", "user", "table", "column", "identifier", "view", "concept", "unit"]
    index = searchable_indices
    field_list = [
        "name",
        "identifier.titles.title",
        "identifier.descriptions.description",
        "identifier.publisher",
        "identifier.creators.*.firstname",
        "identifier.creators.*.lastname",
        "identifier.creators.*.creator_name",
        "funders",
        "title",
        "description",
        "creator.username",
        "concept.name",
        "concept.uri",
        "author",
        "database.*",
        "internal_name",
        "public",
    ]
    queries = []
    if search_term is not None:
        logging.debug('query has search_term present')
        text_query = {
            "multi_match": {
                "fields": field_list,
                "query": search_term,
                "fuzziness": "AUTO",
            }
        }
        queries.append(text_query)
    if t1 is not None and t2 is not None:
        logging.debug('query has time range present')
        time_range_query = {
            "range": {
                "created": {
                    "gte": t1,
                    "lte": t2,
                }
            }
        }
        queries.append(time_range_query)
    if field_value_pairs is not None:
        logging.debug('query has field_value_pairs present')
        musts = []
        for field, value in field_value_pairs.items():
            if field == "type" and value in searchable_indices:
                logging.debug("search for specific index: %s", value)
                index = value
                continue
            if field in field_list:
                musts.append({
                    "match": {
                        field: {"query": value, "minimum_should_match": "90%"}
                    }
                })
        specific_query = {"bool": {"must": musts}}
        queries.append(specific_query)
    logging.debug("queries: %s", queries)
    body = {
        "query": {"bool": {"must": queries}},
        "_source": [
            "_class",
            "id",
            "name",
            "identifier.*",
            "column_type",
            "description",
            "title",
            "type",
            "username",
            "is_public",
            "created",
            "_score",
            "concept",
            "author",
            "docID",
            "creator.*",
            "owner.*",
            "details.*",
        ],
    }
    logging.debug('search index: %s', index)
    logging.debug('search body: %s', body)
    response = current_app.opensearch_client.search(
        index=index,
        body=body
    )
    response["status"] = 200
    # response = [hit["_source"] for hit in response["hits"]["hits"]]
    return response
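
For illustration, a combined call (all values invented) and the AND-conjunction it produces:

```python
# invented example values; not part of the service code
general_search(
    search_term="weather",
    t1="2023-01-01",
    t2="2023-12-31",
    field_value_pairs={"type": "database", "creator.username": "jdoe"},
)
# body["query"] then contains (abridged):
# {"bool": {"must": [
#     {"multi_match": {"fields": [...], "query": "weather", "fuzziness": "AUTO"}},
#     {"range": {"created": {"gte": "2023-01-01", "lte": "2023-12-31"}}},
#     {"bool": {"must": [{"match": {"creator.username":
#         {"query": "jdoe", "minimum_should_match": "90%"}}}]}},
# ]}}
# and the search is restricted to the "database" index via the "type" pair.
```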