From aa0a40fd8cccd19877488da18a7029673bc339d4 Mon Sep 17 00:00:00 2001 From: Niko <nikolaos.papadopoulos@univie.ac.at> Date: Wed, 4 Dec 2024 17:32:24 +0100 Subject: [PATCH] adding more information to GFF and EMBL file --- 08-submission/gff-02-functional_annot.ipynb | 66 +++++++++++++++++++++ 08-submission/gff-04-convert_to_embl.sh | 4 +- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb index 590383e..dd80c03 100644 --- a/08-submission/gff-02-functional_annot.ipynb +++ b/08-submission/gff-02-functional_annot.ipynb @@ -136,6 +136,55 @@ " return None" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_EC(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Retrieve the EC number assigned by EggNOG-mapper for a certain gene ID.\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " list or None\n", + " the EC numbers for the current gene (product), or None if unavailable.\n", + " \"\"\"\n", + " if gene_id in lookup.index:\n", + " ec = lookup.loc[gene_id]['EC']\n", + " if ec != '-':\n", + " return ec.split(',')\n", + " return None\n", + "\n", + "def find_PFAMs(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Retrieve the PFAM domains assigned by EggNOG-mapper for a certain gene ID.\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. 
litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " list\n", + " a list of PFAM domains (if available) associated with the gene (product).\n", + " \"\"\"\n", + " if gene_id in lookup.index:\n", + " pfam = lookup.loc[gene_id]['PFAMs']\n", + " if pfam != '-':\n", + " return pfam.split(',')\n", + " return None" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -157,6 +206,15 @@ "emapper.set_index('gene', inplace=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emapper.columns" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -203,6 +261,8 @@ " name = find_protein(gene, emapper)\n", " line = f'{line};gene={name}'\n", " description = find_description(gene, emapper)\n", + " ec_list = find_EC(gene, emapper)\n", + " pfams = find_PFAMs(gene, emapper)\n", " if description:\n", " line = f'{line};function={description}'\n", " if name != f'Uncharacterised protein {gene}' or description:\n", @@ -223,6 +283,12 @@ " line = f'{line};function={description}'\n", " if name != f'Uncharacterised protein {gene}' or description:\n", " line = f'{line};note=function predicted by EggNOG-mapper'\n", + " if ec_list is not None:\n", + " for ec in ec_list:\n", + " line = f'{line};EC_number=\"{ec}\"'\n", + " if pfams is not None:\n", + " for pfam in pfams:\n", + " line = f'{line};Dbxref=\"PFAM:{pfam}\"'\n", " named.write(line + '\\n')" ] } diff --git a/08-submission/gff-04-convert_to_embl.sh b/08-submission/gff-04-convert_to_embl.sh index 4b39f04..05d8ce7 100644 --- a/08-submission/gff-04-convert_to_embl.sh +++ b/08-submission/gff-04-convert_to_embl.sh @@ -23,4 +23,6 @@ EMBLmyGFF3 $GFF $GENOME \ -v \ -o result.embl -gzip result.embl \ No newline at end of file +# when zipping: overwrite file if it already exists, +# else the command will hang while waiting for confirmation +gzip -f 
result.embl \ No newline at end of file -- GitLab