diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb index 590383ece54fcc4ebe7a66416a100342a1ad451e..dd80c03bce8534a7ddbd5ddc06fee57cfce0ebe4 100644 --- a/08-submission/gff-02-functional_annot.ipynb +++ b/08-submission/gff-02-functional_annot.ipynb @@ -136,6 +136,55 @@ " return None" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_EC(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Retrieve the EC numbers assigned by EggNOG-mapper for a certain gene ID.\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to at most one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " list or None\n", + " a list of EC numbers associated with the gene (product), or None if none are available.\n", + " \"\"\"\n", + " if gene_id in lookup.index:\n", + " ec = lookup.loc[gene_id]['EC']\n", + " if ec != '-':\n", + " return ec.split(',')\n", + " return None\n", + "\n", + "def find_PFAMs(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Retrieve the PFAM domains assigned by EggNOG-mapper for a certain gene ID.\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. 
litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to at most one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " list or None\n", + " a list of PFAM domains associated with the gene (product), or None if none are available.\n", + " \"\"\"\n", + " if gene_id in lookup.index:\n", + " pfam = lookup.loc[gene_id]['PFAMs']\n", + " if pfam != '-':\n", + " return pfam.split(',')\n", + " return None" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -157,6 +206,15 @@ "emapper.set_index('gene', inplace=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emapper.columns" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -203,6 +261,8 @@ " name = find_protein(gene, emapper)\n", " line = f'{line};gene={name}'\n", " description = find_description(gene, emapper)\n", + " ec_list = find_EC(gene, emapper)\n", + " pfams = find_PFAMs(gene, emapper)\n", " if description:\n", " line = f'{line};function={description}'\n", " if name != f'Uncharacterised protein {gene}' or description:\n", @@ -223,6 +283,12 @@ " line = f'{line};function={description}'\n", " if name != f'Uncharacterised protein {gene}' or description:\n", " line = f'{line};note=function predicted by EggNOG-mapper'\n", + " if ec_list is not None:\n", + " for ec in ec_list:\n", + " line = f'{line};EC_number=\"{ec}\"'\n", + " if pfams is not None:\n", + " for pfam in pfams:\n", + " line = f'{line};Dbxref=\"PFAM:{pfam}\"'\n", " named.write(line + '\\n')" ] } diff --git a/08-submission/gff-04-convert_to_embl.sh b/08-submission/gff-04-convert_to_embl.sh index 4b39f04bf8e07f4d23ece7edef930419b25b1041..05d8ce752fe426e2b16d15ce6f2c9090db42cfb5 100644 --- a/08-submission/gff-04-convert_to_embl.sh +++ b/08-submission/gff-04-convert_to_embl.sh @@ -23,4 +23,6 @@ EMBLmyGFF3 $GFF $GENOME \ -v \ -o result.embl -gzip result.embl \ No newline at end of file +# when zipping: overwrite file if it already exists, +# 
else the command will hang while waiting for confirmation +gzip -f result.embl \ No newline at end of file