From 4c3735a2f0894cd3cc9a36a7e82be9224a97504e Mon Sep 17 00:00:00 2001
From: Niko <nikolaos.papadopoulos@univie.ac.at>
Date: Thu, 28 Nov 2024 12:18:31 +0100
Subject: [PATCH] include emapper description, use ENA specification

---
 08-submission/gff-02-functional_annot.ipynb | 76 ++++++++++++++++-----
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb
index 98a24f3..394b059 100644
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -104,10 +104,38 @@
     "    if gene_id in lookup.index:\n",
     "        name = lookup.loc[gene_id]['Preferred_name']\n",
     "        if name != '-':\n",
-    "            return name\n",
+    "            return f'{name} (predicted)'\n",
     "    return f'Uncharacterised protein {gene_id}'"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_description(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    \"\"\"Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        the description (if available); will return None if no description is available.\n",
+    "    \"\"\"\n",
+    "    if gene_id in lookup.index:\n",
+    "        description = lookup.loc[gene_id]['Description']\n",
+    "        if description != '-':\n",
+    "            return description\n",
+    "    return None"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -118,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -136,14 +164,15 @@
     "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n",
     "gene symbol (name) in some form:\n",
     "\n",
-    "- Gene: `name=Hox3;gene_name=Hox3`\n",
-    "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n",
-    "- CDS: `gene_name=Hox3`\n",
-    "- exon: `gene_name=Hox3`\n",
+    "- Gene: `gene=Hox3;product=Hox3`\n",
+    "- mRNA: `product=Hox3 isoform 1;gene=Hox3`\n",
+    "- CDS: `gene=Hox3;product=Hox3`\n",
+    "- exon: `gene=Hox3;product=Hox3`\n",
     "\n",
     "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n",
-    "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n",
-    "`CellRanger`) still have access to the functional annotation."
+    "include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep\n",
+    "parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems\n",
+    "to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons."
    ]
   },
   {
@@ -168,17 +197,32 @@
     "            if not conditions_skip:\n",
     "                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n",
     "                attributes = parse_attributes(attributes)\n",
+    "\n",
     "                if feature_type == 'gene':\n",
     "                    gene = attributes['ID']\n",
     "                    name = find_protein(gene, emapper)\n",
-    "                    name = f'{name} (predicted)'\n",
-    "                    line = f'{line};name={name}'\n",
+    "                    line = f'{line};gene={name}'\n",
+    "                    description = find_description(gene, emapper)\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
+    "\n",
     "                if feature_type == 'mRNA':\n",
     "                    mRNA = attributes['ID']\n",
     "                    isoform = mRNA.split('.')[-1]\n",
-    "                    line = f'{line};name={name} isoform {isoform};gene_name={name}'\n",
+    "                    line = f'{line};gene={name};product={name} isoform {isoform}'\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
+    "\n",
     "                if feature_type == 'CDS' or feature_type == 'exon':\n",
-    "                    line = f'{line};gene_name={name}'\n",
+    "                    line = f'{line};gene={name};product={name};'\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
     "            named.write(line + '\\n')"
    ]
   }
-- 
GitLab