From fdb02d43a0e32648a17fbdcf2a30d7368f48f248 Mon Sep 17 00:00:00 2001
From: Niko <nikolaos.papadopoulos@univie.ac.at>
Date: Wed, 27 Nov 2024 16:42:00 +0100
Subject: [PATCH] add eggNOG-mapper preferred name to GFF

---
 08-submission/gff-02-functional_annot.ipynb | 198 ++++++++++----------
 1 file changed, 95 insertions(+), 103 deletions(-)

diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb
index ba2eba6..1ab90de 100644
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -14,27 +14,43 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "import numpy as np"
+    "from tqdm import tqdm\n",
+    "\n",
+    "import pandas as pd"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
-    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)"
+    "### util functions\n",
+    "\n",
+    "We are going to need three helper functions:\n",
+    "\n",
+    "- extract the gene ID from the `#query` field of the EggNOG-mapper output\n",
+    "- break up the content of the attributes field of the GFF file into a dictionary\n",
+    "- find the correct protein name for a gene ID"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "def parse_gene_id(x):\n",
+    "    \"\"\"Extract gene ID from a string\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x : str\n",
+    "        A protein ID from the eggNOG-mapper output.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'\n",
+    "    \"\"\"\n",
     "    if 'PB' in x:\n",
     "        parts = x.split('.')\n",
     "        return '.'.join(parts[:2])\n",
@@ -44,75 +60,9 @@
     "        return ValueError('Unknown gene ID format')"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "apply the parsing on each row..."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "emapper.set_index('gene', inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#query                                                    PB.3.1.p1\n",
-       "seed_ortholog                                       136037.KDR12978\n",
-       "evalue                                                          0.0\n",
-       "score                                                         570.0\n",
-       "eggNOG_OGs        COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...\n",
-       "max_annot_lvl                                         33208|Metazoa\n",
-       "COG_category                                                     EF\n",
-       "Description       ribose phosphate diphosphokinase activity. It ...\n",
-       "Preferred_name                                              PRPSAP1\n",
-       "GOs               GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...\n",
-       "EC                                                                -\n",
-       "KEGG_ko                                                           -\n",
-       "KEGG_Pathway                                                      -\n",
-       "KEGG_Module                                                       -\n",
-       "KEGG_Reaction                                                     -\n",
-       "KEGG_rclass                                                       -\n",
-       "BRITE                                                             -\n",
-       "KEGG_TC                                                           -\n",
-       "CAZy                                                              -\n",
-       "BiGG_Reaction                                                     -\n",
-       "PFAMs                                 Pribosyl_synth,Pribosyltran_N\n",
-       "Name: PB.3, dtype: object"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "emapper.loc['PB.3']"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -132,31 +82,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'-'"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "emapper.loc['PB.1']['Preferred_name']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    \"\"\"Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        the gene symbol (if available) or \"Uncharacterised protein {gene_id}\" if no gene symbol is available.\n",
+    "    \"\"\"\n",
     "    if gene_id in lookup.index:\n",
     "        name = lookup.loc[gene_id]['Preferred_name']\n",
     "        if name != '-':\n",
@@ -164,20 +108,66 @@
     "    return f'Uncharacterised protein {gene_id}'"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;\n",
+    "then set that as the index of the dataframe."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'\n",
-    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'\n",
+    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
+    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)\n",
+    "\n",
+    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)\n",
+    "emapper.set_index('gene', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n",
+    "gene symbol (name) in some form:\n",
+    "\n",
+    "- Gene: `name=Hox3;gene_name=Hox3`\n",
+    "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n",
+    "- CDS: `gene_name=Hox3`\n",
+    "- exon: `gene_name=Hox3`\n",
+    "\n",
+    "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n",
+    "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n",
+    "`CellRanger`) still have access to the functional annotation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'\n",
+    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'\n",
     "\n",
     "with open(gff_loc, 'r') as gff:\n",
     "    with open(named_loc, 'w') as named:\n",
     "        gene = ''\n",
     "        mRNA = ''\n",
-    "        for line in gff:\n",
+    "        for line in tqdm(gff.readlines()):\n",
     "            line = line.strip()\n",
     "            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n",
     "            if not conditions_skip:\n",
@@ -186,12 +176,14 @@
     "                if feature_type == 'gene':\n",
     "                    gene = attributes['ID']\n",
     "                    name = find_protein(gene, emapper)\n",
-    "                    line = f'{line}name={name} (predicted)'\n",
+    "                    name = f'{name} (predicted)'\n",
+    "                    line = f'{line}name={name}'\n",
     "                if feature_type == 'mRNA':\n",
     "                    mRNA = attributes['ID']\n",
     "                    isoform = mRNA.split('.')[-1]\n",
-    "                    line = f'{line}name={name} (predicted) isoform {isoform}'\n",
-    "\n",
+    "                    line = f'{line}name={name} isoform {isoform};gene_name={name}'\n",
+    "                if feature_type == 'CDS' or feature_type == 'exon':\n",
+    "                    line = f'{line}gene_name={name}'\n",
     "            named.write(line + '\\n')"
    ]
   }
-- 
GitLab