add eggNOG-mapper preferred name to GFF

fdb02d43 · Niko (Nikolaos) Papadopoulos · 500a24a6 · fdb02d43
Commit fdb02d43 authored 7 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -14,27 +14,43 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
-    "import numpy as np"
+    "\n",
+    "import pandas as pd"
   ]
  },
  {
-   "cell_type": "code",
+   "cell_type": "markdown",
-   "execution_count": null,
   "metadata": {},
-   "outputs": [],
   "source": [
-    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
+    "### util functions\n",
-    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)"
+    "\n",
+    "We are going to need three helper functions:\n",
+    "\n",
+    "- extract the gene ID from the `#query` field of the EggNOG-mapper output\n",
+    "- break up the content of the attributes field of the GFF file into a dictionary\n",
+    "- find the correct protein name for a gene ID"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_gene_id(x):\n",
+    "    \"\"\"Extract gene ID from a string\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x : str\n",
+    "        A protein ID from the eggNOG-mapper output.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'\n",
+    "    \"\"\"\n",
    "    if 'PB' in x:\n",
    "        parts = x.split('.')\n",
    "        return '.'.join(parts[:2])\n",
@@ -44,75 +60,9 @@
    "        return ValueError('Unknown gene ID format')"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "apply the parsing on each row..."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)"
-   ]
-  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "emapper.set_index('gene', inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#query                                                    PB.3.1.p1\n",
-       "seed_ortholog                                       136037.KDR12978\n",
-       "evalue                                                          0.0\n",
-       "score                                                         570.0\n",
-       "eggNOG_OGs        COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...\n",
-       "max_annot_lvl                                         33208|Metazoa\n",
-       "COG_category                                                     EF\n",
-       "Description       ribose phosphate diphosphokinase activity. It ...\n",
-       "Preferred_name                                              PRPSAP1\n",
-       "GOs               GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...\n",
-       "EC                                                                -\n",
-       "KEGG_ko                                                           -\n",
-       "KEGG_Pathway                                                      -\n",
-       "KEGG_Module                                                       -\n",
-       "KEGG_Reaction                                                     -\n",
-       "KEGG_rclass                                                       -\n",
-       "BRITE                                                             -\n",
-       "KEGG_TC                                                           -\n",
-       "CAZy                                                              -\n",
-       "BiGG_Reaction                                                     -\n",
-       "PFAMs                                 Pribosyl_synth,Pribosyltran_N\n",
-       "Name: PB.3, dtype: object"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "emapper.loc['PB.3']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -132,52 +82,92 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
-    {
+   "source": [
-     "data": {
+    "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n",
-      "text/plain": [
+    "    \"\"\"Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n",
-       "'-'"
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        the gene symbol (if available) or \"Uncharacterised protein {gene_id}\" if no gene symbol is available.\n",
+    "    \"\"\"\n",
+    "    if gene_id in lookup.index:\n",
+    "        name = lookup.loc[gene_id]['Preferred_name']\n",
+    "        if name != '-':\n",
+    "            return name\n",
+    "    return f'Uncharacterised protein {gene_id}'"
   ]
  },
-     "execution_count": 8,
+  {
+   "cell_type": "markdown",
   "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
-    "emapper.loc['PB.1']['Preferred_name']"
+    "Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;\n",
+    "then set that as the index of the dataframe."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
-    "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
-    "    if gene_id in lookup.index:\n",
+    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)\n",
-    "        name = lookup.loc[gene_id]['Preferred_name']\n",
+    "\n",
-    "        if name != '-':\n",
+    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)\n",
-    "            return name\n",
+    "emapper.set_index('gene', inplace=True)"
-    "    return f'Uncharacterised protein {gene_id}'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n",
+    "gene symbol (name) in some form:\n",
+    "\n",
+    "- Gene: `name=Hox3;gene_name=Hox3`\n",
+    "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n",
+    "- CDS: `gene_name=Hox3`\n",
+    "- exon: `gene_name=Hox3`\n",
+    "\n",
+    "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n",
+    "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n",
+    "`CellRanger`) still have access to the functional annotation."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]\n"
+     ]
+    }
+   ],
   "source": [
-    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'\n",
+    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'\n",
-    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'\n",
+    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'\n",
    "\n",
    "with open(gff_loc, 'r') as gff:\n",
    "    with open(named_loc, 'w') as named:\n",
    "        gene = ''\n",
    "        mRNA = ''\n",
-    "        for line in gff:\n",
+    "        for line in tqdm(gff.readlines()):\n",
    "            line = line.strip()\n",
    "            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n",
    "            if not conditions_skip:\n",
@@ -186,12 +176,14 @@
    "                if feature_type == 'gene':\n",
    "                    gene = attributes['ID']\n",
    "                    name = find_protein(gene, emapper)\n",
-    "                    line = f'{line}name={name} (predicted)'\n",
+    "                    name = f'{name} (predicted)'\n",
+    "                    line = f'{line}name={name}'\n",
    "                if feature_type == 'mRNA':\n",
    "                    mRNA = attributes['ID']\n",
    "                    isoform = mRNA.split('.')[-1]\n",
-    "                    line = f'{line}name={name} (predicted) isoform {isoform}'\n",
+    "                    line = f'{line}name={name} isoform {isoform};gene_name={name}'\n",
-    "\n",
+    "                if feature_type == 'CDS' or feature_type == 'exon':\n",
+    "                    line = f'{line}gene_name={name}'\n",
    "            named.write(line + '\\n')"
   ]
  }

 %% Cell type:markdown id: tags:
 # Adding functional annotation from EggNOG-mapper
 %% Cell type:code id: tags:
 ``` python
+from tqdm import tqdm
 import pandas as pd
-import numpy as np
 ```
-%% Cell type:code id: tags:
+%% Cell type:markdown id: tags:
-``` python
+### util functions
-best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'
-emapper = pd.read_csv(best_per_gene, sep='\t', header=0)
+We are going to need three helper functions:
-```
+- extract the gene ID from the `#query` field of the EggNOG-mapper output
+- break up the content of the attributes field of the GFF file into a dictionary
+- find the correct protein name for a gene ID
 %% Cell type:code id: tags:
 ``` python
 def parse_gene_id(x):
+    """Extract gene ID from a string
+    Parameters
+    ----------
+    x : str
+        A protein ID from the eggNOG-mapper output.
+    Returns
+    -------
+    str
+        will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'
+    """
    if 'PB' in x:
        parts = x.split('.')
        return '.'.join(parts[:2])
    elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):
        return x.split('.')[0]
    else:
        return ValueError('Unknown gene ID format')
 ```
-%% Cell type:markdown id: tags:
-apply the parsing on each row...
-%% Cell type:code id: tags:
-``` python
-emapper['gene'] = emapper['#query'].apply(parse_gene_id)
-```
-%% Cell type:code id: tags:
-``` python
-emapper.set_index('gene', inplace=True)
-```
-%% Cell type:code id: tags:
-``` python
-emapper.loc['PB.3']
-```
-%% Output
-    #query                                                    PB.3.1.p1
-    seed_ortholog                                       136037.KDR12978
-    evalue                                                          0.0
-    score                                                         570.0
-    eggNOG_OGs        COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...
-    max_annot_lvl                                         33208|Metazoa
-    COG_category                                                     EF
-    Description       ribose phosphate diphosphokinase activity. It ...
-    Preferred_name                                              PRPSAP1
-    GOs               GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...
-    EC                                                                -
-    KEGG_ko                                                           -
-    KEGG_Pathway                                                      -
-    KEGG_Module                                                       -
-    KEGG_Reaction                                                     -
-    KEGG_rclass                                                       -
-    BRITE                                                             -
-    KEGG_TC                                                           -
-    CAZy                                                              -
-    BiGG_Reaction                                                     -
-    PFAMs                                 Pribosyl_synth,Pribosyltran_N
-    Name: PB.3, dtype: object
 %% Cell type:code id: tags:
 ``` python
 def parse_attributes(x):
    '''Parses a semi-colon separated string into a dictionary
    Parameters
    ----------
    x : str
        a semicolon-separated string that holds attributes
    '''
    attributes = x.split(';')
    if attributes[-1] == '':
        attributes.pop()
    return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}
 ```
 %% Cell type:code id: tags:
 ``` python
-emapper.loc['PB.1']['Preferred_name']
+def find_protein(gene_id, lookup): # expects a protein-coding gene as input
+    """Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).
+    Parameters
+    ----------
+    gene_id : str
+        a P. litorale protein-coding gene ID
+    lookup : pd.DataFrame
+        the eggNOG-mapper output file, filtered to maximum one entry per gene ID
+    Returns
+    -------
+    str
+        the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available.
+    """
+    if gene_id in lookup.index:
+        name = lookup.loc[gene_id]['Preferred_name']
+        if name != '-':
+            return name
+    return f'Uncharacterised protein {gene_id}'
 ```
-%% Output
+%% Cell type:markdown id: tags:
-    '-'
+Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;
+then set that as the index of the dataframe.
 %% Cell type:code id: tags:
 ``` python
-def find_protein(gene_id, lookup): # expects a protein-coding gene as input
+best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'
-    if gene_id in lookup.index:
+emapper = pd.read_csv(best_per_gene, sep='\t', header=0)
-        name = lookup.loc[gene_id]['Preferred_name']
-        if name != '-':
+emapper['gene'] = emapper['#query'].apply(parse_gene_id)
-            return name
+emapper.set_index('gene', inplace=True)
-    return f'Uncharacterised protein {gene_id}'
 ```
+%% Cell type:markdown id: tags:
+Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
+gene symbol (name) in some form:
+- Gene: `name=Hox3;gene_name=Hox3`
+- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`
+- CDS: `gene_name=Hox3`
+- exon: `gene_name=Hox3`
+All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should
+include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like
+`CellRanger`) still have access to the functional annotation.
 %% Cell type:code id: tags:
 ``` python
-gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'
+gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'
-named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'
+named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'
 with open(gff_loc, 'r') as gff:
    with open(named_loc, 'w') as named:
        gene = ''
        mRNA = ''
-        for line in gff:
+        for line in tqdm(gff.readlines()):
            line = line.strip()
            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
            if not conditions_skip:
                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
                attributes = parse_attributes(attributes)
                if feature_type == 'gene':
                    gene = attributes['ID']
                    name = find_protein(gene, emapper)
-                    line = f'{line}name={name} (predicted)'
+                    name = f'{name} (predicted)'
+                    line = f'{line}name={name}'
                if feature_type == 'mRNA':
                    mRNA = attributes['ID']
                    isoform = mRNA.split('.')[-1]
-                    line = f'{line}name={name} (predicted) isoform {isoform}'
+                    line = f'{line}name={name} isoform {isoform};gene_name={name}'
+                if feature_type == 'CDS' or feature_type == 'exon':
+                    line = f'{line}gene_name={name}'
            named.write(line + '\n')
 ```
+%% Output
+    100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]