include emapper description, use ENA specification

4c3735a2 · Niko (Nikolaos) Papadopoulos · 5a3aa67f · 4c3735a2
Commit 4c3735a2 authored 7 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -34,7 +34,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -62,7 +62,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -82,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -104,10 +104,38 @@
    "    if gene_id in lookup.index:\n",
    "        name = lookup.loc[gene_id]['Preferred_name']\n",
    "        if name != '-':\n",
-    "            return name\n",
+    "            return f'{name} (predicted)'\n",
    "    return f'Uncharacterised protein {gene_id}'"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_description(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    \"\"\"Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        the description (if available); will return None if no description is available.\n",
+    "    \"\"\"\n",
+    "    if gene_id in lookup.index:\n",
+    "        description = lookup.loc[gene_id]['Description']\n",
+    "        if description != '-':\n",
+    "            return description\n",
+    "    return None"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -118,7 +146,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -136,14 +164,15 @@
    "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n",
    "gene symbol (name) in some form:\n",
    "\n",
-    "- Gene: `name=Hox3;gene_name=Hox3`\n",
-    "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n",
-    "- CDS: `gene_name=Hox3`\n",
-    "- exon: `gene_name=Hox3`\n",
+    "- Gene: `gene=Hox3;product=Hox3`\n",
+    "- mRNA: `product=Hox3 isoform 1;gene=Hox3`\n",
+    "- CDS: `gene=Hox3;product=Hox3`\n",
+    "- exon: `gene=Hox3;product=Hox3`\n",
    "\n",
    "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n",
-    "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n",
-    "`CellRanger`) still have access to the functional annotation."
+    "include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep\n",
+    "parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems\n",
+    "to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons."
   ]
  },
  {
@@ -168,17 +197,32 @@
    "            if not conditions_skip:\n",
    "                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n",
    "                attributes = parse_attributes(attributes)\n",
+    "\n",
    "                if feature_type == 'gene':\n",
    "                    gene = attributes['ID']\n",
    "                    name = find_protein(gene, emapper)\n",
-    "                    name = f'{name} (predicted)'\n",
-    "                    line = f'{line};name={name}'\n",
+    "                    line = f'{line};gene={name}'\n",
+    "                    description = find_description(gene, emapper)\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
+    "\n",
    "                if feature_type == 'mRNA':\n",
    "                    mRNA = attributes['ID']\n",
    "                    isoform = mRNA.split('.')[-1]\n",
-    "                    line = f'{line};name={name} isoform {isoform};gene_name={name}'\n",
+    "                    line = f'{line};gene={name};product={name} isoform {isoform}'\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
+    "\n",
    "                if feature_type == 'CDS' or feature_type == 'exon':\n",
-    "                    line = f'{line};gene_name={name}'\n",
+    "                    line = f'{line};gene={name};product={name};'\n",
+    "                    if description:\n",
+    "                        line = f'{line};function={description}'\n",
+    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
+    "                        line = f'{line};note=source:EggNOG-mapper'\n",
    "            named.write(line + '\\n')"
   ]
  }

 %% Cell type:markdown id: tags:

 # Adding functional annotation from EggNOG-mapper


 %% Cell type:code id: tags:

 ``` python
 # from tqdm import tqdm # install it for nice progress bars

 import pandas as pd
 ```

 %% Cell type:markdown id: tags:

 ### util functions

 We are going to need four helper functions:

 - extract the gene ID from the `#query` field of the EggNOG-mapper output
 - break up the content of the attributes field of the GFF file into a dictionary
 - find the correct protein name for a gene ID
 - retrieve the EggNOG-mapper description for a gene ID

 %% Cell type:code id: tags:

 ``` python
 def parse_gene_id(x):
    """Extract the gene ID from an eggNOG-mapper protein ID.

    Parameters
    ----------
    x : str
        A protein ID from the eggNOG-mapper output.

    Returns
    -------
    str
        The gene ID in the format 'PB.X' (PacBio genes), 'gX' (BRAKER
        round 1), 'r2_gX' (BRAKER round 2) or 'at_DNX' (de-novo
        transcriptome-assembled genes).

    Raises
    ------
    ValueError
        If `x` matches none of the known ID formats.
    """
    if 'PB' in x:
        # PacBio-style IDs: the gene ID is the first two dot-separated fields
        parts = x.split('.')
        return '.'.join(parts[:2])
    if x.startswith(('r2', 'g', 'at')):
        # every other known format: the gene ID is everything before the first dot
        return x.split('.')[0]
    # Raise (rather than return) the error so that an unrecognised ID cannot
    # silently end up as a value in the gene column / dataframe index.
    raise ValueError(f'Unknown gene ID format: {x!r}')
 ```

 %% Cell type:code id: tags:

 ``` python
 def parse_attributes(x):
    '''Parse a semicolon-separated string of `key=value` pairs into a dictionary.

    Parameters
    ----------
    x : str
        a semicolon-separated string that holds attributes

    Returns
    -------
    dict
        maps every attribute key to its value. Only the first `=` of a field
        separates key and value, so values that themselves contain `=` are
        kept intact. Empty fields (e.g. from a trailing or doubled `;`) are
        ignored. If a key occurs more than once, the last value wins.

    Raises
    ------
    IndexError
        If a non-empty field contains no `=` at all.
    '''
    parsed = {}
    for field in x.split(';'):
        if not field:
            # trailing ';' or ';;' -- nothing to record
            continue
        key, sep, value = field.partition('=')
        if not sep:
            # same exception type that indexing the split result used to give
            raise IndexError(f'attribute {field!r} is not of the form key=value')
        parsed[key] = value
    return parsed
 ```

 %% Cell type:code id: tags:

 ``` python
 def find_protein(gene_id, lookup): # expects a protein-coding gene as input
    """Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).

    Parameters
    ----------
    gene_id : str
        a P. litorale protein-coding gene ID
    lookup : pd.DataFrame
        the eggNOG-mapper output file, filtered to maximum one entry per gene ID

    Returns
    -------
    str
        the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available.
    """
    if gene_id in lookup.index:
        name = lookup.loc[gene_id]['Preferred_name']
        if name != '-':
-            return name
+            return f'{name} (predicted)'
    return f'Uncharacterised protein {gene_id}'
 ```

+%% Cell type:code id: tags:
+
+``` python
+def find_description(gene_id, lookup): # expects a protein-coding gene as input
+    """Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).
+
+    Parameters
+    ----------
+    gene_id : str
+        a P. litorale protein-coding gene ID
+    lookup : pd.DataFrame
+        the eggNOG-mapper output file, filtered to maximum one entry per gene ID
+
+    Returns
+    -------
+    str
+        the description (if available); will return None if no description is available.
+    """
+    if gene_id in lookup.index:
+        description = lookup.loc[gene_id]['Description']
+        if description != '-':
+            return description
+    return None
+```
+
 %% Cell type:markdown id: tags:

 Read the emapper output (filtered to the best hit per gene ID), extract the gene ID into a new
 column, and then set that column as the index of the dataframe.

 %% Cell type:code id: tags:

 ``` python
 best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
 emapper = pd.read_csv(best_per_gene, sep='\t', header=0)

 emapper['gene'] = emapper['#query'].apply(parse_gene_id)
 emapper.set_index('gene', inplace=True)
 ```

 %% Cell type:markdown id: tags:

 Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
 gene symbol (name) in some form:

- Gene: `name=Hox3;gene_name=Hox3`
- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`
- CDS: `gene_name=Hox3`
- exon: `gene_name=Hox3`
+- Gene: `gene=Hox3;product=Hox3`
+- mRNA: `product=Hox3 isoform 1;gene=Hox3`
+- CDS: `gene=Hox3;product=Hox3`
+- exon: `gene=Hox3;product=Hox3`

 All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should
-include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like
-`CellRanger`) still have access to the functional annotation.
+include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep
+parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems
+to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons.

 %% Cell type:code id: tags:

 ``` python
 gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
 named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'

 with open(gff_loc, 'r') as gff:
    with open(named_loc, 'w') as named:
        gene = ''
        mRNA = ''
        # for line in tqdm(gff.readlines()):
        for line in gff.readlines():
            line = line.strip()
            if line[-1] == ';':
                line = line[:-1]
            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
            if not conditions_skip:
                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
                attributes = parse_attributes(attributes)
+
                if feature_type == 'gene':
                    gene = attributes['ID']
                    name = find_protein(gene, emapper)
-                    name = f'{name} (predicted)'
-                    line = f'{line};name={name}'
+                    line = f'{line};gene={name}'
+                    description = find_description(gene, emapper)
+                    if description:
+                        line = f'{line};function={description}'
+                    if name != f'Uncharacterised protein {gene}' or description:
+                        line = f'{line};note=source:EggNOG-mapper'
+
                if feature_type == 'mRNA':
                    mRNA = attributes['ID']
                    isoform = mRNA.split('.')[-1]
-                    line = f'{line};name={name} isoform {isoform};gene_name={name}'
+                    line = f'{line};gene={name};product={name} isoform {isoform}'
+                    if description:
+                        line = f'{line};function={description}'
+                    if name != f'Uncharacterised protein {gene}' or description:
+                        line = f'{line};note=source:EggNOG-mapper'
+
                if feature_type == 'CDS' or feature_type == 'exon':
-                    line = f'{line};gene_name={name}'
+                    line = f'{line};gene={name};product={name};'
+                    if description:
+                        line = f'{line};function={description}'
+                    if name != f'Uncharacterised protein {gene}' or description:
+                        line = f'{line};note=source:EggNOG-mapper'
            named.write(line + '\n')
 ```