adding more information to GFF and EMBL file

aa0a40fd · Niko (Nikolaos) Papadopoulos · 8b91326f · aa0a40fd · aa0a40fd
Commit aa0a40fd authored 6 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -136,6 +136,55 @@
    "    return None"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_EC(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    \"\"\"Retrieve the EC number assigned by EggNOG-mapper for a certain gene ID.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    str\n",
+    "        the EC number (if available) for the current gene (product).\n",
+    "    \"\"\"\n",
+    "    if gene_id in lookup.index:\n",
+    "        ec = lookup.loc[gene_id]['EC']\n",
+    "        if ec != '-':\n",
+    "            return ec.split(',')\n",
+    "    return None\n",
+    "\n",
+    "def find_PFAMs(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    \"\"\"Retrieve the PFAM domains assigned by EggNOG-mapper for a certain gene ID.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    gene_id : str\n",
+    "        a P. litorale protein-coding gene ID\n",
+    "    lookup : pd.DataFrame\n",
+    "        the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list\n",
+    "        a list of PFAM domains (if available) associated with the gene (product).\n",
+    "    \"\"\"\n",
+    "    if gene_id in lookup.index:\n",
+    "        pfam = lookup.loc[gene_id]['PFAMs']\n",
+    "        if pfam != '-':\n",
+    "            return pfam.split(',')\n",
+    "    return None"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -157,6 +206,15 @@
    "emapper.set_index('gene', inplace=True)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "emapper.columns"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -203,6 +261,8 @@
    "                    name = find_protein(gene, emapper)\n",
    "                    line = f'{line};gene={name}'\n",
    "                    description = find_description(gene, emapper)\n",
+    "                    ec_list = find_EC(gene, emapper)\n",
+    "                    pfams = find_PFAMs(gene, emapper)\n",
    "                    if description:\n",
    "                        line = f'{line};function={description}'\n",
    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
@@ -223,6 +283,12 @@
    "                        line = f'{line};function={description}'\n",
    "                    if name != f'Uncharacterised protein {gene}' or description:\n",
    "                        line = f'{line};note=function predicted by EggNOG-mapper'\n",
+    "                    if ec_list is not None:\n",
+    "                        for ec in ec_list:\n",
+    "                            line = f'{line};EC_number=\"{ec}\"'\n",
+    "                    if pfams is not None:\n",
+    "                        for pfam in pfams:\n",
+    "                            line = f'{line};Dbxref=\"PFAM:{pfam}\"'\n",
    "            named.write(line + '\\n')"
   ]
  }

 %% Cell type:markdown id: tags:

 # Adding functional annotation from EggNOG-mapper


 %% Cell type:code id: tags:

 ``` python
 # from tqdm import tqdm # install it for nice progress bars

 import pandas as pd
 ```

 %% Cell type:markdown id: tags:

 ### util functions

 We are going to need three helper functions:

 - extract the gene ID from the `#query` field of the EggNOG-mapper output
 - break up the content of the attributes field of the GFF file into a dictionary
 - find the correct protein name for a gene ID

 %% Cell type:code id: tags:

 ``` python
 def parse_gene_id(x):
    """Extract gene ID from a string

    Parameters
    ----------
    x : str
        A protein ID from the eggNOG-mapper output.

    Returns
    -------
    str
        will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'
    """
    if 'PB' in x:
        parts = x.split('.')
        return '.'.join(parts[:2])
    elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):
        return x.split('.')[0]
    else:
        return ValueError('Unknown gene ID format')
 ```

 %% Cell type:code id: tags:

 ``` python
 def parse_attributes(x):
    '''Parses a semi-colon separated string into a dictionary

    Parameters
    ----------
    x : str
        a semicolon-separated string that holds attributes
    '''
    attributes = x.split(';')
    if attributes[-1] == '':
        attributes.pop()
    return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}
 ```

 %% Cell type:code id: tags:

 ``` python
 def find_protein(gene_id, lookup): # expects a protein-coding gene as input
    """Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).

    Parameters
    ----------
    gene_id : str
        a P. litorale protein-coding gene ID
    lookup : pd.DataFrame
        the eggNOG-mapper output file, filtered to maximum one entry per gene ID

    Returns
    -------
    str
        the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available.
    """
    if gene_id in lookup.index:
        name = lookup.loc[gene_id]['Preferred_name']
        if name != '-':
            return f'{name} (predicted)'
    return f'Uncharacterised protein {gene_id}'
 ```

 %% Cell type:code id: tags:

 ``` python
 def find_description(gene_id, lookup): # expects a protein-coding gene as input
    """Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).

    Parameters
    ----------
    gene_id : str
        a P. litorale protein-coding gene ID
    lookup : pd.DataFrame
        the eggNOG-mapper output file, filtered to maximum one entry per gene ID

    Returns
    -------
    str
        the description (if available); will return None if no description is available.
    """
    if gene_id in lookup.index:
        description = lookup.loc[gene_id]['Description']
        if description != '-':
            return description
    return None
 ```

+%% Cell type:code id: tags:
+
+``` python
+def find_EC(gene_id, lookup): # expects a protein-coding gene as input
+    """Retrieve the EC number assigned by EggNOG-mapper for a certain gene ID.
+
+    Parameters
+    ----------
+    gene_id : str
+        a P. litorale protein-coding gene ID
+    lookup : pd.DataFrame
+        the eggNOG-mapper output file, filtered to maximum one entry per gene ID
+
+    Returns
+    -------
+    str
+        the EC number (if available) for the current gene (product).
+    """
+    if gene_id in lookup.index:
+        ec = lookup.loc[gene_id]['EC']
+        if ec != '-':
+            return ec.split(',')
+    return None
+
+def find_PFAMs(gene_id, lookup): # expects a protein-coding gene as input
+    """Retrieve the PFAM domains assigned by EggNOG-mapper for a certain gene ID.
+
+    Parameters
+    ----------
+    gene_id : str
+        a P. litorale protein-coding gene ID
+    lookup : pd.DataFrame
+        the eggNOG-mapper output file, filtered to maximum one entry per gene ID
+
+    Returns
+    -------
+    list
+        a list of PFAM domains (if available) associated with the gene (product).
+    """
+    if gene_id in lookup.index:
+        pfam = lookup.loc[gene_id]['PFAMs']
+        if pfam != '-':
+            return pfam.split(',')
+    return None
+```
+
 %% Cell type:markdown id: tags:

 Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;
 then set that as the index of the dataframe.

 %% Cell type:code id: tags:

 ``` python
 best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
 emapper = pd.read_csv(best_per_gene, sep='\t', header=0)

 emapper['gene'] = emapper['#query'].apply(parse_gene_id)
 emapper.set_index('gene', inplace=True)
 ```

+%% Cell type:code id: tags:
+
+``` python
+emapper.columns
+```
+
 %% Cell type:markdown id: tags:

 Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
 gene symbol (name) in some form:

 - Gene: `gene=Hox3;product=Hox3`
 - mRNA: `product=Hox3 isoform 1;gene=Hox3`
 - CDS: `gene=Hox3;product=Hox3`
 - exon: `gene=Hox3;product=Hox3`

 All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should
 include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep
 parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems
 to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons.

 %% Cell type:code id: tags:

 ``` python
 gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
 named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'

 with open(gff_loc, 'r') as gff:
    with open(named_loc, 'w') as named:
        gene = ''
        mRNA = ''
        # for line in tqdm(gff.readlines()):
        for line in gff.readlines():
            line = line.strip()
            if line[-1] == ';':
                line = line[:-1]
            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
            if not conditions_skip:
                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
                attributes = parse_attributes(attributes)

                if feature_type == 'gene':
                    gene = attributes['ID']
                    name = find_protein(gene, emapper)
                    line = f'{line};gene={name}'
                    description = find_description(gene, emapper)
+                    ec_list = find_EC(gene, emapper)
+                    pfams = find_PFAMs(gene, emapper)
                    if description:
                        line = f'{line};function={description}'
                    if name != f'Uncharacterised protein {gene}' or description:
                        line = f'{line};note=function predicted by EggNOG-mapper'

                if feature_type == 'mRNA':
                    mRNA = attributes['ID']
                    isoform = mRNA.split('.')[-1]
                    line = f'{line};gene={name};product={name} isoform {isoform}'
                    if description:
                        line = f'{line};function={description}'
                    if name != f'Uncharacterised protein {gene}' or description:
                        line = f'{line};note=function predicted by EggNOG-mapper'

                if feature_type == 'CDS' or feature_type == 'exon':
                    line = f'{line};gene={name};product={name}'
                    if description:
                        line = f'{line};function={description}'
                    if name != f'Uncharacterised protein {gene}' or description:
                        line = f'{line};note=function predicted by EggNOG-mapper'
+                    if ec_list is not None:
+                        for ec in ec_list:
+                            line = f'{line};EC_number="{ec}"'
+                    if pfams is not None:
+                        for pfam in pfams:
+                            line = f'{line};Dbxref="PFAM:{pfam}"'
            named.write(line + '\n')
 ```

--- a/08-submission/gff-04-convert_to_embl.sh
+++ b/08-submission/gff-04-convert_to_embl.sh
@@ -23,4 +23,6 @@ EMBLmyGFF3 $GFF $GENOME \
        -v \
        -o result.embl

-gzip result.embl
\ No newline at end of file
+# when zipping: overwrite file if it already exists,
+# else the command will hang while waiting for confirmation
+gzip -f result.embl
\ No newline at end of file