wrapper for functional input notebook; fixed double/missing semicolons

0bdd277c · Niko (Nikolaos) Papadopoulos · 1abeee81 · 0bdd277c · 0bdd277c
Commit 0bdd277c authored 4 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -148,7 +148,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -162,6 +162,8 @@
    "        # for line in tqdm(gff.readlines()):\n",
    "        for line in gff.readlines():\n",
    "            line = line.strip()\n",
+    "            if line[-1] == ';':\n",
+    "                line = line[:-1]\n",
    "            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n",
    "            if not conditions_skip:\n",
    "                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n",
@@ -170,13 +172,13 @@
    "                    gene = attributes['ID']\n",
    "                    name = find_protein(gene, emapper)\n",
    "                    name = f'{name} (predicted)'\n",
-    "                    line = f'{line}name={name}'\n",
+    "                    line = f'{line};name={name}'\n",
    "                if feature_type == 'mRNA':\n",
    "                    mRNA = attributes['ID']\n",
    "                    isoform = mRNA.split('.')[-1]\n",
-    "                    line = f'{line}name={name} isoform {isoform};gene_name={name}'\n",
+    "                    line = f'{line};name={name} isoform {isoform};gene_name={name}'\n",
    "                if feature_type == 'CDS' or feature_type == 'exon':\n",
-    "                    line = f'{line}gene_name={name}'\n",
+    "                    line = f'{line};gene_name={name}'\n",
    "            named.write(line + '\\n')"
   ]
  }

 %% Cell type:markdown id: tags:
 # Adding functional annotation from EggNOG-mapper
 %% Cell type:code id: tags:
 ``` python
 # from tqdm import tqdm # install it for nice progress bars
 import pandas as pd
 ```
 %% Cell type:markdown id: tags:
 ### util functions
 We are going to need three helper functions:
 - extract the gene ID from the `#query` field of the EggNOG-mapper output
 - break up the content of the attributes field of the GFF file into a dictionary
 - find the correct protein name for a gene ID
 %% Cell type:code id: tags:
 ``` python
 def parse_gene_id(x):
    """Extract the gene ID from an eggNOG-mapper protein ID.

    Parameters
    ----------
    x : str
        A protein ID from the eggNOG-mapper output.

    Returns
    -------
    str
        The gene ID in the format 'PB.X' (PacBio genes), 'gX' (BRAKER round 1),
        'r2_gX' (BRAKER round 2) or 'at_DNX' (de-novo transcriptome-assembled genes).

    Raises
    ------
    ValueError
        If `x` matches none of the known gene ID formats.
    """
    if 'PB' in x:
        # PacBio gene IDs contain a dot themselves ('PB.X'), so keep the first two fields
        return '.'.join(x.split('.')[:2])
    if x.startswith(('r2', 'g', 'at')):
        # everything before the first dot is the gene ID
        return x.split('.')[0]
    # raise (instead of returning the exception object) so that an unexpected ID
    # cannot silently end up as a value in the gene column
    raise ValueError(f'Unknown gene ID format: {x}')
 ```
 %% Cell type:code id: tags:
 ``` python
 def parse_attributes(x):
    '''Parse a semicolon-separated string of key=value pairs into a dictionary.

    Empty segments (produced by a trailing or a doubled semicolon) are ignored.
    Only the first '=' of a segment separates key and value, so values may
    themselves contain '='.

    Parameters
    ----------
    x : str
        a semicolon-separated string that holds attributes

    Returns
    -------
    dict
        maps every attribute key to its value (both str)

    Raises
    ------
    ValueError
        if a non-empty segment does not contain '='
    '''
    attributes = {}
    for attr in x.split(';'):
        if attr == '':
            # artefact of a trailing or doubled semicolon
            continue
        key, separator, value = attr.partition('=')
        if not separator:
            raise ValueError(f'Attribute without "=": {attr}')
        attributes[key] = value
    return attributes
 ```
 %% Cell type:code id: tags:
 ``` python
 def find_protein(gene_id, lookup):
    """Return the protein name to use for a protein-coding gene.

    Only meant for protein-coding genes; tRNA or rRNA genes will not be handled correctly.

    Parameters
    ----------
    gene_id : str
        a P. litorale protein-coding gene ID
    lookup : pd.DataFrame
        the eggNOG-mapper output, indexed by gene ID and filtered to at most one entry per gene ID

    Returns
    -------
    str
        the gene symbol from the 'Preferred_name' column, or
        "Uncharacterised protein {gene_id}" when the gene is not in the table or has no symbol.
    """
    fallback = f'Uncharacterised protein {gene_id}'
    # genes that are absent from the table have no functional annotation at all
    if gene_id not in lookup.index:
        return fallback
    record = lookup.loc[gene_id]
    symbol = record['Preferred_name']
    # '-' is treated as "no gene symbol available"
    if symbol == '-':
        return fallback
    return symbol
 ```
 %% Cell type:markdown id: tags:
 Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;
 then set that as the index of the dataframe.
 %% Cell type:code id: tags:
 ``` python
 # eggNOG-mapper annotations, already filtered to the best hit per gene ID
 best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
 # tab-separated table whose first row holds the column names
 emapper = pd.read_csv(best_per_gene, sep='\t', header=0)
 # derive the gene ID from every query ID and use it as the row index
 emapper = emapper.assign(gene=emapper['#query'].apply(parse_gene_id)).set_index('gene')
 ```
 %% Cell type:markdown id: tags:
 Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
 gene symbol (name) in some form; every name carries the suffix ` (predicted)`:
 - Gene: `name=Hox3 (predicted)`
 - mRNA: `name=Hox3 (predicted) isoform 1;gene_name=Hox3 (predicted)`
 - CDS: `gene_name=Hox3 (predicted)`
 - exon: `gene_name=Hox3 (predicted)`
 All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDSs should
 include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like
 `CellRanger`) still have access to the functional annotation.
 %% Cell type:code id: tags:
 ``` python
 gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
 named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'
 # Copy the GFF line by line, appending name attributes to gene, mRNA, CDS and exon entries.
 with open(gff_loc, 'r') as gff, open(named_loc, 'w') as named:
    gene = ''
    mRNA = ''
    # iterate over the file handle lazily instead of loading every line with readlines()
    # for line in tqdm(gff):
    for line in gff:
        # Remove surrounding whitespace and all trailing semicolons, so that appending
        # ';key=value' below never yields a double semicolon. Unlike indexing line[-1],
        # this is also safe for blank lines.
        line = line.strip().rstrip(';')
        # blank lines, comment lines, tRNA entries and entries that already carry a name
        # are written back without decoration
        conditions_skip = not line or line.startswith('#') or 'tRNA' in line or 'name=' in line
        if not conditions_skip:
            # the 9-way unpack doubles as a check that the line has exactly nine columns
            seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
            attributes = parse_attributes(attributes)
            if feature_type == 'gene':
                gene = attributes['ID']
                name = find_protein(gene, emapper)
                name = f'{name} (predicted)'
                line = f'{line};name={name}'
            elif feature_type == 'mRNA':
                # `name` was set by the most recent gene line, so a gene has to precede its mRNAs
                mRNA = attributes['ID']
                # the isoform is whatever follows the last dot of the mRNA ID
                isoform = mRNA.split('.')[-1]
                line = f'{line};name={name} isoform {isoform};gene_name={name}'
            elif feature_type in ('CDS', 'exon'):
                line = f'{line};gene_name={name}'
        named.write(line + '\n')
 ```

--- a/08-submission/gff-02-functional_annot.sh
+++ b/08-submission/gff-02-functional_annot.sh
+#!/usr/bin/env bash
+# wrapper script for the functional annotation of the GFF3 file, which is a jupyter notebook
+module load conda
+conda activate jupyterhub-5.2.1
+# a conda environment that has pandas and can run jupyter notebooks
+jupyter execute gff-02-functional_annot.ipynb
\ No newline at end of file