now running on cluster

c33f2ad0 · Niko (Nikolaos) Papadopoulos · fdb02d43 · c33f2ad0
Commit c33f2ad0 authored 7 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/gff-02-functional_annot.ipynb
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -14,7 +14,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from tqdm import tqdm\n",
+    "# from tqdm import tqdm # install it for nice progress bars\n",
    "\n",
    "import pandas as pd"
   ]
@@ -118,11 +118,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
-    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
+    "best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'\n",
    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)\n",
    "\n",
    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)\n",
@@ -148,26 +148,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'\n",
-    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'\n",
+    "gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'\n",
+    "named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'\n",
    "\n",
    "with open(gff_loc, 'r') as gff:\n",
    "    with open(named_loc, 'w') as named:\n",
    "        gene = ''\n",
    "        mRNA = ''\n",
-    "        for line in tqdm(gff.readlines()):\n",
+    "        # for line in tqdm(gff.readlines()):\n",
+    "        for line in gff.readlines():\n",
    "            line = line.strip()\n",
    "            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n",
    "            if not conditions_skip:\n",
@@ -190,7 +183,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "ascc24",
+   "display_name": "jupyterhub-5.1.0",
   "language": "python",
   "name": "python3"
  },
@@ -204,7 +197,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.12.4"
  }
 },
 "nbformat": 4,

 %% Cell type:markdown id: tags:

 # Adding functional annotation from EggNOG-mapper


 %% Cell type:code id: tags:

 ``` python
-from tqdm import tqdm
+# from tqdm import tqdm # install it for nice progress bars

 import pandas as pd
 ```

 %% Cell type:markdown id: tags:

 ### util functions

 We are going to need three helper functions:

 - extract the gene ID from the `#query` field of the EggNOG-mapper output
 - break up the content of the attributes field of the GFF file into a dictionary
 - find the correct protein name for a gene ID

 %% Cell type:code id: tags:

 ``` python
 def parse_gene_id(x):
    """Extract the gene ID from an eggNOG-mapper protein ID.

    Parameters
    ----------
    x : str
        A protein ID from the eggNOG-mapper output.

    Returns
    -------
    str
        The gene ID in the format 'PB.X' (PacBio genes), 'gX' (BRAKER round 1),
        'r2_gX' (BRAKER round 2) or 'at_DNX' (de-novo transcriptome-assembled genes).

    Raises
    ------
    ValueError
        If `x` matches none of the known ID formats.
    """
    if 'PB' in x:
        # IDs containing 'PB' keep their first two dot-separated fields
        parts = x.split('.')
        return '.'.join(parts[:2])
    elif x.startswith(('r2', 'g', 'at')):
        # every other known format keeps only the text before the first dot
        return x.split('.')[0]
    else:
        # raise (not return) so an unknown ID cannot end up in the result as an exception object
        raise ValueError(f'Unknown gene ID format: {x!r}')
 ```

 %% Cell type:code id: tags:

 ``` python
 def parse_attributes(x):
    '''Parse a semicolon-separated attribute string into a dictionary.

    Parameters
    ----------
    x : str
        a semicolon-separated string of `key=value` attributes

    Returns
    -------
    dict
        maps each attribute key to its value. Only the first `=` in a field
        separates key from value, so a value may itself contain `=`.

    Raises
    ------
    IndexError
        if a field contains no `=` at all.
    '''
    attributes = x.split(';')
    # a trailing ';' leaves one empty field at the end; drop it
    if attributes[-1] == '':
        attributes.pop()
    parsed = {}
    for attr in attributes:
        # split on the first '=' only, so values containing '=' are not truncated
        pieces = attr.split('=', 1)
        parsed[pieces[0]] = pieces[1]
    return parsed
 ```

 %% Cell type:code id: tags:

 ``` python
 def find_protein(gene_id, lookup): # expects a protein-coding gene as input
    """Return the protein name to use for a gene ID.

    Meant for protein-coding genes only (tRNA or rRNA gene IDs will not be
    handled correctly).

    Parameters
    ----------
    gene_id : str
        a P. litorale protein-coding gene ID
    lookup : pd.DataFrame
        the eggNOG-mapper output, indexed by gene ID and holding at most one
        row per gene ID

    Returns
    -------
    str
        the value of the `Preferred_name` column for this gene, or
        "Uncharacterised protein {gene_id}" when the gene is absent from
        `lookup` or its `Preferred_name` is the placeholder '-'.
    """
    fallback = f'Uncharacterised protein {gene_id}'
    # genes without any eggNOG-mapper hit get the generic label
    if gene_id not in lookup.index:
        return fallback
    symbol = lookup.loc[gene_id]['Preferred_name']
    # '-' marks a hit without a preferred name
    return symbol if symbol != '-' else fallback
 ```

 %% Cell type:markdown id: tags:

 Read the emapper output (filtered to best hit per gene ID) and extract the gene ID into a new column;
 then set that column as the index of the dataframe.

 %% Cell type:code id: tags:

 ``` python
-best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'
+best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
 # load the tab-separated eggNOG-mapper table; the first row is the header
 emapper = pd.read_csv(best_per_gene, sep='\t', header=0)

 # derive a gene ID from each `#query` value and use it as the table index
 emapper['gene'] = emapper['#query'].apply(parse_gene_id)
 emapper.set_index('gene', inplace=True)
 ```

 %% Cell type:markdown id: tags:

 Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
 gene symbol (name) in some form:

 - Gene: `name=Hox3;gene_name=Hox3`
 - mRNA: `name=Hox3 isoform 1;gene_name=Hox3`
 - CDS: `gene_name=Hox3`
 - exon: `gene_name=Hox3`

 All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDSs should
 include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like
 `CellRanger`) still have access to the functional annotation.

 %% Cell type:code id: tags:

 ``` python
-gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'
-named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'
+gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
+named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'

 # Copy the GFF line by line into a new file, appending name/gene_name attributes
 # to gene, mRNA, CDS and exon lines.
 with open(gff_loc, 'r') as gff:
    with open(named_loc, 'w') as named:
        # IDs of the most recently seen gene and mRNA lines
        gene = ''
        mRNA = ''
-        for line in tqdm(gff.readlines()):
+        # for line in tqdm(gff.readlines()):
+        for line in gff.readlines():
            line = line.strip()
            # lines starting with '#', lines containing 'tRNA', and lines containing
            # 'name=' (which also matches 'gene_name=') are written out unchanged
            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
            if not conditions_skip:
                # every remaining line must have exactly nine tab-separated fields
                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
                attributes = parse_attributes(attributes)
                if feature_type == 'gene':
                    gene = attributes['ID']
                    name = find_protein(gene, emapper)
                    name = f'{name} (predicted)'
                    # NOTE(review): appended without a ';' separator, so this assumes the
                    # attributes field already ends with ';' — confirm against the input GFF
                    # gene lines receive only `name=` here (no `gene_name=`)
                    line = f'{line}name={name}'
                if feature_type == 'mRNA':
                    mRNA = attributes['ID']
                    # the isoform label is whatever follows the last '.' in the mRNA ID
                    isoform = mRNA.split('.')[-1]
                    # `name` still holds the value set by the most recent gene line
                    line = f'{line}name={name} isoform {isoform};gene_name={name}'
                if feature_type == 'CDS' or feature_type == 'exon':
                    # as for mRNA lines, `name` comes from the most recent gene line
                    line = f'{line}gene_name={name}'
            # the line was stripped above, so the newline is added back here
            named.write(line + '\n')
 ```
-
-%% Output
-
-    100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]