Skip to content
Snippets Groups Projects
Commit 0bdd277c authored by Niko (Nikolaos) Papadopoulos's avatar Niko (Nikolaos) Papadopoulos
Browse files

wrapper for functional input notebook; fixed double/missing semicolons

parent 1abeee81
Branches
Tags
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Adding functional annotation from EggNOG-mapper # Adding functional annotation from EggNOG-mapper
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# from tqdm import tqdm # install it for nice progress bars # from tqdm import tqdm # install it for nice progress bars
import pandas as pd import pandas as pd
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### util functions ### util functions
We are going to need three helper functions: We are going to need three helper functions:
- extract the gene ID from the `#query` field of the EggNOG-mapper output - extract the gene ID from the `#query` field of the EggNOG-mapper output
- break up the content of the attributes field of the GFF file into a dictionary - break up the content of the attributes field of the GFF file into a dictionary
- find the correct protein name for a gene ID - find the correct protein name for a gene ID
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def parse_gene_id(x): def parse_gene_id(x):
"""Extract gene ID from a string """Extract gene ID from a string
Parameters Parameters
---------- ----------
x : str x : str
A protein ID from the eggNOG-mapper output. A protein ID from the eggNOG-mapper output.
Returns Returns
------- -------
str str
will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)' will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'
""" """
if 'PB' in x: if 'PB' in x:
parts = x.split('.') parts = x.split('.')
return '.'.join(parts[:2]) return '.'.join(parts[:2])
elif x.startswith('r2') or x.startswith('g') or x.startswith('at'): elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):
return x.split('.')[0] return x.split('.')[0]
else: else:
return ValueError('Unknown gene ID format') return ValueError('Unknown gene ID format')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def parse_attributes(x): def parse_attributes(x):
'''Parses a semi-colon separated string into a dictionary '''Parses a semi-colon separated string into a dictionary
Parameters Parameters
---------- ----------
x : str x : str
a semicolon-separated string that holds attributes a semicolon-separated string that holds attributes
''' '''
attributes = x.split(';') attributes = x.split(';')
if attributes[-1] == '': if attributes[-1] == '':
attributes.pop() attributes.pop()
return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes} return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def find_protein(gene_id, lookup): # expects a protein-coding gene as input def find_protein(gene_id, lookup): # expects a protein-coding gene as input
"""Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes). """Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).
Parameters Parameters
---------- ----------
gene_id : str gene_id : str
a P. litorale protein-coding gene ID a P. litorale protein-coding gene ID
lookup : pd.DataFrame lookup : pd.DataFrame
the eggNOG-mapper output file, filtered to maximum one entry per gene ID the eggNOG-mapper output file, filtered to maximum one entry per gene ID
Returns Returns
------- -------
str str
the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available. the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available.
""" """
if gene_id in lookup.index: if gene_id in lookup.index:
name = lookup.loc[gene_id]['Preferred_name'] name = lookup.loc[gene_id]['Preferred_name']
if name != '-': if name != '-':
return name return name
return f'Uncharacterised protein {gene_id}' return f'Uncharacterised protein {gene_id}'
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column; Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;
then set that as the index of the dataframe. then set that as the index of the dataframe.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations' best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
emapper = pd.read_csv(best_per_gene, sep='\t', header=0) emapper = pd.read_csv(best_per_gene, sep='\t', header=0)
emapper['gene'] = emapper['#query'].apply(parse_gene_id) emapper['gene'] = emapper['#query'].apply(parse_gene_id)
emapper.set_index('gene', inplace=True) emapper.set_index('gene', inplace=True)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
gene symbol (name) in some form: gene symbol (name) in some form:
- Gene: `name=Hox3;gene_name=Hox3` - Gene: `name=Hox3;gene_name=Hox3`
- mRNA: `name=Hox3 isoform 1;gene_name=Hox3` - mRNA: `name=Hox3 isoform 1;gene_name=Hox3`
- CDS: `gene_name=Hox3` - CDS: `gene_name=Hox3`
- exon: `gene_name=Hox3` - exon: `gene_name=Hox3`
All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should
include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like
`CellRanger`) still have access to the functional annotation. `CellRanger`) still have access to the functional annotation.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3' gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3' named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'
with open(gff_loc, 'r') as gff: with open(gff_loc, 'r') as gff:
with open(named_loc, 'w') as named: with open(named_loc, 'w') as named:
gene = '' gene = ''
mRNA = '' mRNA = ''
# for line in tqdm(gff.readlines()): # for line in tqdm(gff.readlines()):
for line in gff.readlines(): for line in gff.readlines():
line = line.strip() line = line.strip()
if line[-1] == ';':
line = line[:-1]
conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
if not conditions_skip: if not conditions_skip:
seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t') seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
attributes = parse_attributes(attributes) attributes = parse_attributes(attributes)
if feature_type == 'gene': if feature_type == 'gene':
gene = attributes['ID'] gene = attributes['ID']
name = find_protein(gene, emapper) name = find_protein(gene, emapper)
name = f'{name} (predicted)' name = f'{name} (predicted)'
line = f'{line}name={name}' line = f'{line};name={name}'
if feature_type == 'mRNA': if feature_type == 'mRNA':
mRNA = attributes['ID'] mRNA = attributes['ID']
isoform = mRNA.split('.')[-1] isoform = mRNA.split('.')[-1]
line = f'{line}name={name} isoform {isoform};gene_name={name}' line = f'{line};name={name} isoform {isoform};gene_name={name}'
if feature_type == 'CDS' or feature_type == 'exon': if feature_type == 'CDS' or feature_type == 'exon':
line = f'{line}gene_name={name}' line = f'{line};gene_name={name}'
named.write(line + '\n') named.write(line + '\n')
``` ```
......
#!/usr/bin/env bash
# wrapper script for the functional annotation of the GFF3 file, which is a jupyter notebook
module load conda
conda activate jupyterhub-5.2.1
# a conda environment that has pandas and can run jupyter notebooks
jupyter execute gff-02-functional_annot.ipynb
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment