Skip to content
Snippets Groups Projects
Commit aa0a40fd authored by Niko (Nikolaos) Papadopoulos's avatar Niko (Nikolaos) Papadopoulos
Browse files

adding more information to GFF and EMBL file

parent 8b91326f
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Adding functional annotation from EggNOG-mapper
%% Cell type:code id: tags:
``` python
# from tqdm import tqdm # install it for nice progress bars
import pandas as pd
```
%% Cell type:markdown id: tags:
### util functions
We are going to need three helper functions:
- extract the gene ID from the `#query` field of the EggNOG-mapper output
- break up the content of the attributes field of the GFF file into a dictionary
- find the correct protein name for a gene ID
%% Cell type:code id: tags:
``` python
def parse_gene_id(x):
"""Extract gene ID from a string
Parameters
----------
x : str
A protein ID from the eggNOG-mapper output.
Returns
-------
str
will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'
"""
if 'PB' in x:
parts = x.split('.')
return '.'.join(parts[:2])
elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):
return x.split('.')[0]
else:
return ValueError('Unknown gene ID format')
```
%% Cell type:code id: tags:
``` python
def parse_attributes(x):
'''Parses a semi-colon separated string into a dictionary
Parameters
----------
x : str
a semicolon-separated string that holds attributes
'''
attributes = x.split(';')
if attributes[-1] == '':
attributes.pop()
return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}
```
%% Cell type:code id: tags:
``` python
def find_protein(gene_id, lookup): # expects a protein-coding gene as input
"""Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).
Parameters
----------
gene_id : str
a P. litorale protein-coding gene ID
lookup : pd.DataFrame
the eggNOG-mapper output file, filtered to maximum one entry per gene ID
Returns
-------
str
the gene symbol (if available) or "Uncharacterised protein {gene_id}" if no gene symbol is available.
"""
if gene_id in lookup.index:
name = lookup.loc[gene_id]['Preferred_name']
if name != '-':
return f'{name} (predicted)'
return f'Uncharacterised protein {gene_id}'
```
%% Cell type:code id: tags:
``` python
def find_description(gene_id, lookup): # expects a protein-coding gene as input
"""Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).
Parameters
----------
gene_id : str
a P. litorale protein-coding gene ID
lookup : pd.DataFrame
the eggNOG-mapper output file, filtered to maximum one entry per gene ID
Returns
-------
str
the description (if available); will return None if no description is available.
"""
if gene_id in lookup.index:
description = lookup.loc[gene_id]['Description']
if description != '-':
return description
return None
```
%% Cell type:code id: tags:
``` python
def find_EC(gene_id, lookup): # expects a protein-coding gene as input
"""Retrieve the EC number assigned by EggNOG-mapper for a certain gene ID.
Parameters
----------
gene_id : str
a P. litorale protein-coding gene ID
lookup : pd.DataFrame
the eggNOG-mapper output file, filtered to maximum one entry per gene ID
Returns
-------
str
the EC number (if available) for the current gene (product).
"""
if gene_id in lookup.index:
ec = lookup.loc[gene_id]['EC']
if ec != '-':
return ec.split(',')
return None
def find_PFAMs(gene_id, lookup): # expects a protein-coding gene as input
"""Retrieve the PFAM domains assigned by EggNOG-mapper for a certain gene ID.
Parameters
----------
gene_id : str
a P. litorale protein-coding gene ID
lookup : pd.DataFrame
the eggNOG-mapper output file, filtered to maximum one entry per gene ID
Returns
-------
list
a list of PFAM domains (if available) associated with the gene (product).
"""
if gene_id in lookup.index:
pfam = lookup.loc[gene_id]['PFAMs']
if pfam != '-':
return pfam.split(',')
return None
```
%% Cell type:markdown id: tags:
Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;
then set that as the index of the dataframe.
%% Cell type:code id: tags:
``` python
best_per_gene = '/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge/emapper/out.emapper.best.annotations'
emapper = pd.read_csv(best_per_gene, sep='\t', header=0)
emapper['gene'] = emapper['#query'].apply(parse_gene_id)
emapper.set_index('gene', inplace=True)
```
%% Cell type:code id: tags:
``` python
emapper.columns
```
%% Cell type:markdown id: tags:
Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the
gene symbol (name) in some form:
- Gene: `gene=Hox3;product=Hox3`
- mRNA: `product=Hox3 isoform 1;gene=Hox3`
- CDS: `gene=Hox3;product=Hox3`
- exon: `gene=Hox3;product=Hox3`
All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should
include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep
parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems
to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons.
%% Cell type:code id: tags:
``` python
gff_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted.gff3'
named_loc = '/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named.gff3'
with open(gff_loc, 'r') as gff:
with open(named_loc, 'w') as named:
gene = ''
mRNA = ''
# for line in tqdm(gff.readlines()):
for line in gff.readlines():
line = line.strip()
if line[-1] == ';':
line = line[:-1]
conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
if not conditions_skip:
seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\t')
attributes = parse_attributes(attributes)
if feature_type == 'gene':
gene = attributes['ID']
name = find_protein(gene, emapper)
line = f'{line};gene={name}'
description = find_description(gene, emapper)
ec_list = find_EC(gene, emapper)
pfams = find_PFAMs(gene, emapper)
if description:
line = f'{line};function={description}'
if name != f'Uncharacterised protein {gene}' or description:
line = f'{line};note=function predicted by EggNOG-mapper'
if feature_type == 'mRNA':
mRNA = attributes['ID']
isoform = mRNA.split('.')[-1]
line = f'{line};gene={name};product={name} isoform {isoform}'
if description:
line = f'{line};function={description}'
if name != f'Uncharacterised protein {gene}' or description:
line = f'{line};note=function predicted by EggNOG-mapper'
if feature_type == 'CDS' or feature_type == 'exon':
line = f'{line};gene={name};product={name}'
if description:
line = f'{line};function={description}'
if name != f'Uncharacterised protein {gene}' or description:
line = f'{line};note=function predicted by EggNOG-mapper'
if ec_list is not None:
for ec in ec_list:
line = f'{line};EC_number="{ec}"'
if pfams is not None:
for pfam in pfams:
line = f'{line};Dbxref="PFAM:{pfam}"'
named.write(line + '\n')
```
......
......@@ -23,4 +23,6 @@ EMBLmyGFF3 $GFF $GENOME \
-v \
-o result.embl
gzip result.embl
\ No newline at end of file
# when zipping: overwrite file if it already exists,
# else the command will hang while waiting for confirmation
gzip -f result.embl
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment