" name = lookup.loc[gene_id]['Preferred_name']\n",
" if name != '-':\n",
" return name\n",
" return f'{name} (predicted)'\n",
" return f'Uncharacterised protein {gene_id}'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_description(gene_id, lookup): # expects a protein-coding gene as input\n",
" \"\"\"Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n",
"\n",
" Parameters\n",
" ----------\n",
" gene_id : str\n",
" a P. litorale protein-coding gene ID\n",
" lookup : pd.DataFrame\n",
" the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n",
"\n",
" Returns\n",
" -------\n",
" str\n",
" the description (if available); will return None if no description is available.\n",
" if name != f'Uncharacterised protein {gene}' or description:\n",
" line = f'{line};note=source:EggNOG-mapper'\n",
"\n",
" if feature_type == 'mRNA':\n",
" mRNA = attributes['ID']\n",
" isoform = mRNA.split('.')[-1]\n",
" line = f'{line};name={name} isoform {isoform};gene_name={name}'\n",
" line = f'{line};gene={name};product={name} isoform {isoform}'\n",
" if description:\n",
" line = f'{line};function={description}'\n",
" if name != f'Uncharacterised protein {gene}' or description:\n",
" line = f'{line};note=source:EggNOG-mapper'\n",
"\n",
" if feature_type == 'CDS' or feature_type == 'exon':\n",
" line = f'{line};gene_name={name}'\n",
" line = f'{line};gene={name};product={name};'\n",
" if description:\n",
" line = f'{line};function={description}'\n",
" if name != f'Uncharacterised protein {gene}' or description:\n",
" line = f'{line};note=source:EggNOG-mapper'\n",
" named.write(line + '\\n')"
]
}
...
...
%% Cell type:markdown id: tags:
# Adding functional annotation from EggNOG-mapper
%% Cell type:code id: tags:
``` python
# from tqdm import tqdm # install it for nice progress bars
importpandasaspd
```
%% Cell type:markdown id: tags:
### util functions
We will need three helper functions, which:
- extract the gene ID from the `#query` field of the EggNOG-mapper output;
- split the contents of the attributes field of the GFF file into a dictionary;
- find the correct protein name for a gene ID.
%% Cell type:code id: tags:
``` python
defparse_gene_id(x):
"""Extract gene ID from a string
Parameters
----------
x : str
A protein ID from the eggNOG-mapper output.
Returns
-------
str
will return the gene ID in the format 'PB.X' (PacBio genes), 'gX' (BRAKER round 1), 'r2_gX' (BRAKER round 2) or 'at_DNX' (de-novo transcriptome-assembled genes)