" if name != f'Uncharacterised protein {gene}' or description:\n",
...
...
@@ -223,6 +283,12 @@
" line = f'{line};function={description}'\n",
" if name != f'Uncharacterised protein {gene}' or description:\n",
" line = f'{line};note=function predicted by EggNOG-mapper'\n",
" if ec_list is not None:\n",
" for ec in ec_list:\n",
" line = f'{line};EC_number=\"{ec}\"'\n",
" if pfams is not None:\n",
" for pfam in pfams:\n",
" line = f'{line};Dbxref=\"PFAM:{pfam}\"'\n",
" named.write(line + '\\n')"
]
}
...
...
%% Cell type:markdown id: tags:
# Adding functional annotation from EggNOG-mapper
%% Cell type:code id: tags:
``` python
# Optional: uncomment the next line for nice progress bars (requires tqdm to be installed).
# from tqdm import tqdm
import pandas as pd
```
%% Cell type:markdown id: tags:
### Utility functions
We are going to need three helper functions:
- extract the gene ID from the `#query` field of the EggNOG-mapper output
- break up the content of the attributes field of the GFF file into a dictionary
- find the correct protein name for a gene ID
%% Cell type:code id: tags:
``` python
defparse_gene_id(x):
"""Extract gene ID from a string
Parameters
----------
x : str
A protein ID from the eggNOG-mapper output.
Returns
-------
str
will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'