From fdb02d43a0e32648a17fbdcf2a30d7368f48f248 Mon Sep 17 00:00:00 2001 From: Niko <nikolaos.papadopoulos@univie.ac.at> Date: Wed, 27 Nov 2024 16:42:00 +0100 Subject: [PATCH] add eggNOG-mapper preferred name to GFF --- 08-submission/gff-02-functional_annot.ipynb | 198 ++++++++++---------- 1 file changed, 95 insertions(+), 103 deletions(-) diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb index ba2eba6..1ab90de 100644 --- a/08-submission/gff-02-functional_annot.ipynb +++ b/08-submission/gff-02-functional_annot.ipynb @@ -14,27 +14,43 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "import numpy as np" + "from tqdm import tqdm\n", + "\n", + "import pandas as pd" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n", - "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)" + "### util functions\n", + "\n", + "We are going to need three helper functions:\n", + "\n", + "- extract the gene ID from the `#query` field of the EggNOG-mapper output\n", + "- break up the content of the attributes field of the GFF file into a dictionary\n", + "- find the correct protein name for a gene ID" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def parse_gene_id(x):\n", + " \"\"\"Extract gene ID from a string\n", + "\n", + " Parameters\n", + " ----------\n", + " x : str\n", + " A protein ID from the eggNOG-mapper output.\n", + "\n", + " Returns\n", + " -------\n", + " str\n", + " will return the gene ID in the format of 'PB.X' (PacBio genes) or 'gX' (BRAKER round 1) or 'r2_gX' (BRAKER round 2) or 'at_DNX (de-novo transcriptome-assembled genes)'\n", + " \"\"\"\n", " if 'PB' in x:\n", " parts = x.split('.')\n", " return '.'.join(parts[:2])\n", @@ -44,75 +60,9 @@ " return ValueError('Unknown gene ID format')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "apply the parsing on each row..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "emapper['gene'] = emapper['#query'].apply(parse_gene_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "emapper.set_index('gene', inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "#query PB.3.1.p1\n", - "seed_ortholog 136037.KDR12978\n", - "evalue 0.0\n", - "score 570.0\n", - "eggNOG_OGs COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...\n", - "max_annot_lvl 33208|Metazoa\n", - "COG_category EF\n", - "Description ribose phosphate diphosphokinase activity. It ...\n", - "Preferred_name PRPSAP1\n", - "GOs GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...\n", - "EC -\n", - "KEGG_ko -\n", - "KEGG_Pathway -\n", - "KEGG_Module -\n", - "KEGG_Reaction -\n", - "KEGG_rclass -\n", - "BRITE -\n", - "KEGG_TC -\n", - "CAZy -\n", - "BiGG_Reaction -\n", - "PFAMs Pribosyl_synth,Pribosyltran_N\n", - "Name: PB.3, dtype: object" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "emapper.loc['PB.3']" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -132,31 +82,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'-'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "emapper.loc['PB.1']['Preferred_name']" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Generate the correct protein name for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " str\n", + " the gene symbol (if available) or \"Uncharacterised protein {gene_id}\" if no gene symbol is available.\n", + " \"\"\"\n", " if gene_id in lookup.index:\n", " name = lookup.loc[gene_id]['Preferred_name']\n", " if name != '-':\n", @@ -164,20 +108,66 @@ " return f'Uncharacterised protein {gene_id}'" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read the emapper output (filtered to best hit per gene ID) and extract the gene ID in a new column;\n", + "then set that as the index of the dataframe." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'\n", - "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'\n", + "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n", + "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)\n", + "\n", + "emapper['gene'] = emapper['#query'].apply(parse_gene_id)\n", + "emapper.set_index('gene', inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n", + "gene symbol (name) in some form:\n", + "\n", + "- Gene: `name=Hox3;gene_name=Hox3`\n", + "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n", + "- CDS: `gene_name=Hox3`\n", + "- exon: `gene_name=Hox3`\n", + "\n", + "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n", + "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n", + "`CellRanger`) still have access to the functional annotation." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 779251/779251 [00:04<00:00, 183551.40it/s]\n" + ] + } + ], + "source": [ + "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff3'\n", + "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff3'\n", "\n", "with open(gff_loc, 'r') as gff:\n", " with open(named_loc, 'w') as named:\n", " gene = ''\n", " mRNA = ''\n", - " for line in gff:\n", + " for line in tqdm(gff.readlines()):\n", " line = line.strip()\n", " conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n", " if not conditions_skip:\n", @@ -186,12 +176,14 @@ " if feature_type == 'gene':\n", " gene = attributes['ID']\n", " name = find_protein(gene, emapper)\n", - " line = f'{line}name={name} (predicted)'\n", + " name = f'{name} (predicted)'\n", + " line = f'{line}name={name}'\n", " if feature_type == 'mRNA':\n", " mRNA = attributes['ID']\n", " isoform = mRNA.split('.')[-1]\n", - " line = f'{line}name={name} (predicted) isoform {isoform}'\n", - "\n", + " line = f'{line}name={name} isoform {isoform};gene_name={name}'\n", + " if feature_type == 'CDS' or feature_type == 'exon':\n", + " line = f'{line}gene_name={name}'\n", " named.write(line + '\\n')" ] } -- GitLab