From 4c3735a2f0894cd3cc9a36a7e82be9224a97504e Mon Sep 17 00:00:00 2001 From: Niko <nikolaos.papadopoulos@univie.ac.at> Date: Thu, 28 Nov 2024 12:18:31 +0100 Subject: [PATCH] include emapper description, use ENA specification --- 08-submission/gff-02-functional_annot.ipynb | 76 ++++++++++++++++----- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb index 98a24f3..394b059 100644 --- a/08-submission/gff-02-functional_annot.ipynb +++ b/08-submission/gff-02-functional_annot.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,10 +104,38 @@ " if gene_id in lookup.index:\n", " name = lookup.loc[gene_id]['Preferred_name']\n", " if name != '-':\n", - " return name\n", + " return f'{name} (predicted)'\n", " return f'Uncharacterised protein {gene_id}'" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_description(gene_id, lookup): # expects a protein-coding gene as input\n", + " \"\"\"Retrieve the EggNOG-mapper description for a gene ID. Expects a protein-coding gene as input (will not work correctly for tRNA or rRNA genes).\n", + "\n", + " Parameters\n", + " ----------\n", + " gene_id : str\n", + " a P. litorale protein-coding gene ID\n", + " lookup : pd.DataFrame\n", + " the eggNOG-mapper output file, filtered to maximum one entry per gene ID\n", + "\n", + " Returns\n", + " -------\n", + " str\n", + " the description (if available); will return None if no description is available.\n", + " \"\"\"\n", + " if gene_id in lookup.index:\n", + " description = lookup.loc[gene_id]['Description']\n", + " if description != '-':\n", + " return description\n", + " return None" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -118,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -136,14 +164,15 @@ "Loop over the entire GFF file and decorate all the (putative) protein-coding entries to include the\n", "gene symbol (name) in some form:\n", "\n", - "- Gene: `name=Hox3;gene_name=Hox3`\n", - "- mRNA: `name=Hox3 isoform 1;gene_name=Hox3`\n", - "- CDS: `gene_name=Hox3`\n", - "- exon: `gene_name=Hox3`\n", + "- Gene: `gene=Hox3;product=Hox3`\n", + "- mRNA: `product=Hox3 isoform 1;gene=Hox3`\n", + "- CDS: `gene=Hox3;product=Hox3`\n", + "- exon: `gene=Hox3;product=Hox3`\n", "\n", "All putative mRNAs should have an isoform name (even if it is just `1`); all exons and CDS's should\n", - "include the gene name as well, so that lazy GFF readers that grep parts of the GFF (like\n", - "`CellRanger`) still have access to the functional annotation." + "include the gene symbol (`product=` and `gene=` tags) as well, so that lazy GFF readers that grep\n", + "parts of the GFF (like `CellRanger`) still have access to the functional annotation. This also seems\n", + "to be the ENA/EMBL strategy; NCBI also directly infers the gene symbol from the CDS/exons." ] }, { @@ -168,17 +197,32 @@ " if not conditions_skip:\n", " seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n", " attributes = parse_attributes(attributes)\n", + "\n", " if feature_type == 'gene':\n", " gene = attributes['ID']\n", " name = find_protein(gene, emapper)\n", - " name = f'{name} (predicted)'\n", - " line = f'{line};name={name}'\n", + " line = f'{line};gene={name}'\n", + " description = find_description(gene, emapper)\n", + " if description:\n", + " line = f'{line};function={description}'\n", + " if name != f'Uncharacterised protein {gene}' or description:\n", + " line = f'{line};note=source:EggNOG-mapper'\n", + "\n", " if feature_type == 'mRNA':\n", " mRNA = attributes['ID']\n", " isoform = mRNA.split('.')[-1]\n", - " line = f'{line};name={name} isoform {isoform};gene_name={name}'\n", + " line = f'{line};gene={name};product={name} isoform {isoform}'\n", + " if description:\n", + " line = f'{line};function={description}'\n", + " if name != f'Uncharacterised protein {gene}' or description:\n", + " line = f'{line};note=source:EggNOG-mapper'\n", + "\n", " if feature_type == 'CDS' or feature_type == 'exon':\n", - " line = f'{line};gene_name={name}'\n", + " line = f'{line};gene={name};product={name};'\n", + " if description:\n", + " line = f'{line};function={description}'\n", + " if name != f'Uncharacterised protein {gene}' or description:\n", + " line = f'{line};note=source:EggNOG-mapper'\n", " named.write(line + '\\n')" ] } -- GitLab