From 3031847f7ac73f61355685371c77cff0d745b1b4 Mon Sep 17 00:00:00 2001 From: Niko <nikolaos.papadopoulos@univie.ac.at> Date: Wed, 27 Nov 2024 15:44:48 +0100 Subject: [PATCH] re-organising submission folder --- 08-submission/gff-01-compose_gff.sh | 23 ++ 08-submission/gff-02-functional_annot.ipynb | 220 ++++++++++++++++++ ...t_to_embl.sh => gff-03-convert_to_embl.sh} | 5 +- 08-submission/gff-04-submit_to_ENA.sh | 1 + ...me_manifest.py => tx-01-txome_manifest.py} | 0 5 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 08-submission/gff-01-compose_gff.sh create mode 100644 08-submission/gff-02-functional_annot.ipynb rename 08-submission/{convert_to_embl.sh => gff-03-convert_to_embl.sh} (90%) create mode 100644 08-submission/gff-04-submit_to_ENA.sh rename 08-submission/{txome_manifest.py => tx-01-txome_manifest.py} (100%) diff --git a/08-submission/gff-01-compose_gff.sh b/08-submission/gff-01-compose_gff.sh new file mode 100644 index 0000000..a2c9b89 --- /dev/null +++ b/08-submission/gff-01-compose_gff.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# compose the GFF3 file to prepare for ENA submission + +# first gather the various input sources +BASE=/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge +ISOSEQ=$BASE/isoseq.gff +BRAKER_R1=$BASE/braker.gff +BRAKER_R2=$BASE/braker2_unique_renamed_nocodon_intron.gff3 +DENOVO=$BASE/overlap_translated.gff3 + +ls ISOSEQ +ls BRAKER_R1 +ls BRAKER_R2 +ls DENOVO + +# cat isoseq.gff > merged.gff3 +# cat braker.gff >> merged.gff3 +# cat braker2_unique_renamed_nocodon_intron.gff3 >> merged.gff3 +# cat denovo_txomes/overlap_translated.gff3 >> merged.gff3 +# cat ../trnascan/trnascan.gff3 >> merged.gff3 +# module load genometools/ +# gt gff3 -tidy -retainids -o merged_sorted.gff3 -force merged.gff3 \ No newline at end of file diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb new file mode 100644 index 0000000..ba2eba6 --- /dev/null +++ b/08-submission/gff-02-functional_annot.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adding functional annotation from EggNOG-mapper\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n", + "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_gene_id(x):\n", + " if 'PB' in x:\n", + " parts = x.split('.')\n", + " return '.'.join(parts[:2])\n", + " elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):\n", + " return x.split('.')[0]\n", + " else:\n", + " return ValueError('Unknown gene ID format')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "apply the parsing on each row..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emapper['gene'] = emapper['#query'].apply(parse_gene_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emapper.set_index('gene', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "#query PB.3.1.p1\n", + "seed_ortholog 136037.KDR12978\n", + "evalue 0.0\n", + "score 570.0\n", + "eggNOG_OGs COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...\n", + "max_annot_lvl 33208|Metazoa\n", + "COG_category EF\n", + "Description ribose phosphate diphosphokinase activity. It ...\n", + "Preferred_name PRPSAP1\n", + "GOs GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...\n", + "EC -\n", + "KEGG_ko -\n", + "KEGG_Pathway -\n", + "KEGG_Module -\n", + "KEGG_Reaction -\n", + "KEGG_rclass -\n", + "BRITE -\n", + "KEGG_TC -\n", + "CAZy -\n", + "BiGG_Reaction -\n", + "PFAMs Pribosyl_synth,Pribosyltran_N\n", + "Name: PB.3, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emapper.loc['PB.3']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_attributes(x):\n", + " '''Parses a semi-colon separated string into a dictionary\n", + "\n", + " Parameters\n", + " ----------\n", + " x : str\n", + " a semicolon-separated string that holds attributes\n", + " '''\n", + " attributes = x.split(';')\n", + " if attributes[-1] == '':\n", + " attributes.pop()\n", + " return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'-'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emapper.loc['PB.1']['Preferred_name']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n", + " if gene_id in lookup.index:\n", + " name = lookup.loc[gene_id]['Preferred_name']\n", + " if name != '-':\n", + " return name\n", + " return f'Uncharacterised protein {gene_id}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'\n", + "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'\n", + "\n", + "with open(gff_loc, 'r') as gff:\n", + " with open(named_loc, 'w') as named:\n", + " gene = ''\n", + " mRNA = ''\n", + " for line in gff:\n", + " line = line.strip()\n", + " conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n", + " if not conditions_skip:\n", + " seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n", + " attributes = parse_attributes(attributes)\n", + " if feature_type == 'gene':\n", + " gene = attributes['ID']\n", + " name = find_protein(gene, emapper)\n", + " line = f'{line}name={name} (predicted)'\n", + " if feature_type == 'mRNA':\n", + " mRNA = attributes['ID']\n", + " isoform = mRNA.split('.')[-1]\n", + " line = f'{line}name={name} (predicted) isoform {isoform}'\n", + "\n", + " named.write(line + '\\n')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ascc24", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/08-submission/convert_to_embl.sh b/08-submission/gff-03-convert_to_embl.sh similarity index 90% rename from 08-submission/convert_to_embl.sh rename to 08-submission/gff-03-convert_to_embl.sh index 892017c..8447494 100644 --- a/08-submission/convert_to_embl.sh +++ b/08-submission/gff-03-convert_to_embl.sh @@ -4,12 +4,13 @@ module load conda conda activate emblmygff3 GENOME=/lisc/project/zoology/pycnogonum/paper/results/draft.fasta -GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted.gff3 +GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted_dedup.gff3 RESDIR=/lisc/scratch/zoology/pycnogonum/genome/submission cd $RESDIR || exit EMBLmyGFF3 $GFF $GENOME \ + --expose_translations \ --topology linear \ --molecule_type 'genomic DNA' \ --transl_table 1 \ @@ -17,5 +18,5 @@ EMBLmyGFF3 $GFF $GENOME \ --taxonomy INV \ --locus_tag VPG \ --project_id PRJEB80537 \ - --verbose \ + -vvv \ -o result.embl \ No newline at end of file diff --git a/08-submission/gff-04-submit_to_ENA.sh b/08-submission/gff-04-submit_to_ENA.sh new file mode 100644 index 0000000..f1f641a --- /dev/null +++ b/08-submission/gff-04-submit_to_ENA.sh @@ -0,0 +1 @@ +#!/usr/bin/env bash diff --git a/08-submission/txome_manifest.py b/08-submission/tx-01-txome_manifest.py similarity index 100% rename from 08-submission/txome_manifest.py rename to 08-submission/tx-01-txome_manifest.py -- GitLab