From 3031847f7ac73f61355685371c77cff0d745b1b4 Mon Sep 17 00:00:00 2001
From: Niko <nikolaos.papadopoulos@univie.ac.at>
Date: Wed, 27 Nov 2024 15:44:48 +0100
Subject: [PATCH] re-organising submission folder

---
 08-submission/gff-01-compose_gff.sh           |  23 ++
 08-submission/gff-02-functional_annot.ipynb   | 220 ++++++++++++++++++
 ...t_to_embl.sh => gff-03-convert_to_embl.sh} |   5 +-
 08-submission/gff-04-submit_to_ENA.sh         |   1 +
 ...me_manifest.py => tx-01-txome_manifest.py} |   0
 5 files changed, 247 insertions(+), 2 deletions(-)
 create mode 100644 08-submission/gff-01-compose_gff.sh
 create mode 100644 08-submission/gff-02-functional_annot.ipynb
 rename 08-submission/{convert_to_embl.sh => gff-03-convert_to_embl.sh} (90%)
 create mode 100644 08-submission/gff-04-submit_to_ENA.sh
 rename 08-submission/{txome_manifest.py => tx-01-txome_manifest.py} (100%)

diff --git a/08-submission/gff-01-compose_gff.sh b/08-submission/gff-01-compose_gff.sh
new file mode 100644
index 0000000..a2c9b89
--- /dev/null
+++ b/08-submission/gff-01-compose_gff.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# compose the GFF3 file to prepare for ENA submission
+
+# first gather the various input sources
+BASE=/lisc/scratch/zoology/pycnogonum/genome/draft/annot_merge
+ISOSEQ=$BASE/isoseq.gff
+BRAKER_R1=$BASE/braker.gff
+BRAKER_R2=$BASE/braker2_unique_renamed_nocodon_intron.gff3
+DENOVO=$BASE/overlap_translated.gff3
+
+ls ISOSEQ
+ls BRAKER_R1
+ls BRAKER_R2
+ls DENOVO
+
+# cat isoseq.gff > merged.gff3
+# cat braker.gff >> merged.gff3
+# cat braker2_unique_renamed_nocodon_intron.gff3 >> merged.gff3
+# cat denovo_txomes/overlap_translated.gff3 >> merged.gff3
+# cat ../trnascan/trnascan.gff3 >> merged.gff3
+# module load genometools/
+# gt gff3 -tidy -retainids -o merged_sorted.gff3 -force merged.gff3
\ No newline at end of file
diff --git a/08-submission/gff-02-functional_annot.ipynb b/08-submission/gff-02-functional_annot.ipynb
new file mode 100644
index 0000000..ba2eba6
--- /dev/null
+++ b/08-submission/gff-02-functional_annot.ipynb
@@ -0,0 +1,220 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Adding functional annotation from EggNOG-mapper\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'\n",
+    "emapper = pd.read_csv(best_per_gene, sep='\\t', header=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_gene_id(x):\n",
+    "    if 'PB' in x:\n",
+    "        parts = x.split('.')\n",
+    "        return '.'.join(parts[:2])\n",
+    "    elif x.startswith('r2') or x.startswith('g') or x.startswith('at'):\n",
+    "        return x.split('.')[0]\n",
+    "    else:\n",
+    "        return ValueError('Unknown gene ID format')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "apply the parsing on each row..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "emapper['gene'] = emapper['#query'].apply(parse_gene_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "emapper.set_index('gene', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "#query                                                    PB.3.1.p1\n",
+       "seed_ortholog                                       136037.KDR12978\n",
+       "evalue                                                          0.0\n",
+       "score                                                         570.0\n",
+       "eggNOG_OGs        COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...\n",
+       "max_annot_lvl                                         33208|Metazoa\n",
+       "COG_category                                                     EF\n",
+       "Description       ribose phosphate diphosphokinase activity. It ...\n",
+       "Preferred_name                                              PRPSAP1\n",
+       "GOs               GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...\n",
+       "EC                                                                -\n",
+       "KEGG_ko                                                           -\n",
+       "KEGG_Pathway                                                      -\n",
+       "KEGG_Module                                                       -\n",
+       "KEGG_Reaction                                                     -\n",
+       "KEGG_rclass                                                       -\n",
+       "BRITE                                                             -\n",
+       "KEGG_TC                                                           -\n",
+       "CAZy                                                              -\n",
+       "BiGG_Reaction                                                     -\n",
+       "PFAMs                                 Pribosyl_synth,Pribosyltran_N\n",
+       "Name: PB.3, dtype: object"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "emapper.loc['PB.3']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_attributes(x):\n",
+    "    '''Parses a semi-colon separated string into a dictionary\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x : str\n",
+    "        a semicolon-separated string that holds attributes\n",
+    "    '''\n",
+    "    attributes = x.split(';')\n",
+    "    if attributes[-1] == '':\n",
+    "        attributes.pop()\n",
+    "    return {attr.split('=')[0]: attr.split('=')[1] for attr in attributes}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'-'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "emapper.loc['PB.1']['Preferred_name']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_protein(gene_id, lookup): # expects a protein-coding gene as input\n",
+    "    if gene_id in lookup.index:\n",
+    "        name = lookup.loc[gene_id]['Preferred_name']\n",
+    "        if name != '-':\n",
+    "            return name\n",
+    "    return f'Uncharacterised protein {gene_id}'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'\n",
+    "named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'\n",
+    "\n",
+    "with open(gff_loc, 'r') as gff:\n",
+    "    with open(named_loc, 'w') as named:\n",
+    "        gene = ''\n",
+    "        mRNA = ''\n",
+    "        for line in gff:\n",
+    "            line = line.strip()\n",
+    "            conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line\n",
+    "            if not conditions_skip:\n",
+    "                seq_id, source, feature_type, start, end, score, strand, phase, attributes = line.split('\\t')\n",
+    "                attributes = parse_attributes(attributes)\n",
+    "                if feature_type == 'gene':\n",
+    "                    gene = attributes['ID']\n",
+    "                    name = find_protein(gene, emapper)\n",
+    "                    line = f'{line}name={name} (predicted)'\n",
+    "                if feature_type == 'mRNA':\n",
+    "                    mRNA = attributes['ID']\n",
+    "                    isoform = mRNA.split('.')[-1]\n",
+    "                    line = f'{line}name={name} (predicted) isoform {isoform}'\n",
+    "\n",
+    "            named.write(line + '\\n')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ascc24",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/08-submission/convert_to_embl.sh b/08-submission/gff-03-convert_to_embl.sh
similarity index 90%
rename from 08-submission/convert_to_embl.sh
rename to 08-submission/gff-03-convert_to_embl.sh
index 892017c..8447494 100644
--- a/08-submission/convert_to_embl.sh
+++ b/08-submission/gff-03-convert_to_embl.sh
@@ -4,12 +4,13 @@ module load conda
 conda activate emblmygff3
 
 GENOME=/lisc/project/zoology/pycnogonum/paper/results/draft.fasta
-GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted.gff3
+GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted_dedup.gff3
 RESDIR=/lisc/scratch/zoology/pycnogonum/genome/submission
 
 cd $RESDIR || exit
 
 EMBLmyGFF3 $GFF $GENOME \
+        --expose_translations \
         --topology linear \
         --molecule_type 'genomic DNA' \
         --transl_table 1  \
@@ -17,5 +18,5 @@ EMBLmyGFF3 $GFF $GENOME \
         --taxonomy INV \
         --locus_tag VPG \
         --project_id PRJEB80537 \
-        --verbose \
+        -vvv \
         -o result.embl
\ No newline at end of file
diff --git a/08-submission/gff-04-submit_to_ENA.sh b/08-submission/gff-04-submit_to_ENA.sh
new file mode 100644
index 0000000..f1f641a
--- /dev/null
+++ b/08-submission/gff-04-submit_to_ENA.sh
@@ -0,0 +1 @@
+#!/usr/bin/env bash
diff --git a/08-submission/txome_manifest.py b/08-submission/tx-01-txome_manifest.py
similarity index 100%
rename from 08-submission/txome_manifest.py
rename to 08-submission/tx-01-txome_manifest.py
-- 
GitLab