fixed paths, added validate script

b387ad91 · Niko (Nikolaos) Papadopoulos · 0bdd277c · b387ad91 · b387ad91 · b387ad91
Commit b387ad91 authored 5 months ago by Niko (Nikolaos) Papadopoulos
--- a/08-submission/Makefile
+++ b/08-submission/Makefile
@@ -4,10 +4,7 @@ compose:
 	bash gff-01-compose_gff.sh
 annotate:
-	module load conda
+	bash gff-02-functional_annot.sh
-	# a conda environment that has pandas and can run jupyter notebooks
-	conda activate jupyterhub-5.1.0
-	jupyter nbconvert --to notebook --execute gff-02-annotate.ipynb
 conform:
 	bash gff-03-ENA_conform.sh
@@ -16,4 +13,10 @@ embl:
 	bash gff-04-convert_to_embl.sh
 validate:
-	bash gff-05-submit_to_ENA.sh validate
+	bash gff-05-submit_to_ENA.sh -validate
\ No newline at end of file
+submit:
+	bash gff-05-submit_to_ENA.sh -submit
+# Run every step up to (and including) validation, in order.
+# Shell redirection is not valid on a prerequisite line; to keep a log, invoke
+# make as:  make all 2>&1 | tee log.txt
+all: compose annotate conform embl validate
+	@echo Done.
\ No newline at end of file
--- a/08-submission/gff-03-ENA_conform.sh
+++ b/08-submission/gff-03-ENA_conform.sh
@@ -13,15 +13,16 @@ RESULT=/lisc/scratch/zoology/pycnogonum/genome/submission
 cd $RESULT || exit
 # define inputs and outputs
-GFF=$BASE/merged_sorted_named.gff3
+GFF=./merged_sorted_named.gff3
-DEDUP=$BASE/merged_sorted_named_dedup.gff3
+DEDUP=./merged_sorted_named_dedup.gff3
-SHORT_INTRONS=$BASE/short_introns.tsv
+SHORT_INTRONS=./short_introns.tsv
-KILL_LIST=$BASE/kill_list.tsv
+KILL_LIST=./kill_list.tsv
-FILTERED_GFF=$BASE/merged_sorted_named_dedup_filtered.gff3
+FILTERED_GFF=./merged_sorted_named_dedup_filtered.gff3
+FILTERED_mRNA=./short_introns.gff3
 # the python script that will generate the kill list
-KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-04-build_kill_list.py
+KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-03-build_kill_list.py
 # first remove duplicate features
 agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP"
@@ -31,7 +32,11 @@ agat_sp_list_short_introns.pl --gff "$DEDUP" --size 10 --out "$SHORT_INTRONS"
 # this table contains the locus (chromosome), gene, start position, and length of each short intron
 # map the short introns to mRNAs in the GFF3 file
+# this was written in Python 3.12, but any Python 3 should be fine; we only use the standard library
 python $KILLSCRIPT "$SHORT_INTRONS" "$DEDUP" > "$KILL_LIST"
 # use the kill list to filter the offending mRNAs out of the GFF3
 agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
\ No newline at end of file
+# also make a supplementary GFF that keeps only the mRNAs named in the kill list (those with short introns):
+agat_sp_filter_feature_from_keep_list.pl --gff "$DEDUP" --keep_list "$KILL_LIST" -p mRNA -o "$FILTERED_mRNA"
\ No newline at end of file
--- a/08-submission/gff-04-convert_to_embl.sh
+++ b/08-submission/gff-04-convert_to_embl.sh
@@ -3,8 +3,8 @@
 module load conda
 conda activate emblmygff3
-GENOME=/lisc/project/zoology/pycnogonum/paper/results/draft.fasta
+GENOME=/lisc/project/zoology/pycnogonum/paper/zenodo/results/draft.fasta
-GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted_dedup.gff3
+GFF=/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named_dedup_filtered.gff3
 RESDIR=/lisc/scratch/zoology/pycnogonum/genome/submission
 cd $RESDIR || exit
@@ -19,4 +19,6 @@ EMBLmyGFF3 $GFF $GENOME \
        --locus_tag VPG \
        --project_id PRJEB80537 \
        -vvv \
        -o result.embl
\ No newline at end of file
+# compress the EMBL file; -f overwrites a result.embl.gz left over from a previous run
+# instead of prompting or refusing, so the step can be re-run
+gzip -f result.embl
\ No newline at end of file
--- a/08-submission/gff-05-submit_to_ENA.sh
+++ b/08-submission/gff-05-submit_to_ENA.sh
 #!/usr/bin/env bash
+# Validate or submit to ENA with webin-cli, using the genome manifest.
+# Assumes we are in the submission directory
+# and the genome manifest file is present and valid.
+#
+# Usage: gff-05-submit_to_ENA.sh (-validate|-submit)
+module load java
+MODE=${1:-} # either -validate or -submit
+# the mode is passed straight through to webin-cli, so reject anything but
+# the two documented flags (e.g. a missing argument or "validate" without the dash)
+case "$MODE" in
+    -validate|-submit) ;;
+    *)
+        echo "usage: $0 (-validate|-submit)" >&2
+        exit 2
+        ;;
+esac
+java -jar ~/bin/webin-cli-8.0.0.jar \
+    -context genome \
+    -userName Webin-68127 \
+    -passwordFile ~/webin.pwd \
+    -manifest genome.manifest \
+    "$MODE"
\ No newline at end of file