From b387ad91c548b23f6b2265f1fb72abdd501373e2 Mon Sep 17 00:00:00 2001
From: Niko <nikolaos.papadopoulos@univie.ac.at>
Date: Wed, 27 Nov 2024 22:13:52 +0100
Subject: [PATCH] fixed paths, added validate script

---
Review notes (placed after the "---" separator, outside the commit message):
 * The original line breaks of this patch were lost. Blank context lines,
   recipe tabs and continuation-line indentation below were reconstructed
   from the hunk line counts -- NOTE(review): run `git apply --check`
   against the target tree before applying.
 * Makefile `all`: shell redirection (`2>&1 | tee log.txt`) was listed as
   prerequisites; make would look for targets named `2>&1`, `tee` and
   `log.txt`. Removed from the rule; the logging hint is kept as a comment.
 * gff-03: reworded the Python version comment ("Python >3" -> "any Python 3").
 * gff-04: `gzip -f` so a rerun does not stop on an existing result.embl.gz.
 * gff-05: abort with a usage message when no mode argument is given.
 * All fixes rewrite added lines in place; hunk counts and the diffstat are
   unchanged.

 08-submission/Makefile                  | 13 ++++++++-----
 08-submission/gff-03-ENA_conform.sh     | 19 ++++++++++++-------
 08-submission/gff-04-convert_to_embl.sh |  8 +++++---
 08-submission/gff-05-submit_to_ENA.sh   | 13 +++++++++++++
 4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/08-submission/Makefile b/08-submission/Makefile
index 273236f..a5c23d3 100644
--- a/08-submission/Makefile
+++ b/08-submission/Makefile
@@ -4,10 +4,7 @@ compose:
 	bash gff-01-compose_gff.sh
 
 annotate:
-	module load conda
-	# a conda environment that has pandas and can run jupyter notebooks
-	conda activate jupyterhub-5.1.0
-	jupyter nbconvert --to notebook --execute gff-02-annotate.ipynb
+	bash gff-02-functional_annot.sh
 
 conform:
 	bash gff-03-ENA_conform.sh
@@ -16,4 +13,10 @@ embl:
 	bash gff-04-convert_to_embl.sh
 
 validate:
-	bash gff-05-submit_to_ENA.sh validate
\ No newline at end of file
+	bash gff-05-submit_to_ENA.sh -validate
+
+submit:
+	bash gff-05-submit_to_ENA.sh -submit
+
+all: compose annotate conform embl validate  # to keep a log, run: make all 2>&1 | tee log.txt
+	@echo Done.
\ No newline at end of file
diff --git a/08-submission/gff-03-ENA_conform.sh b/08-submission/gff-03-ENA_conform.sh
index 4a3249d..398fec2 100644
--- a/08-submission/gff-03-ENA_conform.sh
+++ b/08-submission/gff-03-ENA_conform.sh
@@ -13,15 +13,16 @@ RESULT=/lisc/scratch/zoology/pycnogonum/genome/submission
 cd $RESULT || exit
 
 # define inputs and outputs
-GFF=$BASE/merged_sorted_named.gff3
-DEDUP=$BASE/merged_sorted_named_dedup.gff3
-SHORT_INTRONS=$BASE/short_introns.tsv
-KILL_LIST=$BASE/kill_list.tsv
+GFF=./merged_sorted_named.gff3
+DEDUP=./merged_sorted_named_dedup.gff3
+SHORT_INTRONS=./short_introns.tsv
+KILL_LIST=./kill_list.tsv
 
-FILTERED_GFF=$BASE/merged_sorted_named_dedup_filtered.gff3
+FILTERED_GFF=./merged_sorted_named_dedup_filtered.gff3
+FILTERED_mRNA=./short_introns.gff3
 
 # the python script that will generate the kill list
-KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-04-build_kill_list.py
+KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-03-build_kill_list.py
 
 # first remove duplicate features
 agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP"
@@ -31,7 +32,11 @@ agat_sp_list_short_introns.pl --gff "$DEDUP" --size 10 --out "$SHORT_INTRONS"
 # this table contains the locus (chromosome), gene, start position, and length of
 
 # map the short introns to mRNAs in the GFF3 file
+# this was written in Python 3.12, but any Python 3 should be fine; we only use the standard library
 python $KILLSCRIPT "$SHORT_INTRONS" "$DEDUP" > "$KILL_LIST"
 
 # use the kill list to filter the offending mRNAs out of the GFF3
-agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
\ No newline at end of file
+agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
+
+# also make a supplementary GFF with only the short introns:
+agat_sp_filter_feature_from_keep_list.pl --gff "$DEDUP" --keep_list "$KILL_LIST" -p mRNA -o "$FILTERED_mRNA"
\ No newline at end of file
diff --git a/08-submission/gff-04-convert_to_embl.sh b/08-submission/gff-04-convert_to_embl.sh
index 8447494..bbe13d8 100644
--- a/08-submission/gff-04-convert_to_embl.sh
+++ b/08-submission/gff-04-convert_to_embl.sh
@@ -3,8 +3,8 @@
 module load conda
 conda activate emblmygff3
 
-GENOME=/lisc/project/zoology/pycnogonum/paper/results/draft.fasta
-GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted_dedup.gff3
+GENOME=/lisc/project/zoology/pycnogonum/paper/zenodo/results/draft.fasta
+GFF=/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named_dedup_filtered.gff3
 
 RESDIR=/lisc/scratch/zoology/pycnogonum/genome/submission
 cd $RESDIR || exit
@@ -19,4 +19,6 @@ EMBLmyGFF3 $GFF $GENOME \
     --locus_tag VPG \
     --project_id PRJEB80537 \
     -vvv \
-    -o result.embl
\ No newline at end of file
+    -o result.embl
+
+gzip -f result.embl
\ No newline at end of file
diff --git a/08-submission/gff-05-submit_to_ENA.sh b/08-submission/gff-05-submit_to_ENA.sh
index f1f641a..947b173 100644
--- a/08-submission/gff-05-submit_to_ENA.sh
+++ b/08-submission/gff-05-submit_to_ENA.sh
@@ -1 +1,14 @@
 #!/usr/bin/env bash
+
+# submit the GFF3 file to ENA. Assumes we are in the submission directory
+# and the genome manifest file is present and valid.
+module load java
+
+MODE=${1:?"usage: $0 -validate|-submit"} # either -validate or -submit
+
+java -jar ~/bin/webin-cli-8.0.0.jar \
+    -context genome \
+    -userName Webin-68127 \
+    -passwordFile ~/webin.pwd \
+    -manifest genome.manifest \
+    "$MODE"
\ No newline at end of file
-- 
GitLab