diff --git a/08-submission/Makefile b/08-submission/Makefile
index 273236f20d831ee27c234f5460d318f1b2410f98..a5c23d3ca5c64517163cd2abcf231f37ed706a5e 100644
--- a/08-submission/Makefile
+++ b/08-submission/Makefile
@@ -4,10 +4,7 @@ compose:
 	bash gff-01-compose_gff.sh
 
 annotate:
-	module load conda
-	# a conda environment that has pandas and can run jupyter notebooks
-	conda activate jupyterhub-5.1.0
-	jupyter nbconvert --to notebook --execute gff-02-annotate.ipynb
+	bash gff-02-functional_annot.sh
 
 conform:
 	bash gff-03-ENA_conform.sh
@@ -16,4 +13,14 @@ embl:
 	bash gff-04-convert_to_embl.sh
 
 validate:
-	bash gff-05-submit_to_ENA.sh validate
\ No newline at end of file
+	bash gff-05-submit_to_ENA.sh -validate
+
+submit:
+	bash gff-05-submit_to_ENA.sh -submit
+
+# Run the whole pipeline up to validation and keep a log. Shell redirections
+# cannot go on a prerequisite line, so the logging lives in the recipe, and
+# pipefail stops tee from masking a failing step.
+all:
+	bash -o pipefail -c '$(MAKE) compose annotate conform embl validate 2>&1 | tee log.txt'
+	@echo Done.
\ No newline at end of file
diff --git a/08-submission/gff-03-ENA_conform.sh b/08-submission/gff-03-ENA_conform.sh
index 4a3249de72e9b98de5afa543976377065da81e5e..398fec2f6ef29d7d377e199ded81a013c3e7f8c1 100644
--- a/08-submission/gff-03-ENA_conform.sh
+++ b/08-submission/gff-03-ENA_conform.sh
@@ -13,15 +13,16 @@ RESULT=/lisc/scratch/zoology/pycnogonum/genome/submission
 cd $RESULT || exit
 
 # define inputs and outputs
-GFF=$BASE/merged_sorted_named.gff3
-DEDUP=$BASE/merged_sorted_named_dedup.gff3
-SHORT_INTRONS=$BASE/short_introns.tsv
-KILL_LIST=$BASE/kill_list.tsv
+GFF=./merged_sorted_named.gff3
+DEDUP=./merged_sorted_named_dedup.gff3
+SHORT_INTRONS=./short_introns.tsv
+KILL_LIST=./kill_list.tsv
 
-FILTERED_GFF=$BASE/merged_sorted_named_dedup_filtered.gff3
+FILTERED_GFF=./merged_sorted_named_dedup_filtered.gff3
+FILTERED_mRNA=./short_introns.gff3
 
 # the python script that will generate the kill list
-KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-04-build_kill_list.py
+KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-03-build_kill_list.py
 
 # first remove duplicate features
 agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP"
@@ -31,7 +32,11 @@ agat_sp_list_short_introns.pl --gff "$DEDUP" --size 10 --out "$SHORT_INTRONS"
 # this table contains the locus (chromosome), gene, start position, and length of
 
 # map the short introns to mRNAs in the GFF3 file
+# written with Python 3.12, but any Python 3 should work since only the standard library is used
 python $KILLSCRIPT "$SHORT_INTRONS" "$DEDUP" > "$KILL_LIST"
 
 # use the kill list to filter the offending mRNAs out of the GFF3
-agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
\ No newline at end of file
+agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
+
+# also make a supplementary GFF with only the mRNAs that contain short introns:
+agat_sp_filter_feature_from_keep_list.pl --gff "$DEDUP" --keep_list "$KILL_LIST" -p mRNA -o "$FILTERED_mRNA"
\ No newline at end of file
diff --git a/08-submission/gff-04-convert_to_embl.sh b/08-submission/gff-04-convert_to_embl.sh
index 8447494d9d2c927702bddc1cee2e7f4ddc9b9619..bbe13d8f08be27eb522d032908f2304757d8027f 100644
--- a/08-submission/gff-04-convert_to_embl.sh
+++ b/08-submission/gff-04-convert_to_embl.sh
@@ -3,8 +3,8 @@
 module load conda
 conda activate emblmygff3
 
-GENOME=/lisc/project/zoology/pycnogonum/paper/results/draft.fasta
-GFF=/lisc/project/zoology/pycnogonum/paper/results/merged_sorted_dedup.gff3
+GENOME=/lisc/project/zoology/pycnogonum/paper/zenodo/results/draft.fasta
+GFF=/lisc/scratch/zoology/pycnogonum/genome/submission/merged_sorted_named_dedup_filtered.gff3
 RESDIR=/lisc/scratch/zoology/pycnogonum/genome/submission
 
 cd $RESDIR || exit
@@ -19,4 +19,7 @@ EMBLmyGFF3 $GFF $GENOME \
     --locus_tag VPG \
     --project_id PRJEB80537 \
     -vvv \
-    -o result.embl
\ No newline at end of file
+    -o result.embl
+
+# -f replaces a result.embl.gz left over from an earlier run instead of prompting or refusing
+gzip -f result.embl
\ No newline at end of file
diff --git a/08-submission/gff-05-submit_to_ENA.sh b/08-submission/gff-05-submit_to_ENA.sh
index f1f641af19bf623505554b29f35499b5d15f3fd3..947b1732cb5f5f0fa5e7638dd9d6bda5e21516b2 100644
--- a/08-submission/gff-05-submit_to_ENA.sh
+++ b/08-submission/gff-05-submit_to_ENA.sh
@@ -1 +1,21 @@
 #!/usr/bin/env bash
+
+# submit the GFF3 file to ENA. Assumes we are in the submission directory
+# and the genome manifest file is present and valid.
+#
+# usage: gff-05-submit_to_ENA.sh MODE, where MODE is -validate or -submit
+module load java
+
+MODE=${1:-} # either -validate or -submit
+
+# only the two modes above are meant to reach webin-cli, so stop on a missing
+# or misspelled argument with a usage message instead of launching java
+[[ "$MODE" == "-validate" || "$MODE" == "-submit" ]] || { echo "usage: $0 -validate|-submit" >&2; exit 2; }
+
+java -jar ~/bin/webin-cli-8.0.0.jar \
+    -context genome \
+    -userName Webin-68127 \
+    -passwordFile ~/webin.pwd \
+    -manifest genome.manifest \
+    "$MODE"
\ No newline at end of file