From 25ec3b3bc848ec201c62ed8862d548962c60746a Mon Sep 17 00:00:00 2001 From: Niko <nikolaos.papadopoulos@univie.ac.at> Date: Wed, 27 Nov 2024 20:51:24 +0100 Subject: [PATCH] checks and balances for ENA submission --- 08-submission/gff-03-ENA_conform.sh | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 08-submission/gff-03-ENA_conform.sh diff --git a/08-submission/gff-03-ENA_conform.sh b/08-submission/gff-03-ENA_conform.sh new file mode 100644 index 0000000..4a3249d --- /dev/null +++ b/08-submission/gff-03-ENA_conform.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# modify the sorted GFF3 file to conform to ENA requirements. +# this means: +# - remove duplicate features +# - remove short introns (<10nt) + +module load conda +conda activate agat-1.4.1 + +# switch to the submission directory +RESULT=/lisc/scratch/zoology/pycnogonum/genome/submission +cd $RESULT || exit + +# define inputs and outputs +GFF=$BASE/merged_sorted_named.gff3 +DEDUP=$BASE/merged_sorted_named_dedup.gff3 +SHORT_INTRONS=$BASE/short_introns.tsv +KILL_LIST=$BASE/kill_list.tsv + +FILTERED_GFF=$BASE/merged_sorted_named_dedup_filtered.gff3 + +# the python script that will generate the kill list +KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-04-build_kill_list.py + +# first remove duplicate features +agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP" + +# now find short introns +agat_sp_list_short_introns.pl --gff "$DEDUP" --size 10 --out "$SHORT_INTRONS" + +# this table contains the locus (chromosome), gene, start position, and length of +# map the short introns to mRNAs in the GFF3 file +python $KILLSCRIPT "$SHORT_INTRONS" "$DEDUP" > "$KILL_LIST" + +# use the kill list to filter the offending mRNAs out of the GFF3 +agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF" \ No newline at end of file -- GitLab