From b96bfe8ebf8bfa0010324c4dd7a648ff562f03f4 Mon Sep 17 00:00:00 2001
From: Niko <nikolaos.papadopoulos@univie.ac.at>
Date: Thu, 28 Nov 2024 12:18:56 +0100
Subject: [PATCH] fall back to pseudo= tag instead of editing the GFF

---
 08-submission/gff-03-ENA_conform.sh | 43 ++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/08-submission/gff-03-ENA_conform.sh b/08-submission/gff-03-ENA_conform.sh
index 398fec2..9ddb137 100644
--- a/08-submission/gff-03-ENA_conform.sh
+++ b/08-submission/gff-03-ENA_conform.sh
@@ -15,28 +15,39 @@ cd $RESULT || exit
 # define inputs and outputs
 GFF=./merged_sorted_named.gff3
 DEDUP=./merged_sorted_named_dedup.gff3
-SHORT_INTRONS=./short_introns.tsv
-KILL_LIST=./kill_list.tsv
+FLAGGED=./merged_sorted_named_dedup_flagged.gff3  # written by agat_sp_flag_short_introns.pl (--out) from $DEDUP
+# SHORT_INTRONS=./short_introns.tsv
+# KILL_LIST=./kill_list.tsv
 
-FILTERED_GFF=./merged_sorted_named_dedup_filtered.gff3
-FILTERED_mRNA=./short_introns.gff3
+# FILTERED_GFF=./merged_sorted_named_dedup_filtered.gff3
+# FILTERED_mRNA=./short_introns.gff3
+# FILTERED_mRNA_SORTED=./short_introns_sorted.gff3
 
 # the python script that will generate the kill list
-KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-03-build_kill_list.py
+# KILLSCRIPT=/lisc/user/papadopoulos/repos/plit-genome/08-submission/gff-03-build_kill_list.py
 
 # first remove duplicate features
-agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP"
+agat_sp_fix_features_locations_duplicated.pl --gff "$GFF" -o "$DEDUP"
 
-# now find short introns
-agat_sp_list_short_introns.pl --gff "$DEDUP" --size 10 --out "$SHORT_INTRONS"
+# flag short introns (rather than removing the affected mRNAs), according to https://www.biostars.org/p/374618/
+# The NCBI documentation suggests using the pseudo=True tag for genes that are broken in some way
+# but still thought to be real genes (not pseudogenes): https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/
+agat_sp_flag_short_introns.pl --gff "$DEDUP" --out "$FLAGGED"
 
-# this table contains the locus (chromosome), gene, start position, and length of 
-# map the short introns to mRNAs in the GFF3 file
-# this was written in Python 3.12 but Python >3 should be fine; we only use default libraries
-python $KILLSCRIPT "$SHORT_INTRONS" "$DEDUP" > "$KILL_LIST"
+# # now find short introns
+# agat_sp_list_short_introns.pl --gff $DEDUP --size 10 --out $SHORT_INTRONS
 
-# use the kill list to filter the offending mRNAs out of the GFF3
-agat_sp_filter_feature_from_kill_list.pl --gff "$DEDUP" --kill_list "$KILL_LIST" -p mRNA -o "$FILTERED_GFF"
+# # this table contains the locus (chromosome), gene, start position, and length of each short intron;
+# # map the short introns to mRNAs in the GFF3 file
+# # this was written in Python 3.12 but Python >3 should be fine; we only use default libraries
+# python $KILLSCRIPT $SHORT_INTRONS $DEDUP > $KILL_LIST
 
-# also make a supplementary GFF with only the short introns:
-agat_sp_filter_feature_from_keep_list.pl --gff "$DEDUP" --keep_list "$KILL_LIST" -p mRNA -o "$FILTERED_mRNA"
\ No newline at end of file
+# # use the kill list to filter the offending mRNAs out of the GFF3
+# agat_sp_filter_feature_from_kill_list.pl --gff $DEDUP --kill_list $KILL_LIST -p mRNA -o $FILTERED_GFF
+
+# # also make a supplementary GFF with only the short introns:
+# agat_sp_filter_feature_from_keep_list.pl --gff $DEDUP --keep_list $KILL_LIST -p mRNA -o $FILTERED_mRNA
+
+# # and sort it properly:
+# module load genometools/
+# gt gff3 -tidy -retainids -o $FILTERED_mRNA_SORTED -force $FILTERED_mRNA
-- 
GitLab