diff --git a/03-repeats/README.md b/03-repeats/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9e27563135f0ad288ed20682c5d04c3b77c9579
--- /dev/null
+++ b/03-repeats/README.md
@@ -0,0 +1,10 @@
+# Repeat analysis
+
+Code in this folder covers the repeat analysis. It follows the production of a scaffolded draft
+genome (after juicebox).
+
+### Repeat prediction
+
+We modeled repeat families on the draft genome of _P. litorale_ using
+[RepeatModeler](prep-repeat-modeler.sh) and (soft-)masked them with
+[RepeatMasker](prep-repeat_masker.sh).
\ No newline at end of file
diff --git a/03-repeats/prep-repeat-modeler.sh b/03-repeats/prep-repeat-modeler.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b1c82b0d6db33f94ab74f52955f9bcb4198e0583
--- /dev/null
+++ b/03-repeats/prep-repeat-modeler.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+#
+#SBATCH --job-name=repeatmodeler_pycno
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=20G
+#SBATCH --time=30:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-repeats-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-repeats-%j.err
+
+# Model repeat families on the draft genome with RepeatModeler. The run happens in the job's
+# $TMPDIR; the RepeatModeler stdout and the three result files end up in $OUTDIR.
+
+# will need optimisation from scratch when I next need it
+module load repeatmodeler/2.0.5-5.40.0-1.0.6
+# OLDDIR=/lisc/slurm/node-d02/tmp/slurm-9303351
+
+# abort if TMPDIR is unset or empty; the run must not happen in an arbitrary directory
+cd "${TMPDIR:?TMPDIR is not set}" || exit 1
+
+OUTDIR=/lisc/scratch/zoology/pycnogonum/genome/draft/repeats/repeat_modeller
+SCAFFOLDS=/lisc/scratch/zoology/pycnogonum/genome/draft/draft.fasta
+# follow --cpus-per-task when SLURM exports it; 32 matches the request above
+THREADS="${SLURM_CPUS_PER_TASK:-32}"
+
+mkdir -p "$OUTDIR" || exit 1
+
+# RepeatModeler cannot run without the database, so stop here if building it fails
+BuildDatabase -name pycno "$SCAFFOLDS" || exit 1
+
+# remember a failure but keep going, so that whatever was produced is still copied out of $TMPDIR
+status=0
+RepeatModeler -database pycno -threads "$THREADS" -LTRStruct > "$OUTDIR"/repeat_modeller_run.out || status=$?
+
+# tar "$TMPDIR"/RM_*/ -czf "$OUTDIR"/repeatmodeler.tar.gz
+# cp "$TMPDIR"/run.out "$OUTDIR"/run.out
+
+# if the run was successful, there should be three result files:
+cp "$TMPDIR"/pycno-families.fa "$OUTDIR"/pycno-families.fa || status=1
+cp "$TMPDIR"/pycno-families.stk "$OUTDIR"/pycno-families.stk || status=1
+cp "$TMPDIR"/pycno-rmod.log "$OUTDIR"/pycno-rmod.log || status=1
+
+# make the job fail visibly if RepeatModeler or any of the copies failed
+exit "$status"
\ No newline at end of file
diff --git a/03-repeats/prep-repeat_masker.sh b/03-repeats/prep-repeat_masker.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5e1f8fd9238635a2e80d402973d447eba4c33c2
--- /dev/null
+++ b/03-repeats/prep-repeat_masker.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+#SBATCH --job-name=repeatmasker_pycno
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=500M
+#SBATCH --time=4:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-repeat-masker-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-repeat-masker-%j.err
+
+# RepeatMasker is a lot less resource-intense than RepeatModeler, so it makes sense to run it as a separate script.
+# it ran out of memory with 500M, but it's unclear how much the last step needs
+# TODO: --mem above is still the 500M that ran out of memory; raise it before the next run
+module load repeatmasker/4.1.6-3.12.4-5.40.0
+
+# RepeatMasker writes into $TMPDIR and the results are globbed from there, so it must be set
+: "${TMPDIR:?TMPDIR is not set}"
+
+SCAFFOLDS=/lisc/scratch/zoology/pycnogonum/genome/draft/draft.fasta
+OUTDIR=/lisc/scratch/zoology/pycnogonum/genome/draft/repeats/repeat_masker_gff
+mkdir -p "$OUTDIR" || exit 1
+
+cd "$OUTDIR" || exit 1
+# relative to $OUTDIR; this is where prep-repeat-modeler.sh copies the families it found
+FAMILIES="../repeat_modeller/pycno-families.fa"
+if [[ ! -s "$FAMILIES" ]]; then
+    echo "repeat library $FAMILIES is missing or empty; run prep-repeat-modeler.sh first" >&2
+    exit 1
+fi
+
+# follow --cpus-per-task when SLURM exports it; 4 matches the request above
+# NOTE(review): -pa counts parallel batch jobs, and a search engine may use several cores per job - confirm
+PARALLEL="${SLURM_CPUS_PER_TASK:-4}"
+
+# remember a failure but keep going, so that whatever was produced is still copied out of $TMPDIR
+status=0
+RepeatMasker -pa "$PARALLEL" -xsmall -gff -dir "$TMPDIR" -lib "$FAMILIES" "$SCAFFOLDS" || status=$?
+
+# copy the results to the output directory:
+cp "$TMPDIR"/*.masked "$OUTDIR"/ || status=1
+cp "$TMPDIR"/*.out "$OUTDIR"/ || status=1
+cp "$TMPDIR"/*.tbl "$OUTDIR"/ || status=1
+cp "$TMPDIR"/*.cat* "$OUTDIR"/ || status=1
+cp "$TMPDIR"/*.gff "$OUTDIR"/ || status=1
+
+# make the job fail visibly if RepeatMasker or any of the copies failed
+exit "$status"
\ No newline at end of file