diff --git a/01-assembly/.DS_Store b/01-assembly/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/01-assembly/.DS_Store differ diff --git a/01-assembly/README.md b/01-assembly/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c72e1713ac154942f2692cc4310998c202d23e7 --- /dev/null +++ b/01-assembly/README.md @@ -0,0 +1,42 @@ +# Genome assembly + +Code in this folder covers the steps from receiving the data until producing a favored draft +assembly for scaffolding. + +### Preprocessing + +To estimate genome size, we counted kmers for the [ONT](preprocess-jellyfish_count-ont.sh) and +[PacBio](preprocess-jellyfish_count-pb.sh) data using $k=21$. The resulting k-mer spectra were +analyzed with [GenomeScope](http://genomescope.org) and +[GenomeScope2](http://genomescope.org/genomescope2.0/). + +### Assembly + +The sequencing coverage was enough for robust genome size estimates (and also ONT data is +technically not meant for k-mer spectrum analysis because of the high error rates). Since some +genome assemblers require an estimated genome size, we assumed a genome size around 500Mb. + +We tried out a variety of assemblers, mostly on the ONT data. + +- [canu v2.2](assemble-canu.sh) (ONT) +- [flye v2.9.2](assemble-flye.sh) (ONT) +- [shasta v.0.11.1](assemble-shasta.sh) (ONT) +- [Verkko v2.0](assemble-verkko.sh) (ONT) +- [NextDenovo v2.5.2](assemble-nextdenovo.sh) (ONT) +- [flye v2.9.2](assemble-flye-pb.sh) (PacBio) +- [hifiasm v0.19.8](assemble-hifiasm.sh) (PacBio) + + +### Evaluation + +Multiple tools failed, or required such extensive troubleshooting that we decided against using +them. We performed basic QC on each finished assembly with the +[`evaluate_assemblies.sh`](evaluate_assemblies.sh) script. + +- basic statistics with `quast`, +- assessed completeness of metazoan and arthropod single-copy ortholog genes with `BUSCO`, +- mapped available RNA-seq data onto the assembly, +- mapped the ONT and PacBio data back to the assembly. + +The best assembly, in terms of contiguity and BUSCO completeness, was produced by Flye using the ONT +data; this assembly was kept for further work. diff --git a/01-assembly/assemble-canu.sh b/01-assembly/assemble-canu.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d6bc534dd70b0aa792ee583bd1eaea138da7f78 --- /dev/null +++ b/01-assembly/assemble-canu.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=canu_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=30 +#SBATCH --mem=100G +#SBATCH --time=48:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-canu-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-canu-%j.err + +module load assembly/canu/2.2 + +# long read input +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" + +# output directory +OUTPUT="/lisc/scratch/zoology/pycnogonum/genome/canu_full/" +mkdir -p "$OUTPUT" || exit 1 + +# go to $TMPDIR +cd "$TMPDIR" || exit 1 + +# run canu +canu \ + -p ecoli \ + -d ecoli-oxford \ + genomeSize=550m \ + maxMemory=100g \ + maxThreads=30 \ + minReadLength=500 \ + fast=true \ + -nanopore $NANOPORE + +# copy all the relevant output to the output directory +cp -r "$TMPDIR"/ "$OUTPUT" + +# remove $TMPDIR +cd "$OUTPUT" || exit 1 +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/assemble-flye-pb.sh b/01-assembly/assemble-flye-pb.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0560e899c76ce1947b873dc82e589a297da1e97 --- /dev/null +++ b/01-assembly/assemble-flye-pb.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=flye_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=50G +#SBATCH --time=10:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-flye-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-flye-%j.err + +module load conda +conda activate flye-2.9.5 + +HIFI="/lisc/scratch/zoology/pycnogonum/raw/m64120_240312_152428.ccs.fasta" +OUTPUT=/lisc/scratch/zoology/pycnogonum/genome/flye_pb/ + +mkdir -p $OUTPUT + +# Run Flye +flye --pacbio-hifi $HIFI \ +--threads 16 \ +--out-dir $OUTPUT diff --git a/01-assembly/assemble-flye.sh b/01-assembly/assemble-flye.sh new file mode 100644 index 0000000000000000000000000000000000000000..730d3b252ce997a71f56afb78a89e733fe0e7528 --- /dev/null +++ b/01-assembly/assemble-flye.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=flye_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=30 +#SBATCH --mem=100G +#SBATCH --time=10:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-flye-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-flye-%j.err + +module load conda +conda activate flye-2.9.2 + +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" +OUTPUT=/lisc/scratch/zoology/pycnogonum/genome/flye_full/ + +mkdir -p $OUTPUT + +# Run Flye +flye --nano-raw $NANOPORE \ +--threads 30 \ +--out-dir $OUTPUT diff --git a/01-assembly/assemble-hifiasm.sh b/01-assembly/assemble-hifiasm.sh new file mode 100644 index 0000000000000000000000000000000000000000..b09cebb927848e7e51885f7cd5dfd86798c96253 --- /dev/null +++ b/01-assembly/assemble-hifiasm.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=flye_hifiasm +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=50G +#SBATCH --time=16:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-hifiasm-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-hifiasm-%j.err + +module load hifiasm/0.19.8 + +HIFI="/lisc/scratch/zoology/pycnogonum/raw/m64120_240312_152428.ccs.fasta" +# NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" +# OMNIC_R1="/lisc/scratch/zoology/pycnogonum/raw/omniC/DTG_OmniC_1016_R1.fastq" +# OMNIC_R2="/lisc/scratch/zoology/pycnogonum/raw/omniC/DTG_OmniC_1016_R2.fastq" +OUTPUT=/lisc/scratch/zoology/pycnogonum/genome/hifiasm/ + +mkdir -p $OUTPUT + +hifiasm -o $OUTPUT/asm -t32 --primary $HIFI \ No newline at end of file diff --git a/01-assembly/assemble-nextdenovo.sh b/01-assembly/assemble-nextdenovo.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e0b384f6568068266ce04a67293a8f8f517ff43 --- /dev/null +++ b/01-assembly/assemble-nextdenovo.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=nextdenovo_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=20 +#SBATCH --mem=300G +#SBATCH --time=168:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-nextdenovo-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-nextdenovo-%j.err + +module load conda +conda activate nextdenovo + +# long read input +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" +# nextdenovo path +nextdenovo="/lisc/user/papadopoulos/repos/NextDenovo/nextDenovo" +# output directory +OUTPUT="/lisc/scratch/zoology/pycnogonum/genome/nextdenovo_full/" +mkdir -p "$OUTPUT" || exit 1 + +# go to $TMPDIR +cd "$TMPDIR" || exit 1 + +# copy the nanopore reads to $TMPDIR +cp $NANOPORE ./ +cp /lisc/user/papadopoulos/repos/NextDenovo/doc/pycno.cfg ./ +ls PAK64436_1_202306050.fastq.gz > input.fofn + +# run nextdenovo +$nextdenovo pycno.cfg + +# copy all the relevant output to the output directory +cp -r "$TMPDIR"/* "$OUTPUT" + +# remove $TMPDIR +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/assemble-shasta.sh b/01-assembly/assemble-shasta.sh new file mode 100644 index 0000000000000000000000000000000000000000..7434c70eb044d7a75a2e17075b4d93bdc6b41736 --- /dev/null +++ b/01-assembly/assemble-shasta.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=shasta_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=48 +#SBATCH --mem=200G +#SBATCH --time=5:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-shasta-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-shasta-%j.err + +SHASTA=/lisc/user/papadopoulos/bin/shasta-Linux-0.11.1 + +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/nanopore.fasta" # long read input +OUTPUT="/lisc/scratch/zoology/pycnogonum/genome/shasta_full/" # output directory + +# run shasta +$SHASTA --input $NANOPORE --assemblyDirectory $OUTPUT --threads 48 --config Nanopore-R10-Fast-Nov2022 + +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/assemble-verkko.sh b/01-assembly/assemble-verkko.sh new file mode 100644 index 0000000000000000000000000000000000000000..a4fa636797b35db6f762e6b784c270a1840d0427 --- /dev/null +++ b/01-assembly/assemble-verkko.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=verkko_controller +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=1G +#SBATCH --time=30-00:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-verkko-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-verkko-%j.err + +module load conda +conda activate verkko + +# long read input - ONT +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" +# long read input - PacBio CCS +PACBIO=/lisc/scratch/zoology/pycnogonum/raw/m64120_240312_152428.ccs.fastq.gz + +# output directory +OUTPUT="/lisc/scratch/zoology/pycnogonum/genome/verkko/" +mkdir -p "$OUTPUT" || exit 1 + +verkko -d $OUTPUT --hifi $PACBIO --nano $NANOPORE --local-cpus 1 --slurm --snakeopts "--cores 64 --jobs 10" \ No newline at end of file diff --git a/01-assembly/eval-backmap-ont.sh b/01-assembly/eval-backmap-ont.sh new file mode 100644 index 0000000000000000000000000000000000000000..04bae1a8d5449148fbb59ca7bb9b4f9e1861cee8 --- /dev/null +++ b/01-assembly/eval-backmap-ont.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=backmap_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=20G +#SBATCH --time=15:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-backmap-ont-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-backmap-ont-%j.err + +module purge + +module load samtools +module load bedtools +# module load bwa # only needed for short reads +module load qualimap +module unload R/4.2.3 +module load R +module load perl/5.38.2 +export PERL5LIB="/lisc/user/papadopoulos/perl5/lib/perl5" + +module load conda +conda activate minimap2-2.28 + +cd "$TMPDIR" || exit + +ASSEMBLY=$1 +OUTPUT=$2 +mkdir -p "$OUTPUT" || exit +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/PAK64436_1_202306050.fastq.gz" +BACKMAP="/lisc/user/papadopoulos/repos/backmap/backmap.pl" + +$BACKMAP -a "$ASSEMBLY" \ +-ont $NANOPORE \ +-o "$OUTPUT" \ +-t 16 \ +-qo "--java-mem-size=28G" + +# copy all the relevant output to the output directory +cp -r "$TMPDIR"/*.bam "$OUTPUT" +# delete $TMPDIR +cd "$OUTPUT" || exit +rm -rf "$TMPDIR" + +samtools stats -@ 16 *.bam > stats.txt \ No newline at end of file diff --git a/01-assembly/eval-backmap-pb.sh b/01-assembly/eval-backmap-pb.sh new file mode 100644 index 0000000000000000000000000000000000000000..72f2037096044fe68031b142878263988854108c --- /dev/null +++ b/01-assembly/eval-backmap-pb.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=backmap_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=20G +#SBATCH --time=15:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-backmap-pacbio-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-backmap-pacbio-%j.err + +module purge + +module load samtools +module load bedtools +# module load bwa # only needed for short reads +module load qualimap +module unload R/4.2.3 +module load R +module load perl/5.38.2 +export PERL5LIB="/lisc/user/papadopoulos/perl5/lib/perl5" + +module load conda +conda activate minimap2-2.28 + +cd "$TMPDIR" || exit + +ASSEMBLY=$1 +OUTPUT=$2 +mkdir -p "$OUTPUT" + +PACBIO="/lisc/scratch/zoology/pycnogonum/raw/m64120_240312_152428.ccs.fasta" +BACKMAP="/lisc/user/papadopoulos/repos/backmap/backmap.pl" + +$BACKMAP -a "$ASSEMBLY" \ +-hifi $PACBIO \ +-o "$OUTPUT" \ +-t 16 \ +-qo "--java-mem-size=28G" + +# copy all the relevant output to the output directory +cp -r "$TMPDIR"/*.bam "$OUTPUT" +# delete $TMPDIR +cd "$OUTPUT" || exit +rm -rf "$TMPDIR" + +samtools stats -@ 16 *.bam > stats.txt \ No newline at end of file diff --git a/01-assembly/eval-busco.sh b/01-assembly/eval-busco.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc4b584166dd29cd6d2ae2273e6259deea7ef1a2 --- /dev/null +++ b/01-assembly/eval-busco.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=busco_pycno +#SBATCH --cpus-per-task=16 +#SBATCH --mem=50G +#SBATCH --time=6:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-busco-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-busco-%j.err + +module load conda +conda activate busco-5.7.1 + +RESDIR=$1 +FASTA=$2 +MODE=$3 + +BUSCO="/lisc/scratch/zoology/db/busco/" + +# mkdir -p "$RESDIR"/busco_arthropoda || exit +mkdir -p "$RESDIR"/busco_metazoa || exit +cd "$TMPDIR" || exit + +# busco -i "$FASTA" -l arthropoda -m "$MODE" -o arthropoda -r -c 16 --offline --download_path $BUSCO +busco -i "$FASTA" -l metazoa -m "$MODE" -o metazoa -r -c 16 --offline --download_path $BUSCO + +cd "$RESDIR" || exit +# cp -r "$TMPDIR"/arthropoda ./busco_arthropoda/ +cp -r "$TMPDIR"/metazoa ./busco_metazoa/ +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/eval-map_RNA.sh b/01-assembly/eval-map_RNA.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff28adb201cebf4cb60b97897069b1a40d5ed55d --- /dev/null +++ b/01-assembly/eval-map_RNA.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=star_pycno +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=30 +#SBATCH --mem=30G +#SBATCH --time=6:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-star-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-star-%j.err + +module load ngstools/star/2.7.11a + +FASTA=$1 +ASSEMBLY=$(dirname "$FASTA") # path to the genome directory where the STAR index was generated +OUTPUT=$ASSEMBLY/transcriptome + +mkdir "$OUTPUT" -p || exit + +# first go into the assembly directory and create the STAR index +cd "$ASSEMBLY" || exit + +# create the STAR index +STAR --runThreadN 30 \ +--runMode genomeGenerate \ +--genomeDir "$ASSEMBLY" \ +--genomeFastaFiles "$FASTA" \ +--genomeSAindexNbases 13 + + +BASE="/lisc/scratch/zoology/pycnogonum/raw/HTT33DSX5_4_R15615_20230713/demultiplexed" + +EMBRYO3="$BASE/235255/235255_S49" +INSTAR1="$BASE/235253/235253_S47" +INSTAR6="$BASE/235254/235254_S48" +INSTAR2="$BASE/235256/235256_S50" +INSTAR3="$BASE/235257/235257_S51" +INSTAR4="$BASE/235258/235258_S52" +INSTAR5="$BASE/235259/235259_S53" +JUV1="$BASE/235260/235260_S54" +SUBADULT="$BASE/235261/235261_S55" + +# put them all in a list +TRANSCRIPTOMES=("$INSTAR1" "$INSTAR6" "$INSTAR2" "$INSTAR3" "$INSTAR4" "$INSTAR5" "$JUV1" "$SUBADULT") + +R1="$EMBRYO3"_L004_R1_001.fastq.gz +R2="$EMBRYO3"_L004_R2_001.fastq.gz + +# now loop through the list and concatenate the R1/R2 files +for stage in "${TRANSCRIPTOMES[@]}"; do + R1=$R1,"$stage"_L004_R1_001.fastq.gz + R2=$R2,"$stage"_L004_R2_001.fastq.gz +done + +cd "$TMPDIR" || exit + +STAR --runThreadN 30 \ +--genomeDir "$ASSEMBLY" \ +--readFilesIn "$R1" "$R2" \ +--readFilesCommand gunzip -c + +# copy relevant files +mkdir "$OUTPUT" -p || exit + +cp -r "$TMPDIR"/Log.* "$OUTPUT"/ +cp -r "$TMPDIR"/Aligned.* "$OUTPUT"/ + +# remove $TMPDIR +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/eval-quast.sh b/01-assembly/eval-quast.sh new file mode 100644 index 0000000000000000000000000000000000000000..d7df3081a884a83bda1aa9a9b2912430bd82d1d3 --- /dev/null +++ b/01-assembly/eval-quast.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=quast_pycno +#SBATCH --cpus-per-task=1 +#SBATCH --mem=600M +#SBATCH --time=10:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-quast-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-quast-%j.err + +module load conda +conda activate quast-5.2.0 + +ASSEMBLY=$1 +RESDIR=$2 +NAME=$3 +mkdir "$RESDIR" -p + +quast.py -t 1 \ +-l "$NAME" \ +-o "$RESDIR" \ +--gene-finding \ +--eukaryote \ +"$ASSEMBLY" \ No newline at end of file diff --git a/01-assembly/evaluate_assemblies.sh b/01-assembly/evaluate_assemblies.sh new file mode 100644 index 0000000000000000000000000000000000000000..db9a5e0180b054acbb23341a2c7e43ccbcb4e93e --- /dev/null +++ b/01-assembly/evaluate_assemblies.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +# This script is used to start all evaluation analysis for the different genome assemblies + +# keep track of our assemblies +BASE="/lisc/scratch/zoology/pycnogonum/genome" +FLYE=${BASE}/flye_full/assembly.fasta +SCAFFOLD=${BASE}/draft/draft.fasta +REPEATS=${BASE}/draft/draft_softmasked.fasta +SHASTA=${BASE}/shasta_full/Assembly.fasta + + +# put assemblies in an array +ASSEMBLIES=("$SCAFFOLD") +NAMES=("draft") + +# I will check three things for each assembly: +# 1. BUSCO +# 2. QUAST +# 3. RNAseq mapping +# 4. mapping to the Nymphon genome +# 5. DNA backmapping + +##### Tools +BUSCO=/lisc/user/papadopoulos/repos/pycno-seq/nanopore/eval-busco.sh +QUAST=/lisc/user/papadopoulos/repos/pycno-seq/nanopore/eval-quast.sh +STAR=/lisc/user/papadopoulos/repos/pycno-seq/nanopore/eval-map_RNA.sh +BACKMAP_ONT=/lisc/user/papadopoulos/repos/pycno-seq/nanopore/eval-backmap-ont.sh +BACKMAP_PB=/lisc/user/papadopoulos/repos/pycno-seq/pacbio/eval-backmap-pb.sh + +# loop over the assemblies and names and submit the evaluation jobs for each +# use the appropriate name for each assembly +for i in "${!ASSEMBLIES[@]}"; do + # BUSCO completeness + ASSEMBLY=${ASSEMBLIES[$i]} + RESDIR=$(dirname "$ASSEMBLY") + sbatch "$BUSCO" "$RESDIR" "$ASSEMBLY" + + # quast for basic assembly qc stats + NAME=${NAMES[$i]} + RESDIR=$(dirname "$ASSEMBLY") + sbatch "$QUAST" "$ASSEMBLY" "$RESDIR"/quast "$NAME" + + # map RNA reads with STAR + sbatch "$STAR" "$ASSEMBLY" + + # map the DNA reads to the assembly with backmap + RESDIR=$(dirname "$ASSEMBLY")/backmap/ + sbatch $BACKMAP_ONT "$ASSEMBLY" "$RESDIR"/backmap/ont + sbatch $BACKMAP_PB "$ASSEMBLY" "$RESDIR"/backmap/pacbio +done diff --git a/01-assembly/preprocess-jellyfish_count-ont.sh b/01-assembly/preprocess-jellyfish_count-ont.sh new file mode 100644 index 0000000000000000000000000000000000000000..4499717454cbd6adc5b37badf9bc29a9b9b8909b --- /dev/null +++ b/01-assembly/preprocess-jellyfish_count-ont.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=jellyfish_pycno +#SBATCH --cpus-per-task=16 +#SBATCH --mem=75G +#SBATCH --time=1:30:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-pre-jf-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-pre-jf-%j.err + +module load jellyfish/2.3.0 + +k=$1 +NANOPORE="/lisc/scratch/zoology/pycnogonum/raw/20230601_0904_3C_PAK64436_4d39d6a6/basecalls/202306050/nanopore.fasta" +OUTPUT="/lisc/scratch/zoology/pycnogonum/genome/ont-jellyfish-k$k.res" + +cd "$TMPDIR" || exit 1 + +jellyfish count -C -m "$k" -s 1000000000 -t 16 $NANOPORE -o reads.jf +jellyfish histo -t 16 reads.jf > "$OUTPUT" + +# remove $TMPDIR +cd "$OUTPUT" || exit 1 +rm -rf "$TMPDIR" \ No newline at end of file diff --git a/01-assembly/preprocess-jellyfish_count-pb.sh b/01-assembly/preprocess-jellyfish_count-pb.sh new file mode 100644 index 0000000000000000000000000000000000000000..7911b5d23a13886f22d923627e4c8ab359705d23 --- /dev/null +++ b/01-assembly/preprocess-jellyfish_count-pb.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=jellyfish_pycno +#SBATCH --cpus-per-task=16 +#SBATCH --mem=50G +#SBATCH --time=3:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/home/user/papadopoulos/log/pycno-pre-jf-%j.out +#SBATCH --error=/home/user/papadopoulos/log/pycno-pre-jf-%j.err + +module load jellyfish/2.3.0 + +k=$1 +PACBIO="/scratch/zoology/pycnogonum/raw/m64120_240312_152428.ccs.fasta" +OUTPUT="/scratch/zoology/pycnogonum/genome/pb-jellyfish-k$k.res" + +cd "$TMPDIR" || exit 1 + +jellyfish count -C -m "$k" -s 1000000000 -t 16 $PACBIO -o reads.jf +jellyfish histo -t 16 reads.jf > "$OUTPUT" + +# remove $TMPDIR +cd || exit 1 +rm -rf "$TMPDIR" \ No newline at end of file