diff --git a/05-transcriptomes/README.md b/05-transcriptomes/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..65c423af82f25d48ae956dafb1939a2ccbfea0e2
--- /dev/null
+++ b/05-transcriptomes/README.md
@@ -0,0 +1,66 @@
+# Developmental transcriptomes
+
+We obtained developmental transcriptomes from different stages of *Pycnogonum litorale* development.
+Staging was done after molts, where possible. The following stages were sampled:
+
+- embryonic stage 3-4
+- instar I-protonymphon
+- instar II
+- instar III
+- instar IV
+- instar V
+- instar VI
+- juvenile I
+- subadult
+
+The samples were sequenced on the Illumina NovaSeq platform, with 150bp paired-end reads.
+
+We also obtained short-read sequencing data from our collaborators, though at lower sequencing
+coverage:
+
+- zygote
+- early cleavage
+- embryo 0-1
+- embryo 3-5
+- embryo 9-10
+- mixed instar II-V
+
+We also generated long-read HiFi datasets from mixed developmental stages of _Pycnogonum litorale_
+as part of a multiplexed Iso-seq run, together with samples from two further species.
+
+### Short-read de novo transcriptome assembly
+
+We assembled the deeply sequenced short-read transcriptomes (embryonic stage 3-4, instar
+I-protonymphon, instar II, instar III, instar IV, instar V, instar VI, juvenile I, subadult) with
+Trinity.
+
+- [`assemble-all-sep.sh`](assemble-all-sep.sh): main script, submits single scripts
+- [`assemble-single.sh`](assemble-single.sh): commands to assemble a single dataset
+
+### Iso-seq processing
+
+We followed the protocol for Iso-seq processing as [described by the
+manufacturer](https://isoseq.how/clustering/).
+
+- Produce [consensus](isoseq-01-consensus.sh) HiFi reads from the ZMWs.
+- Remove primers and identify barcodes with [`lima`](isoseq-02-lima.sh)
+- Trim poly(A) tails and remove concatemers with [`refine`](isoseq-03-refine.sh)
+- [Cluster](isoseq-04-cluster.sh) the reads into isoforms
+- [Align](isoseq-05-align.sh) to the genome
+
+### Evaluation
+
+We [evaluated](eval-busco.sh) the de-novo assemblies with BUSCO v5.2.0, using the arthropoda_odb10
+database.
+
+| stage                 | complete | single | duplicated | fragmented | missing | total |
+|-----------------------|----------|--------|------------|------------|---------|-------|
+| embryonic_stage3-4    | 98.1%    | 44.7%  | 53.4%      | 0.7%       | 1.2%    | 1013  |
+| instar_I-protonymphon | 98.8%    | 47.3%  | 51.5%      | 0.7%       | 0.5%    | 1013  |
+| instar_II             | 99.4%    | 42.3%  | 57.1%      | 0.3%       | 0.3%    | 1013  |
+| instar_III            | 99.2%    | 43.5%  | 55.7%      | 0.4%       | 0.4%    | 1013  |
+| instar_IV             | 99.2%    | 37.3%  | 61.9%      | 0.4%       | 0.4%    | 1013  |
+| instar_V              | 99.3%    | 44.7%  | 54.6%      | 0.4%       | 0.3%    | 1013  |
+| instar_VI             | 99.0%    | 47.0%  | 52.0%      | 0.5%       | 0.5%    | 1013  |
+| juvenile_I            | 99.2%    | 44.1%  | 55.1%      | 0.4%       | 0.4%    | 1013  |
+| subadult              | 99.3%    | 47.5%  | 51.8%      | 0.2%       | 0.5%    | 1013  |
diff --git a/05-transcriptomes/assemble-all-sep.sh b/05-transcriptomes/assemble-all-sep.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4701d3b5a061d6b61037d253b32f36c85bb520a3
--- /dev/null
+++ b/05-transcriptomes/assemble-all-sep.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Submit the transcriptome of each developmental stage to the cluster to be assembled with
+# Trinity. I will then merge the assemblies into a single one.
+#
+# Every job runs assemble-single.sh with three arguments: the R1 reads, the R2 reads and the
+# output directory of that library.
+
+INPUT="/scratch/zoology/pycnogonum/HTT33DSX5_4_R15615_20230713/demultiplexed"
+BASE="/scratch/zoology/pycnogonum/transcriptome/dev_timepoints"
+# NOTE(review): the SLURM scripts in this directory write their logs under
+# /lisc/user/papadopoulos; confirm that this /home/user path is still valid.
+SCRIPT="/home/user/papadopoulos/repos/pycno-seq/dev_transcriptomes/assemble-single.sh"
+
+# One entry per library: "<library ID> <sample number> <job name suffix> <output dir suffix>".
+# The last two fields differ only for the embryonic sample.
+SAMPLES=(
+  "235253 S47 protonymphon_instar1 protonymphon_instar1"
+  "235254 S48 instar6 instar6"
+  "235255 S49 embryonic_stage3 embryonic_stage3-4"
+  "235256 S50 instar2 instar2"
+  "235257 S51 instar3 instar3"
+  "235258 S52 instar4 instar4"
+  "235259 S53 instar5 instar5"
+  "235260 S54 juvenile1 juvenile1"
+  "235261 S55 subadult subadult"
+)
+
+for SAMPLE in "${SAMPLES[@]}"; do
+  # split the entry on whitespace into its four fields
+  read -r ID SNUM JOB OUT <<< "$SAMPLE"
+  # common prefix of the two read files of this library
+  READS="${INPUT}/${ID}/${ID}_${SNUM}_L004"
+  sbatch -J "pycno-trinity-${JOB}" "$SCRIPT" \
+    "${READS}_R1_001.fastq.gz" "${READS}_R2_001.fastq.gz" "${BASE}/${ID}_trinity_${OUT}/"
+done
diff --git a/05-transcriptomes/assemble-single.sh b/05-transcriptomes/assemble-single.sh
new file mode 100644
index
0000000000000000000000000000000000000000..878b5e0c59c4761030f59581a1dd20dfc8dc7585
--- /dev/null
+++ b/05-transcriptomes/assemble-single.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+#
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=50G
+#SBATCH --time=48:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/%x-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/%x-%j.err
+
+# Assemble a single paired-end RNA-seq library de novo with Trinity.
+#
+# Usage: sbatch assemble-single.sh <R1.fastq.gz> <R2.fastq.gz> <output directory>
+
+# refuse to start with missing arguments instead of failing somewhere inside Trinity
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <R1.fastq.gz> <R2.fastq.gz> <output directory>" >&2
+  exit 1
+fi
+
+module load assembly/trinityrnaseq/2.15.1
+
+R1=$1
+R2=$2
+ASSEMBLY=$3
+
+mkdir -p "$ASSEMBLY" || exit 1
+
+# --CPU and --max_memory repeat the resources requested from SLURM above; keep them in sync
+Trinity \
+    --seqType fq \
+    --max_memory 50G \
+    --trimmomatic \
+    --no_salmon \
+    --output "$ASSEMBLY" \
+    --CPU 16 \
+    --left "$R1" \
+    --right "$R2"
diff --git a/05-transcriptomes/eval-busco.sh b/05-transcriptomes/eval-busco.sh
new file mode 100644
index 0000000000000000000000000000000000000000..04fbec1527bcc891a14d3f69b914db969ca7c410
--- /dev/null
+++ b/05-transcriptomes/eval-busco.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Submit the BUSCO script once for every de novo transcriptome assembly.
+
+# directory that holds one sub-directory per developmental stage
+BASE="/scratch/zoology/pycnogonum/transcriptome/development"
+
+# sub-directories of the de novo transcriptomes
+# NOTE(review): "instarI-protonymphon" does not follow the "instar_<N>" pattern of the other
+# entries - confirm the directory name.
+STAGES=(
+  embryonic_stage3-4
+  instarI-protonymphon
+  instar_II
+  instar_III
+  instar_IV
+  instar_V
+  instar_VI
+  juvenile_I
+  subadult
+)
+
+# define BUSCO script
+BUSCO=/lisc/user/papadopoulos/repos/pycno-seq/nanopore/eval-busco.sh
+
+# loop over the assemblies and submit the BUSCO job for each; the BUSCO script receives the
+# assembly directory and the Trinity FASTA file inside it
+for STAGE in "${STAGES[@]}"; do
+  LOC="${BASE}/${STAGE}"
+  sbatch "$BUSCO" "$LOC" "$LOC"/Trinity.fasta
+done
diff --git a/05-transcriptomes/isoseq-01-consensus.sh b/05-transcriptomes/isoseq-01-consensus.sh
new file mode 100644
index
0000000000000000000000000000000000000000..f50352e62389a38eafc524fe6996792d41acca9a
--- /dev/null
+++ b/05-transcriptomes/isoseq-01-consensus.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+#SBATCH --job-name=pb-consensus
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --mem=5G
+#SBATCH --time=5:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-isoseq-ccs-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-isoseq-ccs-%j.err
+
+# Iso-seq step 1: generate CCS (consensus) reads from the PacBio subreads.
+
+module load conda
+conda activate pbccs-6.4.0
+
+# no trailing slash, so that the paths built below do not contain "//"
+PACBIO="/lisc/scratch/zoology/pycnogonum/raw/r64046_20240215_071715_C02"
+SUBREADS="$PACBIO/m64046_240218_003256.subreads.bam"
+CCS="$PACBIO/ccs.bam"
+
+cd "$TMPDIR" || exit
+
+# generate CCS reads
+# use the MIMALLOC env proposed by the pbccs documentation
+# --num-threads repeats --cpus-per-task from above
+MIMALLOC_PAGE_RESET=0 MIMALLOC_LARGE_OS_PAGES=1 ccs "$SUBREADS" "$CCS" \
+  --num-threads 64 --log-level INFO --report-file "$CCS".report.txt
diff --git a/05-transcriptomes/isoseq-02-lima.sh b/05-transcriptomes/isoseq-02-lima.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2dff87cdd0e521fa4239f901badec67d3ddfa2ec
--- /dev/null
+++ b/05-transcriptomes/isoseq-02-lima.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+#SBATCH --job-name=pb-demux
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=500M
+#SBATCH --time=5:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-isoseq-demux-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-isoseq-demux-%j.err
+
+# Iso-seq step 2: remove primers and demultiplex the CCS reads with lima.
+
+module load conda
+conda activate lima-2.9.0
+
+# no trailing slash, so that the paths built below do not contain "//"
+PACBIO="/lisc/scratch/zoology/pycnogonum/raw/r64046_20240215_071715_C02"
+CCS="$PACBIO/ccs.bam"
+BARCODES=/lisc/user/papadopoulos/repos/pycno-seq/dev_transcriptomes/barcodes.fasta
+DEMUX="$PACBIO/fl.bam"
+
+cd "$TMPDIR" || exit
+
+# demultiplex CCS reads with lima
+# NOTE(review): isoseq-03-refine.sh reads the demultiplexed BAM files from "$PACBIO/processed";
+# presumably lima's output is moved there by hand - confirm.
+lima "$CCS" "$BARCODES" "$DEMUX" \
+  --isoseq --peek-guess --num-threads 32 --log-level INFO --log-file "$DEMUX".report.txt
diff --git a/05-transcriptomes/isoseq-03-refine.sh b/05-transcriptomes/isoseq-03-refine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8a1a53b01fcf9e907bb7609c8e67a13b7903af4e
--- /dev/null
+++ b/05-transcriptomes/isoseq-03-refine.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+#SBATCH --job-name=pb-refine
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=1G
+#SBATCH --time=10:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-isoseq-refine-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-isoseq-refine-%j.err
+
+# Iso-seq step 3: run `isoseq refine` (requiring poly(A) tails) on the demultiplexed reads,
+# once for each of the three species of the multiplexed run.
+
+module load conda
+conda activate isoseq3-4.0.0
+
+PACBIO="/lisc/scratch/zoology/pycnogonum/raw/r64046_20240215_071715_C02/processed"
+BARCODES=/lisc/user/papadopoulos/repos/pycno-seq/dev_transcriptomes/barcodes.fasta
+
+cd "$TMPDIR" || exit
+
+# remember whether any species failed, so that the job does not report success just because the
+# last one worked
+STATUS=0
+
+for SPECIES in thylaeodus scutopus pycnogonum; do
+  DEMUX="$PACBIO/fl.${SPECIES}_NEB_5p--NEB_Clontech_3p.bam"
+  FLNC="$PACBIO/flnc.${SPECIES}.bam"
+  isoseq refine "$DEMUX" "$BARCODES" "$FLNC" \
+    --num-threads 32 --log-level INFO --log-file "$FLNC".report.txt --require-polya \
+    || STATUS=1
+done
+
+# alternative run for Pycnogonum without --require-polya (disabled):
+# DEMUX=$PACBIO/fl.pycnogonum_NEB_5p--NEB_Clontech_3p.bam
+# FLNC=$PACBIO/flnc.pycnogonum-nopolyA.bam
+# isoseq refine "$DEMUX" "$BARCODES" "$FLNC" --num-threads 32 --log-level INFO --log-file "$FLNC".report.txt
+
+exit "$STATUS"
diff --git a/05-transcriptomes/isoseq-04-cluster.sh b/05-transcriptomes/isoseq-04-cluster.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cc4e729405a7e1a4e80e3b4d0bcb2f53d2085837
--- /dev/null
+++ b/05-transcriptomes/isoseq-04-cluster.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+#SBATCH --job-name=pb-cluster
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=20G
+#SBATCH --time=60:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-isoseq-cluster-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-isoseq-cluster-%j.err
+
+# Iso-seq step 4: cluster the refined reads (flnc.*.bam) of each species with `isoseq cluster2`,
+# then convert the clustered BAM files to FASTA.
+
+module load conda
+
+PACBIO="/lisc/scratch/zoology/pycnogonum/raw/r64046_20240215_071715_C02/processed"
+SPECIES_LIST=(thylaeodus scutopus pycnogonum)
+
+cd "$TMPDIR" || exit
+
+# remember whether any step failed, so that the job does not report success just because the
+# last command worked
+STATUS=0
+
+# NOTE(review): this activation used to be commented out, which left `isoseq` to be found in
+# whatever environment the job inherited. Re-enabled to match isoseq-03-refine.sh and
+# isoseq-05-align.sh - confirm that this environment provides `cluster2`.
+conda activate isoseq3-4.0.0
+for SPECIES in "${SPECIES_LIST[@]}"; do
+  isoseq cluster2 "$PACBIO/flnc.${SPECIES}.bam" "$PACBIO/clustered.${SPECIES}.bam" \
+    --num-threads 32 --log-level INFO \
+    || STATUS=1
+done
+conda deactivate
+
+# now convert the clustered BAM files to FASTA
+conda activate pacbio
+for SPECIES in "${SPECIES_LIST[@]}"; do
+  bam2fasta -o "$PACBIO/${SPECIES}_isoseq" --num-threads 32 "$PACBIO/clustered.${SPECIES}.bam" \
+    || STATUS=1
+done
+conda deactivate
+
+exit "$STATUS"
diff --git a/05-transcriptomes/isoseq-05-align.sh b/05-transcriptomes/isoseq-05-align.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5f6acf017e921c591f0e3395381a4de3c0268e72
--- /dev/null
+++ b/05-transcriptomes/isoseq-05-align.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+#SBATCH --job-name=pb-align
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=10G
+#SBATCH --time=10:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at
+#SBATCH --output=/lisc/user/papadopoulos/log/pycno-isoseq-align-%j.out
+#SBATCH --error=/lisc/user/papadopoulos/log/pycno-isoseq-align-%j.err
+
+# Iso-seq step 5: align the clustered Pycnogonum transcripts to the draft genome with pbmm2 and
+# collapse the alignments with `isoseq collapse`.
+#
+# --cpus-per-task covers the largest thread count used below (`isoseq collapse -j 32`); keep the
+# two in sync.
+
+module load conda
+
+# input
+PACBIO="/lisc/scratch/zoology/pycnogonum/raw/r64046_20240215_071715_C02/processed"
+DRAFT="/lisc/scratch/zoology/pycnogonum/genome/draft/draft_softmasked.fasta"
+FLNC="$PACBIO/flnc.pycnogonum.bam"
+TRANSCRIPTS="$PACBIO/clustered.pycnogonum.bam"
+
+# output
+MAPPED="/lisc/scratch/zoology/pycnogonum/transcriptome/isoseq/mapped.bam"
+GFF="/lisc/scratch/zoology/pycnogonum/transcriptome/isoseq/collapsed.gff"
+
+cd "$TMPDIR" || exit
+
+# stop at the first failing step: the second step needs the output of the first, and the
+# trailing `conda deactivate` would otherwise hide the failure from SLURM
+conda activate pacbio
+pbmm2 align --preset ISOSEQ --sort -j 16 -J 4 -m 1200M --log-level INFO \
+  "$TRANSCRIPTS" "$DRAFT" "$MAPPED" || exit 1
+conda deactivate
+
+conda activate isoseq3-4.0.0
+isoseq collapse --do-not-collapse-extra-5exons --log-level INFO -j 32 \
+  "$MAPPED" "$FLNC" "$GFF" || exit 1
+conda deactivate