diff --git a/02-scaffolding/README.md b/02-scaffolding/README.md new file mode 100644 index 0000000000000000000000000000000000000000..00b5672787c30efc026934bbbff2673b0738ce5e --- /dev/null +++ b/02-scaffolding/README.md @@ -0,0 +1,23 @@ +# Scaffolding + +Code in this folder covers the steps from the initial draft genome (produced by Flye with the ONT +data) until the raw scaffolded assembly (pre-juicebox). + +Here we repurposed a pipeline by Darrin Schultz, available via the [Genome Assembly +Pipelines](https://github.com/conchoecia/genome_assembly_pipelines) repository, in particular the +[yahs +rulefile](https://github.com/conchoecia/genome_assembly_pipelines/blob/master/snakefiles/GAP_yahs). +To fit it better to our computing environment, we broke it down into three steps: + +- mapping the omni-C data onto the draft genome: + [`scaffold-01-chromap.sh`](./scaffold-01-chromap.sh) +- actually scaffolding with [`yahs`](./scaffold-02-yahs.sh)... +- or [`salsa`](./scaffold-02-salsa.sh) +- producing the files that would be needed for visualisation (and editing) with juicebox: + [`scaffold-03-visualize.sh`](./scaffold-03-visualize.sh) + +We used the same evaluation scripts as during the assembly procedure to evaluate the quality of the +scaffolded assemblies. We decided in favor of the `yahs` assembly, as it had higher contiguity and +better BUSCO scores. Following that, the assembly and omniC map were manually edited in +[juicebox](https://github.com/aidenlab/Juicebox) to correct clear chromosome rearrangements and +smaller misassemblies. The corrected scaffold was exported in FASTA form and used from here on out. \ No newline at end of file diff --git a/02-scaffolding/scaffold-01-chromap.sh b/02-scaffolding/scaffold-01-chromap.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed1e68e0b657839492dfb8d3e1b63e770e68b6aa --- /dev/null +++ b/02-scaffolding/scaffold-01-chromap.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=pycno-chromap +#SBATCH --cpus-per-task=16 +#SBATCH --mem=15G +#SBATCH --time=1:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-scaf-chromap-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-scaf-chromap-%j.err + +# I had trouble figuring out how to correctly run YAHS so I asked Darrin. He pointed me to his +# genome assembly pipeline repository and in particular the yahs rulefile +# (https://github.com/conchoecia/genome_assembly_pipelines/blob/master/snakefiles/GAP_yahs). I tried +# installing/running that on my mac but ran into stupid issues because of clang/llvm problems. + +# I don't think I can run the snakemake workflow on the cluster as-is. since it isn't optimised for +# our cluster and will try to install stuff instead of using already existing modules. I will +# instead try to run all the steps one by one and adapt the rules to LiSC/SLURM scripts. + +# part 1: from raw files to YAHS input + +module load conda +conda activate chromap + +module load samtools + +# ASSEMBLY="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.fasta" +ASSEMBLY="/lisc/scratch/zoology/pycnogonum/genome/draft/draft.fasta" +R1="/lisc/scratch/zoology/pycnogonum/raw/omniC/DTG_OmniC_1016_R1.fastq" +R2="/lisc/scratch/zoology/pycnogonum/raw/omniC/DTG_OmniC_1016_R2.fastq" +OUT="/lisc/scratch/zoology/pycnogonum/genome/draft/chromap" + +mkdir -p $OUT +cd $OUT || exit + +# index ref (line 100) +chromap -i -r $ASSEMBLY -o asm.index +# hic to pairs (line 112) +chromap --preset hic -x asm.index -r $ASSEMBLY -1 $R1 -2 $R2 -t 16 -q 5 -o asm_q_5.pairs +# possibly run with -q 0 to figure out true number of chromosomes? +# hic to sam (line 138) +chromap --preset hic -x asm.index -r $ASSEMBLY -1 $R1 -2 $R2 -t 16 --SAM -o asm_hic.sam + +# sam to bam (line 163) +samtools view -@ 16 -hb asm_hic.sam | samtools sort -@ 16 -o asm_hic.sorted.bam +# index assembly (line 204) +samtools faidx $ASSEMBLY \ No newline at end of file diff --git a/02-scaffolding/scaffold-02-salsa.sh b/02-scaffolding/scaffold-02-salsa.sh new file mode 100644 index 0000000000000000000000000000000000000000..5ae6994b743e31057b04ae285af96306bae02f72 --- /dev/null +++ b/02-scaffolding/scaffold-02-salsa.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=pycno-salsa2 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=25G +#SBATCH --time=2:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-scaf-salsa2-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-scaf-salsa2-%j.err + +module load bedtools +module load conda +conda activate salsa2-2.3 + +ASSEMBLY="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.fasta" +LENGTH="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.fasta.fai" +GFA="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.gfa" +OUT="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/salsa2/" +BAM="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/asm_hic.sorted.bam" + +mkdir -p "$OUT" || exit 1 +cd "$TMPDIR" || exit 1 + +# echo "Converting BAM to BED" +# # convert final BAM file to BED format +# bedtools bamtobed -i $BAM > $BED +# # sort bed file by name +# sort -k 4 $BED > tmp && mv tmp $BED + +echo "Running SALSA2" +run_pipeline.py -a $ASSEMBLY -l $LENGTH -b $BAM -o $OUT -g $GFA -e DNASE -s 600 -m yes -p yes \ No newline at end of file diff --git a/02-scaffolding/scaffold-02-yahs.sh b/02-scaffolding/scaffold-02-yahs.sh new file mode 100644 index 0000000000000000000000000000000000000000..e01a3d8bde67ccdcd3eec3f4279abaa6b0b42b31 --- /dev/null +++ b/02-scaffolding/scaffold-02-yahs.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=pycno-yahs +#SBATCH --cpus-per-task=1 +#SBATCH --mem=25G +#SBATCH --time=20:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-scaf-yahs-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-scaf-yahs-%j.err + +# I had trouble figuring out how to correctly run YAHS so I asked Darrin. He pointed me to his +# genome assembly pipeline repository and in particular the yahs rulefile +# (https://github.com/conchoecia/genome_assembly_pipelines/blob/master/snakefiles/GAP_yahs). I tried +# installing/running that on my mac but ran into stupid issues because of clang/llvm problems. I am +# adapting the rules to SLURM scripts. + +# rule run_yahs_on_the_data (line 237) +module load yahs/1.2a.2 + +ASSEMBLY="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.fasta" +BAM="/lisc/scratch/zoology/pycnogonum/genome/flye_full/darrin/asm_hic.sorted.bam" +OUT="/lisc/scratch/zoology/pycnogonum/genome/flye_full/darrin/" + +mkdir -p "$OUT" || exit 1 + +yahs -o $OUT/yahs.out --no-contig-ec --no-scaffold-ec --no-mem-check -v 2 $ASSEMBLY $BAM \ No newline at end of file diff --git a/02-scaffolding/scaffold-03-visualize.sh b/02-scaffolding/scaffold-03-visualize.sh new file mode 100644 index 0000000000000000000000000000000000000000..78ec53adc4cabc0df0ecbc363cec8a69c104dd37 --- /dev/null +++ b/02-scaffolding/scaffold-03-visualize.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# +#SBATCH --job-name=pycno-juicer +#SBATCH --cpus-per-task=16 +#SBATCH --mem=32G +#SBATCH --time=1:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=nikolaos.papadopoulos@univie.ac.at +#SBATCH --output=/lisc/user/papadopoulos/log/pycno-scaf-juicer-%j.out +#SBATCH --error=/lisc/user/papadopoulos/log/pycno-scaf-juicer-%j.err + +# I had trouble figuring out how to correctly run YAHS so I asked Darrin. He pointed me to his +# genome assembly pipeline repository and in particular the yahs rulefile +# (https://github.com/conchoecia/genome_assembly_pipelines/blob/master/snakefiles/GAP_yahs). I tried +# installing/running that on my mac but ran into stupid issues because of clang/llvm problems. I am +# adapting the rules to SLURM scripts. + +# rule generate_long_file_for_JBAT (line 259) +module load yahs/1.2a.2 + +INDEX="/lisc/scratch/zoology/pycnogonum/genome/flye_full/assembly.fasta.fai" +BIN="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/yahs.out.bin" +AGP="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/yahs.out_scaffolds_final.agp" +ASSEMBLY="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/yahs.out_scaffolds_final.fa.assembly" +OUT="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/" + +cd $OUT || exit 1 + +juicer pre $BIN $AGP $INDEX | \ + sort -k2,2d -k6,6d -k3,3n -k7,7n -T ./ --parallel=16 -S32G | \ + awk 'NF' | \ + awk '$2<=$6{{print $1, $2, $3, $4, 0, $6, $7, $8, "1 - - 1 - - -" }} \ + $6<$2{{print $1, $6, $7, $4, 0, $2, $3, $8, "1 - - 1 - - -" }}' > yahs_out_scaffolds_final.long + +LONG="/lisc/scratch/zoology/pycnogonum/genome/flye_full/scaffold/yahs_out_scaffolds_final.long" + +# rule generate_assembly_for_hic_gen (line 308) +AWKSCRIPT="/lisc/user/papadopoulos/repos/genome_assembly_pipelines/bin/generate-assembly-file-from-fasta.awk" +awk -f $AWKSCRIPT yahs.out_scaffolds_final.fa > yahs.out_scaffolds_final.fa.assembly + +# rule JBAT_pairs_to_hic (line 338)" +cd "$TMPDIR" || exit 1 + +VIS3DDNA="/lisc/user/papadopoulos/repos/3d-dna/visualize/run-assembly-visualizer.sh" +# make a hic file +bash $VIS3DDNA -p false $ASSEMBLY $LONG || true +# move to result dir, get out, delete TMPDIR +cp "$TMPDIR"/*.hic $OUT +cd $OUT || exit 1 +rm -rf "$TMPDIR" \ No newline at end of file