Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
Hinch Single Sperm DNA Seq Processing
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
BioCellGen-public
Hinch Single Sperm DNA Seq Processing
Commits
70177c4b
Commit
70177c4b
authored
3 years ago
by
Ruqian Lyu
Browse files
Options
Downloads
Patches
Plain Diff
add run_alingment workflow
parent
04205596
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
run_alignment.snk
+175
-0
175 additions, 0 deletions
run_alignment.snk
sampleNames_meta.txt
+204
-0
204 additions, 0 deletions
sampleNames_meta.txt
with
379 additions
and
0 deletions
run_alignment.snk
0 → 100755
+
175
−
0
View file @
70177c4b
## Snakefile for mapping the DNA reads of each sperm cell to the mouse reference genome
## Author: Ruqian Lyu
## rlyu@svi.edu.au
## Date: 2021/03/25
## run with snakemake -s run_alignment.snk --cores 10 --keep-going --use-singularity --singularity-args "-B /mnt/mcscratch:/mnt/mcscratch,/mnt/beegfs:/mnt/beegfs" --use-conda -j 10 -c "sbatch --mem {resources.mem} --time=120:00:00 --cpus-per-task={resources.cpus} --error=Snakefile_bulkagrf.log.out"
import pandas as pd
# Root directory (relative to the working directory) for all persistent results.
outdir = "output/alignment"
# BeeGFS location where the rules below write their temp() intermediates.
beegfs_dir = "/mnt/beegfs/mcfiles/hinch"

# mm10 reference FASTA used by the aligner, plus its .fai index and sequence
# dictionary (the latter two are not referenced by any rule visible here).
ref_genome = "/mnt/mcscratch/rlyu/Projects/Snakemake_projects/yeln_2019_spermtyping/references/cellranger-ref-dna-mm10/fasta/genome.fa"
fasta_idx = "/mnt/mcscratch/rlyu/Projects/Snakemake_projects/yeln_2019_spermtyping/references/cellranger-ref-dna-mm10/fasta/genome.fa.fai"
fasta_dict = "/mnt/mcscratch/rlyu/Projects/Snakemake_projects/yeln_2019_spermtyping/references/cellranger-ref-dna-mm10/fasta/genome.dict"

# Tab-separated sample sheet with a header row; its `sample_name` column
# drives the {sample_name} wildcard and therefore the FASTQ paths looked up
# by rule fastp.
sample_meta_file = "sampleNames_meta.txt"
sample_meta = pd.read_csv(sample_meta_file, sep="\t", header=0)
sample_names = pd.Series(sample_meta["sample_name"])
# Default target: the duplicate-marked, sorted BAM of every sample listed in
# the sample sheet (produced by rule cp_back_bam).
# NOTE(review): the read-group BAMs from rules addRG / index_rg_bam are not
# requested here, so those rules only run when targeted explicitly -- confirm
# that this is intended.
rule all:
    input:
        expand(
            outdir + "/cleanBam/{sample_name}.mkdup.sort.bam",
            sample_name=sample_names,
        )
# Run fastp on the paired FASTQ files of one sample and write the processed,
# gzipped read pair to <outdir>/fastp.
#
# fastp always writes a JSON and an HTML report; without explicit names they
# default to `fastp.json` / `fastp.html` in the working directory, so jobs
# running in parallel would overwrite each other's reports. The report paths
# are therefore set per sample, next to the per-sample log (Snakemake creates
# the log directory before the job starts).
rule fastp:
    input:
        fastq_file1="{sample_name}/{sample_name}_1.fastq",
        fastq_file2="{sample_name}/{sample_name}_2.fastq"
    output:
        fastq_file_qd1=outdir + "/fastp/{sample_name}_R1.qd.fastq.gz",
        fastq_file_qd2=outdir + "/fastp/{sample_name}_R2.qd.fastq.gz"
    params:
        # Kept as params (not outputs) so existing results are not
        # invalidated by newly declared output files.
        json_report=outdir + "/fastp/fastp_log/{sample_name}.fastp.json",
        html_report=outdir + "/fastp/fastp_log/{sample_name}.fastp.html"
    log:
        outdir + "/fastp/fastp_log/{sample_name}.fastp.log"
    conda:
        "envs/env_bulkDNA.yml"
    threads: 6
    resources:
        # `cpus` and `mem` are substituted into the sbatch command line.
        cpus=6,
        mem=8000
    shell:
        """
        fastp -i {input.fastq_file1} -w {threads} -I {input.fastq_file2} \
            -o {output.fastq_file_qd1} -O {output.fastq_file_qd2} \
            --json {params.json_report} --html {params.html_report} 2> {log}
        """
# Stage the fastp output of one sample onto BeeGFS for the alignment step.
# The copies are declared temp(), so Snakemake deletes them once every rule
# that consumes them has finished.
rule cp_fastq:
    input:
        fastq_file_qd1=outdir + "/fastp/{sample_name}_R1.qd.fastq.gz",
        fastq_file_qd2=outdir + "/fastp/{sample_name}_R2.qd.fastq.gz"
    output:
        fastq_file_qd1_beegfs=temp(beegfs_dir + "/fastp/{sample_name}_R1.qd.fastq.gz"),
        fastq_file_qd2_beegfs=temp(beegfs_dir + "/fastp/{sample_name}_R2.qd.fastq.gz")
    resources:
        cpus=1,
        mem=1240
    shell:
        """
        cp {input.fastq_file_qd1} {output.fastq_file_qd1_beegfs}
        cp {input.fastq_file_qd2} {output.fastq_file_qd2_beegfs}
        """
# Align the staged read pair of one sample to the mm10 reference with
# minimap2 (`-ax sr` preset, SAM output). The SAM is written to BeeGFS as a
# temp() file; minimap2's stderr goes to the per-sample log.
rule run_alignment:
    input:
        ref_mm10=ref_genome,
        fastq_file_qd1=beegfs_dir + "/fastp/{sample_name}_R1.qd.fastq.gz",
        fastq_file_qd2=beegfs_dir + "/fastp/{sample_name}_R2.qd.fastq.gz"
    output:
        minimap2_sam=temp(beegfs_dir + "/minimap2/{sample_name}.sam")
    log:
        outdir + "/minimap2/log/{sample_name}.minimap2.log"
    threads: 4
    resources:
        cpus=4,
        mem=20240
    shell:
        """
        /mnt/mcfiles/rlyu/Software/minimap2-2.7_x64-linux/minimap2 -t {threads} \
        -ax sr {input.ref_mm10} {input.fastq_file_qd1} {input.fastq_file_qd2} > {output.minimap2_sam} 2> {log}
        """
# Sort the minimap2 alignments of one sample with samtools, keeping SAM as
# the output format (`-O SAM`). The sorted SAM is a temp() file on BeeGFS.
rule sam_sort:
    input:
        minimap2_sam=beegfs_dir + "/minimap2/{sample_name}.sam"
    output:
        sam_sort=temp(beegfs_dir + "/minimap2/{sample_name}.sort.sam")
    log:
        outdir + "/minimap2/{sample_name}.sort.sam.log"
    threads: 3
    resources:
        cpus=3,
        mem=8000
    shell:
        """
        /mnt/mcfiles/rlyu/Software/samtools-1.10/samtools sort {input.minimap2_sam} --threads {threads} -O SAM -o {output.sam_sort} 2> {log}
        """
# Mark duplicate reads in the sorted SAM of one sample with GATK
# MarkDuplicates, run inside the broadinstitute/gatk container.
# The marked SAM is a temp() file on BeeGFS; the duplication metrics file is
# kept under <outdir>/markDup.
rule markduplicates:
    input:
        sam_sort=beegfs_dir + "/minimap2/{sample_name}.sort.sam"
    output:
        dm_sam=temp(beegfs_dir + "/markDup/{sample_name}.mkdup.sam"),
        dm_metric=outdir + "/markDup/{sample_name}.mkdup.metric"
    log:
        outdir + "/markDup/{sample_name}.mkdup.log"
    singularity:
        "docker://broadinstitute/gatk"
    threads: 1
    resources:
        cpus=1,
        mem=30240
    shell:
        """
        gatk MarkDuplicates -I {input.sam_sort} -M {output.dm_metric} -O {output.dm_sam} 2> {log}
        """
# Sort the duplicate-marked SAM of one sample into BAM and index it.
# Both files are temp() outputs on BeeGFS. The `.bam.bai` output is not
# passed to `samtools index`; the rule relies on the index being written
# next to the BAM under that name.
rule sort_bam:
    input:
        dm_merge_sam=beegfs_dir + "/markDup/{sample_name}.mkdup.sam"
    output:
        dm_merge_bam=temp(beegfs_dir + "/cleanBam/{sample_name}.mkdup.sort.bam"),
        dm_merge_bai=temp(beegfs_dir + "/cleanBam/{sample_name}.mkdup.sort.bam.bai")
    threads: 2
    resources:
        cpus=2,
        mem=10000
    shell:
        """
        /mnt/mcfiles/rlyu/Software/samtools-1.10/samtools sort --threads {threads} {input.dm_merge_sam} -O BAM -o {output.dm_merge_bam}
        /mnt/mcfiles/rlyu/Software/samtools-1.10/samtools index -@ {threads} {output.dm_merge_bam}
        """
# Copy the final BAM and its index of one sample from BeeGFS back into
# <outdir>/cleanBam; the copied BAM is what rule all requests.
rule cp_back_bam:
    input:
        dm_merge_bam=beegfs_dir + "/cleanBam/{sample_name}.mkdup.sort.bam",
        dm_merge_bai=beegfs_dir + "/cleanBam/{sample_name}.mkdup.sort.bam.bai"
    output:
        dm_merge_bam_bk=outdir + "/cleanBam/{sample_name}.mkdup.sort.bam",
        dm_merge_bai_bk=outdir + "/cleanBam/{sample_name}.mkdup.sort.bam.bai"
    resources:
        cpus=1,
        mem=1240
    shell:
        """
        cp {input.dm_merge_bam} {output.dm_merge_bam_bk}
        cp {input.dm_merge_bai} {output.dm_merge_bai_bk}
        """
## Attach placeholder read-group information to the final BAM of one sample
## with GATK AddOrReplaceReadGroups (run inside the broadinstitute/gatk
## container). Flowcell ("C1") and lane ("001") are fixed dummy values; only
## the sample name varies between samples.
rule addRG:
    input:
        sam_sort=outdir + "/cleanBam/{sample_name}.mkdup.sort.bam"
    output:
        rg_bam=outdir + "/cleanBam/{sample_name}.mkdup.sort.rg.bam"
    log:
        outdir + "/cleanBam/{sample_name}.sort.rg.addrg.log"
    singularity:
        "docker://broadinstitute/gatk"
    threads: 2
    resources:
        cpus=2,
        mem=20240
    # Disabled sketch for deriving lane / flowcell from a read name instead
    # of using the dummy values below:
    # lanenumber= $(grep "^[^@;]" {wildcards.sample_name} | head -n 1 | cut -f 1 | cut -f 4 -d ":")
    # flowcell=$(grep "^[^@;]" {wildcards.sample_name} | head -n 1 | cut -f 1 | cut -f 3 -d ":")
    shell:
        """
        flowcell=C1
        lanenumber=001
        sample={wildcards.sample_name}
        rgid=$flowcell.$lanenumber.$sample
        pl='unknown'
        rglb=$sample-$lanenumber
        gatk AddOrReplaceReadGroups -I {input.sam_sort} -O {output.rg_bam} -RGID $rgid -RGLB $rglb -RGPL $pl -RGPU $rgid -RGSM $sample 2> {log}
        """
# Index the read-group-tagged BAM of one sample with samtools.
# The `.bam.bai` output is not passed on the command line; the rule relies on
# `samtools index` writing the index next to the input BAM under that name.
rule index_rg_bam:
    input:
        dm_merge_sam=outdir + "/cleanBam/{sample_name}.mkdup.sort.rg.bam"
    output:
        dm_merge_bai=outdir + "/cleanBam/{sample_name}.mkdup.sort.rg.bam.bai"
    threads: 2
    resources:
        cpus=2,
        mem=10000
    shell:
        """
        /mnt/mcfiles/rlyu/Software/samtools-1.10/samtools index -@ {threads} {input.dm_merge_sam}
        """
\ No newline at end of file
This diff is collapsed.
Click to expand it.
sampleNames_meta.txt
0 → 100644
+
204
−
0
View file @
70177c4b
sample_name
SRR8454760
SRR8454683
SRR8454717
SRR8454784
SRR8454667
SRR8454689
SRR8454833
SRR8454844
SRR8454725
SRR8454752
SRR8454655
SRR8454758
SRR8454801
SRR8454779
SRR8454674
SRR8454773
SRR8454690
SRR8454704
SRR8454820
SRR8454857
SRR8454736
SRR8454741
SRR8454818
SRR8454865
SRR8454812
SRR8454868
SRR8454815
SRR8454862
SRR8454746
SRR8454731
SRR8454850
SRR8454827
SRR8454673
SRR8454790
SRR8454709
SRR8454679
SRR8454697
SRR8454774
SRR8454806
SRR8454871
SRR8454755
SRR8454722
SRR8454658
SRR8454728
SRR8454843
SRR8454834
SRR8454710
SRR8454684
SRR8454789
SRR8454660
SRR8454783
SRR8454830
SRR8454664
SRR8454787
SRR8454769
SRR8454680
SRR8454714
SRR8454763
SRR8454808
SRR8454802
SRR8454656
SRR8454751
SRR8454726
SRR8454854
SRR8454823
SRR8454829
SRR8454693
SRR8454707
SRR8454770
SRR8454699
SRR8454677
SRR8454794
SRR8454811
SRR8454866
SRR8454742
SRR8454735
SRR8454748
SRR8454732
SRR8454745
SRR8454738
SRR8454861
SRR8454816
SRR8454799
SRR8454777
SRR8454700
SRR8454694
SRR8454793
SRR8454670
SRR8454824
SRR8454853
SRR8454859
SRR8454721
SRR8454756
SRR8454805
SRR8454719
SRR8454780
SRR8454663
SRR8454764
SRR8454713
SRR8454687
SRR8454669
SRR8454837
SRR8454840
SRR8454870
SRR8454807
SRR8454729
SRR8454653
SRR8454723
SRR8454754
SRR8454659
SRR8454848
SRR8454835
SRR8454842
SRR8454782
SRR8454661
SRR8454766
SRR8454711
SRR8454685
SRR8454863
SRR8454814
SRR8454869
SRR8454730
SRR8454747
SRR8454678
SRR8454775
SRR8454702
SRR8454696
SRR8454791
SRR8454672
SRR8454708
SRR8454691
SRR8454705
SRR8454772
SRR8454778
SRR8454675
SRR8454796
SRR8454856
SRR8454821
SRR8454740
SRR8454737
SRR8454813
SRR8454864
SRR8454819
SRR8454666
SRR8454785
SRR8454688
SRR8454682
SRR8454716
SRR8454761
SRR8454838
SRR8454832
SRR8454759
SRR8454753
SRR8454724
SRR8454800
SRR8454720
SRR8454804
SRR8454712
SRR8454686
SRR8454765
SRR8454668
SRR8454718
SRR8454662
SRR8454841
SRR8454836
SRR8454739
SRR8454744
SRR8454733
SRR8454860
SRR8454671
SRR8454792
SRR8454798
SRR8454701
SRR8454695
SRR8454858
SRR8454852
SRR8454825
SRR8454828
SRR8454822
SRR8454855
SRR8454698
SRR8454795
SRR8454676
SRR8454771
SRR8454692
SRR8454706
SRR8454867
SRR8454810
SRR8454749
SRR8454734
SRR8454743
SRR8454831
SRR8454762
SRR8454681
SRR8454715
SRR8454786
SRR8454665
SRR8454768
SRR8454803
SRR8454809
SRR8454727
SRR8454750
SRR8454657
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment