Skip to content
Snippets Groups Projects
Commit 5c5de158 authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

add run variant calling

parent 3c0b77b1
No related branches found
No related tags found
No related merge requests found
......@@ -19,3 +19,20 @@ In addition, before merging reads from each sperm, the CB (cell barcode, the SRR
Refer to steps defined in `run_subsample.snk`.
### Step4 merge single-sperm bam files into one Bam
`samtools` was used for merge CB-taged reads from all single sperm to one BAM file. See
`submit-mergeBams.sh`
### Step5 Findg informative SNP markers
The informative SNP markers are those SNPs which differ between the two mouse stains
that were used to generate the F1 hybrid mouse (CAST and BL6). The following steps were
applied which largely align with what has been described in the original paper [@Hinch2019-dt].
The bulk sperm sample `SRR8454653` was used for calling de no vo variants for this
mouse individual using GATK HaplotypeCaller. Only the HET SNPs with `MQ>50` AND
`DP>10` AND `DP<80` were kept.
The SNPs were further filtered to only keep the positions which have been called
as Homo_alternative `CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz` downloaded from the
dbsnp database from Mouse Genome Project.
\ No newline at end of file
# Running GATK HaplotypeCaller for variant calling usings bulk sperm sequencing data
# Author Ruqian Lyu
# Date 4/15/2021
# snakemake -s run_vcalling.snk --cores 10 --keep-going -j 1 --use-singularity --singularity-args "-B /mnt/beegfs/mccarthy/scratch:/mnt/beegfs/mccarthy/scratch,/mnt/mcfiles/rlyu/Projects/:/mnt/mcfiles/rlyu/Projects/" -j 1 -c sbatch "--mem {resources.mem} --time=120:00:00 --cpus-per-task={resources.cpus} --error=runDevnoVC.log.out"
outdir = "output/variants/"
rule all:
input: outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz","denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.sorted.vcf.gz"
rule addRG_gatk:
input:
bulkSpermBam = "output/alignment/cleanBam/SRR8454653.mkdup.sort.bam"
output:
bulkSpermRG = "output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam"
threads: 4
singularity:
"docker://broadinstitute/gatk"
resources:
cpus = 4,
mem = 10240
shell:
"""
gatk AddOrReplaceReadGroups \
-I {input.bulkSpermBam} \
-O {output.bulkSpermRG} \
-ID SRR8454653 \
-PL ILLUMINA \
-SM SRR8454653 \
-LB libx \
-PU SRR8454653 \
"""
rule sam_index:
input:
pcbam="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam"
output:
pcidx="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam.bai"
resources:
cpus=4,
mem=2000
shell:
"""
samtools index -@ {resources.cpus} {input.pcbam}
"""
rule run_gatk_hc:
input:
bulkSpermRG = "output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam",
bulkSpermRGIdx="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam.bai",
ref_fa = "./references/genome.fa"
output:
vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.vcf.gz"
singularity:
"docker://broadinstitute/gatk"
log: "log/hc/SRR8454653.hc.log"
threads:
4
resources:
cpus = 4,
mem = 51200
shell:
"""
gatk HaplotypeCaller -R {input.ref_fa} \
--native-pair-hmm-threads {resources.cpus} \
-I '{input.bulkSpermRG}' -O '{output.vcf}' 2> {log}
"""
rule run_filterVCF:
input:
vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.vcf.gz",
ref_fa = "./references/genome.fa"
output:
filter_vcf="denovoVar/SRR8454653.mkdup.sort.rg.filter.vcf.gz"
singularity:
"docker://broadinstitute/gatk"
log: "log/filterVNs/SRR8454653.vf.log"
threads:
4
resources:
cpus = 4,
mem = 51200
shell:
"""
gatk VariantFiltration -V '{input.vcf}' -O '{output.filter_vcf}' \
-R {input.ref_fa} \
--filter-name "MQ50" \
--filter-expression "MQ < 50.0" \
--filter-name "minDP" --filter-expression "DP < 10" \
--filter-name "maxDP" --filter-expression "DP > 80" \
--genotype-filter-name "NOTHETGeno" \
--genotype-filter-expression isHet!=1 \
--set-filtered-genotype-to-no-call true 2> {log}
"""
rule run_selectSNPs:
input:
filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.vcf.gz",
ref_fa = "./references/genome.fa"
output:
snp_filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.vcf.gz"
singularity:
"docker://broadinstitute/gatk"
log: "log/selectVNs/SRR8454653.vf.log"
threads:
4
resources:
cpus = 4,
mem = 51200
shell:
"""
gatk SelectVariants \
-R {input.ref_fa} \
-V {input.filter_vcf} \
--select-type-to-include SNP \
--exclude-filtered true \
-O {output.snp_filter_vcf} 2> {log}
"""
rule filter_nocalGT:
input:
filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.vcf.gz",
cast_ref = "./references/CAST_EiJ.mgp.v5.snps.dbSNP142.homo.var.chr.vcf.gz"
output:
snp_checked=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.vcf.gz"
log: "log/crossCAST/SRR8454653.vf.log"
threads:
4
resources:
cpus = 4,
mem = 11200
shell:
"""
bcftools filter --threads {resources.cpus} --exclude "GT=='./.'" --regions-file {input.cast_ref} {input.filter_vcf} -Oz -o {output.snp_checked} 2> {log}
"""
rule filter_bl6variants:
input:
bl6_ref = "./references/C57BL_6NJ.mgp.v5.snps.dbSNP142.vcf.gz",
snp_checked=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.vcf.gz"
output:
snp_checked_bl6=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz"
log: "log/crossBL6/SRR8454653.vf.log"
threads:
1
resources:
cpus = 1,
mem = 11200
shell:
"""
bedtools intersect -a {input.snp_checked} -b {input.bl6_ref} -header -sorted -v > {output.snp_checked_bl6} 2> {log}
"""
rule sort_index:
input:
snp_checked_bl6=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz"
output:
snp_checked_bl6_sorted=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.sorted.vcf.gz"
resources:
cpus = 4,
mem = 21200
shell:
"""
bcftools view {input.snp_checked_bl6} -Oz -o {input.snp_checked_bl6}.tmp.gz
bcftools sort --threads {resources.cpus} {input.snp_checked_bl6}.tmp.gz -Oz -o {output.snp_checked_bl6_sorted}
bcftools index --threads {resources.cpus} {output.snp_checked_bl6_sorted}
rm {input.snp_checked_bl6}.tmp.gz
"""
.
\ No newline at end of file
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --time=124:00:00
#SBATCH --mem=1000
#SBATCH --output=runsFilterHC.out
#SBATCH --partition=primary
/mnt/mcfiles/rlyu/Software/miniconda3.4.7/envs/rlyubase/bin/snakemake -s run_vcalling.snk --rerun-incomplete -j 1 --use-singularity --singularity-args "-B /mnt/beegfs/mccarthy/scratch:/mnt/beegfs/mccarthy/scratch,/mnt/mcfiles/rlyu/Projects/:/mnt/mcfiles/rlyu/Projects/" -c "sbatch --mem {resources.mem} --time=120:00:00 --cpus-per-task={resources.cpus} --error=Snakefile_hc.log.out"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment