Skip to content
Snippets Groups Projects
run_vcalling.snk 5.87 KiB
Newer Older
Ruqian Lyu's avatar
Ruqian Lyu committed
# Running GATK HaplotypeCaller for variant calling usings bulk sperm sequencing data
# Author Ruqian Lyu
# Date 4/15/2021

# snakemake -s run_vcalling.snk --cores 10 --keep-going -j 1  --use-singularity --singularity-args  "-B /mnt/beegfs/mccarthy/scratch:/mnt/beegfs/mccarthy/scratch,/mnt/mcfiles/rlyu/Projects/:/mnt/mcfiles/rlyu/Projects/" -j 1 -c sbatch "--mem {resources.mem} --time=120:00:00 --cpus-per-task={resources.cpus} --error=runDevnoVC.log.out"
outdir = "output/variants/"
rule all:
    input:  outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz","denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.sorted.vcf.gz"
    

rule addRG_gatk:
    input:
        bulkSpermBam = "output/alignment/cleanBam/SRR8454653.mkdup.sort.bam"
    output:
        bulkSpermRG = "output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam"
    threads: 4 
    singularity:
        "docker://broadinstitute/gatk"
    resources:
        cpus = 4,
        mem = 10240
    shell:
        """
        gatk AddOrReplaceReadGroups \
            -I {input.bulkSpermBam} \
            -O {output.bulkSpermRG} \
            -ID SRR8454653 \
            -PL ILLUMINA \
            -SM SRR8454653 \
            -LB libx \
            -PU SRR8454653 \
        """
rule sam_index:
    input:
        pcbam="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam"
    output:
        pcidx="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam.bai"
    resources:
        cpus=4,
        mem=2000
    shell:
        """
            samtools index -@ {resources.cpus} {input.pcbam}
            
        """
        
rule run_gatk_hc:
    input:
        bulkSpermRG = "output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam",
        bulkSpermRGIdx="output/alignment/bulkBam/SRR8454653.mkdup.sort.piccard.rg.bam.bai",
        ref_fa = "./references/genome.fa"
    output:
        vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.vcf.gz"
    singularity:
        "docker://broadinstitute/gatk"
    log: "log/hc/SRR8454653.hc.log"
    threads:
        4
    resources:
        cpus = 4,
        mem = 51200
    shell:
        """
            gatk HaplotypeCaller -R {input.ref_fa} \
            --native-pair-hmm-threads {resources.cpus}  \
            -I '{input.bulkSpermRG}' -O '{output.vcf}' 2> {log}       
        """

          
rule run_filterVCF:
    input:
        vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.vcf.gz",
        ref_fa = "./references/genome.fa"
    output:
        filter_vcf="denovoVar/SRR8454653.mkdup.sort.rg.filter.vcf.gz"
    singularity:
        "docker://broadinstitute/gatk"
    log: "log/filterVNs/SRR8454653.vf.log"
    threads:
        4
    resources:
        cpus = 4,
        mem = 51200
    shell:
        """
            gatk VariantFiltration -V '{input.vcf}' -O '{output.filter_vcf}' \
                 -R {input.ref_fa} \
                 --filter-name "MQ50" \
                 --filter-expression "MQ < 50.0" \
                 --filter-name "minDP"  --filter-expression "DP < 10" \
                 --filter-name "maxDP"  --filter-expression "DP > 80" \
                 --genotype-filter-name "NOTHETGeno" \
                 --genotype-filter-expression isHet!=1 \
                 --set-filtered-genotype-to-no-call true 2> {log}       
        """
   
rule run_selectSNPs:
    input:
        filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.vcf.gz",
        ref_fa = "./references/genome.fa"
    output:
        snp_filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.vcf.gz"
    singularity:
        "docker://broadinstitute/gatk"
    log: "log/selectVNs/SRR8454653.vf.log"
    threads:
        4
    resources:
        cpus = 4,
        mem = 51200
    shell:
        """
            gatk SelectVariants \
                -R {input.ref_fa} \
                -V {input.filter_vcf} \
                --select-type-to-include SNP \
                --exclude-filtered true \
                -O {output.snp_filter_vcf}      2> {log}
        """ 

rule filter_nocalGT:
    input:
        filter_vcf=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.vcf.gz",
        cast_ref = "./references/CAST_EiJ.mgp.v5.snps.dbSNP142.homo.var.chr.vcf.gz"
    output:
        snp_checked=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.vcf.gz"
    log: "log/crossCAST/SRR8454653.vf.log"
    threads:
        4
    resources:
        cpus = 4,
        mem = 11200
    shell:
        """
            bcftools filter --threads {resources.cpus} --exclude "GT=='./.'" --regions-file {input.cast_ref} {input.filter_vcf} -Oz -o {output.snp_checked} 2> {log}
            
        """ 
rule filter_bl6variants:
    input:
        bl6_ref = "./references/C57BL_6NJ.mgp.v5.snps.dbSNP142.vcf.gz",
        snp_checked=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.vcf.gz"
    output:
        snp_checked_bl6=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz"
    log: "log/crossBL6/SRR8454653.vf.log"
    threads:
        1
    resources:
        cpus = 1,
        mem = 11200
    shell:
        """
            bedtools intersect -a {input.snp_checked} -b {input.bl6_ref} -header -sorted -v > {output.snp_checked_bl6} 2> {log}
        """
    
rule sort_index:
    input:
        snp_checked_bl6=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.vcf.gz"
    output:
        snp_checked_bl6_sorted=outdir+"denovoVar/SRR8454653.mkdup.sort.rg.filter.snps.castVar.bl6.nonVar.sorted.vcf.gz" 
    resources:
        cpus = 4,
        mem = 21200
    shell:
        """
            bcftools view {input.snp_checked_bl6} -Oz -o {input.snp_checked_bl6}.tmp.gz
            bcftools sort --threads {resources.cpus} {input.snp_checked_bl6}.tmp.gz -Oz -o {output.snp_checked_bl6_sorted}
            bcftools index --threads {resources.cpus} {output.snp_checked_bl6_sorted}
            rm {input.snp_checked_bl6}.tmp.gz
        """
        .