diff --git a/src/sscocaller.nim b/src/sscocaller.nim index 50026692187723b9b68435ded3999297d67c45e5..9976b7a25ff41fd77e17ea621d3e0aa79b6e86e2 100755 --- a/src/sscocaller.nim +++ b/src/sscocaller.nim @@ -16,141 +16,127 @@ import math from distributions/rmath import dbinom import streams -#echo paramCount(), " ", paramStr(1) +type + allele_expr = ref object + cref: int + calt: int +type + AlleleExpr = object + cRef: int + cAlt: int + # Nucleotide = enum + # A, C, T, G + ViState = enum + stateRef, stateAlt, stateN + ViNode = object + pos: int + cRef: int + cAlt: int + pathScore: array[stateRef..stateAlt, float] + pathState: array[stateRef..stateAlt, ViState] + state: ViState + SpermViNodes = object + viNodeseq: seq[ViNode] + ## look up from SNPIndex to current Sperm SNP index + snpIndexLookUp: Table[int,int] + ## look up from current Sperm SNP index to SNPIndex + spermSnpIndexLookUp: Table[int,int] + SeqSpermViNodes = seq[SpermViNodes] +#var alexprSeq:seq[AlleleExpr] -proc sscocaller(argv: var seq[string]): int = - let doc = """ - Obtain allele counts for cell barcodes listed in barcodeFile at the SNP positions in VCF from the BAM file. - - Usage: - sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix> - - Options: - -t --threads <threads> number of BAM decompression threads [default: 4] - -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] - -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] - -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) - -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] - -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] - -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] - -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] - -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix - -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] - -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] - -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] - -h --help show help - - Examples - ./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp- - """ - let args = docopt(doc, argv=argv) - - var - ibam:Bam - threads:int - vcff:string - ivcf:VCF - barcodeFile:string - bamfile:string - selectedChrs:string - out_dir:string - mapq:int - minbsq:int - mintotal:int - maxtotal:int - mindp:int - maxdp:int - thetaREF:float - thetaALT:float - cmPmb:float - type - allele_expr = ref object - cref: int - calt: int - - proc inc_count_cref(a:allele_expr) = inc(a.cref) - proc inc_count_calt(a:allele_expr) = inc(a.calt) - proc reset_count(a:allele_expr) = - a.calt = 0 - a.cref = 0 - - # proc cref(a:allele_expr): int {.inline.} = return a.cref - # proc calt(a:allele_expr): int {.inline.} = return a.calt - - proc tostring(a:allele_expr, s:var string) {.inline.} = - s.set_len(0) - s.add("\t" & $a.cref & "," & $a.calt) +proc inc_count_cref(a:allele_expr) = inc(a.cref) +proc inc_count_calt(a:allele_expr) = inc(a.calt) +proc reset_count(a:allele_expr) = + a.calt = 0 + a.cref = 0 +proc tostring(a:allele_expr, s:var string) {.inline.} = + s.set_len(0) + s.add("\t" & $a.cref & "," & $a.calt) +proc getEmission(thetaRef=0.1,thetaAlt=0.9, + cRef:int,cAlt:int): array[stateRef..stateAlt, float] = - if($args["--threads"] != "nil"): - threads = parse_int($args["--threads"]) - else: threads = 4 - if($args["--minDP"] != "nil"): - mindp = parse_int($args["--minDP"]) - else: mindp = 1 - if($args["--maxDP"] != "nil"): - maxdp = parse_int($args["--maxDP"]) - else: maxdp = 10 + var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), + dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] + return emissionScore - if($args["--maxTotalDP"] != "nil"): - maxtotal = parse_int($args["--maxTotalDP"]) - else: maxtotal = 30 - - if($args["--minTotalDP"] != "nil"): - mintotal = parse_int($args["--minTotalDP"]) - else: mintotal = 10 - - if($args["<BAM>"] == "nil"): - quit "input bam is required" - else: bamfile = $args["<BAM>"] - - if($args["<barcodeFile>"] == "nil"): - quit "input barcodeFile is required" - else: barcodeFile = $args["<barcodeFile>"] - - if($args["<out_prefix>"] == "nil"): - quit "output prefix is required" - else: out_dir = $args["<out_prefix>"] - - if($args["<VCF>"] == "nil"): - quit "input VCF file is required" - else: vcff = $args["<VCF>"] +proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float = + if pos2<pos1: + quit "Wrong order of snps" + var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb + return rec + # +proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int, + minTotalReads:int, + chrom:string, mapq: int, + barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] = + var alleleCountTable = initTable[string,allele_expr]() + var rec_alt:char + var total_reads=0 + rec_alt = rec.ALT[0][0] + for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): + var cbt = tag[string](aln, "CB") + if cbt.isNone: + # echo "no cb " + continue + if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue + ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary + ## if not aln.flag.proper_pair: continue - if($args["--minMAPQ"] != "nil"): - mapq = parse_int($args["--minMAPQ"]) - else: mapq = 20 - if($args["--baseq"] != "nil"): - minbsq = parse_int($args["--baseq"]) - else: minbsq = 13 - if($args["--thetaREF"] != "nil"): - thetaRef = parse_float($args["--thetaREF"]) - else: thetaRef = 0.1 + if not barcodeTable.hasKey(cbt.get): + continue + var + off = aln.start.int + qoff = 0 + roff_only = 0 + base = 'n' + position = rec.POS.cint + over = 0 + for event in aln.cigar: + var cons = event.consumes + #echo $cons + if cons.query: + qoff+=event.len ## the offs to the query sequences + if cons.reference: + off += event.len ## the offs to the reference sequences + if not cons.query: + roff_only += event.len + if off <= position: + continue + over = off - position + # get the base + base = aln.base_at(qoff - over-1) + break + if aln.base_quality_at(qoff - over-1).cint < minbsq: + continue + total_reads+=1 + if alleleCountTable.hasKey(cbt.get): + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[cbt.get].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[cbt.get].inc_count_calt + continue + else: + var new_snp = allele_expr(cref:0,calt:0) + alleleCountTable[cbt.get] = new_snp + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[cbt.get].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[cbt.get].inc_count_calt + continue + if total_reads > maxTotalReads or total_reads < minTotalReads: + return initTable[string,allele_expr]() + return alleleCountTable +proc sscocaller(threads:int, vcff:string, barcodeFile:string, + bamfile:string, out_dir:string, mapq:int, + minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, + thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq): int = - if($args["--thetaALT"] != "nil"): - thetaAlt = parse_float($args["--thetaALT"]) - else: thetaAlt = 0.9 - - if($args["--cmPmb"] != "nil"): - cmPmb = parse_float($args["--cmPmb"]) - else: cmPmb = 0.1 - - echo "min mapq ", mapq, " min base qual ", minbsq - - if($args["--chrom"] != "nil"): - selectedChrs = $args["--chrom"] - else: - let a = toSeq(1..19) - if($args["--chrName"] == "nil"): - let b = map(a, proc (x: int): string = "" & $x) - let all_Chrs = concat(b,@["X","Y"]) - selectedChrs = all_Chrs.join(",") - else: - let b = map(a, proc (x: int): string = "chr" & $x) - let all_Chrs = concat(b,@["chrX","chrY"]) - selectedChrs = all_Chrs.join(",") - - let s_Chrs = selectedChrs.split(',') - echo $s_Chrs + var + ibam:Bam + ivcf:VCF if not open(ibam, bamfile, threads=threads, index = true): quit "couldn't open input bam" if not open(ivcf, vcff, threads=threads): @@ -170,107 +156,9 @@ proc sscocaller(argv: var seq[string]): int = if $kstr.s[0] == "#": continue var v = $kstr.s - ## barcodekey:spermIndex pair discard barcodeTable.hasKeyOrPut(v, ithSperm) - ithSperm+=1 - echo "fetch given SNPs in ", barcodeTable.len ," single cells" - - type - AlleleExpr = object - cRef: int - cAlt: int - # Nucleotide = enum - # A, C, T, G - ViState = enum - stateRef, stateAlt, stateN - ViNode = object - pos: int - cRef: int - cAlt: int - pathScore: array[stateRef..stateAlt, float] - pathState: array[stateRef..stateAlt, ViState] - state: ViState - SpermViNodes = object - viNodeseq: seq[ViNode] - ## look up from SNPIndex to current Sperm SNP index - snpIndexLookUp: Table[int,int] - ## look up from current Sperm SNP index to SNPIndex - spermSnpIndexLookUp: Table[int,int] - + ithSperm+=1 let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] - var alexprSeq:seq[AlleleExpr] - proc getEmission(thetaRef=0.1,thetaAlt=0.9, - cRef:int,cAlt:int): array[stateRef..stateAlt, float] = - - var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), - dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] - return emissionScore - - #echo getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=10,cAlt=0) - - proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float = - if pos2<pos1: - quit "Wrong order of snps" - var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb - return rec - # - proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int, - minTotalReads:int, - chrom:string, mapq: int, - barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] = - var alleleCountTable = initTable[string,allele_expr]() - var rec_alt:char - var total_reads=0 - rec_alt = rec.ALT[0][0] - for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): - if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue - ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary - ## if not aln.flag.proper_pair: continue - var cbt = tag[string](aln, "CB") - if cbt.isNone: continue - if not barcodeTable.hasKey(cbt.get): continue - var - off = aln.start.int - qoff = 0 - roff_only = 0 - base = 'n' - position = rec.POS.cint - over = 0 - for event in aln.cigar: - var cons = event.consumes - if cons.query: - qoff+=event.len ## the offs to the query sequences - if cons.reference: - off += event.len ## the offs to the reference sequences - # if not cons.query: - # roff_only += event.len - if off <= position: continue - over = off - position - # get the base - #base = aln.base_at(qoff - over-1) - break - if aln.base_quality_at(qoff - over-1).cint < minbsq: continue - total_reads+=1 - if alleleCountTable.hasKey(cbt.get): - if aln.base_at(qoff - over-1) == rec.REF[0]: - alleleCountTable[cbt.get].inc_count_cref - continue - if aln.base_at(qoff - over-1) == rec_alt: - alleleCountTable[cbt.get].inc_count_calt - continue - else: - var new_snp = allele_expr(cref:0,calt:0) - alleleCountTable[cbt.get] = new_snp - if aln.base_at(qoff - over-1) == rec.REF[0]: - alleleCountTable[cbt.get].inc_count_cref - continue - if aln.base_at(qoff - over-1) == rec_alt: - alleleCountTable[cbt.get].inc_count_calt - continue - if total_reads > maxTotalReads or total_reads < minTotalReads: - return initTable[string,allele_expr]() - return alleleCountTable - ## iterate through each selected chromosome for chrom in s_Chrs: # var scTable = initTable[string,SingleSpermRec]() @@ -278,8 +166,7 @@ proc sscocaller(argv: var seq[string]): int = var spermIndex = 0 ## number of non zeros var nnsize = 0 -# var snpAnnoSeq:seq[SNPAnnot] - var scSpermSeq:seq[SpermViNodes] + var scSpermSeq:SeqSpermViNodes ## matches with the order in barcodeTable scSpermSeq.setLen(barcodeTable.len) var outFileSNPanno:FileStream @@ -338,7 +225,6 @@ proc sscocaller(argv: var seq[string]): int = nnsize += 1 var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) if scSpermSeq[ithSperm].viNodeseq.len==0: - echo "first node to ", ithSperm var currentViNode = ViNode() currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] @@ -353,7 +239,6 @@ proc sscocaller(argv: var seq[string]): int = scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex else: - #echo "additional node to a cell" let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) @@ -430,8 +315,6 @@ proc sscocaller(argv: var seq[string]): int = transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos, scSpermSeq[ithSperm].viNodeseq[^(i)].pos, cmPmb=cmPmb) - # echo inferProb - # echo reverseProb if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef: outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") if state == stateRef: @@ -448,8 +331,6 @@ proc sscocaller(argv: var seq[string]): int = inferProb+=leftGap[0] reverseProb+=leftGap[1] viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2") - #echo "ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(reverseProb/inferProb) & " " & $cSNP - #echo $(inferProb/reverseProb) transitFlag = true cSNP = 1 rightGap = leftGap @@ -524,6 +405,133 @@ proc sscocaller(argv: var seq[string]): int = ivcf.close() return 0 + +when(isMainModule): + let version = "0.1.0" + var doc = format(""" + $version + + Usage: + sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix> + +Arguments: + + <BAM> the read alignment file with records of single-cell DNA reads + + <VCF> the variant call file with records of SNPs + + <barcodeFile> the text file containing the list of cell barcodes + + <out_prefix> the prefix of output files + + +Options: + -t --threads <threads> number of BAM decompression threads [default: 4] + -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] + -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] + -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) + -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] + -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] + -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] + -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] + -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix + -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] + -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] + -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] + -h --help show help + + Examples + ./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp- +""" % ["version", version]) + + let args = docopt(doc, version=version) + + var + threads:int + vcff:string + barcodeFile:string + bamfile:string + selectedChrs:string + out_dir:string + mapq:int + minbsq:int + mintotal:int + maxtotal:int + mindp:int + maxdp:int + thetaREF:float + thetaALT:float + cmPmb:float + + if($args["--threads"] != "nil"): + threads = parse_int($args["--threads"]) + else: threads = 4 + if($args["--minDP"] != "nil"): + mindp = parse_int($args["--minDP"]) + else: mindp = 1 + if($args["--maxDP"] != "nil"): + maxdp = parse_int($args["--maxDP"]) + else: maxdp = 10 + + if($args["--maxTotalDP"] != "nil"): + maxtotal = parse_int($args["--maxTotalDP"]) + else: maxtotal = 30 + + if($args["--minTotalDP"] != "nil"): + mintotal = parse_int($args["--minTotalDP"]) + else: mintotal = 10 + + if($args["<BAM>"] == "nil"): + quit "input bam is required" + else: bamfile = $args["<BAM>"] + + if($args["<barcodeFile>"] == "nil"): + quit "input barcodeFile is required" + else: barcodeFile = $args["<barcodeFile>"] + + if($args["<out_prefix>"] == "nil"): + quit "output prefix is required" + else: out_dir = $args["<out_prefix>"] + + if($args["<VCF>"] == "nil"): + quit "input VCF file is required" + else: vcff = $args["<VCF>"] + + if($args["--minMAPQ"] != "nil"): + mapq = parse_int($args["--minMAPQ"]) + else: mapq = 20 + if($args["--baseq"] != "nil"): + minbsq = parse_int($args["--baseq"]) + else: minbsq = 13 + if($args["--thetaREF"] != "nil"): + thetaRef = parse_float($args["--thetaREF"]) + else: thetaRef = 0.1 + + if($args["--thetaALT"] != "nil"): + thetaAlt = parse_float($args["--thetaALT"]) + else: thetaAlt = 0.9 + + if($args["--cmPmb"] != "nil"): + cmPmb = parse_float($args["--cmPmb"]) + else: cmPmb = 0.1 + if($args["--chrom"] != "nil"): + selectedChrs = $args["--chrom"] + else: + let a = toSeq(1..19) + if($args["--chrName"] == "nil"): + let b = map(a, proc (x: int): string = "" & $x) + let all_Chrs = concat(b,@["X","Y"]) + selectedChrs = all_Chrs.join(",") + else: + let b = map(a, proc (x: int): string = "chr" & $x) + let all_Chrs = concat(b,@["chrX","chrY"]) + selectedChrs = all_Chrs.join(",") -var args = commandLineParams() -discard sscocaller(args) + let s_Chrs = selectedChrs.split(',') + #echo $s_Chrs + + #var args = commandLineParams() + discard sscocaller(threads, vcff, barcodeFile, bamfile, + out_dir, mapq, minbsq, mintotal, + maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs) +