## Process the large bam that contains reads from all barcodes and get allele counts ## for the list of SNPs provided in a VCF file ## Generate per cell per chr Allele counts ## Runs viterbi algorithm and output sequence of inferred viterbi state ## Authror: Ruqian Lyu ## Date: 04/11/2020 import os import docopt import strutils import hts import tables import sequtils import math from distributions/rmath import dbinom import streams type allele_expr = ref object cref: int calt: int type AlleleExpr = object cRef: int cAlt: int # Nucleotide = enum # A, C, T, G ViState = enum stateRef, stateAlt, stateN ViNode = object pos: int cRef: int cAlt: int pathScore: array[stateRef..stateAlt, float] pathState: array[stateRef..stateAlt, ViState] state: ViState SpermViNodes = object viNodeseq: seq[ViNode] ## look up from SNPIndex to current Sperm SNP index snpIndexLookUp: Table[int,int] ## look up from current Sperm SNP index to SNPIndex spermSnpIndexLookUp: Table[int,int] SeqSpermViNodes = seq[SpermViNodes] #var alexprSeq:seq[AlleleExpr] proc inc_count_cref(a:allele_expr) = inc(a.cref) proc inc_count_calt(a:allele_expr) = inc(a.calt) proc reset_count(a:allele_expr) = a.calt = 0 a.cref = 0 proc tostring(a:allele_expr, s:var string) {.inline.} = s.set_len(0) s.add("\t" & $a.cref & "," & $a.calt) proc getEmission(thetaRef=0.1,thetaAlt=0.9, cRef:int,cAlt:int): array[stateRef..stateAlt, float] = var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] return emissionScore proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float = if pos2<pos1: quit "Wrong order of snps" var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb return rec # proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int, minTotalReads:int, chrom:string, mapq: int, barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] = var alleleCountTable = initTable[string,allele_expr]() var rec_alt:char var total_reads=0 rec_alt = rec.ALT[0][0] for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): var cbt = tag[string](aln, "CB") if cbt.isNone: # echo "no cb " continue if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary ## if not aln.flag.proper_pair: continue if not barcodeTable.hasKey(cbt.get): continue var off = aln.start.int qoff = 0 roff_only = 0 base = 'n' position = rec.POS.cint over = 0 for event in aln.cigar: var cons = event.consumes #echo $cons if cons.query: qoff+=event.len ## the offs to the query sequences if cons.reference: off += event.len ## the offs to the reference sequences if not cons.query: roff_only += event.len if off <= position: continue over = off - position # get the base base = aln.base_at(qoff - over-1) break if aln.base_quality_at(qoff - over-1).cint < minbsq: continue total_reads+=1 if alleleCountTable.hasKey(cbt.get): if aln.base_at(qoff - over-1) == rec.REF[0]: alleleCountTable[cbt.get].inc_count_cref continue if aln.base_at(qoff - over-1) == rec_alt: alleleCountTable[cbt.get].inc_count_calt continue else: var new_snp = allele_expr(cref:0,calt:0) alleleCountTable[cbt.get] = new_snp if aln.base_at(qoff - over-1) == rec.REF[0]: alleleCountTable[cbt.get].inc_count_cref continue if aln.base_at(qoff - over-1) == rec_alt: alleleCountTable[cbt.get].inc_count_calt continue if total_reads > maxTotalReads or total_reads < minTotalReads: return initTable[string,allele_expr]() return alleleCountTable proc sscocaller(threads:int, vcff:string, barcodeFile:string, bamfile:string, out_dir:string, mapq:int, minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq): int = var ibam:Bam ivcf:VCF if not open(ibam, bamfile, threads=threads, index = true): quit "couldn't open input bam" if not open(ivcf, vcff, threads=threads): quit "couldn't open: vcff" var hf = hts.hts_open(cstring(barcodeFile), "r") #### TODO : Table size var barcodeTable = newOrderedTable[string,int](initialSize = 1024) # var outFileTable = initTable[string,File]() var kstr: hts.kstring_t kstr.l = 0 kstr.m = 0 kstr.s = nil var ithSperm=0 ## initiate the table with CB as has keys, allele counts (ref object) as elements while hts_getline(hf, cint(10), addr kstr) > 0: if $kstr.s[0] == "#": continue var v = $kstr.s discard barcodeTable.hasKeyOrPut(v, ithSperm) ithSperm+=1 let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] ## iterate through each selected chromosome for chrom in s_Chrs: # var scTable = initTable[string,SingleSpermRec]() var snpIndex = 0 var spermIndex = 0 ## number of non zeros var nnsize = 0 var scSpermSeq:SeqSpermViNodes ## matches with the order in barcodeTable scSpermSeq.setLen(barcodeTable.len) var outFileSNPanno:FileStream var outFileTotalCountMtx:FileStream var outFileAltCountMtx:FileStream # var outHeaderMtx:FileStream var outFileVStateMtx:FileStream var viSegmentInfo: FileStream try: outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite) # outHeaderMtx = openFileStream(out_dir & chrom & "_header.mtx", fmWrite) outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite) viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite) #echo strm.readLine() except: stderr.write getCurrentExceptionMsg() ## write headers to those mtx files and the first line place holder for total_row total_column total_entry outFileTotalCountMtx.writeLine("%%MatrixMarket matrix coordinate integer general") outFileTotalCountMtx.writeLine(' '.repeat(50)) outFileAltCountMtx.writeLine("%%MatrixMarket matrix coordinate integer general") outFileAltCountMtx.writeLine(' '.repeat(50)) outFileVStateMtx.writeLine("%%MatrixMarket matrix coordinate integer general") outFileVStateMtx.writeLine(' '.repeat(50)) outFileSNPanno.writeLine("POS" & "\t" & "REF" & "\t" & "ALT" ) for rec in ivcf.query(chrom): if rec.ALT.len > 1: continue if rec.ALT.len == 0: continue ## alleleCountTable contains for this SNP.POS, each cell barcode's allele counts var alleleCountTable = countAllele(rec=rec, ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable, minbsq=minbsq, maxTotalReads = maxtotal, minTotalReads = mintotal) if alleleCountTable.len==0: continue var rec_alt:char rec_alt = rec.ALT[0][0] # out_line = $rec.CHROM & "\t" & $rec.POS & "\t" & rec.REF[0] & "\t" & rec_alt ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations snpIndex += 1 outFileSNPanno.writeLine($rec.POS & "\t" & $rec.REF[0] & "\t" & $rec_alt ) for bc, ac in alleleCountTable.mpairs: # ac.tostring(acs) ## mindp, maxdp, they are values per cell if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue var ithSperm = barcodeTable[bc] ## write to mtx Ref count outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) nnsize += 1 var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) if scSpermSeq[ithSperm].viNodeseq.len==0: var currentViNode = ViNode() currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] currentViNode.pathState[stateRef] = stateN currentViNode.pathState[stateAlt] = stateN currentViNode.state = stateN currentViNode.pos = int(rec.POS) currentViNode.cAlt = int(ac.cAlt) currentViNode.cRef = int(ac.cRef) scSpermSeq[ithSperm].viNodeseq.add(currentViNode) scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex else: let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) var currentViNode = ViNode() # ref/alt -> ref var refTref = preVNode.pathScore[stateRef] + lnoTransProb var altTref = preVNode.pathScore[stateAlt] + ltransProb if refTref > altTref: currentViNode.pathScore[stateRef] = refTref currentViNode.pathState[stateRef] = stateRef else: currentViNode.pathScore[stateRef] = altTref currentViNode.pathState[stateRef] = stateAlt # ref/alt -> alt var refTalt = preVNode.pathScore[stateRef] + ltransProb var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb if refTalt > altTalt: currentViNode.pathScore[stateAlt] = refTalt currentViNode.pathState[stateAlt] = stateRef else: currentViNode.pathScore[stateAlt] = altTalt currentViNode.pathState[stateAlt] = stateAlt currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] currentViNode.pathScore[stateRef] += emissionArray[stateRef] currentViNode.cAlt = int(ac.cAlt) currentViNode.cRef = int(ac.cRef) currentViNode.pos = int(rec.POS) scSpermSeq[ithSperm].viNodeseq.add(currentViNode) scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex # The size line var transitFlag = false var cSNP = 0 var posEnd,posStart:int64 var inferProb,reverseProb,transProb = 0.0 var currentEm,prevEm: array[stateRef..stateAlt, float] var lastNode: ViNode var spermVNseq: SpermViNodes var ithSNP: int ## trans/no trans var rightGap,leftGap: array[0..1, float] for ithSperm in 0..(scSpermSeq.len-1): ## rightGap,leftGap = [0.0,0.0] spermVNseq = scSpermSeq[ithSperm] if spermVNseq.viNodeseq.len==0: continue lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, cRef=lastNode.cRef, cAlt=lastNode.cAlt) if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") inferProb = currentEm[stateRef] reverseProb = currentEm[stateAlt] else: scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") inferProb = currentEm[stateAlt] reverseProb = currentEm[stateRef] posEnd = lastNode.pos cSNP = 1 ## traceback for yielding the most probable state sequence for i in 1..high(scSpermSeq[ithSperm].viNodeseq): var state = scSpermSeq[ithSperm].viNodeseq[^i].state scSpermSeq[ithSperm].viNodeseq[^(i+1)].state = scSpermSeq[ithSperm].viNodeseq[^i].pathState[state] prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, cRef=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cRef, cAlt=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cAlt) ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)-i+1] transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos, scSpermSeq[ithSperm].viNodeseq[^(i)].pos, cmPmb=cmPmb) if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef: outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") if state == stateRef: # not transitioning to a different state ie still in this segment of same state inferProb+=prevEm[stateRef] reverseProb+=prevEm[stateAlt] cSNP += 1 transitFlag = false #posStart = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos else: # there is transition to different state: ref(start) to alt(end) now output the segment info posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos #leftGapSize = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos leftGap=[math.ln(transProb),math.ln(1-transProb)] inferProb+=leftGap[0] reverseProb+=leftGap[1] viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2") transitFlag = true cSNP = 1 rightGap = leftGap inferProb = prevEm[stateRef]+rightGap[0] reverseProb = prevEm[stateAlt]+rightGap[1] posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos else: outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") if state == stateAlt: # not transitioning to a different state ie still in this segment of same state inferProb += prevEm[stateAlt] reverseProb += prevEm[stateRef] cSNP += 1 transitFlag = false else: ## state transit posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos leftGap=[math.ln(transProb),math.ln(1-transProb)] inferProb += leftGap[0] reverseProb += leftGap[1] viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) transitFlag = true cSNP = 1 rightGap = leftGap inferProb = prevEm[stateAlt]+rightGap[0] reverseProb = prevEm[stateRef]+rightGap[1] posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos ## traced to the start position of the chromosome for this cell #leftGap = [0.0,0.0] if not transitFlag: ## the first SNP is included in the segment from traced from back posStart = scSpermSeq[ithSperm].viNodeseq[0].pos if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) else: viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) else: ## the first node has different state from the second, the first node has its own segment posStart = scSpermSeq[ithSperm].viNodeseq[0].pos posEnd = posStart cSNP = 1 currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, cRef=scSpermSeq[ithSperm].viNodeseq[0].cRef, cAlt=scSpermSeq[ithSperm].viNodeseq[0].cAlt) transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[0].pos, scSpermSeq[ithSperm].viNodeseq[1].pos, cmPmb=cmPmb) if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: inferProb = currentEm[stateRef]+math.ln(transProb) reverseProb = currentEm[stateAlt]+math.ln(1-transProb) viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) else: inferProb = currentEm[stateAlt]+math.ln(transProb) reverseProb = currentEm[stateRef]+math.ln(1-transProb) viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) transitFlag = false outFileTotalCountMtx.setPosition(49) outFileVStateMtx.setPosition(49) outFileAltCountMtx.setPosition(49) outFileAltCountMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) outFileVStateMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) outFileTotalCountMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) # outHeaderMtx.writeLine() outFileSNPanno.close() outFileTotalCountMtx.close() outFileAltCountMtx.close() # outHeaderMtx.close() outFileVStateMtx.close() viSegmentInfo.close() ibam.close() ivcf.close() return 0 when(isMainModule): let version = "0.1.0" var doc = format(""" $version Usage: sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix> Arguments: <BAM> the read alignment file with records of single-cell DNA reads <VCF> the variant call file with records of SNPs <barcodeFile> the text file containing the list of cell barcodes <out_prefix> the prefix of output files Options: -t --threads <threads> number of BAM decompression threads [default: 4] -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] -h --help show help Examples ./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp- """ % ["version", version]) let args = docopt(doc, version=version) var threads:int vcff:string barcodeFile:string bamfile:string selectedChrs:string out_dir:string mapq:int minbsq:int mintotal:int maxtotal:int mindp:int maxdp:int thetaREF:float thetaALT:float cmPmb:float if($args["--threads"] != "nil"): threads = parse_int($args["--threads"]) else: threads = 4 if($args["--minDP"] != "nil"): mindp = parse_int($args["--minDP"]) else: mindp = 1 if($args["--maxDP"] != "nil"): maxdp = parse_int($args["--maxDP"]) else: maxdp = 10 if($args["--maxTotalDP"] != "nil"): maxtotal = parse_int($args["--maxTotalDP"]) else: maxtotal = 30 if($args["--minTotalDP"] != "nil"): mintotal = parse_int($args["--minTotalDP"]) else: mintotal = 10 if($args["<BAM>"] == "nil"): quit "input bam is required" else: bamfile = $args["<BAM>"] if($args["<barcodeFile>"] == "nil"): quit "input barcodeFile is required" else: barcodeFile = $args["<barcodeFile>"] if($args["<out_prefix>"] == "nil"): quit "output prefix is required" else: out_dir = $args["<out_prefix>"] if($args["<VCF>"] == "nil"): quit "input VCF file is required" else: vcff = $args["<VCF>"] if($args["--minMAPQ"] != "nil"): mapq = parse_int($args["--minMAPQ"]) else: mapq = 20 if($args["--baseq"] != "nil"): minbsq = parse_int($args["--baseq"]) else: minbsq = 13 if($args["--thetaREF"] != "nil"): thetaRef = parse_float($args["--thetaREF"]) else: thetaRef = 0.1 if($args["--thetaALT"] != "nil"): thetaAlt = parse_float($args["--thetaALT"]) else: thetaAlt = 0.9 if($args["--cmPmb"] != "nil"): cmPmb = parse_float($args["--cmPmb"]) else: cmPmb = 0.1 if($args["--chrom"] != "nil"): selectedChrs = $args["--chrom"] else: let a = toSeq(1..19) if($args["--chrName"] == "nil"): let b = map(a, proc (x: int): string = "" & $x) let all_Chrs = concat(b,@["X","Y"]) selectedChrs = all_Chrs.join(",") else: let b = map(a, proc (x: int): string = "chr" & $x) let all_Chrs = concat(b,@["chrX","chrY"]) selectedChrs = all_Chrs.join(",") let s_Chrs = selectedChrs.split(',') #echo $s_Chrs #var args = commandLineParams() discard sscocaller(threads, vcff, barcodeFile, bamfile, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs)