From 1c69f700a2dc4c82e9a5ebbc23cda8cd82779bc1 Mon Sep 17 00:00:00 2001 From: rlyu <rlyu@svi.edu.au> Date: Wed, 23 Jun 2021 10:37:50 +1000 Subject: [PATCH] breakdown to modules and refactor --- src/private/findpath.nim | 109 +++++++++++ src/private/graph.nim | 90 +++++++++ src/private/utils.nim | 108 +++++++++++ src/sscocaller.nim | 403 ++++++--------------------------------- 4 files changed, 362 insertions(+), 348 deletions(-) create mode 100755 src/private/findpath.nim create mode 100755 src/private/graph.nim create mode 100755 src/private/utils.nim diff --git a/src/private/findpath.nim b/src/private/findpath.nim new file mode 100755 index 0000000..cdbd8b9 --- /dev/null +++ b/src/private/findpath.nim @@ -0,0 +1,109 @@ +## implements the traceback function +import utils +from graph import SpermViNodes +import streams +import tables +import math + + +proc pathTrackBack*(currentSperm: var SpermViNodes, + ithSperm: int, + thetaRef: float, + thetaAlt: float, + cmPmb: float, + outFileVStateMtx: var FileStream, + viSegmentInfo: var FileStream, + posEnd: var int64, + inferProb: var float, + reverseProb: var float): int = + var currentEm,prevEm: array[stateRef..stateAlt, float] + var posStart:int64 + var transitFlag = false + var cSNP = 1 + var ithSNP: int + var transProb: float + var rightGap,leftGap: array[0..1, float] + for i in 1..high(currentSperm.viNodeseq): + var state = currentSperm.viNodeseq[^i].state + currentSperm.viNodeseq[^(i+1)].state = currentSperm.viNodeseq[^i].pathState[state] + prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, + cRef=currentSperm.viNodeseq[^(i+1)].cRef, + cAlt=currentSperm.viNodeseq[^(i+1)].cAlt) + ithSNP = currentSperm.spermSnpIndexLookUp[high(currentSperm.viNodeseq)-i+1] + transProb = getTrans(currentSperm.viNodeseq[^(i+1)].pos, + currentSperm.viNodeseq[^(i)].pos, + cmPmb=cmPmb) + if currentSperm.viNodeseq[^(i+1)].state == stateRef: + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") + if state == stateRef: + # not transitioning to a different state ie still in this segment of same state + inferProb+=prevEm[stateRef] + reverseProb+=prevEm[stateAlt] + cSNP += 1 + transitFlag = false + #posStart = currentSperm[^(i+1)].pos + else: # there is transition to different state: ref(start) to alt(end) now output the segment info + posStart = currentSperm.viNodeseq[^(i)].pos + #leftGapSize = currentSperm[^(i)].pos - currentSperm[^(i+1)].pos + leftGap=[math.ln(transProb),math.ln(1-transProb)] + inferProb+=leftGap[0] + reverseProb+=leftGap[1] + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2") + transitFlag = true + cSNP = 1 + rightGap = leftGap + inferProb = prevEm[stateRef]+rightGap[0] + reverseProb = prevEm[stateAlt]+rightGap[1] + posEnd = currentSperm.viNodeseq[^(i+1)].pos + else: + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") + if state == stateAlt: + # not transitioning to a different state ie still in this segment of same state + inferProb += prevEm[stateAlt] + reverseProb += prevEm[stateRef] + cSNP += 1 + transitFlag = false + else: + ## state transit + posStart = currentSperm.viNodeseq[^(i)].pos + leftGap=[math.ln(transProb),math.ln(1-transProb)] + inferProb += leftGap[0] + reverseProb += leftGap[1] + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + transitFlag = true + cSNP = 1 + rightGap = leftGap + inferProb = prevEm[stateAlt]+rightGap[0] + reverseProb = prevEm[stateRef]+rightGap[1] + posEnd = currentSperm.viNodeseq[^(i+1)].pos + ## traced to the start position of the chromosome for this cell + #leftGap = [0.0,0.0] + if not transitFlag: + ## the first SNP is included in the segment from traced from back + posStart = currentSperm.viNodeseq[0].pos + if currentSperm.viNodeseq[0].state == stateRef: + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + else: + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) + else: + ## the first node has different state from the second, the first node has its own segment + posStart = currentSperm.viNodeseq[0].pos + posEnd = posStart + cSNP = 1 + currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, + cRef=currentSperm.viNodeseq[0].cRef, + cAlt=currentSperm.viNodeseq[0].cAlt) + transProb = getTrans(currentSperm.viNodeseq[0].pos, + currentSperm.viNodeseq[1].pos, + cmPmb=cmPmb) + if currentSperm.viNodeseq[0].state == stateRef: + inferProb = currentEm[stateRef]+math.ln(transProb) + reverseProb = currentEm[stateAlt]+math.ln(1-transProb) + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + else: + inferProb = currentEm[stateAlt]+math.ln(transProb) + reverseProb = currentEm[stateRef]+math.ln(1-transProb) + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) + + + return 0 diff --git a/src/private/graph.nim b/src/private/graph.nim new file mode 100755 index 0000000..4ebb0fb --- /dev/null +++ b/src/private/graph.nim @@ -0,0 +1,90 @@ +import tables +import utils +import streams +import hts +import math + +type + # Nucleotide = enum + # A, C, T, G + SpermViNodes* = object + viNodeseq*: seq[ViNode] + ## look up from SNPIndex to current Sperm SNP index + snpIndexLookUp*: Table[int,int] + ## look up from current Sperm SNP index to SNPIndex + spermSnpIndexLookUp*: Table[int,int] + SeqSpermViNodes* = seq[SpermViNodes] + + +proc addViNode*(barcodeTable: OrderedTableRef, + alleleCountTable: Table[string,allele_expr], + scSpermSeq: var SeqSpermViNodes, + outFileTotalCountMtx: var FileStream, + outFileAltCountMtx: var FileStream, + nnsize: var int, + mindp: int, + maxdp: int, + snpIndex: int, + thetaRef: float, + thetaAlt: float, + rec: Variant, + initProb: array, + cmPmb: float): int = + for bc, ac in alleleCountTable.pairs: + # ac.tostring(acs) + ## mindp, maxdp, they are values per cell + if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue + var ithSperm = barcodeTable[bc] + ## write to mtx Ref count + outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) + outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) + nnsize += 1 + var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) + if scSpermSeq[ithSperm].viNodeseq.len==0: + var currentViNode = ViNode() + currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] + currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] + currentViNode.pathState[stateRef] = stateN + currentViNode.pathState[stateAlt] = stateN + currentViNode.state = stateN + currentViNode.pos = int(rec.POS) + currentViNode.cAlt = int(ac.cAlt) + currentViNode.cRef = int(ac.cRef) + + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + else: + let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] + var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) + var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) + var currentViNode = ViNode() + # ref/alt -> ref + var refTref = preVNode.pathScore[stateRef] + lnoTransProb + var altTref = preVNode.pathScore[stateAlt] + ltransProb + if refTref > altTref: + currentViNode.pathScore[stateRef] = refTref + currentViNode.pathState[stateRef] = stateRef + else: + currentViNode.pathScore[stateRef] = altTref + currentViNode.pathState[stateRef] = stateAlt + # ref/alt -> alt + var refTalt = preVNode.pathScore[stateRef] + ltransProb + var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb + + if refTalt > altTalt: + currentViNode.pathScore[stateAlt] = refTalt + currentViNode.pathState[stateAlt] = stateRef + else: + currentViNode.pathScore[stateAlt] = altTalt + currentViNode.pathState[stateAlt] = stateAlt + currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] + currentViNode.pathScore[stateRef] += emissionArray[stateRef] + currentViNode.cAlt = int(ac.cAlt) + currentViNode.cRef = int(ac.cRef) + currentViNode.pos = int(rec.POS) + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + return 0 +# The size line \ No newline at end of file diff --git a/src/private/utils.nim b/src/private/utils.nim new file mode 100755 index 0000000..3d5eb4f --- /dev/null +++ b/src/private/utils.nim @@ -0,0 +1,108 @@ + +from distributions/rmath import dbinom +import math +import tables +import hts + +type + allele_expr* = ref object + cref*: int + calt*: int + ViState* = enum + stateRef, stateAlt, stateN + ViNode* = object + pos*: int + cRef*: int + cAlt*: int + pathScore*: array[stateRef..stateAlt, float] + pathState*: array[stateRef..stateAlt, ViState] + state*: ViState + +# proc cref*(a:allele_expr):int = a.cref +# proc calt*(a:allele_expr):int = a.calt + +proc inc_count_cref*(a:allele_expr) = inc(a.cref) +proc inc_count_calt*(a:allele_expr) = inc(a.calt) +# proc pathScore*(v:ViNode):array[stateRef..stateAlt, float] = v.pathScore + +proc getEmission*(thetaRef=0.1,thetaAlt=0.9, + cRef:int,cAlt:int): array[stateRef..stateAlt, float] = + + var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), + dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] + return emissionScore + +proc getTrans*(pos1:int64,pos2:int64,cmPmb=0.1): float = + if pos2<pos1: + quit "Wrong order of snps" + var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb + return rec + # +proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int, + minTotalReads:int, + chrom:string, mapq: int, + barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] = + var alleleCountTable = initTable[string,allele_expr]() + var rec_alt:char + var total_reads=0 + rec_alt = rec.ALT[0][0] + for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): + var cbt = tag[string](aln, barcodeTag) + var currentCB: string + if cbt.isNone: + # echo "no cb " + if not bulkBam: + continue + else: + currentCB = "bulk" + else: + currentCB = cbt.get + if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue + ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary + ## if not aln.flag.proper_pair: continue + if not barcodeTable.hasKey(currentCB): + continue + var + off = aln.start.int + qoff = 0 + roff_only = 0 + base = 'n' + position = rec.POS.cint + over = 0 + for event in aln.cigar: + var cons = event.consumes + #echo $cons + if cons.query: + qoff+=event.len ## the offs to the query sequences + if cons.reference: + off += event.len ## the offs to the reference sequences + if not cons.query: + roff_only += event.len + if off <= position: + continue + over = off - position + # get the base + base = aln.base_at(qoff - over-1) + break + if aln.base_quality_at(qoff - over-1).cint < minbsq: + continue + total_reads+=1 + if alleleCountTable.hasKey(currentCB): + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[currentCB].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[currentCB].inc_count_calt + continue + else: + var new_snp = allele_expr(cref:0,calt:0) + alleleCountTable[currentCB] = new_snp + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[currentCB].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[currentCB].inc_count_calt + continue + if total_reads > maxTotalReads or total_reads < minTotalReads: + return initTable[string,allele_expr]() + return alleleCountTable \ No newline at end of file diff --git a/src/sscocaller.nim b/src/sscocaller.nim index 83d7b2a..6f2fb95 100755 --- a/src/sscocaller.nim +++ b/src/sscocaller.nim @@ -12,123 +12,17 @@ import strutils import hts import tables import sequtils +import private/utils + import math -from distributions/rmath import dbinom import streams +import private/graph +import private/findpath -type - allele_expr = ref object - cref: int - calt: int -type - # Nucleotide = enum - # A, C, T, G - ViState = enum - stateRef, stateAlt, stateN - ViNode = object - pos: int - cRef: int - cAlt: int - pathScore: array[stateRef..stateAlt, float] - pathState: array[stateRef..stateAlt, ViState] - state: ViState - SpermViNodes = object - viNodeseq: seq[ViNode] - ## look up from SNPIndex to current Sperm SNP index - snpIndexLookUp: Table[int,int] - ## look up from current Sperm SNP index to SNPIndex - spermSnpIndexLookUp: Table[int,int] - SeqSpermViNodes = seq[SpermViNodes] - -proc inc_count_cref(a:allele_expr) = inc(a.cref) -proc inc_count_calt(a:allele_expr) = inc(a.calt) - -proc getEmission(thetaRef=0.1,thetaAlt=0.9, - cRef:int,cAlt:int): array[stateRef..stateAlt, float] = - - var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), - dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] - return emissionScore - -proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float = - if pos2<pos1: - quit "Wrong order of snps" - var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb - return rec - # -proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int, - minTotalReads:int, - chrom:string, mapq: int, - barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] = - var alleleCountTable = initTable[string,allele_expr]() - var rec_alt:char - var total_reads=0 - rec_alt = rec.ALT[0][0] - for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): - var cbt = tag[string](aln, barcodeTag) - var currentCB: string - if cbt.isNone: - # echo "no cb " - if not bulkBam: - continue - else: - currentCB = "bulk" - else: - currentCB = cbt.get - if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue - ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary - ## if not aln.flag.proper_pair: continue - if not barcodeTable.hasKey(currentCB): - continue - var - off = aln.start.int - qoff = 0 - roff_only = 0 - base = 'n' - position = rec.POS.cint - over = 0 - for event in aln.cigar: - var cons = event.consumes - #echo $cons - if cons.query: - qoff+=event.len ## the offs to the query sequences - if cons.reference: - off += event.len ## the offs to the reference sequences - if not cons.query: - roff_only += event.len - if off <= position: - continue - over = off - position - # get the base - base = aln.base_at(qoff - over-1) - break - if aln.base_quality_at(qoff - over-1).cint < minbsq: - continue - total_reads+=1 - if alleleCountTable.hasKey(currentCB): - if aln.base_at(qoff - over-1) == rec.REF[0]: - alleleCountTable[currentCB].inc_count_cref - continue - if aln.base_at(qoff - over-1) == rec_alt: - alleleCountTable[currentCB].inc_count_calt - continue - else: - var new_snp = allele_expr(cref:0,calt:0) - alleleCountTable[currentCB] = new_snp - if aln.base_at(qoff - over-1) == rec.REF[0]: - alleleCountTable[currentCB].inc_count_cref - continue - if aln.base_at(qoff - over-1) == rec_alt: - alleleCountTable[currentCB].inc_count_calt - continue - if total_reads > maxTotalReads or total_reads < minTotalReads: - return initTable[string,allele_expr]() - return alleleCountTable proc sscocaller(threads:int, vcff:string, barcodeFile:string, bamfile:string, out_dir:string, mapq:int, minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string): int = - var ibam:Bam ivcf:VCF @@ -167,12 +61,8 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, var scSpermSeq:SeqSpermViNodes ## matches with the order in barcodeTable scSpermSeq.setLen(barcodeTable.len) - var outFileSNPanno:FileStream - var outFileTotalCountMtx:FileStream - var outFileAltCountMtx:FileStream - # var outHeaderMtx:FileStream - var outFileVStateMtx:FileStream - var viSegmentInfo: FileStream + var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream + try: outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) @@ -190,20 +80,14 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, outFileAltCountMtx.writeLine(' '.repeat(50)) outFileVStateMtx.writeLine("%%MatrixMarket matrix coordinate integer general") outFileVStateMtx.writeLine(' '.repeat(50)) - outFileSNPanno.writeLine("POS" & "\t" & "REF" & "\t" & "ALT" ) + for rec in ivcf.query(chrom): if rec.ALT.len > 1: continue if rec.ALT.len == 0: continue ## alleleCountTable contains for this SNP.POS, each cell barcode's allele counts - var alleleCountTable = countAllele(rec=rec, ibam=ibam, - chrom=chrom, mapq=mapq, - barcodeTable=barcodeTable, - minbsq=minbsq, - maxTotalReads = maxtotal, - minTotalReads = mintotal, - bulkBam = bulkBam, - barcodeTag = barcodeTag) + var alleleCountTable = countAllele(rec=rec, ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, + maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag) if alleleCountTable.len==0: continue var rec_alt:char @@ -212,82 +96,24 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations snpIndex += 1 outFileSNPanno.writeLine($rec.POS & "\t" & $rec.REF[0] & "\t" & $rec_alt ) - for bc, ac in alleleCountTable.mpairs: - # ac.tostring(acs) - ## mindp, maxdp, they are values per cell - if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue - var ithSperm = barcodeTable[bc] - ## write to mtx Ref count - outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) - outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) - nnsize += 1 - var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) - if scSpermSeq[ithSperm].viNodeseq.len==0: - var currentViNode = ViNode() - currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] - currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] - currentViNode.pathState[stateRef] = stateN - currentViNode.pathState[stateAlt] = stateN - currentViNode.state = stateN - currentViNode.pos = int(rec.POS) - currentViNode.cAlt = int(ac.cAlt) - currentViNode.cRef = int(ac.cRef) - - scSpermSeq[ithSperm].viNodeseq.add(currentViNode) - scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len - scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex - else: - let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] - var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) - var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) - var currentViNode = ViNode() - # ref/alt -> ref - var refTref = preVNode.pathScore[stateRef] + lnoTransProb - var altTref = preVNode.pathScore[stateAlt] + ltransProb - if refTref > altTref: - currentViNode.pathScore[stateRef] = refTref - currentViNode.pathState[stateRef] = stateRef - else: - currentViNode.pathScore[stateRef] = altTref - currentViNode.pathState[stateRef] = stateAlt - # ref/alt -> alt - var refTalt = preVNode.pathScore[stateRef] + ltransProb - var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb - - if refTalt > altTalt: - currentViNode.pathScore[stateAlt] = refTalt - currentViNode.pathState[stateAlt] = stateRef - else: - currentViNode.pathScore[stateAlt] = altTalt - currentViNode.pathState[stateAlt] = stateAlt - currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] - currentViNode.pathScore[stateRef] += emissionArray[stateRef] - currentViNode.cAlt = int(ac.cAlt) - currentViNode.cRef = int(ac.cRef) - currentViNode.pos = int(rec.POS) - scSpermSeq[ithSperm].viNodeseq.add(currentViNode) - scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len - scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex - # The size line - - var transitFlag = false - var cSNP = 0 - var posEnd,posStart:int64 - var inferProb,reverseProb,transProb = 0.0 - var currentEm,prevEm: array[stateRef..stateAlt, float] + discard addViNode(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq, + outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx = outFileAltCountMtx, nnsize = nnsize, + mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec=rec, + initProb = initProb, cmPmb = cmPmb) + var posEnd: int64 + var inferProb,reverseProb = 0.0 + var currentEm: array[stateRef..stateAlt, float] var lastNode: ViNode var spermVNseq: SpermViNodes var ithSNP: int ## trans/no trans - var rightGap,leftGap: array[0..1, float] + for ithSperm in 0..(scSpermSeq.len-1): ## rightGap,leftGap = [0.0,0.0] spermVNseq = scSpermSeq[ithSperm] if spermVNseq.viNodeseq.len==0: continue lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] - currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, - cRef=lastNode.cRef, - cAlt=lastNode.cAlt) + currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt) if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] @@ -301,90 +127,13 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, inferProb = currentEm[stateAlt] reverseProb = currentEm[stateRef] posEnd = lastNode.pos - cSNP = 1 - ## traceback for yielding the most probable state sequence - for i in 1..high(scSpermSeq[ithSperm].viNodeseq): - var state = scSpermSeq[ithSperm].viNodeseq[^i].state - scSpermSeq[ithSperm].viNodeseq[^(i+1)].state = scSpermSeq[ithSperm].viNodeseq[^i].pathState[state] - prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, - cRef=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cRef, - cAlt=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cAlt) - ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)-i+1] - transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos, - scSpermSeq[ithSperm].viNodeseq[^(i)].pos, - cmPmb=cmPmb) - if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef: - outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") - if state == stateRef: - # not transitioning to a different state ie still in this segment of same state - inferProb+=prevEm[stateRef] - reverseProb+=prevEm[stateAlt] - cSNP += 1 - transitFlag = false - #posStart = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos - else: # there is transition to different state: ref(start) to alt(end) now output the segment info - posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - #leftGapSize = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos - leftGap=[math.ln(transProb),math.ln(1-transProb)] - inferProb+=leftGap[0] - reverseProb+=leftGap[1] - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2") - transitFlag = true - cSNP = 1 - rightGap = leftGap - inferProb = prevEm[stateRef]+rightGap[0] - reverseProb = prevEm[stateAlt]+rightGap[1] - posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos - else: - outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") - if state == stateAlt: - # not transitioning to a different state ie still in this segment of same state - inferProb += prevEm[stateAlt] - reverseProb += prevEm[stateRef] - cSNP += 1 - transitFlag = false - else: - ## state transit - posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - leftGap=[math.ln(transProb),math.ln(1-transProb)] - inferProb += leftGap[0] - reverseProb += leftGap[1] - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) - transitFlag = true - cSNP = 1 - rightGap = leftGap - inferProb = prevEm[stateAlt]+rightGap[0] - reverseProb = prevEm[stateRef]+rightGap[1] - posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos - ## traced to the start position of the chromosome for this cell - #leftGap = [0.0,0.0] - if not transitFlag: - ## the first SNP is included in the segment from traced from back - posStart = scSpermSeq[ithSperm].viNodeseq[0].pos - if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) - else: - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) - else: - ## the first node has different state from the second, the first node has its own segment - posStart = scSpermSeq[ithSperm].viNodeseq[0].pos - posEnd = posStart - cSNP = 1 - currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, - cRef=scSpermSeq[ithSperm].viNodeseq[0].cRef, - cAlt=scSpermSeq[ithSperm].viNodeseq[0].cAlt) - transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[0].pos, - scSpermSeq[ithSperm].viNodeseq[1].pos, - cmPmb=cmPmb) - if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: - inferProb = currentEm[stateRef]+math.ln(transProb) - reverseProb = currentEm[stateAlt]+math.ln(1-transProb) - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) - else: - inferProb = currentEm[stateAlt]+math.ln(transProb) - reverseProb = currentEm[stateRef]+math.ln(1-transProb) - viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) - transitFlag = false + ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the + ## relevant File Streams. + + discard pathTrackBack(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm, thetaRef = thetaRef, + thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx, + viSegmentInfo = viSegmentInfo, posEnd = posEnd, inferProb = inferProb,reverseProb = reverseProb) + outFileTotalCountMtx.setPosition(49) outFileVStateMtx.setPosition(49) outFileAltCountMtx.setPosition(49) @@ -401,11 +150,10 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, viSegmentInfo.close() ibam.close() ivcf.close() - return 0 when(isMainModule): - let version = "0.2.1" + let version = "0.2.2" var doc = format(""" $version @@ -424,19 +172,19 @@ Arguments: Options: - -t --threads <threads> number of BAM decompression threads [default: 4] - -cb --cellbarcode <cellbarcode> the cell barcode tag, by default it is CB - -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] + -t --threads <threads> number of BAM decompression threads [default: 4] + -cb --cellbarcode <cellbarcode> the cell barcode tag, by default it is CB + -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] - -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) - -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] - -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] - -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] - -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] - -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix - -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] - -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] - -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] + -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) + -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] + -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] + -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 30] + -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] + -chrName --chrName the chr names with chr prefix or not, if not supplied then no prefix + -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] + -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] + -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] -h --help show help Examples @@ -462,65 +210,27 @@ Options: thetaALT:float cmPmb:float barcodeTag="CB" + threads = parse_int($args["--threads"]) + barcodeTag = $args["--barcodeTag"] + mindp = parse_int($args["--minDP"]) + maxdp = parse_int($args["--maxDP"]) + maxtotal = parse_int($args["--maxTotalDP"]) + mintotal = parse_int($args["--minTotalDP"]) + bamfile = $args["<BAM>"] + barcodeFile = $args["<barcodeFile>"] + out_dir = $args["<out_prefix>"] + vcff = $args["<VCF>"] + mapq = parse_int($args["--minMAPQ"]) + minbsq = parse_int($args["--baseq"]) + thetaRef = parse_float($args["--thetaREF"]) + thetaAlt = parse_float($args["--thetaALT"]) + cmPmb = parse_float($args["--cmPmb"]) - if($args["--threads"] != "nil"): - threads = parse_int($args["--threads"]) - if($args["--cellbarcode"] != "nil"): - barcodeTag = $args["--barcodeTag"] - else: threads = 4 - if($args["--minDP"] != "nil"): - mindp = parse_int($args["--minDP"]) - else: mindp = 1 - if($args["--maxDP"] != "nil"): - maxdp = parse_int($args["--maxDP"]) - else: maxdp = 10 - - if($args["--maxTotalDP"] != "nil"): - maxtotal = parse_int($args["--maxTotalDP"]) - else: maxtotal = 30 - - if($args["--minTotalDP"] != "nil"): - mintotal = parse_int($args["--minTotalDP"]) - else: mintotal = 10 - - if($args["<BAM>"] == "nil"): - quit "input bam is required" - else: bamfile = $args["<BAM>"] - - if($args["<barcodeFile>"] == "nil"): - quit "input barcodeFile is required" - else: barcodeFile = $args["<barcodeFile>"] - - if($args["<out_prefix>"] == "nil"): - quit "output prefix is required" - else: out_dir = $args["<out_prefix>"] - - if($args["<VCF>"] == "nil"): - quit "input VCF file is required" - else: vcff = $args["<VCF>"] - - if($args["--minMAPQ"] != "nil"): - mapq = parse_int($args["--minMAPQ"]) - else: mapq = 20 - if($args["--baseq"] != "nil"): - minbsq = parse_int($args["--baseq"]) - else: minbsq = 13 - if($args["--thetaREF"] != "nil"): - thetaRef = parse_float($args["--thetaREF"]) - else: thetaRef = 0.1 - - if($args["--thetaALT"] != "nil"): - thetaAlt = parse_float($args["--thetaALT"]) - else: thetaAlt = 0.9 - - if($args["--cmPmb"] != "nil"): - cmPmb = parse_float($args["--cmPmb"]) - else: cmPmb = 0.1 if($args["--chrom"] != "nil"): selectedChrs = $args["--chrom"] else: let a = toSeq(1..19) - if($args["--chrName"] == "nil"): + if(not args["--chrName"]): let b = map(a, proc (x: int): string = "" & $x) let all_Chrs = concat(b,@["X","Y"]) selectedChrs = all_Chrs.join(",") @@ -528,11 +238,8 @@ Options: let b = map(a, proc (x: int): string = "chr" & $x) let all_Chrs = concat(b,@["chrX","chrY"]) selectedChrs = all_Chrs.join(",") - let s_Chrs = selectedChrs.split(',') - #echo $s_Chrs - - #var args = commandLineParams() + discard sscocaller(threads, vcff, barcodeFile, bamfile, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag) -- GitLab