Skip to content
Snippets Groups Projects
Commit 1c69f700 authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

breakdown to modules and refactor

parent 460f8534
No related branches found
No related tags found
No related merge requests found
Pipeline #6980 failed
## implements the traceback function
import utils
from graph import SpermViNodes
import streams
import tables
import math
proc pathTrackBack*(currentSperm: var SpermViNodes,
ithSperm: int,
thetaRef: float,
thetaAlt: float,
cmPmb: float,
outFileVStateMtx: var FileStream,
viSegmentInfo: var FileStream,
posEnd: var int64,
inferProb: var float,
reverseProb: var float): int =
var currentEm,prevEm: array[stateRef..stateAlt, float]
var posStart:int64
var transitFlag = false
var cSNP = 1
var ithSNP: int
var transProb: float
var rightGap,leftGap: array[0..1, float]
for i in 1..high(currentSperm.viNodeseq):
var state = currentSperm.viNodeseq[^i].state
currentSperm.viNodeseq[^(i+1)].state = currentSperm.viNodeseq[^i].pathState[state]
prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,
cRef=currentSperm.viNodeseq[^(i+1)].cRef,
cAlt=currentSperm.viNodeseq[^(i+1)].cAlt)
ithSNP = currentSperm.spermSnpIndexLookUp[high(currentSperm.viNodeseq)-i+1]
transProb = getTrans(currentSperm.viNodeseq[^(i+1)].pos,
currentSperm.viNodeseq[^(i)].pos,
cmPmb=cmPmb)
if currentSperm.viNodeseq[^(i+1)].state == stateRef:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
if state == stateRef:
# not transitioning to a different state ie still in this segment of same state
inferProb+=prevEm[stateRef]
reverseProb+=prevEm[stateAlt]
cSNP += 1
transitFlag = false
#posStart = currentSperm[^(i+1)].pos
else: # there is transition to different state: ref(start) to alt(end) now output the segment info
posStart = currentSperm.viNodeseq[^(i)].pos
#leftGapSize = currentSperm[^(i)].pos - currentSperm[^(i+1)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb+=leftGap[0]
reverseProb+=leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2")
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateRef]+rightGap[0]
reverseProb = prevEm[stateAlt]+rightGap[1]
posEnd = currentSperm.viNodeseq[^(i+1)].pos
else:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2")
if state == stateAlt:
# not transitioning to a different state ie still in this segment of same state
inferProb += prevEm[stateAlt]
reverseProb += prevEm[stateRef]
cSNP += 1
transitFlag = false
else:
## state transit
posStart = currentSperm.viNodeseq[^(i)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb += leftGap[0]
reverseProb += leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateAlt]+rightGap[0]
reverseProb = prevEm[stateRef]+rightGap[1]
posEnd = currentSperm.viNodeseq[^(i+1)].pos
## traced to the start position of the chromosome for this cell
#leftGap = [0.0,0.0]
if not transitFlag:
## the first SNP is included in the segment from traced from back
posStart = currentSperm.viNodeseq[0].pos
if currentSperm.viNodeseq[0].state == stateRef:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
else:
## the first node has different state from the second, the first node has its own segment
posStart = currentSperm.viNodeseq[0].pos
posEnd = posStart
cSNP = 1
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,
cRef=currentSperm.viNodeseq[0].cRef,
cAlt=currentSperm.viNodeseq[0].cAlt)
transProb = getTrans(currentSperm.viNodeseq[0].pos,
currentSperm.viNodeseq[1].pos,
cmPmb=cmPmb)
if currentSperm.viNodeseq[0].state == stateRef:
inferProb = currentEm[stateRef]+math.ln(transProb)
reverseProb = currentEm[stateAlt]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
inferProb = currentEm[stateAlt]+math.ln(transProb)
reverseProb = currentEm[stateRef]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
return 0
import tables
import utils
import streams
import hts
import math
type
# Nucleotide = enum
# A, C, T, G
SpermViNodes* = object
viNodeseq*: seq[ViNode]
## look up from SNPIndex to current Sperm SNP index
snpIndexLookUp*: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp*: Table[int,int]
SeqSpermViNodes* = seq[SpermViNodes]
proc addViNode*(barcodeTable: OrderedTableRef,
alleleCountTable: Table[string,allele_expr],
scSpermSeq: var SeqSpermViNodes,
outFileTotalCountMtx: var FileStream,
outFileAltCountMtx: var FileStream,
nnsize: var int,
mindp: int,
maxdp: int,
snpIndex: int,
thetaRef: float,
thetaAlt: float,
rec: Variant,
initProb: array,
cmPmb: float): int =
for bc, ac in alleleCountTable.pairs:
# ac.tostring(acs)
## mindp, maxdp, they are values per cell
if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue
var ithSperm = barcodeTable[bc]
## write to mtx Ref count
outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt))
outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
if scSpermSeq[ithSperm].viNodeseq.len==0:
var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
currentViNode.pathState[stateRef] = stateN
currentViNode.pathState[stateAlt] = stateN
currentViNode.state = stateN
currentViNode.pos = int(rec.POS)
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
else:
let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var currentViNode = ViNode()
# ref/alt -> ref
var refTref = preVNode.pathScore[stateRef] + lnoTransProb
var altTref = preVNode.pathScore[stateAlt] + ltransProb
if refTref > altTref:
currentViNode.pathScore[stateRef] = refTref
currentViNode.pathState[stateRef] = stateRef
else:
currentViNode.pathScore[stateRef] = altTref
currentViNode.pathState[stateRef] = stateAlt
# ref/alt -> alt
var refTalt = preVNode.pathScore[stateRef] + ltransProb
var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb
if refTalt > altTalt:
currentViNode.pathScore[stateAlt] = refTalt
currentViNode.pathState[stateAlt] = stateRef
else:
currentViNode.pathScore[stateAlt] = altTalt
currentViNode.pathState[stateAlt] = stateAlt
currentViNode.pathScore[stateAlt] += emissionArray[stateAlt]
currentViNode.pathScore[stateRef] += emissionArray[stateRef]
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
currentViNode.pos = int(rec.POS)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
return 0
# The size line
\ No newline at end of file
from distributions/rmath import dbinom
import math
import tables
import hts
type
allele_expr* = ref object
cref*: int
calt*: int
ViState* = enum
stateRef, stateAlt, stateN
ViNode* = object
pos*: int
cRef*: int
cAlt*: int
pathScore*: array[stateRef..stateAlt, float]
pathState*: array[stateRef..stateAlt, ViState]
state*: ViState
# proc cref*(a:allele_expr):int = a.cref
# proc calt*(a:allele_expr):int = a.calt
proc inc_count_cref*(a:allele_expr) = inc(a.cref)
proc inc_count_calt*(a:allele_expr) = inc(a.calt)
# proc pathScore*(v:ViNode):array[stateRef..stateAlt, float] = v.pathScore
proc getEmission*(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
return emissionScore
proc getTrans*(pos1:int64,pos2:int64,cmPmb=0.1): float =
if pos2<pos1:
quit "Wrong order of snps"
var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb
return rec
#
proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int,
minTotalReads:int,
chrom:string, mapq: int,
barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] =
var alleleCountTable = initTable[string,allele_expr]()
var rec_alt:char
var total_reads=0
rec_alt = rec.ALT[0][0]
for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint):
var cbt = tag[string](aln, barcodeTag)
var currentCB: string
if cbt.isNone:
# echo "no cb "
if not bulkBam:
continue
else:
currentCB = "bulk"
else:
currentCB = cbt.get
if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue
## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary
## if not aln.flag.proper_pair: continue
if not barcodeTable.hasKey(currentCB):
continue
var
off = aln.start.int
qoff = 0
roff_only = 0
base = 'n'
position = rec.POS.cint
over = 0
for event in aln.cigar:
var cons = event.consumes
#echo $cons
if cons.query:
qoff+=event.len ## the offs to the query sequences
if cons.reference:
off += event.len ## the offs to the reference sequences
if not cons.query:
roff_only += event.len
if off <= position:
continue
over = off - position
# get the base
base = aln.base_at(qoff - over-1)
break
if aln.base_quality_at(qoff - over-1).cint < minbsq:
continue
total_reads+=1
if alleleCountTable.hasKey(currentCB):
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
else:
var new_snp = allele_expr(cref:0,calt:0)
alleleCountTable[currentCB] = new_snp
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
if total_reads > maxTotalReads or total_reads < minTotalReads:
return initTable[string,allele_expr]()
return alleleCountTable
\ No newline at end of file
...@@ -12,123 +12,17 @@ import strutils ...@@ -12,123 +12,17 @@ import strutils
import hts import hts
import tables import tables
import sequtils import sequtils
import private/utils
import math import math
from distributions/rmath import dbinom
import streams import streams
import private/graph
import private/findpath
type
allele_expr = ref object
cref: int
calt: int
type
# Nucleotide = enum
# A, C, T, G
ViState = enum
stateRef, stateAlt, stateN
ViNode = object
pos: int
cRef: int
cAlt: int
pathScore: array[stateRef..stateAlt, float]
pathState: array[stateRef..stateAlt, ViState]
state: ViState
SpermViNodes = object
viNodeseq: seq[ViNode]
## look up from SNPIndex to current Sperm SNP index
snpIndexLookUp: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp: Table[int,int]
SeqSpermViNodes = seq[SpermViNodes]
proc inc_count_cref(a:allele_expr) = inc(a.cref)
proc inc_count_calt(a:allele_expr) = inc(a.calt)
proc getEmission(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
return emissionScore
proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float =
if pos2<pos1:
quit "Wrong order of snps"
var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb
return rec
#
proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int,
minTotalReads:int,
chrom:string, mapq: int,
barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] =
var alleleCountTable = initTable[string,allele_expr]()
var rec_alt:char
var total_reads=0
rec_alt = rec.ALT[0][0]
for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint):
var cbt = tag[string](aln, barcodeTag)
var currentCB: string
if cbt.isNone:
# echo "no cb "
if not bulkBam:
continue
else:
currentCB = "bulk"
else:
currentCB = cbt.get
if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue
## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary
## if not aln.flag.proper_pair: continue
if not barcodeTable.hasKey(currentCB):
continue
var
off = aln.start.int
qoff = 0
roff_only = 0
base = 'n'
position = rec.POS.cint
over = 0
for event in aln.cigar:
var cons = event.consumes
#echo $cons
if cons.query:
qoff+=event.len ## the offs to the query sequences
if cons.reference:
off += event.len ## the offs to the reference sequences
if not cons.query:
roff_only += event.len
if off <= position:
continue
over = off - position
# get the base
base = aln.base_at(qoff - over-1)
break
if aln.base_quality_at(qoff - over-1).cint < minbsq:
continue
total_reads+=1
if alleleCountTable.hasKey(currentCB):
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
else:
var new_snp = allele_expr(cref:0,calt:0)
alleleCountTable[currentCB] = new_snp
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
if total_reads > maxTotalReads or total_reads < minTotalReads:
return initTable[string,allele_expr]()
return alleleCountTable
proc sscocaller(threads:int, vcff:string, barcodeFile:string, proc sscocaller(threads:int, vcff:string, barcodeFile:string,
bamfile:string, out_dir:string, mapq:int, bamfile:string, out_dir:string, mapq:int,
minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int,
thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string): int = thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string): int =
var var
ibam:Bam ibam:Bam
ivcf:VCF ivcf:VCF
...@@ -167,12 +61,8 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ...@@ -167,12 +61,8 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string,
var scSpermSeq:SeqSpermViNodes var scSpermSeq:SeqSpermViNodes
## matches with the order in barcodeTable ## matches with the order in barcodeTable
scSpermSeq.setLen(barcodeTable.len) scSpermSeq.setLen(barcodeTable.len)
var outFileSNPanno:FileStream var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream
var outFileTotalCountMtx:FileStream
var outFileAltCountMtx:FileStream
# var outHeaderMtx:FileStream
var outFileVStateMtx:FileStream
var viSegmentInfo: FileStream
try: try:
outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite)
outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite)
...@@ -190,20 +80,14 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ...@@ -190,20 +80,14 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string,
outFileAltCountMtx.writeLine(' '.repeat(50)) outFileAltCountMtx.writeLine(' '.repeat(50))
outFileVStateMtx.writeLine("%%MatrixMarket matrix coordinate integer general") outFileVStateMtx.writeLine("%%MatrixMarket matrix coordinate integer general")
outFileVStateMtx.writeLine(' '.repeat(50)) outFileVStateMtx.writeLine(' '.repeat(50))
outFileSNPanno.writeLine("POS" & "\t" & "REF" & "\t" & "ALT" ) outFileSNPanno.writeLine("POS" & "\t" & "REF" & "\t" & "ALT" )
for rec in ivcf.query(chrom): for rec in ivcf.query(chrom):
if rec.ALT.len > 1: continue if rec.ALT.len > 1: continue
if rec.ALT.len == 0: continue if rec.ALT.len == 0: continue
## alleleCountTable contains for this SNP.POS, each cell barcode's allele counts ## alleleCountTable contains for this SNP.POS, each cell barcode's allele counts
var alleleCountTable = countAllele(rec=rec, ibam=ibam, var alleleCountTable = countAllele(rec=rec, ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq,
chrom=chrom, mapq=mapq, maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag)
barcodeTable=barcodeTable,
minbsq=minbsq,
maxTotalReads = maxtotal,
minTotalReads = mintotal,
bulkBam = bulkBam,
barcodeTag = barcodeTag)
if alleleCountTable.len==0: continue if alleleCountTable.len==0: continue
var rec_alt:char var rec_alt:char
...@@ -212,82 +96,24 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ...@@ -212,82 +96,24 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string,
## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations
snpIndex += 1 snpIndex += 1
outFileSNPanno.writeLine($rec.POS & "\t" & $rec.REF[0] & "\t" & $rec_alt ) outFileSNPanno.writeLine($rec.POS & "\t" & $rec.REF[0] & "\t" & $rec_alt )
for bc, ac in alleleCountTable.mpairs: discard addViNode(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq,
# ac.tostring(acs) outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx = outFileAltCountMtx, nnsize = nnsize,
## mindp, maxdp, they are values per cell mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec=rec,
if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue initProb = initProb, cmPmb = cmPmb)
var ithSperm = barcodeTable[bc] var posEnd: int64
## write to mtx Ref count var inferProb,reverseProb = 0.0
outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) var currentEm: array[stateRef..stateAlt, float]
outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
if scSpermSeq[ithSperm].viNodeseq.len==0:
var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
currentViNode.pathState[stateRef] = stateN
currentViNode.pathState[stateAlt] = stateN
currentViNode.state = stateN
currentViNode.pos = int(rec.POS)
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
else:
let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var currentViNode = ViNode()
# ref/alt -> ref
var refTref = preVNode.pathScore[stateRef] + lnoTransProb
var altTref = preVNode.pathScore[stateAlt] + ltransProb
if refTref > altTref:
currentViNode.pathScore[stateRef] = refTref
currentViNode.pathState[stateRef] = stateRef
else:
currentViNode.pathScore[stateRef] = altTref
currentViNode.pathState[stateRef] = stateAlt
# ref/alt -> alt
var refTalt = preVNode.pathScore[stateRef] + ltransProb
var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb
if refTalt > altTalt:
currentViNode.pathScore[stateAlt] = refTalt
currentViNode.pathState[stateAlt] = stateRef
else:
currentViNode.pathScore[stateAlt] = altTalt
currentViNode.pathState[stateAlt] = stateAlt
currentViNode.pathScore[stateAlt] += emissionArray[stateAlt]
currentViNode.pathScore[stateRef] += emissionArray[stateRef]
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
currentViNode.pos = int(rec.POS)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
# The size line
var transitFlag = false
var cSNP = 0
var posEnd,posStart:int64
var inferProb,reverseProb,transProb = 0.0
var currentEm,prevEm: array[stateRef..stateAlt, float]
var lastNode: ViNode var lastNode: ViNode
var spermVNseq: SpermViNodes var spermVNseq: SpermViNodes
var ithSNP: int var ithSNP: int
## trans/no trans ## trans/no trans
var rightGap,leftGap: array[0..1, float]
for ithSperm in 0..(scSpermSeq.len-1): for ithSperm in 0..(scSpermSeq.len-1):
## rightGap,leftGap = [0.0,0.0] ## rightGap,leftGap = [0.0,0.0]
spermVNseq = scSpermSeq[ithSperm] spermVNseq = scSpermSeq[ithSperm]
if spermVNseq.viNodeseq.len==0: continue if spermVNseq.viNodeseq.len==0: continue
lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)]
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt)
cRef=lastNode.cRef,
cAlt=lastNode.cAlt)
if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]:
scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef
ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1]
...@@ -301,90 +127,13 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ...@@ -301,90 +127,13 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string,
inferProb = currentEm[stateAlt] inferProb = currentEm[stateAlt]
reverseProb = currentEm[stateRef] reverseProb = currentEm[stateRef]
posEnd = lastNode.pos posEnd = lastNode.pos
cSNP = 1 ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the
## traceback for yielding the most probable state sequence ## relevant File Streams.
for i in 1..high(scSpermSeq[ithSperm].viNodeseq):
var state = scSpermSeq[ithSperm].viNodeseq[^i].state discard pathTrackBack(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm, thetaRef = thetaRef,
scSpermSeq[ithSperm].viNodeseq[^(i+1)].state = scSpermSeq[ithSperm].viNodeseq[^i].pathState[state] thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx,
prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, viSegmentInfo = viSegmentInfo, posEnd = posEnd, inferProb = inferProb,reverseProb = reverseProb)
cRef=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cRef,
cAlt=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cAlt)
ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)-i+1]
transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos,
scSpermSeq[ithSperm].viNodeseq[^(i)].pos,
cmPmb=cmPmb)
if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
if state == stateRef:
# not transitioning to a different state ie still in this segment of same state
inferProb+=prevEm[stateRef]
reverseProb+=prevEm[stateAlt]
cSNP += 1
transitFlag = false
#posStart = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos
else: # there is transition to different state: ref(start) to alt(end) now output the segment info
posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos
#leftGapSize = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb+=leftGap[0]
reverseProb+=leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2")
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateRef]+rightGap[0]
reverseProb = prevEm[stateAlt]+rightGap[1]
posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos
else:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2")
if state == stateAlt:
# not transitioning to a different state ie still in this segment of same state
inferProb += prevEm[stateAlt]
reverseProb += prevEm[stateRef]
cSNP += 1
transitFlag = false
else:
## state transit
posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb += leftGap[0]
reverseProb += leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateAlt]+rightGap[0]
reverseProb = prevEm[stateRef]+rightGap[1]
posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos
## traced to the start position of the chromosome for this cell
#leftGap = [0.0,0.0]
if not transitFlag:
## the first SNP is included in the segment from traced from back
posStart = scSpermSeq[ithSperm].viNodeseq[0].pos
if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
else:
## the first node has different state from the second, the first node has its own segment
posStart = scSpermSeq[ithSperm].viNodeseq[0].pos
posEnd = posStart
cSNP = 1
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,
cRef=scSpermSeq[ithSperm].viNodeseq[0].cRef,
cAlt=scSpermSeq[ithSperm].viNodeseq[0].cAlt)
transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[0].pos,
scSpermSeq[ithSperm].viNodeseq[1].pos,
cmPmb=cmPmb)
if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef:
inferProb = currentEm[stateRef]+math.ln(transProb)
reverseProb = currentEm[stateAlt]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
inferProb = currentEm[stateAlt]+math.ln(transProb)
reverseProb = currentEm[stateRef]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
transitFlag = false
outFileTotalCountMtx.setPosition(49) outFileTotalCountMtx.setPosition(49)
outFileVStateMtx.setPosition(49) outFileVStateMtx.setPosition(49)
outFileAltCountMtx.setPosition(49) outFileAltCountMtx.setPosition(49)
...@@ -401,11 +150,10 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string, ...@@ -401,11 +150,10 @@ proc sscocaller(threads:int, vcff:string, barcodeFile:string,
viSegmentInfo.close() viSegmentInfo.close()
ibam.close() ibam.close()
ivcf.close() ivcf.close()
return 0 return 0
when(isMainModule): when(isMainModule):
let version = "0.2.1" let version = "0.2.2"
var doc = format(""" var doc = format("""
$version $version
...@@ -424,19 +172,19 @@ Arguments: ...@@ -424,19 +172,19 @@ Arguments:
Options: Options:
-t --threads <threads> number of BAM decompression threads [default: 4] -t --threads <threads> number of BAM decompression threads [default: 4]
-cb --cellbarcode <cellbarcode> the cell barcode tag, by default it is CB -cb --cellbarcode <cellbarcode> the cell barcode tag, by default it is CB
-MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20]
-BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13]
-CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms)
-minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1]
-maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5]
-maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] -maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 30]
-minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] -minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10]
-chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix -chrName --chrName the chr names with chr prefix or not, if not supplied then no prefix
-thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1]
-thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
-cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1]
-h --help show help -h --help show help
Examples Examples
...@@ -462,65 +210,27 @@ Options: ...@@ -462,65 +210,27 @@ Options:
thetaALT:float thetaALT:float
cmPmb:float cmPmb:float
barcodeTag="CB" barcodeTag="CB"
threads = parse_int($args["--threads"])
barcodeTag = $args["--barcodeTag"]
mindp = parse_int($args["--minDP"])
maxdp = parse_int($args["--maxDP"])
maxtotal = parse_int($args["--maxTotalDP"])
mintotal = parse_int($args["--minTotalDP"])
bamfile = $args["<BAM>"]
barcodeFile = $args["<barcodeFile>"]
out_dir = $args["<out_prefix>"]
vcff = $args["<VCF>"]
mapq = parse_int($args["--minMAPQ"])
minbsq = parse_int($args["--baseq"])
thetaRef = parse_float($args["--thetaREF"])
thetaAlt = parse_float($args["--thetaALT"])
cmPmb = parse_float($args["--cmPmb"])
if($args["--threads"] != "nil"):
threads = parse_int($args["--threads"])
if($args["--cellbarcode"] != "nil"):
barcodeTag = $args["--barcodeTag"]
else: threads = 4
if($args["--minDP"] != "nil"):
mindp = parse_int($args["--minDP"])
else: mindp = 1
if($args["--maxDP"] != "nil"):
maxdp = parse_int($args["--maxDP"])
else: maxdp = 10
if($args["--maxTotalDP"] != "nil"):
maxtotal = parse_int($args["--maxTotalDP"])
else: maxtotal = 30
if($args["--minTotalDP"] != "nil"):
mintotal = parse_int($args["--minTotalDP"])
else: mintotal = 10
if($args["<BAM>"] == "nil"):
quit "input bam is required"
else: bamfile = $args["<BAM>"]
if($args["<barcodeFile>"] == "nil"):
quit "input barcodeFile is required"
else: barcodeFile = $args["<barcodeFile>"]
if($args["<out_prefix>"] == "nil"):
quit "output prefix is required"
else: out_dir = $args["<out_prefix>"]
if($args["<VCF>"] == "nil"):
quit "input VCF file is required"
else: vcff = $args["<VCF>"]
if($args["--minMAPQ"] != "nil"):
mapq = parse_int($args["--minMAPQ"])
else: mapq = 20
if($args["--baseq"] != "nil"):
minbsq = parse_int($args["--baseq"])
else: minbsq = 13
if($args["--thetaREF"] != "nil"):
thetaRef = parse_float($args["--thetaREF"])
else: thetaRef = 0.1
if($args["--thetaALT"] != "nil"):
thetaAlt = parse_float($args["--thetaALT"])
else: thetaAlt = 0.9
if($args["--cmPmb"] != "nil"):
cmPmb = parse_float($args["--cmPmb"])
else: cmPmb = 0.1
if($args["--chrom"] != "nil"): if($args["--chrom"] != "nil"):
selectedChrs = $args["--chrom"] selectedChrs = $args["--chrom"]
else: else:
let a = toSeq(1..19) let a = toSeq(1..19)
if($args["--chrName"] == "nil"): if(not args["--chrName"]):
let b = map(a, proc (x: int): string = "" & $x) let b = map(a, proc (x: int): string = "" & $x)
let all_Chrs = concat(b,@["X","Y"]) let all_Chrs = concat(b,@["X","Y"])
selectedChrs = all_Chrs.join(",") selectedChrs = all_Chrs.join(",")
...@@ -528,11 +238,8 @@ Options: ...@@ -528,11 +238,8 @@ Options:
let b = map(a, proc (x: int): string = "chr" & $x) let b = map(a, proc (x: int): string = "chr" & $x)
let all_Chrs = concat(b,@["chrX","chrY"]) let all_Chrs = concat(b,@["chrX","chrY"])
selectedChrs = all_Chrs.join(",") selectedChrs = all_Chrs.join(",")
let s_Chrs = selectedChrs.split(',') let s_Chrs = selectedChrs.split(',')
#echo $s_Chrs
#var args = commandLineParams()
discard sscocaller(threads, vcff, barcodeFile, bamfile, discard sscocaller(threads, vcff, barcodeFile, bamfile,
out_dir, mapq, minbsq, mintotal, out_dir, mapq, minbsq, mintotal,
maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag) maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment