Commit 1c69f700 authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

breakdown to modules and refactor

parent 460f8534
Pipeline #6980 failed with stages
in 1 minute and 3 seconds
## implements the traceback function
import utils
from graph import SpermViNodes
import streams
import tables
import math
proc pathTrackBack*(currentSperm: var SpermViNodes,
ithSperm: int,
thetaRef: float,
thetaAlt: float,
cmPmb: float,
outFileVStateMtx: var FileStream,
viSegmentInfo: var FileStream,
posEnd: var int64,
inferProb: var float,
reverseProb: var float): int =
var currentEm,prevEm: array[stateRef..stateAlt, float]
var posStart:int64
var transitFlag = false
var cSNP = 1
var ithSNP: int
var transProb: float
var rightGap,leftGap: array[0..1, float]
for i in 1..high(currentSperm.viNodeseq):
var state = currentSperm.viNodeseq[^i].state
currentSperm.viNodeseq[^(i+1)].state = currentSperm.viNodeseq[^i].pathState[state]
prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,
cRef=currentSperm.viNodeseq[^(i+1)].cRef,
cAlt=currentSperm.viNodeseq[^(i+1)].cAlt)
ithSNP = currentSperm.spermSnpIndexLookUp[high(currentSperm.viNodeseq)-i+1]
transProb = getTrans(currentSperm.viNodeseq[^(i+1)].pos,
currentSperm.viNodeseq[^(i)].pos,
cmPmb=cmPmb)
if currentSperm.viNodeseq[^(i+1)].state == stateRef:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
if state == stateRef:
# not transitioning to a different state ie still in this segment of same state
inferProb+=prevEm[stateRef]
reverseProb+=prevEm[stateAlt]
cSNP += 1
transitFlag = false
#posStart = currentSperm[^(i+1)].pos
else: # there is transition to different state: ref(start) to alt(end) now output the segment info
posStart = currentSperm.viNodeseq[^(i)].pos
#leftGapSize = currentSperm[^(i)].pos - currentSperm[^(i+1)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb+=leftGap[0]
reverseProb+=leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2")
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateRef]+rightGap[0]
reverseProb = prevEm[stateAlt]+rightGap[1]
posEnd = currentSperm.viNodeseq[^(i+1)].pos
else:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2")
if state == stateAlt:
# not transitioning to a different state ie still in this segment of same state
inferProb += prevEm[stateAlt]
reverseProb += prevEm[stateRef]
cSNP += 1
transitFlag = false
else:
## state transit
posStart = currentSperm.viNodeseq[^(i)].pos
leftGap=[math.ln(transProb),math.ln(1-transProb)]
inferProb += leftGap[0]
reverseProb += leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
transitFlag = true
cSNP = 1
rightGap = leftGap
inferProb = prevEm[stateAlt]+rightGap[0]
reverseProb = prevEm[stateRef]+rightGap[1]
posEnd = currentSperm.viNodeseq[^(i+1)].pos
## traced to the start position of the chromosome for this cell
#leftGap = [0.0,0.0]
if not transitFlag:
## the first SNP is included in the segment from traced from back
posStart = currentSperm.viNodeseq[0].pos
if currentSperm.viNodeseq[0].state == stateRef:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
else:
## the first node has different state from the second, the first node has its own segment
posStart = currentSperm.viNodeseq[0].pos
posEnd = posStart
cSNP = 1
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,
cRef=currentSperm.viNodeseq[0].cRef,
cAlt=currentSperm.viNodeseq[0].cAlt)
transProb = getTrans(currentSperm.viNodeseq[0].pos,
currentSperm.viNodeseq[1].pos,
cmPmb=cmPmb)
if currentSperm.viNodeseq[0].state == stateRef:
inferProb = currentEm[stateRef]+math.ln(transProb)
reverseProb = currentEm[stateAlt]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" )
else:
inferProb = currentEm[stateAlt]+math.ln(transProb)
reverseProb = currentEm[stateRef]+math.ln(1-transProb)
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" )
return 0
import tables
import utils
import streams
import hts
import math
type
# Nucleotide = enum
# A, C, T, G
SpermViNodes* = object
viNodeseq*: seq[ViNode]
## look up from SNPIndex to current Sperm SNP index
snpIndexLookUp*: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp*: Table[int,int]
SeqSpermViNodes* = seq[SpermViNodes]
proc addViNode*(barcodeTable: OrderedTableRef,
alleleCountTable: Table[string,allele_expr],
scSpermSeq: var SeqSpermViNodes,
outFileTotalCountMtx: var FileStream,
outFileAltCountMtx: var FileStream,
nnsize: var int,
mindp: int,
maxdp: int,
snpIndex: int,
thetaRef: float,
thetaAlt: float,
rec: Variant,
initProb: array,
cmPmb: float): int =
for bc, ac in alleleCountTable.pairs:
# ac.tostring(acs)
## mindp, maxdp, they are values per cell
if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue
var ithSperm = barcodeTable[bc]
## write to mtx Ref count
outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt))
outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
if scSpermSeq[ithSperm].viNodeseq.len==0:
var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
currentViNode.pathState[stateRef] = stateN
currentViNode.pathState[stateAlt] = stateN
currentViNode.state = stateN
currentViNode.pos = int(rec.POS)
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
else:
let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var currentViNode = ViNode()
# ref/alt -> ref
var refTref = preVNode.pathScore[stateRef] + lnoTransProb
var altTref = preVNode.pathScore[stateAlt] + ltransProb
if refTref > altTref:
currentViNode.pathScore[stateRef] = refTref
currentViNode.pathState[stateRef] = stateRef
else:
currentViNode.pathScore[stateRef] = altTref
currentViNode.pathState[stateRef] = stateAlt
# ref/alt -> alt
var refTalt = preVNode.pathScore[stateRef] + ltransProb
var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb
if refTalt > altTalt:
currentViNode.pathScore[stateAlt] = refTalt
currentViNode.pathState[stateAlt] = stateRef
else:
currentViNode.pathScore[stateAlt] = altTalt
currentViNode.pathState[stateAlt] = stateAlt
currentViNode.pathScore[stateAlt] += emissionArray[stateAlt]
currentViNode.pathScore[stateRef] += emissionArray[stateRef]
currentViNode.cAlt = int(ac.cAlt)
currentViNode.cRef = int(ac.cRef)
currentViNode.pos = int(rec.POS)
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
return 0
# The size line
\ No newline at end of file
from distributions/rmath import dbinom
import math
import tables
import hts
type
allele_expr* = ref object
cref*: int
calt*: int
ViState* = enum
stateRef, stateAlt, stateN
ViNode* = object
pos*: int
cRef*: int
cAlt*: int
pathScore*: array[stateRef..stateAlt, float]
pathState*: array[stateRef..stateAlt, ViState]
state*: ViState
# proc cref*(a:allele_expr):int = a.cref
# proc calt*(a:allele_expr):int = a.calt
proc inc_count_cref*(a:allele_expr) = inc(a.cref)
proc inc_count_calt*(a:allele_expr) = inc(a.calt)
# proc pathScore*(v:ViNode):array[stateRef..stateAlt, float] = v.pathScore
proc getEmission*(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
return emissionScore
proc getTrans*(pos1:int64,pos2:int64,cmPmb=0.1): float =
if pos2<pos1:
quit "Wrong order of snps"
var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb
return rec
#
proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int,
minTotalReads:int,
chrom:string, mapq: int,
barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] =
var alleleCountTable = initTable[string,allele_expr]()
var rec_alt:char
var total_reads=0
rec_alt = rec.ALT[0][0]
for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint):
var cbt = tag[string](aln, barcodeTag)
var currentCB: string
if cbt.isNone:
# echo "no cb "
if not bulkBam:
continue
else:
currentCB = "bulk"
else:
currentCB = cbt.get
if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue
## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary
## if not aln.flag.proper_pair: continue
if not barcodeTable.hasKey(currentCB):
continue
var
off = aln.start.int
qoff = 0
roff_only = 0
base = 'n'
position = rec.POS.cint
over = 0
for event in aln.cigar:
var cons = event.consumes
#echo $cons
if cons.query:
qoff+=event.len ## the offs to the query sequences
if cons.reference:
off += event.len ## the offs to the reference sequences
if not cons.query:
roff_only += event.len
if off <= position:
continue
over = off - position
# get the base
base = aln.base_at(qoff - over-1)
break
if aln.base_quality_at(qoff - over-1).cint < minbsq:
continue
total_reads+=1
if alleleCountTable.hasKey(currentCB):
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
else:
var new_snp = allele_expr(cref:0,calt:0)
alleleCountTable[currentCB] = new_snp
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[currentCB].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[currentCB].inc_count_calt
continue
if total_reads > maxTotalReads or total_reads < minTotalReads:
return initTable[string,allele_expr]()
return alleleCountTable
\ No newline at end of file
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment