From ce69d55ad6f49114146fff304715be5dad6eb037 Mon Sep 17 00:00:00 2001
From: rlyu <rlyu@svi.edu.au>
Date: Mon, 29 Nov 2021 10:03:08 +1100
Subject: [PATCH] add supplying template cell option

---
 plotDiagnosticHap.R            |   5 +-
 private/correctPhase.nim       | 297 +++++++++++++++
 private/find_template_cell.nim |  27 +-
 private/findpath.nim           |  43 ++-
 private/genotype_mtx.nim       | 272 +++++++-------
 private/graph.nim              |  96 ++---
 private/infer_missing_snps.nim | 188 +++++-----
 private/infer_snp_hap.nim      |   5 -
 private/sgcocaller_sxo.nim     | 147 ++++++++
 private/sgphase.nim            |  41 ++-
 private/write_out_vcf.nim      | 101 ++++++
 sgcocaller.nim                 | 641 +++++++++++++++------------------
 12 files changed, 1210 insertions(+), 653 deletions(-)
 create mode 100755 private/correctPhase.nim
 delete mode 100755 private/infer_snp_hap.nim
 create mode 100755 private/sgcocaller_sxo.nim
 create mode 100755 private/write_out_vcf.nim

diff --git a/plotDiagnosticHap.R b/plotDiagnosticHap.R
index 7b79021..3956174 100644
--- a/plotDiagnosticHap.R
+++ b/plotDiagnosticHap.R
@@ -43,12 +43,11 @@ plot_df_match <- apply(plot_df,2,function(x){
 
 data.frame(cbind(plot_df_match,snpAnnot_df)) %>%
   tidyr::pivot_longer(cols = colnames(plot_df)) %>%
-  dplyr::filter(!is.na(value),POS > 1.3e8, POS < 1.35e8) %>%
+  dplyr::filter(!is.na(value)) %>%
   ggplot()+
   geom_point(mapping = aes(x = POS , y = name, color = value))+
   scale_color_manual(values = c("FALSE"='red',
                                 "TRUE" = "blue"))+
-    theme_bw(base_size = 18)+geom_vline(mapping = aes(xintercept = c(switch_pos[2])),
-                                        color = "red",size = 1.2)
+    theme_bw(base_size = 18)
 
 ggsave(pngfile, dpi = 100, width = 14, height = 6)
diff --git a/private/correctPhase.nim b/private/correctPhase.nim
new file mode 100755
index 0000000..6c9ab35
--- /dev/null
+++ b/private/correctPhase.nim
@@ -0,0 +1,297 @@
+## calcualte switch score for snps positions that are high risk
+import utils
+import sequtils
+import math
+import streams
+# bins of SNP indexes
+import strutils
+
+let binSize = 2000
+let movingStep = 200
+let dissimThresh = 0.0099 
+let lookBeyondSnps = 25
+let debug = false
+let switchPrior = 0.5
+type 
+  switchScoreTuple = tuple
+    switch_scores: seq[float]
+    switch_scores_snpIndex: seq[int]
+    
+# reduce the search space
+# return seq of snp indexes
+# low risk bins are bins that do not have crossover happening in majority of the cells. 
+
+## rough crossover estimates: return binary results, 0 means no crossovers, 1 means there is.
+## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin
+
+## cell_geno snp by cell
+proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = 
+  let ncells = cell_geno[0].len
+  var ncompared = newSeq[int](ncells)
+  var nmatch = newSeq[int](ncells)
+  for snpi,g in templ_geno:
+    for cellj in 0..(ncells-1):
+      if g != gUnknown and cell_geno[snpi][cellj] != gUnknown:
+        ncompared[cellj].inc
+        if g == cell_geno[snpi][cellj]: nmatch[cellj].inc
+  if debug: echo "ncompared " & $ncompared      
+  let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float =  nmatch[x]/ncompared[x])
+  if debug: echo "dissim " & $dissim
+  var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh)))
+  if debug: echo "ncrossover " & $ncrossover
+  let nxcells = foldl(ncrossover, a + b)
+  if nxcells < int(floor(0.5 * float(ncells))):
+    if debug: echo "bin had no crossovers : " & $nxcells
+    return false
+  else:
+    if debug: echo "bin had many crossovers : " & $nxcells
+    return true
+
+proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = 
+  #let ncells = gtMtxByCell[0].len
+  let nSnps = gtMtxByCell.len
+  let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
+  if debug: echo "nBins: " & $nBins
+  let binIds = toSeq(0..(nBins-1))
+  var snpPosStart,snpPosEnd: int
+  var highRiskSnps: seq[int]
+  for binI in binIds:
+    if debug: echo "binI " & $binI
+    snpPosStart = (binI)*(binSize - movingStep)
+    if (snpPosStart + binSize) > (nSnps-1): 
+      snpPosEnd = (nSnps-1)
+    else:
+      snpPosEnd = snpPosStart + binSize
+    if (hasCrossover(templ_geno =  fullGeno[snpPosStart..snpPosEnd], 
+                     cell_geno =  gtMtxByCell[snpPosStart..snpPosEnd] )):
+      highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd))
+
+    
+  # add a last bin as from end of chrom with length binSize
+  let lastBinStart = nSnps - binSize
+  let lastBinEnd = nSnps - 1
+  if (hasCrossover(templ_geno =  fullGeno[snpPosStart..snpPosEnd], 
+                     cell_geno =  gtMtxByCell[snpPosStart..snpPosEnd])):
+    highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd))
+
+  highRiskSnps = deduplicate(highRiskSnps)
+  return highRiskSnps
+   
+proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] =
+  var fullGeno = newSeq[BinaryGeno](nSnps)
+  var i = 0
+  var currentEntrySeq: seq[string]
+  while not phasedSnpAnnotFileStream.atEnd():
+    currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace()
+    fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01")
+    i.inc
+  if i != nSnps :
+    quit "phased snpAnnot file does not have the same number of rows with gtMtx"
+  return fullGeno
+
+proc calculateProbslog10(hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno], error_rate = 0.1) : float = 
+  var nmatch,ncompare:int
+  for i,g in hap:
+    if (g != gUnknown) and (cell_hap[i] != gUnknown):
+      ncompare.inc
+      if (g == cell_hap[i]): nmatch.inc
+  let nmis = ncompare - nmatch
+  let prob = 0.5*(error_rate^nmis*(1-error_rate)^nmatch + error_rate^nmatch*(1-error_rate)^nmis)
+  return(log10(prob))
+
+proc countNmatch( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = 
+  var nmatch,ncompare:int
+  for i,g in hap:
+    if (g != gUnknown) and (cell_hap[i] != gUnknown):
+      ncompare.inc
+      if (g == cell_hap[i]): nmatch.inc
+  let nmis = ncompare - nmatch
+  return nmatch
+
+proc countNMis( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = 
+  var nmatch,ncompare:int
+  for i,g in hap:
+    if (g != gUnknown) and (cell_hap[i] != gUnknown):
+      ncompare.inc
+      if (g == cell_hap[i]): nmatch.inc
+  let nmis = ncompare - nmatch
+  return nmis
+
+proc binarySwitch(x: BinaryGeno):BinaryGeno = 
+    if(x == gUnknown):  gUnknown 
+    elif(x == gREF):gALT
+    else: gREF
+
+proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = 
+  map(hap, binarySwitch)
+
+proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = 
+  map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex])
+
+proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno]): switchScoreTuple = 
+  var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int
+  let nSnps = gtMtxByCell.len
+  let ncells =  gtMtxByCell[0].len
+  var hswitch,cell_hap,cell_full_hap,templ_geno,templ_geno_left,templ_geno_right: seq[BinaryGeno]
+  var prob_switch, prob_nonsw: float
+  var leftIndex,rightIndex, switch_score_snpIndex: seq[int]
+  var swscore = newSeq[float](nSnps)
+  for rsnpi in riskySnps:
+    prob_switch = 0.0
+    prob_nonsw = 0.0
+    if fullGeno[rsnpi] == gUnknown: continue
+    for celli in 0..(ncells-1):
+      # if celli == 2: continue
+      leftIndex = newSeq[int]()
+      rightIndex = newSeq[int]()
+      offset = 1
+      cell_full_hap = getIthCellHap(gtMtxByCell, celli)
+      if(cell_full_hap[rsnpi] != gUnknown and fullGeno[rsnpi] != gUnknown ):
+        rightIndex.add(rsnpi)
+      ## need to find the nearest coexisting N snps (N = lookBeyondSnps*2)
+      offset = 1
+      while(leftIndex.len < lookBeyondSnps and ((rsnpi-offset) > 0)):
+        if((cell_full_hap[rsnpi-offset] != gUnknown) and (fullGeno[rsnpi-offset] != gUnknown)):
+          leftIndex.add(rsnpi-offset)
+        offset.inc
+      offset = 1
+      while(rightIndex.len < lookBeyondSnps and ((rsnpi+offset) < nSnps)):
+        if((cell_full_hap[rsnpi+offset] != gUnknown) and (fullGeno[rsnpi+offset] != gUnknown)):
+          rightIndex.add(rsnpi+offset)
+        offset.inc
+      cell_hap = concat(map(concat(leftIndex,rightIndex), proc(x:int):BinaryGeno = cell_full_hap[x])) 
+      templ_geno_left = map(leftIndex, proc(x:int):BinaryGeno = fullGeno[x])
+      templ_geno_right = map(rightIndex, proc(x:int):BinaryGeno = fullGeno[x])
+      templ_geno = concat(templ_geno_left,templ_geno_right)
+      hswitch = concat(templ_geno_left,switchHap(templ_geno_right))
+      # if rsnpi == 16933:
+      #   echo "cell " & $celli
+      #   echo "cell_hap " & $cell_hap
+      #   echo "templ_geno " & $templ_geno
+      #   echo "hswitch " & $hswitch
+      #   echo "prev prob nosw " & $prob_nonsw
+      #   echo "prev prob_switch " & $prob_switch   
+      #   echo "nmatch temp_geno " & $(countNmatch(hap = templ_geno, cell_hap = cell_hap))
+      #   echo "nmatch hswitch " & $(countNmatch(hap = hswitch, cell_hap = cell_hap))
+      #   echo "nmis temp_geno " & $(countNMis(hap = templ_geno, cell_hap = cell_hap))
+      #   echo "nmis hswitch " & $(countNMis(hap = hswitch, cell_hap = cell_hap))    
+      prob_nonsw = prob_nonsw + calculateProbslog10(hap = templ_geno, cell_hap = cell_hap)
+      prob_switch = prob_switch + calculateProbslog10(hap = hswitch, cell_hap = cell_hap)
+
+    #if(prob_switch > prob_nonsw): echo "found positive switch score"
+    swscore[rsnpi] =  log10(switchPrior) + prob_switch - log10(1-switchPrior) - prob_nonsw
+    switch_score_snpIndex.add(rsnpi)
+  return (swscore,switch_score_snpIndex)
+
+## return SNP indexs to switch
+## switchScore, a dense list of switch scores
+proc findSwitchSites(switchScoreT: switchScoreTuple): seq[int] = 
+  var sitesToSwitch = newSeq[int]()
+  var positiveScores: seq[float]
+  var positiveScoresIndex: seq[int]
+  
+  var inPositiveBlock = false
+  for site, score in switchScoreT.switch_scores:
+    if switchScoreT.switch_scores_snpIndex.find(site)>=0:
+      if score <= 0.0 :
+        if inPositiveBlock:
+          inPositiveBlock = false
+          if positiveScoresIndex.len > int(floor(lookBeyondSnps/3)):
+            sitesToSwitch.add(positiveScoresIndex[maxIndex(positiveScores)])
+          continue
+      elif not inPositiveBlock:
+        positiveScores = @[score]
+        positiveScoresIndex = @[site]
+        inPositiveBlock = true
+        continue
+      else:
+        positiveScores.add(score)
+        positiveScoresIndex.add(site)
+  return sitesToSwitch
+
+proc switchPhaseString(oriString:string): string = 
+  if oriString == "-1": return "-1"
+  if oriString == "0": return "1"
+  if oriString == "1": return "0"
+ 
+proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, phaseSnpAnnotFile:string ): int = 
+  var switchedPhasedAnnotFileFS,phaseSnpAnnotFileFS: FileStream
+  var switch = 0
+  var currentEntry:seq[string]
+  var snpIndex = 0
+  var writtenSnps = 0
+  try:
+    switchedPhasedAnnotFileFS = openFileStream(switchedPhasedAnnotFile, fmWrite)
+    phaseSnpAnnotFileFS = openFileStream(phaseSnpAnnotFile, fmRead)
+  except:
+    stderr.write getCurrentExceptionMsg() 
+  switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine())
+  if siteToSwitch.len == 0:
+    while not phaseSnpAnnotFileFS.atEnd():
+      switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine())
+    switchedPhasedAnnotFileFS.close()
+    phaseSnpAnnotFileFS.close()
+    return 0
+  while not phaseSnpAnnotFileFS.atEnd():
+    currentEntry = phaseSnpAnnotFileFS.readLine.splitWhitespace()
+    if siteToSwitch.find(snpIndex) != -1:
+      switch = switch xor 1 
+    if switch == 0:
+      switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t"))
+    else:
+      currentEntry[3] = switchPhaseString(currentEntry[3])
+      switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t"))
+    snpIndex.inc
+  switchedPhasedAnnotFileFS.close()
+  phaseSnpAnnotFileFS.close()
+  return 0
+
+proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string): int = 
+  var gtMtxByCell: seq[seq[BinaryGeno]]
+  var currentEntrySeq: seq[string]
+  var currentEntry:seq[int]
+  var nSnps,ncells:int
+  var gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream:FileStream
+  ## sort entries in mtx files 
+  try: 
+    gtMtxFileStream = openFileStream(gtMtxFile, fmRead)
+    phaseSnpAnnotFileStream = openFileStream(phaseSnpAnnotFile, fmRead)
+    switchScoreFileStream = openFileStream(switchScoreFile, fmWrite)
+  except:
+    stderr.write getCurrentExceptionMsg() 
+  echo "reading gtMtx to by cell"
+  discard gtMtxFileStream.readLine()
+  discard phaseSnpAnnotFileStream.readLine()
+  #N, i,j
+  currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace()
+  currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x))
+  nSnps = currentEntry[0]
+  echo "nSnps " & $nSnps
+  ncells = currentEntry[1]
+  echo "ncells " & $ncells
+
+  echo "totalEntries " & $ currentEntry[2]
+  ## gtMtx is cell by Snp format
+  gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
+  discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true)
+  var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps )
+  var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell)
+  echo "riskySnps.len " & $riskySnps.len
+  var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno)
+  let switchSites = findSwitchSites(switchScoresT)
+  switchScoreFileStream.writeLine("#" & $switchSites)
+  for snpi,score in switchScoresT.switch_scores:
+    if switchScoresT.switch_scores_snpIndex.find(snpi)>=0:
+      switchScoreFileStream.writeLine($score)
+    else: 
+      switchScoreFileStream.writeLine("NA")
+  for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]:
+    fs.close()
+  discard writeSwitchedPhase(switchSites,switchedPhasedAnnotFile,phaseSnpAnnotFile)
+
+# for chrom in @["chr4"]:    
+#   let gtMtxFile = "data/WC_CNV_42/sgcocaller/gtMtx/" & chrom & "_gtMtx.mtx"
+#   let phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_phased_snpAnnot.txt"
+#   let switchedPhasedAnnotFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_corrected_phased_snpAnnot.txt"
+#   let switchScoreFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_switch_score.txt"  
+#   discard correctPhase(gtMtxFile,phaseSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile)
\ No newline at end of file
diff --git a/private/find_template_cell.nim b/private/find_template_cell.nim
index ed743a6..67d91e3 100755
--- a/private/find_template_cell.nim
+++ b/private/find_template_cell.nim
@@ -31,7 +31,7 @@ proc countCoexistMatch(seq1: seq[BinaryGeno], seq2: seq[BinaryGeno]): seq[int] =
         nmatch += 1
   return [cell1Snps,cell2Snps,nmatch,ncoexist].toSeq
 
-proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[int]): seq[CellPairs] = 
+proc findCellPairs(gtMtx:seq[seq[BinaryGeno]], nPairs = 3, nSnpsPercell: var seq[int]): seq[CellPairs] = 
   let ncells = gtMtx.len
   var genoMatch = @[0,0,0,0]
   var returnCp = newSeq[CellPairs](nPairs)
@@ -49,13 +49,15 @@ proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[
       if k == nPairs: break
   returnCp
 
+proc compareNSnps(x, y: (int,int)): int =
+  cmp(x[0], y[0])
 
-proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.75): int = 
+proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.85): int = 
   # return the selected cells' index
   # not too many SNPs or too few
-  var seqNsnps = nSnpsPercell
-  sort(seqNsnps, system.cmp[int])
-  int(floor(float(seqNsnps.len) * q))
+  var indexed = zip(nSnpsPercell, toSeq(0..(nSnpsPercell.len-1)))
+  indexed.sort(compareNSnps)
+  return indexed[int(floor(float(nSnpsPercell.len) * q))][1]
 
 proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = 
   var nSnpsCells = newSeq[int](gtMtx.len)
@@ -66,12 +68,15 @@ proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int =
   if cellPairs[0].coexistSnps == 0 :
     # no good pairs found
     selectedCell = findCellBynSNPs(nSnpsPercell = nSnpsCells)
+    echo "no good cell pairs found, template cell is selected by nSNPs "
+    echo "selected cell:" & $selectedCell
+    return selectedCell
   else:
     let coExistSnps = map(cellPairs, proc(x: CellPairs) : int = x.coexistSnps)
     icp = maxIndex(coExistSnps)
-  if nSnpsCells[cellPairs[icp].cell1] > nSnpsCells[cellPairs[icp].cell2]:
-    echo "selected cell: " & $cellPairs[icp].cell1 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell1]
-    return cellPairs[icp].cell1
-  else:
-    echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2]
-    return cellPairs[icp].cell2
+    if nSnpsCells[cellPairs[icp].cell1] > nSnpsCells[cellPairs[icp].cell2]:
+      echo "selected cell: " & $cellPairs[icp].cell1 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell1]
+      return cellPairs[icp].cell1
+    else:
+      echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2]
+      return cellPairs[icp].cell2
diff --git a/private/findpath.nim b/private/findpath.nim
index 902ffe5..4b5ba6e 100755
--- a/private/findpath.nim
+++ b/private/findpath.nim
@@ -1,12 +1,13 @@
 ## implements the traceback function
 import utils
-from graph import SpermViNodes
+import graph 
 import streams
 import tables
 import math
 import strutils
 
-proc pathTrackBack*(currentSperm: var SpermViNodes,
+
+proc pathTrackBackIthSperm(currentSperm: var SpermViNodes,
                     ithSperm: int,
                     thetaRef: float,
                     thetaAlt: float,
@@ -102,3 +103,41 @@ proc pathTrackBack*(currentSperm: var SpermViNodes,
       viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") )
   
   return 0
+proc pathTrackBack*(scSpermSeq:  var SeqSpermViNodes,
+                    thetaRef: float,
+                    thetaAlt: float,
+                    cmPmb: float,
+                    outFileVStateMtx: var FileStream,
+                    viSegmentInfo: var FileStream): int = 
+
+  var posEnd: int64
+  var inferProb,reverseProb = 0.0
+  var currentEm: array[stateRef..stateAlt, float]
+  var lastNode: ViNode
+  var spermVNseq: SpermViNodes
+  var ithSNP: int
+
+  for ithSperm in 0..(scSpermSeq.len-1):
+    ## rightGap,leftGap = [0.0,0.0]
+    spermVNseq = scSpermSeq[ithSperm]
+    if spermVNseq.viNodeseq.len==0: continue
+    lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)]
+    currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt)
+    if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]:
+      scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef
+      ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1]
+      outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
+      inferProb  = currentEm[stateRef]
+      reverseProb = currentEm[stateAlt]
+    else:
+      scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt
+      ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1]
+      outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2")
+      inferProb  = currentEm[stateAlt]
+      reverseProb = currentEm[stateRef]
+    posEnd = lastNode.pos
+    ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the  relevant File Streams.
+    discard pathTrackBackIthSperm(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm,  thetaRef = thetaRef, 
+                          thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx,
+                          viSegmentInfo = viSegmentInfo, posEnd = posEnd,  inferProb = inferProb,reverseProb = reverseProb)                    
+
diff --git a/private/genotype_mtx.nim b/private/genotype_mtx.nim
index 9c11d79..68ace6b 100755
--- a/private/genotype_mtx.nim
+++ b/private/genotype_mtx.nim
@@ -1,136 +1,136 @@
-## generate gtMtx, gtMtx_snpAnnot, totalCount.mtx altCount.mtx
-
-## author Ruqian Lyu
-## Date: 2021 Oct
-## gtMtx in 1, or 2
-import tables
-import strutils
-import hts
-import utils
-import streams
-import findGtNodes
-
-## these outputs will be per chromosome
-
-
-proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, 
-              outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int,
-              minTotalReads:int,  mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.8):int = 
-  var variantIndex = 1
-  var writtenVarintIndex = 1
-  var calt,cellDP,geno,cellIndex,nnsize = 0
-  var cellGtNodes,cellGtNodeCopy: Table[string, GtNode]
-    
-  ## write headers to those mtx files and the first line place holder for total_row total_column total_entry
-  let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
-
-  for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]:
-    outFileStream.writeLine(sparseMatrixHeader)
-    outFileStream.writeLine(' '.repeat(50))
-  outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t"))
-  for rec in ivcf.query("*"): 
-    var rec_alt = rec.ALT[0][0]
-    var rec_ref = rec.REF[0]
-    cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,
-                                  minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,
-                                  minbsq=minbsq,barcodeTag=barcodeTag)
-    cellGtNodeCopy = cellGtNodes
-    for cellbarcode in cellGtNodeCopy.keys:
-      cellDP = cellGtNodes[cellbarcode].alleles.len
-      if cellDP < minCellDp:
-        cellGtNodes.del(cellbarcode)
-        continue
-      calt = cellGtNodes[cellbarcode].alleles.count("1")
-      if calt/cellDP < p0:
-        cellGtNodes[cellbarcode].genotype = gREF
-      elif calt/cellDP > p1:
-        cellGtNodes[cellbarcode].genotype = gALT
-      else:
-        cellGtNodes.del(cellbarcode)
-        continue
-    if cellGtNodes.len < minSnpDepth:
-      variantIndex.inc
-      continue
-    else:
-      for cellbarcode in cellGtNodes.keys:
-        cellDP = cellGtNodes[cellbarcode].alleles.len
-        calt = cellGtNodes[cellbarcode].alleles.count("1")
-        ## $cellGtNodes[cellbarcode].genotype BinaryGeno to "0", "1" or "-1" (not -1), write to file coding in 1 or 2
-        geno = parseInt($cellGtNodes[cellbarcode].genotype) + 1
-        # index from 1 for matrices
-        cellIndex = barcodeTable[cellbarcode] + 1
-        outGtMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $geno],sep = " "))
-        outTotalCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $cellDP],sep = " "))
-        outAltCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $calt],sep = " "))
-        nnsize.inc
-      outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") )
-      writtenVarintIndex.inc
-    ## write to matrices 
-          
-  echo "processed " & $(variantIndex) & " variants"
-  echo "wrote " & $(writtenVarintIndex-1) & " variants to matrices and snpAnnot file"
-  for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]:
-    outFileStream.setPosition(49)
-    outFileStream.write($(writtenVarintIndex-1) & " " & $barcodeTable.len & " " & $nnsize) 
-  for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]:
-    outFileStream.close()
-  return 0
-
-# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz"
-# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam"
-# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv"
-# var
-#   ibam:Bam
-#   ivcf:VCF
-#   outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
-# if not open(ibam, bam_file, threads=1, index = true):
-#     quit "couldn't open input bam"
-# if not open(ivcf, vcf_file, threads=1):
-#     quit "couldn't open: vcf file"
-
-# var hf = hts.hts_open(cstring(barcode_file), "r")
-# #### TODO : Table size
-# var barcodeTable =  newTable[string,int](initialSize = 1024)
-# var kstr: hts.kstring_t
-# kstr.l = 0
-# kstr.m = 0
-# kstr.s = nil
-# var ithSperm = 0
-# ## initiate the table with CB as keys, allele counts (ref object) as elements
-# while hts_getline(hf, cint(10), addr kstr) > 0:
-#   if $kstr.s[0] == "#":
-#     continue
-#   var v = $kstr.s
-#   discard barcodeTable.hasKeyOrPut(v, ithSperm)
-#   ithSperm.inc
-# discard hf.hts_close()
-
-# let s_Chrs = @["chr1"]
-# var chrom = "chr1"
-# let out_prefix = "test_data/getMtx/"
-# echo "generating gtMtx"
-# try: 
-#   outGtMtx = openFileStream(out_prefix & chrom &  "_gtMtx.mtx", fmWrite)
-#   outSnpAnnot = openFileStream(out_prefix & chrom &  "_snpAnnot.txt", fmWrite)
-#   outTotalCountMtx = openFileStream(out_prefix & chrom &  "_totalMtx.mtx", fmWrite)
-#   outAltCountMtx = openFileStream(out_prefix & chrom &  "_AltMtx.mtx", fmWrite)
-# except:
-#   stderr.write getCurrentExceptionMsg()
-
-
-# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
-#                  outSnpAnnot = outSnpAnnot, maxTotalReads = 30,
-#                  minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1)
-
-
-# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string
-
-# ## sort entries in mtx files 
-# for chrom in s_Chrs:
-#   outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx"
-#   outTotalCountMtxFile = out_prefix & chrom &  "_totalMtx.mtx"
-#   outAltCountMtxFile = out_prefix & chrom &  "_AltMtx.mtx"
-#   for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]:
-#     var imtx = readMtx(mtx_file = mtxFile)
-#     discard sortWriteMtx(imtx, mtx_file = mtxFile)
-
+## generate gtMtx, gtMtx_snpAnnot, totalCount.mtx altCount.mtx
+
+## author Ruqian Lyu
+## Date: 2021 Oct
+## gtMtx in 1, or 2
+import tables
+import strutils 
+import hts
+import utils
+import streams
+import findGtNodes
+
+## these outputs will be per chromosome
+
+
+proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, 
+              outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int,
+              minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.7,chrom: string):int = 
+  var variantIndex = 1
+  var writtenVarintIndex = 1
+  var calt,cellDP,geno,cellIndex,nnsize = 0
+  var cellGtNodes,cellGtNodeCopy: Table[string, GtNode]
+    
+  ## write headers to those mtx files and the first line place holder for total_row total_column total_entry
+  let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
+
+  for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]:
+    outFileStream.writeLine(sparseMatrixHeader)
+    outFileStream.writeLine(' '.repeat(50))
+  outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t"))
+  for rec in ivcf.query(chrom): 
+    var rec_alt = rec.ALT[0][0]
+    var rec_ref = rec.REF[0]
+    cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,
+                                  minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,
+                                  minbsq=minbsq,barcodeTag=barcodeTag)
+    cellGtNodeCopy = cellGtNodes
+    for cellbarcode in cellGtNodeCopy.keys:
+      cellDP = cellGtNodes[cellbarcode].alleles.len
+      if cellDP < minCellDp:
+        cellGtNodes.del(cellbarcode)
+        continue
+      calt = cellGtNodes[cellbarcode].alleles.count("1")
+      if calt/cellDP < p0:
+        cellGtNodes[cellbarcode].genotype = gREF
+      elif calt/cellDP > p1:
+        cellGtNodes[cellbarcode].genotype = gALT
+      else:
+        cellGtNodes.del(cellbarcode)
+        continue
+    if cellGtNodes.len < minSnpDepth:
+      variantIndex.inc
+      continue
+    else:
+      for cellbarcode in cellGtNodes.keys:
+        cellDP = cellGtNodes[cellbarcode].alleles.len
+        calt = cellGtNodes[cellbarcode].alleles.count("1")
+        ## $cellGtNodes[cellbarcode].genotype BinaryGeno to "0", "1" or "-1" (not -1), write to file coding in 1 or 2
+        geno = parseInt($cellGtNodes[cellbarcode].genotype) + 1
+        # index from 1 for matrices
+        cellIndex = barcodeTable[cellbarcode] + 1
+        outGtMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $geno],sep = " "))
+        outTotalCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $cellDP],sep = " "))
+        outAltCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $calt],sep = " "))
+        nnsize.inc
+      outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") )
+      writtenVarintIndex.inc
+    ## write to matrices 
+          
+  echo "processed " & $(variantIndex) & " variants"
+  echo "wrote " & $(writtenVarintIndex-1) & " variants to matrices and snpAnnot file"
+  for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]:
+    outFileStream.setPosition(49)
+    outFileStream.write($(writtenVarintIndex-1) & " " & $barcodeTable.len & " " & $nnsize) 
+  for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]:
+    outFileStream.close()
+  return 0
+
+# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz"
+# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam"
+# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv"
+# var
+#   ibam:Bam
+#   ivcf:VCF
+#   outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
+# if not open(ibam, bam_file, threads=1, index = true):
+#     quit "couldn't open input bam"
+# if not open(ivcf, vcf_file, threads=1):
+#     quit "couldn't open: vcf file"
+
+# var hf = hts.hts_open(cstring(barcode_file), "r")
+# #### TODO : Table size
+# var barcodeTable =  newTable[string,int](initialSize = 1024)
+# var kstr: hts.kstring_t
+# kstr.l = 0
+# kstr.m = 0
+# kstr.s = nil
+# var ithSperm = 0
+# ## initiate the table with CB as keys, allele counts (ref object) as elements
+# while hts_getline(hf, cint(10), addr kstr) > 0:
+#   if $kstr.s[0] == "#":
+#     continue
+#   var v = $kstr.s
+#   discard barcodeTable.hasKeyOrPut(v, ithSperm)
+#   ithSperm.inc
+# discard hf.hts_close()
+
+# let s_Chrs = @["chr1"]
+# var chrom = "chr1"
+# let out_prefix = "test_data/getMtx/"
+# echo "generating gtMtx"
+# try: 
+#   outGtMtx = openFileStream(out_prefix & chrom &  "_gtMtx.mtx", fmWrite)
+#   outSnpAnnot = openFileStream(out_prefix & chrom &  "_snpAnnot.txt", fmWrite)
+#   outTotalCountMtx = openFileStream(out_prefix & chrom &  "_totalMtx.mtx", fmWrite)
+#   outAltCountMtx = openFileStream(out_prefix & chrom &  "_AltMtx.mtx", fmWrite)
+# except:
+#   stderr.write getCurrentExceptionMsg()
+
+
+# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
+#                  outSnpAnnot = outSnpAnnot, maxTotalReads = 30,
+#                  minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1)
+
+
+# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string
+
+# ## sort entries in mtx files 
+# for chrom in s_Chrs:
+#   outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx"
+#   outTotalCountMtxFile = out_prefix & chrom &  "_totalMtx.mtx"
+#   outAltCountMtxFile = out_prefix & chrom &  "_AltMtx.mtx"
+#   for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]:
+#     var imtx = readMtx(mtx_file = mtxFile)
+#     discard sortWriteMtx(imtx, mtx_file = mtxFile)
+
diff --git a/private/graph.nim b/private/graph.nim
index fe02980..215e1e6 100755
--- a/private/graph.nim
+++ b/private/graph.nim
@@ -15,7 +15,54 @@ type
     spermSnpIndexLookUp*: Table[int,int]
   SeqSpermViNodes* = seq[SpermViNodes]
 
-
+proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int = 
+  if scSpermSeq[ithSperm].viNodeseq.len==0:
+    var currentViNode = ViNode()
+    currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
+    currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
+    currentViNode.pathState[stateRef] = stateN
+    currentViNode.pathState[stateAlt] = stateN
+    currentViNode.state = stateN
+    currentViNode.pos = rec_pos
+    currentViNode.cAlt = cAlt
+    currentViNode.cRef = cRef
+    
+    scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
+    scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
+    scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
+  else:
+    let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
+    var ltransProb = math.ln(getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb))
+    var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb))
+    var currentViNode = ViNode()
+      # ref/alt -> ref
+    var refTref = preVNode.pathScore[stateRef] + lnoTransProb
+    var altTref = preVNode.pathScore[stateAlt] + ltransProb
+    if refTref > altTref:
+      currentViNode.pathScore[stateRef] = refTref
+      currentViNode.pathState[stateRef] = stateRef
+    else:
+      currentViNode.pathScore[stateRef] = altTref
+      currentViNode.pathState[stateRef] = stateAlt
+      # ref/alt -> alt
+    var refTalt = preVNode.pathScore[stateRef] + ltransProb
+    var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb
+      
+    if refTalt > altTalt:
+      currentViNode.pathScore[stateAlt] = refTalt
+      currentViNode.pathState[stateAlt] = stateRef
+    else:
+      currentViNode.pathScore[stateAlt] = altTalt
+      currentViNode.pathState[stateAlt] = stateAlt
+    currentViNode.pathScore[stateAlt] += emissionArray[stateAlt]
+    currentViNode.pathScore[stateRef] += emissionArray[stateRef]
+    currentViNode.cAlt = cAlt
+    currentViNode.cRef = cRef
+    currentViNode.pos = rec_pos
+    scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
+    scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
+    scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex  
+  return 0
 proc addViNode*(barcodeTable: TableRef, 
                alleleCountTable: Table[string,allele_expr],
                scSpermSeq: var SeqSpermViNodes,
@@ -40,51 +87,6 @@ proc addViNode*(barcodeTable: TableRef,
     outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
     nnsize += 1
     var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
-    if scSpermSeq[ithSperm].viNodeseq.len==0:
-      var currentViNode = ViNode()
-      currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
-      currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
-      currentViNode.pathState[stateRef] = stateN
-      currentViNode.pathState[stateAlt] = stateN
-      currentViNode.state = stateN
-      currentViNode.pos = int(rec_pos)
-      currentViNode.cAlt = int(ac.cAlt)
-      currentViNode.cRef = int(ac.cRef)
-      
-      scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
-      scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
-      scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
-    else:
-      let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
-      var ltransProb = math.ln(getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb))
-      var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb))
-      var currentViNode = ViNode()
-        # ref/alt -> ref
-      var refTref = preVNode.pathScore[stateRef] + lnoTransProb
-      var altTref = preVNode.pathScore[stateAlt] + ltransProb
-      if refTref > altTref:
-        currentViNode.pathScore[stateRef] = refTref
-        currentViNode.pathState[stateRef] = stateRef
-      else:
-        currentViNode.pathScore[stateRef] = altTref
-        currentViNode.pathState[stateRef] = stateAlt
-        # ref/alt -> alt
-      var refTalt = preVNode.pathScore[stateRef] + ltransProb
-      var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb
-        
-      if refTalt > altTalt:
-        currentViNode.pathScore[stateAlt] = refTalt
-        currentViNode.pathState[stateAlt] = stateRef
-      else:
-        currentViNode.pathScore[stateAlt] = altTalt
-        currentViNode.pathState[stateAlt] = stateAlt
-      currentViNode.pathScore[stateAlt] += emissionArray[stateAlt]
-      currentViNode.pathScore[stateRef] += emissionArray[stateRef]
-      currentViNode.cAlt = int(ac.cAlt)
-      currentViNode.cRef = int(ac.cRef)
-      currentViNode.pos = int(rec_pos)
-      scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
-      scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
-      scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
+    discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb)
   return 0
 # The size line  
diff --git a/private/infer_missing_snps.nim b/private/infer_missing_snps.nim
index b3874f7..2d958ec 100755
--- a/private/infer_missing_snps.nim
+++ b/private/infer_missing_snps.nim
@@ -1,89 +1,99 @@
-## infer missing snp in template cell by looking at the snp's link to other nearby snps in other cells
-
-## author Ruqian Lyu
-## rlyu@svi.edu.au
-import sequtils
-import math
-import utils
-
-let nAnchorSnps = 10
-
-proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = 
-  map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex])
-
-# cell_geno seq of length nAnchorSnps
-# temp_geno seq of length nAnchorSnps
-# return whether cell_geno matches with temp_geno in hap0 or hap1(bitwise complementary to temp_geno)
-proc matchTemplateHap(cell_geno:seq[BinaryGeno], temp_geno:seq[BinaryGeno],error_rate = 0.1): seq[float] = 
-  var nmatch = 0
-  for i,g in cell_geno:
-    if g == temp_geno[i]:
-      nmatch += 1
-  let mis = cell_geno.len - nmatch
-  let ll_h0 = error_rate^mis*(1-error_rate)^(nmatch)
-  let ll_h1 = error_rate^nmatch*(1-error_rate)^(mis)
-  if ll_h0 > ll_h1:
-    return @[0.0, ll_h0/(ll_h0 + ll_h1)]
-  elif ll_h0 == ll_h1:
-    return @[-1.0, 0.5]
-  else:
-    return @[1.0, ll_h1/(ll_h0 + ll_h1)]
-
-proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.98): BinaryGeno = 
-  let prob_0 = error_rate^type10*(1-error_rate)^(type00)
-  let prob_1 = error_rate^type00*(1-error_rate)^(type10)
-  let prob_0_p = prob_0/(prob_0 + prob_1)
-  let prob_1_p = 1 - prob_0_p 
-  if max(prob_0_p,prob_1_p) > posterior_thresh:
-    if prob_0_p > prob_1_p:
-      return gREF
-    else:
-      return gALT
-  return gUnknown
-# return full sequence of template geno by inferring missing SNPs's genotypes 
-proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99):seq[BinaryGeno] = 
-  var fullGeno = templateGeno
-  let nSnps = gtMtx[0].len
-  var coexisPos:seq[int]
-  var snpLD: seq[float]
-  var type00, type10: int
-
-  var snpiInCells = newSeq[BinaryGeno](gtMtx.len)
-  var offset = 1
-  for i,missingSnpi in templateGeno:
-    if missingSnpi != gUnknown: continue
-    snpiInCells = sliceColumn(gtMtx,i)
-    type00 = 0
-    type10 = 0
-    for j,snpiGeno in snpiInCells:
-      if snpiGeno == gUnknown: continue
-      # find 10 coexisting positions for cell j with template geno seq
-      offset = 1
-      coexisPos = newSeq[int]()
-      while(coexisPos.len < nAnchorSnps):
-        if ((i+offset) < nSnps):
-          if((gtMtx[j][i+offset] != gUnknown) and (fullGeno[i+offset] != gUnknown)):
-            coexisPos.add(i+offset)
-        if (i-offset) >= 0:
-          if((gtMtx[j][i-offset] != gUnknown) and (fullGeno[i-offset] != gUnknown)):
-            coexisPos.add(i-offset)
-        offset += 1
-      snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno =  gtMtx[j][x]), 
-                               temp_geno = map(coexisPos,proc(x:int): BinaryGeno =  fullGeno[x]) )
-      if snpLD[1] > posterior_thresh:
-        if snpLD[0] == 1.0:
-          if snpiGeno == gREF: ## snpiGeno is 1 or 2 
-            type10 += 1 
-          else:
-            type00 += 1
-        else:
-          if snpiGeno == gREF: 
-            type00 += 1 
-          else:
-            type10 += 1
-      else: continue
-    # echo "type00 " & $type00
-    # echo "type10 " & $type10
-    fullGeno[i] = inferGeno(type00, type10)  
-  return fullGeno
-
+## infer missing snp in template cell by looking at the snp's link to other nearby snps in other cells
+
+## author Ruqian Lyu
+## rlyu@svi.edu.au
+import sequtils
+import math
+import utils
+
+let nAnchorSnps = 10
+let maxExpand = 1000
+proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = 
+  map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex])
+
+# cell_geno seq of length nAnchorSnps
+# temp_geno seq of length nAnchorSnps
+# return whether cell_geno matches with temp_geno in hap0 or hap1(bitwise complementary to temp_geno)
+proc matchTemplateHap(cell_geno:seq[BinaryGeno], temp_geno:seq[BinaryGeno],error_rate = 0.1): seq[float] = 
+  var nmatch = 0
+  for i,g in cell_geno:
+    if g == temp_geno[i]:
+      nmatch += 1
+  let mis = cell_geno.len - nmatch
+  let ll_h0 = error_rate^mis*(1-error_rate)^(nmatch)
+  let ll_h1 = error_rate^nmatch*(1-error_rate)^(mis)
+  if ll_h0 > ll_h1:
+    return @[0.0, ll_h0/(ll_h0 + ll_h1)]
+  elif ll_h0 == ll_h1:
+    return @[-1.0, 0.5]
+  else:
+    return @[1.0, ll_h1/(ll_h0 + ll_h1)]
+
+proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.99): BinaryGeno = 
+  let prob_0 = error_rate^type10*(1-error_rate)^(type00)
+  let prob_1 = error_rate^type00*(1-error_rate)^(type10)
+  let prob_0_p = prob_0/(prob_0 + prob_1)
+  let prob_1_p = 1 - prob_0_p 
+  if max(prob_0_p,prob_1_p) > posterior_thresh:
+    echo "posterior_prob: " & $(max(prob_0_p,prob_1_p))
+    if prob_0_p > prob_1_p:
+      echo "returned inferred geno: gREF"
+      return gREF
+    else:
+      echo "returned inferred geno: gALT"
+      return gALT
+  return gUnknown
+# return full sequence of template geno by inferring missing SNPs's genotypes 
+proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99, runCorrection = false):seq[BinaryGeno] = 
+  var fullGeno = templateGeno
+  let nSnps = gtMtx[0].len
+  var coexisPos:seq[int]
+  var snpLD: seq[float]
+  var type00, type10: int
+  var snpiInCells = newSeq[BinaryGeno](gtMtx.len)
+  var offset,totalIter = 1
+  for i,missingSnpi in templateGeno:
+    if (not runCorrection) and (missingSnpi != gUnknown): continue
+    #echo "inferring ith SNP: " & $i
+    snpiInCells = sliceColumn(gtMtx,i)
+    type00 = 0
+    type10 = 0
+    for j,snpiGeno in snpiInCells:
+      if snpiGeno == gUnknown: continue
+      # find 10 coexisting positions for cell j with template geno seq
+      offset = 1
+      totalIter = 1
+      coexisPos = newSeq[int]()
+      while(coexisPos.len < nAnchorSnps and totalIter < maxExpand):
+        if ((i+offset) < nSnps):
+          if((gtMtx[j][i+offset] != gUnknown) and (templateGeno[i+offset] != gUnknown)):
+            coexisPos.add(i+offset)
+        if (i-offset) >= 0:
+          if((gtMtx[j][i-offset] != gUnknown) and (templateGeno[i-offset] != gUnknown)):
+            coexisPos.add(i-offset)
+        offset += 1
+        totalIter += 1
+      if coexisPos.len < int(nAnchorSnps/2): continue
+      snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno =  gtMtx[j][x]), 
+                               temp_geno = map(coexisPos,proc(x:int): BinaryGeno =  templateGeno[x]) )
+      if i == 1628:
+        echo  "ithSNP: " & $i & "jth cell: " & $j
+        echo "snpLD results: " & $snpLD
+        
+      if snpLD[1] > 0.999:
+        if snpLD[0] == 1.0:  ## match to template H1
+          if snpiGeno == gREF: ## snpiGeno is 1 or 2 
+            type10 += 1 
+          else:
+            type00 += 1
+        else:
+          if snpiGeno == gREF: 
+            type00 += 1 
+          else:
+            type10 += 1
+      else: continue
+    echo "ithSNP: " & $i & " type00: " & $type00 & ",type10: " & $type10
+    # echo 
+    fullGeno[i] = inferGeno(type00, type10)  
+  return fullGeno
+
diff --git a/private/infer_snp_hap.nim b/private/infer_snp_hap.nim
deleted file mode 100755
index 0ef2896..0000000
--- a/private/infer_snp_hap.nim
+++ /dev/null
@@ -1,5 +0,0 @@
-# infer SNP hap, give the current hap
-
-var hapVCFFile = ""
-var inputVCFFile:
-var outVCFFile:
\ No newline at end of file
diff --git a/private/sgcocaller_sxo.nim b/private/sgcocaller_sxo.nim
new file mode 100755
index 0000000..91721ed
--- /dev/null
+++ b/private/sgcocaller_sxo.nim
@@ -0,0 +1,147 @@
+## sgcocaller sxo, using pre-generated mtx files for finding crossovers using a HMM model
+from findpath import pathTrackBack
+from graph  import addViNodeIthSperm, SeqSpermViNodes
+import tables
+import hts
+import utils
+import sequtils
+import streams
+# bins of SNP indexes
+import strutils
+import os
+
+proc readCountMtxToSeq*(mtxFileStream:FileStream, countMtx:var seq[seq[int]], by_cell = false):int = 
+  var currentLineSeq:seq[int]
+  var currentLine:seq[string]
+  while not mtxFileStream.atEnd():
+    currentLine = mtxFileStream.readLine().splitWhitespace()
+    ## i j 1-based from file
+    currentLineSeq = map(currentLine, proc(x: string): int = parseInt(x))
+    if by_cell:
+      countMtx[(currentLineSeq[0]-1)][(currentLineSeq[1]-1)] = currentLineSeq[2]
+    else:
+      countMtx[(currentLineSeq[1]-1)][(currentLineSeq[0]-1)] = currentLineSeq[2]
+  return 0
+proc addViNodeSXO(barcodeTable: TableRef, alleleCountTable: Table[string,allele_expr], scSpermSeq: var SeqSpermViNodes,
+                  snpIndex: int,
+                  thetaRef: float,
+                  thetaAlt: float,
+                  rec_pos: int,
+                  nnsize : var int,
+                  initProb: array[stateRef..stateAlt, float],
+                  cmPmb: float,
+                  outaltCountMtxFS:FileStream, outtotalCountMtxFS:FileStream): int = 
+  for bc, ac in alleleCountTable.pairs:
+    var ithSperm = barcodeTable[bc]
+    nnsize.inc()
+    outaltCountMtxFS.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.calt)
+    outtotalCountMtxFS.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.calt+ac.cref))
+    var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cref,cAlt=ac.cAlt)
+    discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray = emissionArray, snpIndex = snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb)
+  return 0
+## barcodeTable, cell barcode:cell index
+proc sgcocallerSXO*(barcodeTable:TableRef, phase_dir:string, out_dir:string, thetaREF:float, thetaALT:float, cmPmb:float, s_Chrs:seq[string], initProb: array[stateRef..stateAlt, float], phasedSnpAnnotFileName:string): int =
+  var ncells = barcodeTable.len
+  var nsnps, ithSperm:int
+  var currentEntrySeq: seq[string]
+  var totalCountMtxByCell: seq[seq[int]]
+  var altCountMtxByCell: seq[seq[int]]
+  var alleleCountTable: Table[string,allele_expr]
+  ## iterate through each selected chromosome
+  for chrom in s_Chrs:
+
+    var nnsize = 0
+    ## number of non zeros
+    var scSpermSeq:SeqSpermViNodes
+    ## matches with the order in barcodeTable
+    scSpermSeq.setLen(barcodeTable.len)
+    var phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo:FileStream
+    let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
+    # if fileExists(phase_dir & chrom & "_corrected_phased_snpAnnot.txt"):
+    #   phasedSnpannoFS = openFileStream(phase_dir & chrom & "_corrected_phased_snpAnnot.txt", fmRead)
+    # elif fileExists(phase_dir & chrom & "_phased_snpAnnot.txt"):
+    #   phasedSnpannoFS = openFileStream(phase_dir & chrom & "_phased_snpAnnot.txt", fmRead)
+    phasedSnpannoFS =  openFileStream(phasedSnpAnnotFileName, fmRead)
+    try:
+      #_totalCount
+      totalCountMtxFS = openFileStream(phase_dir & chrom & "_totalMtx.mtx", fmRead)
+      altCountMtxFS = openFileStream(phase_dir & chrom & "_altMtx.mtx", fmRead)
+      outaltCountMtxFS = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite)
+      outtotalCountMtxFS = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite)
+      outSnpAnnotFS = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite)
+      outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite)
+      viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite)
+    except:
+      stderr.write getCurrentExceptionMsg()    
+    ## write headers to those mtx files and the first line place holder for total_row total_column total_entry    
+    for fs in [outFileVStateMtx,outaltCountMtxFS,outtotalCountMtxFS]:
+      fs.writeLine(sparseMatrixHeader)
+      fs.writeLine(' '.repeat(50))  
+    outSnpAnnotFS.writeLine(join(["POS","REF", "ALT"], sep = "\t"))
+    discard totalCountMtxFS.readLine() 
+    discard altCountMtxFS.readLine()
+    discard altCountMtxFS.readLine()
+    #N, i,j
+    currentEntrySeq = totalCountMtxFS.readLine().splitWhitespace()
+    nsnps = parseInt(currentEntrySeq[0])
+    ## gtMtx is cell by Snp format
+    totalCountMtxByCell = newSeqWith(nsnps,newSeq[int](ncells))
+    altCountMtxByCell = newSeqWith(nsnps,newSeq[int](ncells))
+    discard readCountMtxToSeq(mtxFileStream = totalCountMtxFS, countMtx = totalCountMtxByCell, by_cell = true)
+    discard readCountMtxToSeq(mtxFileStream = altCountMtxFS, countMtx = altCountMtxByCell, by_cell = true)
+    discard phasedSnpannoFS.readLine()
+    var isnpIndex = -1 
+    var osnpIndex = 0
+    while not phasedSnpannoFS.atEnd():
+      currentEntrySeq = phasedSnpannoFS.readLine().splitWhitespace()
+      isnpIndex.inc()
+      if currentEntrySeq[3] == "-1":        
+        continue
+      else:
+        alleleCountTable = initTable[string,allele_expr]()
+        for bc in barcodeTable.keys():
+          ithSperm = barcodeTable[bc]
+          if totalCountMtxByCell[isnpIndex][ithSperm] == 0:
+            ## not adding this vi node to the spermSeq
+            continue
+          if currentEntrySeq[3] == "1":
+            alleleCountTable[bc] = allele_expr(cref:(altCountMtxByCell[isnpIndex][ithSperm]), calt: (totalCountMtxByCell[isnpIndex][ithSperm] - altCountMtxByCell[isnpIndex][ithSperm]))
+          else:
+            alleleCountTable[bc] = allele_expr(cref:(totalCountMtxByCell[isnpIndex][ithSperm] - altCountMtxByCell[isnpIndex][ithSperm]), calt: (altCountMtxByCell[isnpIndex][ithSperm]))     
+        if alleleCountTable.len == 0:
+          continue
+        osnpIndex.inc()
+        if currentEntrySeq[3] == "1":
+          outSnpAnnotFS.writeLine(join([currentEntrySeq[0], currentEntrySeq[2], currentEntrySeq[1]], sep="\t") )
+        else:
+          outSnpAnnotFS.writeLine(join([currentEntrySeq[0], currentEntrySeq[1], currentEntrySeq[2]], sep="\t") )
+        discard addViNodeSXO(barcodeTable = barcodeTable,  alleleCountTable = alleleCountTable,  scSpermSeq = scSpermSeq,
+                           thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = osnpIndex, rec_pos = parseInt(currentEntrySeq[0]), nnsize = nnsize,
+                           initProb = initProb, cmPmb = cmPmb, outaltCountMtxFS = outaltCountMtxFS, outtotalCountMtxFS = outtotalCountMtxFS)
+       
+    discard pathTrackBack(scSpermSeq = scSpermSeq, thetaRef = thetaRef,thetaAlt=thetaAlt,cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx,
+                          viSegmentInfo = viSegmentInfo)
+    for fs in [outFileVStateMtx,outaltCountMtxFS,outtotalCountMtxFS]:
+      fs.setPosition(49)
+      fs.write($osnpIndex & " " & $barcodeTable.len & " " & $nnsize) 
+    for outFileStream in [phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo]:
+      outFileStream.close()
+  return 0
+
+# let barcodeFile = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv"
+# var barcodeTable = newTable[string,int](initialSize = 1024)
+# var hf = hts.hts_open(cstring(barcodeFile), "r")
+# var kstr: hts.kstring_t
+# kstr.l = 0
+# kstr.m = 0
+# kstr.s = nil
+# var ithSperm = 0
+# ## initiate the table with CB as keys, gamete index as elements
+# while hts_getline(hf, cint(10), addr kstr) > 0:
+#   if $kstr.s[0] == "#": continue
+#   var v = $kstr.s
+#   discard barcodeTable.hasKeyOrPut(v, ithSperm)
+#   ithSperm.inc
+# discard hf.hts_close()
+# var s_Chrs = @["chr6"]
+# discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = "data/WC_CNV_42/sgcocaller/phaseOneStep/", out_dir = "data/WC_CNV_42/sgcocaller/sxo/",thetaREF = 0.1, thetaALT = 0.9, cmPmb = 0.001,s_Chrs =s_Chrs,initProb = [0.5,0.5])
\ No newline at end of file
diff --git a/private/sgphase.nim b/private/sgphase.nim
index 6c669d6..602771c 100755
--- a/private/sgphase.nim
+++ b/private/sgphase.nim
@@ -7,7 +7,7 @@ import streams
 import strutils
 import sequtils
 
-# phaseOutdir/phased_snpAnnot.txt
+# phaseOutdir/chrom_phased_snpAnnot.txt
 # phaseOutdir/cellGenoVersusTemplate.txt
 # phaseOutdir/cellGenoVersusTemplate.png by executing the R code
 # let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_gtMtx.mtx"
@@ -30,14 +30,14 @@ proc writePhasedSnpAnnot*(fullGeno:seq[BinaryGeno], snpAnnotFileStream:FileStrea
     quit "snpAnnot.txt does not have the same number of rows with gtMtx"
   return 0 
  
-proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = 
+proc sgphase*(mtxFile: string, snpAnnotFile:string, phasedSnpAnnotFile:string, diagnosticDataframeFile:string, templateCell:int): int = 
   var nsnps,ncells,totalEntries:int
   var gtMtx:seq[seq[BinaryGeno]]  
   var currentEntry:seq[int]
   var currentEntrySeq:seq[string]
   var gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream:FileStream
-  let phasedSnpAnnotFile = phaseOutdir & "phased_snpAnnot.txt"
-  let diagnosticDataframeFile = phaseOutdir & "cellGenoVersusTemplate.txt"
+  #let phasedSnpAnnotFile = phaseOutdir & "phased_snpAnnot.txt"
+  #let diagnosticDataframeFile = phaseOutdir & "cellGenoVersusTemplate.txt"
   ## i, j are 1-based, entries with value 1 or 2
   try: 
     gtMtxFileStream = openFileStream(mtxFile, fmRead)
@@ -60,11 +60,22 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int =
   ## gtMtx is cell by Snp format
   gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps))
   discard readGtMtxToSeq(gtMtxFileStream,gtMtx)
-  var tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3)
+  var tempcell: int
+  if templateCell != -1:
+    tempcell = templateCell
+  else:
+    tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3)
   echo "template cell is " & $tempcell
   var tempcellGeno = gtMtx[tempcell]
-  let fullGeno = inferSnpGeno(tempcellGeno, gtMtx)
+  var fullGeno = inferSnpGeno(tempcellGeno, gtMtx)
+  echo "inferSnoGeno done"
+  # echo "second pass"
+  # fullGeno = inferSnpGeno(fullGeno, gtMtx)
+  # echo "second pass done"
   # generate the txt for diagnositc plot
+  echo "run correction"
+  fullGeno = inferSnpGeno(fullGeno, gtMtx,posterior_thresh = 0.99, runCorrection = true)
+  echo "run correction done"
   var selectedCells:seq[int]
   if ncells < 10:
     selectedCells = (0..(ncells-1)).toSeq
@@ -83,7 +94,6 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int =
       for cellg in sliceColumn(subsetGtMtx, k):
         writeOut = writeOut & " " & $(parseInt($cellg)+1)
     ddframFileStream.writeLine(writeOut)
-  
   discard writePhasedSnpAnnot(fullGeno, snpAnnotFileStream, phasedSnpAnnotFileStream)
   for fs in [gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream]:
     fs.close()
@@ -92,3 +102,20 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int =
 
 
 
+# var s_Chrs = @["CUR6G"]
+
+# for chrom in s_Chrs:
+#   var mtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_gtMtx.mtx"
+#   var phasedSnpFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_"  & chrom & "_phased_snpAnnot.txt"
+#   var snpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_"  & chrom & "_snpAnnot.txt"
+#   var ddframe = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_"  & chrom & "_cellGenoVersusTemplate.txt"
+  
+#   discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe)
+
+# for chrom in s_Chrs:
+#   var mtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx"
+#   var phasedSnpFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt"
+#   var snpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_snpAnnot.txt"
+#   var ddframe = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_cellGenoVersusTemplate.txt"
+  
+#   discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe)
\ No newline at end of file
diff --git a/private/write_out_vcf.nim b/private/write_out_vcf.nim
new file mode 100755
index 0000000..8de8b53
--- /dev/null
+++ b/private/write_out_vcf.nim
@@ -0,0 +1,101 @@
+## write output VCF file
+import hts
+import streams
+import sequtils
+import strutils
+import correctPhase
+
+let debug = true
+
+proc readNextPhased(phasedAnnotFS:var FileStream): seq[string] = 
+  var phaseRec = newSeq[string](4)
+  var lineRec: string
+  while not phasedAnnotFS.atEnd():
+    lineRec = phasedAnnotFS.readLine()
+    phaseRec = lineRec.splitWhitespace()
+    if debug and phaseRec.len != 4:
+      quit "len not 4 at " & lineRec & $phasedAnnotFS.atEnd()
+    if phaseRec[3] == "-1": continue
+    else: break
+  return phaseRec
+
+proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string, threads:int, add_header_string = """##sgcocaller_v0.1=phase""", chrom = "nil"):int = 
+
+  if debug: echo "ivcffile " & ivcfFile
+  if debug: echo "ovcfFile " & ovcfFile
+  if debug: echo "phasedAnnotFile " & phasedAnnotFile
+  
+  var ivcf: VCF
+  var ovcf: VCF
+  var v_off = 0
+  var current_block = 0
+  var phasedAnnotFS: FileStream
+  var phaseRec = newSeq[string](4)
+  var phasePos: int
+  var phasePhase: int
+  if not open(ivcf, ivcfFile, threads=threads):
+      quit "couldn't open input vcf file"
+  if not open(ovcf, ovcfFile, threads=threads, mode ="w"):
+    quit "couldn't open output vcf file"
+  try: 
+    phasedAnnotFS = openFileStream(phasedAnnotFile, fmRead)
+  except:
+    stderr.write getCurrentExceptionMsg()
+  ovcf.copy_header(ivcf.header)
+  discard ovcf.header.add_string(add_header_string)
+  discard ovcf.write_header()
+  discard phasedAnnotFS.readLine()
+  var gt_string:seq[int32]
+  var snpIndexInPhasedAnnotFile = 0
+  phaseRec = readNextPhased(phasedAnnotFS)
+  if debug: echo phasedAnnotFS.atEnd()
+  phasePos = parseInt(phaseRec[0])
+  phasePhase = parseInt(phaseRec[3])
+  var queryChrom = chrom
+  if queryChrom == "nil":
+    queryChrom = "*"
+  for v in ivcf.query(queryChrom):
+    if v.POS.cint != phasePos: continue
+    else:
+      var f = v.format()
+      if phasePhase == 0:
+        ## 0|1
+        #if debug: echo "block hap0 and this h0 is 0 0|1"
+        gt_string = @[int32(2),int32(5)]
+      elif phasePhase == 1:
+        ## 1|0
+        #if debug: echo "block hap0 and this h0 is 1 1|0"
+        gt_string = @[int32(4),int32(3)]
+      else:
+        if phasedAnnotFS.atEnd(): break 
+      if f.set("GT",gt_string) != Status.OK:
+        quit "set GT failed"
+      if not ovcf.write_variant(v) :
+        quit "write vcf failed for " & $voff
+      if phasedAnnotFS.atEnd(): break 
+      phaseRec = readNextPhased(phasedAnnotFS)
+      if debug:
+        if phaseRec.len != 4: 
+          echo " phaseRec.len " &  $phaseRec.len
+          echo " phaseRec" & $phaseRec
+      phasePos = parseInt(phaseRec[0])
+      phasePhase = parseInt(phaseRec[3])
+  phasedAnnotFS.close()
+  ovcf.close()
+  ivcf.close()
+  
+  return 0
+
+# let s_Chrs = map(toSeq(1..19), proc(x:int):string =  "chr" & $x)
+# for chrom in @["chr8"]:
+#   var ivcf = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT." & chrom & ".vcf.gz"
+#   var ovcf = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz"
+#   var swphasedSnpFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.txt"
+#   var gtMtxFile =  "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx"
+#   var phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt"
+#   var switchScoreFile ="data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_switch_score.txt" 
+#   var switchedPhasedAnnotVcfFile  = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz"
+#   discard correctPhase(gtMtxFile,phaseSnpAnnotFile,swphasedSnpFile,switchScoreFile)
+#   discard writePhaseToVCF(ivcf, ovcf, swphasedSnpFile,1)
+
+
diff --git a/sgcocaller.nim b/sgcocaller.nim
index 542c5ba..7969087 100755
--- a/sgcocaller.nim
+++ b/sgcocaller.nim
@@ -1,354 +1,289 @@
-
-## two modules (generate pair-wise fragments, and then run viterbi)
-
-
-import os
-import docopt
-import strutils
-import hts
-import tables
-import sequtils
-import private/utils
-import math
-import streams
-import private/graph
-import private/findpath
-import private/genotype_mtx
-import private/sgphase
-
-# proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTotalReads:int,
-#                   minTotalReads:int,  mapq: int,minbsq:int,mindp:int,barcodeTag:string, minsnpdepth:int):int = 
-#   var variantIndex = 1
-#   var cellFragmentsTable = newTable[string,Fragment]()
-
-#   echo "generated fragments from " & $barcodeTable.len & " single cells"
-#   var cellGtNodes: Table[string, GtNode]
-#   for rec in ivcf.query("*"): 
-#     cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,
-#                                   minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,
-#                                   minbsq=minbsq,barcodeTag=barcodeTag)
-#     cellGtNodes = callNodeGenotype(cellGtNodes,minCellDp = mindp, minSnpDepth = minsnpdepth, p0 = 0.3, p1 = 0.8)
-#     cellFragmentsTable = updateCellFragment(barcodedNodesTable=cellGtNodes,cellFragmentsTable = cellFragmentsTable, fmf_out = outfmf)              
-#     variantIndex = variantIndex + 1
-#   echo "processed " & $(variantIndex) & " variants"
-#   return 0
-
-# proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfile:string, threads:int):int =
-#   var cellGenoTable: TableRef[string,TableRef[string,int]]
-#   var cellBlockLeftPhaseTable,cellBlockRightPhaseTable :TableRef[string, seq[int]]
-#   var chr_blocks:seq[Block]
-#   var blockkseq:seq[Block_linkage]
-#   var phased_blocks: seq[int]
-#   echo "Phasing for hapfile " & block_hapfile
-#   cellGenoTable = newTable[string,TableRef[string,int]]()
-#   discard readFmf(fmf_file,cellGenoTable)
-#   chr_blocks = read_blocks(block_hapfile)
-#   cellBlockLeftPhaseTable = newTable[string, seq[int]]()
-#   cellBlockRightPhaseTable = newTable[string, seq[int]]()
-#   ## blocks at the ends with 20 SNPs or fewer, removed?
-#   ## blocks not at the ends with 20SNPs or fewer are not used for linking blocks but if their linkage with pre block and post-block is consistent, 
-#   ## these short blocks' haplotypes are retained. 
-#   ## 
-#   ## What if blocks cannot be linked ? the number of 11(00) == 10(01)
-#   echo "Total blocks " & $chr_blocks.len
-
-#   var contig_blocks:seq[int]
-#   for b_j, bl in chr_blocks:
-#     echo "block " & $b_j & " len: " & $bl.h0.len
-#     # echo "last 5" & $bl.h0[(high(bl.h0)-5)..high(bl.h0)]
-#     # echo "first 5" & $bl.h0[0..5]
-#     discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, true, cellBlockLeftPhaseTable)
-#     discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, false, cellBlockRightPhaseTable)
-#     contig_blocks = find_contig_blocks(chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable)
-#   # echo "blocks for contigs" & $contig_blocks
-#   blockkseq = build_contig(contig_blocks = contig_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable)
-#   for bk in blockkseq:
-#     echo "prevBlock" &  $bk.prevBlock & "nextBlock" &  $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 &  " 01 type " & $bk.linkageType_01 &  " switch posterior prob " & $bk.switch_posterior_prob  & " switch value :" & $bk.switch
-#   phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable)
-
-#   discard  write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads)
-#   return 0
-proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
-                ibam:Bam, out_dir:string, mapq:int, 
-                minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, 
-                thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string,phased:bool): int =
-  
-  var bulkBam = false
-  if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"):
-    echo "running in bulk mode and all reads are regarded as from one sample/cell"
-    bulkBam = true
-  else:
-    echo "running sgcoaller for " & $barcodeTable.len & " single cells"
-
-  let initProb:array[stateRef..stateAlt, float]=[0.5,0.5]
-  ## iterate through each selected chromosome
-  for chrom in s_Chrs:
-    var snpIndex, nnsize = 0
-    ## number of non zeros
-    var scSpermSeq:SeqSpermViNodes
-    ## matches with the order in barcodeTable
-    scSpermSeq.setLen(barcodeTable.len)
-    var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream
-
-    try:
-      outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite)
-      outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite)
-      outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite)
-      outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite)
-      viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite)
-    except:
-      stderr.write getCurrentExceptionMsg()
-    
-    ## write headers to those mtx files and the first line place holder for total_row total_column total_entry
-    let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
-    
-    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]:
-      outFileStream.writeLine(sparseMatrixHeader)
-      outFileStream.writeLine(' '.repeat(50))
-    
-    outFileSNPanno.writeLine(join(["POS","REF", "ALT"], sep = "\t"))
-    var ac = new_seq[int32](10)
-    for rec in ivcf.query(chrom):
-      if rec.ALT.len > 1 or rec.ALT.len == 0 : continue
-      ## alleleCountTable contains for this rec.POS, each cell barcode's allele counts 
-      var rec_alt = rec.ALT[0][0]
-      var rec_ref = rec.REF[0]
-      if phased:
-        var f = rec.format()
-        var gts = f.genotypes(ac)
-        if ac[0] == 4 and ac[1] == 3:
-          # gt is 1|0
-            rec_alt = rec.REF[0]
-            rec_ref = rec.ALT[0][0]
-          # if not gt == 0|1 continue
-        elif not (ac[0] == 2 and ac[1] == 5): continue
-      var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq,
-                                        maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag,
-                                        startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt =  rec_alt, rec_ref = rec_ref)
-      if alleleCountTable.len==0: continue
-
-      ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations
-      snpIndex += 1
-      outFileSNPanno.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") )
-      discard addViNode(barcodeTable = barcodeTable,  alleleCountTable = alleleCountTable,  scSpermSeq = scSpermSeq,
-               outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx  = outFileAltCountMtx, nnsize = nnsize,
-               mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec_pos=int(rec.POS),
-               initProb = initProb, cmPmb = cmPmb)
-    var posEnd: int64
-    var inferProb,reverseProb = 0.0
-    var currentEm: array[stateRef..stateAlt, float]
-    var lastNode: ViNode
-    var spermVNseq: SpermViNodes
-    var ithSNP: int
-
-    for ithSperm in 0..(scSpermSeq.len-1):
-      ## rightGap,leftGap = [0.0,0.0]
-      spermVNseq = scSpermSeq[ithSperm]
-      if spermVNseq.viNodeseq.len==0: continue
-      lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)]
-      currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt)
-      if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]:
-        scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef
-        ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1]
-        outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
-        inferProb  = currentEm[stateRef]
-        reverseProb = currentEm[stateAlt]
-      else:
-        scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt
-        ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1]
-        outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2")
-        inferProb  = currentEm[stateAlt]
-        reverseProb = currentEm[stateRef]
-      posEnd = lastNode.pos
-      ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the  relevant File Streams.
-      discard pathTrackBack(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm,  thetaRef = thetaRef, 
-                           thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx,
-                           viSegmentInfo = viSegmentInfo, posEnd = posEnd,  inferProb = inferProb,reverseProb = reverseProb)
-    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]:
-      outFileStream.setPosition(49)
-      outFileStream.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) 
-    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,outFileSNPanno,viSegmentInfo]:
-      outFileStream.close()
-  return 0
-
-when(isMainModule):
-  let version = "0.3.3"
-  var doc = format("""
-  $version
-
-  Usage:
-      sgcocaller gtMtx [options] <BAM> <VCF> <barcodeFile> <out_prefix>
-      sgcocaller phase [options] <gtMtxFile> <phaseOutputDir>
-      sgcocaller swphase [options] <gtMtxFile> <phaseSnpAnnotFile>
-      sgcocaller sxo [options] <phaseOutputDir> <out_prefix>
-      sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix>
-      
-
-
-Arguments:
-
-  <BAM> the read alignment file with records of single-cell DNA reads
-  
-  <VCF> the variant call file with records of SNPs
-
-  <barcodeFile> the text file containing the list of cell barcodes
-
-  <out_prefix>  the prefix of output files
-
-  <fmf> the fragment file from running sgcocaller fmf
-
-  <out_vcf> the output vcf aftering phasing blocks in hapfile
-
-
-Options:
-  -t --threads <threads>  number of BAM decompression threads [default: 4]
-  --barcodeTag <barcodeTag>  the cell barcode tag in BAM [default: CB]
-  --minMAPQ <mapq>  Minimum MAPQ for read filtering [default: 20]
-  --baseq <baseq>  base quality threshold for a base to be used for counting [default: 13]
-  --chrom <chrom>  the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms)
-  --minDP <minDP>  the minimum DP for a SNP to be included in the output file [default: 1]
-  --maxDP <maxDP>  the maximum DP for a SNP to be included in the output file [default: 5]
-  --maxTotalDP <maxTotalDP>  the maximum DP across all barcodes for a SNP to be included in the output file [default: 25]
-  --minTotalDP <minTotalDP>  the minimum DP across all barcodes for a SNP to be included in the output file [default: 10]
-  --minSNPdepth <minSNPdepth>  the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 1]
-  --thetaREF <thetaREF>  the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1]
-  --thetaALT <thetaALT>  the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
-  --cmPmb <cmPmb>  the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1]
-  --phased  the VCF contains the phased GT of heterozygous
-  -h --help  show help
-
-
-  Examples
-      ./sgcocaller gtMtx --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/cellfragment.fmf
-      ./sgcocaller phase gtMtxFile phaseOutputDir
-      ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp
-      ./sgcocaller sxo phaseOutputDir barcodeFile.tsv ./percell/ccsnp
-
-""" % ["version", version])
-
-  let args = docopt(doc, version=version)
-  
-  var
-    threads:int
-    selectedChrs:string
-    out_dir:string
-    mapq:int
-    minbsq:int
-    mintotal:int
-    maxtotal:int
-    mindp:int
-    maxdp:int
-    thetaREF:float
-    thetaALT:float
-    cmPmb:float
-    barcodeTag="CB"
-    minsnpdepth:int
-    barcodeFile,bamfile,vcff:string
-    vcfGtPhased = false
-  echo $args
-  threads = parse_int($args["--threads"])
-  barcodeTag = $args["--barcodeTag"]
-  mindp = parse_int($args["--minDP"])
-  maxdp = parse_int($args["--maxDP"])
-  maxtotal = parse_int($args["--maxTotalDP"])
-  mintotal = parse_int($args["--minTotalDP"])
-  minsnpdepth = parse_int($args["--minSNPdepth"])
-  mapq = parse_int($args["--minMAPQ"])
-  minbsq = parse_int($args["--baseq"])
-  thetaRef = parse_float($args["--thetaREF"])
-  thetaAlt = parse_float($args["--thetaALT"])
-  cmPmb = parse_float($args["--cmPmb"])
-  vcfGtPhased = parse_bool($args["--phased"])
-  
-  if args["gtMtx"] or args["xo"]:
-    var 
-      ibam:Bam
-      ivcf:VCF
-      s_Chrs: seq[string]
-    bamfile = $args["<BAM>"]
-    barcodeFile = $args["<barcodeFile>"]
-    out_dir = $args["<out_prefix>"]
-    if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_"
-    vcff = $args["<VCF>"]
-    if not open(ibam, bamfile, threads=threads, index = true):
-        quit "couldn't open input bam"
-    if not open(ivcf, vcff, threads=threads):
-        quit "couldn't open: vcf file"
-    if($args["--chrom"] != "nil"):
-      selectedChrs = $args["--chrom"]
-      s_Chrs = selectedChrs.split(',')
-    else:
-      ## find all chroms from headers of vcf file (Contigs)
-      let contigs = ivcf.contigs 
-      s_Chrs = map(contigs, proc(x: Contig): string = x.name)
-    var hf = hts.hts_open(cstring(barcodeFile), "r")
-    var barcodeTable =  newTable[string,int](initialSize = 1024)
-    var kstr: hts.kstring_t
-    kstr.l = 0
-    kstr.m = 0
-    kstr.s = nil
-    var ithSperm = 0
-    ## initiate the table with CB as keys, allele counts (ref object) as elements
-    while hts_getline(hf, cint(10), addr kstr) > 0:
-      if $kstr.s[0] == "#": continue
-      var v = $kstr.s
-      discard barcodeTable.hasKeyOrPut(v, ithSperm)
-      ithSperm.inc
-    discard hf.hts_close()
-    if args["gtMtx"]:
-      var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string
-      var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
-      for chrom in s_Chrs:
-        echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom
-        outGtMtxFile = out_dir & chrom & "_gtMtx.mtx"
-        outTotalCountMtxFile = out_dir & chrom &  "_totalMtx.mtx"
-        outAltCountMtxFile = out_dir & chrom &  "_altMtx.mtx"
-        try: 
-          outGtMtx = openFileStream(outGtMtxFile, fmWrite)
-          outSnpAnnot = openFileStream(out_dir & chrom &  "_snpAnnot.txt", fmWrite)
-          outTotalCountMtx = openFileStream(outTotalCountMtxFile, fmWrite)
-          outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite)
-        except:
-          stderr.write getCurrentExceptionMsg()
-
-        discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, 
-                         outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
-                         outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal,
-                         minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp,
-                         barcodeTag = barcodeTag, minsnpdepth=minsnpdepth)                    
-        for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]:
-          var imtx = readMtx(mtx_file = mtxFile)
-          discard sortWriteMtx(imtx, mtx_file = mtxFile)      
-    elif args["xo"]:
-      echo "running crossover calling from VCF and BAM file\n"
-      echo "running for these chromosomes as specified in the header of the provided VCF file for chroms: " & $s_Chrs
-      discard sgcocaller(threads, ivcf, barcodeTable, ibam,
-                      out_dir, mapq, minbsq, mintotal, 
-                      maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased)
-      var outFileTotalCountMtxFile, outFileAltCountMtxFile: string
-        ## sort entries in mtx files 
-      for chrom in s_Chrs:
-        outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
-        outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
-        for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
-          var imtx = readMtx(mtx_file = mtxFile)
-          discard sortWriteMtx(imtx, mtx_file = mtxFile)
-    ibam.close()
-    ivcf.close()
-  elif args["phase"]:
-    let gtMtxFile = $args["<gtMtxFile>"]
-    var phaseOutDir = $args["<phaseOutputDir>"]
-    if (phaseOutdir[^1] != '_') and (phaseOutdir[^1] != '/'): phaseOutdir = phaseOutdir & "_"
-    let snpAnnotFile = gtMtxFile.replace(sub = "_gtMtx.mtx", by = "_snpAnnot.txt")
-    echo "running phasing from single cell genotype matrix (one chromosome) " & gtMtxFile
-    echo "using the " & snpAnnotFile & "for generating phased haplotypes"
-    discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir)
-  elif args["swphase"]:
-    let gtMtxFile = $args["<gtMtxFile>"]
-    let phaseSnpAnnotFile = $args["<phaseSnpAnnotFile>"]
-    echo "finding snps positions to switch from single cell genotype matrix (one chromosome) " & gtMtxFile
-    echo "using the " & phaseSnpAnnotFile & "for generating final phased haplotypes"
-    discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir)
-
-
+
+import os
+import docopt
+import strutils
+import hts
+import tables
+import sequtils
+import private/utils
+import math
+import streams
+import private/graph
+import private/findpath
+import private/genotype_mtx
+import private/sgphase
+import private/write_out_vcf
+import private/correctPhase
+import private/sgcocaller_sxo
+
+
+let initProb:array[stateRef..stateAlt, float]=[0.5,0.5]
+
+proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
+                ibam:Bam, out_dir:string, mapq:int, 
+                minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, 
+                thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string,phased:bool): int =
+  
+  var bulkBam = false
+  if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"):
+    echo "running in bulk mode and all reads are regarded as from one sample/cell"
+    bulkBam = true
+  else:
+    echo "running sgcoaller for " & $barcodeTable.len & " single cells"
+
+  ## iterate through each selected chromosome
+  for chrom in s_Chrs:
+    var snpIndex, nnsize = 0
+    ## number of non zeros
+    var scSpermSeq:SeqSpermViNodes
+    ## matches with the order in barcodeTable
+    scSpermSeq.setLen(barcodeTable.len)
+    var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream
+    let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
+    try:
+      outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite)
+      outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite)
+      outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite)
+      outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite)
+      viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite)
+    except:
+      stderr.write getCurrentExceptionMsg()    
+    ## write headers to those mtx files and the first line place holder for total_row total_column total_entry    
+    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]:
+      outFileStream.writeLine(sparseMatrixHeader)
+      outFileStream.writeLine(' '.repeat(50))    
+    outFileSNPanno.writeLine(join(["POS","REF", "ALT"], sep = "\t"))
+    var ac = new_seq[int32](10)
+    for rec in ivcf.query(chrom):
+      if rec.ALT.len > 1 or rec.ALT.len == 0 : continue
+      ## alleleCountTable contains for this rec.POS, each cell barcode's allele counts 
+      var rec_alt = rec.ALT[0][0]
+      var rec_ref = rec.REF[0]
+      if phased:
+        var f = rec.format()
+        var gts = f.genotypes(ac)
+        if ac[0] == 4 and ac[1] == 3:
+          # gt is 1|0
+          rec_alt = rec.REF[0]
+          rec_ref = rec.ALT[0][0]
+          # if not gt == 0|1 continue
+        elif not (ac[0] == 2 and ac[1] == 5): continue
+      var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq,
+                                        maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag,
+                                        startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt =  rec_alt, rec_ref = rec_ref)
+      if alleleCountTable.len==0: continue
+
+      ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations
+      snpIndex += 1
+      outFileSNPanno.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") )
+      discard addViNode(barcodeTable = barcodeTable,  alleleCountTable = alleleCountTable,  scSpermSeq = scSpermSeq,
+               outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx  = outFileAltCountMtx, nnsize = nnsize,
+               mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec_pos=int(rec.POS),
+               initProb = initProb, cmPmb = cmPmb)
+    discard pathTrackBack(scSpermSeq = scSpermSeq, thetaRef = thetaRef,thetaAlt=thetaAlt,cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx,
+                          viSegmentInfo = viSegmentInfo )
+
+    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]:
+      outFileStream.setPosition(49)
+      outFileStream.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) 
+    for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,outFileSNPanno,viSegmentInfo]:
+      outFileStream.close()
+  return 0
+
+when(isMainModule):
+  let version = "0.3.3"
+  var doc = format("""
+  $version
+  Usage:
+      sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix> 
+      sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix> 
+      sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix>
+      sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix>
+      
+
+
+Arguments:
+
+  <BAM> the read alignment file with records of single-cell DNA reads
+  
+  <VCF> the variant call file with records of SNPs
+
+  <barcodeFile> the text file containing the list of cell barcodes
+
+  <out_prefix>  the prefix of output files
+
+  <fmf> the fragment file from running sgcocaller fmf
+
+  <out_vcf> the output vcf aftering phasing blocks in hapfile
+
+
+Options:
+  -t --threads <threads>  number of BAM decompression threads [default: 4]
+  --barcodeTag <barcodeTag>  the cell barcode tag in BAM [default: CB]
+  --minMAPQ <mapq>  Minimum MAPQ for read filtering [default: 20]
+  --baseq <baseq>  base quality threshold for a base to be used for counting [default: 13]
+  --chrom <chrom>  the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms)
+  --minDP <minDP>  the minimum DP for a SNP to be included in the output file [default: 1]
+  --maxDP <maxDP>  the maximum DP for a SNP to be included in the output file [default: 5]
+  --maxTotalDP <maxTotalDP>  the maximum DP across all barcodes for a SNP to be included in the output file [default: 25]
+  --minTotalDP <minTotalDP>  the minimum DP across all barcodes for a SNP to be included in the output file [default: 10]
+  --minSNPdepth <minSNPdepth>  the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 1]
+  --thetaREF <thetaREF>  the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1]
+  --thetaALT <thetaALT>  the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
+  --cmPmb <cmPmb>  the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1]
+  --phased  the input VCF for calling crossovers contains the phased GT of heterozygous SNPs
+  --outvcf  generate the output in vcf format (phase)  
+  --templateCell <templateCell>  the cell's genotype to be used a template cell, as the cell's index (0-starting) in the barcode file, default as not supplied [default: -1]
+  -h --help  show help
+
+
+  Examples
+      ./sgcocaller phase gtMtxFile phaseOutputPrefix
+      ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp
+      ./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp
+
+""" % ["version", version])
+
+  let args = docopt(doc, version=version)
+  
+  var
+    threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth:int
+    thetaREF,thetaALT,cmPmb:float
+    barcodeTag="CB"
+    out_dir,selectedChrs,barcodeFile,bamfile,vcff:string
+    vcfGtPhased = false
+    s_Chrs: seq[string]
+    outFileTotalCountMtxFile, outFileAltCountMtxFile: string
+
+  echo $args
+  threads = parse_int($args["--threads"])
+  barcodeTag = $args["--barcodeTag"]
+  mindp = parse_int($args["--minDP"])
+  maxdp = parse_int($args["--maxDP"])
+  maxtotal = parse_int($args["--maxTotalDP"])
+  mintotal = parse_int($args["--minTotalDP"])
+  minsnpdepth = parse_int($args["--minSNPdepth"])
+  mapq = parse_int($args["--minMAPQ"])
+  minbsq = parse_int($args["--baseq"])
+  thetaRef = parse_float($args["--thetaREF"])
+  thetaAlt = parse_float($args["--thetaALT"])
+  cmPmb = parse_float($args["--cmPmb"])
+  vcfGtPhased = parse_bool($args["--phased"])
+  out_dir = $args["<out_prefix>"]
+  if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_"
+
+  if args["phase"] or args["xo"] or args["sxo"]:
+    barcodeFile = $args["<barcodeFile>"]
+    var hf = hts.hts_open(cstring(barcodeFile), "r")
+    var barcodeTable =  newTable[string,int](initialSize = 1024)
+    var kstr: hts.kstring_t
+    kstr.l = 0
+    kstr.m = 0
+    kstr.s = nil
+    var ithSperm = 0
+    ## initiate the table with CB as keys, gamete index as elements
+    while hts_getline(hf, cint(10), addr kstr) > 0:
+      if $kstr.s[0] == "#": continue
+      var v = $kstr.s
+      discard barcodeTable.hasKeyOrPut(v, ithSperm)
+      ithSperm.inc
+    discard hf.hts_close()
+    if args["phase"] or args["xo"]:
+      var 
+        ibam:Bam
+        ivcf:VCF
+      bamfile = $args["<BAM>"]
+      vcff = $args["<VCF>"]
+      if($args["--chrom"] != "nil"):
+        selectedChrs = $args["--chrom"]
+        s_Chrs = selectedChrs.split(',')
+      else:
+        ## find all chroms from headers of vcf file (Contigs)
+        let contigs = ivcf.contigs 
+        s_Chrs = map(contigs, proc(x: Contig): string = x.name)      
+      if not open(ibam, bamfile, threads=threads, index = true):
+          quit "couldn't open input bam"
+      if not open(ivcf, vcff, threads=threads):
+          quit "couldn't open: vcf file"
+      if args["phase"]:
+        var templateCell = parse_int($args["--templateCell"])     
+        var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string
+        var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
+        for chrom in s_Chrs:
+          echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom
+          outGtMtxFile = out_dir & chrom & "_gtMtx.mtx"
+          outTotalCountMtxFile = out_dir & chrom &  "_totalMtx.mtx"
+          outAltCountMtxFile = out_dir & chrom &  "_altMtx.mtx"
+          outSnpAnnotFile = out_dir & chrom &  "_snpAnnot.txt"
+          outphasedSnpAnnotFile = out_dir & chrom &  "_phased_snpAnnot.txt"
+          outdiagnosticDataframeFile = out_dir & chrom &  "_cellGenoVersusTemplate.txt"
+          try: 
+            outGtMtx = openFileStream(outGtMtxFile, fmWrite)
+            outSnpAnnot = openFileStream(outSnpAnnotFile, fmWrite)
+            outTotalCountMtx = openFileStream(outTotalCountMtxFile, fmWrite)
+            outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite)
+          except:
+            stderr.write getCurrentExceptionMsg()
+          discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, 
+                          outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
+                          outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal,
+                          minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp,
+                          barcodeTag = barcodeTag, minsnpdepth=minsnpdepth, chrom = chrom)                    
+          for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]:
+            var imtx = readMtx(mtx_file = mtxFile)
+            discard sortWriteMtx(imtx, mtx_file = mtxFile) 
+          echo "running phasing from single cell genotype matrix (one chromosome) " & outGtMtxFile
+          echo "using the " & outSnpAnnotFile & "for generating phased haplotypes"
+          discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile, diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell)
+          if parse_bool($args["--outvcf"]):
+            var outvcfFile = out_dir & chrom &  "_phased_snpAnnot.vcf.gz"   
+            discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads)    
+      elif args["xo"]:
+        echo "running crossover calling from VCF and BAM file\n"
+        echo "running for these chromosomes: " & $s_Chrs
+        discard sgcocaller(threads, ivcf, barcodeTable, ibam,
+                        out_dir, mapq, minbsq, mintotal, 
+                        maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased)
+        ## sort entries in mtx files 
+        for chrom in s_Chrs:
+          outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
+          outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
+          for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
+            var imtx = readMtx(mtx_file = mtxFile)
+            discard sortWriteMtx(imtx, mtx_file = mtxFile)
+      ibam.close()
+      ivcf.close()
+    elif args["sxo"]:
+      if($args["--chrom"] != "nil"):
+        selectedChrs = $args["--chrom"]
+        s_Chrs = selectedChrs.split(',')
+      else:
+        quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes using comma as separator"
+      echo "running crossover calling from genotype and allele count matrices generate from sgcocaller phase/swphase "
+      echo "running for these chromosomes : " & $s_Chrs
+      let phase_dir = $args["<phaseOutputPrefix>"]
+      let phasedSnpAnnotFile = $args["<SNPPhaseFile>"]
+      discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, phasedSnpAnnotFileName = phasedSnpAnnotFile)
+      for chrom in s_Chrs:
+        outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
+        outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
+        for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
+          var imtx = readMtx(mtx_file = mtxFile)
+          discard sortWriteMtx(imtx, mtx_file = mtxFile)
+  elif args["swphase"]:
+    echo "find switch spots and generate corrected phase"
+    let gtMtxFile =  $args["<gtMtxFile>"]
+    let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"]
+    let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt"
+    let switchScoreFile = out_dir & "switch_score.txt" 
+    let switchedPhasedAnnotVcfFile  = out_dir & "corrected_phased_snpAnnot.vcf.gz"
+    discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile)
+    if ($args["--chrom"] == "nil"):
+      echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option"
+    
+    discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"])
+
+
+
+
  
\ No newline at end of file
-- 
GitLab