From 32741fbf4dfbf9f84c2bb6b8e409a7ca58e52b05 Mon Sep 17 00:00:00 2001
From: rlyu <rlyu@svi.edu.au>
Date: Tue, 21 Dec 2021 16:42:15 +1100
Subject: [PATCH] v0.3.4 (new option binsize)

---
 sgcocaller.nimble               |  2 +-
 src/sgcocaller.nim              | 10 +++++++---
 src/sgcocaller/correctPhase.nim | 15 +++++++--------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/sgcocaller.nimble b/sgcocaller.nimble
index 3e27492..3fd3c84 100644
--- a/sgcocaller.nimble
+++ b/sgcocaller.nimble
@@ -1,6 +1,6 @@
 # Package
 
-version       = "0.3.4"
+version       = "0.3.5"
 author        = "Ruqian Lyu"
 description   = "sgcocaller: calling crossovers from single-gamete DNA sequencing datasets"
 license       = "MIT"
diff --git a/src/sgcocaller.nim b/src/sgcocaller.nim
index 3907994..a4f9b21 100755
--- a/src/sgcocaller.nim
+++ b/src/sgcocaller.nim
@@ -91,7 +91,7 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
   return 0
 
 when(isMainModule):
-  let version = "0.3.4"
+  let version = "0.3.5"
   var doc = format("""
   $version
   Usage:
@@ -140,6 +140,8 @@ Options:
   --lookBeyondSnps <lookBeyondSnps>  the number of local SNPs to use when finding switch positions [default: 25]
   --minSwitchScore <minSwitchScore>  the minimum switch score for a site to be identified as having a switch error in the inferred haplotype  [default: 50.0]
   --minPositiveSwitchScores <minPositiveSwitchScores>  the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8]  
+  --binSize <binSize>  the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default 2000]
+  --stepSize <stepSize>  the move step size used in combination with --binSize. [default 200]
   -h --help  show help
 
 
@@ -153,7 +155,7 @@ Options:
   let args = docopt(doc, version=version)
   
   var
-    threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores:int
+    threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize:int
     thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore:float
     barcodeTag="CB"
     out_dir,selectedChrs,barcodeFile,bamfile,vcff:string
@@ -173,6 +175,8 @@ Options:
   minbsq = parse_int($args["--baseq"])
   maxExpand = parse_int($args["--maxExpand"])
   lookBeyondSnps = parse_int($args["--lookBeyondSnps"])
+  binSize = parse_int($args["--binSize"])
+  stepSize = parse_int($args["--stepSize"])
   minPositiveSwitchScores =  parse_int($args["--minPositiveSwitchScores"])
   thetaRef = parse_float($args["--thetaREF"])
   thetaAlt = parse_float($args["--thetaALT"])
@@ -291,7 +295,7 @@ Options:
     let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt"
     let switchScoreFile = out_dir & "switch_score.txt" 
     let switchedPhasedAnnotVcfFile  = out_dir & "corrected_phased_snpAnnot.vcf.gz"
-    discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores)
+    discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize)
     if ($args["--chrom"] == "nil"):
       echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option"
     
diff --git a/src/sgcocaller/correctPhase.nim b/src/sgcocaller/correctPhase.nim
index ce226df..f76660f 100755
--- a/src/sgcocaller/correctPhase.nim
+++ b/src/sgcocaller/correctPhase.nim
@@ -6,8 +6,8 @@ import streams
 # bins of SNP indexes
 import strutils
 
-let binSize = 2000
-let movingStep = 200
+# let binSize = 2000
+# let movingStep = 200
 let dissimThresh = 0.0099 
 #let lookBeyondSnps = 25
 let debug = false
@@ -47,7 +47,7 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]):
     if debug: echo "bin had many crossovers : " & $nxcells
     return true
 
-proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = 
+proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] = 
   #let ncells = gtMtxByCell[0].len
   let nSnps = gtMtxByCell.len
   let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
@@ -65,8 +65,7 @@ proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]
     if (hasCrossover(templ_geno =  fullGeno[snpPosStart..snpPosEnd], 
                      cell_geno =  gtMtxByCell[snpPosStart..snpPosEnd] )):
       highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd))
-
-    
+  
   # add a last bin as from end of chrom with length binSize
   let lastBinStart = nSnps - binSize
   let lastBinEnd = nSnps - 1
@@ -236,7 +235,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph
   phaseSnpAnnotFileFS.close()
   return 0
 
-proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int): int = 
+proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int): int = 
   var gtMtxByCell: seq[seq[BinaryGeno]]
   var currentEntrySeq: seq[string]
   var currentEntry:seq[int]
@@ -265,8 +264,8 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
   gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
   discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true)
   var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps )
-  var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell)
-  echo "riskySnps.len " & $riskySnps.len
+  var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize)
+  echo "riskySnps.length " & $riskySnps.len
   var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps)
   let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores)
   switchScoreFileStream.writeLine("#" & $switchSites)
-- 
GitLab