From 32741fbf4dfbf9f84c2bb6b8e409a7ca58e52b05 Mon Sep 17 00:00:00 2001 From: rlyu <rlyu@svi.edu.au> Date: Tue, 21 Dec 2021 16:42:15 +1100 Subject: [PATCH] v0.3.4 (new option binsize) --- sgcocaller.nimble | 2 +- src/sgcocaller.nim | 10 +++++++--- src/sgcocaller/correctPhase.nim | 15 +++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sgcocaller.nimble b/sgcocaller.nimble index 3e27492..3fd3c84 100644 --- a/sgcocaller.nimble +++ b/sgcocaller.nimble @@ -1,6 +1,6 @@ # Package -version = "0.3.4" +version = "0.3.5" author = "Ruqian Lyu" description = "sgcocaller: calling crossovers from single-gamete DNA sequencing datasets" license = "MIT" diff --git a/src/sgcocaller.nim b/src/sgcocaller.nim index 3907994..a4f9b21 100755 --- a/src/sgcocaller.nim +++ b/src/sgcocaller.nim @@ -91,7 +91,7 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, return 0 when(isMainModule): - let version = "0.3.4" + let version = "0.3.5" var doc = format(""" $version Usage: @@ -140,6 +140,8 @@ Options: --lookBeyondSnps <lookBeyondSnps> the number of local SNPs to use when finding switch positions [default: 25] --minSwitchScore <minSwitchScore> the minimum switch score for a site to be identified as having a switch error in the inferred haplotype [default: 50.0] --minPositiveSwitchScores <minPositiveSwitchScores> the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8] + --binSize <binSize> the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default 2000] + --stepSize <stepSize> the move step size used in combination with --binSize. [default 200] -h --help show help @@ -153,7 +155,7 @@ Options: let args = docopt(doc, version=version) var - threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores:int + threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize:int thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore:float barcodeTag="CB" out_dir,selectedChrs,barcodeFile,bamfile,vcff:string @@ -173,6 +175,8 @@ Options: minbsq = parse_int($args["--baseq"]) maxExpand = parse_int($args["--maxExpand"]) lookBeyondSnps = parse_int($args["--lookBeyondSnps"]) + binSize = parse_int($args["--binSize"]) + stepSize = parse_int($args["--stepSize"]) minPositiveSwitchScores = parse_int($args["--minPositiveSwitchScores"]) thetaRef = parse_float($args["--thetaREF"]) thetaAlt = parse_float($args["--thetaALT"]) @@ -291,7 +295,7 @@ Options: let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt" let switchScoreFile = out_dir & "switch_score.txt" let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz" - discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores) + discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize) if ($args["--chrom"] == "nil"): echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option" diff --git a/src/sgcocaller/correctPhase.nim b/src/sgcocaller/correctPhase.nim index ce226df..f76660f 100755 --- a/src/sgcocaller/correctPhase.nim +++ b/src/sgcocaller/correctPhase.nim @@ -6,8 +6,8 @@ import streams # bins of SNP indexes import strutils -let binSize = 2000 -let movingStep = 200 +# let binSize = 2000 +# let movingStep = 200 let dissimThresh = 0.0099 #let lookBeyondSnps = 25 let debug = false @@ -47,7 +47,7 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): if debug: echo "bin had many crossovers : " & $nxcells return true -proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = +proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] = #let ncells = gtMtxByCell[0].len let nSnps = gtMtxByCell.len let nBins = (int)(ceil(nSnps / (binSize - movingStep))) @@ -65,8 +65,7 @@ proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]] if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )): highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd)) - - + # add a last bin as from end of chrom with length binSize let lastBinStart = nSnps - binSize let lastBinEnd = nSnps - 1 @@ -236,7 +235,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph phaseSnpAnnotFileFS.close() return 0 -proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int): int = +proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int): int = var gtMtxByCell: seq[seq[BinaryGeno]] var currentEntrySeq: seq[string] var currentEntry:seq[int] @@ -265,8 +264,8 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true) var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps ) - var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell) - echo "riskySnps.len " & $riskySnps.len + var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize) + echo "riskySnps.length " & $riskySnps.len var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps) let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores) switchScoreFileStream.writeLine("#" & $switchSites) -- GitLab