diff --git a/src/sgcocaller.nim b/src/sgcocaller.nim index b6f7529476fac25973bdb959a675e12e684f004b..77942269236828ee62ba4d25f394db7a54807b5e 100755 --- a/src/sgcocaller.nim +++ b/src/sgcocaller.nim @@ -15,7 +15,7 @@ import sgcocaller/sgphase import sgcocaller/writeVCF import sgcocaller/correctPhase import sgcocaller/sgcocaller_sxo - +import times let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] @@ -26,18 +26,18 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, var bulkBam = false if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"): - echo "running in bulk mode and all reads are regarded as from one sample/cell" + echo $now() & " running in bulk mode and all reads are regarded as from one sample/cell" bulkBam = true else: - echo "running sgcoaller xo for " & $barcodeTable.len & " single cells" + echo $now() & " running sgcoaller xo for " & $barcodeTable.len & " single cells" ## iterate through each selected chromosome for chrom in s_Chrs: var snpIndex, nnsize = 0 ## number of non zeros - var scSpermSeq:SeqSpermViNodes + var scSpermSeq = newTable[int,SpermViNodes]() ## matches with the order in barcodeTable - scSpermSeq.setLen(barcodeTable.len) + ## scSpermSeq.setLen(barcodeTable.len) var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" try: @@ -91,17 +91,17 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, return 0 when(isMainModule): - let version = "0.3.7" + let version = "0.3.9" var doc = format(""" $version Usage: + sgcocaller autophase [options] <BAM> <VCF> <barcodeFile> <out_prefix> sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix> sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix> sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix> sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> - Arguments: <BAM> the read alignment file with records of single-cell DNA reads @@ -112,10 +112,11 @@ Arguments: <out_prefix> the prefix of output files - <fmf> the fragment file from running sgcocaller fmf - <out_vcf> the output vcf aftering phasing blocks in hapfile + + <gtMtxFile> the output gtMtx.mtx file from running sgcocaller phase + <phasedSnpAnnotFile> the output phased_snpAnnot.txt from running sgcocaller phase Options: -t --threads <threads> number of BAM decompression threads [default: 4] @@ -142,28 +143,34 @@ Options: --minPositiveSwitchScores <minPositiveSwitchScores> the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8] --binSize <binSize> the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default: 2000] --stepSize <stepSize> the move step size used in combination with --binSize. [default: 200] + --dissimThresh <dissimThresh> the threshold used on the allele concordance ratio for determining if a SNP bin contains a crossover. [default: 0.0099] + --batchSize <batchSize> the number of cells to process in one batch when running sxo. This option is only needed when the memory is limited. + --notSortMtx do not sort the output mtx. + --maxUseNcells <maxUseNcells> the number of cells to use for calculating switch scores. All cells are used if not set -h --help show help Examples - ./sgcocaller phase gtMtxFile phaseOutputPrefix + ./sgcocaller autophase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix + ./sgcocaller phase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix ./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp - ./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp + ./sgcocaller sxo snp_phase.txt phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp """ % ["version", version]) let args = docopt(doc, version=version) var - threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize:int - thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore:float + threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize,batchSize,maxUseNcells:int + thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore,dissimThresh:float barcodeTag="CB" out_dir,selectedChrs,barcodeFile,bamfile,vcff:string - vcfGtPhased = false + vcfGtPhased,notSortMtx = false s_Chrs: seq[string] outFileTotalCountMtxFile, outFileAltCountMtxFile: string - echo $args + echo $now() & $args + echo $now() & " running sgcocaller " & $version threads = parse_int($args["--threads"]) barcodeTag = $args["--barcodeTag"] mindp = parse_int($args["--minDP"]) @@ -177,18 +184,33 @@ Options: lookBeyondSnps = parse_int($args["--lookBeyondSnps"]) binSize = parse_int($args["--binSize"]) stepSize = parse_int($args["--stepSize"]) + dissimThresh = parse_float($args["--dissimThresh"]) minPositiveSwitchScores = parse_int($args["--minPositiveSwitchScores"]) thetaRef = parse_float($args["--thetaREF"]) thetaAlt = parse_float($args["--thetaALT"]) posteriorProbMin = parse_float($args["--posteriorProbMin"]) maxDissim = parse_float($args["--maxDissim"]) minSwitchScore = parse_float($args["--minSwitchScore"]) + if $args["--maxUseNcells"] != "nil": maxUseNcells = parse_int($args["--maxUseNcells"]) cmPmb = parse_float($args["--cmPmb"]) vcfGtPhased = parse_bool($args["--phased"]) + notSortMtx = parse_bool($args["--notSortMtx"]) out_dir = $args["<out_prefix>"] if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_" - - if args["phase"] or args["xo"] or args["sxo"]: + if $args["--batchSize"] != "nil": + batchSize = parse_int($args["--batchSize"]) + else: + batchSize = 0 + var run_phase, run_swphase, run_xo, run_sxo, run_auto_phase = false + run_phase = args["phase"] + run_swphase = args["swphase"] + run_xo = args["xo"] + run_sxo = args["sxo"] + run_auto_phase = args["autophase"] + if run_auto_phase: + run_phase = true + run_swphase = true + if run_phase or run_xo or run_sxo: barcodeFile = $args["<barcodeFile>"] var hf = hts.hts_open(cstring(barcodeFile), "r") var barcodeTable = newTable[string,int](initialSize = 1024) @@ -204,7 +226,7 @@ Options: discard barcodeTable.hasKeyOrPut(v, ithSperm) ithSperm.inc discard hf.hts_close() - if args["phase"] or args["xo"]: + if run_phase or run_xo: var ibam:Bam ivcf:VCF @@ -221,12 +243,13 @@ Options: quit "couldn't open input bam" if not open(ivcf, vcff, threads=threads): quit "couldn't open: vcf file" - if args["phase"]: + if run_phase: + echo $now() & " running phase" var templateCell = parse_int($args["--templateCell"]) var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream for chrom in s_Chrs: - echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom + echo $now() & " generating genotype sparse matrix file to " & out_dir & "for chr: " & chrom outGtMtxFile = out_dir & chrom & "_gtMtx.mtx" outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx" outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx" @@ -240,66 +263,80 @@ Options: outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite) except: stderr.write getCurrentExceptionMsg() - discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, - outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, - outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, - minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp, - barcodeTag = barcodeTag, minsnpdepth=minsnpdepth, chrom = chrom) - for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) - echo "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile - echo "using the " & outSnpAnnotFile & " for generating phased haplotypes" + discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp = maxdp,barcodeTag = barcodeTag, minsnpdepth = minsnpdepth, chrom = chrom) + if not notSortMtx: + for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + echo $now() & "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile + echo $now() & "using the " & outSnpAnnotFile & " for generating phased haplotypes" discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile, diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell, maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim) - if parse_bool($args["--outvcf"]): - var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz" + if parse_bool($args["--outvcf"]) and (not run_swphase): + var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz" + echo $now() & " write phased haplotypes to VCF " & outvcfFile discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads) - elif args["xo"]: - echo "running crossover calling from VCF and BAM file\n" - echo "running for these chromosomes: " & $s_Chrs - discard sgcocaller(threads, ivcf, barcodeTable, ibam, - out_dir, mapq, minbsq, mintotal, - maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) + if run_swphase: + echo $now() & " running swphase to find switch spots and generate corrected phase" + let switchedPhasedAnnotFile = out_dir & chrom & "_corrected_phased_snpAnnot.txt" + let switchScoreFile = out_dir & chrom & "_switch_score.txt" + let switchedPhasedAnnotVcfFile = out_dir & chrom & "_corrected_phased_snpAnnot.vcf.gz" + echo $now() & "scanning with swphase to find switch spots and generate corrected phase" + discard correctPhase(outGtMtxFile, outphasedSnpAnnotFile, switchedPhasedAnnotFile, switchScoreFile, lookBeyondSnps = lookBeyondSnps, minSwitchScore = minSwitchScore, + minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize,dissimThresh = dissimThresh, maxUseNcells = maxUseNcells ) + echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile + discard writePhaseToVCF(vcff, switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = chrom) + elif run_xo: + echo $now() & " running crossover calling from VCF and BAM file\n" + echo $now() & " running for these chromosomes: " & $s_Chrs + discard sgcocaller(threads, ivcf, barcodeTable, ibam, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs, barcodeTag, phased = vcfGtPhased) ## sort entries in mtx files - for chrom in s_Chrs: - outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" - outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" - for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) + if not notSortMtx: + for chrom in s_Chrs: + outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" + outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" + for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) ibam.close() ivcf.close() - elif args["sxo"]: + elif run_sxo: if($args["--chrom"] != "nil"): selectedChrs = $args["--chrom"] s_Chrs = selectedChrs.split(',') else: quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator" - echo "running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase " - echo "running for these chromosomes : " & $s_Chrs + echo $now() & " running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase " + echo $now() & " running for these chromosomes : " & $s_Chrs let phase_dir = $args["<phaseOutputPrefix>"] let phasedSnpAnnotFile = $args["<SNPPhaseFile>"] - discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, phasedSnpAnnotFileName = phasedSnpAnnotFile) - for chrom in s_Chrs: - outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" - outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" - for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) - elif args["swphase"]: - echo "running swphase to find switch spots and generate corrected phase" + if (batchSize == 0) or (batchSize >= barcodeTable.len): + batchSize = barcodeTable.len + discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir, + thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, + phasedSnpAnnotFileName = phasedSnpAnnotFile,batchSize=batchSize) + if not notSortMtx: + for chrom in s_Chrs: + outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" + outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" + for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + elif run_swphase: + echo $now() & " running swphase to find switch spots and generate corrected phase" let gtMtxFile = $args["<gtMtxFile>"] let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"] let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt" let switchScoreFile = out_dir & "switch_score.txt" let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz" - discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize) + echo $now() & " scanning with swphase to find switch spots and generate corrected phase" + discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, + minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize, dissimThresh =dissimThresh,maxUseNcells = maxUseNcells) if ($args["--chrom"] == "nil"): - echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option" - - discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"]) + echo $now() & " assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option" + echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile + discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = $args["--chrom"]) diff --git a/src/sgcocaller/correctPhase.nim b/src/sgcocaller/correctPhase.nim index d5046d19fff22cb692905c593afc891ba4827c69..86a3b2316eb63858cac7cb2cb2cdc39978bdc3a4 100755 --- a/src/sgcocaller/correctPhase.nim +++ b/src/sgcocaller/correctPhase.nim @@ -4,13 +4,16 @@ import sequtils import math import streams import strutils +import times + # let binSize = 2000 # let movingStep = 200 -let dissimThresh = 0.0099 +#let dissimThresh = 0.0099 #let lookBeyondSnps = 25 let debug = false let switchPrior = 0.5 +#let maxUseNcells = 100 type switchScoreTuple = tuple switch_scores: seq[float] @@ -24,7 +27,7 @@ type ## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin ## cell_geno snp by cell -proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = +proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]], dissimThresh: float): bool = let ncells = cell_geno[0].len var ncompared = newSeq[int](ncells) var nmatch = newSeq[int](ncells) @@ -46,33 +49,36 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): # if debug: echo "bin had many crossovers : " & $nxcells return true -proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] = +proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int, dissimThresh:float): seq[int] = #let ncells = gtMtxByCell[0].len let nSnps = gtMtxByCell.len let nBins = (int)(ceil(nSnps / (binSize - movingStep))) -# if debug: echo "nBins: " & $nBins +# echo "nBins: " & $nBins let binIds = toSeq(0..(nBins-1)) var snpPosStart,snpPosEnd: int var highRiskSnps: seq[int] for binI in binIds: -# if debug: echo "binI " & $binI +# echo "binI " & $binI snpPosStart = (binI)*(binSize - movingStep) if (snpPosStart + binSize) > (nSnps-1): snpPosEnd = (nSnps-1) else: snpPosEnd = snpPosStart + binSize if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], - cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )): + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd], + dissimThresh = dissimThresh )): highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd)) # add a last bin as from end of chrom with length binSize let lastBinStart = nSnps - binSize - let lastBinEnd = nSnps - 1 - if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], - cell_geno = gtMtxByCell[snpPosStart..snpPosEnd])): - highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd)) - + if lastBinStart > 0: + let lastBinEnd = nSnps - 1 + if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd], + dissimThresh = dissimThresh)): + highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd)) highRiskSnps = deduplicate(highRiskSnps) +# echo highRiskSnps return highRiskSnps proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] = @@ -81,7 +87,7 @@ proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[Bi var currentEntrySeq: seq[string] while not phasedSnpAnnotFileStream.atEnd(): currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace() - fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01") + fullGeno[i] = int8(parseInt(currentEntrySeq[3])).toBinaryGeno(format = "01") i.inc if i != nSnps : quit "phased snpAnnot file does not have the same number of rows with gtMtx" @@ -126,7 +132,7 @@ proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex]) -proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25): switchScoreTuple = +proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25, maxUseNcells:int): switchScoreTuple = var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int let nSnps = gtMtxByCell.len let ncells = gtMtxByCell[0].len @@ -134,11 +140,17 @@ proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullG var prob_switch, prob_nonsw: float var leftIndex,rightIndex, switch_score_snpIndex: seq[int] var swscore = newSeq[float](nSnps) + var useCells = newSeq[int]() + if (ncells <= maxUseNcells) or (maxUseNcells==0): + useCells = (0..(ncells-1)).toSeq() + else: + let everyN = (int)floor(ncells/maxUseNcells) + useCells = map(toSeq(0..(maxUseNcells-1)),proc(x:int):int = (x * everyN)) for rsnpi in riskySnps: prob_switch = 0.0 prob_nonsw = 0.0 if fullGeno[rsnpi] == gUnknown: continue - for celli in 0..(ncells-1): + for celli in useCells: # if celli == 2: continue leftIndex = newSeq[int]() rightIndex = newSeq[int]() @@ -234,7 +246,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph phaseSnpAnnotFileFS.close() return 0 -proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int): int = +proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int, dissimThresh:float, maxUseNcells:int): int = var gtMtxByCell: seq[seq[BinaryGeno]] var currentEntrySeq: seq[string] var currentEntry:seq[int] @@ -247,25 +259,26 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn switchScoreFileStream = openFileStream(switchScoreFile, fmWrite) except: stderr.write getCurrentExceptionMsg() - echo "reading gtMtx by cell" + echo $now() & "reading gtMtx by cell" discard gtMtxFileStream.readLine() discard phaseSnpAnnotFileStream.readLine() #N, i,j currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) nSnps = currentEntry[0] - echo "nSnps " & $nSnps + echo "nSnps: " & $nSnps ncells = currentEntry[1] - echo "ncells " & $ncells + echo "ncells: " & $ncells echo "totalEntries " & $ currentEntry[2] ## gtMtx is cell by Snp format gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true) var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps ) - var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize) - echo "riskySnps.length " & $riskySnps.len - var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps) + var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize, dissimThresh = dissimThresh) + echo $now() & "riskySnps.length " & $riskySnps.len + var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell = gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps, maxUseNcells= maxUseNcells) let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores) + echo "see switch_sore.txt file, and corrected switch errors found at positions: " & $switchSites switchScoreFileStream.writeLine("#" & $switchSites) for snpi,score in switchScoresT.switch_scores: if switchScoresT.switch_scores_snpIndex.find(snpi)>=0: diff --git a/src/sgcocaller/findGtNodes.nim b/src/sgcocaller/findGtNodes.nim index 8885112ed7f81c7aadf0e9802d265f08b467e627..ca3c146fcf08d7a8d866bf0e381fc1872c154b9f 100755 --- a/src/sgcocaller/findGtNodes.nim +++ b/src/sgcocaller/findGtNodes.nim @@ -31,7 +31,7 @@ proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, m if not barcodeTable.hasKey(currentCB): continue base_off = get_base_offset(position = rec.POS.cint, align = aln) - base = aln.base_at(base_off) + base = aln.base_at(base_off) if aln.base_quality_at(base_off).cint < minbsq: continue total_reads+=1 diff --git a/src/sgcocaller/findPath.nim b/src/sgcocaller/findPath.nim index 4b5ba6e10199d6d2c936d0287d31cbfb6eb12954..0a3013e8b94c3e4f190cbc388fff846e15e10302 100755 --- a/src/sgcocaller/findPath.nim +++ b/src/sgcocaller/findPath.nim @@ -101,26 +101,25 @@ proc pathTrackBackIthSperm(currentSperm: var SpermViNodes, inferProb = currentEm[stateAlt]+math.ln(transProb) reverseProb = currentEm[stateRef]+math.ln(1-transProb) viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") ) - return 0 -proc pathTrackBack*(scSpermSeq: var SeqSpermViNodes, +proc pathTrackBack*(scSpermSeq: TableRef[int,SpermViNodes], thetaRef: float, thetaAlt: float, cmPmb: float, outFileVStateMtx: var FileStream, viSegmentInfo: var FileStream): int = - var posEnd: int64 var inferProb,reverseProb = 0.0 var currentEm: array[stateRef..stateAlt, float] var lastNode: ViNode - var spermVNseq: SpermViNodes +# var spermVNseq: SpermViNodes +# 0..(scSpermSeq.len-1) +# rightGap,leftGap = [0.0,0.0] +# spermVNseq = scSpermSeq[ithSperm] var ithSNP: int - - for ithSperm in 0..(scSpermSeq.len-1): - ## rightGap,leftGap = [0.0,0.0] - spermVNseq = scSpermSeq[ithSperm] - if spermVNseq.viNodeseq.len==0: continue + for ithSperm,spermVNseq in scSpermSeq: + if spermVNseq.viNodeseq.len==0: + continue lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt) if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: diff --git a/src/sgcocaller/getGtMtx.nim b/src/sgcocaller/getGtMtx.nim index 90816bc2425a67e962ab05d70598d82062774a49..7d4e84cbc12c2c365ddff2c11cae065ac9884b95 100755 --- a/src/sgcocaller/getGtMtx.nim +++ b/src/sgcocaller/getGtMtx.nim @@ -30,9 +30,7 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o for rec in ivcf.query(chrom): var rec_alt = rec.ALT[0][0] var rec_ref = rec.REF[0] - cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, - minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, - minbsq=minbsq,barcodeTag=barcodeTag) + cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,minbsq=minbsq,barcodeTag=barcodeTag) cellGtNodeCopy = cellGtNodes for cellbarcode in cellGtNodeCopy.keys: cellDP = cellGtNodes[cellbarcode].alleles.len diff --git a/src/sgcocaller/graph.nim b/src/sgcocaller/graph.nim index 215e1e64d11fef79e6eddf6d880bc3e340315066..83e88e63f76a2d214006ffbf9140bc8d5831d795 100755 --- a/src/sgcocaller/graph.nim +++ b/src/sgcocaller/graph.nim @@ -13,10 +13,11 @@ type snpIndexLookUp*: Table[int,int] ## look up from current Sperm SNP index to SNPIndex spermSnpIndexLookUp*: Table[int,int] - SeqSpermViNodes* = seq[SpermViNodes] +# SeqSpermViNodes* = seq[SpermViNodes] -proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int = - if scSpermSeq[ithSperm].viNodeseq.len==0: +proc addViNodeIthSperm*(scSpermSeq: TableRef[int,SpermViNodes], cAlt: int16, cRef: int16, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int = + if not scSpermSeq.hasKey(ithSperm): + scSpermSeq[ithSperm] = SpermViNodes() var currentViNode = ViNode() currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] @@ -26,7 +27,6 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i currentViNode.pos = rec_pos currentViNode.cAlt = cAlt currentViNode.cRef = cRef - scSpermSeq[ithSperm].viNodeseq.add(currentViNode) scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex @@ -65,7 +65,7 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i return 0 proc addViNode*(barcodeTable: TableRef, alleleCountTable: Table[string,allele_expr], - scSpermSeq: var SeqSpermViNodes, + scSpermSeq: TableRef[int,SpermViNodes], outFileTotalCountMtx: var FileStream, outFileAltCountMtx: var FileStream, nnsize: var int, @@ -80,13 +80,13 @@ proc addViNode*(barcodeTable: TableRef, for bc, ac in alleleCountTable.pairs: # ac.tostring(acs) ## mindp, maxdp, they are values per cell - if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue + if (ac.cref+ac.calt) < mindp or (ac.cref+ac.calt) > maxdp: continue var ithSperm = barcodeTable[bc] ## write to mtx Ref count outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) nnsize += 1 var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) - discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb) + discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = ac.calt, cRef = ac.cref, ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb) return 0 # The size line diff --git a/src/sgcocaller/inferMissingSnps.nim b/src/sgcocaller/inferMissingSnps.nim index 676978ce2bbbcfd1462a30839b0a68f6137315ec..95bf50698435e4d0b6a3330a9e5b1265d1839641 100755 --- a/src/sgcocaller/inferMissingSnps.nim +++ b/src/sgcocaller/inferMissingSnps.nim @@ -37,11 +37,11 @@ proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh: float) let prob_1_p = 1 - prob_0_p if max(prob_0_p,prob_1_p) >= posterior_thresh: # echo "posterior_prob: " & $(max(prob_0_p,prob_1_p)) - if prob_0_p > prob_1_p: -# echo "returned inferred geno: gREF" + if prob_0_p > prob_1_p: + # "returned inferred geno: gREF" return gREF else: -# echo "returned inferred geno: gALT" + #"returned inferred geno: gALT" return gALT return gUnknown # return full sequence of template geno by inferring missing SNPs's genotypes @@ -75,7 +75,7 @@ proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], po offset += 1 totalIter += 1 if coexisPos.len < int(nAnchorSnps/2): - if debug: echo "not enough coexisting SNPs for jth cell: " & $j +# if debug: echo "not enough coexisting SNPs for jth cell: " & $j continue snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), temp_geno = map(coexisPos,proc(x:int): BinaryGeno = templateGeno[x]) ) diff --git a/src/sgcocaller/utils.nim b/src/sgcocaller/utils.nim index 01370af097ed3a68c0fa66b81d510d2183e5999c..9935127f27f732f768cfb897c54d3fd84225b7f9 100755 --- a/src/sgcocaller/utils.nim +++ b/src/sgcocaller/utils.nim @@ -9,16 +9,16 @@ import sequtils type allele_expr* = ref object - cref*: int - calt*: int + cref*: int16 + calt*: int16 ViState* = enum stateRef, stateAlt, stateN BinaryGeno* = enum gUnknown, gREF, gALT ViNode* = object pos*: int - cRef*: int - cAlt*: int + cRef*: int16 + cAlt*: int16 pathScore*: array[stateRef..stateAlt, float] pathState*: array[stateRef..stateAlt, ViState] state*: ViState @@ -37,7 +37,7 @@ type MtxRow* = tuple rowIndex: int colIndex: int - value: int + value: int16 type Mtx* = ref object header*: string @@ -70,10 +70,10 @@ proc get_base_offset*(position:int, align: Record): int = # get the base base_offset = qoff - over-1 break - return base_offset + return base_offset -proc toBinaryGeno*(x: int, format = "12") : BinaryGeno = +proc toBinaryGeno*(x: int8, format = "12") : BinaryGeno = if(format == "01"): if x == 1: return gALT if x == 0: return gREF @@ -88,15 +88,18 @@ proc toBinaryGeno*(x: int, format = "12") : BinaryGeno = proc readGtMtxToSeq*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]], by_cell = false):int = var currentEntrySeq:seq[int] var currentLine:seq[string] - - while not mtxFileStream.atEnd(): - currentLine = mtxFileStream.readLine().splitWhitespace() - ## i j 1-based from file - currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) - if by_cell: - gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = currentEntrySeq[2].toBinaryGeno - else: - gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno + if by_cell: + while not mtxFileStream.atEnd(): + currentLine = mtxFileStream.readLine().splitWhitespace() + ## i j 1-based from file + currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) + gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = int8(currentEntrySeq[2]).toBinaryGeno + else: + while not mtxFileStream.atEnd(): + currentLine = mtxFileStream.readLine().splitWhitespace() + ## i j 1-based from file + currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) + gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = int8(currentEntrySeq[2]).toBinaryGeno return 0 proc add_allele*(g:GtNode, alleleBinGeno:int):string = g.alleles & $alleleBinGeno @@ -105,7 +108,7 @@ proc inc_count_cref*(a:allele_expr) = inc(a.cref) proc inc_count_calt*(a:allele_expr) = inc(a.calt) proc getEmission*(thetaRef=0.1,thetaAlt=0.9, - cRef:int,cAlt:int): array[stateRef..stateAlt, float] = + cRef:int16,cAlt:int16): array[stateRef..stateAlt, float] = var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] @@ -182,7 +185,7 @@ proc readMtx*(mtx_file:string): Mtx = current_line = mtxFileStream.readLine() sparseMtx.lines.add((rowIndex: parseInt(current_line.splitWhitespace()[0]), colIndex: parseInt(current_line.splitWhitespace()[1]), - value: parseInt(current_line.splitWhitespace()[2]) )) + value: int16(parseInt(current_line.splitWhitespace()[2])) )) mtxFileStream.close() return sparseMtx