From ce69d55ad6f49114146fff304715be5dad6eb037 Mon Sep 17 00:00:00 2001 From: rlyu <rlyu@svi.edu.au> Date: Mon, 29 Nov 2021 10:03:08 +1100 Subject: [PATCH] add supplying template cell option --- plotDiagnosticHap.R | 5 +- private/correctPhase.nim | 297 +++++++++++++++ private/find_template_cell.nim | 27 +- private/findpath.nim | 43 ++- private/genotype_mtx.nim | 272 +++++++------- private/graph.nim | 96 ++--- private/infer_missing_snps.nim | 188 +++++----- private/infer_snp_hap.nim | 5 - private/sgcocaller_sxo.nim | 147 ++++++++ private/sgphase.nim | 41 ++- private/write_out_vcf.nim | 101 ++++++ sgcocaller.nim | 641 +++++++++++++++------------------ 12 files changed, 1210 insertions(+), 653 deletions(-) create mode 100755 private/correctPhase.nim delete mode 100755 private/infer_snp_hap.nim create mode 100755 private/sgcocaller_sxo.nim create mode 100755 private/write_out_vcf.nim diff --git a/plotDiagnosticHap.R b/plotDiagnosticHap.R index 7b79021..3956174 100644 --- a/plotDiagnosticHap.R +++ b/plotDiagnosticHap.R @@ -43,12 +43,11 @@ plot_df_match <- apply(plot_df,2,function(x){ data.frame(cbind(plot_df_match,snpAnnot_df)) %>% tidyr::pivot_longer(cols = colnames(plot_df)) %>% - dplyr::filter(!is.na(value),POS > 1.3e8, POS < 1.35e8) %>% + dplyr::filter(!is.na(value)) %>% ggplot()+ geom_point(mapping = aes(x = POS , y = name, color = value))+ scale_color_manual(values = c("FALSE"='red', "TRUE" = "blue"))+ - theme_bw(base_size = 18)+geom_vline(mapping = aes(xintercept = c(switch_pos[2])), - color = "red",size = 1.2) + theme_bw(base_size = 18) ggsave(pngfile, dpi = 100, width = 14, height = 6) diff --git a/private/correctPhase.nim b/private/correctPhase.nim new file mode 100755 index 0000000..6c9ab35 --- /dev/null +++ b/private/correctPhase.nim @@ -0,0 +1,297 @@ +## calcualte switch score for snps positions that are high risk +import utils +import sequtils +import math +import streams +# bins of SNP indexes +import strutils + +let binSize = 2000 +let movingStep = 200 +let dissimThresh = 0.0099 +let lookBeyondSnps = 25 +let debug = false +let switchPrior = 0.5 +type + switchScoreTuple = tuple + switch_scores: seq[float] + switch_scores_snpIndex: seq[int] + +# reduce the search space +# return seq of snp indexes +# low risk bins are bins that do not have crossover happening in majority of the cells. + +## rough crossover estimates: return binary results, 0 means no crossovers, 1 means there is. +## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin + +## cell_geno snp by cell +proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = + let ncells = cell_geno[0].len + var ncompared = newSeq[int](ncells) + var nmatch = newSeq[int](ncells) + for snpi,g in templ_geno: + for cellj in 0..(ncells-1): + if g != gUnknown and cell_geno[snpi][cellj] != gUnknown: + ncompared[cellj].inc + if g == cell_geno[snpi][cellj]: nmatch[cellj].inc + if debug: echo "ncompared " & $ncompared + let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float = nmatch[x]/ncompared[x]) + if debug: echo "dissim " & $dissim + var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh))) + if debug: echo "ncrossover " & $ncrossover + let nxcells = foldl(ncrossover, a + b) + if nxcells < int(floor(0.5 * float(ncells))): + if debug: echo "bin had no crossovers : " & $nxcells + return false + else: + if debug: echo "bin had many crossovers : " & $nxcells + return true + +proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = + #let ncells = gtMtxByCell[0].len + let nSnps = gtMtxByCell.len + let nBins = (int)(ceil(nSnps / (binSize - movingStep))) + if debug: echo "nBins: " & $nBins + let binIds = toSeq(0..(nBins-1)) + var snpPosStart,snpPosEnd: int + var highRiskSnps: seq[int] + for binI in binIds: + if debug: echo "binI " & $binI + snpPosStart = (binI)*(binSize - movingStep) + if (snpPosStart + binSize) > (nSnps-1): + snpPosEnd = (nSnps-1) + else: + snpPosEnd = snpPosStart + binSize + if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )): + highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd)) + + + # add a last bin as from end of chrom with length binSize + let lastBinStart = nSnps - binSize + let lastBinEnd = nSnps - 1 + if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd])): + highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd)) + + highRiskSnps = deduplicate(highRiskSnps) + return highRiskSnps + +proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] = + var fullGeno = newSeq[BinaryGeno](nSnps) + var i = 0 + var currentEntrySeq: seq[string] + while not phasedSnpAnnotFileStream.atEnd(): + currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace() + fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01") + i.inc + if i != nSnps : + quit "phased snpAnnot file does not have the same number of rows with gtMtx" + return fullGeno + +proc calculateProbslog10(hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno], error_rate = 0.1) : float = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + let prob = 0.5*(error_rate^nmis*(1-error_rate)^nmatch + error_rate^nmatch*(1-error_rate)^nmis) + return(log10(prob)) + +proc countNmatch( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + return nmatch + +proc countNMis( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + return nmis + +proc binarySwitch(x: BinaryGeno):BinaryGeno = + if(x == gUnknown): gUnknown + elif(x == gREF):gALT + else: gREF + +proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = + map(hap, binarySwitch) + +proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = + map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex]) + +proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno]): switchScoreTuple = + var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int + let nSnps = gtMtxByCell.len + let ncells = gtMtxByCell[0].len + var hswitch,cell_hap,cell_full_hap,templ_geno,templ_geno_left,templ_geno_right: seq[BinaryGeno] + var prob_switch, prob_nonsw: float + var leftIndex,rightIndex, switch_score_snpIndex: seq[int] + var swscore = newSeq[float](nSnps) + for rsnpi in riskySnps: + prob_switch = 0.0 + prob_nonsw = 0.0 + if fullGeno[rsnpi] == gUnknown: continue + for celli in 0..(ncells-1): + # if celli == 2: continue + leftIndex = newSeq[int]() + rightIndex = newSeq[int]() + offset = 1 + cell_full_hap = getIthCellHap(gtMtxByCell, celli) + if(cell_full_hap[rsnpi] != gUnknown and fullGeno[rsnpi] != gUnknown ): + rightIndex.add(rsnpi) + ## need to find the nearest coexisting N snps (N = lookBeyondSnps*2) + offset = 1 + while(leftIndex.len < lookBeyondSnps and ((rsnpi-offset) > 0)): + if((cell_full_hap[rsnpi-offset] != gUnknown) and (fullGeno[rsnpi-offset] != gUnknown)): + leftIndex.add(rsnpi-offset) + offset.inc + offset = 1 + while(rightIndex.len < lookBeyondSnps and ((rsnpi+offset) < nSnps)): + if((cell_full_hap[rsnpi+offset] != gUnknown) and (fullGeno[rsnpi+offset] != gUnknown)): + rightIndex.add(rsnpi+offset) + offset.inc + cell_hap = concat(map(concat(leftIndex,rightIndex), proc(x:int):BinaryGeno = cell_full_hap[x])) + templ_geno_left = map(leftIndex, proc(x:int):BinaryGeno = fullGeno[x]) + templ_geno_right = map(rightIndex, proc(x:int):BinaryGeno = fullGeno[x]) + templ_geno = concat(templ_geno_left,templ_geno_right) + hswitch = concat(templ_geno_left,switchHap(templ_geno_right)) + # if rsnpi == 16933: + # echo "cell " & $celli + # echo "cell_hap " & $cell_hap + # echo "templ_geno " & $templ_geno + # echo "hswitch " & $hswitch + # echo "prev prob nosw " & $prob_nonsw + # echo "prev prob_switch " & $prob_switch + # echo "nmatch temp_geno " & $(countNmatch(hap = templ_geno, cell_hap = cell_hap)) + # echo "nmatch hswitch " & $(countNmatch(hap = hswitch, cell_hap = cell_hap)) + # echo "nmis temp_geno " & $(countNMis(hap = templ_geno, cell_hap = cell_hap)) + # echo "nmis hswitch " & $(countNMis(hap = hswitch, cell_hap = cell_hap)) + prob_nonsw = prob_nonsw + calculateProbslog10(hap = templ_geno, cell_hap = cell_hap) + prob_switch = prob_switch + calculateProbslog10(hap = hswitch, cell_hap = cell_hap) + + #if(prob_switch > prob_nonsw): echo "found positive switch score" + swscore[rsnpi] = log10(switchPrior) + prob_switch - log10(1-switchPrior) - prob_nonsw + switch_score_snpIndex.add(rsnpi) + return (swscore,switch_score_snpIndex) + +## return SNP indexs to switch +## switchScore, a dense list of switch scores +proc findSwitchSites(switchScoreT: switchScoreTuple): seq[int] = + var sitesToSwitch = newSeq[int]() + var positiveScores: seq[float] + var positiveScoresIndex: seq[int] + + var inPositiveBlock = false + for site, score in switchScoreT.switch_scores: + if switchScoreT.switch_scores_snpIndex.find(site)>=0: + if score <= 0.0 : + if inPositiveBlock: + inPositiveBlock = false + if positiveScoresIndex.len > int(floor(lookBeyondSnps/3)): + sitesToSwitch.add(positiveScoresIndex[maxIndex(positiveScores)]) + continue + elif not inPositiveBlock: + positiveScores = @[score] + positiveScoresIndex = @[site] + inPositiveBlock = true + continue + else: + positiveScores.add(score) + positiveScoresIndex.add(site) + return sitesToSwitch + +proc switchPhaseString(oriString:string): string = + if oriString == "-1": return "-1" + if oriString == "0": return "1" + if oriString == "1": return "0" + +proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, phaseSnpAnnotFile:string ): int = + var switchedPhasedAnnotFileFS,phaseSnpAnnotFileFS: FileStream + var switch = 0 + var currentEntry:seq[string] + var snpIndex = 0 + var writtenSnps = 0 + try: + switchedPhasedAnnotFileFS = openFileStream(switchedPhasedAnnotFile, fmWrite) + phaseSnpAnnotFileFS = openFileStream(phaseSnpAnnotFile, fmRead) + except: + stderr.write getCurrentExceptionMsg() + switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine()) + if siteToSwitch.len == 0: + while not phaseSnpAnnotFileFS.atEnd(): + switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine()) + switchedPhasedAnnotFileFS.close() + phaseSnpAnnotFileFS.close() + return 0 + while not phaseSnpAnnotFileFS.atEnd(): + currentEntry = phaseSnpAnnotFileFS.readLine.splitWhitespace() + if siteToSwitch.find(snpIndex) != -1: + switch = switch xor 1 + if switch == 0: + switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t")) + else: + currentEntry[3] = switchPhaseString(currentEntry[3]) + switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t")) + snpIndex.inc + switchedPhasedAnnotFileFS.close() + phaseSnpAnnotFileFS.close() + return 0 + +proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string): int = + var gtMtxByCell: seq[seq[BinaryGeno]] + var currentEntrySeq: seq[string] + var currentEntry:seq[int] + var nSnps,ncells:int + var gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream:FileStream + ## sort entries in mtx files + try: + gtMtxFileStream = openFileStream(gtMtxFile, fmRead) + phaseSnpAnnotFileStream = openFileStream(phaseSnpAnnotFile, fmRead) + switchScoreFileStream = openFileStream(switchScoreFile, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + echo "reading gtMtx to by cell" + discard gtMtxFileStream.readLine() + discard phaseSnpAnnotFileStream.readLine() + #N, i,j + currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() + currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) + nSnps = currentEntry[0] + echo "nSnps " & $nSnps + ncells = currentEntry[1] + echo "ncells " & $ncells + + echo "totalEntries " & $ currentEntry[2] + ## gtMtx is cell by Snp format + gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) + discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true) + var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps ) + var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell) + echo "riskySnps.len " & $riskySnps.len + var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno) + let switchSites = findSwitchSites(switchScoresT) + switchScoreFileStream.writeLine("#" & $switchSites) + for snpi,score in switchScoresT.switch_scores: + if switchScoresT.switch_scores_snpIndex.find(snpi)>=0: + switchScoreFileStream.writeLine($score) + else: + switchScoreFileStream.writeLine("NA") + for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]: + fs.close() + discard writeSwitchedPhase(switchSites,switchedPhasedAnnotFile,phaseSnpAnnotFile) + +# for chrom in @["chr4"]: +# let gtMtxFile = "data/WC_CNV_42/sgcocaller/gtMtx/" & chrom & "_gtMtx.mtx" +# let phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_phased_snpAnnot.txt" +# let switchedPhasedAnnotFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_corrected_phased_snpAnnot.txt" +# let switchScoreFile = "data/WC_CNV_42/sgcocaller/phase/" & chrom & "_switch_score.txt" +# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile) \ No newline at end of file diff --git a/private/find_template_cell.nim b/private/find_template_cell.nim index ed743a6..67d91e3 100755 --- a/private/find_template_cell.nim +++ b/private/find_template_cell.nim @@ -31,7 +31,7 @@ proc countCoexistMatch(seq1: seq[BinaryGeno], seq2: seq[BinaryGeno]): seq[int] = nmatch += 1 return [cell1Snps,cell2Snps,nmatch,ncoexist].toSeq -proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[int]): seq[CellPairs] = +proc findCellPairs(gtMtx:seq[seq[BinaryGeno]], nPairs = 3, nSnpsPercell: var seq[int]): seq[CellPairs] = let ncells = gtMtx.len var genoMatch = @[0,0,0,0] var returnCp = newSeq[CellPairs](nPairs) @@ -49,13 +49,15 @@ proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[ if k == nPairs: break returnCp +proc compareNSnps(x, y: (int,int)): int = + cmp(x[0], y[0]) -proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.75): int = +proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.85): int = # return the selected cells' index # not too many SNPs or too few - var seqNsnps = nSnpsPercell - sort(seqNsnps, system.cmp[int]) - int(floor(float(seqNsnps.len) * q)) + var indexed = zip(nSnpsPercell, toSeq(0..(nSnpsPercell.len-1))) + indexed.sort(compareNSnps) + return indexed[int(floor(float(nSnpsPercell.len) * q))][1] proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = var nSnpsCells = newSeq[int](gtMtx.len) @@ -66,12 +68,15 @@ proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = if cellPairs[0].coexistSnps == 0 : # no good pairs found selectedCell = findCellBynSNPs(nSnpsPercell = nSnpsCells) + echo "no good cell pairs found, template cell is selected by nSNPs " + echo "selected cell:" & $selectedCell + return selectedCell else: let coExistSnps = map(cellPairs, proc(x: CellPairs) : int = x.coexistSnps) icp = maxIndex(coExistSnps) - if nSnpsCells[cellPairs[icp].cell1] > nSnpsCells[cellPairs[icp].cell2]: - echo "selected cell: " & $cellPairs[icp].cell1 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell1] - return cellPairs[icp].cell1 - else: - echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2] - return cellPairs[icp].cell2 + if nSnpsCells[cellPairs[icp].cell1] > nSnpsCells[cellPairs[icp].cell2]: + echo "selected cell: " & $cellPairs[icp].cell1 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell1] + return cellPairs[icp].cell1 + else: + echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2] + return cellPairs[icp].cell2 diff --git a/private/findpath.nim b/private/findpath.nim index 902ffe5..4b5ba6e 100755 --- a/private/findpath.nim +++ b/private/findpath.nim @@ -1,12 +1,13 @@ ## implements the traceback function import utils -from graph import SpermViNodes +import graph import streams import tables import math import strutils -proc pathTrackBack*(currentSperm: var SpermViNodes, + +proc pathTrackBackIthSperm(currentSperm: var SpermViNodes, ithSperm: int, thetaRef: float, thetaAlt: float, @@ -102,3 +103,41 @@ proc pathTrackBack*(currentSperm: var SpermViNodes, viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") ) return 0 +proc pathTrackBack*(scSpermSeq: var SeqSpermViNodes, + thetaRef: float, + thetaAlt: float, + cmPmb: float, + outFileVStateMtx: var FileStream, + viSegmentInfo: var FileStream): int = + + var posEnd: int64 + var inferProb,reverseProb = 0.0 + var currentEm: array[stateRef..stateAlt, float] + var lastNode: ViNode + var spermVNseq: SpermViNodes + var ithSNP: int + + for ithSperm in 0..(scSpermSeq.len-1): + ## rightGap,leftGap = [0.0,0.0] + spermVNseq = scSpermSeq[ithSperm] + if spermVNseq.viNodeseq.len==0: continue + lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] + currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt) + if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: + scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef + ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") + inferProb = currentEm[stateRef] + reverseProb = currentEm[stateAlt] + else: + scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt + ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") + inferProb = currentEm[stateAlt] + reverseProb = currentEm[stateRef] + posEnd = lastNode.pos + ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the relevant File Streams. + discard pathTrackBackIthSperm(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm, thetaRef = thetaRef, + thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx, + viSegmentInfo = viSegmentInfo, posEnd = posEnd, inferProb = inferProb,reverseProb = reverseProb) + diff --git a/private/genotype_mtx.nim b/private/genotype_mtx.nim index 9c11d79..68ace6b 100755 --- a/private/genotype_mtx.nim +++ b/private/genotype_mtx.nim @@ -1,136 +1,136 @@ -## generate gtMtx, gtMtx_snpAnnot, totalCount.mtx altCount.mtx - -## author Ruqian Lyu -## Date: 2021 Oct -## gtMtx in 1, or 2 -import tables -import strutils -import hts -import utils -import streams -import findGtNodes - -## these outputs will be per chromosome - - -proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, - outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int, - minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.8):int = - var variantIndex = 1 - var writtenVarintIndex = 1 - var calt,cellDP,geno,cellIndex,nnsize = 0 - var cellGtNodes,cellGtNodeCopy: Table[string, GtNode] - - ## write headers to those mtx files and the first line place holder for total_row total_column total_entry - let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" - - for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: - outFileStream.writeLine(sparseMatrixHeader) - outFileStream.writeLine(' '.repeat(50)) - outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t")) - for rec in ivcf.query("*"): - var rec_alt = rec.ALT[0][0] - var rec_ref = rec.REF[0] - cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, - minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, - minbsq=minbsq,barcodeTag=barcodeTag) - cellGtNodeCopy = cellGtNodes - for cellbarcode in cellGtNodeCopy.keys: - cellDP = cellGtNodes[cellbarcode].alleles.len - if cellDP < minCellDp: - cellGtNodes.del(cellbarcode) - continue - calt = cellGtNodes[cellbarcode].alleles.count("1") - if calt/cellDP < p0: - cellGtNodes[cellbarcode].genotype = gREF - elif calt/cellDP > p1: - cellGtNodes[cellbarcode].genotype = gALT - else: - cellGtNodes.del(cellbarcode) - continue - if cellGtNodes.len < minSnpDepth: - variantIndex.inc - continue - else: - for cellbarcode in cellGtNodes.keys: - cellDP = cellGtNodes[cellbarcode].alleles.len - calt = cellGtNodes[cellbarcode].alleles.count("1") - ## $cellGtNodes[cellbarcode].genotype BinaryGeno to "0", "1" or "-1" (not -1), write to file coding in 1 or 2 - geno = parseInt($cellGtNodes[cellbarcode].genotype) + 1 - # index from 1 for matrices - cellIndex = barcodeTable[cellbarcode] + 1 - outGtMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $geno],sep = " ")) - outTotalCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $cellDP],sep = " ")) - outAltCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $calt],sep = " ")) - nnsize.inc - outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) - writtenVarintIndex.inc - ## write to matrices - - echo "processed " & $(variantIndex) & " variants" - echo "wrote " & $(writtenVarintIndex-1) & " variants to matrices and snpAnnot file" - for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: - outFileStream.setPosition(49) - outFileStream.write($(writtenVarintIndex-1) & " " & $barcodeTable.len & " " & $nnsize) - for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]: - outFileStream.close() - return 0 - -# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" -# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" -# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" -# var -# ibam:Bam -# ivcf:VCF -# outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream -# if not open(ibam, bam_file, threads=1, index = true): -# quit "couldn't open input bam" -# if not open(ivcf, vcf_file, threads=1): -# quit "couldn't open: vcf file" - -# var hf = hts.hts_open(cstring(barcode_file), "r") -# #### TODO : Table size -# var barcodeTable = newTable[string,int](initialSize = 1024) -# var kstr: hts.kstring_t -# kstr.l = 0 -# kstr.m = 0 -# kstr.s = nil -# var ithSperm = 0 -# ## initiate the table with CB as keys, allele counts (ref object) as elements -# while hts_getline(hf, cint(10), addr kstr) > 0: -# if $kstr.s[0] == "#": -# continue -# var v = $kstr.s -# discard barcodeTable.hasKeyOrPut(v, ithSperm) -# ithSperm.inc -# discard hf.hts_close() - -# let s_Chrs = @["chr1"] -# var chrom = "chr1" -# let out_prefix = "test_data/getMtx/" -# echo "generating gtMtx" -# try: -# outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite) -# outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite) -# outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite) -# outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite) -# except: -# stderr.write getCurrentExceptionMsg() - - -# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, -# outSnpAnnot = outSnpAnnot, maxTotalReads = 30, -# minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) - - -# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string - -# ## sort entries in mtx files -# for chrom in s_Chrs: -# outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" -# outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" -# outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" -# for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: -# var imtx = readMtx(mtx_file = mtxFile) -# discard sortWriteMtx(imtx, mtx_file = mtxFile) - +## generate gtMtx, gtMtx_snpAnnot, totalCount.mtx altCount.mtx + +## author Ruqian Lyu +## Date: 2021 Oct +## gtMtx in 1, or 2 +import tables +import strutils +import hts +import utils +import streams +import findGtNodes + +## these outputs will be per chromosome + + +proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, + outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int, + minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.7,chrom: string):int = + var variantIndex = 1 + var writtenVarintIndex = 1 + var calt,cellDP,geno,cellIndex,nnsize = 0 + var cellGtNodes,cellGtNodeCopy: Table[string, GtNode] + + ## write headers to those mtx files and the first line place holder for total_row total_column total_entry + let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" + + for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: + outFileStream.writeLine(sparseMatrixHeader) + outFileStream.writeLine(' '.repeat(50)) + outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t")) + for rec in ivcf.query(chrom): + var rec_alt = rec.ALT[0][0] + var rec_ref = rec.REF[0] + cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, + minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, + minbsq=minbsq,barcodeTag=barcodeTag) + cellGtNodeCopy = cellGtNodes + for cellbarcode in cellGtNodeCopy.keys: + cellDP = cellGtNodes[cellbarcode].alleles.len + if cellDP < minCellDp: + cellGtNodes.del(cellbarcode) + continue + calt = cellGtNodes[cellbarcode].alleles.count("1") + if calt/cellDP < p0: + cellGtNodes[cellbarcode].genotype = gREF + elif calt/cellDP > p1: + cellGtNodes[cellbarcode].genotype = gALT + else: + cellGtNodes.del(cellbarcode) + continue + if cellGtNodes.len < minSnpDepth: + variantIndex.inc + continue + else: + for cellbarcode in cellGtNodes.keys: + cellDP = cellGtNodes[cellbarcode].alleles.len + calt = cellGtNodes[cellbarcode].alleles.count("1") + ## $cellGtNodes[cellbarcode].genotype BinaryGeno to "0", "1" or "-1" (not -1), write to file coding in 1 or 2 + geno = parseInt($cellGtNodes[cellbarcode].genotype) + 1 + # index from 1 for matrices + cellIndex = barcodeTable[cellbarcode] + 1 + outGtMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $geno],sep = " ")) + outTotalCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $cellDP],sep = " ")) + outAltCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $calt],sep = " ")) + nnsize.inc + outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) + writtenVarintIndex.inc + ## write to matrices + + echo "processed " & $(variantIndex) & " variants" + echo "wrote " & $(writtenVarintIndex-1) & " variants to matrices and snpAnnot file" + for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: + outFileStream.setPosition(49) + outFileStream.write($(writtenVarintIndex-1) & " " & $barcodeTable.len & " " & $nnsize) + for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]: + outFileStream.close() + return 0 + +# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" +# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" +# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" +# var +# ibam:Bam +# ivcf:VCF +# outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream +# if not open(ibam, bam_file, threads=1, index = true): +# quit "couldn't open input bam" +# if not open(ivcf, vcf_file, threads=1): +# quit "couldn't open: vcf file" + +# var hf = hts.hts_open(cstring(barcode_file), "r") +# #### TODO : Table size +# var barcodeTable = newTable[string,int](initialSize = 1024) +# var kstr: hts.kstring_t +# kstr.l = 0 +# kstr.m = 0 +# kstr.s = nil +# var ithSperm = 0 +# ## initiate the table with CB as keys, allele counts (ref object) as elements +# while hts_getline(hf, cint(10), addr kstr) > 0: +# if $kstr.s[0] == "#": +# continue +# var v = $kstr.s +# discard barcodeTable.hasKeyOrPut(v, ithSperm) +# ithSperm.inc +# discard hf.hts_close() + +# let s_Chrs = @["chr1"] +# var chrom = "chr1" +# let out_prefix = "test_data/getMtx/" +# echo "generating gtMtx" +# try: +# outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite) +# outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite) +# outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite) +# outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite) +# except: +# stderr.write getCurrentExceptionMsg() + + +# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, +# outSnpAnnot = outSnpAnnot, maxTotalReads = 30, +# minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) + + +# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string + +# ## sort entries in mtx files +# for chrom in s_Chrs: +# outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" +# outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" +# outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" +# for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: +# var imtx = readMtx(mtx_file = mtxFile) +# discard sortWriteMtx(imtx, mtx_file = mtxFile) + diff --git a/private/graph.nim b/private/graph.nim index fe02980..215e1e6 100755 --- a/private/graph.nim +++ b/private/graph.nim @@ -15,7 +15,54 @@ type spermSnpIndexLookUp*: Table[int,int] SeqSpermViNodes* = seq[SpermViNodes] - +proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int = + if scSpermSeq[ithSperm].viNodeseq.len==0: + var currentViNode = ViNode() + currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] + currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] + currentViNode.pathState[stateRef] = stateN + currentViNode.pathState[stateAlt] = stateN + currentViNode.state = stateN + currentViNode.pos = rec_pos + currentViNode.cAlt = cAlt + currentViNode.cRef = cRef + + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + else: + let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] + var ltransProb = math.ln(getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) + var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) + var currentViNode = ViNode() + # ref/alt -> ref + var refTref = preVNode.pathScore[stateRef] + lnoTransProb + var altTref = preVNode.pathScore[stateAlt] + ltransProb + if refTref > altTref: + currentViNode.pathScore[stateRef] = refTref + currentViNode.pathState[stateRef] = stateRef + else: + currentViNode.pathScore[stateRef] = altTref + currentViNode.pathState[stateRef] = stateAlt + # ref/alt -> alt + var refTalt = preVNode.pathScore[stateRef] + ltransProb + var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb + + if refTalt > altTalt: + currentViNode.pathScore[stateAlt] = refTalt + currentViNode.pathState[stateAlt] = stateRef + else: + currentViNode.pathScore[stateAlt] = altTalt + currentViNode.pathState[stateAlt] = stateAlt + currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] + currentViNode.pathScore[stateRef] += emissionArray[stateRef] + currentViNode.cAlt = cAlt + currentViNode.cRef = cRef + currentViNode.pos = rec_pos + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + return 0 proc addViNode*(barcodeTable: TableRef, alleleCountTable: Table[string,allele_expr], scSpermSeq: var SeqSpermViNodes, @@ -40,51 +87,6 @@ proc addViNode*(barcodeTable: TableRef, outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) nnsize += 1 var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) - if scSpermSeq[ithSperm].viNodeseq.len==0: - var currentViNode = ViNode() - currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] - currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] - currentViNode.pathState[stateRef] = stateN - currentViNode.pathState[stateAlt] = stateN - currentViNode.state = stateN - currentViNode.pos = int(rec_pos) - currentViNode.cAlt = int(ac.cAlt) - currentViNode.cRef = int(ac.cRef) - - scSpermSeq[ithSperm].viNodeseq.add(currentViNode) - scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len - scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex - else: - let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] - var ltransProb = math.ln(getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) - var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) - var currentViNode = ViNode() - # ref/alt -> ref - var refTref = preVNode.pathScore[stateRef] + lnoTransProb - var altTref = preVNode.pathScore[stateAlt] + ltransProb - if refTref > altTref: - currentViNode.pathScore[stateRef] = refTref - currentViNode.pathState[stateRef] = stateRef - else: - currentViNode.pathScore[stateRef] = altTref - currentViNode.pathState[stateRef] = stateAlt - # ref/alt -> alt - var refTalt = preVNode.pathScore[stateRef] + ltransProb - var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb - - if refTalt > altTalt: - currentViNode.pathScore[stateAlt] = refTalt - currentViNode.pathState[stateAlt] = stateRef - else: - currentViNode.pathScore[stateAlt] = altTalt - currentViNode.pathState[stateAlt] = stateAlt - currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] - currentViNode.pathScore[stateRef] += emissionArray[stateRef] - currentViNode.cAlt = int(ac.cAlt) - currentViNode.cRef = int(ac.cRef) - currentViNode.pos = int(rec_pos) - scSpermSeq[ithSperm].viNodeseq.add(currentViNode) - scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len - scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb) return 0 # The size line diff --git a/private/infer_missing_snps.nim b/private/infer_missing_snps.nim index b3874f7..2d958ec 100755 --- a/private/infer_missing_snps.nim +++ b/private/infer_missing_snps.nim @@ -1,89 +1,99 @@ -## infer missing snp in template cell by looking at the snp's link to other nearby snps in other cells - -## author Ruqian Lyu -## rlyu@svi.edu.au -import sequtils -import math -import utils - -let nAnchorSnps = 10 - -proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = - map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex]) - -# cell_geno seq of length nAnchorSnps -# temp_geno seq of length nAnchorSnps -# return whether cell_geno matches with temp_geno in hap0 or hap1(bitwise complementary to temp_geno) -proc matchTemplateHap(cell_geno:seq[BinaryGeno], temp_geno:seq[BinaryGeno],error_rate = 0.1): seq[float] = - var nmatch = 0 - for i,g in cell_geno: - if g == temp_geno[i]: - nmatch += 1 - let mis = cell_geno.len - nmatch - let ll_h0 = error_rate^mis*(1-error_rate)^(nmatch) - let ll_h1 = error_rate^nmatch*(1-error_rate)^(mis) - if ll_h0 > ll_h1: - return @[0.0, ll_h0/(ll_h0 + ll_h1)] - elif ll_h0 == ll_h1: - return @[-1.0, 0.5] - else: - return @[1.0, ll_h1/(ll_h0 + ll_h1)] - -proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.98): BinaryGeno = - let prob_0 = error_rate^type10*(1-error_rate)^(type00) - let prob_1 = error_rate^type00*(1-error_rate)^(type10) - let prob_0_p = prob_0/(prob_0 + prob_1) - let prob_1_p = 1 - prob_0_p - if max(prob_0_p,prob_1_p) > posterior_thresh: - if prob_0_p > prob_1_p: - return gREF - else: - return gALT - return gUnknown -# return full sequence of template geno by inferring missing SNPs's genotypes -proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99):seq[BinaryGeno] = - var fullGeno = templateGeno - let nSnps = gtMtx[0].len - var coexisPos:seq[int] - var snpLD: seq[float] - var type00, type10: int - - var snpiInCells = newSeq[BinaryGeno](gtMtx.len) - var offset = 1 - for i,missingSnpi in templateGeno: - if missingSnpi != gUnknown: continue - snpiInCells = sliceColumn(gtMtx,i) - type00 = 0 - type10 = 0 - for j,snpiGeno in snpiInCells: - if snpiGeno == gUnknown: continue - # find 10 coexisting positions for cell j with template geno seq - offset = 1 - coexisPos = newSeq[int]() - while(coexisPos.len < nAnchorSnps): - if ((i+offset) < nSnps): - if((gtMtx[j][i+offset] != gUnknown) and (fullGeno[i+offset] != gUnknown)): - coexisPos.add(i+offset) - if (i-offset) >= 0: - if((gtMtx[j][i-offset] != gUnknown) and (fullGeno[i-offset] != gUnknown)): - coexisPos.add(i-offset) - offset += 1 - snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), - temp_geno = map(coexisPos,proc(x:int): BinaryGeno = fullGeno[x]) ) - if snpLD[1] > posterior_thresh: - if snpLD[0] == 1.0: - if snpiGeno == gREF: ## snpiGeno is 1 or 2 - type10 += 1 - else: - type00 += 1 - else: - if snpiGeno == gREF: - type00 += 1 - else: - type10 += 1 - else: continue - # echo "type00 " & $type00 - # echo "type10 " & $type10 - fullGeno[i] = inferGeno(type00, type10) - return fullGeno - +## infer missing snp in template cell by looking at the snp's link to other nearby snps in other cells + +## author Ruqian Lyu +## rlyu@svi.edu.au +import sequtils +import math +import utils + +let nAnchorSnps = 10 +let maxExpand = 1000 +proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = + map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex]) + +# cell_geno seq of length nAnchorSnps +# temp_geno seq of length nAnchorSnps +# return whether cell_geno matches with temp_geno in hap0 or hap1(bitwise complementary to temp_geno) +proc matchTemplateHap(cell_geno:seq[BinaryGeno], temp_geno:seq[BinaryGeno],error_rate = 0.1): seq[float] = + var nmatch = 0 + for i,g in cell_geno: + if g == temp_geno[i]: + nmatch += 1 + let mis = cell_geno.len - nmatch + let ll_h0 = error_rate^mis*(1-error_rate)^(nmatch) + let ll_h1 = error_rate^nmatch*(1-error_rate)^(mis) + if ll_h0 > ll_h1: + return @[0.0, ll_h0/(ll_h0 + ll_h1)] + elif ll_h0 == ll_h1: + return @[-1.0, 0.5] + else: + return @[1.0, ll_h1/(ll_h0 + ll_h1)] + +proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.99): BinaryGeno = + let prob_0 = error_rate^type10*(1-error_rate)^(type00) + let prob_1 = error_rate^type00*(1-error_rate)^(type10) + let prob_0_p = prob_0/(prob_0 + prob_1) + let prob_1_p = 1 - prob_0_p + if max(prob_0_p,prob_1_p) > posterior_thresh: + echo "posterior_prob: " & $(max(prob_0_p,prob_1_p)) + if prob_0_p > prob_1_p: + echo "returned inferred geno: gREF" + return gREF + else: + echo "returned inferred geno: gALT" + return gALT + return gUnknown +# return full sequence of template geno by inferring missing SNPs's genotypes +proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99, runCorrection = false):seq[BinaryGeno] = + var fullGeno = templateGeno + let nSnps = gtMtx[0].len + var coexisPos:seq[int] + var snpLD: seq[float] + var type00, type10: int + var snpiInCells = newSeq[BinaryGeno](gtMtx.len) + var offset,totalIter = 1 + for i,missingSnpi in templateGeno: + if (not runCorrection) and (missingSnpi != gUnknown): continue + #echo "inferring ith SNP: " & $i + snpiInCells = sliceColumn(gtMtx,i) + type00 = 0 + type10 = 0 + for j,snpiGeno in snpiInCells: + if snpiGeno == gUnknown: continue + # find 10 coexisting positions for cell j with template geno seq + offset = 1 + totalIter = 1 + coexisPos = newSeq[int]() + while(coexisPos.len < nAnchorSnps and totalIter < maxExpand): + if ((i+offset) < nSnps): + if((gtMtx[j][i+offset] != gUnknown) and (templateGeno[i+offset] != gUnknown)): + coexisPos.add(i+offset) + if (i-offset) >= 0: + if((gtMtx[j][i-offset] != gUnknown) and (templateGeno[i-offset] != gUnknown)): + coexisPos.add(i-offset) + offset += 1 + totalIter += 1 + if coexisPos.len < int(nAnchorSnps/2): continue + snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), + temp_geno = map(coexisPos,proc(x:int): BinaryGeno = templateGeno[x]) ) + if i == 1628: + echo "ithSNP: " & $i & "jth cell: " & $j + echo "snpLD results: " & $snpLD + + if snpLD[1] > 0.999: + if snpLD[0] == 1.0: ## match to template H1 + if snpiGeno == gREF: ## snpiGeno is 1 or 2 + type10 += 1 + else: + type00 += 1 + else: + if snpiGeno == gREF: + type00 += 1 + else: + type10 += 1 + else: continue + echo "ithSNP: " & $i & " type00: " & $type00 & ",type10: " & $type10 + # echo + fullGeno[i] = inferGeno(type00, type10) + return fullGeno + diff --git a/private/infer_snp_hap.nim b/private/infer_snp_hap.nim deleted file mode 100755 index 0ef2896..0000000 --- a/private/infer_snp_hap.nim +++ /dev/null @@ -1,5 +0,0 @@ -# infer SNP hap, give the current hap - -var hapVCFFile = "" -var inputVCFFile: -var outVCFFile: \ No newline at end of file diff --git a/private/sgcocaller_sxo.nim b/private/sgcocaller_sxo.nim new file mode 100755 index 0000000..91721ed --- /dev/null +++ b/private/sgcocaller_sxo.nim @@ -0,0 +1,147 @@ +## sgcocaller sxo, using pre-generated mtx files for finding crossovers using a HMM model +from findpath import pathTrackBack +from graph import addViNodeIthSperm, SeqSpermViNodes +import tables +import hts +import utils +import sequtils +import streams +# bins of SNP indexes +import strutils +import os + +proc readCountMtxToSeq*(mtxFileStream:FileStream, countMtx:var seq[seq[int]], by_cell = false):int = + var currentLineSeq:seq[int] + var currentLine:seq[string] + while not mtxFileStream.atEnd(): + currentLine = mtxFileStream.readLine().splitWhitespace() + ## i j 1-based from file + currentLineSeq = map(currentLine, proc(x: string): int = parseInt(x)) + if by_cell: + countMtx[(currentLineSeq[0]-1)][(currentLineSeq[1]-1)] = currentLineSeq[2] + else: + countMtx[(currentLineSeq[1]-1)][(currentLineSeq[0]-1)] = currentLineSeq[2] + return 0 +proc addViNodeSXO(barcodeTable: TableRef, alleleCountTable: Table[string,allele_expr], scSpermSeq: var SeqSpermViNodes, + snpIndex: int, + thetaRef: float, + thetaAlt: float, + rec_pos: int, + nnsize : var int, + initProb: array[stateRef..stateAlt, float], + cmPmb: float, + outaltCountMtxFS:FileStream, outtotalCountMtxFS:FileStream): int = + for bc, ac in alleleCountTable.pairs: + var ithSperm = barcodeTable[bc] + nnsize.inc() + outaltCountMtxFS.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.calt) + outtotalCountMtxFS.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.calt+ac.cref)) + var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cref,cAlt=ac.cAlt) + discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray = emissionArray, snpIndex = snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb) + return 0 +## barcodeTable, cell barcode:cell index +proc sgcocallerSXO*(barcodeTable:TableRef, phase_dir:string, out_dir:string, thetaREF:float, thetaALT:float, cmPmb:float, s_Chrs:seq[string], initProb: array[stateRef..stateAlt, float], phasedSnpAnnotFileName:string): int = + var ncells = barcodeTable.len + var nsnps, ithSperm:int + var currentEntrySeq: seq[string] + var totalCountMtxByCell: seq[seq[int]] + var altCountMtxByCell: seq[seq[int]] + var alleleCountTable: Table[string,allele_expr] + ## iterate through each selected chromosome + for chrom in s_Chrs: + + var nnsize = 0 + ## number of non zeros + var scSpermSeq:SeqSpermViNodes + ## matches with the order in barcodeTable + scSpermSeq.setLen(barcodeTable.len) + var phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo:FileStream + let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" + # if fileExists(phase_dir & chrom & "_corrected_phased_snpAnnot.txt"): + # phasedSnpannoFS = openFileStream(phase_dir & chrom & "_corrected_phased_snpAnnot.txt", fmRead) + # elif fileExists(phase_dir & chrom & "_phased_snpAnnot.txt"): + # phasedSnpannoFS = openFileStream(phase_dir & chrom & "_phased_snpAnnot.txt", fmRead) + phasedSnpannoFS = openFileStream(phasedSnpAnnotFileName, fmRead) + try: + #_totalCount + totalCountMtxFS = openFileStream(phase_dir & chrom & "_totalMtx.mtx", fmRead) + altCountMtxFS = openFileStream(phase_dir & chrom & "_altMtx.mtx", fmRead) + outaltCountMtxFS = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite) + outtotalCountMtxFS = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) + outSnpAnnotFS = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) + outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite) + viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite) + except: + stderr.write getCurrentExceptionMsg() + ## write headers to those mtx files and the first line place holder for total_row total_column total_entry + for fs in [outFileVStateMtx,outaltCountMtxFS,outtotalCountMtxFS]: + fs.writeLine(sparseMatrixHeader) + fs.writeLine(' '.repeat(50)) + outSnpAnnotFS.writeLine(join(["POS","REF", "ALT"], sep = "\t")) + discard totalCountMtxFS.readLine() + discard altCountMtxFS.readLine() + discard altCountMtxFS.readLine() + #N, i,j + currentEntrySeq = totalCountMtxFS.readLine().splitWhitespace() + nsnps = parseInt(currentEntrySeq[0]) + ## gtMtx is cell by Snp format + totalCountMtxByCell = newSeqWith(nsnps,newSeq[int](ncells)) + altCountMtxByCell = newSeqWith(nsnps,newSeq[int](ncells)) + discard readCountMtxToSeq(mtxFileStream = totalCountMtxFS, countMtx = totalCountMtxByCell, by_cell = true) + discard readCountMtxToSeq(mtxFileStream = altCountMtxFS, countMtx = altCountMtxByCell, by_cell = true) + discard phasedSnpannoFS.readLine() + var isnpIndex = -1 + var osnpIndex = 0 + while not phasedSnpannoFS.atEnd(): + currentEntrySeq = phasedSnpannoFS.readLine().splitWhitespace() + isnpIndex.inc() + if currentEntrySeq[3] == "-1": + continue + else: + alleleCountTable = initTable[string,allele_expr]() + for bc in barcodeTable.keys(): + ithSperm = barcodeTable[bc] + if totalCountMtxByCell[isnpIndex][ithSperm] == 0: + ## not adding this vi node to the spermSeq + continue + if currentEntrySeq[3] == "1": + alleleCountTable[bc] = allele_expr(cref:(altCountMtxByCell[isnpIndex][ithSperm]), calt: (totalCountMtxByCell[isnpIndex][ithSperm] - altCountMtxByCell[isnpIndex][ithSperm])) + else: + alleleCountTable[bc] = allele_expr(cref:(totalCountMtxByCell[isnpIndex][ithSperm] - altCountMtxByCell[isnpIndex][ithSperm]), calt: (altCountMtxByCell[isnpIndex][ithSperm])) + if alleleCountTable.len == 0: + continue + osnpIndex.inc() + if currentEntrySeq[3] == "1": + outSnpAnnotFS.writeLine(join([currentEntrySeq[0], currentEntrySeq[2], currentEntrySeq[1]], sep="\t") ) + else: + outSnpAnnotFS.writeLine(join([currentEntrySeq[0], currentEntrySeq[1], currentEntrySeq[2]], sep="\t") ) + discard addViNodeSXO(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq, + thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = osnpIndex, rec_pos = parseInt(currentEntrySeq[0]), nnsize = nnsize, + initProb = initProb, cmPmb = cmPmb, outaltCountMtxFS = outaltCountMtxFS, outtotalCountMtxFS = outtotalCountMtxFS) + + discard pathTrackBack(scSpermSeq = scSpermSeq, thetaRef = thetaRef,thetaAlt=thetaAlt,cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx, + viSegmentInfo = viSegmentInfo) + for fs in [outFileVStateMtx,outaltCountMtxFS,outtotalCountMtxFS]: + fs.setPosition(49) + fs.write($osnpIndex & " " & $barcodeTable.len & " " & $nnsize) + for outFileStream in [phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo]: + outFileStream.close() + return 0 + +# let barcodeFile = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" +# var barcodeTable = newTable[string,int](initialSize = 1024) +# var hf = hts.hts_open(cstring(barcodeFile), "r") +# var kstr: hts.kstring_t +# kstr.l = 0 +# kstr.m = 0 +# kstr.s = nil +# var ithSperm = 0 +# ## initiate the table with CB as keys, gamete index as elements +# while hts_getline(hf, cint(10), addr kstr) > 0: +# if $kstr.s[0] == "#": continue +# var v = $kstr.s +# discard barcodeTable.hasKeyOrPut(v, ithSperm) +# ithSperm.inc +# discard hf.hts_close() +# var s_Chrs = @["chr6"] +# discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = "data/WC_CNV_42/sgcocaller/phaseOneStep/", out_dir = "data/WC_CNV_42/sgcocaller/sxo/",thetaREF = 0.1, thetaALT = 0.9, cmPmb = 0.001,s_Chrs =s_Chrs,initProb = [0.5,0.5]) \ No newline at end of file diff --git a/private/sgphase.nim b/private/sgphase.nim index 6c669d6..602771c 100755 --- a/private/sgphase.nim +++ b/private/sgphase.nim @@ -7,7 +7,7 @@ import streams import strutils import sequtils -# phaseOutdir/phased_snpAnnot.txt +# phaseOutdir/chrom_phased_snpAnnot.txt # phaseOutdir/cellGenoVersusTemplate.txt # phaseOutdir/cellGenoVersusTemplate.png by executing the R code # let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_gtMtx.mtx" @@ -30,14 +30,14 @@ proc writePhasedSnpAnnot*(fullGeno:seq[BinaryGeno], snpAnnotFileStream:FileStrea quit "snpAnnot.txt does not have the same number of rows with gtMtx" return 0 -proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = +proc sgphase*(mtxFile: string, snpAnnotFile:string, phasedSnpAnnotFile:string, diagnosticDataframeFile:string, templateCell:int): int = var nsnps,ncells,totalEntries:int var gtMtx:seq[seq[BinaryGeno]] var currentEntry:seq[int] var currentEntrySeq:seq[string] var gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream:FileStream - let phasedSnpAnnotFile = phaseOutdir & "phased_snpAnnot.txt" - let diagnosticDataframeFile = phaseOutdir & "cellGenoVersusTemplate.txt" + #let phasedSnpAnnotFile = phaseOutdir & "phased_snpAnnot.txt" + #let diagnosticDataframeFile = phaseOutdir & "cellGenoVersusTemplate.txt" ## i, j are 1-based, entries with value 1 or 2 try: gtMtxFileStream = openFileStream(mtxFile, fmRead) @@ -60,11 +60,22 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = ## gtMtx is cell by Snp format gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) discard readGtMtxToSeq(gtMtxFileStream,gtMtx) - var tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3) + var tempcell: int + if templateCell != -1: + tempcell = templateCell + else: + tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3) echo "template cell is " & $tempcell var tempcellGeno = gtMtx[tempcell] - let fullGeno = inferSnpGeno(tempcellGeno, gtMtx) + var fullGeno = inferSnpGeno(tempcellGeno, gtMtx) + echo "inferSnoGeno done" + # echo "second pass" + # fullGeno = inferSnpGeno(fullGeno, gtMtx) + # echo "second pass done" # generate the txt for diagnositc plot + echo "run correction" + fullGeno = inferSnpGeno(fullGeno, gtMtx,posterior_thresh = 0.99, runCorrection = true) + echo "run correction done" var selectedCells:seq[int] if ncells < 10: selectedCells = (0..(ncells-1)).toSeq @@ -83,7 +94,6 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = for cellg in sliceColumn(subsetGtMtx, k): writeOut = writeOut & " " & $(parseInt($cellg)+1) ddframFileStream.writeLine(writeOut) - discard writePhasedSnpAnnot(fullGeno, snpAnnotFileStream, phasedSnpAnnotFileStream) for fs in [gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream]: fs.close() @@ -92,3 +102,20 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = +# var s_Chrs = @["CUR6G"] + +# for chrom in s_Chrs: +# var mtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_gtMtx.mtx" +# var phasedSnpFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_phased_snpAnnot.txt" +# var snpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_snpAnnot.txt" +# var ddframe = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_cellGenoVersusTemplate.txt" + +# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe) + +# for chrom in s_Chrs: +# var mtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx" +# var phasedSnpFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt" +# var snpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_snpAnnot.txt" +# var ddframe = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_cellGenoVersusTemplate.txt" + +# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe) \ No newline at end of file diff --git a/private/write_out_vcf.nim b/private/write_out_vcf.nim new file mode 100755 index 0000000..8de8b53 --- /dev/null +++ b/private/write_out_vcf.nim @@ -0,0 +1,101 @@ +## write output VCF file +import hts +import streams +import sequtils +import strutils +import correctPhase + +let debug = true + +proc readNextPhased(phasedAnnotFS:var FileStream): seq[string] = + var phaseRec = newSeq[string](4) + var lineRec: string + while not phasedAnnotFS.atEnd(): + lineRec = phasedAnnotFS.readLine() + phaseRec = lineRec.splitWhitespace() + if debug and phaseRec.len != 4: + quit "len not 4 at " & lineRec & $phasedAnnotFS.atEnd() + if phaseRec[3] == "-1": continue + else: break + return phaseRec + +proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string, threads:int, add_header_string = """##sgcocaller_v0.1=phase""", chrom = "nil"):int = + + if debug: echo "ivcffile " & ivcfFile + if debug: echo "ovcfFile " & ovcfFile + if debug: echo "phasedAnnotFile " & phasedAnnotFile + + var ivcf: VCF + var ovcf: VCF + var v_off = 0 + var current_block = 0 + var phasedAnnotFS: FileStream + var phaseRec = newSeq[string](4) + var phasePos: int + var phasePhase: int + if not open(ivcf, ivcfFile, threads=threads): + quit "couldn't open input vcf file" + if not open(ovcf, ovcfFile, threads=threads, mode ="w"): + quit "couldn't open output vcf file" + try: + phasedAnnotFS = openFileStream(phasedAnnotFile, fmRead) + except: + stderr.write getCurrentExceptionMsg() + ovcf.copy_header(ivcf.header) + discard ovcf.header.add_string(add_header_string) + discard ovcf.write_header() + discard phasedAnnotFS.readLine() + var gt_string:seq[int32] + var snpIndexInPhasedAnnotFile = 0 + phaseRec = readNextPhased(phasedAnnotFS) + if debug: echo phasedAnnotFS.atEnd() + phasePos = parseInt(phaseRec[0]) + phasePhase = parseInt(phaseRec[3]) + var queryChrom = chrom + if queryChrom == "nil": + queryChrom = "*" + for v in ivcf.query(queryChrom): + if v.POS.cint != phasePos: continue + else: + var f = v.format() + if phasePhase == 0: + ## 0|1 + #if debug: echo "block hap0 and this h0 is 0 0|1" + gt_string = @[int32(2),int32(5)] + elif phasePhase == 1: + ## 1|0 + #if debug: echo "block hap0 and this h0 is 1 1|0" + gt_string = @[int32(4),int32(3)] + else: + if phasedAnnotFS.atEnd(): break + if f.set("GT",gt_string) != Status.OK: + quit "set GT failed" + if not ovcf.write_variant(v) : + quit "write vcf failed for " & $voff + if phasedAnnotFS.atEnd(): break + phaseRec = readNextPhased(phasedAnnotFS) + if debug: + if phaseRec.len != 4: + echo " phaseRec.len " & $phaseRec.len + echo " phaseRec" & $phaseRec + phasePos = parseInt(phaseRec[0]) + phasePhase = parseInt(phaseRec[3]) + phasedAnnotFS.close() + ovcf.close() + ivcf.close() + + return 0 + +# let s_Chrs = map(toSeq(1..19), proc(x:int):string = "chr" & $x) +# for chrom in @["chr8"]: +# var ivcf = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT." & chrom & ".vcf.gz" +# var ovcf = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz" +# var swphasedSnpFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.txt" +# var gtMtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx" +# var phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt" +# var switchScoreFile ="data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_switch_score.txt" +# var switchedPhasedAnnotVcfFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz" +# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,swphasedSnpFile,switchScoreFile) +# discard writePhaseToVCF(ivcf, ovcf, swphasedSnpFile,1) + + diff --git a/sgcocaller.nim b/sgcocaller.nim index 542c5ba..7969087 100755 --- a/sgcocaller.nim +++ b/sgcocaller.nim @@ -1,354 +1,289 @@ - -## two modules (generate pair-wise fragments, and then run viterbi) - - -import os -import docopt -import strutils -import hts -import tables -import sequtils -import private/utils -import math -import streams -import private/graph -import private/findpath -import private/genotype_mtx -import private/sgphase - -# proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTotalReads:int, -# minTotalReads:int, mapq: int,minbsq:int,mindp:int,barcodeTag:string, minsnpdepth:int):int = -# var variantIndex = 1 -# var cellFragmentsTable = newTable[string,Fragment]() - -# echo "generated fragments from " & $barcodeTable.len & " single cells" -# var cellGtNodes: Table[string, GtNode] -# for rec in ivcf.query("*"): -# cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, -# minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, -# minbsq=minbsq,barcodeTag=barcodeTag) -# cellGtNodes = callNodeGenotype(cellGtNodes,minCellDp = mindp, minSnpDepth = minsnpdepth, p0 = 0.3, p1 = 0.8) -# cellFragmentsTable = updateCellFragment(barcodedNodesTable=cellGtNodes,cellFragmentsTable = cellFragmentsTable, fmf_out = outfmf) -# variantIndex = variantIndex + 1 -# echo "processed " & $(variantIndex) & " variants" -# return 0 - -# proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfile:string, threads:int):int = -# var cellGenoTable: TableRef[string,TableRef[string,int]] -# var cellBlockLeftPhaseTable,cellBlockRightPhaseTable :TableRef[string, seq[int]] -# var chr_blocks:seq[Block] -# var blockkseq:seq[Block_linkage] -# var phased_blocks: seq[int] -# echo "Phasing for hapfile " & block_hapfile -# cellGenoTable = newTable[string,TableRef[string,int]]() -# discard readFmf(fmf_file,cellGenoTable) -# chr_blocks = read_blocks(block_hapfile) -# cellBlockLeftPhaseTable = newTable[string, seq[int]]() -# cellBlockRightPhaseTable = newTable[string, seq[int]]() -# ## blocks at the ends with 20 SNPs or fewer, removed? -# ## blocks not at the ends with 20SNPs or fewer are not used for linking blocks but if their linkage with pre block and post-block is consistent, -# ## these short blocks' haplotypes are retained. -# ## -# ## What if blocks cannot be linked ? the number of 11(00) == 10(01) -# echo "Total blocks " & $chr_blocks.len - -# var contig_blocks:seq[int] -# for b_j, bl in chr_blocks: -# echo "block " & $b_j & " len: " & $bl.h0.len -# # echo "last 5" & $bl.h0[(high(bl.h0)-5)..high(bl.h0)] -# # echo "first 5" & $bl.h0[0..5] -# discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, true, cellBlockLeftPhaseTable) -# discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, false, cellBlockRightPhaseTable) -# contig_blocks = find_contig_blocks(chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) -# # echo "blocks for contigs" & $contig_blocks -# blockkseq = build_contig(contig_blocks = contig_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) -# for bk in blockkseq: -# echo "prevBlock" & $bk.prevBlock & "nextBlock" & $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 & " 01 type " & $bk.linkageType_01 & " switch posterior prob " & $bk.switch_posterior_prob & " switch value :" & $bk.switch -# phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - -# discard write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads) -# return 0 -proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, - ibam:Bam, out_dir:string, mapq:int, - minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, - thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string,phased:bool): int = - - var bulkBam = false - if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"): - echo "running in bulk mode and all reads are regarded as from one sample/cell" - bulkBam = true - else: - echo "running sgcoaller for " & $barcodeTable.len & " single cells" - - let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] - ## iterate through each selected chromosome - for chrom in s_Chrs: - var snpIndex, nnsize = 0 - ## number of non zeros - var scSpermSeq:SeqSpermViNodes - ## matches with the order in barcodeTable - scSpermSeq.setLen(barcodeTable.len) - var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream - - try: - outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) - outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) - outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite) - outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite) - viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite) - except: - stderr.write getCurrentExceptionMsg() - - ## write headers to those mtx files and the first line place holder for total_row total_column total_entry - let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" - - for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]: - outFileStream.writeLine(sparseMatrixHeader) - outFileStream.writeLine(' '.repeat(50)) - - outFileSNPanno.writeLine(join(["POS","REF", "ALT"], sep = "\t")) - var ac = new_seq[int32](10) - for rec in ivcf.query(chrom): - if rec.ALT.len > 1 or rec.ALT.len == 0 : continue - ## alleleCountTable contains for this rec.POS, each cell barcode's allele counts - var rec_alt = rec.ALT[0][0] - var rec_ref = rec.REF[0] - if phased: - var f = rec.format() - var gts = f.genotypes(ac) - if ac[0] == 4 and ac[1] == 3: - # gt is 1|0 - rec_alt = rec.REF[0] - rec_ref = rec.ALT[0][0] - # if not gt == 0|1 continue - elif not (ac[0] == 2 and ac[1] == 5): continue - var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, - maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag, - startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt = rec_alt, rec_ref = rec_ref) - if alleleCountTable.len==0: continue - - ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations - snpIndex += 1 - outFileSNPanno.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) - discard addViNode(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq, - outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx = outFileAltCountMtx, nnsize = nnsize, - mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec_pos=int(rec.POS), - initProb = initProb, cmPmb = cmPmb) - var posEnd: int64 - var inferProb,reverseProb = 0.0 - var currentEm: array[stateRef..stateAlt, float] - var lastNode: ViNode - var spermVNseq: SpermViNodes - var ithSNP: int - - for ithSperm in 0..(scSpermSeq.len-1): - ## rightGap,leftGap = [0.0,0.0] - spermVNseq = scSpermSeq[ithSperm] - if spermVNseq.viNodeseq.len==0: continue - lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] - currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt) - if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: - scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef - ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] - outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") - inferProb = currentEm[stateRef] - reverseProb = currentEm[stateAlt] - else: - scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt - ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] - outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") - inferProb = currentEm[stateAlt] - reverseProb = currentEm[stateRef] - posEnd = lastNode.pos - ## call pathTrackBack will write the most probably hidden state seq and viterbi segment info to the relevant File Streams. - discard pathTrackBack(currentSperm = scSpermSeq[ithSperm], ithSperm = ithSperm, thetaRef = thetaRef, - thetaAlt = thetaAlt, cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx, - viSegmentInfo = viSegmentInfo, posEnd = posEnd, inferProb = inferProb,reverseProb = reverseProb) - for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]: - outFileStream.setPosition(49) - outFileStream.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) - for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,outFileSNPanno,viSegmentInfo]: - outFileStream.close() - return 0 - -when(isMainModule): - let version = "0.3.3" - var doc = format(""" - $version - - Usage: - sgcocaller gtMtx [options] <BAM> <VCF> <barcodeFile> <out_prefix> - sgcocaller phase [options] <gtMtxFile> <phaseOutputDir> - sgcocaller swphase [options] <gtMtxFile> <phaseSnpAnnotFile> - sgcocaller sxo [options] <phaseOutputDir> <out_prefix> - sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> - - - -Arguments: - - <BAM> the read alignment file with records of single-cell DNA reads - - <VCF> the variant call file with records of SNPs - - <barcodeFile> the text file containing the list of cell barcodes - - <out_prefix> the prefix of output files - - <fmf> the fragment file from running sgcocaller fmf - - <out_vcf> the output vcf aftering phasing blocks in hapfile - - -Options: - -t --threads <threads> number of BAM decompression threads [default: 4] - --barcodeTag <barcodeTag> the cell barcode tag in BAM [default: CB] - --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] - --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] - --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) - --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] - --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] - --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] - --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] - --minSNPdepth <minSNPdepth> the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 1] - --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] - --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] - --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1] - --phased the VCF contains the phased GT of heterozygous - -h --help show help - - - Examples - ./sgcocaller gtMtx --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/cellfragment.fmf - ./sgcocaller phase gtMtxFile phaseOutputDir - ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp - ./sgcocaller sxo phaseOutputDir barcodeFile.tsv ./percell/ccsnp - -""" % ["version", version]) - - let args = docopt(doc, version=version) - - var - threads:int - selectedChrs:string - out_dir:string - mapq:int - minbsq:int - mintotal:int - maxtotal:int - mindp:int - maxdp:int - thetaREF:float - thetaALT:float - cmPmb:float - barcodeTag="CB" - minsnpdepth:int - barcodeFile,bamfile,vcff:string - vcfGtPhased = false - echo $args - threads = parse_int($args["--threads"]) - barcodeTag = $args["--barcodeTag"] - mindp = parse_int($args["--minDP"]) - maxdp = parse_int($args["--maxDP"]) - maxtotal = parse_int($args["--maxTotalDP"]) - mintotal = parse_int($args["--minTotalDP"]) - minsnpdepth = parse_int($args["--minSNPdepth"]) - mapq = parse_int($args["--minMAPQ"]) - minbsq = parse_int($args["--baseq"]) - thetaRef = parse_float($args["--thetaREF"]) - thetaAlt = parse_float($args["--thetaALT"]) - cmPmb = parse_float($args["--cmPmb"]) - vcfGtPhased = parse_bool($args["--phased"]) - - if args["gtMtx"] or args["xo"]: - var - ibam:Bam - ivcf:VCF - s_Chrs: seq[string] - bamfile = $args["<BAM>"] - barcodeFile = $args["<barcodeFile>"] - out_dir = $args["<out_prefix>"] - if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_" - vcff = $args["<VCF>"] - if not open(ibam, bamfile, threads=threads, index = true): - quit "couldn't open input bam" - if not open(ivcf, vcff, threads=threads): - quit "couldn't open: vcf file" - if($args["--chrom"] != "nil"): - selectedChrs = $args["--chrom"] - s_Chrs = selectedChrs.split(',') - else: - ## find all chroms from headers of vcf file (Contigs) - let contigs = ivcf.contigs - s_Chrs = map(contigs, proc(x: Contig): string = x.name) - var hf = hts.hts_open(cstring(barcodeFile), "r") - var barcodeTable = newTable[string,int](initialSize = 1024) - var kstr: hts.kstring_t - kstr.l = 0 - kstr.m = 0 - kstr.s = nil - var ithSperm = 0 - ## initiate the table with CB as keys, allele counts (ref object) as elements - while hts_getline(hf, cint(10), addr kstr) > 0: - if $kstr.s[0] == "#": continue - var v = $kstr.s - discard barcodeTable.hasKeyOrPut(v, ithSperm) - ithSperm.inc - discard hf.hts_close() - if args["gtMtx"]: - var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string - var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream - for chrom in s_Chrs: - echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom - outGtMtxFile = out_dir & chrom & "_gtMtx.mtx" - outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx" - outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx" - try: - outGtMtx = openFileStream(outGtMtxFile, fmWrite) - outSnpAnnot = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) - outTotalCountMtx = openFileStream(outTotalCountMtxFile, fmWrite) - outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite) - except: - stderr.write getCurrentExceptionMsg() - - discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, - outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, - outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, - minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp, - barcodeTag = barcodeTag, minsnpdepth=minsnpdepth) - for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) - elif args["xo"]: - echo "running crossover calling from VCF and BAM file\n" - echo "running for these chromosomes as specified in the header of the provided VCF file for chroms: " & $s_Chrs - discard sgcocaller(threads, ivcf, barcodeTable, ibam, - out_dir, mapq, minbsq, mintotal, - maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) - var outFileTotalCountMtxFile, outFileAltCountMtxFile: string - ## sort entries in mtx files - for chrom in s_Chrs: - outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" - outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" - for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) - ibam.close() - ivcf.close() - elif args["phase"]: - let gtMtxFile = $args["<gtMtxFile>"] - var phaseOutDir = $args["<phaseOutputDir>"] - if (phaseOutdir[^1] != '_') and (phaseOutdir[^1] != '/'): phaseOutdir = phaseOutdir & "_" - let snpAnnotFile = gtMtxFile.replace(sub = "_gtMtx.mtx", by = "_snpAnnot.txt") - echo "running phasing from single cell genotype matrix (one chromosome) " & gtMtxFile - echo "using the " & snpAnnotFile & "for generating phased haplotypes" - discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir) - elif args["swphase"]: - let gtMtxFile = $args["<gtMtxFile>"] - let phaseSnpAnnotFile = $args["<phaseSnpAnnotFile>"] - echo "finding snps positions to switch from single cell genotype matrix (one chromosome) " & gtMtxFile - echo "using the " & phaseSnpAnnotFile & "for generating final phased haplotypes" - discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir) - - + +import os +import docopt +import strutils +import hts +import tables +import sequtils +import private/utils +import math +import streams +import private/graph +import private/findpath +import private/genotype_mtx +import private/sgphase +import private/write_out_vcf +import private/correctPhase +import private/sgcocaller_sxo + + +let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] + +proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, + ibam:Bam, out_dir:string, mapq:int, + minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, + thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string,phased:bool): int = + + var bulkBam = false + if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"): + echo "running in bulk mode and all reads are regarded as from one sample/cell" + bulkBam = true + else: + echo "running sgcoaller for " & $barcodeTable.len & " single cells" + + ## iterate through each selected chromosome + for chrom in s_Chrs: + var snpIndex, nnsize = 0 + ## number of non zeros + var scSpermSeq:SeqSpermViNodes + ## matches with the order in barcodeTable + scSpermSeq.setLen(barcodeTable.len) + var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream + let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" + try: + outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) + outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) + outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite) + outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite) + viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite) + except: + stderr.write getCurrentExceptionMsg() + ## write headers to those mtx files and the first line place holder for total_row total_column total_entry + for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]: + outFileStream.writeLine(sparseMatrixHeader) + outFileStream.writeLine(' '.repeat(50)) + outFileSNPanno.writeLine(join(["POS","REF", "ALT"], sep = "\t")) + var ac = new_seq[int32](10) + for rec in ivcf.query(chrom): + if rec.ALT.len > 1 or rec.ALT.len == 0 : continue + ## alleleCountTable contains for this rec.POS, each cell barcode's allele counts + var rec_alt = rec.ALT[0][0] + var rec_ref = rec.REF[0] + if phased: + var f = rec.format() + var gts = f.genotypes(ac) + if ac[0] == 4 and ac[1] == 3: + # gt is 1|0 + rec_alt = rec.REF[0] + rec_ref = rec.ALT[0][0] + # if not gt == 0|1 continue + elif not (ac[0] == 2 and ac[1] == 5): continue + var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, + maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag, + startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt = rec_alt, rec_ref = rec_ref) + if alleleCountTable.len==0: continue + + ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations + snpIndex += 1 + outFileSNPanno.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) + discard addViNode(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq, + outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx = outFileAltCountMtx, nnsize = nnsize, + mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec_pos=int(rec.POS), + initProb = initProb, cmPmb = cmPmb) + discard pathTrackBack(scSpermSeq = scSpermSeq, thetaRef = thetaRef,thetaAlt=thetaAlt,cmPmb = cmPmb,outFileVStateMtx = outFileVStateMtx, + viSegmentInfo = viSegmentInfo ) + + for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx]: + outFileStream.setPosition(49) + outFileStream.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) + for outFileStream in [outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,outFileSNPanno,viSegmentInfo]: + outFileStream.close() + return 0 + +when(isMainModule): + let version = "0.3.3" + var doc = format(""" + $version + Usage: + sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix> + sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix> + sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix> + sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> + + + +Arguments: + + <BAM> the read alignment file with records of single-cell DNA reads + + <VCF> the variant call file with records of SNPs + + <barcodeFile> the text file containing the list of cell barcodes + + <out_prefix> the prefix of output files + + <fmf> the fragment file from running sgcocaller fmf + + <out_vcf> the output vcf aftering phasing blocks in hapfile + + +Options: + -t --threads <threads> number of BAM decompression threads [default: 4] + --barcodeTag <barcodeTag> the cell barcode tag in BAM [default: CB] + --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] + --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] + --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) + --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] + --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] + --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] + --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] + --minSNPdepth <minSNPdepth> the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 1] + --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] + --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] + --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1] + --phased the input VCF for calling crossovers contains the phased GT of heterozygous SNPs + --outvcf generate the output in vcf format (phase) + --templateCell <templateCell> the cell's genotype to be used a template cell, as the cell's index (0-starting) in the barcode file, default as not supplied [default: -1] + -h --help show help + + + Examples + ./sgcocaller phase gtMtxFile phaseOutputPrefix + ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp + ./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp + +""" % ["version", version]) + + let args = docopt(doc, version=version) + + var + threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth:int + thetaREF,thetaALT,cmPmb:float + barcodeTag="CB" + out_dir,selectedChrs,barcodeFile,bamfile,vcff:string + vcfGtPhased = false + s_Chrs: seq[string] + outFileTotalCountMtxFile, outFileAltCountMtxFile: string + + echo $args + threads = parse_int($args["--threads"]) + barcodeTag = $args["--barcodeTag"] + mindp = parse_int($args["--minDP"]) + maxdp = parse_int($args["--maxDP"]) + maxtotal = parse_int($args["--maxTotalDP"]) + mintotal = parse_int($args["--minTotalDP"]) + minsnpdepth = parse_int($args["--minSNPdepth"]) + mapq = parse_int($args["--minMAPQ"]) + minbsq = parse_int($args["--baseq"]) + thetaRef = parse_float($args["--thetaREF"]) + thetaAlt = parse_float($args["--thetaALT"]) + cmPmb = parse_float($args["--cmPmb"]) + vcfGtPhased = parse_bool($args["--phased"]) + out_dir = $args["<out_prefix>"] + if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_" + + if args["phase"] or args["xo"] or args["sxo"]: + barcodeFile = $args["<barcodeFile>"] + var hf = hts.hts_open(cstring(barcodeFile), "r") + var barcodeTable = newTable[string,int](initialSize = 1024) + var kstr: hts.kstring_t + kstr.l = 0 + kstr.m = 0 + kstr.s = nil + var ithSperm = 0 + ## initiate the table with CB as keys, gamete index as elements + while hts_getline(hf, cint(10), addr kstr) > 0: + if $kstr.s[0] == "#": continue + var v = $kstr.s + discard barcodeTable.hasKeyOrPut(v, ithSperm) + ithSperm.inc + discard hf.hts_close() + if args["phase"] or args["xo"]: + var + ibam:Bam + ivcf:VCF + bamfile = $args["<BAM>"] + vcff = $args["<VCF>"] + if($args["--chrom"] != "nil"): + selectedChrs = $args["--chrom"] + s_Chrs = selectedChrs.split(',') + else: + ## find all chroms from headers of vcf file (Contigs) + let contigs = ivcf.contigs + s_Chrs = map(contigs, proc(x: Contig): string = x.name) + if not open(ibam, bamfile, threads=threads, index = true): + quit "couldn't open input bam" + if not open(ivcf, vcff, threads=threads): + quit "couldn't open: vcf file" + if args["phase"]: + var templateCell = parse_int($args["--templateCell"]) + var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string + var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream + for chrom in s_Chrs: + echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom + outGtMtxFile = out_dir & chrom & "_gtMtx.mtx" + outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx" + outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx" + outSnpAnnotFile = out_dir & chrom & "_snpAnnot.txt" + outphasedSnpAnnotFile = out_dir & chrom & "_phased_snpAnnot.txt" + outdiagnosticDataframeFile = out_dir & chrom & "_cellGenoVersusTemplate.txt" + try: + outGtMtx = openFileStream(outGtMtxFile, fmWrite) + outSnpAnnot = openFileStream(outSnpAnnotFile, fmWrite) + outTotalCountMtx = openFileStream(outTotalCountMtxFile, fmWrite) + outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, + outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, + outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, + minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp, + barcodeTag = barcodeTag, minsnpdepth=minsnpdepth, chrom = chrom) + for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + echo "running phasing from single cell genotype matrix (one chromosome) " & outGtMtxFile + echo "using the " & outSnpAnnotFile & "for generating phased haplotypes" + discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile, diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell) + if parse_bool($args["--outvcf"]): + var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz" + discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads) + elif args["xo"]: + echo "running crossover calling from VCF and BAM file\n" + echo "running for these chromosomes: " & $s_Chrs + discard sgcocaller(threads, ivcf, barcodeTable, ibam, + out_dir, mapq, minbsq, mintotal, + maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) + ## sort entries in mtx files + for chrom in s_Chrs: + outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" + outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" + for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + ibam.close() + ivcf.close() + elif args["sxo"]: + if($args["--chrom"] != "nil"): + selectedChrs = $args["--chrom"] + s_Chrs = selectedChrs.split(',') + else: + quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes using comma as separator" + echo "running crossover calling from genotype and allele count matrices generate from sgcocaller phase/swphase " + echo "running for these chromosomes : " & $s_Chrs + let phase_dir = $args["<phaseOutputPrefix>"] + let phasedSnpAnnotFile = $args["<SNPPhaseFile>"] + discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, phasedSnpAnnotFileName = phasedSnpAnnotFile) + for chrom in s_Chrs: + outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" + outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" + for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + elif args["swphase"]: + echo "find switch spots and generate corrected phase" + let gtMtxFile = $args["<gtMtxFile>"] + let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"] + let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt" + let switchScoreFile = out_dir & "switch_score.txt" + let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz" + discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile) + if ($args["--chrom"] == "nil"): + echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option" + + discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"]) + + + + \ No newline at end of file -- GitLab