diff --git a/private/blocks_utils.nim b/private/blocks_utils.nim index 6c3dbc164dc7c4b09ca96e202f8c9ceaae58c8de..8e76e43e6c4b32bfd509ea8aaf8ff4069980caec 100755 --- a/private/blocks_utils.nim +++ b/private/blocks_utils.nim @@ -39,7 +39,7 @@ type Block_linkage* = ref object linkageType_00* :int linkageType_01*: int switch_posterior_prob*: float - switch_confidence*: float + linkage_confidence*: float switch*: int @@ -53,6 +53,7 @@ proc cal_switch_posterior*(error_rate = 0.1, n_blocks:int, n_sw:int): float = var ll_01 = error_rate^(n_blocks-n_sw)*(1-error_rate)^n_sw var ll_00 = error_rate^(n_sw)*(1-error_rate)^(n_blocks-n_sw) return ll_01/(ll_01+ll_00) + proc is_block_header(line_string:string): bool = if line_string.splitWhitespace()[0] == "BLOCK:": # .len == 11: diff --git a/private/cal_switch_score.nim b/private/cal_switch_score.nim new file mode 100755 index 0000000000000000000000000000000000000000..e86b69e6d3ab59cc2245dc3dbd2be76c47f56b89 --- /dev/null +++ b/private/cal_switch_score.nim @@ -0,0 +1,9 @@ +## calcualte switch score for snps positions that are high risk + +let binSize = 1000 +let movingStep = 200 + +# reduce the search space +# return seq of snp indexes + +proc findHighRiskSnps(fullGeno:seq[int]): seq[int] = \ No newline at end of file diff --git a/private/find_template_cell b/private/find_template_cell new file mode 100755 index 0000000000000000000000000000000000000000..2349d215e9302d9bb859f146e3f73e55201b74e4 Binary files /dev/null and b/private/find_template_cell differ diff --git a/private/find_template_cell.nim b/private/find_template_cell.nim new file mode 100755 index 0000000000000000000000000000000000000000..41478f3105b4affa3176f83f60e060f13b43f832 --- /dev/null +++ b/private/find_template_cell.nim @@ -0,0 +1,137 @@ +# find template gamete that serves as a initial haplotype for this individual + +import streams +import sequtils +import strutils +import math +import algorithm +import utils + +type + CellPairs = object + cell1*: int + cell2*: int + coexistSnps*: int + dissimilarity*: float + +let + minCoexit = 1000 + maxDissim = 0.0099 + +var nsnps,ncells,totalEntries:int + +proc toBinaryGeno(x: int) : BinaryGeno = + if x == 1: return gREF + if x == 2: return gALT + quit "mtx geno is not 1 or 2" + +proc readGtMtx*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]]):int = + var currentEntrySeq:seq[int] + var currentLine:seq[string] + + while not mtxFileStream.atEnd(): + currentLine = mtxFileStream.readLine().splitWhitespace() + ## i j 1-based from file + currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) + gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno + return 0 + +proc countCoexistMatch(seq1: seq[BinaryGeno], seq2: seq[BinaryGeno]): seq[int] = + if not (seq1.len == seq2.len): + quit "cells' genotypes under comparison do not have the same length" + var ncoexist, nmatch, cell1Snps, cell2Snps:int + for i, g in seq1: + if g != gUnknown: cell1Snps += 1 + if (seq2[i] != gUnknown): cell2Snps += 1 + if (g != gUnknown and seq2[i]!= gUnknown): + ncoexist += 1 + if(g == seq2[i]): + nmatch += 1 + return [cell1Snps,cell2Snps,nmatch,ncoexist].toSeq + +proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[int]): seq[CellPairs] = + let ncells = gtMtx.len + var genoMatch = @[0,0,0,0] + var returnCp = newSeq[CellPairs](nPairs) + var k = 0 + for celli in 0..(ncells-2): + if k == nPairs: break + for cellj in (celli+1)..(ncells-1): + genoMatch = countCoexistMatch(seq1 = gtMtx[celli], seq2 = gtMtx[cellj]) + nSnpsPercell[celli] = genoMatch[0] + nSnpsPercell[cellj] = genoMatch[1] + if genoMatch[3] >= minCoexit: + if (genoMatch[2]/genoMatch[3] < maxDissim) or (genoMatch[2]/genoMatch[3] > (1 - maxDissim)): + returnCp[k] = CellPairs(cell1: celli, cell2: cellj, coexistSnps:genoMatch[3], dissimilarity:(genoMatch[2]/genoMatch[3])) + k += 1 + if k == nPairs: break + returnCp + + +proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.75): int = + # return the selected cells' index + # not too many SNPs or too few + var chosed:int + var seqNsnps = nSnpsPercell + sort(seqNsnps, system.cmp[int]) + int(floor(float(seqNsnps.len) * q)) + +proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = + var nSnpsCells = newSeq[int](gtMtx.len) + var cellPairs = findCellPairs(gtMtx = gtMtx, nPairs = nPairs, nSnpsPercell = nSnpsCells) + echo "cellPairs" + echo cellPairs + var selectedCell, icp: int + if cellPairs[0].coexistSnps == 0 : + # no good pairs found + selectedCell = findCellBynSNPs(nSnpsPercell = nSnpsCells) + else: + let coExistSnps = map(cellPairs, proc(x: CellPairs) : int = x.coexistSnps) + icp = maxIndex(coExistSnps) + if nSnpsCells[cellPairs[icp].cell1] > nSnpsCells[cellPairs[icp].cell2]: + echo "selected cell: " & $cellPairs[icp].cell1 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell1] + return cellPairs[icp].cell1 + else: + echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2] + return cellPairs[icp].cell2 + +var gtMtx:seq[seq[BinaryGeno]] +var currentEntry:seq[int] +var currentEntrySeq:seq[string] +var gtMtxFileStream:FileStream +## i, j are 1-based +let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/WC_CNV_chr1_nonzero.mtx" +echo mtxFile + +try: + gtMtxFileStream = openFileStream(mtx_file, fmRead) +except: + stderr.write getCurrentExceptionMsg() +# %%MatrixMarket matrix coordinate integer general +discard gtMtxFileStream.readLine() +#N, i,j +currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() +currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) +nsnps = currentEntry[0] +echo "nsnps " & $nsnps +ncells = currentEntry[1] +echo "ncells " & $ncells + +totalEntries = currentEntry[2] + +echo "totalEntries " & $totalEntries + + +gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) + +discard readGtMtx(gtMtxFileStream,gtMtx) +#var colIndex = 4 +#echo gtMtx[0][27..40] + +#echo sliceColumn(gtMtx,56) +#echo gtMtx.len + +#echo findCellPairs(gtMtx = gtMtx) +#var nSnpsCells = newSeqWith(ncells,0) +#selectTemplateCell +echo selectTemplateCell(gtMtx = gtMtx, nPairs =3) \ No newline at end of file diff --git a/private/findpath.nim b/private/findpath.nim index 7498a84a991e7bf2d3db0e0afa166651d421e74b..902ffe50c409b27ad500b888b032e9db69c583f8 100755 --- a/private/findpath.nim +++ b/private/findpath.nim @@ -48,7 +48,7 @@ proc pathTrackBack*(currentSperm: var SpermViNodes, leftGap=[math.ln(transProb),math.ln(1-transProb)] inferProb+=leftGap[0] reverseProb+=leftGap[1] - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ")) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ")) transitFlag = true cSNP = 1 rightGap = leftGap @@ -69,7 +69,7 @@ proc pathTrackBack*(currentSperm: var SpermViNodes, leftGap=[math.ln(transProb),math.ln(1-transProb)] inferProb += leftGap[0] reverseProb += leftGap[1] - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) transitFlag = true cSNP = 1 rightGap = leftGap @@ -82,9 +82,9 @@ proc pathTrackBack*(currentSperm: var SpermViNodes, ## the first SNP is included in the segment from traced from back posStart = currentSperm.viNodeseq[0].pos if currentSperm.viNodeseq[0].state == stateRef: - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) else: - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ")) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ")) else: ## the first node has different state from the second, the first node has its own segment posStart = currentSperm.viNodeseq[0].pos @@ -95,10 +95,10 @@ proc pathTrackBack*(currentSperm: var SpermViNodes, if currentSperm.viNodeseq[0].state == stateRef: inferProb = currentEm[stateRef]+math.ln(transProb) reverseProb = currentEm[stateAlt]+math.ln(1-transProb) - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "1"],sep = " ")) else: inferProb = currentEm[stateAlt]+math.ln(transProb) reverseProb = currentEm[stateRef]+math.ln(1-transProb) - viSegmentInfo.writeLine(join(["ithSperm", $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") ) + viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") ) return 0 diff --git a/private/fragments.nim b/private/fragments.nim index 7eac2de7339221de0f06166ba72033ebf76e946d..1e250bc9261f4521c313f7205eabe00ac164eeea 100755 --- a/private/fragments.nim +++ b/private/fragments.nim @@ -3,12 +3,12 @@ import strutils import hts import utils import streams -# minSnpDepth, this SNP should at least be covered by two cells +# minSnpDepth = 2, this SNP should at least be covered by two cells proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, minTotalReads:int, mapq: int, barcodeTable:TableRef, minbsq:int, - barcodeTag:string, minCellDp:int, minSnpDepth:int ): Table[string,GtNode] = + barcodeTag:string ): Table[string,GtNode] = var barcodedNodesTable = initTable[string,GtNode]() var rec_alt:char @@ -54,34 +54,37 @@ proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, m continue if total_reads > maxTotalReads or total_reads < minTotalReads: return initTable[string,GtNode]() + return barcodedNodesTable + +proc callNodeGenotype*(bcdTable: var Table[string, GtNode], minCellDp: int, minSnpDepth:int ,p0 = 0.3, p1 = 0.8): Table[string, GtNode] = var calt = 0 var cellDP = 0 var delBarcodes:seq[string] - for cellbarcode in barcodedNodesTable.keys: - cellDP = barcodedNodesTable[cellbarcode].alleles.len + for cellbarcode in bcdTable.keys: + cellDP = bcdTable[cellbarcode].alleles.len if cellDP < minCellDp: delBarcodes.add(cellbarcode) continue - calt = barcodedNodesTable[cellbarcode].alleles.count("1") - if calt/cellDP < 0.3: - barcodedNodesTable[cellbarcode].genotype = 0 - elif calt/cellDP > 0.8: - barcodedNodesTable[cellbarcode].genotype = 1 + calt = bcdTable[cellbarcode].alleles.count("1") + if calt/cellDP < p0: + bcdTable[cellbarcode].genotype = gREF + elif calt/cellDP > p1: + bcdTable[cellbarcode].genotype = gALT else: delBarcodes.add(cellbarcode) continue for bc in delBarcodes: - barcodedNodesTable.del(bc) - if barcodedNodesTable.len < minSnpDepth: + bcdTable.del(bc) + if bcdTable.len < minSnpDepth: return initTable[string,GtNode]() - return barcodedNodesTable + return bcdTable ## write the .fmf output stream proc writeToFMF*(twoNodesFrag: Fragment, fmf_out:FileStream): Fragment = if twoNodesFrag.node2.variantIndex - twoNodesFrag.node1.variantIndex == 1: # echo "writting for cell " & twoNodesFrag.cellbarcode & " with 1 block" - if $twoNodesFrag.node1.genotype == "" or $twoNodesFrag.node2.genotype == "": + if twoNodesFrag.node1.genotype == gUnknown or twoNodesFrag.node2.genotype == gUnknown: quit "Genotype unknown when writing out" fmf_out.writeLine(join(["1",twoNodesFrag.cellbarcode & "." & $twoNodesFrag.counter, $twoNodesFrag.node1.variantIndex, $twoNodesFrag.node1.genotype & $twoNodesFrag.node2.genotype,"~~"], sep = " ")) @@ -98,18 +101,8 @@ proc writeToFMF*(twoNodesFrag: Fragment, fmf_out:FileStream): Fragment = proc updateCellFragment*(barcodedNodesTable:Table[string,GtNode], cellFragmentsTable:TableRef[string,Fragment], fmf_out:FileStream): TableRef[string,Fragment] = - # echo "updating for " & $barcodedNodesTable.len for cellbarcode in barcodedNodesTable.keys: - # cellDP = barcodedNodesTable[cellbarcode].alleles.len - # if cellDP < minCellDp: continue - # calt = barcodedNodesTable[cellbarcode].alleles.count("1") - # if calt/cellDP < 0.3: - # barcodedNodesTable[cellbarcode].genotype = 0 - # elif calt/cellDP > 0.8: - # barcodedNodesTable[cellbarcode].genotype = 1 - # else: - # continue if cellFragmentsTable.hasKey(cellbarcode): if (not cellFragmentsTable[cellbarcode].node2.isNil) or cellFragmentsTable[cellbarcode].node1.isNil : quit "Writing fmf went wrong, the second allele shoud be moved to first after getting one fragment" diff --git a/private/genotype_mtx b/private/genotype_mtx new file mode 100755 index 0000000000000000000000000000000000000000..c535f52659c7803e1eb943eef8f990666301d1b4 Binary files /dev/null and b/private/genotype_mtx differ diff --git a/private/genotype_mtx.nim b/private/genotype_mtx.nim new file mode 100755 index 0000000000000000000000000000000000000000..66feb5819811ea34cbe1d6bd61512cc5f893315f --- /dev/null +++ b/private/genotype_mtx.nim @@ -0,0 +1,138 @@ +## generate gtMtx, gtMtx_snpAnnot, totalCount.mtx altCount.mtx + +## author Ruqian Lyu +## Date: 2021 Oct +## gtMtx in 1, or 2 +import tables +import strutils +import hts +import utils +import streams + +import fragments + +## these outputs will be per chromosome + + +proc getGtMtx(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, + outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int, + minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.8):int = + var variantIndex = 1 + var writtenVarintIndex = 1 + var calt,cellDP,geno,cellIndex,nnsize = 0 + var delBarcodes:seq[string] + var cellGtNodes: Table[string, GtNode] + var cellGtNodeCopy: Table[string, GtNode] + for rec in ivcf.query("*"): + var rec_alt = rec.ALT[0][0] + var rec_ref = rec.REF[0] + cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, + minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, + minbsq=minbsq,barcodeTag=barcodeTag) + cellGtNodeCopy = cellGtNodes + for cellbarcode in cellGtNodeCopy.keys: + cellDP = cellGtNodes[cellbarcode].alleles.len + if cellDP < minCellDp: + cellGtNodes.del(cellbarcode) + continue + calt = cellGtNodes[cellbarcode].alleles.count("1") + if calt/cellDP < p0: + cellGtNodes[cellbarcode].genotype = gREF + elif calt/cellDP > p1: + cellGtNodes[cellbarcode].genotype = gALT + else: + cellGtNodes.del(cellbarcode) + continue + if cellGtNodes.len < minSnpDepth: + variantIndex.inc + continue + else: + for cellbarcode in cellGtNodes.keys: + cellDP = cellGtNodes[cellbarcode].alleles.len + calt = cellGtNodes[cellbarcode].alleles.count("1") + ## $cellGtNodes[cellbarcode].genotype BinaryGeno to "0", "1" or "-1" (not -1), write to file coding in 1 or 2 + geno = parseInt($cellGtNodes[cellbarcode].genotype) + 1 + # index from 1 for matrices + cellIndex = barcodeTable[cellbarcode] + 1 + outGtMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $geno],sep = " ")) + outTotalCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $cellDP],sep = " ")) + outAltCountMtx.writeLine(join([$writtenVarintIndex, $cellIndex, $calt],sep = " ")) + outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) + nnsize.inc + writtenVarintIndex.inc + ## write to matrices + + echo "processed " & $(variantIndex) & " variants" + echo "wrote " & $(writtenVarintIndex) & " variants to matrices and snpAnnot file" + for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: + outFileStream.setPosition(49) + outFileStream.write($(writtenVarintIndex) & " " & $barcodeTable.len & " " & $nnsize) + for outFileStream in[outGtMtx,outTotalCountMtx,outAltCountMtx, outSnpAnnot]: + outFileStream.close() + return 0 + +let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" +let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" +let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" +var + ibam:Bam + ivcf:VCF + outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream +if not open(ibam, bam_file, threads=1, index = true): + quit "couldn't open input bam" +if not open(ivcf, vcf_file, threads=1): + quit "couldn't open: vcf file" + +var hf = hts.hts_open(cstring(barcode_file), "r") +#### TODO : Table size +var barcodeTable = newTable[string,int](initialSize = 1024) +var kstr: hts.kstring_t +kstr.l = 0 +kstr.m = 0 +kstr.s = nil +var ithSperm = 0 +## initiate the table with CB as keys, allele counts (ref object) as elements +while hts_getline(hf, cint(10), addr kstr) > 0: + if $kstr.s[0] == "#": + continue + var v = $kstr.s + discard barcodeTable.hasKeyOrPut(v, ithSperm) + ithSperm.inc +discard hf.hts_close() + +let s_Chrs = @["chr1"] +let out_prefix = "test_data/getMtx/" +echo "generating gtMtx" +try: + outGtMtx = openFileStream(out_prefix & "_gtMtx.mtx", fmWrite) + outSnpAnnot = openFileStream(out_prefix & "_snpAnnot.txt", fmWrite) + outTotalCountMtx = openFileStream(out_prefix & "_totalMtx.mtx", fmWrite) + outAltCountMtx = openFileStream(out_prefix & "_AltMtx.mtx", fmWrite) +except: + stderr.write getCurrentExceptionMsg() + +## write headers to those mtx files and the first line place holder for total_row total_column total_entry +let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" + +for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: + outFileStream.writeLine(sparseMatrixHeader) + outFileStream.writeLine(' '.repeat(50)) + +outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t")) + +discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, + outSnpAnnot = outSnpAnnot, maxTotalReads = 30, + minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) + + +var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string + +## sort entries in mtx files +for chrom in s_Chrs: + outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" + outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" + outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" + for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + diff --git a/private/infer_missing_snps b/private/infer_missing_snps new file mode 100755 index 0000000000000000000000000000000000000000..85df612e74316fd0810c1ef6ddc70cf358b34890 Binary files /dev/null and b/private/infer_missing_snps differ diff --git a/private/infer_missing_snps.nim b/private/infer_missing_snps.nim new file mode 100755 index 0000000000000000000000000000000000000000..ab4174623d6846b85cd8999c16bc2f1e4346e457 --- /dev/null +++ b/private/infer_missing_snps.nim @@ -0,0 +1,130 @@ +## infer missing snp in template cell by looking at the snp's link to other nearby snps in other cells + +## author Ruqian Lyu +## rlyu@svi.edu.au +import find_template_cell +import sequtils +import math +import streams +import strutils +import utils + +let nAnchorSnps = 10 +proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = + map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex]) + +# cell_geno seq of length nAnchorSnps +# temp_geno seq of length nAnchorSnps +# return whether cell_geno matches with temp_geno in hap0 or hap1(bitwise complementary to temp_geno) +proc matchTemplateHap(cell_geno:seq[BinaryGeno], temp_geno:seq[BinaryGeno],error_rate = 0.1): seq[float] = + var nmatch = 0 + for i,g in cell_geno: + if g == temp_geno[i]: + nmatch += 1 + let mis = cell_geno.len - nmatch + let ll_h0 = error_rate^mis*(1-error_rate)^(nmatch) + let ll_h1 = error_rate^nmatch*(1-error_rate)^(mis) + if ll_h0 > ll_h1: + return @[0.0, ll_h0/(ll_h0 + ll_h1)] + elif ll_h0 == ll_h1: + return @[-1.0, 0.5] + else: + return @[1.0, ll_h1/(ll_h0 + ll_h1)] + +proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.98): BinaryGeno = + let prob_0 = error_rate^type10*(1-error_rate)^(type00) + let prob_1 = error_rate^type00*(1-error_rate)^(type10) + let prob_0_p = prob_0/(prob_0 + prob_1) + let prob_1_p = 1 - prob_0_p + if max(prob_0_p,prob_1_p) > posterior_thresh: + if prob_0_p > prob_1_p: + return gREF + else: + return gALT + return gUnknown +# return full sequence of template geno by inferring missing SNPs's genotypes +proc inferSnpGeno(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99):seq[BinaryGeno] = + var fullGeno = templateGeno + let nCells = gtMtx.len + let nSnps = gtMtx[0].len + var coexisPos:seq[int] + var snpLD: seq[float] + var type00, type10: int + + var snpiInCells = newSeq[BinaryGeno](gtMtx.len) + var offset = 1 + for i,missingSnpi in templateGeno: + if missingSnpi != gUnknown: continue + snpiInCells = sliceColumn(gtMtx,i) + type00 = 0 + type10 = 0 + for j,snpiGeno in snpiInCells: + if snpiGeno == gUnknown: continue + # find 10 coexisting positions for cell j with template geno seq + offset = 1 + coexisPos = newSeq[int]() + while(coexisPos.len < nAnchorSnps): + if ((i+offset) < nSnps): + if((gtMtx[j][i+offset] != gUnknown) and (fullGeno[i+offset] != gUnknown)): + coexisPos.add(i+offset) + if (i-offset) >= 0: + if((gtMtx[j][i-offset] != gUnknown) and (fullGeno[i-offset] != gUnknown)): + coexisPos.add(i-offset) + offset += 1 + snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), temp_geno = map(coexisPos,proc(x:int): BinaryGeno = fullGeno[x]) ) + if snpLD[1] > posterior_thresh: + if snpLD[0] == 1.0: + if snpiGeno == gREF: ## snpiGeno is 1 or 2 + type10 += 1 + else: + type00 += 1 + else: + if snpiGeno == gREF: + type00 += 1 + else: + type10 += 1 + else: continue + # echo "type00 " & $type00 + # echo "type10 " & $type10 + fullGeno[i] = inferGeno(type00, type10) + if i > 20: + break + return fullGeno + + + +var gtMtx:seq[seq[BinaryGeno]] +var currentEntry:seq[int] +var currentEntrySeq:seq[string] +var gtMtxFileStream:FileStream +## i, j are 1-based +let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/WC_CNV_chr1_nonzero.mtx" +echo mtxFile + +try: + gtMtxFileStream = openFileStream(mtx_file, fmRead) +except: + stderr.write getCurrentExceptionMsg() +# %%MatrixMarket matrix coordinate integer general +discard gtMtxFileStream.readLine() +#N, i,j +currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() +currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) +var nsnps = currentEntry[0] +echo "nsnps " & $nsnps +var ncells = currentEntry[1] +echo "ncells " & $ncells + +var totalEntries = currentEntry[2] + +echo "totalEntries " & $totalEntries + + +gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) + +discard readGtMtx(gtMtxFileStream,gtMtx) + +var tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3) +var tempcellGeno = gtMtx[tempcell] +let fullGeno = inferSnpGeno(tempcellGeno, gtMtx) +echo fullGeno.len diff --git a/private/infer_snp_hap.nim b/private/infer_snp_hap.nim new file mode 100755 index 0000000000000000000000000000000000000000..0ef2896c2afd9aa5c835e9db4ead71550a642be5 --- /dev/null +++ b/private/infer_snp_hap.nim @@ -0,0 +1,5 @@ +# infer SNP hap, give the current hap + +var hapVCFFile = "" +var inputVCFFile: +var outVCFFile: \ No newline at end of file diff --git a/private/phase_blocks.nim b/private/phase_blocks.nim index 9f2b2db9f74207839c30609bdde79874fe5e1c0b..8fdbbb424d46bde3e2194a0f432cd30b5f4d10d9 100755 --- a/private/phase_blocks.nim +++ b/private/phase_blocks.nim @@ -7,10 +7,10 @@ import math import sequtils import hts -let n_anchor_snps = 9 -let posterior_threshold = 0.99 +let n_anchor_snps = 19 +let posterior_threshold = 0.9999 let confidence_threshold = 0.95 -let debug = false +let debug = true proc match_phase(cell_geno:seq[int], block_h0:seq[int]): int = if cell_geno.len != block_h0.len: @@ -75,6 +75,7 @@ proc find_contig_blocks*(blocks:seq[Block],cellBlockLeftPhaseTable: TableRef[str var contig_blocks = newSeq[int]() var count_missing = newSeqWith[blocks.len,0] var contig_block_allow_missing = int(ceil(float(cellBlockLeftPhaseTable.len) * 0.1)) + if debug: echo "contig_block_allow_missing: " & $contig_block_allow_missing # for b in 0..high(blocks): var left_right_phase: seq[tuple[left:int,right:int]] for bc in cellBlockLeftPhaseTable.keys(): @@ -86,12 +87,13 @@ proc find_contig_blocks*(blocks:seq[Block],cellBlockLeftPhaseTable: TableRef[str if debug: echo "block missing in X number of cells:" & $count_missing for i, c in count_missing: if c <= contig_block_allow_missing: contig_blocks.add(i) + if contig_blocks.len == 0 : + contig_blocks.add(minIndex(count_missing)) return contig_blocks ## find linkage of two blocks -## proc find_linkage*(block1: int, block2: int, cellBlockLeftPhaseTable: TableRef[string, seq[int]],cellBlockRightPhaseTable: TableRef[string, seq[int]]): Block_linkage= - if debug: echo "linking " & $block1 & "and" & $block2 + if debug: echo "linking " & $block1 & " and " & $block2 var left_right_phase: seq[tuple[left:int,right:int]] var linkString: string if not (block1 < block2): @@ -106,7 +108,8 @@ proc find_linkage*(block1: int, block2: int, cellBlockLeftPhaseTable: TableRef[s elif linkString in @["01","10"]: blockLinkage.linkageType_01.inc blockLinkage.switch_posterior_prob = cal_switch_posterior(error_rate = 0.1, (blockLinkage.linkageType_00 + blockLinkage.linkageType_01), blockLinkage.linkageType_01) - blockLinkage.switch_confidence = log10(blockLinkage.switch_posterior_prob) - log10(1-blockLinkage.switch_posterior_prob) + blockLinkage.linkage_confidence = log10(blockLinkage.switch_posterior_prob) - log10(1-blockLinkage.switch_posterior_prob) + echo "blockLinkage.linkage_confidence : " & $blockLinkage.linkage_confidence return blockLinkage # contig_blocks, the blocks to build contigs. Rest of the blocks are filled in if can @@ -115,7 +118,7 @@ proc build_contig*(contig_blocks:seq[int], cellBlockLeftPhaseTable: TableRef[str var blockLinkage:Block_linkage for b in 0..(contig_blocks.len-2): blockLinkage = find_linkage(block1=contig_blocks[b], block2=contig_blocks[b+1],cellBlockLeftPhaseTable,cellBlockRightPhaseTable) - if blockLinkage.switch_confidence < 0.0: + if blockLinkage.linkage_confidence < 0.0: blockLinkage.switch = 0 else: blockLinkage.switch = 1 @@ -127,12 +130,12 @@ proc infer_block_linkage_to_one_contig_block*(contig_block:int,to_infer_block:in cellBlockRightPhaseTable: TableRef[string, seq[int]], at_left: bool):int = if at_left: - if debug: echo "linking " & $to_infer_block & "to contig block " & $contig_block & " from left" + if debug: echo "linking " & $to_infer_block & " to contig block " & $contig_block & " from left" var blockLinkage = Block_linkage(prevBlock : to_infer_block, nextBlock :contig_block, switch : -1) blockLinkage = find_linkage(block1 = to_infer_block, block2 = contig_block, cellBlockLeftPhaseTable,cellBlockRightPhaseTable) - if abs(blockLinkage.switch_confidence) < confidence_threshold: + if abs(blockLinkage.linkage_confidence) < confidence_threshold: return -1 - elif blockLinkage.switch_confidence > 0: + elif blockLinkage.linkage_confidence > 0: return 1 else: return 0 @@ -140,9 +143,9 @@ proc infer_block_linkage_to_one_contig_block*(contig_block:int,to_infer_block:in if debug: echo "linking " & $to_infer_block & "to contig block " & $contig_block & " from right" var blockLinkage = Block_linkage(prevBlock: contig_block, nextBlock : to_infer_block , switch: -1) blockLinkage = find_linkage(block1 = contig_block, block2 = to_infer_block, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - if abs(blockLinkage.switch_confidence) < confidence_threshold: + if abs(blockLinkage.linkage_confidence) < confidence_threshold: return -1 - elif blockLinkage.switch_confidence > 0: + elif blockLinkage.linkage_confidence > 0.0 : return 1 else: return 0 @@ -157,29 +160,42 @@ proc infer_non_contig_blocks*(contig_blocks:seq[int], linkedContigs:seq[Block_li var blocks_phase = newSeqWith(blocks.len,-1) var left_contig_block:int var right_contig_block: int + if debug: echo "now try loop blocks_phase len " & $blocks_phase.len + if debug: echo "now try loop contig_blocks len contig_blocks.len " & $contig_blocks.len + blocks_phase[contig_blocks[0]] = 0 - for lc in linkedContigs: - if lc.switch == 0: - if debug: echo "blocks_phase[lc.nextBlock] " & $blocks_phase[lc.nextBlock] - - blocks_phase[lc.nextBlock] = blocks_phase[lc.prevBlock] - else: - blocks_phase[lc.nextBlock] = (1 xor blocks_phase[lc.prevBlock]) + if debug: echo "now try loop linkedContigs error due to len " & $linkedContigs.len + + try: + for lc in linkedContigs: + if lc.switch == 0: + if debug: echo "blocks_phase[lc.nextBlock] " & $blocks_phase[lc.nextBlock] + blocks_phase[lc.nextBlock] = blocks_phase[lc.prevBlock] + else: + blocks_phase[lc.nextBlock] = (1 xor blocks_phase[lc.prevBlock]) + except: + if debug: echo "loop linkedContigs error due to len " & $linkedContigs.len + if debug: echo "block contig haplotype : " & $blocks_phase + if debug: echo " contig blocks are : " & $contig_blocks + for b_j in 0..(blocks.len-1): if not (b_j in contig_blocks): if b_j < contig_blocks[0]: hap_sw = infer_block_linkage_to_one_contig_block(contig_block = contig_blocks[0], to_infer_block = b_j, cellBlockLeftPhaseTable,cellBlockRightPhaseTable, at_left=true) if hap_sw == 1: - blocks_phase[b_j] = (1 xor contig_blocks[0]) + if debug: echo "hap_sw == 1" + blocks_phase[b_j] = (1 xor blocks_phase[contig_blocks[0]]) elif hap_sw == 0: - blocks_phase[b_j] = contig_blocks[0] + if debug: echo "hap_sw == 0" + blocks_phase[b_j] = blocks_phase[contig_blocks[0]] + if debug: echo "blocks_phase[b_j] " & $blocks_phase[b_j] elif b_j > contig_blocks[contig_blocks.len-1]: hap_sw = infer_block_linkage_to_one_contig_block(contig_block = contig_blocks[contig_blocks.len-1], to_infer_block = b_j,cellBlockLeftPhaseTable,cellBlockRightPhaseTable,at_left=false) if hap_sw == 1: - blocks_phase[b_j] = (1 xor contig_blocks[0]) + blocks_phase[b_j] = (1 xor blocks_phase[contig_blocks[contig_blocks.len-1]]) elif hap_sw == 0: - blocks_phase[b_j] = contig_blocks[0] + blocks_phase[b_j] = blocks_phase[contig_blocks[contig_blocks.len-1]] else: ## in between two contig blocks left_contig_block = b_j @@ -196,11 +212,21 @@ proc infer_non_contig_blocks*(contig_blocks:seq[int], linkedContigs:seq[Block_li blocks_phase[b_j] = blocks_phase[right_contig_block] elif hap_sw_left == 0: blocks_phase[b_j] = blocks_phase[left_contig_block] - elif hap_sw_right == 0 and (hap_sw_left == 1 or hap_sw_left == -1): + elif hap_sw_right == 0 and (hap_sw_left in [1,-1]): blocks_phase[b_j] = blocks_phase[right_contig_block] - elif hap_sw_right == 1 and (hap_sw_left == 0 or hap_sw_left == -1): + elif hap_sw_right == 1 and (hap_sw_left in [0,-1]): + blocks_phase[b_j] = blocks_phase[left_contig_block] + elif blocks_phase[left_contig_block] == blocks_phase[right_contig_block]: + if (hap_sw_right == -1 and hap_sw_left == 1) or (hap_sw_right == 1 and hap_sw_left == -1) : + blocks_phase[b_j] = (1 xor blocks_phase[left_contig_block]) + elif (hap_sw_right == -1 and hap_sw_left == 0) or (hap_sw_right == 0 and hap_sw_left == -1) : + blocks_phase[b_j] = blocks_phase[left_contig_block] + elif hap_sw_right == 0 and hap_sw_left == 0: blocks_phase[b_j] = blocks_phase[left_contig_block] + elif hap_sw_right == 1 and hap_sw_left == 1: + blocks_phase[b_j] = (1 xor blocks_phase[left_contig_block]) else: continue + if debug: echo "After infer non contig blocks: " & $blocks_phase return blocks_phase proc write_linked_blocks*(vcfFile:string, outFile: string, blocks:seq[Block], blocks_phase:seq[int], threads:int):int = @@ -233,20 +259,20 @@ proc write_linked_blocks*(vcfFile:string, outFile: string, blocks:seq[Block], bl # don't need to flip if blocks[current_block].h0[block_pos_i] == 0: ## 0|1 - if debug: echo "block hap0 and this h0 is 0 0|1" + #if debug: echo "block hap0 and this h0 is 0 0|1" gt_string = @[int32(2),int32(5)] else: ## 1|0 - if debug: echo "block hap0 and this h0 is 1 1|0" + #if debug: echo "block hap0 and this h0 is 1 1|0" gt_string = @[int32(4),int32(3)] elif blocks_phase[current_block] == 1: if blocks[current_block].h0[block_pos_i] == 0: ## 1|0 - if debug: echo "block hap1 and this h0 is 0 1|0" + #if debug: echo "block hap1 and this h0 is 0 1|0" gt_string = @[int32(4),int32(3)] else: ## 0|1 - if debug: echo "block hap1 and this h0 is 1 0|1" + #if debug: echo "block hap1 and this h0 is 1 0|1" gt_string = @[int32(2),int32(5)] if f.set("GT",gt_string) != Status.OK: quit "set GT failed" diff --git a/private/utils.nim b/private/utils.nim index a4843a15059d469023ab808eced13f6a2e63adaa..e155e2b26500bfb4654c4997c6a0d35d3307c3e0 100755 --- a/private/utils.nim +++ b/private/utils.nim @@ -2,6 +2,9 @@ from distributions/rmath import dbinom import math import tables import hts +import algorithm +import streams +import strutils type allele_expr* = ref object @@ -9,6 +12,8 @@ type calt*: int ViState* = enum stateRef, stateAlt, stateN + BinaryGeno* = enum + gUnknown, gREF, gALT ViNode* = object pos*: int cRef*: int @@ -20,12 +25,29 @@ type chrom*: string variantIndex*: int alleles*: string - genotype*: int + genotype*: BinaryGeno Fragment* = ref object cellbarcode*: string counter*: int node1*: GtNode node2*: GtNode + +type + MtxRow* = tuple + rowIndex: int + colIndex: int + value: int +type + Mtx* = ref object + header*: string + dims*: string + lines*: seq[MtxRow] + +proc `$`*(x: BinaryGeno): string = + if x == gUnknown: return "-1" + if x == gREF: return "0" + if x == gALT: return "1" + proc get_base_offset*(position:int, align: Record): int = var off = align.start.int @@ -114,4 +136,49 @@ proc countAllele*(ibam:Bam, maxTotalReads:int, continue if total_reads > maxTotalReads or total_reads < minTotalReads: return initTable[string,allele_expr]() - return alleleCountTable \ No newline at end of file + return alleleCountTable + +proc readMtx*(mtx_file:string): Mtx = + var mtxFileStream:FileStream + try : + mtxFileStream = openFileStream(mtx_file, fmRead) + except: + stderr.write getCurrentExceptionMsg() + var sparseMtx = Mtx() + var current_line: string +# var entry: MtxRow + sparseMtx.header = mtxFileStream.readLine() + sparseMtx.dims = mtxFileStream.readLine() + while not mtxFileStream.atEnd(): + current_line = mtxFileStream.readLine() + sparseMtx.lines.add((rowIndex: parseInt(current_line.splitWhitespace()[0]), + colIndex: parseInt(current_line.splitWhitespace()[1]), + value: parseInt(current_line.splitWhitespace()[2]) )) + mtxFileStream.close() + + return sparseMtx +# sort by col index +proc compareMtxRow*(entry1: MtxRow, entry2: MtxRow): int = + if entry1.colIndex < entry2.colIndex: + return -1 + elif entry1.colIndex == entry2.colIndex: + if entry1.rowIndex < entry2.rowIndex: + return -1 + else: + return 1 + elif entry1.colIndex > entry2.colIndex: + return 1 + +proc sortWriteMtx*(sMtx:var Mtx, mtx_file:string):int = + var mtxFileStream:FileStream + try : + mtxFileStream = openFileStream(mtx_file, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + var current_line: string + sMtx.lines.sort(compareMtxRow) + mtxFileStream.writeLine(sMtx.header) + mtxFileStream.writeLine(sMtx.dims) + for entry in sMtx.lines: + mtxFileStream.writeLine(join([$entry.rowIndex, $entry.colIndex, $entry.value],sep = " ")) + mtxFileStream.close() diff --git a/sgcocaller.nim b/sgcocaller.nim index fb06183fd25f9f3b11a7e4da57390d7d39c7093a..f015b97d8f141f4d099d33211db3ee343a75e348 100755 --- a/sgcocaller.nim +++ b/sgcocaller.nim @@ -25,11 +25,12 @@ proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTota var cellFragmentsTable = newTable[string,Fragment]() echo "generated fragments from " & $barcodeTable.len & " single cells" - + var cellGtNodes: Table[string, GtNode] for rec in ivcf.query("*"): - var cellGtNodes = findGtNodes(rec=rec, variantIndex= variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, - minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, minbsq=minbsq,barcodeTag=barcodeTag, - minCellDp = mindp, minSnpDepth = minsnpdepth) + cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, + minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, + minbsq=minbsq,barcodeTag=barcodeTag) + cellGtNodes = callNodeGenotype(cellGtNodes,minCellDp = mindp, minSnpDepth = minsnpdepth, p0 = 0.3, p1 = 0.8) cellFragmentsTable = updateCellFragment(barcodedNodesTable=cellGtNodes,cellFragmentsTable = cellFragmentsTable, fmf_out = outfmf) variantIndex = variantIndex + 1 echo "processed " & $(variantIndex) & " variants" @@ -52,6 +53,7 @@ proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfi ## ## What if blocks cannot be linked ? the number of 11(00) == 10(01) echo "Total blocks " & $chr_blocks.len + var contig_blocks:seq[int] for b_j, bl in chr_blocks: echo "block " & $b_j & " len: " & $bl.h0.len @@ -66,8 +68,6 @@ proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfi echo "prevBlock" & $bk.prevBlock & "nextBlock" & $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 & " 01 type " & $bk.linkageType_01 & " switch posterior prob " & $bk.switch_posterior_prob & " switch value :" & $bk.switch phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - # echo phased_blocks - discard write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads) return 0 proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, @@ -125,7 +125,8 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, # if not gt == 0|1 continue elif not (ac[0] == 2 and ac[1] == 5): continue var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, - maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag, startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt = rec_alt, rec_ref = rec_ref) + maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag, + startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt = rec_alt, rec_ref = rec_ref) if alleleCountTable.len==0: continue ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations @@ -208,7 +209,7 @@ Options: --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5] --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25] --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10] - --minSNPdepth <minSNPdepth> the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 2] + --minSNPdepth <minSNPdepth> the minimum depth of coverage for a SNPs to be includes in generated fragments [default: 1] --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1] @@ -242,7 +243,6 @@ Options: minsnpdepth:int out_vcf,fmf,barcodeFile,bamfile,vcff,hapfile:string vcfGtPhased = false - echo $args threads = parse_int($args["--threads"]) barcodeTag = $args["--barcodeTag"] @@ -312,7 +312,15 @@ Options: echo "running for these chromosomes as specified in the header of the provided VCF file " & $s_Chrs discard sgcocaller(threads, ivcf, barcodeTable, ibam, out_dir, mapq, minbsq, mintotal, - maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag,phased = vcfGtPhased) + maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) + var outFileTotalCountMtxFile, outFileAltCountMtxFile: string + ## sort entries in mtx files + for chrom in s_Chrs: + outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" + outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" + for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) ibam.close() ivcf.close() if args["phase_blocks"]: