diff --git a/plotDiagnosticHap.R b/plotDiagnosticHap.R new file mode 100644 index 0000000000000000000000000000000000000000..7b790211030e0756323010f473ddef2ae137c817 --- /dev/null +++ b/plotDiagnosticHap.R @@ -0,0 +1,54 @@ +## diagnostic txt to png + +## phase/\ +.libPaths("/mnt/beegfs/mccarthy/scratch/general/rlyu/Software/R/4.0/library/") + +args <- (commandArgs(trailingOnly = TRUE)) + +for (i in seq_len(length(args))) { + eval(parse(text = args[[i]])) +} + +print(pngfile) +print(inputfile) +print(snpAnnotFile) + + +tryCatch( + { + library(ggplot2) + library(dplyr) + library(tidyr) + message("Loading pacakges") + + # for the condition handlers for warnings and error below) + }, + error=function(cond) { + + message("required R pacakges for plotting diagnostic plots are not available") + + return(NA) + }) + +plot_df <- read.table(file = inputfile, + header = T) +snpAnnot_df <- read.table(file = snpAnnotFile, header =TRUE) +snpAnnot_df <- snpAnnot_df[snpAnnot_df$Phase!="-1",] +plot_df_match <- apply(plot_df,2,function(x){ + tmp <- (x == plot_df$templateGeno) + tmp[x == 0] <- NA + tmp +}) + + +data.frame(cbind(plot_df_match,snpAnnot_df)) %>% + tidyr::pivot_longer(cols = colnames(plot_df)) %>% + dplyr::filter(!is.na(value),POS > 1.3e8, POS < 1.35e8) %>% + ggplot()+ + geom_point(mapping = aes(x = POS , y = name, color = value))+ + scale_color_manual(values = c("FALSE"='red', + "TRUE" = "blue"))+ + theme_bw(base_size = 18)+geom_vline(mapping = aes(xintercept = c(switch_pos[2])), + color = "red",size = 1.2) + +ggsave(pngfile, dpi = 100, width = 14, height = 6) diff --git a/private/cal_switch_score.nim b/private/cal_switch_score.nim index e86b69e6d3ab59cc2245dc3dbd2be76c47f56b89..b83d4a7e9db582214560d829ae84658ea0ec2fea 100755 --- a/private/cal_switch_score.nim +++ b/private/cal_switch_score.nim @@ -1,9 +1,265 @@ ## calcualte switch score for snps positions that are high risk +import utils +import sequtils +import math +import streams +# bins of SNP indexes +import strutils let binSize = 1000 -let movingStep = 200 - +let movingStep = 500 +let dissimThresh = 0.0099 +let lookBeyondSnps = 25 +let debug = false +let switchPrior = 0.5 +type + switchScoreTuple = tuple + switch_scores: seq[float] + switch_scores_snpIndex: seq[int] + # reduce the search space # return seq of snp indexes +# low risk bins are bins that do not have crossover happening in majority of the cells. + +## rough crossover estimates: return binary results, 0 means no crossovers, 1 means there is. +## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin + +## cell_geno snp by cell +proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = + let ncells = cell_geno[0].len + var ncompared = newSeq[int](ncells) + var nmatch = newSeq[int](ncells) + for snpi,g in templ_geno: + for cellj in 0..(ncells-1): + if g != gUnknown and cell_geno[snpi][cellj] != gUnknown: + ncompared[cellj].inc + if g == cell_geno[snpi][cellj]: nmatch[cellj].inc + if debug: echo "ncompared " & $ncompared + let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float = nmatch[x]/ncompared[x]) + if debug: echo "dissim " & $dissim + var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh))) + if debug: echo "ncrossover " & $ncrossover + let nxcells = foldl(ncrossover, a + b) + if nxcells < int(floor(0.5 * float(ncells))): + if debug: echo "bin had no crossovers : " & $nxcells + return false + else: + if debug: echo "bin had many crossovers : " & $nxcells + return true + +proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = + #let ncells = gtMtxByCell[0].len + let nSnps = gtMtxByCell.len + let nBins = (int)(ceil(nSnps / (binSize - movingStep))) + if debug: echo "nBins: " & $nBins + let binIds = toSeq(0..(nBins-1)) + var snpPosStart,snpPosEnd: int + var highRiskSnps: seq[int] + for binI in binIds: + if debug: echo "binI " & $binI + snpPosStart = (binI)*(binSize - movingStep) + if (snpPosStart + binSize) > (nSnps-1): + snpPosEnd = (nSnps-1) + else: + snpPosEnd = snpPosStart + binSize + if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )): + highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd)) + + + # add a last bin as from end of chrom with length binSize + let lastBinStart = nSnps - binSize + let lastBinEnd = nSnps - 1 + if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], + cell_geno = gtMtxByCell[snpPosStart..snpPosEnd])): + highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd)) + + highRiskSnps = deduplicate(highRiskSnps) + return highRiskSnps + +proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] = + var fullGeno = newSeq[BinaryGeno](nSnps) + var i = 0 + var currentEntrySeq: seq[string] + while not phasedSnpAnnotFileStream.atEnd(): + currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace() + fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01") + i.inc + if i != nSnps : + quit "phased snpAnnot file does not have the same number of rows with gtMtx" + return fullGeno + +proc calculateProbslog10(hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno], error_rate = 0.1) : float = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + let prob = 0.5*(error_rate^nmis*(1-error_rate)^nmatch + error_rate^nmatch*(1-error_rate)^nmis) + return(log10(prob)) + +proc countNmatch( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + return nmatch + +proc countNMis( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = + var nmatch,ncompare:int + for i,g in hap: + if (g != gUnknown) and (cell_hap[i] != gUnknown): + ncompare.inc + if (g == cell_hap[i]): nmatch.inc + let nmis = ncompare - nmatch + return nmis + +proc binarySwitch(x: BinaryGeno):BinaryGeno = + if(x == gUnknown): gUnknown + elif(x == gREF):gALT + else: gREF + +proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = + map(hap, binarySwitch) + +proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = + map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex]) + +proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno]): switchScoreTuple = + var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int + let nSnps = gtMtxByCell.len + let ncells = gtMtxByCell[0].len + var hswitch,cell_hap,cell_full_hap,templ_geno,templ_geno_left,templ_geno_right: seq[BinaryGeno] + var prob_switch, prob_nonsw: float + var leftIndex,rightIndex, switch_score_snpIndex: seq[int] + var swscore = newSeq[float](nSnps) + for rsnpi in riskySnps: + prob_switch = 0.0 + prob_nonsw = 0.0 + if fullGeno[rsnpi] == gUnknown: continue + for celli in 0..(ncells-1): + # if celli == 2: continue + leftIndex = newSeq[int]() + rightIndex = newSeq[int]() + offset = 1 + cell_full_hap = getIthCellHap(gtMtxByCell, celli) + if(cell_full_hap[rsnpi] != gUnknown and fullGeno[rsnpi] != gUnknown ): + rightIndex.add(rsnpi) + ## need to find the nearest coexisting N snps (N = lookBeyondSnps*2) + offset = 1 + while(leftIndex.len < lookBeyondSnps and ((rsnpi-offset) > 0)): + if((cell_full_hap[rsnpi-offset] != gUnknown) and (fullGeno[rsnpi-offset] != gUnknown)): + leftIndex.add(rsnpi-offset) + offset.inc + offset = 1 + while(rightIndex.len < lookBeyondSnps and ((rsnpi+offset) < nSnps)): + if((cell_full_hap[rsnpi+offset] != gUnknown) and (fullGeno[rsnpi+offset] != gUnknown)): + rightIndex.add(rsnpi+offset) + offset.inc + cell_hap = concat(map(concat(leftIndex,rightIndex), proc(x:int):BinaryGeno = cell_full_hap[x])) + templ_geno_left = map(leftIndex, proc(x:int):BinaryGeno = fullGeno[x]) + templ_geno_right = map(rightIndex, proc(x:int):BinaryGeno = fullGeno[x]) + templ_geno = concat(templ_geno_left,templ_geno_right) + hswitch = concat(templ_geno_left,switchHap(templ_geno_right)) + # if rsnpi == 16933: + # echo "cell " & $celli + # echo "cell_hap " & $cell_hap + # echo "templ_geno " & $templ_geno + # echo "hswitch " & $hswitch + # echo "prev prob nosw " & $prob_nonsw + # echo "prev prob_switch " & $prob_switch + # echo "nmatch temp_geno " & $(countNmatch(hap = templ_geno, cell_hap = cell_hap)) + # echo "nmatch hswitch " & $(countNmatch(hap = hswitch, cell_hap = cell_hap)) + # echo "nmis temp_geno " & $(countNMis(hap = templ_geno, cell_hap = cell_hap)) + # echo "nmis hswitch " & $(countNMis(hap = hswitch, cell_hap = cell_hap)) + prob_nonsw = prob_nonsw + calculateProbslog10(hap = templ_geno, cell_hap = cell_hap) + prob_switch = prob_switch + calculateProbslog10(hap = hswitch, cell_hap = cell_hap) + + #if(prob_switch > prob_nonsw): echo "found positive switch score" + swscore[rsnpi] = log10(switchPrior) + prob_switch - log10(1-switchPrior) - prob_nonsw + switch_score_snpIndex.add(rsnpi) + return (swscore,switch_score_snpIndex) + +## return SNP indexs to switch +## switchScore, a dense list of switch scores +proc findSwitchSites(switchScoreT: switchScoreTuple): seq[int] = + var sitesToSwitch = newSeq[int]() + var positiveScores: seq[float] + var positiveScoresIndex: seq[int] + + var inPositiveBlock = false + for site, score in switchScoreT.switch_scores: + if switchScoreT.switch_scores_snpIndex.find(site)>=0: + if score <= 0.0 : + if inPositiveBlock: + inPositiveBlock = false + if positiveScoresIndex.len > int(floor(lookBeyondSnps/2)): + sitesToSwitch.add(positiveScoresIndex[maxIndex(positiveScores)]) + continue + elif not inPositiveBlock: + positiveScores = @[score] + positiveScoresIndex = @[site] + inPositiveBlock = true + continue + else: + positiveScores.add(score) + positiveScoresIndex.add(site) + return sitesToSwitch +let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" +let bam_file = "data/WC_CNV_43/WC_CNV_43.bam" +let barcode_file = "data/WC_CNV_43/WC_CNV_43_filteredBC.tsv" +var + gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream:FileStream + + +let s_Chrs = map(toSeq(1..19), proc(x:int):string = "chr" & $x) + +var gtMtxByCell: seq[seq[BinaryGeno]] +var currentEntrySeq: seq[string] +var currentEntry: seq[int] +var nSnps,ncells:int +## sort entries in mtx files +for chrom in s_Chrs: + let gtMtxFile = "data/WC_CNV_43/sgcocaller/gtMtx/" & chrom & "_gtMtx.mtx" + let phaseSnpAnnotFile = "data/WC_CNV_43/sgcocaller/phase/" & chrom & "_phased_snpAnnot.txt" + let switchScoreFile = "data/WC_CNV_43/sgcocaller/phase/" & chrom & "_switch_score.txt" + try: + gtMtxFileStream = openFileStream(gtMtxFile, fmRead) + phaseSnpAnnotFileStream = openFileStream(phaseSnpAnnotFile, fmRead) + switchScoreFileStream = openFileStream(switchScoreFile, fmWrite) + + except: + stderr.write getCurrentExceptionMsg() + echo "reading gtMtx to by cell" + discard gtMtxFileStream.readLine() + discard phaseSnpAnnotFileStream.readLine() + #N, i,j + currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() + currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) + nSnps = currentEntry[0] + echo "nSnps " & $nSnps + ncells = currentEntry[1] + echo "ncells " & $ncells + + echo "totalEntries " & $ currentEntry[2] + ## gtMtx is cell by Snp format + gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) + discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true) + var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps ) -proc findHighRiskSnps(fullGeno:seq[int]): seq[int] = \ No newline at end of file + var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell) + echo "riskySnps.len " & $riskySnps.len + var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno) + switchScoreFileStream.writeLine("#" & $findSwitchSites(switchScoresT)) + echo $findSwitchSites(switchScoresT) + for snpi,score in switchScoresT.switch_scores: + if switchScoresT.switch_scores_snpIndex.find(snpi)>=0: + switchScoreFileStream.writeLine($score) + else: + switchScoreFileStream.writeLine("NA") + for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]: + fs.close() + diff --git a/private/findGtNodes.nim b/private/findGtNodes.nim new file mode 100755 index 0000000000000000000000000000000000000000..e3b4ea683cdbb9ad1a67467735943542e6b63765 --- /dev/null +++ b/private/findGtNodes.nim @@ -0,0 +1,57 @@ +## Genrate GtNode per variant (SNP) + +import tables +import hts +import utils + +# minSnpDepth = 2, this SNP should at least be covered by two cells +proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, minTotalReads:int, mapq: int, + barcodeTable:TableRef, + minbsq:int, + barcodeTag:string ): Table[string,GtNode] = + + var barcodedNodesTable = initTable[string,GtNode]() + var rec_alt:char + var total_reads = 0 + rec_alt = rec.ALT[0][0] + for aln in ibam.query(chrom = $rec.CHROM,start = rec.POS.cint-1, stop = rec.POS.cint): + var cbt = tag[string](aln, barcodeTag) + var currentCB: string + var base_off: int + var base: char + if cbt.isNone: + continue + else: + currentCB = cbt.get + if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue + ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary + ## if not aln.flag.proper_pair: continue + if not barcodeTable.hasKey(currentCB): + continue + base_off = get_base_offset(position = rec.POS.cint, align = aln) + base = aln.base_at(base_off) + if aln.base_quality_at(base_off).cint < minbsq: + continue + total_reads+=1 + if barcodedNodesTable.hasKey(currentCB): + if base == rec.REF[0]: + # echo "adding ref geno to cell " & currentCB + barcodedNodesTable[currentCB].alleles = add_allele(barcodedNodesTable[currentCB],alleleBinGeno = 0) + continue + if base == rec_alt: + barcodedNodesTable[currentCB].alleles = add_allele(barcodedNodesTable[currentCB],alleleBinGeno = 1) + continue + else: + # echo "creating new node for cell " & currentCB + var newGtNode = GtNode(chrom: $rec.CHROM, variantIndex: variantIndex) + if base == rec.REF[0]: + newGtNode.alleles = "0" + barcodedNodesTable[currentCB] = newGtNode + continue + if base == rec_alt: + newGtNode.alleles = "1" + barcodedNodesTable[currentCB] = newGtNode + continue + if total_reads > maxTotalReads or total_reads < minTotalReads: + return initTable[string,GtNode]() + return barcodedNodesTable diff --git a/private/find_template_cell.nim b/private/find_template_cell.nim index 41478f3105b4affa3176f83f60e060f13b43f832..ed743a6ec7237842af8bf0f8c9818935ff7ac1bf 100755 --- a/private/find_template_cell.nim +++ b/private/find_template_cell.nim @@ -18,24 +18,6 @@ let minCoexit = 1000 maxDissim = 0.0099 -var nsnps,ncells,totalEntries:int - -proc toBinaryGeno(x: int) : BinaryGeno = - if x == 1: return gREF - if x == 2: return gALT - quit "mtx geno is not 1 or 2" - -proc readGtMtx*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]]):int = - var currentEntrySeq:seq[int] - var currentLine:seq[string] - - while not mtxFileStream.atEnd(): - currentLine = mtxFileStream.readLine().splitWhitespace() - ## i j 1-based from file - currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) - gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno - return 0 - proc countCoexistMatch(seq1: seq[BinaryGeno], seq2: seq[BinaryGeno]): seq[int] = if not (seq1.len == seq2.len): quit "cells' genotypes under comparison do not have the same length" @@ -43,7 +25,7 @@ proc countCoexistMatch(seq1: seq[BinaryGeno], seq2: seq[BinaryGeno]): seq[int] = for i, g in seq1: if g != gUnknown: cell1Snps += 1 if (seq2[i] != gUnknown): cell2Snps += 1 - if (g != gUnknown and seq2[i]!= gUnknown): + if (g != gUnknown and seq2[i] != gUnknown): ncoexist += 1 if(g == seq2[i]): nmatch += 1 @@ -71,7 +53,6 @@ proc findCellPairs(gtMtx:seq[seq[BinaryGeno]],nPairs = 3, nSnpsPercell: var seq[ proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.75): int = # return the selected cells' index # not too many SNPs or too few - var chosed:int var seqNsnps = nSnpsPercell sort(seqNsnps, system.cmp[int]) int(floor(float(seqNsnps.len) * q)) @@ -79,7 +60,7 @@ proc findCellBynSNPs*(nSnpsPercell:seq[int],q = 0.75): int = proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = var nSnpsCells = newSeq[int](gtMtx.len) var cellPairs = findCellPairs(gtMtx = gtMtx, nPairs = nPairs, nSnpsPercell = nSnpsCells) - echo "cellPairs" + echo "cellPairs: " echo cellPairs var selectedCell, icp: int if cellPairs[0].coexistSnps == 0 : @@ -94,44 +75,3 @@ proc selectTemplateCell*(gtMtx:seq[seq[BinaryGeno]], nPairs = 3): int = else: echo "selected cell: " & $cellPairs[icp].cell2 & " nSnps: " & $nSnpsCells[cellPairs[icp].cell2] return cellPairs[icp].cell2 - -var gtMtx:seq[seq[BinaryGeno]] -var currentEntry:seq[int] -var currentEntrySeq:seq[string] -var gtMtxFileStream:FileStream -## i, j are 1-based -let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/WC_CNV_chr1_nonzero.mtx" -echo mtxFile - -try: - gtMtxFileStream = openFileStream(mtx_file, fmRead) -except: - stderr.write getCurrentExceptionMsg() -# %%MatrixMarket matrix coordinate integer general -discard gtMtxFileStream.readLine() -#N, i,j -currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() -currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) -nsnps = currentEntry[0] -echo "nsnps " & $nsnps -ncells = currentEntry[1] -echo "ncells " & $ncells - -totalEntries = currentEntry[2] - -echo "totalEntries " & $totalEntries - - -gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) - -discard readGtMtx(gtMtxFileStream,gtMtx) -#var colIndex = 4 -#echo gtMtx[0][27..40] - -#echo sliceColumn(gtMtx,56) -#echo gtMtx.len - -#echo findCellPairs(gtMtx = gtMtx) -#var nSnpsCells = newSeqWith(ncells,0) -#selectTemplateCell -echo selectTemplateCell(gtMtx = gtMtx, nPairs =3) \ No newline at end of file diff --git a/private/genotype_mtx.nim b/private/genotype_mtx.nim index a935cb1e645cb349f6c4017effb65e942c98089f..9c11d79009aac6a9b5e2054f11338bee6d153386 100755 --- a/private/genotype_mtx.nim +++ b/private/genotype_mtx.nim @@ -13,15 +13,21 @@ import findGtNodes ## these outputs will be per chromosome -proc getGtMtx(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, +proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int, minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.8):int = var variantIndex = 1 var writtenVarintIndex = 1 var calt,cellDP,geno,cellIndex,nnsize = 0 - var delBarcodes:seq[string] - var cellGtNodes: Table[string, GtNode] - var cellGtNodeCopy: Table[string, GtNode] + var cellGtNodes,cellGtNodeCopy: Table[string, GtNode] + + ## write headers to those mtx files and the first line place holder for total_row total_column total_entry + let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" + + for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: + outFileStream.writeLine(sparseMatrixHeader) + outFileStream.writeLine(' '.repeat(50)) + outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t")) for rec in ivcf.query("*"): var rec_alt = rec.ALT[0][0] var rec_ref = rec.REF[0] @@ -59,80 +65,72 @@ proc getGtMtx(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, ou nnsize.inc outSnpAnnot.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) writtenVarintIndex.inc - ## write to matrices + ## write to matrices echo "processed " & $(variantIndex) & " variants" - echo "wrote " & $(writtenVarintIndex) & " variants to matrices and snpAnnot file" + echo "wrote " & $(writtenVarintIndex-1) & " variants to matrices and snpAnnot file" for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: outFileStream.setPosition(49) - outFileStream.write($(writtenVarintIndex) & " " & $barcodeTable.len & " " & $nnsize) - for outFileStream in[outGtMtx,outTotalCountMtx,outAltCountMtx, outSnpAnnot]: + outFileStream.write($(writtenVarintIndex-1) & " " & $barcodeTable.len & " " & $nnsize) + for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]: outFileStream.close() return 0 -let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" -let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" -let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" -var - ibam:Bam - ivcf:VCF - outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream -if not open(ibam, bam_file, threads=1, index = true): - quit "couldn't open input bam" -if not open(ivcf, vcf_file, threads=1): - quit "couldn't open: vcf file" - -var hf = hts.hts_open(cstring(barcode_file), "r") -#### TODO : Table size -var barcodeTable = newTable[string,int](initialSize = 1024) -var kstr: hts.kstring_t -kstr.l = 0 -kstr.m = 0 -kstr.s = nil -var ithSperm = 0 -## initiate the table with CB as keys, allele counts (ref object) as elements -while hts_getline(hf, cint(10), addr kstr) > 0: - if $kstr.s[0] == "#": - continue - var v = $kstr.s - discard barcodeTable.hasKeyOrPut(v, ithSperm) - ithSperm.inc -discard hf.hts_close() - -let s_Chrs = @["chr1"] -var chrom = "chr1" -let out_prefix = "test_data/getMtx/" -echo "generating gtMtx" -try: - outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite) - outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite) - outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite) - outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite) -except: - stderr.write getCurrentExceptionMsg() +# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" +# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" +# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" +# var +# ibam:Bam +# ivcf:VCF +# outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream +# if not open(ibam, bam_file, threads=1, index = true): +# quit "couldn't open input bam" +# if not open(ivcf, vcf_file, threads=1): +# quit "couldn't open: vcf file" -## write headers to those mtx files and the first line place holder for total_row total_column total_entry -let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" +# var hf = hts.hts_open(cstring(barcode_file), "r") +# #### TODO : Table size +# var barcodeTable = newTable[string,int](initialSize = 1024) +# var kstr: hts.kstring_t +# kstr.l = 0 +# kstr.m = 0 +# kstr.s = nil +# var ithSperm = 0 +# ## initiate the table with CB as keys, allele counts (ref object) as elements +# while hts_getline(hf, cint(10), addr kstr) > 0: +# if $kstr.s[0] == "#": +# continue +# var v = $kstr.s +# discard barcodeTable.hasKeyOrPut(v, ithSperm) +# ithSperm.inc +# discard hf.hts_close() -for outFileStream in [outGtMtx,outTotalCountMtx,outAltCountMtx]: - outFileStream.writeLine(sparseMatrixHeader) - outFileStream.writeLine(' '.repeat(50)) +# let s_Chrs = @["chr1"] +# var chrom = "chr1" +# let out_prefix = "test_data/getMtx/" +# echo "generating gtMtx" +# try: +# outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite) +# outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite) +# outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite) +# outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite) +# except: +# stderr.write getCurrentExceptionMsg() -outSnpAnnot.writeLine(join(["POS","REF", "ALT"], sep = "\t")) -discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, - outSnpAnnot = outSnpAnnot, maxTotalReads = 30, - minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) +# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, +# outSnpAnnot = outSnpAnnot, maxTotalReads = 30, +# minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) -var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string +# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string -## sort entries in mtx files -for chrom in s_Chrs: - outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" - outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" - outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" - for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: - var imtx = readMtx(mtx_file = mtxFile) - discard sortWriteMtx(imtx, mtx_file = mtxFile) +# ## sort entries in mtx files +# for chrom in s_Chrs: +# outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" +# outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" +# outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" +# for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: +# var imtx = readMtx(mtx_file = mtxFile) +# discard sortWriteMtx(imtx, mtx_file = mtxFile) diff --git a/private/infer_missing_snps.nim b/private/infer_missing_snps.nim index ab4174623d6846b85cd8999c16bc2f1e4346e457..b3874f7b52233a223347cb995aebfff20da429c8 100755 --- a/private/infer_missing_snps.nim +++ b/private/infer_missing_snps.nim @@ -2,14 +2,12 @@ ## author Ruqian Lyu ## rlyu@svi.edu.au -import find_template_cell import sequtils import math -import streams -import strutils import utils let nAnchorSnps = 10 + proc sliceColumn*(gtMtx:seq[seq[BinaryGeno]],colIndex:int): seq[BinaryGeno] = map(gtMtx, proc(x:seq[BinaryGeno]):BinaryGeno = x[colIndex]) @@ -43,9 +41,8 @@ proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh = 0.98) return gALT return gUnknown # return full sequence of template geno by inferring missing SNPs's genotypes -proc inferSnpGeno(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99):seq[BinaryGeno] = +proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], posterior_thresh = 0.99):seq[BinaryGeno] = var fullGeno = templateGeno - let nCells = gtMtx.len let nSnps = gtMtx[0].len var coexisPos:seq[int] var snpLD: seq[float] @@ -71,7 +68,8 @@ proc inferSnpGeno(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], pos if((gtMtx[j][i-offset] != gUnknown) and (fullGeno[i-offset] != gUnknown)): coexisPos.add(i-offset) offset += 1 - snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), temp_geno = map(coexisPos,proc(x:int): BinaryGeno = fullGeno[x]) ) + snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), + temp_geno = map(coexisPos,proc(x:int): BinaryGeno = fullGeno[x]) ) if snpLD[1] > posterior_thresh: if snpLD[0] == 1.0: if snpiGeno == gREF: ## snpiGeno is 1 or 2 @@ -87,44 +85,5 @@ proc inferSnpGeno(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], pos # echo "type00 " & $type00 # echo "type10 " & $type10 fullGeno[i] = inferGeno(type00, type10) - if i > 20: - break return fullGeno - - -var gtMtx:seq[seq[BinaryGeno]] -var currentEntry:seq[int] -var currentEntrySeq:seq[string] -var gtMtxFileStream:FileStream -## i, j are 1-based -let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/WC_CNV_chr1_nonzero.mtx" -echo mtxFile - -try: - gtMtxFileStream = openFileStream(mtx_file, fmRead) -except: - stderr.write getCurrentExceptionMsg() -# %%MatrixMarket matrix coordinate integer general -discard gtMtxFileStream.readLine() -#N, i,j -currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() -currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) -var nsnps = currentEntry[0] -echo "nsnps " & $nsnps -var ncells = currentEntry[1] -echo "ncells " & $ncells - -var totalEntries = currentEntry[2] - -echo "totalEntries " & $totalEntries - - -gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) - -discard readGtMtx(gtMtxFileStream,gtMtx) - -var tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3) -var tempcellGeno = gtMtx[tempcell] -let fullGeno = inferSnpGeno(tempcellGeno, gtMtx) -echo fullGeno.len diff --git a/private/sgphase.nim b/private/sgphase.nim new file mode 100755 index 0000000000000000000000000000000000000000..6c669d6ba08acb114ce98d7cfe14781a529819ba --- /dev/null +++ b/private/sgphase.nim @@ -0,0 +1,94 @@ +# phase from genotype matrices generated from sgcocaller gtMtx + +import find_template_cell +import infer_missing_snps +import utils +import streams +import strutils +import sequtils + +# phaseOutdir/phased_snpAnnot.txt +# phaseOutdir/cellGenoVersusTemplate.txt +# phaseOutdir/cellGenoVersusTemplate.png by executing the R code +# let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_gtMtx.mtx" +# let phaseOutdir = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/" +# #let diagnosticDataframe = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/cellGenoVersusTemplate.txt" +# let snpAnnotFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_snpAnnot.txt" + + +proc writePhasedSnpAnnot*(fullGeno:seq[BinaryGeno], snpAnnotFileStream:FileStream, phasedSnpAnnotFileStream:FileStream): int = + var snpindex = 0 + var header: string + var snp_rec: string + header = snpAnnotFileStream.readLine() + phasedSnpAnnotFileStream.writeLine(join([header,"Phase"],sep = "\t")) + while not snpAnnotFileStream.atEnd(): + snp_rec = snpAnnotFileStream.readLine() + phasedSnpAnnotFileStream.writeLine(join([snp_rec,$fullGeno[snpindex]], sep = "\t") ) + snpindex.inc + if snpindex != (fullGeno.len): + quit "snpAnnot.txt does not have the same number of rows with gtMtx" + return 0 + +proc sgphase*(mtxFile: string, snpAnnotFile:string, phaseOutdir:string): int = + var nsnps,ncells,totalEntries:int + var gtMtx:seq[seq[BinaryGeno]] + var currentEntry:seq[int] + var currentEntrySeq:seq[string] + var gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream:FileStream + let phasedSnpAnnotFile = phaseOutdir & "phased_snpAnnot.txt" + let diagnosticDataframeFile = phaseOutdir & "cellGenoVersusTemplate.txt" + ## i, j are 1-based, entries with value 1 or 2 + try: + gtMtxFileStream = openFileStream(mtxFile, fmRead) + ddframFileStream = openFileStream(diagnosticDataframeFile, fmWrite) + snpAnnotFileStream = openFileStream(snpAnnotFile, fmRead) + phasedSnpAnnotFileStream = openFileStream(phasedSnpAnnotFile, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + # %%MatrixMarket matrix coordinate integer general + discard gtMtxFileStream.readLine() + #N, i,j + currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() + currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) + nsnps = currentEntry[0] + echo "nsnps " & $nsnps + ncells = currentEntry[1] + echo "ncells " & $ncells + totalEntries = currentEntry[2] + echo "totalEntries " & $totalEntries + ## gtMtx is cell by Snp format + gtMtx = newSeqWith(ncells,newSeq[BinaryGeno](nsnps)) + discard readGtMtxToSeq(gtMtxFileStream,gtMtx) + var tempcell = selectTemplateCell(gtMtx = gtMtx, nPairs =3) + echo "template cell is " & $tempcell + var tempcellGeno = gtMtx[tempcell] + let fullGeno = inferSnpGeno(tempcellGeno, gtMtx) + # generate the txt for diagnositc plot + var selectedCells:seq[int] + if ncells < 10: + selectedCells = (0..(ncells-1)).toSeq + else: + selectedCells = (0..9).toSeq + var headliner = "templateGeno" + var writeOut = "" + for c in selectedCells: + headliner = headliner & " cell" & $c + ddframFileStream.writeLine(headliner) + let subsetGtMtx = map(selectedCells, proc(x: int): seq[BinaryGeno] = gtMtx[x] ) + for k,g in fullGeno: + if g == gUnknown: continue + else: + writeOut = $(parseInt($g) + 1) + for cellg in sliceColumn(subsetGtMtx, k): + writeOut = writeOut & " " & $(parseInt($cellg)+1) + ddframFileStream.writeLine(writeOut) + + discard writePhasedSnpAnnot(fullGeno, snpAnnotFileStream, phasedSnpAnnotFileStream) + for fs in [gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream]: + fs.close() + + + + + diff --git a/private/utils.nim b/private/utils.nim index e155e2b26500bfb4654c4997c6a0d35d3307c3e0..01370af097ed3a68c0fa66b81d510d2183e5999c 100755 --- a/private/utils.nim +++ b/private/utils.nim @@ -5,6 +5,7 @@ import hts import algorithm import streams import strutils +import sequtils type allele_expr* = ref object @@ -70,6 +71,34 @@ proc get_base_offset*(position:int, align: Record): int = base_offset = qoff - over-1 break return base_offset + + +proc toBinaryGeno*(x: int, format = "12") : BinaryGeno = + if(format == "01"): + if x == 1: return gALT + if x == 0: return gREF + if x == -1: return gUnknown + quit "mtx file geno is not 0,1 or -1" + else: + if x == 1: return gREF + if x == 2: return gALT + quit "mtx file geno is not 1 or 2" + +## cell by SNP by default +proc readGtMtxToSeq*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]], by_cell = false):int = + var currentEntrySeq:seq[int] + var currentLine:seq[string] + + while not mtxFileStream.atEnd(): + currentLine = mtxFileStream.readLine().splitWhitespace() + ## i j 1-based from file + currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) + if by_cell: + gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = currentEntrySeq[2].toBinaryGeno + else: + gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno + return 0 + proc add_allele*(g:GtNode, alleleBinGeno:int):string = g.alleles & $alleleBinGeno proc inc_count_cref*(a:allele_expr) = inc(a.cref) @@ -175,7 +204,6 @@ proc sortWriteMtx*(sMtx:var Mtx, mtx_file:string):int = mtxFileStream = openFileStream(mtx_file, fmWrite) except: stderr.write getCurrentExceptionMsg() - var current_line: string sMtx.lines.sort(compareMtxRow) mtxFileStream.writeLine(sMtx.header) mtxFileStream.writeLine(sMtx.dims) diff --git a/sgcocaller.nim b/sgcocaller.nim index f015b97d8f141f4d099d33211db3ee343a75e348..542c5baa08f9dc943e9891b119a2469d59240657 100755 --- a/sgcocaller.nim +++ b/sgcocaller.nim @@ -9,67 +9,65 @@ import hts import tables import sequtils import private/utils -import private/fragments import math import streams import private/graph import private/findpath -import private/blocks_utils -import private/readfmf -import private/phase_blocks +import private/genotype_mtx +import private/sgphase +# proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTotalReads:int, +# minTotalReads:int, mapq: int,minbsq:int,mindp:int,barcodeTag:string, minsnpdepth:int):int = +# var variantIndex = 1 +# var cellFragmentsTable = newTable[string,Fragment]() -proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTotalReads:int, - minTotalReads:int, mapq: int,minbsq:int,mindp:int,barcodeTag:string, minsnpdepth:int):int = - var variantIndex = 1 - var cellFragmentsTable = newTable[string,Fragment]() +# echo "generated fragments from " & $barcodeTable.len & " single cells" +# var cellGtNodes: Table[string, GtNode] +# for rec in ivcf.query("*"): +# cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, +# minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, +# minbsq=minbsq,barcodeTag=barcodeTag) +# cellGtNodes = callNodeGenotype(cellGtNodes,minCellDp = mindp, minSnpDepth = minsnpdepth, p0 = 0.3, p1 = 0.8) +# cellFragmentsTable = updateCellFragment(barcodedNodesTable=cellGtNodes,cellFragmentsTable = cellFragmentsTable, fmf_out = outfmf) +# variantIndex = variantIndex + 1 +# echo "processed " & $(variantIndex) & " variants" +# return 0 - echo "generated fragments from " & $barcodeTable.len & " single cells" - var cellGtNodes: Table[string, GtNode] - for rec in ivcf.query("*"): - cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, - minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable, - minbsq=minbsq,barcodeTag=barcodeTag) - cellGtNodes = callNodeGenotype(cellGtNodes,minCellDp = mindp, minSnpDepth = minsnpdepth, p0 = 0.3, p1 = 0.8) - cellFragmentsTable = updateCellFragment(barcodedNodesTable=cellGtNodes,cellFragmentsTable = cellFragmentsTable, fmf_out = outfmf) - variantIndex = variantIndex + 1 - echo "processed " & $(variantIndex) & " variants" - return 0 -proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfile:string, threads:int):int = - var cellGenoTable: TableRef[string,TableRef[string,int]] - var cellBlockLeftPhaseTable,cellBlockRightPhaseTable :TableRef[string, seq[int]] - var chr_blocks:seq[Block] - var blockkseq:seq[Block_linkage] - var phased_blocks: seq[int] - echo "Phasing for hapfile " & block_hapfile - cellGenoTable = newTable[string,TableRef[string,int]]() - discard readFmf(fmf_file,cellGenoTable) - chr_blocks = read_blocks(block_hapfile) - cellBlockLeftPhaseTable = newTable[string, seq[int]]() - cellBlockRightPhaseTable = newTable[string, seq[int]]() - ## blocks at the ends with 20 SNPs or fewer, removed? - ## blocks not at the ends with 20SNPs or fewer are not used for linking blocks but if their linkage with pre block and post-block is consistent, - ## these short blocks' haplotypes are retained. - ## - ## What if blocks cannot be linked ? the number of 11(00) == 10(01) - echo "Total blocks " & $chr_blocks.len +# proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfile:string, threads:int):int = +# var cellGenoTable: TableRef[string,TableRef[string,int]] +# var cellBlockLeftPhaseTable,cellBlockRightPhaseTable :TableRef[string, seq[int]] +# var chr_blocks:seq[Block] +# var blockkseq:seq[Block_linkage] +# var phased_blocks: seq[int] +# echo "Phasing for hapfile " & block_hapfile +# cellGenoTable = newTable[string,TableRef[string,int]]() +# discard readFmf(fmf_file,cellGenoTable) +# chr_blocks = read_blocks(block_hapfile) +# cellBlockLeftPhaseTable = newTable[string, seq[int]]() +# cellBlockRightPhaseTable = newTable[string, seq[int]]() +# ## blocks at the ends with 20 SNPs or fewer, removed? +# ## blocks not at the ends with 20SNPs or fewer are not used for linking blocks but if their linkage with pre block and post-block is consistent, +# ## these short blocks' haplotypes are retained. +# ## +# ## What if blocks cannot be linked ? the number of 11(00) == 10(01) +# echo "Total blocks " & $chr_blocks.len - var contig_blocks:seq[int] - for b_j, bl in chr_blocks: - echo "block " & $b_j & " len: " & $bl.h0.len - # echo "last 5" & $bl.h0[(high(bl.h0)-5)..high(bl.h0)] - # echo "first 5" & $bl.h0[0..5] - discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, true, cellBlockLeftPhaseTable) - discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, false, cellBlockRightPhaseTable) - contig_blocks = find_contig_blocks(chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - # echo "blocks for contigs" & $contig_blocks - blockkseq = build_contig(contig_blocks = contig_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - for bk in blockkseq: - echo "prevBlock" & $bk.prevBlock & "nextBlock" & $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 & " 01 type " & $bk.linkageType_01 & " switch posterior prob " & $bk.switch_posterior_prob & " switch value :" & $bk.switch - phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) +# var contig_blocks:seq[int] +# for b_j, bl in chr_blocks: +# echo "block " & $b_j & " len: " & $bl.h0.len +# # echo "last 5" & $bl.h0[(high(bl.h0)-5)..high(bl.h0)] +# # echo "first 5" & $bl.h0[0..5] +# discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, true, cellBlockLeftPhaseTable) +# discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, false, cellBlockRightPhaseTable) +# contig_blocks = find_contig_blocks(chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) +# # echo "blocks for contigs" & $contig_blocks +# blockkseq = build_contig(contig_blocks = contig_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) +# for bk in blockkseq: +# echo "prevBlock" & $bk.prevBlock & "nextBlock" & $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 & " 01 type " & $bk.linkageType_01 & " switch posterior prob " & $bk.switch_posterior_prob & " switch value :" & $bk.switch +# phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) - discard write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads) - return 0 +# discard write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads) +# return 0 proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, ibam:Bam, out_dir:string, mapq:int, minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, @@ -179,11 +177,14 @@ when(isMainModule): $version Usage: - sgcocaller fmf [options] <BAM> <VCF> <barcodeFile> <out_prefix> + sgcocaller gtMtx [options] <BAM> <VCF> <barcodeFile> <out_prefix> + sgcocaller phase [options] <gtMtxFile> <phaseOutputDir> + sgcocaller swphase [options] <gtMtxFile> <phaseSnpAnnotFile> + sgcocaller sxo [options] <phaseOutputDir> <out_prefix> sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> - sgcocaller phase_blocks [options] <VCF> <fmf> <hapfile> <out_vcf> + Arguments: <BAM> the read alignment file with records of single-cell DNA reads @@ -218,9 +219,10 @@ Options: Examples - ./sgcocaller fmf --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/cellfragment.fmf + ./sgcocaller gtMtx --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/cellfragment.fmf + ./sgcocaller phase gtMtxFile phaseOutputDir ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp - ./sgcocaller phase_blocks --threads 4 AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz ./percell/cellfragment.fmf ./percell/ccsnp/linkedBlocks.vcf.gz + ./sgcocaller sxo phaseOutputDir barcodeFile.tsv ./percell/ccsnp """ % ["version", version]) @@ -241,7 +243,7 @@ Options: cmPmb:float barcodeTag="CB" minsnpdepth:int - out_vcf,fmf,barcodeFile,bamfile,vcff,hapfile:string + barcodeFile,bamfile,vcff:string vcfGtPhased = false echo $args threads = parse_int($args["--threads"]) @@ -258,7 +260,7 @@ Options: cmPmb = parse_float($args["--cmPmb"]) vcfGtPhased = parse_bool($args["--phased"]) - if args["fmf"] or args["xo"]: + if args["gtMtx"] or args["xo"]: var ibam:Bam ivcf:VCF @@ -266,13 +268,12 @@ Options: bamfile = $args["<BAM>"] barcodeFile = $args["<barcodeFile>"] out_dir = $args["<out_prefix>"] + if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_" vcff = $args["<VCF>"] - if not open(ibam, bamfile, threads=threads, index = true): quit "couldn't open input bam" if not open(ivcf, vcff, threads=threads): quit "couldn't open: vcf file" - if($args["--chrom"] != "nil"): selectedChrs = $args["--chrom"] s_Chrs = selectedChrs.split(',') @@ -280,9 +281,7 @@ Options: ## find all chroms from headers of vcf file (Contigs) let contigs = ivcf.contigs s_Chrs = map(contigs, proc(x: Contig): string = x.name) - var hf = hts.hts_open(cstring(barcodeFile), "r") - #### TODO : Table size var barcodeTable = newTable[string,int](initialSize = 1024) var kstr: hts.kstring_t kstr.l = 0 @@ -291,30 +290,43 @@ Options: var ithSperm = 0 ## initiate the table with CB as keys, allele counts (ref object) as elements while hts_getline(hf, cint(10), addr kstr) > 0: - if $kstr.s[0] == "#": - continue + if $kstr.s[0] == "#": continue var v = $kstr.s discard barcodeTable.hasKeyOrPut(v, ithSperm) ithSperm.inc discard hf.hts_close() - if args["fmf"]: - echo "generating fragment file to " & out_dir - var outfmf:FileStream - try: - outfmf = openFileStream(out_dir, fmWrite) - except: - stderr.write getCurrentExceptionMsg() - discard getfmf(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outfmf = outfmf, maxTotalReads = maxtotal, - minTotalReads = mintotal, mapq = mapq, minbsq =minbsq, mindp = mindp, barcodeTag = barcodeTag,minsnpdepth=minsnpdepth) - close(outfmf) - if args["xo"]: - echo "running crossover calling \n" - echo "running for these chromosomes as specified in the header of the provided VCF file " & $s_Chrs + if args["gtMtx"]: + var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string + var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream + for chrom in s_Chrs: + echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom + outGtMtxFile = out_dir & chrom & "_gtMtx.mtx" + outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx" + outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx" + try: + outGtMtx = openFileStream(outGtMtxFile, fmWrite) + outSnpAnnot = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) + outTotalCountMtx = openFileStream(outTotalCountMtxFile, fmWrite) + outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + + discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, + outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, + outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, + minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp, + barcodeTag = barcodeTag, minsnpdepth=minsnpdepth) + for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: + var imtx = readMtx(mtx_file = mtxFile) + discard sortWriteMtx(imtx, mtx_file = mtxFile) + elif args["xo"]: + echo "running crossover calling from VCF and BAM file\n" + echo "running for these chromosomes as specified in the header of the provided VCF file for chroms: " & $s_Chrs discard sgcocaller(threads, ivcf, barcodeTable, ibam, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) var outFileTotalCountMtxFile, outFileAltCountMtxFile: string - ## sort entries in mtx files + ## sort entries in mtx files for chrom in s_Chrs: outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" @@ -323,13 +335,20 @@ Options: discard sortWriteMtx(imtx, mtx_file = mtxFile) ibam.close() ivcf.close() - if args["phase_blocks"]: - fmf = $args["<fmf>"] - out_vcf = $args["<out_vcf>"] - vcff = $args["<VCF>"] - hapfile = $args["<hapfile>"] - echo "running phasing blocks \n" - discard phaseBlocks(fmf_file = fmf, inputVCF=vcff, outputVCF=out_vcf, block_hapfile=hapfile, threads=threads) + elif args["phase"]: + let gtMtxFile = $args["<gtMtxFile>"] + var phaseOutDir = $args["<phaseOutputDir>"] + if (phaseOutdir[^1] != '_') and (phaseOutdir[^1] != '/'): phaseOutdir = phaseOutdir & "_" + let snpAnnotFile = gtMtxFile.replace(sub = "_gtMtx.mtx", by = "_snpAnnot.txt") + echo "running phasing from single cell genotype matrix (one chromosome) " & gtMtxFile + echo "using the " & snpAnnotFile & "for generating phased haplotypes" + discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir) + elif args["swphase"]: + let gtMtxFile = $args["<gtMtxFile>"] + let phaseSnpAnnotFile = $args["<phaseSnpAnnotFile>"] + echo "finding snps positions to switch from single cell genotype matrix (one chromosome) " & gtMtxFile + echo "using the " & phaseSnpAnnotFile & "for generating final phased haplotypes" + discard sgphase(mtxFile = gtMtxFile, snpAnnotFile = snpAnnotFile, phaseOutdir = phaseOutdir) \ No newline at end of file