diff --git a/src/sgcocaller.nim b/src/sgcocaller.nim index 535dc22090768c338432d59f9feb0521eeabb895..65f28ae49292567e04e8c8600c0fd4dae7c84252 100755 --- a/src/sgcocaller.nim +++ b/src/sgcocaller.nim @@ -29,7 +29,7 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, echo "running in bulk mode and all reads are regarded as from one sample/cell" bulkBam = true else: - echo "running sgcoaller for " & $barcodeTable.len & " single cells" + echo "running sgcoaller xo for " & $barcodeTable.len & " single cells" ## iterate through each selected chromosome for chrom in s_Chrs: @@ -132,7 +132,7 @@ Options: --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1] --phased the input VCF for calling crossovers contains the phased GT of heterozygous SNPs - --outvcf generate the output in vcf format (phase) + --outvcf generate the output in vcf format (sgcocaller phase) --templateCell <templateCell> the cell's genotype to be used a template cell, as the cell's index (0-starting) in the barcode file, default as not supplied [default: -1] --maxDissim <maxDissim> the maximum dissimilarity for a pair of cell to be selected as potential template cells due to not having crossovers in either cell [default: 0.0099] --maxExpand <maxExpand> the maximum number of iterations to look for locally coexisting positions for inferring missing SNPs in template haplotype sequence [default: 1000] @@ -147,7 +147,7 @@ Options: Examples ./sgcocaller phase gtMtxFile phaseOutputPrefix - ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp + ./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp ./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp """ % ["version", version]) @@ -248,8 +248,8 @@ Options: for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]: var imtx = readMtx(mtx_file = mtxFile) discard sortWriteMtx(imtx, mtx_file = mtxFile) - echo "running phasing from single cell genotype matrix (one chromosome) " & outGtMtxFile - echo "using the " & outSnpAnnotFile & "for generating phased haplotypes" + echo "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile + echo "using the " & outSnpAnnotFile & " for generating phased haplotypes" discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile, diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell, maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim) @@ -276,8 +276,8 @@ Options: selectedChrs = $args["--chrom"] s_Chrs = selectedChrs.split(',') else: - quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes using comma as separator" - echo "running crossover calling from genotype and allele count matrices generate from sgcocaller phase/swphase " + quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator" + echo "running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase " echo "running for these chromosomes : " & $s_Chrs let phase_dir = $args["<phaseOutputPrefix>"] let phasedSnpAnnotFile = $args["<SNPPhaseFile>"] @@ -289,7 +289,7 @@ Options: var imtx = readMtx(mtx_file = mtxFile) discard sortWriteMtx(imtx, mtx_file = mtxFile) elif args["swphase"]: - echo "find switch spots and generate corrected phase" + echo "running swphase to find switch spots and generate corrected phase" let gtMtxFile = $args["<gtMtxFile>"] let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"] let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt" @@ -297,7 +297,7 @@ Options: let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz" discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize) if ($args["--chrom"] == "nil"): - echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option" + echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option" discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"]) diff --git a/src/sgcocaller/correctPhase.nim b/src/sgcocaller/correctPhase.nim index f76660ff7b8027949080050536067cf737ff85f4..d5046d19fff22cb692905c593afc891ba4827c69 100755 --- a/src/sgcocaller/correctPhase.nim +++ b/src/sgcocaller/correctPhase.nim @@ -3,7 +3,6 @@ import utils import sequtils import math import streams -# bins of SNP indexes import strutils # let binSize = 2000 @@ -34,29 +33,29 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): if g != gUnknown and cell_geno[snpi][cellj] != gUnknown: ncompared[cellj].inc if g == cell_geno[snpi][cellj]: nmatch[cellj].inc - if debug: echo "ncompared " & $ncompared +# if debug: echo "ncompared " & $ncompared let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float = nmatch[x]/ncompared[x]) - if debug: echo "dissim " & $dissim +# if debug: echo "dissim " & $dissim var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh))) - if debug: echo "ncrossover " & $ncrossover +# if debug: echo "ncrossover " & $ncrossover let nxcells = foldl(ncrossover, a + b) if nxcells < int(floor(0.5 * float(ncells))): - if debug: echo "bin had no crossovers : " & $nxcells +# if debug: echo "bin had no crossovers : " & $nxcells return false else: - if debug: echo "bin had many crossovers : " & $nxcells +# if debug: echo "bin had many crossovers : " & $nxcells return true proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] = #let ncells = gtMtxByCell[0].len let nSnps = gtMtxByCell.len let nBins = (int)(ceil(nSnps / (binSize - movingStep))) - if debug: echo "nBins: " & $nBins +# if debug: echo "nBins: " & $nBins let binIds = toSeq(0..(nBins-1)) var snpPosStart,snpPosEnd: int var highRiskSnps: seq[int] for binI in binIds: - if debug: echo "binI " & $binI +# if debug: echo "binI " & $binI snpPosStart = (binI)*(binSize - movingStep) if (snpPosStart + binSize) > (nSnps-1): snpPosEnd = (nSnps-1) @@ -180,7 +179,7 @@ proc findSwitchSites(switchScoreT: switchScoreTuple, lookBeyondSnps = 25, minSwi var inPositiveBlock = false for site, score in switchScoreT.switch_scores: - echo "site, " & $site & " score: " & $(score) +# echo "site, " & $site & " score: " & $(score) if switchScoreT.switch_scores_snpIndex.find(site)>=0: if score <= 0.0 or (switchScoreT.switch_scores_snpIndex.find(site) == (switchScoreT.switch_scores_snpIndex.len - 1)) : if inPositiveBlock: @@ -248,7 +247,7 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn switchScoreFileStream = openFileStream(switchScoreFile, fmWrite) except: stderr.write getCurrentExceptionMsg() - echo "reading gtMtx to by cell" + echo "reading gtMtx by cell" discard gtMtxFileStream.readLine() discard phaseSnpAnnotFileStream.readLine() #N, i,j @@ -258,7 +257,6 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn echo "nSnps " & $nSnps ncells = currentEntry[1] echo "ncells " & $ncells - echo "totalEntries " & $ currentEntry[2] ## gtMtx is cell by Snp format gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) @@ -277,10 +275,3 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]: fs.close() discard writeSwitchedPhase(switchSites,switchedPhasedAnnotFile,phaseSnpAnnotFile) - -# for chrom in @["chr5"]: -# let gtMtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_gtMtx.mtx" -# let phaseSnpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_phased_snpAnnot.txt" -# let switchedPhasedAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_corrected_phased_snpAnnot.txt" -# let switchScoreFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_switch_score.txt" -# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = 150,minSwitchScore = 280.0, minPositiveSwitchScores = 25) \ No newline at end of file diff --git a/src/sgcocaller/getGtMtx.nim b/src/sgcocaller/getGtMtx.nim index dc6f8b662ae2b7fd1b6dbbcba732bc1481399604..90816bc2425a67e962ab05d70598d82062774a49 100755 --- a/src/sgcocaller/getGtMtx.nim +++ b/src/sgcocaller/getGtMtx.nim @@ -12,7 +12,6 @@ import findGtNodes ## these outputs will be per chromosome - proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream, outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int, minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.7,chrom: string):int = @@ -75,62 +74,3 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]: outFileStream.close() return 0 - -# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz" -# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam" -# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" -# var -# ibam:Bam -# ivcf:VCF -# outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream -# if not open(ibam, bam_file, threads=1, index = true): -# quit "couldn't open input bam" -# if not open(ivcf, vcf_file, threads=1): -# quit "couldn't open: vcf file" - -# var hf = hts.hts_open(cstring(barcode_file), "r") -# #### TODO : Table size -# var barcodeTable = newTable[string,int](initialSize = 1024) -# var kstr: hts.kstring_t -# kstr.l = 0 -# kstr.m = 0 -# kstr.s = nil -# var ithSperm = 0 -# ## initiate the table with CB as keys, allele counts (ref object) as elements -# while hts_getline(hf, cint(10), addr kstr) > 0: -# if $kstr.s[0] == "#": -# continue -# var v = $kstr.s -# discard barcodeTable.hasKeyOrPut(v, ithSperm) -# ithSperm.inc -# discard hf.hts_close() - -# let s_Chrs = @["chr1"] -# var chrom = "chr1" -# let out_prefix = "test_data/getMtx/" -# echo "generating gtMtx" -# try: -# outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite) -# outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite) -# outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite) -# outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite) -# except: -# stderr.write getCurrentExceptionMsg() - - -# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, -# outSnpAnnot = outSnpAnnot, maxTotalReads = 30, -# minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1) - - -# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string - -# ## sort entries in mtx files -# for chrom in s_Chrs: -# outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx" -# outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx" -# outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx" -# for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]: -# var imtx = readMtx(mtx_file = mtxFile) -# discard sortWriteMtx(imtx, mtx_file = mtxFile) - diff --git a/src/sgcocaller/sgcocaller_sxo.nim b/src/sgcocaller/sgcocaller_sxo.nim index 4e1544219881defedaf5d740f37cc7c85dbf8fb5..2b0f8491ded0833fcbe679a6a809f9fa6732efa9 100755 --- a/src/sgcocaller/sgcocaller_sxo.nim +++ b/src/sgcocaller/sgcocaller_sxo.nim @@ -127,21 +127,3 @@ proc sgcocallerSXO*(barcodeTable:TableRef, phase_dir:string, out_dir:string, the for outFileStream in [phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo]: outFileStream.close() return 0 - -# let barcodeFile = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv" -# var barcodeTable = newTable[string,int](initialSize = 1024) -# var hf = hts.hts_open(cstring(barcodeFile), "r") -# var kstr: hts.kstring_t -# kstr.l = 0 -# kstr.m = 0 -# kstr.s = nil -# var ithSperm = 0 -# ## initiate the table with CB as keys, gamete index as elements -# while hts_getline(hf, cint(10), addr kstr) > 0: -# if $kstr.s[0] == "#": continue -# var v = $kstr.s -# discard barcodeTable.hasKeyOrPut(v, ithSperm) -# ithSperm.inc -# discard hf.hts_close() -# var s_Chrs = @["chr6"] -# discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = "data/WC_CNV_42/sgcocaller/phaseOneStep/", out_dir = "data/WC_CNV_42/sgcocaller/sxo/",thetaREF = 0.1, thetaALT = 0.9, cmPmb = 0.001,s_Chrs =s_Chrs,initProb = [0.5,0.5]) \ No newline at end of file diff --git a/src/sgcocaller/sgphase.nim b/src/sgcocaller/sgphase.nim index c84483c2bb615943fb27ca646c1978e5e231d230..79815136a89c8b053f3998d9ace13b3a67e41143 100755 --- a/src/sgcocaller/sgphase.nim +++ b/src/sgcocaller/sgphase.nim @@ -7,15 +7,6 @@ import streams import strutils import sequtils -# phaseOutdir/chrom_phased_snpAnnot.txt -# phaseOutdir/cellGenoVersusTemplate.txt -# phaseOutdir/cellGenoVersusTemplate.png by executing the R code -# let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_gtMtx.mtx" -# let phaseOutdir = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/" -# #let diagnosticDataframe = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/cellGenoVersusTemplate.txt" -# let snpAnnotFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_snpAnnot.txt" - - proc writePhasedSnpAnnot*(fullGeno:seq[BinaryGeno], snpAnnotFileStream:FileStream, phasedSnpAnnotFileStream:FileStream): int = var snpindex = 0 var header: string @@ -97,25 +88,3 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phasedSnpAnnotFile:string, d discard writePhasedSnpAnnot(fullGeno, snpAnnotFileStream, phasedSnpAnnotFileStream) for fs in [gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream]: fs.close() - - - - - -# var s_Chrs = @["CUR6G"] - -# for chrom in s_Chrs: -# var mtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_gtMtx.mtx" -# var phasedSnpFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_phased_snpAnnot.txt" -# var snpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_snpAnnot.txt" -# var ddframe = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_cellGenoVersusTemplate.txt" - -# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe) - -# for chrom in s_Chrs: -# var mtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx" -# var phasedSnpFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt" -# var snpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_snpAnnot.txt" -# var ddframe = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_cellGenoVersusTemplate.txt" - -# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe) \ No newline at end of file diff --git a/src/sgcocaller/writeVCF.nim b/src/sgcocaller/writeVCF.nim index 3fd826ad8da40ad203bd5a86770af96ed42de384..1a0f194025f52d989fd9f02d92bba99381904db6 100755 --- a/src/sgcocaller/writeVCF.nim +++ b/src/sgcocaller/writeVCF.nim @@ -22,7 +22,6 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string if debug: echo "ivcffile " & ivcfFile if debug: echo "ovcfFile " & ovcfFile if debug: echo "phasedAnnotFile " & phasedAnnotFile - var ivcf: VCF var ovcf: VCF var v_off = 0 @@ -72,10 +71,10 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string quit "write vcf failed for " & $voff if phasedAnnotFS.atEnd(): break phaseRec = readNextPhased(phasedAnnotFS) - if debug: - if phaseRec.len != 4: - echo " phaseRec.len " & $phaseRec.len - echo " phaseRec" & $phaseRec + # if debug: + # if phaseRec.len != 4: + # echo " phaseRec.len " & $phaseRec.len + # echo " phaseRec" & $phaseRec phasePos = parseInt(phaseRec[0]) phasePhase = parseInt(phaseRec[3]) phasedAnnotFS.close() @@ -84,16 +83,3 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string return 0 -# let s_Chrs = map(toSeq(1..19), proc(x:int):string = "chr" & $x) -# for chrom in @["chr8"]: -# var ivcf = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT." & chrom & ".vcf.gz" -# var ovcf = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz" -# var swphasedSnpFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.txt" -# var gtMtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx" -# var phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt" -# var switchScoreFile ="data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_switch_score.txt" -# var switchedPhasedAnnotVcfFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz" -# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,swphasedSnpFile,switchScoreFile) -# discard writePhaseToVCF(ivcf, ovcf, swphasedSnpFile,1) - -