Skip to content
Snippets Groups Projects
Commit 155207ec authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

v0.3.9

parent c7d2c49a
No related branches found
No related tags found
No related merge requests found
......@@ -15,7 +15,7 @@ import sgcocaller/sgphase
import sgcocaller/writeVCF
import sgcocaller/correctPhase
import sgcocaller/sgcocaller_sxo
import times
let initProb:array[stateRef..stateAlt, float]=[0.5,0.5]
......@@ -26,18 +26,18 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
var bulkBam = false
if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"):
echo "running in bulk mode and all reads are regarded as from one sample/cell"
echo $now() & " running in bulk mode and all reads are regarded as from one sample/cell"
bulkBam = true
else:
echo "running sgcoaller xo for " & $barcodeTable.len & " single cells"
echo $now() & " running sgcoaller xo for " & $barcodeTable.len & " single cells"
## iterate through each selected chromosome
for chrom in s_Chrs:
var snpIndex, nnsize = 0
## number of non zeros
var scSpermSeq:SeqSpermViNodes
var scSpermSeq = newTable[int,SpermViNodes]()
## matches with the order in barcodeTable
scSpermSeq.setLen(barcodeTable.len)
## scSpermSeq.setLen(barcodeTable.len)
var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream
let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
try:
......@@ -91,17 +91,17 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
return 0
when(isMainModule):
let version = "0.3.7"
let version = "0.3.9"
var doc = format("""
$version
Usage:
sgcocaller autophase [options] <BAM> <VCF> <barcodeFile> <out_prefix>
sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix>
sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix>
sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix>
sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix>
Arguments:
<BAM> the read alignment file with records of single-cell DNA reads
......@@ -112,10 +112,11 @@ Arguments:
<out_prefix> the prefix of output files
<fmf> the fragment file from running sgcocaller fmf
<out_vcf> the output vcf aftering phasing blocks in hapfile
<gtMtxFile> the output gtMtx.mtx file from running sgcocaller phase
<phasedSnpAnnotFile> the output phased_snpAnnot.txt from running sgcocaller phase
Options:
-t --threads <threads> number of BAM decompression threads [default: 4]
......@@ -142,28 +143,34 @@ Options:
--minPositiveSwitchScores <minPositiveSwitchScores> the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8]
--binSize <binSize> the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default: 2000]
--stepSize <stepSize> the move step size used in combination with --binSize. [default: 200]
--dissimThresh <dissimThresh> the threshold used on the allele concordance ratio for determining if a SNP bin contains a crossover. [default: 0.0099]
--batchSize <batchSize> the number of cells to process in one batch when running sxo. This option is only needed when the memory is limited.
--notSortMtx do not sort the output mtx.
--maxUseNcells <maxUseNcells> the number of cells to use for calculating switch scores. All cells are used if not set
-h --help show help
Examples
./sgcocaller phase gtMtxFile phaseOutputPrefix
./sgcocaller autophase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix
./sgcocaller phase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix
./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp
./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp
./sgcocaller sxo snp_phase.txt phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp
""" % ["version", version])
let args = docopt(doc, version=version)
var
threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize:int
thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore:float
threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize,batchSize,maxUseNcells:int
thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore,dissimThresh:float
barcodeTag="CB"
out_dir,selectedChrs,barcodeFile,bamfile,vcff:string
vcfGtPhased = false
vcfGtPhased,notSortMtx = false
s_Chrs: seq[string]
outFileTotalCountMtxFile, outFileAltCountMtxFile: string
echo $args
echo $now() & $args
echo $now() & " running sgcocaller " & $version
threads = parse_int($args["--threads"])
barcodeTag = $args["--barcodeTag"]
mindp = parse_int($args["--minDP"])
......@@ -177,18 +184,33 @@ Options:
lookBeyondSnps = parse_int($args["--lookBeyondSnps"])
binSize = parse_int($args["--binSize"])
stepSize = parse_int($args["--stepSize"])
dissimThresh = parse_float($args["--dissimThresh"])
minPositiveSwitchScores = parse_int($args["--minPositiveSwitchScores"])
thetaRef = parse_float($args["--thetaREF"])
thetaAlt = parse_float($args["--thetaALT"])
posteriorProbMin = parse_float($args["--posteriorProbMin"])
maxDissim = parse_float($args["--maxDissim"])
minSwitchScore = parse_float($args["--minSwitchScore"])
if $args["--maxUseNcells"] != "nil": maxUseNcells = parse_int($args["--maxUseNcells"])
cmPmb = parse_float($args["--cmPmb"])
vcfGtPhased = parse_bool($args["--phased"])
notSortMtx = parse_bool($args["--notSortMtx"])
out_dir = $args["<out_prefix>"]
if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_"
if args["phase"] or args["xo"] or args["sxo"]:
if $args["--batchSize"] != "nil":
batchSize = parse_int($args["--batchSize"])
else:
batchSize = 0
var run_phase, run_swphase, run_xo, run_sxo, run_auto_phase = false
run_phase = args["phase"]
run_swphase = args["swphase"]
run_xo = args["xo"]
run_sxo = args["sxo"]
run_auto_phase = args["autophase"]
if run_auto_phase:
run_phase = true
run_swphase = true
if run_phase or run_xo or run_sxo:
barcodeFile = $args["<barcodeFile>"]
var hf = hts.hts_open(cstring(barcodeFile), "r")
var barcodeTable = newTable[string,int](initialSize = 1024)
......@@ -204,7 +226,7 @@ Options:
discard barcodeTable.hasKeyOrPut(v, ithSperm)
ithSperm.inc
discard hf.hts_close()
if args["phase"] or args["xo"]:
if run_phase or run_xo:
var
ibam:Bam
ivcf:VCF
......@@ -221,12 +243,13 @@ Options:
quit "couldn't open input bam"
if not open(ivcf, vcff, threads=threads):
quit "couldn't open: vcf file"
if args["phase"]:
if run_phase:
echo $now() & " running phase"
var templateCell = parse_int($args["--templateCell"])
var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string
var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
for chrom in s_Chrs:
echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom
echo $now() & " generating genotype sparse matrix file to " & out_dir & "for chr: " & chrom
outGtMtxFile = out_dir & chrom & "_gtMtx.mtx"
outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx"
outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx"
......@@ -240,66 +263,80 @@ Options:
outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite)
except:
stderr.write getCurrentExceptionMsg()
discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx,
outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal,
minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp,
barcodeTag = barcodeTag, minsnpdepth=minsnpdepth, chrom = chrom)
for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
echo "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile
echo "using the " & outSnpAnnotFile & " for generating phased haplotypes"
discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp = maxdp,barcodeTag = barcodeTag, minsnpdepth = minsnpdepth, chrom = chrom)
if not notSortMtx:
for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
echo $now() & "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile
echo $now() & "using the " & outSnpAnnotFile & " for generating phased haplotypes"
discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile,
diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell,
maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim)
if parse_bool($args["--outvcf"]):
var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz"
if parse_bool($args["--outvcf"]) and (not run_swphase):
var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz"
echo $now() & " write phased haplotypes to VCF " & outvcfFile
discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads)
elif args["xo"]:
echo "running crossover calling from VCF and BAM file\n"
echo "running for these chromosomes: " & $s_Chrs
discard sgcocaller(threads, ivcf, barcodeTable, ibam,
out_dir, mapq, minbsq, mintotal,
maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased)
if run_swphase:
echo $now() & " running swphase to find switch spots and generate corrected phase"
let switchedPhasedAnnotFile = out_dir & chrom & "_corrected_phased_snpAnnot.txt"
let switchScoreFile = out_dir & chrom & "_switch_score.txt"
let switchedPhasedAnnotVcfFile = out_dir & chrom & "_corrected_phased_snpAnnot.vcf.gz"
echo $now() & "scanning with swphase to find switch spots and generate corrected phase"
discard correctPhase(outGtMtxFile, outphasedSnpAnnotFile, switchedPhasedAnnotFile, switchScoreFile, lookBeyondSnps = lookBeyondSnps, minSwitchScore = minSwitchScore,
minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize,dissimThresh = dissimThresh, maxUseNcells = maxUseNcells )
echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile
discard writePhaseToVCF(vcff, switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = chrom)
elif run_xo:
echo $now() & " running crossover calling from VCF and BAM file\n"
echo $now() & " running for these chromosomes: " & $s_Chrs
discard sgcocaller(threads, ivcf, barcodeTable, ibam, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs, barcodeTag, phased = vcfGtPhased)
## sort entries in mtx files
for chrom in s_Chrs:
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
if not notSortMtx:
for chrom in s_Chrs:
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
ibam.close()
ivcf.close()
elif args["sxo"]:
elif run_sxo:
if($args["--chrom"] != "nil"):
selectedChrs = $args["--chrom"]
s_Chrs = selectedChrs.split(',')
else:
quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator"
echo "running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase "
echo "running for these chromosomes : " & $s_Chrs
echo $now() & " running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase "
echo $now() & " running for these chromosomes : " & $s_Chrs
let phase_dir = $args["<phaseOutputPrefix>"]
let phasedSnpAnnotFile = $args["<SNPPhaseFile>"]
discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, phasedSnpAnnotFileName = phasedSnpAnnotFile)
for chrom in s_Chrs:
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
elif args["swphase"]:
echo "running swphase to find switch spots and generate corrected phase"
if (batchSize == 0) or (batchSize >= barcodeTable.len):
batchSize = barcodeTable.len
discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,
thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb,
phasedSnpAnnotFileName = phasedSnpAnnotFile,batchSize=batchSize)
if not notSortMtx:
for chrom in s_Chrs:
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
elif run_swphase:
echo $now() & " running swphase to find switch spots and generate corrected phase"
let gtMtxFile = $args["<gtMtxFile>"]
let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"]
let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt"
let switchScoreFile = out_dir & "switch_score.txt"
let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz"
discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize)
echo $now() & " scanning with swphase to find switch spots and generate corrected phase"
discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore,
minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize, dissimThresh =dissimThresh,maxUseNcells = maxUseNcells)
if ($args["--chrom"] == "nil"):
echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option"
discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"])
echo $now() & " assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option"
echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile
discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = $args["--chrom"])
......
......@@ -4,13 +4,16 @@ import sequtils
import math
import streams
import strutils
import times
# let binSize = 2000
# let movingStep = 200
let dissimThresh = 0.0099
#let dissimThresh = 0.0099
#let lookBeyondSnps = 25
let debug = false
let switchPrior = 0.5
#let maxUseNcells = 100
type
switchScoreTuple = tuple
switch_scores: seq[float]
......@@ -24,7 +27,7 @@ type
## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin
## cell_geno snp by cell
proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool =
proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]], dissimThresh: float): bool =
let ncells = cell_geno[0].len
var ncompared = newSeq[int](ncells)
var nmatch = newSeq[int](ncells)
......@@ -46,33 +49,36 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]):
# if debug: echo "bin had many crossovers : " & $nxcells
return true
proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] =
proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int, dissimThresh:float): seq[int] =
#let ncells = gtMtxByCell[0].len
let nSnps = gtMtxByCell.len
let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
# if debug: echo "nBins: " & $nBins
# echo "nBins: " & $nBins
let binIds = toSeq(0..(nBins-1))
var snpPosStart,snpPosEnd: int
var highRiskSnps: seq[int]
for binI in binIds:
# if debug: echo "binI " & $binI
# echo "binI " & $binI
snpPosStart = (binI)*(binSize - movingStep)
if (snpPosStart + binSize) > (nSnps-1):
snpPosEnd = (nSnps-1)
else:
snpPosEnd = snpPosStart + binSize
if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd],
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )):
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd],
dissimThresh = dissimThresh )):
highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd))
# add a last bin as from end of chrom with length binSize
let lastBinStart = nSnps - binSize
let lastBinEnd = nSnps - 1
if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd],
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd])):
highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd))
if lastBinStart > 0:
let lastBinEnd = nSnps - 1
if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd],
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd],
dissimThresh = dissimThresh)):
highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd))
highRiskSnps = deduplicate(highRiskSnps)
# echo highRiskSnps
return highRiskSnps
proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] =
......@@ -81,7 +87,7 @@ proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[Bi
var currentEntrySeq: seq[string]
while not phasedSnpAnnotFileStream.atEnd():
currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace()
fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01")
fullGeno[i] = int8(parseInt(currentEntrySeq[3])).toBinaryGeno(format = "01")
i.inc
if i != nSnps :
quit "phased snpAnnot file does not have the same number of rows with gtMtx"
......@@ -126,7 +132,7 @@ proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] =
proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] =
map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex])
proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25): switchScoreTuple =
proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25, maxUseNcells:int): switchScoreTuple =
var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int
let nSnps = gtMtxByCell.len
let ncells = gtMtxByCell[0].len
......@@ -134,11 +140,17 @@ proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullG
var prob_switch, prob_nonsw: float
var leftIndex,rightIndex, switch_score_snpIndex: seq[int]
var swscore = newSeq[float](nSnps)
var useCells = newSeq[int]()
if (ncells <= maxUseNcells) or (maxUseNcells==0):
useCells = (0..(ncells-1)).toSeq()
else:
let everyN = (int)floor(ncells/maxUseNcells)
useCells = map(toSeq(0..(maxUseNcells-1)),proc(x:int):int = (x * everyN))
for rsnpi in riskySnps:
prob_switch = 0.0
prob_nonsw = 0.0
if fullGeno[rsnpi] == gUnknown: continue
for celli in 0..(ncells-1):
for celli in useCells:
# if celli == 2: continue
leftIndex = newSeq[int]()
rightIndex = newSeq[int]()
......@@ -234,7 +246,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph
phaseSnpAnnotFileFS.close()
return 0
proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int): int =
proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int, dissimThresh:float, maxUseNcells:int): int =
var gtMtxByCell: seq[seq[BinaryGeno]]
var currentEntrySeq: seq[string]
var currentEntry:seq[int]
......@@ -247,25 +259,26 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
switchScoreFileStream = openFileStream(switchScoreFile, fmWrite)
except:
stderr.write getCurrentExceptionMsg()
echo "reading gtMtx by cell"
echo $now() & "reading gtMtx by cell"
discard gtMtxFileStream.readLine()
discard phaseSnpAnnotFileStream.readLine()
#N, i,j
currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace()
currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x))
nSnps = currentEntry[0]
echo "nSnps " & $nSnps
echo "nSnps: " & $nSnps
ncells = currentEntry[1]
echo "ncells " & $ncells
echo "ncells: " & $ncells
echo "totalEntries " & $ currentEntry[2]
## gtMtx is cell by Snp format
gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true)
var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps )
var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize)
echo "riskySnps.length " & $riskySnps.len
var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps)
var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize, dissimThresh = dissimThresh)
echo $now() & "riskySnps.length " & $riskySnps.len
var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell = gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps, maxUseNcells= maxUseNcells)
let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores)
echo "see switch_sore.txt file, and corrected switch errors found at positions: " & $switchSites
switchScoreFileStream.writeLine("#" & $switchSites)
for snpi,score in switchScoresT.switch_scores:
if switchScoresT.switch_scores_snpIndex.find(snpi)>=0:
......
......@@ -31,7 +31,7 @@ proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, m
if not barcodeTable.hasKey(currentCB):
continue
base_off = get_base_offset(position = rec.POS.cint, align = aln)
base = aln.base_at(base_off)
base = aln.base_at(base_off)
if aln.base_quality_at(base_off).cint < minbsq:
continue
total_reads+=1
......
......@@ -101,26 +101,25 @@ proc pathTrackBackIthSperm(currentSperm: var SpermViNodes,
inferProb = currentEm[stateAlt]+math.ln(transProb)
reverseProb = currentEm[stateRef]+math.ln(1-transProb)
viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") )
return 0
proc pathTrackBack*(scSpermSeq: var SeqSpermViNodes,
proc pathTrackBack*(scSpermSeq: TableRef[int,SpermViNodes],
thetaRef: float,
thetaAlt: float,
cmPmb: float,
outFileVStateMtx: var FileStream,
viSegmentInfo: var FileStream): int =
var posEnd: int64
var inferProb,reverseProb = 0.0
var currentEm: array[stateRef..stateAlt, float]
var lastNode: ViNode
var spermVNseq: SpermViNodes
# var spermVNseq: SpermViNodes
# 0..(scSpermSeq.len-1)
# rightGap,leftGap = [0.0,0.0]
# spermVNseq = scSpermSeq[ithSperm]
var ithSNP: int
for ithSperm in 0..(scSpermSeq.len-1):
## rightGap,leftGap = [0.0,0.0]
spermVNseq = scSpermSeq[ithSperm]
if spermVNseq.viNodeseq.len==0: continue
for ithSperm,spermVNseq in scSpermSeq:
if spermVNseq.viNodeseq.len==0:
continue
lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)]
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt)
if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]:
......
......@@ -30,9 +30,7 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o
for rec in ivcf.query(chrom):
var rec_alt = rec.ALT[0][0]
var rec_ref = rec.REF[0]
cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,
minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,
minbsq=minbsq,barcodeTag=barcodeTag)
cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,minbsq=minbsq,barcodeTag=barcodeTag)
cellGtNodeCopy = cellGtNodes
for cellbarcode in cellGtNodeCopy.keys:
cellDP = cellGtNodes[cellbarcode].alleles.len
......
......@@ -13,10 +13,11 @@ type
snpIndexLookUp*: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp*: Table[int,int]
SeqSpermViNodes* = seq[SpermViNodes]
# SeqSpermViNodes* = seq[SpermViNodes]
proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int =
if scSpermSeq[ithSperm].viNodeseq.len==0:
proc addViNodeIthSperm*(scSpermSeq: TableRef[int,SpermViNodes], cAlt: int16, cRef: int16, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int =
if not scSpermSeq.hasKey(ithSperm):
scSpermSeq[ithSperm] = SpermViNodes()
var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
......@@ -26,7 +27,6 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i
currentViNode.pos = rec_pos
currentViNode.cAlt = cAlt
currentViNode.cRef = cRef
scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
......@@ -65,7 +65,7 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i
return 0
proc addViNode*(barcodeTable: TableRef,
alleleCountTable: Table[string,allele_expr],
scSpermSeq: var SeqSpermViNodes,
scSpermSeq: TableRef[int,SpermViNodes],
outFileTotalCountMtx: var FileStream,
outFileAltCountMtx: var FileStream,
nnsize: var int,
......@@ -80,13 +80,13 @@ proc addViNode*(barcodeTable: TableRef,
for bc, ac in alleleCountTable.pairs:
# ac.tostring(acs)
## mindp, maxdp, they are values per cell
if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue
if (ac.cref+ac.calt) < mindp or (ac.cref+ac.calt) > maxdp: continue
var ithSperm = barcodeTable[bc]
## write to mtx Ref count
outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt))
outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb)
discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = ac.calt, cRef = ac.cref, ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb)
return 0
# The size line
......@@ -37,11 +37,11 @@ proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh: float)
let prob_1_p = 1 - prob_0_p
if max(prob_0_p,prob_1_p) >= posterior_thresh:
# echo "posterior_prob: " & $(max(prob_0_p,prob_1_p))
if prob_0_p > prob_1_p:
# echo "returned inferred geno: gREF"
if prob_0_p > prob_1_p:
# "returned inferred geno: gREF"
return gREF
else:
# echo "returned inferred geno: gALT"
#"returned inferred geno: gALT"
return gALT
return gUnknown
# return full sequence of template geno by inferring missing SNPs's genotypes
......@@ -75,7 +75,7 @@ proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], po
offset += 1
totalIter += 1
if coexisPos.len < int(nAnchorSnps/2):
if debug: echo "not enough coexisting SNPs for jth cell: " & $j
# if debug: echo "not enough coexisting SNPs for jth cell: " & $j
continue
snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]),
temp_geno = map(coexisPos,proc(x:int): BinaryGeno = templateGeno[x]) )
......
......@@ -9,16 +9,16 @@ import sequtils
type
allele_expr* = ref object
cref*: int
calt*: int
cref*: int16
calt*: int16
ViState* = enum
stateRef, stateAlt, stateN
BinaryGeno* = enum
gUnknown, gREF, gALT
ViNode* = object
pos*: int
cRef*: int
cAlt*: int
cRef*: int16
cAlt*: int16
pathScore*: array[stateRef..stateAlt, float]
pathState*: array[stateRef..stateAlt, ViState]
state*: ViState
......@@ -37,7 +37,7 @@ type
MtxRow* = tuple
rowIndex: int
colIndex: int
value: int
value: int16
type
Mtx* = ref object
header*: string
......@@ -70,10 +70,10 @@ proc get_base_offset*(position:int, align: Record): int =
# get the base
base_offset = qoff - over-1
break
return base_offset
return base_offset
proc toBinaryGeno*(x: int, format = "12") : BinaryGeno =
proc toBinaryGeno*(x: int8, format = "12") : BinaryGeno =
if(format == "01"):
if x == 1: return gALT
if x == 0: return gREF
......@@ -88,15 +88,18 @@ proc toBinaryGeno*(x: int, format = "12") : BinaryGeno =
proc readGtMtxToSeq*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]], by_cell = false):int =
var currentEntrySeq:seq[int]
var currentLine:seq[string]
while not mtxFileStream.atEnd():
currentLine = mtxFileStream.readLine().splitWhitespace()
## i j 1-based from file
currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x))
if by_cell:
gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = currentEntrySeq[2].toBinaryGeno
else:
gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno
if by_cell:
while not mtxFileStream.atEnd():
currentLine = mtxFileStream.readLine().splitWhitespace()
## i j 1-based from file
currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x))
gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = int8(currentEntrySeq[2]).toBinaryGeno
else:
while not mtxFileStream.atEnd():
currentLine = mtxFileStream.readLine().splitWhitespace()
## i j 1-based from file
currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x))
gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = int8(currentEntrySeq[2]).toBinaryGeno
return 0
proc add_allele*(g:GtNode, alleleBinGeno:int):string = g.alleles & $alleleBinGeno
......@@ -105,7 +108,7 @@ proc inc_count_cref*(a:allele_expr) = inc(a.cref)
proc inc_count_calt*(a:allele_expr) = inc(a.calt)
proc getEmission*(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
cRef:int16,cAlt:int16): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
......@@ -182,7 +185,7 @@ proc readMtx*(mtx_file:string): Mtx =
current_line = mtxFileStream.readLine()
sparseMtx.lines.add((rowIndex: parseInt(current_line.splitWhitespace()[0]),
colIndex: parseInt(current_line.splitWhitespace()[1]),
value: parseInt(current_line.splitWhitespace()[2]) ))
value: int16(parseInt(current_line.splitWhitespace()[2])) ))
mtxFileStream.close()
return sparseMtx
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment