Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • biocellgen-public/sgcocaller
1 result
Show changes
Commits on Source (2)
0.3.8 changed minDP minTotalDP to be inclusive
0.3.9 swphase uses 100 cells by default. sxo can call crossovers for batched cells. Fixed a few bugs.
...@@ -15,7 +15,7 @@ import sgcocaller/sgphase ...@@ -15,7 +15,7 @@ import sgcocaller/sgphase
import sgcocaller/writeVCF import sgcocaller/writeVCF
import sgcocaller/correctPhase import sgcocaller/correctPhase
import sgcocaller/sgcocaller_sxo import sgcocaller/sgcocaller_sxo
import times
let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] let initProb:array[stateRef..stateAlt, float]=[0.5,0.5]
...@@ -26,18 +26,18 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, ...@@ -26,18 +26,18 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
var bulkBam = false var bulkBam = false
if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"): if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"):
echo "running in bulk mode and all reads are regarded as from one sample/cell" echo $now() & " running in bulk mode and all reads are regarded as from one sample/cell"
bulkBam = true bulkBam = true
else: else:
echo "running sgcoaller xo for " & $barcodeTable.len & " single cells" echo $now() & " running sgcoaller xo for " & $barcodeTable.len & " single cells"
## iterate through each selected chromosome ## iterate through each selected chromosome
for chrom in s_Chrs: for chrom in s_Chrs:
var snpIndex, nnsize = 0 var snpIndex, nnsize = 0
## number of non zeros ## number of non zeros
var scSpermSeq:SeqSpermViNodes var scSpermSeq = newTable[int,SpermViNodes]()
## matches with the order in barcodeTable ## matches with the order in barcodeTable
scSpermSeq.setLen(barcodeTable.len) ## scSpermSeq.setLen(barcodeTable.len)
var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream var outFileSNPanno,outFileTotalCountMtx,outFileAltCountMtx,outFileVStateMtx,viSegmentInfo:FileStream
let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general" let sparseMatrixHeader = "%%MatrixMarket matrix coordinate integer general"
try: try:
...@@ -91,17 +91,17 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, ...@@ -91,17 +91,17 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
return 0 return 0
when(isMainModule): when(isMainModule):
let version = "0.3.7" let version = "0.3.9"
var doc = format(""" var doc = format("""
$version $version
Usage: Usage:
sgcocaller autophase [options] <BAM> <VCF> <barcodeFile> <out_prefix>
sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix> sgcocaller phase [options] <BAM> <VCF> <barcodeFile> <out_prefix>
sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix> sgcocaller swphase [options] <gtMtxFile> <phasedSnpAnnotFile> <referenceVCF> <out_prefix>
sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix> sgcocaller sxo [options] <SNPPhaseFile> <phaseOutputPrefix> <barcodeFile> <out_prefix>
sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix>
Arguments: Arguments:
<BAM> the read alignment file with records of single-cell DNA reads <BAM> the read alignment file with records of single-cell DNA reads
...@@ -112,10 +112,11 @@ Arguments: ...@@ -112,10 +112,11 @@ Arguments:
<out_prefix> the prefix of output files <out_prefix> the prefix of output files
<fmf> the fragment file from running sgcocaller fmf
<out_vcf> the output vcf aftering phasing blocks in hapfile <out_vcf> the output vcf aftering phasing blocks in hapfile
<gtMtxFile> the output gtMtx.mtx file from running sgcocaller phase
<phasedSnpAnnotFile> the output phased_snpAnnot.txt from running sgcocaller phase
Options: Options:
-t --threads <threads> number of BAM decompression threads [default: 4] -t --threads <threads> number of BAM decompression threads [default: 4]
...@@ -142,28 +143,34 @@ Options: ...@@ -142,28 +143,34 @@ Options:
--minPositiveSwitchScores <minPositiveSwitchScores> the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8] --minPositiveSwitchScores <minPositiveSwitchScores> the min number of continuing SNPs with positive switch scores to do switch error correction [default: 8]
--binSize <binSize> the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default: 2000] --binSize <binSize> the size of SNP bins for scanning swith errors, users are recommended to increase this option when SNP density is high. [default: 2000]
--stepSize <stepSize> the move step size used in combination with --binSize. [default: 200] --stepSize <stepSize> the move step size used in combination with --binSize. [default: 200]
--dissimThresh <dissimThresh> the threshold used on the allele concordance ratio for determining if a SNP bin contains a crossover. [default: 0.0099]
--batchSize <batchSize> the number of cells to process in one batch when running sxo. This option is only needed when the memory is limited.
--notSortMtx do not sort the output mtx.
--maxUseNcells <maxUseNcells> the number of cells to use for calculating switch scores. All cells are used if not set
-h --help show help -h --help show help
Examples Examples
./sgcocaller phase gtMtxFile phaseOutputPrefix ./sgcocaller autophase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix
./sgcocaller phase possorted_bam.bam hetSNPs.vcf.gz barcodeFile.tsv phaseOutputPrefix
./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp ./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp
./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp ./sgcocaller sxo snp_phase.txt phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp
""" % ["version", version]) """ % ["version", version])
let args = docopt(doc, version=version) let args = docopt(doc, version=version)
var var
threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize:int threads,mapq,minbsq,mintotal,maxtotal,mindp,maxdp,minsnpdepth,maxExpand,lookBeyondSnps,minPositiveSwitchScores,binSize,stepSize,batchSize,maxUseNcells:int
thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore:float thetaREF,thetaALT,cmPmb,posteriorProbMin,maxDissim,minSwitchScore,dissimThresh:float
barcodeTag="CB" barcodeTag="CB"
out_dir,selectedChrs,barcodeFile,bamfile,vcff:string out_dir,selectedChrs,barcodeFile,bamfile,vcff:string
vcfGtPhased = false vcfGtPhased,notSortMtx = false
s_Chrs: seq[string] s_Chrs: seq[string]
outFileTotalCountMtxFile, outFileAltCountMtxFile: string outFileTotalCountMtxFile, outFileAltCountMtxFile: string
echo $args echo $now() & $args
echo $now() & " running sgcocaller " & $version
threads = parse_int($args["--threads"]) threads = parse_int($args["--threads"])
barcodeTag = $args["--barcodeTag"] barcodeTag = $args["--barcodeTag"]
mindp = parse_int($args["--minDP"]) mindp = parse_int($args["--minDP"])
...@@ -177,18 +184,33 @@ Options: ...@@ -177,18 +184,33 @@ Options:
lookBeyondSnps = parse_int($args["--lookBeyondSnps"]) lookBeyondSnps = parse_int($args["--lookBeyondSnps"])
binSize = parse_int($args["--binSize"]) binSize = parse_int($args["--binSize"])
stepSize = parse_int($args["--stepSize"]) stepSize = parse_int($args["--stepSize"])
dissimThresh = parse_float($args["--dissimThresh"])
minPositiveSwitchScores = parse_int($args["--minPositiveSwitchScores"]) minPositiveSwitchScores = parse_int($args["--minPositiveSwitchScores"])
thetaRef = parse_float($args["--thetaREF"]) thetaRef = parse_float($args["--thetaREF"])
thetaAlt = parse_float($args["--thetaALT"]) thetaAlt = parse_float($args["--thetaALT"])
posteriorProbMin = parse_float($args["--posteriorProbMin"]) posteriorProbMin = parse_float($args["--posteriorProbMin"])
maxDissim = parse_float($args["--maxDissim"]) maxDissim = parse_float($args["--maxDissim"])
minSwitchScore = parse_float($args["--minSwitchScore"]) minSwitchScore = parse_float($args["--minSwitchScore"])
if $args["--maxUseNcells"] != "nil": maxUseNcells = parse_int($args["--maxUseNcells"])
cmPmb = parse_float($args["--cmPmb"]) cmPmb = parse_float($args["--cmPmb"])
vcfGtPhased = parse_bool($args["--phased"]) vcfGtPhased = parse_bool($args["--phased"])
notSortMtx = parse_bool($args["--notSortMtx"])
out_dir = $args["<out_prefix>"] out_dir = $args["<out_prefix>"]
if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_" if (out_dir[^1] != '_') and (out_dir[^1] != '/') : out_dir = out_dir & "_"
if $args["--batchSize"] != "nil":
if args["phase"] or args["xo"] or args["sxo"]: batchSize = parse_int($args["--batchSize"])
else:
batchSize = 0
var run_phase, run_swphase, run_xo, run_sxo, run_auto_phase = false
run_phase = args["phase"]
run_swphase = args["swphase"]
run_xo = args["xo"]
run_sxo = args["sxo"]
run_auto_phase = args["autophase"]
if run_auto_phase:
run_phase = true
run_swphase = true
if run_phase or run_xo or run_sxo:
barcodeFile = $args["<barcodeFile>"] barcodeFile = $args["<barcodeFile>"]
var hf = hts.hts_open(cstring(barcodeFile), "r") var hf = hts.hts_open(cstring(barcodeFile), "r")
var barcodeTable = newTable[string,int](initialSize = 1024) var barcodeTable = newTable[string,int](initialSize = 1024)
...@@ -204,7 +226,7 @@ Options: ...@@ -204,7 +226,7 @@ Options:
discard barcodeTable.hasKeyOrPut(v, ithSperm) discard barcodeTable.hasKeyOrPut(v, ithSperm)
ithSperm.inc ithSperm.inc
discard hf.hts_close() discard hf.hts_close()
if args["phase"] or args["xo"]: if run_phase or run_xo:
var var
ibam:Bam ibam:Bam
ivcf:VCF ivcf:VCF
...@@ -221,12 +243,13 @@ Options: ...@@ -221,12 +243,13 @@ Options:
quit "couldn't open input bam" quit "couldn't open input bam"
if not open(ivcf, vcff, threads=threads): if not open(ivcf, vcff, threads=threads):
quit "couldn't open: vcf file" quit "couldn't open: vcf file"
if args["phase"]: if run_phase:
echo $now() & " running phase"
var templateCell = parse_int($args["--templateCell"]) var templateCell = parse_int($args["--templateCell"])
var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile, outSnpAnnotFile, outphasedSnpAnnotFile,outdiagnosticDataframeFile : string
var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream var outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
for chrom in s_Chrs: for chrom in s_Chrs:
echo "generating genotype sparse matrix file to " & out_dir & "for chr " & chrom echo $now() & " generating genotype sparse matrix file to " & out_dir & "for chr: " & chrom
outGtMtxFile = out_dir & chrom & "_gtMtx.mtx" outGtMtxFile = out_dir & chrom & "_gtMtx.mtx"
outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx" outTotalCountMtxFile = out_dir & chrom & "_totalMtx.mtx"
outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx" outAltCountMtxFile = out_dir & chrom & "_altMtx.mtx"
...@@ -240,66 +263,80 @@ Options: ...@@ -240,66 +263,80 @@ Options:
outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite) outAltCountMtx = openFileStream(outAltCountMtxFile, fmWrite)
except: except:
stderr.write getCurrentExceptionMsg() stderr.write getCurrentExceptionMsg()
discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp = maxdp,barcodeTag = barcodeTag, minsnpdepth = minsnpdepth, chrom = chrom)
outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx, if not notSortMtx:
outSnpAnnot = outSnpAnnot, maxTotalReads = maxtotal, for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]:
minTotalReads = mintotal, mapq = mapq, minbsq = minbsq, minCellDp = mindp,maxCellDp =maxdp, var imtx = readMtx(mtx_file = mtxFile)
barcodeTag = barcodeTag, minsnpdepth=minsnpdepth, chrom = chrom) discard sortWriteMtx(imtx, mtx_file = mtxFile)
for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]: echo $now() & "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile
var imtx = readMtx(mtx_file = mtxFile) echo $now() & "using the " & outSnpAnnotFile & " for generating phased haplotypes"
discard sortWriteMtx(imtx, mtx_file = mtxFile)
echo "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile
echo "using the " & outSnpAnnotFile & " for generating phased haplotypes"
discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile, discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile,
diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell, diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell,
maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim) maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim)
if parse_bool($args["--outvcf"]): if parse_bool($args["--outvcf"]) and (not run_swphase):
var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz" var outvcfFile = out_dir & chrom & "_phased_snpAnnot.vcf.gz"
echo $now() & " write phased haplotypes to VCF " & outvcfFile
discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads) discard writePhaseToVCF(vcff, outvcfFile, outphasedSnpAnnotFile,threads = threads)
elif args["xo"]: if run_swphase:
echo "running crossover calling from VCF and BAM file\n" echo $now() & " running swphase to find switch spots and generate corrected phase"
echo "running for these chromosomes: " & $s_Chrs let switchedPhasedAnnotFile = out_dir & chrom & "_corrected_phased_snpAnnot.txt"
discard sgcocaller(threads, ivcf, barcodeTable, ibam, let switchScoreFile = out_dir & chrom & "_switch_score.txt"
out_dir, mapq, minbsq, mintotal, let switchedPhasedAnnotVcfFile = out_dir & chrom & "_corrected_phased_snpAnnot.vcf.gz"
maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs,barcodeTag,phased = vcfGtPhased) echo $now() & "scanning with swphase to find switch spots and generate corrected phase"
discard correctPhase(outGtMtxFile, outphasedSnpAnnotFile, switchedPhasedAnnotFile, switchScoreFile, lookBeyondSnps = lookBeyondSnps, minSwitchScore = minSwitchScore,
minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize,dissimThresh = dissimThresh, maxUseNcells = maxUseNcells )
echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile
discard writePhaseToVCF(vcff, switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = chrom)
elif run_xo:
echo $now() & " running crossover calling from VCF and BAM file\n"
echo $now() & " running for these chromosomes: " & $s_Chrs
discard sgcocaller(threads, ivcf, barcodeTable, ibam, out_dir, mapq, minbsq, mintotal, maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb, s_Chrs, barcodeTag, phased = vcfGtPhased)
## sort entries in mtx files ## sort entries in mtx files
for chrom in s_Chrs: if not notSortMtx:
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" for chrom in s_Chrs:
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
var imtx = readMtx(mtx_file = mtxFile) for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
discard sortWriteMtx(imtx, mtx_file = mtxFile) var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
ibam.close() ibam.close()
ivcf.close() ivcf.close()
elif args["sxo"]: elif run_sxo:
if($args["--chrom"] != "nil"): if($args["--chrom"] != "nil"):
selectedChrs = $args["--chrom"] selectedChrs = $args["--chrom"]
s_Chrs = selectedChrs.split(',') s_Chrs = selectedChrs.split(',')
else: else:
quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator" quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator"
echo "running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase " echo $now() & " running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase "
echo "running for these chromosomes : " & $s_Chrs echo $now() & " running for these chromosomes : " & $s_Chrs
let phase_dir = $args["<phaseOutputPrefix>"] let phase_dir = $args["<phaseOutputPrefix>"]
let phasedSnpAnnotFile = $args["<SNPPhaseFile>"] let phasedSnpAnnotFile = $args["<SNPPhaseFile>"]
discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb, phasedSnpAnnotFileName = phasedSnpAnnotFile) if (batchSize == 0) or (batchSize >= barcodeTable.len):
for chrom in s_Chrs: batchSize = barcodeTable.len
outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx" discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = phase_dir, out_dir = out_dir,
outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx" thetaREF = thetaREF, thetaALT = thetaALT, cmPmb = cmPmb,s_Chrs =s_Chrs,initProb = initProb,
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]: phasedSnpAnnotFileName = phasedSnpAnnotFile,batchSize=batchSize)
var imtx = readMtx(mtx_file = mtxFile) if not notSortMtx:
discard sortWriteMtx(imtx, mtx_file = mtxFile) for chrom in s_Chrs:
elif args["swphase"]: outFileTotalCountMtxFile = out_dir & chrom & "_totalCount.mtx"
echo "running swphase to find switch spots and generate corrected phase" outFileAltCountMtxFile = out_dir & chrom & "_altCount.mtx"
for mtxFile in [outFileTotalCountMtxFile, outFileAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
elif run_swphase:
echo $now() & " running swphase to find switch spots and generate corrected phase"
let gtMtxFile = $args["<gtMtxFile>"] let gtMtxFile = $args["<gtMtxFile>"]
let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"] let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"]
let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt" let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt"
let switchScoreFile = out_dir & "switch_score.txt" let switchScoreFile = out_dir & "switch_score.txt"
let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz" let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz"
discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize) echo $now() & " scanning with swphase to find switch spots and generate corrected phase"
discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore,
minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize, dissimThresh =dissimThresh,maxUseNcells = maxUseNcells)
if ($args["--chrom"] == "nil"): if ($args["--chrom"] == "nil"):
echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option" echo $now() & " assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option"
echo $now() & " write corrected phased haplotypes to " & switchedPhasedAnnotVcfFile
discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"]) discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.3.9=swphase""",threads = threads, chrom = $args["--chrom"])
......
...@@ -4,13 +4,16 @@ import sequtils ...@@ -4,13 +4,16 @@ import sequtils
import math import math
import streams import streams
import strutils import strutils
import times
# let binSize = 2000 # let binSize = 2000
# let movingStep = 200 # let movingStep = 200
let dissimThresh = 0.0099 #let dissimThresh = 0.0099
#let lookBeyondSnps = 25 #let lookBeyondSnps = 25
let debug = false let debug = false
let switchPrior = 0.5 let switchPrior = 0.5
#let maxUseNcells = 100
type type
switchScoreTuple = tuple switchScoreTuple = tuple
switch_scores: seq[float] switch_scores: seq[float]
...@@ -24,7 +27,7 @@ type ...@@ -24,7 +27,7 @@ type
## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin ## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin
## cell_geno snp by cell ## cell_geno snp by cell
proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]], dissimThresh: float): bool =
let ncells = cell_geno[0].len let ncells = cell_geno[0].len
var ncompared = newSeq[int](ncells) var ncompared = newSeq[int](ncells)
var nmatch = newSeq[int](ncells) var nmatch = newSeq[int](ncells)
...@@ -46,33 +49,36 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): ...@@ -46,33 +49,36 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]):
# if debug: echo "bin had many crossovers : " & $nxcells # if debug: echo "bin had many crossovers : " & $nxcells
return true return true
proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] = proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int, dissimThresh:float): seq[int] =
#let ncells = gtMtxByCell[0].len #let ncells = gtMtxByCell[0].len
let nSnps = gtMtxByCell.len let nSnps = gtMtxByCell.len
let nBins = (int)(ceil(nSnps / (binSize - movingStep))) let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
# if debug: echo "nBins: " & $nBins # echo "nBins: " & $nBins
let binIds = toSeq(0..(nBins-1)) let binIds = toSeq(0..(nBins-1))
var snpPosStart,snpPosEnd: int var snpPosStart,snpPosEnd: int
var highRiskSnps: seq[int] var highRiskSnps: seq[int]
for binI in binIds: for binI in binIds:
# if debug: echo "binI " & $binI # echo "binI " & $binI
snpPosStart = (binI)*(binSize - movingStep) snpPosStart = (binI)*(binSize - movingStep)
if (snpPosStart + binSize) > (nSnps-1): if (snpPosStart + binSize) > (nSnps-1):
snpPosEnd = (nSnps-1) snpPosEnd = (nSnps-1)
else: else:
snpPosEnd = snpPosStart + binSize snpPosEnd = snpPosStart + binSize
if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd],
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd] )): cell_geno = gtMtxByCell[snpPosStart..snpPosEnd],
dissimThresh = dissimThresh )):
highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd)) highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd))
# add a last bin as from end of chrom with length binSize # add a last bin as from end of chrom with length binSize
let lastBinStart = nSnps - binSize let lastBinStart = nSnps - binSize
let lastBinEnd = nSnps - 1 if lastBinStart > 0:
if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd], let lastBinEnd = nSnps - 1
cell_geno = gtMtxByCell[snpPosStart..snpPosEnd])): if (hasCrossover(templ_geno = fullGeno[snpPosStart..snpPosEnd],
highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd)) cell_geno = gtMtxByCell[snpPosStart..snpPosEnd],
dissimThresh = dissimThresh)):
highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd))
highRiskSnps = deduplicate(highRiskSnps) highRiskSnps = deduplicate(highRiskSnps)
# echo highRiskSnps
return highRiskSnps return highRiskSnps
proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] = proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] =
...@@ -81,7 +87,7 @@ proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[Bi ...@@ -81,7 +87,7 @@ proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[Bi
var currentEntrySeq: seq[string] var currentEntrySeq: seq[string]
while not phasedSnpAnnotFileStream.atEnd(): while not phasedSnpAnnotFileStream.atEnd():
currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace() currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace()
fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01") fullGeno[i] = int8(parseInt(currentEntrySeq[3])).toBinaryGeno(format = "01")
i.inc i.inc
if i != nSnps : if i != nSnps :
quit "phased snpAnnot file does not have the same number of rows with gtMtx" quit "phased snpAnnot file does not have the same number of rows with gtMtx"
...@@ -126,7 +132,7 @@ proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = ...@@ -126,7 +132,7 @@ proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] =
proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] =
map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex]) map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex])
proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25): switchScoreTuple = proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25, maxUseNcells:int): switchScoreTuple =
var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int
let nSnps = gtMtxByCell.len let nSnps = gtMtxByCell.len
let ncells = gtMtxByCell[0].len let ncells = gtMtxByCell[0].len
...@@ -134,11 +140,17 @@ proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullG ...@@ -134,11 +140,17 @@ proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullG
var prob_switch, prob_nonsw: float var prob_switch, prob_nonsw: float
var leftIndex,rightIndex, switch_score_snpIndex: seq[int] var leftIndex,rightIndex, switch_score_snpIndex: seq[int]
var swscore = newSeq[float](nSnps) var swscore = newSeq[float](nSnps)
var useCells = newSeq[int]()
if (ncells <= maxUseNcells) or (maxUseNcells==0):
useCells = (0..(ncells-1)).toSeq()
else:
let everyN = (int)floor(ncells/maxUseNcells)
useCells = map(toSeq(0..(maxUseNcells-1)),proc(x:int):int = (x * everyN))
for rsnpi in riskySnps: for rsnpi in riskySnps:
prob_switch = 0.0 prob_switch = 0.0
prob_nonsw = 0.0 prob_nonsw = 0.0
if fullGeno[rsnpi] == gUnknown: continue if fullGeno[rsnpi] == gUnknown: continue
for celli in 0..(ncells-1): for celli in useCells:
# if celli == 2: continue # if celli == 2: continue
leftIndex = newSeq[int]() leftIndex = newSeq[int]()
rightIndex = newSeq[int]() rightIndex = newSeq[int]()
...@@ -234,7 +246,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph ...@@ -234,7 +246,7 @@ proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, ph
phaseSnpAnnotFileFS.close() phaseSnpAnnotFileFS.close()
return 0 return 0
proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int): int = proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int, binSize:int, stepSize:int, dissimThresh:float, maxUseNcells:int): int =
var gtMtxByCell: seq[seq[BinaryGeno]] var gtMtxByCell: seq[seq[BinaryGeno]]
var currentEntrySeq: seq[string] var currentEntrySeq: seq[string]
var currentEntry:seq[int] var currentEntry:seq[int]
...@@ -247,25 +259,26 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn ...@@ -247,25 +259,26 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
switchScoreFileStream = openFileStream(switchScoreFile, fmWrite) switchScoreFileStream = openFileStream(switchScoreFile, fmWrite)
except: except:
stderr.write getCurrentExceptionMsg() stderr.write getCurrentExceptionMsg()
echo "reading gtMtx by cell" echo $now() & "reading gtMtx by cell"
discard gtMtxFileStream.readLine() discard gtMtxFileStream.readLine()
discard phaseSnpAnnotFileStream.readLine() discard phaseSnpAnnotFileStream.readLine()
#N, i,j #N, i,j
currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace() currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace()
currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x)) currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x))
nSnps = currentEntry[0] nSnps = currentEntry[0]
echo "nSnps " & $nSnps echo "nSnps: " & $nSnps
ncells = currentEntry[1] ncells = currentEntry[1]
echo "ncells " & $ncells echo "ncells: " & $ncells
echo "totalEntries " & $ currentEntry[2] echo "totalEntries " & $ currentEntry[2]
## gtMtx is cell by Snp format ## gtMtx is cell by Snp format
gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells)) gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true) discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true)
var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps ) var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps )
var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize) var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell, binSize = binSize, movingStep = stepSize, dissimThresh = dissimThresh)
echo "riskySnps.length " & $riskySnps.len echo $now() & "riskySnps.length " & $riskySnps.len
var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps) var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell = gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps, maxUseNcells= maxUseNcells)
let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores) let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores)
echo "see switch_sore.txt file, and corrected switch errors found at positions: " & $switchSites
switchScoreFileStream.writeLine("#" & $switchSites) switchScoreFileStream.writeLine("#" & $switchSites)
for snpi,score in switchScoresT.switch_scores: for snpi,score in switchScoresT.switch_scores:
if switchScoresT.switch_scores_snpIndex.find(snpi)>=0: if switchScoresT.switch_scores_snpIndex.find(snpi)>=0:
......
...@@ -31,7 +31,7 @@ proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, m ...@@ -31,7 +31,7 @@ proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, m
if not barcodeTable.hasKey(currentCB): if not barcodeTable.hasKey(currentCB):
continue continue
base_off = get_base_offset(position = rec.POS.cint, align = aln) base_off = get_base_offset(position = rec.POS.cint, align = aln)
base = aln.base_at(base_off) base = aln.base_at(base_off)
if aln.base_quality_at(base_off).cint < minbsq: if aln.base_quality_at(base_off).cint < minbsq:
continue continue
total_reads+=1 total_reads+=1
......
...@@ -101,26 +101,25 @@ proc pathTrackBackIthSperm(currentSperm: var SpermViNodes, ...@@ -101,26 +101,25 @@ proc pathTrackBackIthSperm(currentSperm: var SpermViNodes,
inferProb = currentEm[stateAlt]+math.ln(transProb) inferProb = currentEm[stateAlt]+math.ln(transProb)
reverseProb = currentEm[stateRef]+math.ln(1-transProb) reverseProb = currentEm[stateRef]+math.ln(1-transProb)
viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") ) viSegmentInfo.writeLine(join(["ithSperm" & $ithSperm, $posStart, $posEnd, $(inferProb-reverseProb) ,$cSNP, "2"],sep = " ") )
return 0 return 0
proc pathTrackBack*(scSpermSeq: var SeqSpermViNodes, proc pathTrackBack*(scSpermSeq: TableRef[int,SpermViNodes],
thetaRef: float, thetaRef: float,
thetaAlt: float, thetaAlt: float,
cmPmb: float, cmPmb: float,
outFileVStateMtx: var FileStream, outFileVStateMtx: var FileStream,
viSegmentInfo: var FileStream): int = viSegmentInfo: var FileStream): int =
var posEnd: int64 var posEnd: int64
var inferProb,reverseProb = 0.0 var inferProb,reverseProb = 0.0
var currentEm: array[stateRef..stateAlt, float] var currentEm: array[stateRef..stateAlt, float]
var lastNode: ViNode var lastNode: ViNode
var spermVNseq: SpermViNodes # var spermVNseq: SpermViNodes
# 0..(scSpermSeq.len-1)
# rightGap,leftGap = [0.0,0.0]
# spermVNseq = scSpermSeq[ithSperm]
var ithSNP: int var ithSNP: int
for ithSperm,spermVNseq in scSpermSeq:
for ithSperm in 0..(scSpermSeq.len-1): if spermVNseq.viNodeseq.len==0:
## rightGap,leftGap = [0.0,0.0] continue
spermVNseq = scSpermSeq[ithSperm]
if spermVNseq.viNodeseq.len==0: continue
lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)]
currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt) currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=lastNode.cRef, cAlt=lastNode.cAlt)
if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]:
......
...@@ -30,9 +30,7 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o ...@@ -30,9 +30,7 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o
for rec in ivcf.query(chrom): for rec in ivcf.query(chrom):
var rec_alt = rec.ALT[0][0] var rec_alt = rec.ALT[0][0]
var rec_ref = rec.REF[0] var rec_ref = rec.REF[0]
cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads, cellGtNodes = findGtNodes(rec=rec, variantIndex = variantIndex, ibam=ibam, maxTotalReads=maxTotalReads,minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,minbsq=minbsq,barcodeTag=barcodeTag)
minTotalReads=minTotalReads, mapq = mapq, barcodeTable = barcodeTable,
minbsq=minbsq,barcodeTag=barcodeTag)
cellGtNodeCopy = cellGtNodes cellGtNodeCopy = cellGtNodes
for cellbarcode in cellGtNodeCopy.keys: for cellbarcode in cellGtNodeCopy.keys:
cellDP = cellGtNodes[cellbarcode].alleles.len cellDP = cellGtNodes[cellbarcode].alleles.len
......
...@@ -13,10 +13,11 @@ type ...@@ -13,10 +13,11 @@ type
snpIndexLookUp*: Table[int,int] snpIndexLookUp*: Table[int,int]
## look up from current Sperm SNP index to SNPIndex ## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp*: Table[int,int] spermSnpIndexLookUp*: Table[int,int]
SeqSpermViNodes* = seq[SpermViNodes] # SeqSpermViNodes* = seq[SpermViNodes]
proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int = proc addViNodeIthSperm*(scSpermSeq: TableRef[int,SpermViNodes], cAlt: int16, cRef: int16, ithSperm:int, emissionArray: array[stateRef..stateAlt, float], snpIndex:int, initProb: array[stateRef..stateAlt, float], rec_pos:int,cmPmb:float): int =
if scSpermSeq[ithSperm].viNodeseq.len==0: if not scSpermSeq.hasKey(ithSperm):
scSpermSeq[ithSperm] = SpermViNodes()
var currentViNode = ViNode() var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef] currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt] currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
...@@ -26,7 +27,6 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i ...@@ -26,7 +27,6 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i
currentViNode.pos = rec_pos currentViNode.pos = rec_pos
currentViNode.cAlt = cAlt currentViNode.cAlt = cAlt
currentViNode.cRef = cRef currentViNode.cRef = cRef
scSpermSeq[ithSperm].viNodeseq.add(currentViNode) scSpermSeq[ithSperm].viNodeseq.add(currentViNode)
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
...@@ -65,7 +65,7 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i ...@@ -65,7 +65,7 @@ proc addViNodeIthSperm*(scSpermSeq: var SeqSpermViNodes, cAlt: int, cRef: int, i
return 0 return 0
proc addViNode*(barcodeTable: TableRef, proc addViNode*(barcodeTable: TableRef,
alleleCountTable: Table[string,allele_expr], alleleCountTable: Table[string,allele_expr],
scSpermSeq: var SeqSpermViNodes, scSpermSeq: TableRef[int,SpermViNodes],
outFileTotalCountMtx: var FileStream, outFileTotalCountMtx: var FileStream,
outFileAltCountMtx: var FileStream, outFileAltCountMtx: var FileStream,
nnsize: var int, nnsize: var int,
...@@ -80,13 +80,13 @@ proc addViNode*(barcodeTable: TableRef, ...@@ -80,13 +80,13 @@ proc addViNode*(barcodeTable: TableRef,
for bc, ac in alleleCountTable.pairs: for bc, ac in alleleCountTable.pairs:
# ac.tostring(acs) # ac.tostring(acs)
## mindp, maxdp, they are values per cell ## mindp, maxdp, they are values per cell
if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue if (ac.cref+ac.calt) < mindp or (ac.cref+ac.calt) > maxdp: continue
var ithSperm = barcodeTable[bc] var ithSperm = barcodeTable[bc]
## write to mtx Ref count ## write to mtx Ref count
outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt))
outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt)
nnsize += 1 nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = int(ac.calt), cRef = int(ac.cref), ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb) discard addViNodeIthSperm(scSpermSeq = scSpermSeq, cAlt = ac.calt, cRef = ac.cref, ithSperm = ithSperm, emissionArray =emissionArray, snpIndex =snpIndex,initProb = initProb,rec_pos =rec_pos,cmPmb = cmPmb)
return 0 return 0
# The size line # The size line
...@@ -37,11 +37,11 @@ proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh: float) ...@@ -37,11 +37,11 @@ proc inferGeno(type00:int, type10:int, error_rate = 0.1,posterior_thresh: float)
let prob_1_p = 1 - prob_0_p let prob_1_p = 1 - prob_0_p
if max(prob_0_p,prob_1_p) >= posterior_thresh: if max(prob_0_p,prob_1_p) >= posterior_thresh:
# echo "posterior_prob: " & $(max(prob_0_p,prob_1_p)) # echo "posterior_prob: " & $(max(prob_0_p,prob_1_p))
if prob_0_p > prob_1_p: if prob_0_p > prob_1_p:
# echo "returned inferred geno: gREF" # "returned inferred geno: gREF"
return gREF return gREF
else: else:
# echo "returned inferred geno: gALT" #"returned inferred geno: gALT"
return gALT return gALT
return gUnknown return gUnknown
# return full sequence of template geno by inferring missing SNPs's genotypes # return full sequence of template geno by inferring missing SNPs's genotypes
...@@ -75,7 +75,7 @@ proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], po ...@@ -75,7 +75,7 @@ proc inferSnpGeno*(templateGeno: seq[BinaryGeno], gtMtx:seq[seq[BinaryGeno]], po
offset += 1 offset += 1
totalIter += 1 totalIter += 1
if coexisPos.len < int(nAnchorSnps/2): if coexisPos.len < int(nAnchorSnps/2):
if debug: echo "not enough coexisting SNPs for jth cell: " & $j # if debug: echo "not enough coexisting SNPs for jth cell: " & $j
continue continue
snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]), snpLD = matchTemplateHap(cell_geno = map(coexisPos,proc(x:int): BinaryGeno = gtMtx[j][x]),
temp_geno = map(coexisPos,proc(x:int): BinaryGeno = templateGeno[x]) ) temp_geno = map(coexisPos,proc(x:int): BinaryGeno = templateGeno[x]) )
......
...@@ -9,16 +9,16 @@ import sequtils ...@@ -9,16 +9,16 @@ import sequtils
type type
allele_expr* = ref object allele_expr* = ref object
cref*: int cref*: int16
calt*: int calt*: int16
ViState* = enum ViState* = enum
stateRef, stateAlt, stateN stateRef, stateAlt, stateN
BinaryGeno* = enum BinaryGeno* = enum
gUnknown, gREF, gALT gUnknown, gREF, gALT
ViNode* = object ViNode* = object
pos*: int pos*: int
cRef*: int cRef*: int16
cAlt*: int cAlt*: int16
pathScore*: array[stateRef..stateAlt, float] pathScore*: array[stateRef..stateAlt, float]
pathState*: array[stateRef..stateAlt, ViState] pathState*: array[stateRef..stateAlt, ViState]
state*: ViState state*: ViState
...@@ -37,7 +37,7 @@ type ...@@ -37,7 +37,7 @@ type
MtxRow* = tuple MtxRow* = tuple
rowIndex: int rowIndex: int
colIndex: int colIndex: int
value: int value: int16
type type
Mtx* = ref object Mtx* = ref object
header*: string header*: string
...@@ -70,10 +70,10 @@ proc get_base_offset*(position:int, align: Record): int = ...@@ -70,10 +70,10 @@ proc get_base_offset*(position:int, align: Record): int =
# get the base # get the base
base_offset = qoff - over-1 base_offset = qoff - over-1
break break
return base_offset return base_offset
proc toBinaryGeno*(x: int, format = "12") : BinaryGeno = proc toBinaryGeno*(x: int8, format = "12") : BinaryGeno =
if(format == "01"): if(format == "01"):
if x == 1: return gALT if x == 1: return gALT
if x == 0: return gREF if x == 0: return gREF
...@@ -88,15 +88,18 @@ proc toBinaryGeno*(x: int, format = "12") : BinaryGeno = ...@@ -88,15 +88,18 @@ proc toBinaryGeno*(x: int, format = "12") : BinaryGeno =
proc readGtMtxToSeq*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]], by_cell = false):int = proc readGtMtxToSeq*(mtxFileStream:FileStream, gtMtx:var seq[seq[BinaryGeno]], by_cell = false):int =
var currentEntrySeq:seq[int] var currentEntrySeq:seq[int]
var currentLine:seq[string] var currentLine:seq[string]
if by_cell:
while not mtxFileStream.atEnd(): while not mtxFileStream.atEnd():
currentLine = mtxFileStream.readLine().splitWhitespace() currentLine = mtxFileStream.readLine().splitWhitespace()
## i j 1-based from file ## i j 1-based from file
currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x)) currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x))
if by_cell: gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = int8(currentEntrySeq[2]).toBinaryGeno
gtMtx[(currentEntrySeq[0]-1)][(currentEntrySeq[1]-1)] = currentEntrySeq[2].toBinaryGeno else:
else: while not mtxFileStream.atEnd():
gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = currentEntrySeq[2].toBinaryGeno currentLine = mtxFileStream.readLine().splitWhitespace()
## i j 1-based from file
currentEntrySeq = map(currentLine, proc(x: string): int = parseInt(x))
gtMtx[(currentEntrySeq[1]-1)][(currentEntrySeq[0]-1)] = int8(currentEntrySeq[2]).toBinaryGeno
return 0 return 0
proc add_allele*(g:GtNode, alleleBinGeno:int):string = g.alleles & $alleleBinGeno proc add_allele*(g:GtNode, alleleBinGeno:int):string = g.alleles & $alleleBinGeno
...@@ -105,7 +108,7 @@ proc inc_count_cref*(a:allele_expr) = inc(a.cref) ...@@ -105,7 +108,7 @@ proc inc_count_cref*(a:allele_expr) = inc(a.cref)
proc inc_count_calt*(a:allele_expr) = inc(a.calt) proc inc_count_calt*(a:allele_expr) = inc(a.calt)
proc getEmission*(thetaRef=0.1,thetaAlt=0.9, proc getEmission*(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] = cRef:int16,cAlt:int16): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
...@@ -182,7 +185,7 @@ proc readMtx*(mtx_file:string): Mtx = ...@@ -182,7 +185,7 @@ proc readMtx*(mtx_file:string): Mtx =
current_line = mtxFileStream.readLine() current_line = mtxFileStream.readLine()
sparseMtx.lines.add((rowIndex: parseInt(current_line.splitWhitespace()[0]), sparseMtx.lines.add((rowIndex: parseInt(current_line.splitWhitespace()[0]),
colIndex: parseInt(current_line.splitWhitespace()[1]), colIndex: parseInt(current_line.splitWhitespace()[1]),
value: parseInt(current_line.splitWhitespace()[2]) )) value: int16(parseInt(current_line.splitWhitespace()[2])) ))
mtxFileStream.close() mtxFileStream.close()
return sparseMtx return sparseMtx
......