From d2c1a8c45d68d341a798e9816c832408b9bc048f Mon Sep 17 00:00:00 2001 From: rlyu <rlyu@svi.edu.au> Date: Mon, 11 Jan 2021 10:53:00 +1100 Subject: [PATCH] readme and nimble --- README.md | 39 +++- src/sscocaller.nim | 516 +++++++++++++++++++++++++++++++++++++++++++++ sscocaller.nimble | 18 ++ tests/test.sh | 2 + 4 files changed, 574 insertions(+), 1 deletion(-) create mode 100755 src/sscocaller.nim create mode 100644 sscocaller.nimble create mode 100644 tests/test.sh diff --git a/README.md b/README.md index ed96ea2..bfcba97 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,39 @@ -# sscocaller +## sscocaller: Calling crossovers from single-sperm DNA sequencing reads + +It takes the large bam file which contains aligned DNA reads from a list of single sperm cells and summarizes allele +counts for informative SNP markers. A HMM model is applied for haplotypin each sperm and viterbi algorithm is run +for deriving the inferred haplotype sequence against the list of SNP markers. + +## Inputs + +- Bam, Reads from single sperm cells with BC tag, eg. single-cell alignment pipeline (cellranger) +- VCF, variant calling file that contains the list of SNPs provided +- barcodeFile, the list of cell barcodes + +## Outputs +- Generate per chr allele counts matrices +- Sequence of inferred viterbi state (haplotype state) per chr + + +## Usage +Obtain allele counts for cell barcodes listed in barcodeFile at the SNP positions in VCF from the BAM file. + + Usage: + sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix> + + Options: + -t --threads <threads> number of BAM decompression threads [default: 4] + -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] + -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] + -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) + -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] + -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 10] + -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix + -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] + -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] + -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] + -h --help show help + + Examples + ./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp- diff --git a/src/sscocaller.nim b/src/sscocaller.nim new file mode 100755 index 0000000..88756df --- /dev/null +++ b/src/sscocaller.nim @@ -0,0 +1,516 @@ +## Process the large bam that contains reads from all barcodes and get allele counts +## for the list of SNPs provided in a VCF file +## Generate per cell per chr Allele counts +## Runs viterbi algorithm and output sequence of inferred viterbi state +## Authror: Ruqian Lyu +## Date: 04/11/2020 + + +import os +import docopt +import strutils +import hts +import tables +import sequtils +import math +from rmath import dbinom +import streams + +#echo paramCount(), " ", paramStr(1) + +proc sscocaller(argv: var seq[string]): int = + let doc = """ + Obtain allele counts for cell barcodes listed in barcodeFile at the SNP positions in VCF from the BAM file. + + Usage: + sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix> + + Options: + -t --threads <threads> number of BAM decompression threads [default: 4] + -MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20] + -BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13] + -CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms) + -minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1] + -maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 10] + -chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix + -thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] + -thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] + -cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1] + -h --help show help + + Examples + ./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp- + """ + let args = docopt(doc, argv=argv) + + var + ibam:Bam + threads:int + vcff:string + ivcf:VCF + barcodeFile:string + bamfile:string + selectedChrs:string + out_dir:string + mapq:int + minbsq:int + mintotal:int + mindp:int + maxdp:int + thetaREF:float + thetaALT:float + cmPmb:float + type + allele_expr = ref object + cref: int + calt: int + + proc inc_count_cref(a:allele_expr) = inc(a.cref) + proc inc_count_calt(a:allele_expr) = inc(a.calt) + proc reset_count(a:allele_expr) = + a.calt = 0 + a.cref = 0 + + # proc cref(a:allele_expr): int {.inline.} = return a.cref + # proc calt(a:allele_expr): int {.inline.} = return a.calt + + proc tostring(a:allele_expr, s:var string) {.inline.} = + s.set_len(0) + s.add("\t" & $a.cref & "," & $a.calt) + + + if($args["--threads"] != "nil"): + threads = parse_int($args["--threads"]) + else: threads = 4 + if($args["--minDP"] != "nil"): + mindp = parse_int($args["--minDP"]) + else: mindp = 1 + + if($args["--maxDP"] != "nil"): + maxdp = parse_int($args["--maxDP"]) + else: maxdp = 10 + + if($args["<BAM>"] == "nil"): + quit "input bam is required" + else: bamfile = $args["<BAM>"] + + if($args["<barcodeFile>"] == "nil"): + quit "input barcodeFile is required" + else: barcodeFile = $args["<barcodeFile>"] + + if($args["<out_prefix>"] == "nil"): + quit "output prefix is required" + else: out_dir = $args["<out_prefix>"] + + if($args["<VCF>"] == "nil"): + quit "input VCF file is required" + else: vcff = $args["<VCF>"] + + if($args["--minMAPQ"] != "nil"): + mapq = parse_int($args["--minMAPQ"]) + else: mapq = 20 + if($args["--baseq"] != "nil"): + minbsq = parse_int($args["--baseq"]) + else: minbsq = 13 + if($args["--thetaREF"] != "nil"): + thetaRef = parse_float($args["--thetaREF"]) + else: thetaRef = 0.1 + + if($args["--thetaALT"] != "nil"): + thetaAlt = parse_float($args["--thetaALT"]) + else: thetaAlt = 0.9 + + if($args["--cmPmb"] != "nil"): + cmPmb = parse_float($args["--cmPmb"]) + else: cmPmb = 0.1 + + echo "min mapq ", mapq, " min base qual ", minbsq + + if($args["--chrom"] != "nil"): + selectedChrs = $args["--chrom"] + else: + let a = toSeq(1..19) + if($args["--chrName"] == "nil"): + let b = map(a, proc (x: int): string = "" & $x) + let all_Chrs = concat(b,@["X","Y"]) + selectedChrs = all_Chrs.join(",") + else: + let b = map(a, proc (x: int): string = "chr" & $x) + let all_Chrs = concat(b,@["chrX","chrY"]) + selectedChrs = all_Chrs.join(",") + + let s_Chrs = selectedChrs.split(',') + echo $s_Chrs + if not open(ibam, bamfile, threads=threads, index = true): + quit "couldn't open input bam" + if not open(ivcf, vcff, threads=threads): + quit "couldn't open: vcff" + + var hf = hts.hts_open(cstring(barcodeFile), "r") + #### TODO : Table size + var barcodeTable = newOrderedTable[string,int](initialSize = 1024) + # var outFileTable = initTable[string,File]() + var kstr: hts.kstring_t + kstr.l = 0 + kstr.m = 0 + kstr.s = nil + var ithSperm=0 + ## initiate the table with CB as has keys, allele counts (ref object) as elements + while hts_getline(hf, cint(10), addr kstr) > 0: + if $kstr.s[0] == "#": + continue + var v = $kstr.s + ## barcodekey:spermIndex pair + discard barcodeTable.hasKeyOrPut(v, ithSperm) + ithSperm+=1 + echo "fetch given SNPs in ", barcodeTable.len ," single cells" + + type + AlleleExpr = object + cRef: int + cAlt: int + # Nucleotide = enum + # A, C, T, G + ViState = enum + stateRef, stateAlt, stateN + ViNode = object + pos: int + cRef: int + cAlt: int + pathScore: array[stateRef..stateAlt, float] + pathState: array[stateRef..stateAlt, ViState] + state: ViState + SpermViNodes = object + viNodeseq: seq[ViNode] + ## look up from SNPIndex to current Sperm SNP index + snpIndexLookUp: Table[int,int] + ## look up from current Sperm SNP index to SNPIndex + spermSnpIndexLookUp: Table[int,int] + + let initProb:array[stateRef..stateAlt, float]=[0.5,0.5] + var alexprSeq:seq[AlleleExpr] + proc getEmission(thetaRef=0.1,thetaAlt=0.9, + cRef:int,cAlt:int): array[stateRef..stateAlt, float] = + + var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true), + dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)] + return emissionScore + + #echo getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=10,cAlt=0) + + proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float = + if pos2<pos1: + quit "Wrong order of snps" + var rec = 1-exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb + return rec + #maxTotalReads:int + proc countAllele(rec:Variant, ibam:Bam, chrom:string, mapq: int, + barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] = + var alleleCountTable = initTable[string,allele_expr]() + var rec_alt:char + # var total_reads=0 + rec_alt = rec.ALT[0][0] + for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): + if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue + ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary + ## if not aln.flag.proper_pair: continue + var cbt = tag[string](aln, "CB") + if cbt.isNone: continue + if not barcodeTable.hasKey(cbt.get): continue + var + off = aln.start.int + qoff = 0 + roff_only = 0 + base = 'n' + position = rec.POS.cint + over = 0 + for event in aln.cigar: + var cons = event.consumes + if cons.query: + qoff+=event.len ## the offs to the query sequences + if cons.reference: + off += event.len ## the offs to the reference sequences + if not cons.query: + roff_only += event.len + if off <= position: continue + over = off - position + # get the base + base = aln.base_at(qoff - over) + break + if aln.base_quality_at(qoff - over-1).cint < minbsq: continue + if alleleCountTable.hasKey(cbt.get): + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[cbt.get].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[cbt.get].inc_count_calt + continue + else: + var new_snp = allele_expr(cref:0,calt:0) + alleleCountTable[cbt.get] = new_snp + if aln.base_at(qoff - over-1) == rec.REF[0]: + alleleCountTable[cbt.get].inc_count_cref + continue + if aln.base_at(qoff - over-1) == rec_alt: + alleleCountTable[cbt.get].inc_count_calt + continue + # total_reads+=1 + # if total_reads > maxTotalReads: + # return initTable[string,allele_expr]() + #else: + return alleleCountTable + + ## iterate through each selected chromosome + for chrom in s_Chrs: + # var scTable = initTable[string,SingleSpermRec]() + var snpIndex = 0 + var spermIndex = 0 + ## number of non zeros + var nnsize = 0 +# var snpAnnoSeq:seq[SNPAnnot] + var scSpermSeq:seq[SpermViNodes] + ## matches with the order in barcodeTable + scSpermSeq.setLen(barcodeTable.len) + var outFileSNPanno:FileStream + var outFileTotalCountMtx:FileStream + var outFileAltCountMtx:FileStream + # var outHeaderMtx:FileStream + var outFileVStateMtx:FileStream + var viSegmentInfo: FileStream + try: + outFileSNPanno = openFileStream(out_dir & chrom & "_snpAnnot.txt", fmWrite) + outFileTotalCountMtx = openFileStream(out_dir & chrom & "_totalCount.mtx", fmWrite) + outFileAltCountMtx = openFileStream(out_dir & chrom & "_altCount.mtx", fmWrite) + # outHeaderMtx = openFileStream(out_dir & chrom & "_header.mtx", fmWrite) + outFileVStateMtx = openFileStream(out_dir & chrom & "_vi.mtx", fmWrite) + viSegmentInfo = openFileStream(out_dir & chrom & "_viSegInfo.txt", fmWrite) + #echo strm.readLine() + except: + stderr.write getCurrentExceptionMsg() + + ## write headers to those mtx files and the first line place holder for total_row total_column total_entry + outFileTotalCountMtx.writeLine("%%MatrixMarket matrix coordinate integer general") + outFileTotalCountMtx.writeLine(' '.repeat(50)) + outFileAltCountMtx.writeLine("%%MatrixMarket matrix coordinate integer general") + outFileAltCountMtx.writeLine(' '.repeat(50)) + outFileVStateMtx.writeLine("%%MatrixMarket matrix coordinate integer general") + outFileVStateMtx.writeLine(' '.repeat(50)) + + outFileSNPanno.writeLine("POS" & "\t" & "REF" & "\t" & "ALT" ) + for rec in ivcf.query(chrom): + if rec.ALT.len > 1: continue + if rec.ALT.len == 0: continue + ## alleleCountTable contains for this SNP.POS, each cell barcode's allele counts + var alleleCountTable = countAllele(rec=rec, ibam=ibam, + chrom=chrom, mapq=mapq, + barcodeTable=barcodeTable, + minbsq=minbsq) + if alleleCountTable.len==0: continue + + var rec_alt:char + rec_alt = rec.ALT[0][0] + # out_line = $rec.CHROM & "\t" & $rec.POS & "\t" & rec.REF[0] & "\t" & rec_alt + ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations + snpIndex += 1 + outFileSNPanno.writeLine($rec.POS & "\t" & $rec.REF[0] & "\t" & $rec_alt ) + for bc, ac in alleleCountTable.mpairs: + # ac.tostring(acs) + ## mindp, maxdp, they are values per cell + if (ac.cref+ac.calt) <= mindp or (ac.cref+ac.calt) >= maxdp: continue + + var ithSperm = barcodeTable[bc] + ## write to mtx Ref count + outFileTotalCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $(ac.cRef+ac.cAlt)) + outFileAltCountMtx.writeLine($snpIndex & " " & $(ithSperm+1) & " " & $ac.cAlt) + nnsize += 1 + var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt) + if scSpermSeq[ithSperm].viNodeseq.len==0: + echo "first node to ", ithSperm + var currentViNode = ViNode() + currentViNode.pathScore[stateRef] = ln(initProb[stateRef])+emissionArray[stateRef] + currentViNode.pathScore[stateAlt] = ln(initProb[stateAlt])+emissionArray[stateAlt] + currentViNode.pathState[stateRef] = stateN + currentViNode.pathState[stateAlt] = stateN + currentViNode.state = stateN + currentViNode.pos = int(rec.POS) + currentViNode.cAlt = int(ac.cAlt) + currentViNode.cRef = int(ac.cRef) + + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + else: + #echo "additional node to a cell" + let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] + var ltransProb = ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) + var lnoTransProb = ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) + var currentViNode = ViNode() + # ref/alt -> ref + var refTref = preVNode.pathScore[stateRef] + lnoTransProb + var altTref = preVNode.pathScore[stateAlt] + ltransProb + if refTref > altTref: + currentViNode.pathScore[stateRef] = refTref + currentViNode.pathState[stateRef] = stateRef + else: + currentViNode.pathScore[stateRef] = altTref + currentViNode.pathState[stateRef] = stateAlt + # ref/alt -> alt + var refTalt = preVNode.pathScore[stateRef] + ltransProb + var altTalt = preVNode.pathScore[stateAlt] + lnoTransProb + + if refTalt > altTalt: + currentViNode.pathScore[stateAlt] = refTalt + currentViNode.pathState[stateAlt] = stateRef + else: + currentViNode.pathScore[stateAlt] = altTalt + currentViNode.pathState[stateAlt] = stateAlt + currentViNode.pathScore[stateAlt] += emissionArray[stateAlt] + currentViNode.pathScore[stateRef] += emissionArray[stateRef] + currentViNode.cAlt = int(ac.cAlt) + currentViNode.cRef = int(ac.cRef) + currentViNode.pos = int(rec.POS) + scSpermSeq[ithSperm].viNodeseq.add(currentViNode) + scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len + scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex + # The size line + + var transitFlag = false + var cSNP = 0 + var posEnd,posStart:int64 + var inferProb,reverseProb,transProb = 0.0 + var currentEm,prevEm: array[stateRef..stateAlt, float] + var lastNode: ViNode + var spermVNseq: SpermViNodes + var ithSNP: int + ## trans/no trans + var rightGap,leftGap: array[0..1, float] + for ithSperm in 0..(scSpermSeq.len-1): + ## rightGap,leftGap = [0.0,0.0] + spermVNseq = scSpermSeq[ithSperm] + if spermVNseq.viNodeseq.len==0: continue + lastNode = spermVNseq.viNodeseq[high(spermVNseq.viNodeseq)] + currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, + cRef=lastNode.cRef, + cAlt=lastNode.cAlt) + if lastNode.pathScore[stateRef] > lastNode.pathScore[stateAlt]: + scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateRef + ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") + inferProb = currentEm[stateRef] + reverseProb = currentEm[stateAlt] + else: + scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)].state = stateAlt + ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)+1] + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") + inferProb = currentEm[stateAlt] + reverseProb = currentEm[stateRef] + posEnd = lastNode.pos + cSNP = 1 + ## traceback for yielding the most probable state sequence + for i in 1..high(scSpermSeq[ithSperm].viNodeseq): + var state = scSpermSeq[ithSperm].viNodeseq[^i].state + scSpermSeq[ithSperm].viNodeseq[^(i+1)].state = scSpermSeq[ithSperm].viNodeseq[^i].pathState[state] + prevEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, + cRef=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cRef, + cAlt=scSpermSeq[ithSperm].viNodeseq[^(i+1)].cAlt) + ithSNP = scSpermSeq[ithSperm].spermSnpIndexLookUp[high(scSpermSeq[ithSperm].viNodeseq)-i+1] + transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos, + scSpermSeq[ithSperm].viNodeseq[^(i)].pos, + cmPmb=cmPmb) + # echo inferProb + # echo reverseProb + if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef: + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1") + if state == stateRef: + # not transitioning to a different state ie still in this segment of same state + inferProb+=prevEm[stateRef] + reverseProb+=prevEm[stateAlt] + cSNP += 1 + transitFlag = false + #posStart = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos + else: # there is transition to different state: ref(start) to alt(end) now output the segment info + posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos + #leftGapSize = scSpermSeq[ithSperm].viNodeseq[^(i)].pos - scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos + leftGap=[ln(transProb),ln(1-transProb)] + inferProb+=leftGap[0] + reverseProb+=leftGap[1] + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2") + #echo "ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(reverseProb/inferProb) & " " & $cSNP + #echo $(inferProb/reverseProb) + transitFlag = true + cSNP = 1 + rightGap = leftGap + inferProb = prevEm[stateRef]+rightGap[0] + reverseProb = prevEm[stateAlt]+rightGap[1] + posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos + else: + outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 2") + if state == stateAlt: + # not transitioning to a different state ie still in this segment of same state + inferProb += prevEm[stateAlt] + reverseProb += prevEm[stateRef] + cSNP += 1 + transitFlag = false + else: + ## state transit + posStart = scSpermSeq[ithSperm].viNodeseq[^(i)].pos + leftGap=[ln(transProb),ln(1-transProb)] + inferProb += leftGap[0] + reverseProb += leftGap[1] + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + transitFlag = true + cSNP = 1 + rightGap = leftGap + inferProb = prevEm[stateAlt]+rightGap[0] + reverseProb = prevEm[stateRef]+rightGap[1] + posEnd = scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos + ## traced to the start position of the chromosome for this cell + #leftGap = [0.0,0.0] + if not transitFlag: + ## the first SNP is included in the segment from traced from back + posStart = scSpermSeq[ithSperm].viNodeseq[0].pos + if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + else: + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) + else: + ## the first node has different state from the second, the first node has its own segment + posStart = scSpermSeq[ithSperm].viNodeseq[0].pos + posEnd = posStart + cSNP = 1 + currentEm = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt, + cRef=scSpermSeq[ithSperm].viNodeseq[0].cRef, + cAlt=scSpermSeq[ithSperm].viNodeseq[0].cAlt) + transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[0].pos, + scSpermSeq[ithSperm].viNodeseq[1].pos, + cmPmb=cmPmb) + if scSpermSeq[ithSperm].viNodeseq[0].state == stateRef: + inferProb = currentEm[stateRef]+ln(transProb) + reverseProb = currentEm[stateAlt]+ln(1-transProb) + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 1" ) + else: + inferProb = currentEm[stateAlt]+ln(transProb) + reverseProb = currentEm[stateRef]+ln(1-transProb) + viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2" ) + transitFlag = false + outFileTotalCountMtx.setPosition(49) + outFileVStateMtx.setPosition(49) + outFileAltCountMtx.setPosition(49) + outFileAltCountMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) + outFileVStateMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) + outFileTotalCountMtx.write($snpIndex & " " & $barcodeTable.len & " " & $nnsize) + + # outHeaderMtx.writeLine() + outFileSNPanno.close() + outFileTotalCountMtx.close() + outFileAltCountMtx.close() + # outHeaderMtx.close() + outFileVStateMtx.close() + viSegmentInfo.close() + ibam.close() + ivcf.close() + + return 0 + +var args = commandLineParams() +discard sscocaller(args) diff --git a/sscocaller.nimble b/sscocaller.nimble new file mode 100644 index 0000000..20d31de --- /dev/null +++ b/sscocaller.nimble @@ -0,0 +1,18 @@ +# Package + +version = "0.1.0" +author = "Ruqian Lyu" +description = "sscocaller: calling crossover events from single-sperm DNA sequencing datasets" +license = "MIT" +srcDir = "src" +bin = @["sscocaller"] + + +# Dependencies + +requires "nim >= 1.4.0","docopt","hts >= 0.3.4" +requires "http://github.com/sdwfrost/rmath.git" + +# task test, "run the tests": +# exec "nim c -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo -r tests/test_groups" +# exec "bash tests/functional-tests.sh" diff --git a/tests/test.sh b/tests/test.sh new file mode 100644 index 0000000..b4ec888 --- /dev/null +++ b/tests/test.sh @@ -0,0 +1,2 @@ +./code/sscocaller --threads 2 --chrom 6 data/AAAGTAGCACGTCTCT-1.raw.bam data/AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz data/bcAA.tsv ./percell/ccsnp- + -- GitLab