Skip to content
Snippets Groups Projects
Commit 8624a917 authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

add versioning and refactored code to have mainModule check

parent 71a7cea1
No related branches found
No related tags found
No related merge requests found
Pipeline #6669 passed
......@@ -16,141 +16,127 @@ import math
from distributions/rmath import dbinom
import streams
#echo paramCount(), " ", paramStr(1)
type
allele_expr = ref object
cref: int
calt: int
type
AlleleExpr = object
cRef: int
cAlt: int
# Nucleotide = enum
# A, C, T, G
ViState = enum
stateRef, stateAlt, stateN
ViNode = object
pos: int
cRef: int
cAlt: int
pathScore: array[stateRef..stateAlt, float]
pathState: array[stateRef..stateAlt, ViState]
state: ViState
SpermViNodes = object
viNodeseq: seq[ViNode]
## look up from SNPIndex to current Sperm SNP index
snpIndexLookUp: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp: Table[int,int]
SeqSpermViNodes = seq[SpermViNodes]
#var alexprSeq:seq[AlleleExpr]
proc sscocaller(argv: var seq[string]): int =
let doc = """
Obtain allele counts for cell barcodes listed in barcodeFile at the SNP positions in VCF from the BAM file.
Usage:
sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix>
Options:
-t --threads <threads> number of BAM decompression threads [default: 4]
-MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20]
-BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13]
-CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms)
-minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1]
-maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5]
-maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25]
-minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10]
-chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix
-thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1]
-thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
-cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1]
-h --help show help
Examples
./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp-
"""
let args = docopt(doc, argv=argv)
var
ibam:Bam
threads:int
vcff:string
ivcf:VCF
barcodeFile:string
bamfile:string
selectedChrs:string
out_dir:string
mapq:int
minbsq:int
mintotal:int
maxtotal:int
mindp:int
maxdp:int
thetaREF:float
thetaALT:float
cmPmb:float
type
allele_expr = ref object
cref: int
calt: int
proc inc_count_cref(a:allele_expr) = inc(a.cref)
proc inc_count_calt(a:allele_expr) = inc(a.calt)
proc reset_count(a:allele_expr) =
a.calt = 0
a.cref = 0
# proc cref(a:allele_expr): int {.inline.} = return a.cref
# proc calt(a:allele_expr): int {.inline.} = return a.calt
proc tostring(a:allele_expr, s:var string) {.inline.} =
s.set_len(0)
s.add("\t" & $a.cref & "," & $a.calt)
proc inc_count_cref(a:allele_expr) = inc(a.cref)
proc inc_count_calt(a:allele_expr) = inc(a.calt)
proc reset_count(a:allele_expr) =
a.calt = 0
a.cref = 0
proc tostring(a:allele_expr, s:var string) {.inline.} =
s.set_len(0)
s.add("\t" & $a.cref & "," & $a.calt)
proc getEmission(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
if($args["--threads"] != "nil"):
threads = parse_int($args["--threads"])
else: threads = 4
if($args["--minDP"] != "nil"):
mindp = parse_int($args["--minDP"])
else: mindp = 1
if($args["--maxDP"] != "nil"):
maxdp = parse_int($args["--maxDP"])
else: maxdp = 10
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
return emissionScore
if($args["--maxTotalDP"] != "nil"):
maxtotal = parse_int($args["--maxTotalDP"])
else: maxtotal = 30
if($args["--minTotalDP"] != "nil"):
mintotal = parse_int($args["--minTotalDP"])
else: mintotal = 10
if($args["<BAM>"] == "nil"):
quit "input bam is required"
else: bamfile = $args["<BAM>"]
if($args["<barcodeFile>"] == "nil"):
quit "input barcodeFile is required"
else: barcodeFile = $args["<barcodeFile>"]
if($args["<out_prefix>"] == "nil"):
quit "output prefix is required"
else: out_dir = $args["<out_prefix>"]
if($args["<VCF>"] == "nil"):
quit "input VCF file is required"
else: vcff = $args["<VCF>"]
proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float =
if pos2<pos1:
quit "Wrong order of snps"
var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb
return rec
#
proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int,
minTotalReads:int,
chrom:string, mapq: int,
barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] =
var alleleCountTable = initTable[string,allele_expr]()
var rec_alt:char
var total_reads=0
rec_alt = rec.ALT[0][0]
for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint):
var cbt = tag[string](aln, "CB")
if cbt.isNone:
# echo "no cb "
continue
if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue
## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary
## if not aln.flag.proper_pair: continue
if($args["--minMAPQ"] != "nil"):
mapq = parse_int($args["--minMAPQ"])
else: mapq = 20
if($args["--baseq"] != "nil"):
minbsq = parse_int($args["--baseq"])
else: minbsq = 13
if($args["--thetaREF"] != "nil"):
thetaRef = parse_float($args["--thetaREF"])
else: thetaRef = 0.1
if not barcodeTable.hasKey(cbt.get):
continue
var
off = aln.start.int
qoff = 0
roff_only = 0
base = 'n'
position = rec.POS.cint
over = 0
for event in aln.cigar:
var cons = event.consumes
#echo $cons
if cons.query:
qoff+=event.len ## the offs to the query sequences
if cons.reference:
off += event.len ## the offs to the reference sequences
if not cons.query:
roff_only += event.len
if off <= position:
continue
over = off - position
# get the base
base = aln.base_at(qoff - over-1)
break
if aln.base_quality_at(qoff - over-1).cint < minbsq:
continue
total_reads+=1
if alleleCountTable.hasKey(cbt.get):
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[cbt.get].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[cbt.get].inc_count_calt
continue
else:
var new_snp = allele_expr(cref:0,calt:0)
alleleCountTable[cbt.get] = new_snp
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[cbt.get].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[cbt.get].inc_count_calt
continue
if total_reads > maxTotalReads or total_reads < minTotalReads:
return initTable[string,allele_expr]()
return alleleCountTable
proc sscocaller(threads:int, vcff:string, barcodeFile:string,
bamfile:string, out_dir:string, mapq:int,
minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int,
thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq): int =
if($args["--thetaALT"] != "nil"):
thetaAlt = parse_float($args["--thetaALT"])
else: thetaAlt = 0.9
if($args["--cmPmb"] != "nil"):
cmPmb = parse_float($args["--cmPmb"])
else: cmPmb = 0.1
echo "min mapq ", mapq, " min base qual ", minbsq
if($args["--chrom"] != "nil"):
selectedChrs = $args["--chrom"]
else:
let a = toSeq(1..19)
if($args["--chrName"] == "nil"):
let b = map(a, proc (x: int): string = "" & $x)
let all_Chrs = concat(b,@["X","Y"])
selectedChrs = all_Chrs.join(",")
else:
let b = map(a, proc (x: int): string = "chr" & $x)
let all_Chrs = concat(b,@["chrX","chrY"])
selectedChrs = all_Chrs.join(",")
let s_Chrs = selectedChrs.split(',')
echo $s_Chrs
var
ibam:Bam
ivcf:VCF
if not open(ibam, bamfile, threads=threads, index = true):
quit "couldn't open input bam"
if not open(ivcf, vcff, threads=threads):
......@@ -170,107 +156,9 @@ proc sscocaller(argv: var seq[string]): int =
if $kstr.s[0] == "#":
continue
var v = $kstr.s
## barcodekey:spermIndex pair
discard barcodeTable.hasKeyOrPut(v, ithSperm)
ithSperm+=1
echo "fetch given SNPs in ", barcodeTable.len ," single cells"
type
AlleleExpr = object
cRef: int
cAlt: int
# Nucleotide = enum
# A, C, T, G
ViState = enum
stateRef, stateAlt, stateN
ViNode = object
pos: int
cRef: int
cAlt: int
pathScore: array[stateRef..stateAlt, float]
pathState: array[stateRef..stateAlt, ViState]
state: ViState
SpermViNodes = object
viNodeseq: seq[ViNode]
## look up from SNPIndex to current Sperm SNP index
snpIndexLookUp: Table[int,int]
## look up from current Sperm SNP index to SNPIndex
spermSnpIndexLookUp: Table[int,int]
ithSperm+=1
let initProb:array[stateRef..stateAlt, float]=[0.5,0.5]
var alexprSeq:seq[AlleleExpr]
proc getEmission(thetaRef=0.1,thetaAlt=0.9,
cRef:int,cAlt:int): array[stateRef..stateAlt, float] =
var emissionScore = [dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaRef,log=true),
dbinom(x=float(cAlt),size=(cRef+cAlt),prob=thetaAlt,log=true)]
return emissionScore
#echo getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=10,cAlt=0)
proc getTrans(pos1:int64,pos2:int64,cmPmb=0.1): float =
if pos2<pos1:
quit "Wrong order of snps"
var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb
return rec
#
proc countAllele(rec:Variant, ibam:Bam, maxTotalReads:int,
minTotalReads:int,
chrom:string, mapq: int,
barcodeTable:OrderedTableRef,minbsq:int): Table[string,allele_expr] =
var alleleCountTable = initTable[string,allele_expr]()
var rec_alt:char
var total_reads=0
rec_alt = rec.ALT[0][0]
for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint):
if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue
## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary
## if not aln.flag.proper_pair: continue
var cbt = tag[string](aln, "CB")
if cbt.isNone: continue
if not barcodeTable.hasKey(cbt.get): continue
var
off = aln.start.int
qoff = 0
roff_only = 0
base = 'n'
position = rec.POS.cint
over = 0
for event in aln.cigar:
var cons = event.consumes
if cons.query:
qoff+=event.len ## the offs to the query sequences
if cons.reference:
off += event.len ## the offs to the reference sequences
# if not cons.query:
# roff_only += event.len
if off <= position: continue
over = off - position
# get the base
#base = aln.base_at(qoff - over-1)
break
if aln.base_quality_at(qoff - over-1).cint < minbsq: continue
total_reads+=1
if alleleCountTable.hasKey(cbt.get):
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[cbt.get].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[cbt.get].inc_count_calt
continue
else:
var new_snp = allele_expr(cref:0,calt:0)
alleleCountTable[cbt.get] = new_snp
if aln.base_at(qoff - over-1) == rec.REF[0]:
alleleCountTable[cbt.get].inc_count_cref
continue
if aln.base_at(qoff - over-1) == rec_alt:
alleleCountTable[cbt.get].inc_count_calt
continue
if total_reads > maxTotalReads or total_reads < minTotalReads:
return initTable[string,allele_expr]()
return alleleCountTable
## iterate through each selected chromosome
for chrom in s_Chrs:
# var scTable = initTable[string,SingleSpermRec]()
......@@ -278,8 +166,7 @@ proc sscocaller(argv: var seq[string]): int =
var spermIndex = 0
## number of non zeros
var nnsize = 0
# var snpAnnoSeq:seq[SNPAnnot]
var scSpermSeq:seq[SpermViNodes]
var scSpermSeq:SeqSpermViNodes
## matches with the order in barcodeTable
scSpermSeq.setLen(barcodeTable.len)
var outFileSNPanno:FileStream
......@@ -338,7 +225,6 @@ proc sscocaller(argv: var seq[string]): int =
nnsize += 1
var emissionArray = getEmission(thetaRef=thetaRef,thetaAlt=thetaAlt,cRef=ac.cRef,cAlt=ac.cAlt)
if scSpermSeq[ithSperm].viNodeseq.len==0:
echo "first node to ", ithSperm
var currentViNode = ViNode()
currentViNode.pathScore[stateRef] = math.ln(initProb[stateRef])+emissionArray[stateRef]
currentViNode.pathScore[stateAlt] = math.ln(initProb[stateAlt])+emissionArray[stateAlt]
......@@ -353,7 +239,6 @@ proc sscocaller(argv: var seq[string]): int =
scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len
scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex
else:
#echo "additional node to a cell"
let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)]
var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb))
......@@ -430,8 +315,6 @@ proc sscocaller(argv: var seq[string]): int =
transProb = getTrans(scSpermSeq[ithSperm].viNodeseq[^(i+1)].pos,
scSpermSeq[ithSperm].viNodeseq[^(i)].pos,
cmPmb=cmPmb)
# echo inferProb
# echo reverseProb
if scSpermSeq[ithSperm].viNodeseq[^(i+1)].state == stateRef:
outFileVStateMtx.writeLine($ithSNP & " " & $(ithSperm+1) & " 1")
if state == stateRef:
......@@ -448,8 +331,6 @@ proc sscocaller(argv: var seq[string]): int =
inferProb+=leftGap[0]
reverseProb+=leftGap[1]
viSegmentInfo.writeLine("ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(inferProb-reverseProb) & " " & $cSNP & " 2")
#echo "ithSperm" & $ithSperm & " " & $posStart & " " & $posEnd & " " & $(reverseProb/inferProb) & " " & $cSNP
#echo $(inferProb/reverseProb)
transitFlag = true
cSNP = 1
rightGap = leftGap
......@@ -524,6 +405,133 @@ proc sscocaller(argv: var seq[string]): int =
ivcf.close()
return 0
when(isMainModule):
let version = "0.1.0"
var doc = format("""
$version
Usage:
sscocaller [options] <BAM> <VCF> <barcodeFile> <out_prefix>
Arguments:
<BAM> the read alignment file with records of single-cell DNA reads
<VCF> the variant call file with records of SNPs
<barcodeFile> the text file containing the list of cell barcodes
<out_prefix> the prefix of output files
Options:
-t --threads <threads> number of BAM decompression threads [default: 4]
-MQ --minMAPQ <mapq> Minimum MAPQ for read filtering [default: 20]
-BQ --baseq <baseq> base quality threshold for a base to be used for counting [default: 13]
-CHR --chrom <chrom> the selected chromsome (whole genome if not supplied,separate by comma if multiple chroms)
-minDP --minDP <minDP> the minimum DP for a SNP to be included in the output file [default: 1]
-maxDP --maxDP <maxDP> the maximum DP for a SNP to be included in the output file [default: 5]
-maxTotalDP --maxTotalDP <maxTotalDP> the maximum DP across all barcodes for a SNP to be included in the output file [default: 25]
-minTotalDP --minTotalDP <minTotalDP> the minimum DP across all barcodes for a SNP to be included in the output file [default: 10]
-chrName --chrName <chrName> the chr names with chr prefix or not, if not supplied then no prefix
-thetaREF --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1]
-thetaALT --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
-cmPmb --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default 0.1]
-h --help show help
Examples
./sscocaller --threads 10 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp-
""" % ["version", version])
let args = docopt(doc, version=version)
var
threads:int
vcff:string
barcodeFile:string
bamfile:string
selectedChrs:string
out_dir:string
mapq:int
minbsq:int
mintotal:int
maxtotal:int
mindp:int
maxdp:int
thetaREF:float
thetaALT:float
cmPmb:float
if($args["--threads"] != "nil"):
threads = parse_int($args["--threads"])
else: threads = 4
if($args["--minDP"] != "nil"):
mindp = parse_int($args["--minDP"])
else: mindp = 1
if($args["--maxDP"] != "nil"):
maxdp = parse_int($args["--maxDP"])
else: maxdp = 10
if($args["--maxTotalDP"] != "nil"):
maxtotal = parse_int($args["--maxTotalDP"])
else: maxtotal = 30
if($args["--minTotalDP"] != "nil"):
mintotal = parse_int($args["--minTotalDP"])
else: mintotal = 10
if($args["<BAM>"] == "nil"):
quit "input bam is required"
else: bamfile = $args["<BAM>"]
if($args["<barcodeFile>"] == "nil"):
quit "input barcodeFile is required"
else: barcodeFile = $args["<barcodeFile>"]
if($args["<out_prefix>"] == "nil"):
quit "output prefix is required"
else: out_dir = $args["<out_prefix>"]
if($args["<VCF>"] == "nil"):
quit "input VCF file is required"
else: vcff = $args["<VCF>"]
if($args["--minMAPQ"] != "nil"):
mapq = parse_int($args["--minMAPQ"])
else: mapq = 20
if($args["--baseq"] != "nil"):
minbsq = parse_int($args["--baseq"])
else: minbsq = 13
if($args["--thetaREF"] != "nil"):
thetaRef = parse_float($args["--thetaREF"])
else: thetaRef = 0.1
if($args["--thetaALT"] != "nil"):
thetaAlt = parse_float($args["--thetaALT"])
else: thetaAlt = 0.9
if($args["--cmPmb"] != "nil"):
cmPmb = parse_float($args["--cmPmb"])
else: cmPmb = 0.1
if($args["--chrom"] != "nil"):
selectedChrs = $args["--chrom"]
else:
let a = toSeq(1..19)
if($args["--chrName"] == "nil"):
let b = map(a, proc (x: int): string = "" & $x)
let all_Chrs = concat(b,@["X","Y"])
selectedChrs = all_Chrs.join(",")
else:
let b = map(a, proc (x: int): string = "chr" & $x)
let all_Chrs = concat(b,@["chrX","chrY"])
selectedChrs = all_Chrs.join(",")
var args = commandLineParams()
discard sscocaller(args)
let s_Chrs = selectedChrs.split(',')
#echo $s_Chrs
#var args = commandLineParams()
discard sscocaller(threads, vcff, barcodeFile, bamfile,
out_dir, mapq, minbsq, mintotal,
maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment