From 194ef4cad22939aae4c1ffa1568b4eb6228f43ec Mon Sep 17 00:00:00 2001 From: rlyu <rlyu@svi.edu.au> Date: Mon, 20 Sep 2021 13:49:50 +1000 Subject: [PATCH] adding block phase module --- private/blocks_utils.nim | 150 ++++++++++++++++++++++ private/fragments.nim | 2 +- private/graph.nim | 12 +- private/phase_blocks.nim | 263 +++++++++++++++++++++++++++++++++++++++ private/readfmf.nim | 43 +++++++ private/utils.nim | 22 ++-- sgcocaller.nim | 209 +++++++++++++++++++++---------- 7 files changed, 614 insertions(+), 87 deletions(-) create mode 100755 private/blocks_utils.nim create mode 100755 private/phase_blocks.nim create mode 100755 private/readfmf.nim diff --git a/private/blocks_utils.nim b/private/blocks_utils.nim new file mode 100755 index 0000000..6c3dbc1 --- /dev/null +++ b/private/blocks_utils.nim @@ -0,0 +1,150 @@ +import streams +import strutils +import math + +let block_sep = "********" +let split_threshold = 30.0 +let mis_threshold = 30.0 +#let version = "v0.3.3" + +type + Block* = ref object + # the positions of SNP (index position from provided VCF) + snpPos*: seq[int] + # number of SNPs it spans + length*: int + # number of SNPs phased + phased*: int + # number of base pairs it spans + # span + span*: int + # number of fragments + fragments*: int + # haplotype type 0, its bit-wise complementary is h1 + h0*: seq[int] + switch_error*: seq[float] + mismatch_error*: seq[float] + # 1 switch, 0 no switch comparing to the prev block, -1 means unlinked + switch: int + # int frags; + # float SCORE, bestSCORE, lastSCORE; + # int* slist; // ordered list of variants in this connected component + # int lastvar; // index of the first and last variants in this connected component + # int iters_since_improvement; + + +type Block_linkage* = ref object + prevBlock*: int + nextBlock*: int + linkageType_00* :int + linkageType_01*: int + switch_posterior_prob*: float + switch_confidence*: float + switch*: int + + +proc cal_posterior*(error_rate = 0.1, n_snps:int, n_mis:int): float = + var ll_h0 = (1-error_rate)^(n_snps-n_mis)*error_rate^n_mis + var ll_h1 = (1-error_rate)^(n_mis)*error_rate^(n_snps-n_mis) + var new_p = max(ll_h0/(ll_h0+ll_h1),ll_h1/(ll_h0+ll_h1)) + return new_p + +proc cal_switch_posterior*(error_rate = 0.1, n_blocks:int, n_sw:int): float = + var ll_01 = error_rate^(n_blocks-n_sw)*(1-error_rate)^n_sw + var ll_00 = error_rate^(n_sw)*(1-error_rate)^(n_blocks-n_sw) + return ll_01/(ll_01+ll_00) +proc is_block_header(line_string:string): bool = + if line_string.splitWhitespace()[0] == "BLOCK:": + # .len == 11: + return true + else: + return false + +## to honor the current format of HAPCUT2 the long blocks are not splitted +## We look at the 10th column: switch error score to split the blocks +proc count_blocks(block_file:string, by_header = false): int = + var blocks = 1 + var bfs: FileStream + var switch_error: float + try: + bfs = openFileStream(block_file, fmRead) + except: + stderr.write getCurrentExceptionMsg() + var current_line: string + while not bfs.atEnd(): + current_line = bfs.readLine() + if current_line.splitWhitespace()[0] == block_sep: + continue + if by_header: + if current_line.is_block_header: + blocks.inc + continue + else: + if current_line.is_block_header: + continue + switch_error = parseFloat(current_line.splitWhitespace()[9]) + if switch_error < split_threshold : + blocks.inc + bfs.close() + if by_header: + return blocks - 1 + else: + return blocks + +proc add_record(line_record: string, current_block:Block):int = + var line_record_seq = line_record.splitWhitespace() + let snpPos = parseInt(line_record_seq[0]) + let switch_error = parseFloat(line_record_seq[9]) + let mismatch_error = parseFloat(line_record_seq[10]) + var h0: int + try: + h0 = parseInt(line_record_seq[1]) + except: + h0 = -1 +# let discrete_prune = parseInt(line_record_seq[8]) + current_block.snpPos.add(snpPos) + if mismatch_error < mis_threshold: + h0 = -1 + current_block.h0.add(h0) + current_block.switch_error.add(switch_error) + current_block.mismatch_error.add(mismatch_error) + +proc read_blocks*(block_file:string, by_header = false):seq[Block] = + # start with block 0 + var b_j = 0 + var bfs: FileStream + var current_line: string + var blockSeq = newSeq[Block]() + var line_index = 0 + var switch_error: float + try: + bfs = openFileStream(block_file, fmRead) + except: + stderr.write getCurrentExceptionMsg() + ## First line in block file, not used, a block header line + current_line = bfs.readLine() + var currentBlock = Block() + while not bfs.atEnd(): + current_line = bfs.readLine() + if current_line.splitWhitespace()[0] == block_sep: + continue + if by_header: + if current_line.is_block_header: + blockSeq.add(currentBlock) + currentBlock = Block() + continue + else: + ## add current line of record to this block + discard add_record(current_line,currentBlock) + else: + if current_line.is_block_header: + continue + else: + switch_error = parseFloat(current_line.splitWhitespace()[9]) + if switch_error < split_threshold : + blockSeq.add(currentBlock) + currentBlock = Block() + discard add_record(current_line,currentBlock) + blockSeq.add(currentBlock) + bfs.close() + return blockSeq diff --git a/private/fragments.nim b/private/fragments.nim index a6f2464..7eac2de 100755 --- a/private/fragments.nim +++ b/private/fragments.nim @@ -6,7 +6,7 @@ import streams # minSnpDepth, this SNP should at least be covered by two cells proc findGtNodes*(rec:Variant, variantIndex: int, ibam:Bam, maxTotalReads:int, minTotalReads:int, mapq: int, - barcodeTable:OrderedTableRef, + barcodeTable:TableRef, minbsq:int, barcodeTag:string, minCellDp:int, minSnpDepth:int ): Table[string,GtNode] = diff --git a/private/graph.nim b/private/graph.nim index bea952e..fe02980 100755 --- a/private/graph.nim +++ b/private/graph.nim @@ -16,7 +16,7 @@ type SeqSpermViNodes* = seq[SpermViNodes] -proc addViNode*(barcodeTable: OrderedTableRef, +proc addViNode*(barcodeTable: TableRef, alleleCountTable: Table[string,allele_expr], scSpermSeq: var SeqSpermViNodes, outFileTotalCountMtx: var FileStream, @@ -27,7 +27,7 @@ proc addViNode*(barcodeTable: OrderedTableRef, snpIndex: int, thetaRef: float, thetaAlt: float, - rec: Variant, + rec_pos: int, initProb: array, cmPmb: float): int = for bc, ac in alleleCountTable.pairs: @@ -47,7 +47,7 @@ proc addViNode*(barcodeTable: OrderedTableRef, currentViNode.pathState[stateRef] = stateN currentViNode.pathState[stateAlt] = stateN currentViNode.state = stateN - currentViNode.pos = int(rec.POS) + currentViNode.pos = int(rec_pos) currentViNode.cAlt = int(ac.cAlt) currentViNode.cRef = int(ac.cRef) @@ -56,8 +56,8 @@ proc addViNode*(barcodeTable: OrderedTableRef, scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex else: let preVNode = scSpermSeq[ithSperm].viNodeseq[high(scSpermSeq[ithSperm].viNodeseq)] - var ltransProb = math.ln(getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) - var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec.POS),cmPmb=cmPmb)) + var ltransProb = math.ln(getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) + var lnoTransProb = math.ln(1-getTrans(preVNode.pos,int(rec_pos),cmPmb=cmPmb)) var currentViNode = ViNode() # ref/alt -> ref var refTref = preVNode.pathScore[stateRef] + lnoTransProb @@ -82,7 +82,7 @@ proc addViNode*(barcodeTable: OrderedTableRef, currentViNode.pathScore[stateRef] += emissionArray[stateRef] currentViNode.cAlt = int(ac.cAlt) currentViNode.cRef = int(ac.cRef) - currentViNode.pos = int(rec.POS) + currentViNode.pos = int(rec_pos) scSpermSeq[ithSperm].viNodeseq.add(currentViNode) scSpermSeq[ithSperm].snpIndexLookUp[snpIndex] = scSpermSeq[ithSperm].viNodeseq.len scSpermSeq[ithSperm].spermSnpIndexLookUp[scSpermSeq[ithSperm].viNodeseq.len] = snpIndex diff --git a/private/phase_blocks.nim b/private/phase_blocks.nim new file mode 100755 index 0000000..9f2b2db --- /dev/null +++ b/private/phase_blocks.nim @@ -0,0 +1,263 @@ +# phase blocks from cell fmf genotypes + +import blocks_utils +#import readfmf +import tables +import math +import sequtils +import hts + +let n_anchor_snps = 9 +let posterior_threshold = 0.99 +let confidence_threshold = 0.95 +let debug = false + +proc match_phase(cell_geno:seq[int], block_h0:seq[int]): int = + if cell_geno.len != block_h0.len: + quit "the haplotypes lengths should be equal" & " " & $cell_geno.len & " " & $cell_geno.len + var diff = map(toSeq(0..high(cell_geno)), proc (x:int):int = abs(cell_geno[x]-block_h0[x])) + #echo "matching " & $cell_geno & " to " & $block_h0 + var posterior_phase = cal_posterior(n_snps = cell_geno.len, n_mis = sum(diff)) + ## unphased + if cell_geno.len < n_anchor_snps: + if debug: echo "returned -1 because too few SNPs in the block covered by this cell. " & $diff.len + return -1 + if float(sum(diff)) > (n_anchor_snps+1)/2: + if posterior_phase > posterior_threshold: + if debug: echo "returned 1 with diffs to h0 : " & $sum(diff) & " of " & $diff.len & ": posterior probs = " & $posterior_phase + return 1 + else: + if debug: echo "returned -1 because posterior prob is not high enough with diffs to h0 : " & $sum(diff) & " of " & $diff.len & ": posterior probs = " & $posterior_phase + return -1 + else: + if posterior_phase > posterior_threshold: + if debug: echo "returned 0 with diffs to h0 : " & $sum(diff) & " of " & $diff.len & ": posterior probs = " & $posterior_phase + return 0 + else: + if debug: echo "returned -1 because posterior prob is not high enough with diffs to h0 : " & $sum(diff) & " of " & $diff.len & ": posterior probs = " & $posterior_phase + return -1 + +proc block_phase_in_cells*(cellGenoTable:TableRef, block_pos: seq[int], block_h0: seq[int], left:bool, cellBlockPhaseTable: TableRef[string, seq[int]]): int = + var n_covered_snps = 0 + var covered_snps_cell_geno: seq[int] + var covered_snps_block_geno: seq[int] + var block_phase = 0 + for barcode in cellGenoTable.keys(): + n_covered_snps = 0 + covered_snps_cell_geno = newSeq[int]() + covered_snps_block_geno = newSeq[int]() + if left: + if debug: echo "Phasing left of the block for cell " & $barcode + for pos_i in 0..high(block_pos): + if cellGenoTable[barcode].hasKey($block_pos[pos_i]) and block_h0[pos_i] != -1: + #echo "using " & $block_pos[pos_i] & "for cell " & barcode + n_covered_snps.inc + covered_snps_cell_geno.add(cellGenoTable[barcode][$block_pos[pos_i]]) + covered_snps_block_geno.add(block_h0[pos_i]) + if n_covered_snps > n_anchor_snps: + break + else: + if debug: echo "Phasing right of the block for cell " & $barcode + # high(block_pos)..0 + for pos_i in countdown((block_pos.len - 1),0): + if cellGenoTable[barcode].hasKey($block_pos[pos_i]) and block_h0[pos_i] != -1: + #echo "using " & $block_pos[pos_i] & "for cell " & barcode + n_covered_snps.inc + covered_snps_cell_geno.add(cellGenoTable[barcode][$block_pos[pos_i]]) + covered_snps_block_geno.add(block_h0[pos_i]) + if n_covered_snps > n_anchor_snps: + break + block_phase = match_phase(covered_snps_cell_geno,covered_snps_block_geno) + if not cellBlockPhaseTable.hasKey(barcode): + cellBlockPhaseTable[barcode] = newSeq[int]() + cellBlockPhaseTable[barcode].add(block_phase) +proc find_contig_blocks*(blocks:seq[Block],cellBlockLeftPhaseTable: TableRef[string, seq[int]],cellBlockRightPhaseTable: TableRef[string, seq[int]] ):seq[int] = + var contig_blocks = newSeq[int]() + var count_missing = newSeqWith[blocks.len,0] + var contig_block_allow_missing = int(ceil(float(cellBlockLeftPhaseTable.len) * 0.1)) + # for b in 0..high(blocks): + var left_right_phase: seq[tuple[left:int,right:int]] + for bc in cellBlockLeftPhaseTable.keys(): + left_right_phase = zip(cellBlockLeftPhaseTable[bc],cellBlockRightPhaseTable[bc]) + if debug: echo bc & "\t" & $left_right_phase + for bl in 0..high(left_right_phase): + if left_right_phase[bl][0] == -1 or left_right_phase[bl][1] == -1 : + count_missing[bl].inc + if debug: echo "block missing in X number of cells:" & $count_missing + for i, c in count_missing: + if c <= contig_block_allow_missing: contig_blocks.add(i) + return contig_blocks + +## find linkage of two blocks +## +proc find_linkage*(block1: int, block2: int, cellBlockLeftPhaseTable: TableRef[string, seq[int]],cellBlockRightPhaseTable: TableRef[string, seq[int]]): Block_linkage= + if debug: echo "linking " & $block1 & "and" & $block2 + var left_right_phase: seq[tuple[left:int,right:int]] + var linkString: string + if not (block1 < block2): + quit "block1 needs to be left of block2" + var blockLinkage = Block_linkage(prevBlock :block1, nextBlock: block2, switch: -1) + for bc in cellBlockLeftPhaseTable.keys(): + left_right_phase = zip(cellBlockLeftPhaseTable[bc],cellBlockRightPhaseTable[bc]) + if debug: echo $left_right_phase[block1].right & $left_right_phase[block2].left + linkString = $left_right_phase[block1].right & $left_right_phase[block2].left + if linkString in @["00","11"]: + blockLinkage.linkageType_00.inc + elif linkString in @["01","10"]: + blockLinkage.linkageType_01.inc + blockLinkage.switch_posterior_prob = cal_switch_posterior(error_rate = 0.1, (blockLinkage.linkageType_00 + blockLinkage.linkageType_01), blockLinkage.linkageType_01) + blockLinkage.switch_confidence = log10(blockLinkage.switch_posterior_prob) - log10(1-blockLinkage.switch_posterior_prob) + return blockLinkage + +# contig_blocks, the blocks to build contigs. Rest of the blocks are filled in if can +proc build_contig*(contig_blocks:seq[int], cellBlockLeftPhaseTable: TableRef[string, seq[int]],cellBlockRightPhaseTable: TableRef[string, seq[int]] ):seq[Block_linkage] = + var blockLinkageSeq: seq[Block_linkage] + var blockLinkage:Block_linkage + for b in 0..(contig_blocks.len-2): + blockLinkage = find_linkage(block1=contig_blocks[b], block2=contig_blocks[b+1],cellBlockLeftPhaseTable,cellBlockRightPhaseTable) + if blockLinkage.switch_confidence < 0.0: + blockLinkage.switch = 0 + else: + blockLinkage.switch = 1 + blockLinkageSeq.add(blockLinkage) + return blockLinkageSeq + +proc infer_block_linkage_to_one_contig_block*(contig_block:int,to_infer_block:int, + cellBlockLeftPhaseTable: TableRef[string, seq[int]], + cellBlockRightPhaseTable: TableRef[string, seq[int]], + at_left: bool):int = + if at_left: + if debug: echo "linking " & $to_infer_block & "to contig block " & $contig_block & " from left" + var blockLinkage = Block_linkage(prevBlock : to_infer_block, nextBlock :contig_block, switch : -1) + blockLinkage = find_linkage(block1 = to_infer_block, block2 = contig_block, cellBlockLeftPhaseTable,cellBlockRightPhaseTable) + if abs(blockLinkage.switch_confidence) < confidence_threshold: + return -1 + elif blockLinkage.switch_confidence > 0: + return 1 + else: + return 0 + else: + if debug: echo "linking " & $to_infer_block & "to contig block " & $contig_block & " from right" + var blockLinkage = Block_linkage(prevBlock: contig_block, nextBlock : to_infer_block , switch: -1) + blockLinkage = find_linkage(block1 = contig_block, block2 = to_infer_block, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) + if abs(blockLinkage.switch_confidence) < confidence_threshold: + return -1 + elif blockLinkage.switch_confidence > 0: + return 1 + else: + return 0 + +proc infer_non_contig_blocks*(contig_blocks:seq[int], linkedContigs:seq[Block_linkage], blocks:seq[Block],cellBlockLeftPhaseTable: TableRef[string, seq[int]],cellBlockRightPhaseTable: TableRef[string, seq[int]]):seq[int] = + ## time to gather the non_contig forming blocks and fill their phase: + ## Anchor using the first block in the contig blocks + var hap_sw = 0 + var hap_sw_left = 0 + var hap_sw_right = 0 + + var blocks_phase = newSeqWith(blocks.len,-1) + var left_contig_block:int + var right_contig_block: int + blocks_phase[contig_blocks[0]] = 0 + for lc in linkedContigs: + if lc.switch == 0: + if debug: echo "blocks_phase[lc.nextBlock] " & $blocks_phase[lc.nextBlock] + + blocks_phase[lc.nextBlock] = blocks_phase[lc.prevBlock] + else: + blocks_phase[lc.nextBlock] = (1 xor blocks_phase[lc.prevBlock]) + if debug: echo "block contig haplotype : " & $blocks_phase + for b_j in 0..(blocks.len-1): + if not (b_j in contig_blocks): + if b_j < contig_blocks[0]: + hap_sw = infer_block_linkage_to_one_contig_block(contig_block = contig_blocks[0], to_infer_block = b_j, cellBlockLeftPhaseTable,cellBlockRightPhaseTable, at_left=true) + if hap_sw == 1: + blocks_phase[b_j] = (1 xor contig_blocks[0]) + elif hap_sw == 0: + blocks_phase[b_j] = contig_blocks[0] + elif b_j > contig_blocks[contig_blocks.len-1]: + hap_sw = infer_block_linkage_to_one_contig_block(contig_block = contig_blocks[contig_blocks.len-1], to_infer_block = b_j,cellBlockLeftPhaseTable,cellBlockRightPhaseTable,at_left=false) + if hap_sw == 1: + blocks_phase[b_j] = (1 xor contig_blocks[0]) + elif hap_sw == 0: + blocks_phase[b_j] = contig_blocks[0] + else: + ## in between two contig blocks + left_contig_block = b_j + while not (left_contig_block in contig_blocks): + left_contig_block = left_contig_block - 1 + right_contig_block = b_j + while not (right_contig_block in contig_blocks): + right_contig_block = right_contig_block+1 + hap_sw_left = infer_block_linkage_to_one_contig_block(contig_block = left_contig_block, to_infer_block = b_j, cellBlockLeftPhaseTable,cellBlockRightPhaseTable, at_left=false) + hap_sw_right = infer_block_linkage_to_one_contig_block(contig_block = right_contig_block, to_infer_block = b_j, cellBlockLeftPhaseTable,cellBlockRightPhaseTable, at_left=true) + if blocks_phase[left_contig_block] != blocks_phase[right_contig_block]: + if hap_sw_right == -1: + if hap_sw_left == 1: + blocks_phase[b_j] = blocks_phase[right_contig_block] + elif hap_sw_left == 0: + blocks_phase[b_j] = blocks_phase[left_contig_block] + elif hap_sw_right == 0 and (hap_sw_left == 1 or hap_sw_left == -1): + blocks_phase[b_j] = blocks_phase[right_contig_block] + elif hap_sw_right == 1 and (hap_sw_left == 0 or hap_sw_left == -1): + blocks_phase[b_j] = blocks_phase[left_contig_block] + else: continue + return blocks_phase + +proc write_linked_blocks*(vcfFile:string, outFile: string, blocks:seq[Block], blocks_phase:seq[int], threads:int):int = + var ivcf: VCF + var ovcf: VCF + var v_off = 0 + var current_block = 0 + if not open(ivcf, vcfFile, threads=threads): + quit "couldn't open input vcf file" + if not open(ovcf, outFile, threads=threads, mode ="w"): + quit "couldn't open output vcf file" + ovcf.header = ivcf.header + discard ovcf.header.add_string("""##sgcocaller_v0.1=phaseBlocks""") + discard ovcf.write_header() + var gt_string:seq[int32] + var block_pos_i = 0 + for v in ivcf.query("*"): + v_off.inc + if blocks_phase[current_block] == -1: + current_block.inc + if current_block == blocks.len: + break + block_pos_i = 0 + continue + if v_off != blocks[current_block].snpPos[block_pos_i]: + continue + else: ## equal + var f = v.format() + if blocks_phase[current_block] == 0: + # don't need to flip + if blocks[current_block].h0[block_pos_i] == 0: + ## 0|1 + if debug: echo "block hap0 and this h0 is 0 0|1" + gt_string = @[int32(2),int32(5)] + else: + ## 1|0 + if debug: echo "block hap0 and this h0 is 1 1|0" + gt_string = @[int32(4),int32(3)] + elif blocks_phase[current_block] == 1: + if blocks[current_block].h0[block_pos_i] == 0: + ## 1|0 + if debug: echo "block hap1 and this h0 is 0 1|0" + gt_string = @[int32(4),int32(3)] + else: + ## 0|1 + if debug: echo "block hap1 and this h0 is 1 0|1" + gt_string = @[int32(2),int32(5)] + if f.set("GT",gt_string) != Status.OK: + quit "set GT failed" + if not ovcf.write_variant(v) : + quit "write vcf failed for " & $voff + block_pos_i.inc + if block_pos_i == blocks[current_block].snpPos.len: + current_block.inc + if current_block == blocks.len: + break + block_pos_i = 0 + ovcf.close() + ivcf.close() + diff --git a/private/readfmf.nim b/private/readfmf.nim new file mode 100755 index 0000000..6a4e50c --- /dev/null +++ b/private/readfmf.nim @@ -0,0 +1,43 @@ +## read fmf +## barcode -> snpPos: geno +import tables +import streams +import strutils + +proc readFmf*(frag_file:string,cellGenoTable:TableRef):int = + var fmf: FileStream + var n_blocks,line_index,b_pos : int + var current_frag, barcode, b_geno: string + var current_frag_seq: seq[string] + try: + fmf = openFileStream(frag_file, fmRead) + except: + stderr.write getCurrentExceptionMsg() + while not fmf.atEnd(): + current_frag = fmf.readLine() + current_frag_seq = current_frag.splitWhitespace() + n_blocks = parseInt(current_frag_seq[0]) + barcode = current_frag_seq[1].split('.')[0] + for b in 1..n_blocks: + b_pos = parseInt(current_frag_seq[(b)*2]) + b_geno = current_frag_seq[(b)*2+1] + for i, g in b_geno: + if cellGenoTable.hasKey(barcode): + discard cellGenoTable[barcode].hasKeyOrPut($(b_pos+i), parseInt($g)) + else: + var geno_table = newTable[string,int]() + geno_table[$(b_pos+i)] = parseInt($g) + cellGenoTable[barcode] = geno_table + line_index.inc + echo "total fmfs " & $line_index + fmf.close() + + +# discard readFmf("data/WC_CNV_42/fmf_snpdepth1/WC_CNV_42.chr16.fmf",cellGenoTable) +# for k in cellGenoTable.keys(): +# echo k +# break +# echo cellGenoTable.len + +# for key,value in cellGenoTable["TGTATTCAGGACAGCT-1"].pairs(): +# echo $key & "\t" & $value diff --git a/private/utils.nim b/private/utils.nim index be82eb0..a4843a1 100755 --- a/private/utils.nim +++ b/private/utils.nim @@ -26,7 +26,7 @@ type counter*: int node1*: GtNode node2*: GtNode -proc get_base_offset*(position:cint, align: Record): int = +proc get_base_offset*(position:int, align: Record): int = var off = align.start.int qoff = 0 @@ -66,17 +66,18 @@ proc getTrans*(pos1:int64,pos2:int64,cmPmb=0.1): float = var rec = 1-math.exp(-float(pos2-pos1)*1e-8*cmPmb) # for autosomes 1*cmPbm cM per Mb return rec # -proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int, +proc countAllele*(ibam:Bam, maxTotalReads:int, minTotalReads:int, chrom:string, mapq: int, - barcodeTable:OrderedTableRef,minbsq:int,bulkBam:bool,barcodeTag:string): Table[string,allele_expr] = + barcodeTable:TableRef,minbsq:int,bulkBam:bool,barcodeTag:string,startPos:int, stopPos:int, + rec_alt:char, rec_ref:char): Table[string,allele_expr] = var alleleCountTable = initTable[string,allele_expr]() - var rec_alt:char + #var rec_alt:char var total_reads = 0 var base_off: int var base: char - rec_alt = rec.ALT[0][0] - for aln in ibam.query(chrom = chrom,start = rec.POS.cint-1, stop = rec.POS.cint): + #rec_alt = rec.ALT[0][0] + for aln in ibam.query(chrom = chrom,start = startPos, stop = stopPos): var cbt = tag[string](aln, barcodeTag) var currentCB: string if cbt.isNone: @@ -89,15 +90,14 @@ proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int, if aln.flag.unmapped or aln.mapping_quality.cint < mapq or aln.flag.dup: continue ## skip unmapped, duplicated, mapq low reads or aln.flag.secondary or aln.flag.supplementary ## if not aln.flag.proper_pair: continue - if not barcodeTable.hasKey(currentCB): - continue - base_off = get_base_offset(position = rec.POS.cint, align = aln) + if not barcodeTable.hasKey(currentCB): continue + base_off = get_base_offset(position = stopPos, align = aln) base = aln.base_at(base_off) if aln.base_quality_at(base_off).cint < minbsq: continue total_reads+=1 if alleleCountTable.hasKey(currentCB): - if aln.base_at(base_off) == rec.REF[0]: + if aln.base_at(base_off) == rec_ref: alleleCountTable[currentCB].inc_count_cref continue if aln.base_at(base_off) == rec_alt: @@ -106,7 +106,7 @@ proc countAllele*(rec:Variant, ibam:Bam, maxTotalReads:int, else: var new_snp = allele_expr(cref:0,calt:0) alleleCountTable[currentCB] = new_snp - if aln.base_at(base_off) == rec.REF[0]: + if aln.base_at(base_off) == rec_ref: alleleCountTable[currentCB].inc_count_cref continue if aln.base_at(base_off) == rec_alt: diff --git a/sgcocaller.nim b/sgcocaller.nim index d381011..fb06183 100755 --- a/sgcocaller.nim +++ b/sgcocaller.nim @@ -14,8 +14,12 @@ import math import streams import private/graph import private/findpath +import private/blocks_utils +import private/readfmf +import private/phase_blocks -proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:OrderedTableRef, outfmf:FileStream,maxTotalReads:int, + +proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outfmf:FileStream,maxTotalReads:int, minTotalReads:int, mapq: int,minbsq:int,mindp:int,barcodeTag:string, minsnpdepth:int):int = var variantIndex = 1 var cellFragmentsTable = newTable[string,Fragment]() @@ -30,11 +34,46 @@ proc getfmf(ibam:Bam, ivcf:VCF, barcodeTable:OrderedTableRef, outfmf:FileStream, variantIndex = variantIndex + 1 echo "processed " & $(variantIndex) & " variants" return 0 +proc phaseBlocks(fmf_file:string, inputVCF:string, outputVCF:string, block_hapfile:string, threads:int):int = + var cellGenoTable: TableRef[string,TableRef[string,int]] + var cellBlockLeftPhaseTable,cellBlockRightPhaseTable :TableRef[string, seq[int]] + var chr_blocks:seq[Block] + var blockkseq:seq[Block_linkage] + var phased_blocks: seq[int] + echo "Phasing for hapfile " & block_hapfile + cellGenoTable = newTable[string,TableRef[string,int]]() + discard readFmf(fmf_file,cellGenoTable) + chr_blocks = read_blocks(block_hapfile) + cellBlockLeftPhaseTable = newTable[string, seq[int]]() + cellBlockRightPhaseTable = newTable[string, seq[int]]() + ## blocks at the ends with 20 SNPs or fewer, removed? + ## blocks not at the ends with 20SNPs or fewer are not used for linking blocks but if their linkage with pre block and post-block is consistent, + ## these short blocks' haplotypes are retained. + ## + ## What if blocks cannot be linked ? the number of 11(00) == 10(01) + echo "Total blocks " & $chr_blocks.len + var contig_blocks:seq[int] + for b_j, bl in chr_blocks: + echo "block " & $b_j & " len: " & $bl.h0.len + # echo "last 5" & $bl.h0[(high(bl.h0)-5)..high(bl.h0)] + # echo "first 5" & $bl.h0[0..5] + discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, true, cellBlockLeftPhaseTable) + discard block_phase_in_cells(cellGenoTable, bl.snpPos, bl.h0, false, cellBlockRightPhaseTable) + contig_blocks = find_contig_blocks(chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) + # echo "blocks for contigs" & $contig_blocks + blockkseq = build_contig(contig_blocks = contig_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) + for bk in blockkseq: + echo "prevBlock" & $bk.prevBlock & "nextBlock" & $bk.nextBlock & " 00 type " & $ $bk.linkageType_00 & " 01 type " & $bk.linkageType_01 & " switch posterior prob " & $bk.switch_posterior_prob & " switch value :" & $bk.switch + phased_blocks = infer_non_contig_blocks(contig_blocks, blockkseq, chr_blocks, cellBlockLeftPhaseTable, cellBlockRightPhaseTable) + + # echo phased_blocks -proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:OrderedTableRef, + discard write_linked_blocks(inputVCF,outputVCF, blocks = chr_blocks, blocks_phase = phased_blocks, threads= threads) + return 0 +proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef, ibam:Bam, out_dir:string, mapq:int, minbsq:int, mintotal:int, maxtotal:int, mindp:int, maxdp:int, - thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string): int = + thetaREF:float, thetaALT:float, cmPmb:float,s_Chrs:seq,barcodeTag:string,phased:bool): int = var bulkBam = false if barcodeTable.len == 1 and barcodeTable.hasKey("bulk"): @@ -70,21 +109,31 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:OrderedTableRef, outFileStream.writeLine(' '.repeat(50)) outFileSNPanno.writeLine(join(["POS","REF", "ALT"], sep = "\t")) - + var ac = new_seq[int32](10) for rec in ivcf.query(chrom): if rec.ALT.len > 1 or rec.ALT.len == 0 : continue ## alleleCountTable contains for this rec.POS, each cell barcode's allele counts - var alleleCountTable = countAllele(rec=rec, ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, - maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag) + var rec_alt = rec.ALT[0][0] + var rec_ref = rec.REF[0] + if phased: + var f = rec.format() + var gts = f.genotypes(ac) + if ac[0] == 4 and ac[1] == 3: + # gt is 1|0 + rec_alt = rec.REF[0] + rec_ref = rec.ALT[0][0] + # if not gt == 0|1 continue + elif not (ac[0] == 2 and ac[1] == 5): continue + var alleleCountTable = countAllele(ibam=ibam, chrom=chrom, mapq=mapq, barcodeTable=barcodeTable,minbsq=minbsq, + maxTotalReads = maxtotal, minTotalReads = mintotal,bulkBam = bulkBam, barcodeTag = barcodeTag, startPos = rec.POS.cint-1, stopPos=rec.POS.cint,rec_alt = rec_alt, rec_ref = rec_ref) if alleleCountTable.len==0: continue - var rec_alt:char - rec_alt = rec.ALT[0][0] + ## add to snpAnnoSeq, later write to SNPannot file, which contains SNP.pos, SNP.ref,SNP.alt; The rowAnnotations snpIndex += 1 - outFileSNPanno.writeLine(join([$rec.POS, $rec.REF[0],$rec_alt], sep="\t") ) + outFileSNPanno.writeLine(join([$rec.POS, $rec_ref, $rec_alt], sep="\t") ) discard addViNode(barcodeTable = barcodeTable, alleleCountTable = alleleCountTable, scSpermSeq = scSpermSeq, outFileTotalCountMtx = outFileTotalCountMtx, outFileAltCountMtx = outFileAltCountMtx, nnsize = nnsize, - mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec=rec, + mindp = mindp, maxdp = maxdp, thetaRef = thetaRef, thetaAlt = thetaAlt, snpIndex = snpIndex, rec_pos=int(rec.POS), initProb = initProb, cmPmb = cmPmb) var posEnd: int64 var inferProb,reverseProb = 0.0 @@ -124,13 +173,15 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:OrderedTableRef, return 0 when(isMainModule): - let version = "0.3.2" + let version = "0.3.3" var doc = format(""" $version Usage: sgcocaller fmf [options] <BAM> <VCF> <barcodeFile> <out_prefix> sgcocaller xo [options] <BAM> <VCF> <barcodeFile> <out_prefix> + sgcocaller phase_blocks [options] <VCF> <fmf> <hapfile> <out_vcf> + Arguments: @@ -142,6 +193,10 @@ Arguments: <out_prefix> the prefix of output files + <fmf> the fragment file from running sgcocaller fmf + + <out_vcf> the output vcf aftering phasing blocks in hapfile + Options: -t --threads <threads> number of BAM decompression threads [default: 4] @@ -157,21 +212,21 @@ Options: --thetaREF <thetaREF> the theta for the binomial distribution conditioning on hidden state being REF [default: 0.1] --thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9] --cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1] + --phased the VCF contains the phased GT of heterozygous -h --help show help Examples ./sgcocaller fmf --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/cellfragment.fmf ./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp + ./sgcocaller phase_blocks --threads 4 AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz ./percell/cellfragment.fmf ./percell/ccsnp/linkedBlocks.vcf.gz + """ % ["version", version]) let args = docopt(doc, version=version) var threads:int - vcff:string - barcodeFile:string - bamfile:string selectedChrs:string out_dir:string mapq:int @@ -185,6 +240,9 @@ Options: cmPmb:float barcodeTag="CB" minsnpdepth:int + out_vcf,fmf,barcodeFile,bamfile,vcff,hapfile:string + vcfGtPhased = false + echo $args threads = parse_int($args["--threads"]) barcodeTag = $args["--barcodeTag"] @@ -193,64 +251,77 @@ Options: maxtotal = parse_int($args["--maxTotalDP"]) mintotal = parse_int($args["--minTotalDP"]) minsnpdepth = parse_int($args["--minSNPdepth"]) - - bamfile = $args["<BAM>"] - barcodeFile = $args["<barcodeFile>"] - out_dir = $args["<out_prefix>"] - vcff = $args["<VCF>"] mapq = parse_int($args["--minMAPQ"]) minbsq = parse_int($args["--baseq"]) thetaRef = parse_float($args["--thetaREF"]) thetaAlt = parse_float($args["--thetaALT"]) cmPmb = parse_float($args["--cmPmb"]) + vcfGtPhased = parse_bool($args["--phased"]) - var - ibam:Bam - ivcf:VCF - s_Chrs: seq[string] - if not open(ibam, bamfile, threads=threads, index = true): - quit "couldn't open input bam" - if not open(ivcf, vcff, threads=threads): - quit "couldn't open: vcf file" - - if($args["--chrom"] != "nil"): - selectedChrs = $args["--chrom"] - s_Chrs = selectedChrs.split(',') - else: - ## find all chroms from headers of vcf file (Contigs) - let contigs = ivcf.contigs - s_Chrs = map(contigs, proc(x: Contig): string = x.name) - - var hf = hts.hts_open(cstring(barcodeFile), "r") - #### TODO : Table size - var barcodeTable = newOrderedTable[string,int](initialSize = 1024) - var kstr: hts.kstring_t - kstr.l = 0 - kstr.m = 0 - kstr.s = nil - ## initiate the table with CB as keys, allele counts (ref object) as elements - while hts_getline(hf, cint(10), addr kstr) > 0: - if $kstr.s[0] == "#": - continue - var v = $kstr.s - discard barcodeTable.hasKeyOrPut(v, 0) - discard hf.hts_close() - if args["fmf"]: - echo "generating fragment file to " & out_dir - var outfmf:FileStream - try: - outfmf = openFileStream(out_dir, fmWrite) - except: - stderr.write getCurrentExceptionMsg() - discard getfmf(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outfmf = outfmf, maxTotalReads = maxtotal, - minTotalReads = mintotal, mapq = mapq, minbsq =minbsq, mindp = mindp, barcodeTag = barcodeTag,minsnpdepth=minsnpdepth) - close(outfmf) - if args["xo"]: - echo "running crossover calling \n" - echo "running for these chromosomes as specified in the header of the provided VCF file " & $s_Chrs - discard sgcocaller(threads, ivcf, barcodeTable, ibam, - out_dir, mapq, minbsq, mintotal, - maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag) - - ibam.close() - ivcf.close() \ No newline at end of file + if args["fmf"] or args["xo"]: + var + ibam:Bam + ivcf:VCF + s_Chrs: seq[string] + bamfile = $args["<BAM>"] + barcodeFile = $args["<barcodeFile>"] + out_dir = $args["<out_prefix>"] + vcff = $args["<VCF>"] + + if not open(ibam, bamfile, threads=threads, index = true): + quit "couldn't open input bam" + if not open(ivcf, vcff, threads=threads): + quit "couldn't open: vcf file" + + if($args["--chrom"] != "nil"): + selectedChrs = $args["--chrom"] + s_Chrs = selectedChrs.split(',') + else: + ## find all chroms from headers of vcf file (Contigs) + let contigs = ivcf.contigs + s_Chrs = map(contigs, proc(x: Contig): string = x.name) + + var hf = hts.hts_open(cstring(barcodeFile), "r") + #### TODO : Table size + var barcodeTable = newTable[string,int](initialSize = 1024) + var kstr: hts.kstring_t + kstr.l = 0 + kstr.m = 0 + kstr.s = nil + var ithSperm = 0 + ## initiate the table with CB as keys, allele counts (ref object) as elements + while hts_getline(hf, cint(10), addr kstr) > 0: + if $kstr.s[0] == "#": + continue + var v = $kstr.s + discard barcodeTable.hasKeyOrPut(v, ithSperm) + ithSperm.inc + discard hf.hts_close() + if args["fmf"]: + echo "generating fragment file to " & out_dir + var outfmf:FileStream + try: + outfmf = openFileStream(out_dir, fmWrite) + except: + stderr.write getCurrentExceptionMsg() + discard getfmf(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outfmf = outfmf, maxTotalReads = maxtotal, + minTotalReads = mintotal, mapq = mapq, minbsq =minbsq, mindp = mindp, barcodeTag = barcodeTag,minsnpdepth=minsnpdepth) + close(outfmf) + if args["xo"]: + echo "running crossover calling \n" + echo "running for these chromosomes as specified in the header of the provided VCF file " & $s_Chrs + discard sgcocaller(threads, ivcf, barcodeTable, ibam, + out_dir, mapq, minbsq, mintotal, + maxtotal, mindp, maxdp, thetaREF, thetaALT, cmPmb,s_Chrs,barcodeTag,phased = vcfGtPhased) + ibam.close() + ivcf.close() + if args["phase_blocks"]: + fmf = $args["<fmf>"] + out_vcf = $args["<out_vcf>"] + vcff = $args["<VCF>"] + hapfile = $args["<hapfile>"] + echo "running phasing blocks \n" + discard phaseBlocks(fmf_file = fmf, inputVCF=vcff, outputVCF=out_vcf, block_hapfile=hapfile, threads=threads) + + + \ No newline at end of file -- GitLab