Skip to content
Snippets Groups Projects
correctPhase.nim 12.86 KiB
## calcualte switch score for snps positions that are high risk
import utils
import sequtils
import math
import streams
# bins of SNP indexes
import strutils

let binSize = 2000
let movingStep = 200
let dissimThresh = 0.0099 
#let lookBeyondSnps = 25
let debug = false
let switchPrior = 0.5
type 
  switchScoreTuple = tuple
    switch_scores: seq[float]
    switch_scores_snpIndex: seq[int]
    
# reduce the search space
# return seq of snp indexes
# low risk bins are bins that do not have crossover happening in majority of the cells. 

## rough crossover estimates: return binary results, 0 means no crossovers, 1 means there is.
## simply comparing the dissimilary btw template geno seq with cell's geno seq for the SNPs in the bin

## cell_geno snp by cell
proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]): bool = 
  let ncells = cell_geno[0].len
  var ncompared = newSeq[int](ncells)
  var nmatch = newSeq[int](ncells)
  for snpi,g in templ_geno:
    for cellj in 0..(ncells-1):
      if g != gUnknown and cell_geno[snpi][cellj] != gUnknown:
        ncompared[cellj].inc
        if g == cell_geno[snpi][cellj]: nmatch[cellj].inc
  if debug: echo "ncompared " & $ncompared      
  let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float =  nmatch[x]/ncompared[x])
  if debug: echo "dissim " & $dissim
  var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh)))
  if debug: echo "ncrossover " & $ncrossover
  let nxcells = foldl(ncrossover, a + b)
  if nxcells < int(floor(0.5 * float(ncells))):
    if debug: echo "bin had no crossovers : " & $nxcells
    return false
  else:
    if debug: echo "bin had many crossovers : " & $nxcells
    return true

proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]]): seq[int] = 
  #let ncells = gtMtxByCell[0].len
  let nSnps = gtMtxByCell.len
  let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
  if debug: echo "nBins: " & $nBins
  let binIds = toSeq(0..(nBins-1))
  var snpPosStart,snpPosEnd: int
  var highRiskSnps: seq[int]
  for binI in binIds:
    if debug: echo "binI " & $binI
    snpPosStart = (binI)*(binSize - movingStep)
    if (snpPosStart + binSize) > (nSnps-1): 
      snpPosEnd = (nSnps-1)
    else:
      snpPosEnd = snpPosStart + binSize
    if (hasCrossover(templ_geno =  fullGeno[snpPosStart..snpPosEnd], 
                     cell_geno =  gtMtxByCell[snpPosStart..snpPosEnd] )):
      highRiskSnps = concat(highRiskSnps,toSeq(snpPosStart..snpPosEnd))

    
  # add a last bin as from end of chrom with length binSize
  let lastBinStart = nSnps - binSize
  let lastBinEnd = nSnps - 1
  if (hasCrossover(templ_geno =  fullGeno[snpPosStart..snpPosEnd], 
                     cell_geno =  gtMtxByCell[snpPosStart..snpPosEnd])):
    highRiskSnps = concat(highRiskSnps,toSeq(lastBinStart..lastBinEnd))

  highRiskSnps = deduplicate(highRiskSnps)
  return highRiskSnps
   
proc readPhasedSnpAnnot(phasedSnpAnnotFileStream: FileStream, nSnps:int): seq[BinaryGeno] =
  var fullGeno = newSeq[BinaryGeno](nSnps)
  var i = 0
  var currentEntrySeq: seq[string]
  while not phasedSnpAnnotFileStream.atEnd():
    currentEntrySeq = phasedSnpAnnotFileStream.readLine().splitWhitespace()
    fullGeno[i] = parseInt(currentEntrySeq[3]).toBinaryGeno(format = "01")
    i.inc
  if i != nSnps :
    quit "phased snpAnnot file does not have the same number of rows with gtMtx"
  return fullGeno

proc calculateProbslog10(hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno], error_rate = 0.1) : float = 
  var nmatch,ncompare:int
  for i,g in hap:
    if (g != gUnknown) and (cell_hap[i] != gUnknown):
      ncompare.inc
      if (g == cell_hap[i]): nmatch.inc
  let nmis = ncompare - nmatch
  let prob = 0.5*(error_rate^nmis*(1-error_rate)^nmatch + error_rate^nmatch*(1-error_rate)^nmis)
  return(log10(prob))

proc countNmatch( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = 
  var nmatch,ncompare:int
  for i,g in hap:
    if (g != gUnknown) and (cell_hap[i] != gUnknown):
      ncompare.inc
      if (g == cell_hap[i]): nmatch.inc
  let nmis = ncompare - nmatch
  return nmatch

proc countNMis( hap:seq[BinaryGeno],cell_hap:seq[BinaryGeno],nmatch =true): int = 
  var nmatch,ncompare:int
  for i,g in hap:
    if (g != gUnknown) and (cell_hap[i] != gUnknown):
      ncompare.inc
      if (g == cell_hap[i]): nmatch.inc
  let nmis = ncompare - nmatch
  return nmis

proc binarySwitch(x: BinaryGeno):BinaryGeno = 
    if(x == gUnknown):  gUnknown 
    elif(x == gREF):gALT
    else: gREF

proc switchHap(hap: seq[BinaryGeno]): seq[BinaryGeno] = 
  map(hap, binarySwitch)

proc getIthCellHap(gtMtxByCell:seq[seq[BinaryGeno]],cellIndex:int):seq[BinaryGeno] = 
  map(gtMtxByCell, proc(y: seq[BinaryGeno]): BinaryGeno = y[cellIndex])

proc calSwitchScore(riskySnps: seq[int], gtMtxByCell:seq[seq[BinaryGeno]], fullGeno: seq[BinaryGeno], lookBeyondSnps = 25): switchScoreTuple = 
  var letfIndexStart,letfIndexEnd,rightIndexStart,rightIndexEnd,offset: int
  let nSnps = gtMtxByCell.len
  let ncells =  gtMtxByCell[0].len
  var hswitch,cell_hap,cell_full_hap,templ_geno,templ_geno_left,templ_geno_right: seq[BinaryGeno]
  var prob_switch, prob_nonsw: float
  var leftIndex,rightIndex, switch_score_snpIndex: seq[int]
  var swscore = newSeq[float](nSnps)
  for rsnpi in riskySnps:
    prob_switch = 0.0
    prob_nonsw = 0.0
    if fullGeno[rsnpi] == gUnknown: continue
    for celli in 0..(ncells-1):
      # if celli == 2: continue
      leftIndex = newSeq[int]()
      rightIndex = newSeq[int]()
      offset = 1
      cell_full_hap = getIthCellHap(gtMtxByCell, celli)
      if(cell_full_hap[rsnpi] != gUnknown and fullGeno[rsnpi] != gUnknown ):
        rightIndex.add(rsnpi)
      ## need to find the nearest coexisting N snps (N = lookBeyondSnps*2)
      offset = 1
      while(leftIndex.len < lookBeyondSnps and ((rsnpi-offset) > 0)):
        if((cell_full_hap[rsnpi-offset] != gUnknown) and (fullGeno[rsnpi-offset] != gUnknown)):
          leftIndex.add(rsnpi-offset)
        offset.inc
      offset = 1
      while(rightIndex.len < lookBeyondSnps and ((rsnpi+offset) < nSnps)):
        if((cell_full_hap[rsnpi+offset] != gUnknown) and (fullGeno[rsnpi+offset] != gUnknown)):
          rightIndex.add(rsnpi+offset)
        offset.inc
      cell_hap = concat(map(concat(leftIndex,rightIndex), proc(x:int):BinaryGeno = cell_full_hap[x])) 
      templ_geno_left = map(leftIndex, proc(x:int):BinaryGeno = fullGeno[x])
      templ_geno_right = map(rightIndex, proc(x:int):BinaryGeno = fullGeno[x])
      templ_geno = concat(templ_geno_left,templ_geno_right)
      hswitch = concat(templ_geno_left,switchHap(templ_geno_right))
      prob_nonsw = prob_nonsw + calculateProbslog10(hap = templ_geno, cell_hap = cell_hap)
      prob_switch = prob_switch + calculateProbslog10(hap = hswitch, cell_hap = cell_hap)

    #if(prob_switch > prob_nonsw): echo "found positive switch score"
    swscore[rsnpi] =  log10(switchPrior) + prob_switch - log10(1-switchPrior) - prob_nonsw
    switch_score_snpIndex.add(rsnpi)
  return (swscore,switch_score_snpIndex)

## return SNP indexs to switch
## switchScore, a dense list of switch scores
proc findSwitchSites(switchScoreT: switchScoreTuple, lookBeyondSnps = 25, minSwitchScore:float, minPositiveSwitchScores = 25): seq[int] = 
  var sitesToSwitch = newSeq[int]()
  var positiveScores: seq[float]
  var positiveScoresIndex: seq[int]
  
  var inPositiveBlock = false
  for site, score in switchScoreT.switch_scores:
    echo "site, " & $site & " score: " & $(score)
    if switchScoreT.switch_scores_snpIndex.find(site)>=0:
      if score <= 0.0 or (switchScoreT.switch_scores_snpIndex.find(site) == (switchScoreT.switch_scores_snpIndex.len - 1)) :
        if inPositiveBlock:
          inPositiveBlock = false
          if (positiveScoresIndex.len > minPositiveSwitchScores) and (max(positiveScores) >= minSwitchScore):
            sitesToSwitch.add(positiveScoresIndex[maxIndex(positiveScores)])
          continue
      elif not inPositiveBlock:
        positiveScores = @[score]
        positiveScoresIndex = @[site]
        inPositiveBlock = true
        continue
      else:
        positiveScores.add(score)
        positiveScoresIndex.add(site)
  return sitesToSwitch

proc switchPhaseString(oriString:string): string = 
  if oriString == "-1": return "-1"
  if oriString == "0": return "1"
  if oriString == "1": return "0"
 
proc writeSwitchedPhase(siteToSwitch:seq[int],switchedPhasedAnnotFile:string, phaseSnpAnnotFile:string ): int = 
  var switchedPhasedAnnotFileFS,phaseSnpAnnotFileFS: FileStream
  var switch = 0
  var currentEntry:seq[string]
  var snpIndex = 0
  var writtenSnps = 0
  try:
    switchedPhasedAnnotFileFS = openFileStream(switchedPhasedAnnotFile, fmWrite)
    phaseSnpAnnotFileFS = openFileStream(phaseSnpAnnotFile, fmRead)
  except:
    stderr.write getCurrentExceptionMsg() 
  switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine())
  if siteToSwitch.len == 0:
    while not phaseSnpAnnotFileFS.atEnd():
      switchedPhasedAnnotFileFS.writeLine(phaseSnpAnnotFileFS.readLine())
    switchedPhasedAnnotFileFS.close()
    phaseSnpAnnotFileFS.close()
    return 0
  while not phaseSnpAnnotFileFS.atEnd():
    currentEntry = phaseSnpAnnotFileFS.readLine.splitWhitespace()
    if siteToSwitch.find(snpIndex) != -1:
      switch = switch xor 1 
    if switch == 0:
      switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t"))
    else:
      currentEntry[3] = switchPhaseString(currentEntry[3])
      switchedPhasedAnnotFileFS.writeLine(join(currentEntry,sep = "\t"))
    snpIndex.inc
  switchedPhasedAnnotFileFS.close()
  phaseSnpAnnotFileFS.close()
  return 0

proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAnnotFile:string, switchScoreFile: string, lookBeyondSnps = 25,minSwitchScore:float, minPositiveSwitchScores:int): int = 
  var gtMtxByCell: seq[seq[BinaryGeno]]
  var currentEntrySeq: seq[string]
  var currentEntry:seq[int]
  var nSnps,ncells:int
  var gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream:FileStream
  ## sort entries in mtx files 
  try: 
    gtMtxFileStream = openFileStream(gtMtxFile, fmRead)
    phaseSnpAnnotFileStream = openFileStream(phaseSnpAnnotFile, fmRead)
    switchScoreFileStream = openFileStream(switchScoreFile, fmWrite)
  except:
    stderr.write getCurrentExceptionMsg() 
  echo "reading gtMtx to by cell"
  discard gtMtxFileStream.readLine()
  discard phaseSnpAnnotFileStream.readLine()
  #N, i,j
  currentEntrySeq = gtMtxFileStream.readLine().splitWhitespace()
  currentEntry = map(currentEntrySeq, proc(x: string): int = parseInt(x))
  nSnps = currentEntry[0]
  echo "nSnps " & $nSnps
  ncells = currentEntry[1]
  echo "ncells " & $ncells

  echo "totalEntries " & $ currentEntry[2]
  ## gtMtx is cell by Snp format
  gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
  discard readGtMtxToSeq(mtxFileStream = gtMtxFileStream, gtMtx = gtMtxByCell, by_cell = true)
  var fullGeno = readPhasedSnpAnnot(phasedSnpAnnotFileStream = phaseSnpAnnotFileStream, nSnps = nSnps )
  var riskySnps = findHighRiskSnps(fullGeno = fullGeno, gtMtxByCell = gtMtxByCell)
  echo "riskySnps.len " & $riskySnps.len
  var switchScoresT = calSwitchScore(riskySnps = riskySnps, gtMtxByCell =gtMtxByCell, fullGeno = fullGeno, lookBeyondSnps = lookBeyondSnps)
  let switchSites = findSwitchSites(switchScoresT, lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores)
  switchScoreFileStream.writeLine("#" & $switchSites)
  for snpi,score in switchScoresT.switch_scores:
    if switchScoresT.switch_scores_snpIndex.find(snpi)>=0:
      switchScoreFileStream.writeLine($score)
    else: 
      switchScoreFileStream.writeLine("NA")
  for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]:
    fs.close()
  discard writeSwitchedPhase(switchSites,switchedPhasedAnnotFile,phaseSnpAnnotFile)

# for chrom in @["chr5"]:    
#   let gtMtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_gtMtx.mtx"
#   let phaseSnpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_phased_snpAnnot.txt"
#   let switchedPhasedAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_corrected_phased_snpAnnot.txt"
#   let switchScoreFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_switch_score.txt"  
#   discard correctPhase(gtMtxFile,phaseSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = 150,minSwitchScore = 280.0, minPositiveSwitchScores = 25)