Skip to content
Snippets Groups Projects
Commit 047edbe5 authored by Ruqian Lyu's avatar Ruqian Lyu
Browse files

code cleaning

parent b94c527b
No related branches found
No related tags found
No related merge requests found
Pipeline #9540 passed
......@@ -29,7 +29,7 @@ proc sgcocaller(threads:int, ivcf:VCF, barcodeTable:TableRef,
echo "running in bulk mode and all reads are regarded as from one sample/cell"
bulkBam = true
else:
echo "running sgcoaller for " & $barcodeTable.len & " single cells"
echo "running sgcoaller xo for " & $barcodeTable.len & " single cells"
## iterate through each selected chromosome
for chrom in s_Chrs:
......@@ -132,7 +132,7 @@ Options:
--thetaALT <thetaALT> the theta for the binomial distribution conditioning on hidden state being ALT [default: 0.9]
--cmPmb <cmPmb> the average centiMorgan distances per megabases default 0.1 cm per Mb [default: 0.1]
--phased the input VCF for calling crossovers contains the phased GT of heterozygous SNPs
--outvcf generate the output in vcf format (phase)
--outvcf generate the output in vcf format (sgcocaller phase)
--templateCell <templateCell> the cell's genotype to be used a template cell, as the cell's index (0-starting) in the barcode file, default as not supplied [default: -1]
--maxDissim <maxDissim> the maximum dissimilarity for a pair of cell to be selected as potential template cells due to not having crossovers in either cell [default: 0.0099]
--maxExpand <maxExpand> the maximum number of iterations to look for locally coexisting positions for inferring missing SNPs in template haplotype sequence [default: 1000]
......@@ -147,7 +147,7 @@ Options:
Examples
./sgcocaller phase gtMtxFile phaseOutputPrefix
./sgcocaller xo --threads 4 AAAGTAGCACGTCTCT-1.raw.bam AAAGTAGCACGTCTCT-1.raw.bam.dp3.alt.vcf.gz barcodeFile.tsv ./percell/ccsnp
./sgcocaller xo --threads 4 possorted_bam.bam phased_hetSNPs.vcf.gz barcodeFile.tsv ./percell/ccsnp
./sgcocaller sxo phaseOutputPrefix barcodeFile.tsv ./percell/ccsnp
""" % ["version", version])
......@@ -248,8 +248,8 @@ Options:
for mtxFile in [outGtMtxFile, outTotalCountMtxFile, outAltCountMtxFile]:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
echo "running phasing from single cell genotype matrix (one chromosome) " & outGtMtxFile
echo "using the " & outSnpAnnotFile & "for generating phased haplotypes"
echo "running sgcocaller phase from single cell genotype matrix (of one chromosome) " & outGtMtxFile
echo "using the " & outSnpAnnotFile & " for generating phased haplotypes"
discard sgphase(mtxFile = outGtMtxFile, snpAnnotFile = outSnpAnnotFile, phasedSnpAnnotFile = outphasedSnpAnnotFile,
diagnosticDataframeFile = outdiagnosticDataframeFile, templateCell = templateCell,
maxExpand = maxExpand, posteriorProbMin = posteriorProbMin,maxDissim = maxDissim)
......@@ -276,8 +276,8 @@ Options:
selectedChrs = $args["--chrom"]
s_Chrs = selectedChrs.split(',')
else:
quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes using comma as separator"
echo "running crossover calling from genotype and allele count matrices generate from sgcocaller phase/swphase "
quit "chromosomes need to be supplied via --chrom. Supply multiple chromosomes (will be processed sequencially) using comma as separator"
echo "running crossover calling from genotype and allele count matrices generated from sgcocaller phase/swphase "
echo "running for these chromosomes : " & $s_Chrs
let phase_dir = $args["<phaseOutputPrefix>"]
let phasedSnpAnnotFile = $args["<SNPPhaseFile>"]
......@@ -289,7 +289,7 @@ Options:
var imtx = readMtx(mtx_file = mtxFile)
discard sortWriteMtx(imtx, mtx_file = mtxFile)
elif args["swphase"]:
echo "find switch spots and generate corrected phase"
echo "running swphase to find switch spots and generate corrected phase"
let gtMtxFile = $args["<gtMtxFile>"]
let phasedSnpAnnotFile = $args["<phasedSnpAnnotFile>"]
let switchedPhasedAnnotFile = out_dir & "corrected_phased_snpAnnot.txt"
......@@ -297,7 +297,7 @@ Options:
let switchedPhasedAnnotVcfFile = out_dir & "corrected_phased_snpAnnot.vcf.gz"
discard correctPhase(gtMtxFile,phasedSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = lookBeyondSnps,minSwitchScore = minSwitchScore, minPositiveSwitchScores = minPositiveSwitchScores, binSize = binSize, stepSize = stepSize)
if ($args["--chrom"] == "nil"):
echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use --chrom option"
echo "Assuming supplied VCF only contains the SNPs for the relevant chromosome. If this is not the case, use the --chrom option"
discard writePhaseToVCF($args["<referenceVCF>"], switchedPhasedAnnotVcfFile, switchedPhasedAnnotFile, add_header_string = """##sgcocaller_v0.1=swphase""",threads = threads, chrom = $args["--chrom"])
......
......@@ -3,7 +3,6 @@ import utils
import sequtils
import math
import streams
# bins of SNP indexes
import strutils
# let binSize = 2000
......@@ -34,29 +33,29 @@ proc hasCrossover(templ_geno:seq[BinaryGeno], cell_geno: seq[seq[BinaryGeno]]):
if g != gUnknown and cell_geno[snpi][cellj] != gUnknown:
ncompared[cellj].inc
if g == cell_geno[snpi][cellj]: nmatch[cellj].inc
if debug: echo "ncompared " & $ncompared
# if debug: echo "ncompared " & $ncompared
let dissim = map(toSeq(0..(ncells-1)), proc(x:int):float = nmatch[x]/ncompared[x])
if debug: echo "dissim " & $dissim
# if debug: echo "dissim " & $dissim
var ncrossover = map(dissim, proc(x:float): int = (int)(x > dissimThresh and x < (1-dissimThresh)))
if debug: echo "ncrossover " & $ncrossover
# if debug: echo "ncrossover " & $ncrossover
let nxcells = foldl(ncrossover, a + b)
if nxcells < int(floor(0.5 * float(ncells))):
if debug: echo "bin had no crossovers : " & $nxcells
# if debug: echo "bin had no crossovers : " & $nxcells
return false
else:
if debug: echo "bin had many crossovers : " & $nxcells
# if debug: echo "bin had many crossovers : " & $nxcells
return true
proc findHighRiskSnps(fullGeno:seq[BinaryGeno], gtMtxByCell:seq[seq[BinaryGeno]], binSize:int, movingStep:int): seq[int] =
#let ncells = gtMtxByCell[0].len
let nSnps = gtMtxByCell.len
let nBins = (int)(ceil(nSnps / (binSize - movingStep)))
if debug: echo "nBins: " & $nBins
# if debug: echo "nBins: " & $nBins
let binIds = toSeq(0..(nBins-1))
var snpPosStart,snpPosEnd: int
var highRiskSnps: seq[int]
for binI in binIds:
if debug: echo "binI " & $binI
# if debug: echo "binI " & $binI
snpPosStart = (binI)*(binSize - movingStep)
if (snpPosStart + binSize) > (nSnps-1):
snpPosEnd = (nSnps-1)
......@@ -180,7 +179,7 @@ proc findSwitchSites(switchScoreT: switchScoreTuple, lookBeyondSnps = 25, minSwi
var inPositiveBlock = false
for site, score in switchScoreT.switch_scores:
echo "site, " & $site & " score: " & $(score)
# echo "site, " & $site & " score: " & $(score)
if switchScoreT.switch_scores_snpIndex.find(site)>=0:
if score <= 0.0 or (switchScoreT.switch_scores_snpIndex.find(site) == (switchScoreT.switch_scores_snpIndex.len - 1)) :
if inPositiveBlock:
......@@ -248,7 +247,7 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
switchScoreFileStream = openFileStream(switchScoreFile, fmWrite)
except:
stderr.write getCurrentExceptionMsg()
echo "reading gtMtx to by cell"
echo "reading gtMtx by cell"
discard gtMtxFileStream.readLine()
discard phaseSnpAnnotFileStream.readLine()
#N, i,j
......@@ -258,7 +257,6 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
echo "nSnps " & $nSnps
ncells = currentEntry[1]
echo "ncells " & $ncells
echo "totalEntries " & $ currentEntry[2]
## gtMtx is cell by Snp format
gtMtxByCell = newSeqWith(nSnps,newSeq[BinaryGeno](ncells))
......@@ -277,10 +275,3 @@ proc correctPhase*(gtMtxFile: string, phaseSnpAnnotFile:string, switchedPhasedAn
for fs in [gtMtxFileStream,phaseSnpAnnotFileStream,switchScoreFileStream]:
fs.close()
discard writeSwitchedPhase(switchSites,switchedPhasedAnnotFile,phaseSnpAnnotFile)
# for chrom in @["chr5"]:
# let gtMtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_gtMtx.mtx"
# let phaseSnpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseOneStep/hsperm_" & chrom & "_phased_snpAnnot.txt"
# let switchedPhasedAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_corrected_phased_snpAnnot.txt"
# let switchScoreFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Kirkness2013/output/boostrapping_9/sgcocaller/phaseCorrected/hsperm_" & chrom & "_switch_score.txt"
# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,switchedPhasedAnnotFile,switchScoreFile,lookBeyondSnps = 150,minSwitchScore = 280.0, minPositiveSwitchScores = 25)
\ No newline at end of file
......@@ -12,7 +12,6 @@ import findGtNodes
## these outputs will be per chromosome
proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, outSnpAnnot:FileStream,
outTotalCountMtx:FileStream, outAltCountMtx:FileStream,maxTotalReads:int,
minTotalReads:int, mapq: int,minbsq:int, barcodeTag:string, minsnpdepth:int,minCellDp:int, maxCellDp:int,p0 = 0.3, p1 = 0.7,chrom: string):int =
......@@ -75,62 +74,3 @@ proc getGtMtx*(ibam:Bam, ivcf:VCF, barcodeTable:TableRef, outGtMtx:FileStream, o
for outFileStream in[outGtMtx, outTotalCountMtx, outAltCountMtx, outSnpAnnot]:
outFileStream.close()
return 0
# let vcf_file = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT.chr1.vcf.gz"
# let bam_file = "data/WC_CNV_42/WC_CNV_42.bam"
# let barcode_file = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv"
# var
# ibam:Bam
# ivcf:VCF
# outGtMtx, outSnpAnnot, outTotalCountMtx, outAltCountMtx:FileStream
# if not open(ibam, bam_file, threads=1, index = true):
# quit "couldn't open input bam"
# if not open(ivcf, vcf_file, threads=1):
# quit "couldn't open: vcf file"
# var hf = hts.hts_open(cstring(barcode_file), "r")
# #### TODO : Table size
# var barcodeTable = newTable[string,int](initialSize = 1024)
# var kstr: hts.kstring_t
# kstr.l = 0
# kstr.m = 0
# kstr.s = nil
# var ithSperm = 0
# ## initiate the table with CB as keys, allele counts (ref object) as elements
# while hts_getline(hf, cint(10), addr kstr) > 0:
# if $kstr.s[0] == "#":
# continue
# var v = $kstr.s
# discard barcodeTable.hasKeyOrPut(v, ithSperm)
# ithSperm.inc
# discard hf.hts_close()
# let s_Chrs = @["chr1"]
# var chrom = "chr1"
# let out_prefix = "test_data/getMtx/"
# echo "generating gtMtx"
# try:
# outGtMtx = openFileStream(out_prefix & chrom & "_gtMtx.mtx", fmWrite)
# outSnpAnnot = openFileStream(out_prefix & chrom & "_snpAnnot.txt", fmWrite)
# outTotalCountMtx = openFileStream(out_prefix & chrom & "_totalMtx.mtx", fmWrite)
# outAltCountMtx = openFileStream(out_prefix & chrom & "_AltMtx.mtx", fmWrite)
# except:
# stderr.write getCurrentExceptionMsg()
# discard getGtMtx(ibam = ibam, ivcf = ivcf, barcodeTable = barcodeTable, outGtMtx = outGtMtx, outTotalCountMtx = outTotalCountMtx ,outAltCountMtx = outAltCountMtx,
# outSnpAnnot = outSnpAnnot, maxTotalReads = 30,
# minTotalReads = 5, mapq = 20, minbsq = 13, minCellDp = 2,maxCellDp =10,barcodeTag = "CB",minsnpdepth=1)
# var outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile : string
# ## sort entries in mtx files
# for chrom in s_Chrs:
# outGtMtxFile = out_prefix & chrom & "_gtMtx.mtx"
# outTotalCountMtxFile = out_prefix & chrom & "_totalMtx.mtx"
# outAltCountMtxFile = out_prefix & chrom & "_AltMtx.mtx"
# for mtxFile in [outGtMtxFile, outTotalCountMtxFile,outAltCountMtxFile]:
# var imtx = readMtx(mtx_file = mtxFile)
# discard sortWriteMtx(imtx, mtx_file = mtxFile)
......@@ -127,21 +127,3 @@ proc sgcocallerSXO*(barcodeTable:TableRef, phase_dir:string, out_dir:string, the
for outFileStream in [phasedSnpannoFS,totalCountMtxFS,altCountMtxFS,outFileVStateMtx,outaltCountMtxFS,outSnpAnnotFS, outtotalCountMtxFS,viSegmentInfo]:
outFileStream.close()
return 0
# let barcodeFile = "data/WC_CNV_42/WC_CNV_42_filteredBC.tsv"
# var barcodeTable = newTable[string,int](initialSize = 1024)
# var hf = hts.hts_open(cstring(barcodeFile), "r")
# var kstr: hts.kstring_t
# kstr.l = 0
# kstr.m = 0
# kstr.s = nil
# var ithSperm = 0
# ## initiate the table with CB as keys, gamete index as elements
# while hts_getline(hf, cint(10), addr kstr) > 0:
# if $kstr.s[0] == "#": continue
# var v = $kstr.s
# discard barcodeTable.hasKeyOrPut(v, ithSperm)
# ithSperm.inc
# discard hf.hts_close()
# var s_Chrs = @["chr6"]
# discard sgcocallerSXO(barcodeTable = barcodeTable, phase_dir = "data/WC_CNV_42/sgcocaller/phaseOneStep/", out_dir = "data/WC_CNV_42/sgcocaller/sxo/",thetaREF = 0.1, thetaALT = 0.9, cmPmb = 0.001,s_Chrs =s_Chrs,initProb = [0.5,0.5])
\ No newline at end of file
......@@ -7,15 +7,6 @@ import streams
import strutils
import sequtils
# phaseOutdir/chrom_phased_snpAnnot.txt
# phaseOutdir/cellGenoVersusTemplate.txt
# phaseOutdir/cellGenoVersusTemplate.png by executing the R code
# let mtxFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_gtMtx.mtx"
# let phaseOutdir = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/"
# #let diagnosticDataframe = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/phase/cellGenoVersusTemplate.txt"
# let snpAnnotFile = "/mnt/mcfiles/rlyu/Projects/sgcocaller/test_data/gtMtx_chr1_snpAnnot.txt"
proc writePhasedSnpAnnot*(fullGeno:seq[BinaryGeno], snpAnnotFileStream:FileStream, phasedSnpAnnotFileStream:FileStream): int =
var snpindex = 0
var header: string
......@@ -97,25 +88,3 @@ proc sgphase*(mtxFile: string, snpAnnotFile:string, phasedSnpAnnotFile:string, d
discard writePhasedSnpAnnot(fullGeno, snpAnnotFileStream, phasedSnpAnnotFileStream)
for fs in [gtMtxFileStream,ddframFileStream,snpAnnotFileStream,phasedSnpAnnotFileStream]:
fs.close()
# var s_Chrs = @["CUR6G"]
# for chrom in s_Chrs:
# var mtxFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_gtMtx.mtx"
# var phasedSnpFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_phased_snpAnnot.txt"
# var snpAnnotFile = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_snpAnnot.txt"
# var ddframe = "/mnt/beegfs/mccarthy/scratch/general/Datasets/Campoy2020-gamete-binning-apricot/output/sgcocaller/phaseOneStep/apricot_" & chrom & "_cellGenoVersusTemplate.txt"
# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe)
# for chrom in s_Chrs:
# var mtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx"
# var phasedSnpFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt"
# var snpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_snpAnnot.txt"
# var ddframe = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_cellGenoVersusTemplate.txt"
# discard sgphase(mtxFile, snpAnnotFile, phasedSnpFile,ddframe)
\ No newline at end of file
......@@ -22,7 +22,6 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string
if debug: echo "ivcffile " & ivcfFile
if debug: echo "ovcfFile " & ovcfFile
if debug: echo "phasedAnnotFile " & phasedAnnotFile
var ivcf: VCF
var ovcf: VCF
var v_off = 0
......@@ -72,10 +71,10 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string
quit "write vcf failed for " & $voff
if phasedAnnotFS.atEnd(): break
phaseRec = readNextPhased(phasedAnnotFS)
if debug:
if phaseRec.len != 4:
echo " phaseRec.len " & $phaseRec.len
echo " phaseRec" & $phaseRec
# if debug:
# if phaseRec.len != 4:
# echo " phaseRec.len " & $phaseRec.len
# echo " phaseRec" & $phaseRec
phasePos = parseInt(phaseRec[0])
phasePhase = parseInt(phaseRec[3])
phasedAnnotFS.close()
......@@ -84,16 +83,3 @@ proc writePhaseToVCF*(ivcfFile:string, ovcfFile: string, phasedAnnotFile: string
return 0
# let s_Chrs = map(toSeq(1..19), proc(x:int):string = "chr" & $x)
# for chrom in @["chr8"]:
# var ivcf = "data/swapped/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.modified.swapped.GT." & chrom & ".vcf.gz"
# var ovcf = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz"
# var swphasedSnpFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.txt"
# var gtMtxFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_gtMtx.mtx"
# var phaseSnpAnnotFile = "data/WC_CNV_42/sgcocaller/phaseOneStep/" & chrom & "_phased_snpAnnot.txt"
# var switchScoreFile ="data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_switch_score.txt"
# var switchedPhasedAnnotVcfFile = "data/WC_CNV_42/sgcocaller/swphase/" & chrom & "_corrected_phased_snpAnnot.vcf.gz"
# discard correctPhase(gtMtxFile,phaseSnpAnnotFile,swphasedSnpFile,switchScoreFile)
# discard writePhaseToVCF(ivcf, ovcf, swphasedSnpFile,1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment