comparison Contra/contra.py @ 0:7564f3b1e675

Uploaded
author fcaramia
date Thu, 13 Sep 2012 02:31:43 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7564f3b1e675
1 #!/usr/bin/python
2
3 # ----------------------------------------------------------------------#
4 # Copyright (c) 2011, Richard Lupat & Jason Li.
5 #
6 # > Source License <
7 # This file is part of CONTRA.
8 #
9 # CONTRA is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # CONTRA is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
21 #
22 #
23 #-----------------------------------------------------------------------#
24 # Last Updated : 23 July 2012 16:43PM
25
26
27 import os
28 from optparse import OptionParser
29 import sys
30 import subprocess
31 import shlex
32 from multiprocessing import Process, Manager
33
34 from scripts.assign_bin_number_v2 import *
35 from scripts.average_count import *
36 from scripts.cn_apply_threshold import *
37 from scripts.convert_gene_coordinate import *
38 from scripts.convert_targeted_regions import *
39 from scripts.split_chromosome import *
40 from scripts.vcf_out import *
41 from scripts.get_chr_length import *
42 from scripts.count_libsize import *
43 from scripts.target_breakdown import *
44
45 #Absolute Path
46 scriptPath = os.path.realpath(os.path.dirname(sys.argv[0]))
47
48 class Params:
49 """
50 Class for top-level system parameters
51 """
52
53 def __init__(self):
54 # command-line option definition
55 self.parser = OptionParser()
56 self.parser.add_option("-t", "--target",
57 help="Target region definition file [REQUIRED] [BED Format]",
58 action="store", type="string", dest="target")
59 self.parser.add_option("-s", "--test",
60 help="Alignment file for the test sample [REQUIRED] [BAM/SAM]",
61 action="store", type="string", dest="test")
62 self.parser.add_option("-c", "--control",
63 help="Alignment file for the control sample [REQUIRED] [BAM/SAM]",
64 action="store", type="string", dest="control")
65 self.parser.add_option("-f", "--fasta",
66 help="Reference Genome [REQUIRED][FASTA]",
67 action="store", type="string", dest="fasta")
68 self.parser.add_option("-o", "--outFolder",
69 help="the output folder path name to store the output of analysis [REQUIRED]",
70 action="store", type="string", dest="outFolder")
71 self.parser.add_option("--numBin",
72 help="Numbers of bins to group regions. User can specify multiple experiments with different number of bins (comma separated) [20]",
73 action="store", type="string", dest="numBin", default="20")
74 self.parser.add_option("--minReadDepth",
75 help="The threshold for minimum read depth for each bases [10]",
76 action="store", type="string", dest="minReadDepth", default=10)
77 self.parser.add_option("--minNBases",
78 help="The threshold for minimum number of bases for each target regions [10]",
79 action="store", type="string", dest="minNBases", default= 10)
80 self.parser.add_option("--sam",
81 help="If the specified, test and control sample are in SAM [False]",
82 action="store_true", dest="sam", default="False")
83 self.parser.add_option("--bed",
84 help="if specified, control will be in BED format [False]",
85 action="store_true", dest = "bedInput", default="False")
86 self.parser.add_option("--pval",
87 help="The p-value threshold for filtering [0.05]. Applies to Adjusted P-Value.",
88 action="store", type="string", dest="pval", default=0.05)
89 self.parser.add_option("--sampleName",
90 help ="The name to be appended to the front of default output name ['']",
91 action="store", type="string", dest="sampleName", default='')
92 self.parser.add_option("--nomultimapped",
93 help="The option to remove multi-mapped reads [False]",
94 action="store_true", dest="nomultimapped",default="False")
95 self.parser.add_option("-p", "--plot",
96 help="Plots log-ratio distribution for each bin [False]",
97 action="store_true", dest="plot", default="False")
98 self.parser.add_option("--minExon",
99 help="Minimum number of Exons in one bin (if less than this, bin that contains small number of exons"
100 +"will be moved to the adjacent bins) [2000] ",
101 action="store", type="string", dest="minExon", default="2000")
102 self.parser.add_option("--minControlRdForCall",
103 help="Minimum control readdepth for call [5]",
104 action="store", type="string", dest="minControl", default="5")
105
106 self.parser.add_option("--minTestRdForCall",
107 help="Minimum test readdepth for call [0]",
108 action="store", type="string", dest="minTest", default="0")
109
110 self.parser.add_option("--minAvgForCall",
111 help="Minimum average coverage for call [20]",
112 action="store", type="string", dest="minAvg", default="20")
113
114 self.parser.add_option("--maxRegionSize",
115 help="Maximum Region Size in target region (for breaking large region into smaller region. By default, maxRegionSize 0 means no breakdown) [0]",
116 action="store", type="string", dest="maxRegionSize", default="0")
117
118 self.parser.add_option("--targetRegionSize",
119 help="Target Region Size for breakdown (if maxRegionSize is non zero) [200]",
120 action="store", type="string", dest="targetRegionSize", default="200")
121
122 self.parser.add_option("-l", "--largeDeletion",
123 help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. [False]",
124 action="store_true", dest = "large", default="False")
125
126 self.parser.add_option("--smallSegment",
127 help="CBS segment size for calling large variations [1]",
128 action="store", type="string", dest="smallSegment", default="1")
129
130 self.parser.add_option("--largeSegment",
131 help="CBS segment size for calling large variations [25]",
132 action="store", type="string", dest="largeSegment", default="25")
133
134 self.parser.add_option("--lrCallStart",
135 help="Log ratios start range that will be used to call CNV [-0.3]",
136 action="store", type="string", dest="lrs", default="-0.3")
137
138 self.parser.add_option("--lrCallEnd",
139 help="Log ratios end range that will be used to call CNV [0.3]",
140 action="store", type="string", dest="lre", default="0.3")
141
142 self.parser.add_option("--passSize",
143 help="Size of exons that passed the p-value threshold compare to the original exon size [0.35]",
144 action="store", type="string", dest="passSize", default="0.35")
145
146
147 # required parameters list
148 self.ERRORLIST = []
149
150 # change system parameters based on any command line
151 (options, args) = self.parser.parse_args()
152 if options.target:
153 self.TARGET = options.target
154 else:
155 #self.parser.print_help()
156 #self.parser.error("--target not supplied")
157 self.ERRORLIST.append("target")
158
159 if options.test:
160 self.TEST = options.test
161 else:
162 #self.parser.error("--test not supplied")
163 self.ERRORLIST.append("test")
164
165 if options.control:
166 self.CONTROL = options.control
167 else:
168 #self.parser.error("--control not supplied")
169 self.ERRORLIST.append("control")
170
171 if options.fasta:
172 self.FASTA = options.fasta
173 else:
174 #self.parser.error("--fasta not supplied")
175 self.ERRORLIST.append("fasta")
176
177 if options.outFolder:
178 self.OUTFOLDER = options.outFolder
179 else:
180 #self.parser.error("--outFolder not supplied")
181 self.ERRORLIST.append("outfolder")
182
183 if len(self.ERRORLIST) != 0:
184 self.parser.print_help()
185 self.parser.error("Missing required parameters")
186
187 if options.numBin:
188 binsNumber = options.numBin.split(",")
189 try:
190 self.NUMBIN = [int(j) for j in binsNumber]
191 except:
192 self.NUMBIN = [20]
193 if options.minReadDepth:
194 self.MINREADDEPTH = int(options.minReadDepth)
195 if options.minNBases:
196 self.MINNBASES = int(options.minNBases)
197 if options.sam:
198 self.SAM = str(options.sam)
199 if options.pval:
200 self.PVAL = options.pval
201 if options.sampleName:
202 self.SAMPLENAME = options.sampleName
203 else:
204 self.SAMPLENAME = 'No-SampleName'
205 if options.nomultimapped:
206 self.NOMULTIMAPPED = str(options.nomultimapped)
207 if options.plot:
208 self.PLOT = str(options.plot)
209 if options.bedInput:
210 self.BEDINPUT = options.bedInput
211 if options.minExon:
212 self.MINEXON = int(options.minExon)
213 if options.minControl:
214 self.MINCONTROL = options.minControl
215 if options.minTest:
216 self.MINTEST = options.minTest
217 if options.minAvg:
218 self.MINAVG = options.minAvg
219 if options.maxRegionSize:
220 self.MAXREGIONSIZE = int(options.maxRegionSize)
221 if options.targetRegionSize:
222 self.TARGETREGIONSIZE = int(options.targetRegionSize)
223 if options.large:
224 self.LARGE = str(options.large)
225 if options.smallSegment:
226 self.SMALLSEGMENT = options.smallSegment
227 if options.largeSegment:
228 self.LARGESEGMENT = options.largeSegment
229 if options.lre:
230 self.LRE = options.lre
231 if options.lrs:
232 self.LRS = options.lrs
233 if options.passSize:
234 self.PASSSIZE = options.passSize
235
236 def repeat(self):
237 # params test
238 print "target :", self.TARGET
239 print "test :", self.TEST
240 print "control :", self.CONTROL
241 print "fasta :", self.FASTA
242 print "outfolder :", self.OUTFOLDER
243 print "numBin :", self.NUMBIN
244 print "minreaddepth :", self.MINREADDEPTH
245 print "minNBases :", self.MINNBASES
246 print "sam :", self.SAM
247 print "pval :", self.PVAL
248 print "sampleName :", self.SAMPLENAME
249 print "nomultimapped :", self.NOMULTIMAPPED
250 print "plot :", self.PLOT
251 print "bedInput :", self.BEDINPUT
252 print "minExon :", self.MINEXON
253 print "largeDeletion :", self.LARGE
254 def checkOutputFolder(outF):
255 print "Creating Output Folder :",
256
257 if outF[len(outF)-1] == "/":
258 outF = outF[:len(outF)-1]
259
260 try:
261 os.mkdir(outF)
262 except:
263 print "cannot create folder '%s'" %outF
264 print "if folder already exist, please specify other folder"
265 sys.exit(1)
266
267 try:
268 os.mkdir(outF+"/table")
269 os.mkdir(outF+"/plot")
270 os.mkdir(outF+"/buf")
271 os.mkdir(outF+"/buf/ctrData/")
272 os.mkdir(outF+"/buf/testData/")
273 except:
274 print "[ERROR: CANNOT CREATE SUBFOLDERS]"
275 sys.exit(1)
276
277 print " Done."
278
279 return outF
280
281
282 #BEDINPUT
283 def countTotalReads3(params, folder):
284 tempFileName = folder + "/temp.txt"
285 tempReadFile = open(tempFileName, "w")
286 libsize = get_libsize(params.BEDINPUT)
287 tempReadFile.write(libsize)
288 #tempReadFile.write(params.CONTROLREADCOUNT)
289 tempReadFile.close()
290
291
292 def countTotalReads(params, folder):
293 if 'testData' in folder:
294 inF = params.TEST
295 else:
296 inF = params.CONTROL
297
298 # Get Total ReadCount
299 getreadcount = os.system("samtools view %s | wc -l > %s/temp.txt" %(inF,folder))
300
301 def samToBam(samfile, bamfile):
302 args = shlex.split("samtools view -bS %s -o %s" %(samfile, bamfile))
303 samtobam = subprocess.call(args)
304
305 return bamfile
306
307 def removeMultiMapped(inF, newBAM):
308 # Get New BAM Files with mapping quality > 0
309 args = shlex.split("samtools view -bq 1 %s -o %s" %(inF, newBAM))
310 removeMM = subprocess.call(args)
311 print "Multi mapped reads removed. "
312
313
314 #BEDINPUT
315 def convertBamSimple(params, folder, targetList, genomeFile):
316 if 'testData' in folder:
317 inF = params.TEST
318 print "Converting TEST Sample... "
319 else:
320 inF = params.CONTROL
321 print "Converting CONTROL Sample... "
322
323 #Copy file to working folder
324 os.system("cp %s %s" %(inF, folder+"sample.BEDGRAPH"))
325
326 # Split Bedgraph by its chromosomes
327 splitByChromosome(folder)
328
329 # Slice the coverage files to only cover the targeted regions
330 print "Getting targeted regions DOC..."
331 convertGeneCoordinate(targetList, folder)
332
333 # LIBSIZE
334 libsize = str(get_libsize(folder+"geneRefCoverage2.txt"))
335 tempLibSize = open(folder + "/temp.txt", "w")
336 tempLibSize.write(libsize)
337 tempLibSize.close()
338
339 print "Targeted regions pre-processing: Done"
340
341
342 def convertBam(params, folder, targetList, genomeFile):
343 if 'testData' in folder:
344 inF = params.TEST
345 print "Converting TEST Sample... "
346 else:
347 inF = params.CONTROL
348 print "Converting CONTROL Sample... "
349
350 # Convert BAM Files to BEDGRAPH
351 bedgraph = folder + "sample.BEDGRAPH"
352 args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(inF, genomeFile))
353 #output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0]
354 iOutFile = open(bedgraph, "w")
355 #iOutFile.write(output)
356 output = subprocess.Popen(args, stdout = iOutFile).wait()
357 iOutFile.close()
358
359 # Split Bedgraph by its chromosomes
360 splitByChromosome(folder)
361
362 # Slice the coverage files to only cover the targeted regions
363 print "Getting targeted regions DOC..."
364 convertGeneCoordinate(targetList, folder)
365
366 # LIBSIZE
367 libsize = str(get_libsize(folder+"geneRefCoverage2.txt"))
368 tempLibSize = open(folder + "temp.txt", "w")
369 tempLibSize.write(libsize)
370 tempLibSize.close()
371
372 print "Targeted regions pre-processing: Done"
373
374 def analysisPerBin(params, num_bin, outFolder, targetList):
375 import shutil
376
377 bufLoc = outFolder + "/buf"
378 # Assign bin number to the median and average file
379 numBin = assignBin(num_bin, bufLoc+"/average.txt", bufLoc+"/bin", targetList, params.MINEXON)
380
381 #copy bin_boundary to plot folder
382 #outBounFile = os.path.join(outFolder, "plot", "bin_boundary"+str(num_bin))
383 #curBounFile = os.path.join(bufLoc, "bin" + str(num_bin) + ".boundaries.txt")
384 #shutil.copy(curBounFile, outBounFile)
385
386
387 print "Significance Test ... "
388 rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R")
389 args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s %s %s"
390 %(rScriptName, num_bin, params.MINREADDEPTH, params.MINNBASES, outFolder, params.SAMPLENAME,params.PLOT, numBin, params.MINCONTROL, params.MINTEST, params.MINAVG))
391 rscr = subprocess.call(args)
392
393
394 print "Generating Output Files ... "
395 # Analysis of CNV
396 tNameList = os.listdir(outFolder+"/table/")
397 if num_bin > 1:
398 tNameId = str(num_bin) + "bins"
399 else:
400 tNameId = str(num_bin) + "bin"
401 for tName in tNameList:
402 if tNameId in tName:
403 break
404
405 if "CNATable" in tName:
406 tName = tName[:len(tName)-4]
407 tableName = outFolder + "/table/" + tName
408 bufTable = bufLoc + "/" + tName
409 applyThreshold(tableName, bufTable, params.PVAL, 100000) #params.MAXGAP = 100000
410
411 # Large Region CBS
412 if (params.LARGE != "False"):
413 rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R")
414
415 args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s"
416 %(rScriptName2, tableName+".txt", params.SMALLSEGMENT, params.LARGESEGMENT, params.PVAL, params.PASSSIZE, params.LRS, params.LRE, bufLoc))
417 rscr2 = subprocess.call(args)
418
419 # Generate the DNA sequence (for VCF file)
420 bedFile = bufTable + ".BED"
421 bedFasta = bufTable + ".fastaOut.txt"
422 fastaFile = params.FASTA
423 args = shlex.split("fastaFromBed -fi %s -bed %s -fo %s -name"
424 %(fastaFile, bedFile, bedFasta))
425 fastaBED = subprocess.call(args)
426
427 # Write VCF
428 print "Creating VCF file ... "
429 vcfFile = tableName + ".vcf"
430 vcf_out(bedFasta, vcfFile)
431
432 print "%s created. " %(vcfFile)
433
434 else:
435 print "Table not found"
436
437 def removeTempFolder(tempFolderPath):
438 import shutil
439
440 shutil.rmtree(tempFolderPath)
441
442 print "Temp Folder Removed"
443
444
445 def main():
446 # option handling
447 params = Params()
448 params.repeat()
449
450 # output folder handling
451 outFolder = checkOutputFolder(params.OUTFOLDER)
452 bufLoc = outFolder + "/buf"
453
454 # convert target file
455 sorted_target = os.path.join(bufLoc, "target.BED")
456 os.system("sort -k1,1 -k2n %s > %s" %(params.TARGET, sorted_target))
457
458 # target breakdown
459 if params.MAXREGIONSIZE > 0:
460 new_target = os.path.join(bufLoc, "target_breakdown.BED")
461 target_breakdown(sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target)
462 sorted_target = new_target
463
464 targetList = convertTarget(sorted_target)
465
466 # convert sam to bam if -sam specified
467 if (params.SAM == "True"):
468 print "Pre-processing SAM files"
469
470 test_bam = bufLoc + "/test.BAM"
471 ctr_bam = bufLoc + "/control.BAM"
472
473 samTest = Process(target= samToBam, args=(params.TEST, test_bam))
474 if params.BEDINPUT == "False":
475 samCtr = Process(target= samToBam, args=(params.CONTROL, ctr_bam))
476
477 samTest.start()
478 if params.BEDINPUT == "False":
479 samCtr.start()
480
481 samTest.join()
482 if params.BEDINPUT == "False":
483 samCtr.join()
484
485 params.TEST = test_bam
486 if params.BEDINPUT == "False":
487 params.CONTROL = ctr_bam
488
489 # remove multi mapped reads if --nomultimapped is specified
490 if (params.NOMULTIMAPPED == "True"):
491 print "Removing multi-mapped reads"
492
493 test_bam = bufLoc + "/test_reliable.BAM"
494 ctr_bam = bufLoc + "/control_reliable.BAM"
495
496 bamTest = Process(target= removeMultiMapped, args=(params.TEST, test_bam))
497 if params.BEDINPUT == "False":
498 bamCtr = Process(target= removeMultiMapped, args=(params.CONTROL, ctr_bam))
499
500 bamTest.start()
501 if params.BEDINPUT == "False":
502 bamCtr.start()
503
504 bamTest.join()
505 if params.BEDINPUT == "False":
506 bamCtr.join()
507
508 params.TEST = test_bam
509 if params.BEDINPUT == "False":
510 params.CONTROL = ctr_bam
511
512 # Get Chromosome Length
513 genomeFile = bufLoc + '/sample.Genome'
514 get_genome(params.TEST, genomeFile)
515
516 # spawn bam converting scripts
517 pTest = Process(target= convertBam,
518 args=(params, bufLoc+'/testData/', targetList, genomeFile))
519
520 #BEDINPUT
521 if params.BEDINPUT == "False":
522
523 cTest = Process(target= convertBam,
524 args=(params, bufLoc+'/ctrData/' , targetList, genomeFile))
525 else:
526 cTest = Process(target= convertBamSimple,
527 args=(params, bufLoc+'/ctrData/', targetList, genomeFile))
528 # start the processes
529 pTest.start()
530 cTest.start()
531
532 # wait for all the processes to finish before continuing
533 pTest.join()
534 cTest.join()
535
536 # Get the read depth count from temporary folder
537 for folder in [bufLoc+'/testData/', bufLoc+'/ctrData/']:
538 if 'testData' in folder:
539 t1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n"))
540 else:
541 n1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n"))
542 print "Test file read depth = ", t1
543 print "Control file read depth = ", n1
544 print "Pre-processing Completed. "
545
546 # Get the Average of the Log Ratio
547 print "Getting the Log Ratio ... "
548 testListName = bufLoc + '/testData/geneRefCoverage.txt'
549 controlListName = bufLoc + '/ctrData/geneRefCoverage.txt'
550 avOut = bufLoc + "/average.txt"
551 averageCount(testListName, controlListName, avOut, t1, n1, params.MINREADDEPTH, params.MINNBASES)
552
553 # Analysis. [Bin, significance test, large deletion, vcf output]
554 print "Binning ... "
555 binProc = []
556 for numBin in params.NUMBIN:
557 binProc.append(Process(target= analysisPerBin, args=(params,numBin,outFolder,targetList)))
558
559 for proc in binProc:
560 proc.start()
561
562 for proc in binProc:
563 proc.join()
564
565 # Removed Temp Folder
566 removeTempFolder(bufLoc)
567
568 if __name__ == "__main__":
569 main()
570 print "Done... "