Mercurial > repos > xuebing > sharplabtool
comparison tools/picard/picard_wrapper.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Originally written by Kelly Vincent | |
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics | |
5 Runs all available wrapped Picard tools. | |
6 usage: picard_wrapper.py [options] | |
7 code Ross wrote licensed under the LGPL | |
8 see http://www.gnu.org/copyleft/lesser.html | |
9 """ | |
10 | |
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging | |
12 | |
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
16 <head> | |
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
18 <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" /> | |
19 <title></title> | |
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
21 </head> | |
22 <body> | |
23 <div class="document"> | |
24 """ | |
25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>""" | |
26 galhtmlpostfix = """</div></body></html>\n""" | |
27 | |
28 | |
29 def stop_err( msg ): | |
30 sys.stderr.write( '%s\n' % msg ) | |
31 sys.exit() | |
32 | |
33 | |
34 def timenow(): | |
35 """return current time as a string | |
36 """ | |
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) | |
38 | |
39 | |
40 class PicardBase(): | |
41 """ | |
42 simple base class with some utilities for Picard | |
43 adapted and merged with Kelly Vincent's code april 2011 Ross | |
44 lots of changes... | |
45 """ | |
46 | |
47 def __init__(self, opts=None,arg0=None): | |
48 """ common stuff needed at init for a picard tool | |
49 """ | |
50 assert opts <> None, 'PicardBase needs opts at init' | |
51 self.opts = opts | |
52 if self.opts.outdir == None: | |
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir | |
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in' | |
55 self.picname = self.baseName(opts.jar) | |
56 if self.picname.startswith('picard'): | |
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader? | |
58 self.progname = self.baseName(arg0) | |
59 self.version = '0.002' | |
60 self.delme = [] # list of files to destroy | |
61 self.title = opts.title | |
62 self.inputfile = opts.input | |
63 try: | |
64 os.makedirs(opts.outdir) | |
65 except: | |
66 pass | |
67 try: | |
68 os.makedirs(opts.tmpdir) | |
69 except: | |
70 pass | |
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname) | |
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) | |
73 self.setLogging(logfname=self.log_filename) | |
74 | |
75 def baseName(self,name=None): | |
76 return os.path.splitext(os.path.basename(name))[0] | |
77 | |
78 def setLogging(self,logfname="picard_wrapper.log"): | |
79 """setup a logger | |
80 """ | |
81 logging.basicConfig(level=logging.INFO, | |
82 filename=logfname, | |
83 filemode='a') | |
84 | |
85 | |
86 def readLarge(self,fname=None): | |
87 """ read a potentially huge file. | |
88 """ | |
89 try: | |
90 # get stderr, allowing for case where it's very large | |
91 tmp = open( fname, 'rb' ) | |
92 s = '' | |
93 buffsize = 1048576 | |
94 try: | |
95 while True: | |
96 more = tmp.read( buffsize ) | |
97 if len(more) > 0: | |
98 s += more | |
99 else: | |
100 break | |
101 except OverflowError: | |
102 pass | |
103 tmp.close() | |
104 except Exception, e: | |
105 stop_err( 'Error : %s' % str( e ) ) | |
106 return s | |
107 | |
108 def runCL(self,cl=None,output_dir=None): | |
109 """ construct and run a command line | |
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation | |
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts | |
112 """ | |
113 assert cl <> None, 'PicardBase runCL needs a command line as cl' | |
114 if output_dir == None: | |
115 output_dir = self.opts.outdir | |
116 if type(cl) == type([]): | |
117 cl = ' '.join(cl) | |
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt') | |
119 tlf = open(templog,'wb') | |
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt') | |
121 tef = open(temperr,'wb') | |
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir) | |
123 rval = process.wait() | |
124 tlf.close() | |
125 tef.close() | |
126 stderrs = self.readLarge(temperr) | |
127 stdouts = self.readLarge(templog) | |
128 if len(stderrs) > 0: | |
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs) | |
130 else: | |
131 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval) | |
132 logging.info(s) | |
133 os.unlink(templog) # always | |
134 os.unlink(temperr) # always | |
135 return s, stdouts # sometimes this is an output | |
136 | |
137 def runPic(self, jar, cl): | |
138 """ | |
139 cl should be everything after the jar file name in the command | |
140 """ | |
141 runme = ['java -Xmx%s' % self.opts.maxjheap] | |
142 runme.append('-jar %s' % jar) | |
143 runme += cl | |
144 s,stdout = self.runCL(cl=runme, output_dir=self.opts.outdir) | |
145 return stdout | |
146 | |
147 def samToBam(self,infile=None,outdir=None): | |
148 """ | |
149 use samtools view to convert sam to bam | |
150 """ | |
151 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam') | |
152 cl = ['samtools view -h -b -S -o ',tempbam,infile] | |
153 tlog,stdouts = self.runCL(cl,outdir) | |
154 return tlog,tempbam | |
155 | |
156 #def bamToSam(self,infile=None,outdir=None): | |
157 # """ | |
158 # use samtools view to convert bam to sam | |
159 # """ | |
160 # fd,tempsam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.sam') | |
161 # cl = ['samtools view -h -o ',tempsam,infile] | |
162 # tlog,stdouts = self.runCL(cl,outdir) | |
163 # return tlog,tempsam | |
164 | |
165 def sortSam(self, infile=None,outfile=None,outdir=None): | |
166 """ | |
167 """ | |
168 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir) | |
169 cl = ['samtools sort',infile,outfile] | |
170 tlog,stdouts = self.runCL(cl,outdir) | |
171 return tlog | |
172 | |
173 def cleanup(self): | |
174 for fname in self.delme: | |
175 try: | |
176 os.unlink(fname) | |
177 except: | |
178 pass | |
179 | |
180 def prettyPicout(self,transpose,maxrows): | |
181 """organize picard outpouts into a report html page | |
182 """ | |
183 res = [] | |
184 try: | |
185 r = open(self.metricsOut,'r').readlines() | |
186 except: | |
187 r = [] | |
188 if len(r) > 0: | |
189 res.append('<b>Picard on line resources</b><ul>\n') | |
190 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n') | |
191 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n') | |
192 if transpose: | |
193 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n') | |
194 else: | |
195 res.append('<b>Picard output</b><hr/>\n') | |
196 res.append('<table cellpadding="3" >\n') | |
197 dat = [] | |
198 heads = [] | |
199 lastr = len(r) - 1 | |
200 # special case for estimate library complexity hist | |
201 thist = False | |
202 for i,row in enumerate(r): | |
203 if row.strip() > '': | |
204 srow = row.split('\t') | |
205 if row.startswith('#'): | |
206 heads.append(row.strip()) # want strings | |
207 else: | |
208 dat.append(srow) # want lists | |
209 if row.startswith('## HISTOGRAM'): | |
210 thist = True | |
211 if len(heads) > 0: | |
212 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)] | |
213 res += hres | |
214 heads = [] | |
215 if len(dat) > 0: | |
216 if transpose and not thist: | |
217 tdat = map(None,*dat) # transpose an arbitrary list of lists | |
218 tdat = ['<tr class="d%d"><td>%s</td><td>%s </td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] | |
219 else: | |
220 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :( | |
221 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)] | |
222 res += tdat | |
223 dat = [] | |
224 res.append('</table>\n') | |
225 return res | |
226 | |
227 def fixPicardOutputs(self,transpose,maxloglines): | |
228 """ | |
229 picard produces long hard to read tab header files | |
230 make them available but present them transposed for readability | |
231 """ | |
232 logging.shutdown() | |
233 self.cleanup() # remove temp files stored in delme | |
234 rstyle="""<style type="text/css"> | |
235 tr.d0 td {background-color: oldlace; color: black;} | |
236 tr.d1 td {background-color: aliceblue; color: black;} | |
237 </style>""" | |
238 res = [rstyle,] | |
239 res.append(galhtmlprefix % self.progname) | |
240 res.append(galhtmlattr % (self.picname,timenow())) | |
241 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] | |
242 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] | |
243 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs | |
244 for p in pdflist: | |
245 imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf | |
246 res.append('<table cellpadding="10"><tr><td>\n') | |
247 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref)) | |
248 res.append('</tr></td></table>\n') | |
249 if len(flist) > 0: | |
250 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>') | |
251 res.append('<table>\n') | |
252 for i,f in enumerate(flist): | |
253 fn = os.path.split(f)[-1] | |
254 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn)) | |
255 res.append('</table><p/>\n') | |
256 pres = self.prettyPicout(transpose,maxloglines) | |
257 if len(pres) > 0: | |
258 res += pres | |
259 l = open(self.log_filename,'r').readlines() | |
260 llen = len(l) | |
261 if llen > 0: | |
262 res.append('<b>Picard Tool Run Log</b><hr/>\n') | |
263 rlog = ['<pre>',] | |
264 if llen > maxloglines: | |
265 n = min(50,int(maxloglines/2)) | |
266 rlog += l[:n] | |
267 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines)) | |
268 rlog += l[-n:] | |
269 else: | |
270 rlog += l | |
271 rlog.append('</pre>') | |
272 if llen > maxloglines: | |
273 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename)) | |
274 res += rlog | |
275 else: | |
276 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename) | |
277 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') | |
278 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool') | |
279 res.append(galhtmlpostfix) | |
280 outf = open(self.opts.htmlout,'w') | |
281 outf.write(''.join(res)) | |
282 outf.write('\n') | |
283 outf.close() | |
284 | |
285 def makePicInterval(self,inbed=None,outf=None): | |
286 """ | |
287 picard wants bait and target files to have the same header length as the incoming bam/sam | |
288 a meaningful (ie accurate) representation will fail because of this - so this hack | |
289 it would be far better to be able to supply the original bed untouched | |
290 """ | |
291 assert inbed <> None | |
292 bed = open(inbed,'r').readlines() | |
293 thead = os.path.join(self.opts.outdir,'tempSamHead.txt') | |
294 if self.opts.datatype == 'sam': | |
295 cl = ['samtools view -H -S',self.opts.input,'>',thead] | |
296 else: | |
297 cl = ['samtools view -H',self.opts.input,'>',thead] | |
298 self.runCL(cl=cl,output_dir=self.opts.outdir) | |
299 head = open(thead,'r').readlines() | |
300 s = '## got %d rows of header\n' % (len(head)) | |
301 logging.info(s) | |
302 o = open(outf,'w') | |
303 o.write(''.join(head)) | |
304 o.write(''.join(bed)) | |
305 o.close() | |
306 return outf | |
307 | |
308 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None): | |
309 """ | |
310 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! | |
311 Do the work of removing all the error sequences | |
312 pysam is cool | |
313 infile = pysam.Samfile( "-", "r" ) | |
314 outfile = pysam.Samfile( "-", "w", template = infile ) | |
315 for s in infile: outfile.write(s) | |
316 | |
317 errors from ValidateSameFile.jar look like | |
318 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing | |
319 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. | |
320 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 | |
321 | |
322 """ | |
323 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam | |
324 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path' | |
325 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2] | |
326 remDict = dict(zip(removeNames,range(len(removeNames)))) | |
327 infile = pysam.Samfile(insam,'rb') | |
328 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict)) | |
329 if len(removeNames) > 0: | |
330 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file | |
331 i = 0 | |
332 j = 0 | |
333 for row in infile: | |
334 dropme = remDict.get(row.qname,None) # keep if None | |
335 if not dropme: | |
336 outfile.write(row) | |
337 j += 1 | |
338 else: # discard | |
339 i += 1 | |
340 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam)) | |
341 outfile.close() | |
342 infile.close() | |
343 else: # we really want a nullop or a simple pointer copy | |
344 infile.close() | |
345 if newsam: | |
346 shutil.copy(insam,newsam) | |
347 logging.info(info) | |
348 | |
349 | |
350 | |
351 def __main__(): | |
352 doFix = False # tools returning htmlfile don't need this | |
353 doTranspose = True # default | |
354 maxloglines = 100 # default | |
355 #Parse Command Line | |
356 op = optparse.OptionParser() | |
357 # All tools | |
358 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' ) | |
359 op.add_option('-e', '--inputext', default=None) | |
360 op.add_option('-o', '--output', default=None) | |
361 op.add_option('-n', '--title', default="Pick a Picard Tool") | |
362 op.add_option('-t', '--htmlout', default=None) | |
363 op.add_option('-d', '--outdir', default=None) | |
364 op.add_option('-x', '--maxjheap', default='4g') | |
365 op.add_option('-b', '--bisulphite', default='false') | |
366 op.add_option('-s', '--sortorder', default='query') | |
367 op.add_option('','--tmpdir', default='/tmp') | |
368 op.add_option('-j','--jar',default='') | |
369 op.add_option('','--picard-cmd',default=None) | |
370 # Many tools | |
371 op.add_option( '', '--output-format', dest='output_format', help='Output format' ) | |
372 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' ) | |
373 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None ) | |
374 # CreateSequenceDictionary | |
375 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None ) | |
376 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' ) | |
377 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' ) | |
378 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' ) | |
379 # MarkDuplicates | |
380 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' ) | |
381 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' ) | |
382 # CollectInsertSizeMetrics | |
383 op.add_option('', '--taillimit', default="0") | |
384 op.add_option('', '--histwidth', default="0") | |
385 op.add_option('', '--minpct', default="0.01") | |
386 # CollectAlignmentSummaryMetrics | |
387 op.add_option('', '--maxinsert', default="20") | |
388 op.add_option('', '--adaptors', action='append', type="string") | |
389 # FixMateInformation and validate | |
390 # CollectGcBiasMetrics | |
391 op.add_option('', '--windowsize', default='100') | |
392 op.add_option('', '--mingenomefrac', default='0.00001') | |
393 # AddOrReplaceReadGroups | |
394 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' ) | |
395 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' ) | |
396 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' ) | |
397 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' ) | |
398 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' ) | |
399 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' ) | |
400 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' ) | |
401 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' ) | |
402 # ReorderSam | |
403 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' ) | |
404 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' ) | |
405 # ReplaceSamHeader | |
406 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' ) | |
407 | |
408 op.add_option('','--assumesorted', default='true') | |
409 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*") | |
410 #estimatelibrarycomplexity | |
411 op.add_option('','--minid', default="5") | |
412 op.add_option('','--maxdiff', default="0.03") | |
413 op.add_option('','--minmeanq', default="20") | |
414 #hsmetrics | |
415 op.add_option('','--baitbed', default=None) | |
416 op.add_option('','--targetbed', default=None) | |
417 #validate | |
418 op.add_option('','--ignoreflags', action='append', type="string") | |
419 op.add_option('','--maxerrors', default=None) | |
420 op.add_option('','--datatype', default=None) | |
421 op.add_option('','--bamout', default=None) | |
422 op.add_option('','--samout', default=None) | |
423 | |
424 opts, args = op.parse_args() | |
425 opts.sortme = opts.assumesorted == 'false' | |
426 assert opts.input <> None | |
427 # need to add | |
428 # instance that does all the work | |
429 pic = PicardBase(opts,sys.argv[0]) | |
430 | |
431 tmp_dir = opts.outdir | |
432 haveTempout = False # we use this where sam output is an option | |
433 | |
434 # set ref and dict files to use (create if necessary) | |
435 ref_file_name = opts.ref | |
436 if opts.ref_file <> None: | |
437 csd = 'CreateSequenceDictionary' | |
438 realjarpath = os.path.split(opts.jar)[0] | |
439 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq | |
440 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname) | |
441 ref_file_name = '%s.fasta' % tmp_ref_name | |
442 # build dict | |
443 dict_file_name = '%s.dict' % tmp_ref_name | |
444 os.symlink( opts.ref_file, ref_file_name ) | |
445 cl = ['REFERENCE=%s' % ref_file_name] | |
446 cl.append('OUTPUT=%s' % dict_file_name) | |
447 cl.append('URI=%s' % os.path.basename( opts.ref_file )) | |
448 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names) | |
449 if opts.species_name: | |
450 cl.append('SPECIES=%s' % opts.species_name) | |
451 if opts.build_name: | |
452 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name) | |
453 pic.delme.append(dict_file_name) | |
454 pic.delme.append(ref_file_name) | |
455 pic.delme.append(tmp_ref_name) | |
456 s = pic.runPic(jarpath, cl) | |
457 # run relevant command(s) | |
458 | |
459 # define temporary output | |
460 # if output is sam, it must have that extension, otherwise bam will be produced | |
461 # specify sam or bam file with extension | |
462 if opts.output_format == 'sam': | |
463 suff = '.sam' | |
464 else: | |
465 suff = '' | |
466 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff ) | |
467 | |
468 cl = ['VALIDATION_STRINGENCY=LENIENT',] | |
469 | |
470 if pic.picname == 'AddOrReplaceReadGroups': | |
471 # sort order to match Galaxy's default | |
472 cl.append('SORT_ORDER=coordinate') | |
473 # input | |
474 cl.append('INPUT=%s' % opts.input) | |
475 # outputs | |
476 cl.append('OUTPUT=%s' % tempout) | |
477 # required read groups | |
478 cl.append('RGLB="%s"' % opts.rg_library) | |
479 cl.append('RGPL="%s"' % opts.rg_platform) | |
480 cl.append('RGPU="%s"' % opts.rg_plat_unit) | |
481 cl.append('RGSM="%s"' % opts.rg_sample) | |
482 if opts.rg_id: | |
483 cl.append('RGID="%s"' % opts.rg_id) | |
484 # optional read groups | |
485 if opts.rg_seq_center: | |
486 cl.append('RGCN="%s"' % opts.rg_seq_center) | |
487 if opts.rg_desc: | |
488 cl.append('RGDS="%s"' % opts.rg_desc) | |
489 pic.runPic(opts.jar, cl) | |
490 haveTempout = True | |
491 | |
492 elif pic.picname == 'BamIndexStats': | |
493 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir ) | |
494 tmp_bam_name = '%s.bam' % tmp_name | |
495 tmp_bai_name = '%s.bai' % tmp_bam_name | |
496 os.symlink( opts.input, tmp_bam_name ) | |
497 os.symlink( opts.bai_file, tmp_bai_name ) | |
498 cl.append('INPUT=%s' % ( tmp_bam_name )) | |
499 pic.delme.append(tmp_bam_name) | |
500 pic.delme.append(tmp_bai_name) | |
501 pic.delme.append(tmp_name) | |
502 s = pic.runPic( opts.jar, cl ) | |
503 f = open(pic.metricsOut,'a') | |
504 f.write(s) # got this on stdout from runCl | |
505 f.write('\n') | |
506 f.close() | |
507 doTranspose = False # but not transposed | |
508 | |
509 elif pic.picname == 'EstimateLibraryComplexity': | |
510 cl.append('I=%s' % opts.input) | |
511 cl.append('O=%s' % pic.metricsOut) | |
512 if float(opts.minid) > 0: | |
513 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid) | |
514 if float(opts.maxdiff) > 0.0: | |
515 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff) | |
516 if float(opts.minmeanq) > 0: | |
517 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq) | |
518 if opts.readregex > '': | |
519 cl.append('READ_NAME_REGEX="%s"' % opts.readregex) | |
520 if float(opts.optdupdist) > 0: | |
521 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) | |
522 pic.runPic(opts.jar,cl) | |
523 | |
524 elif pic.picname == 'CollectAlignmentSummaryMetrics': | |
525 # Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. | |
526 # why? Dunno Seems to work without complaining if the .bai file is AWOL.... | |
527 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) | |
528 try: | |
529 os.symlink(ref_file_name,fakefasta) | |
530 except: | |
531 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' | |
532 info = s | |
533 shutil.copy(ref_file_name,fakefasta) | |
534 pic.delme.append(fakefasta) | |
535 cl.append('ASSUME_SORTED=%s' % opts.assumesorted) | |
536 adaptorseqs = ''.join([' ADAPTER_SEQUENCE=%s' % x for x in opts.adaptors]) | |
537 cl.append(adaptorseqs) | |
538 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite) | |
539 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert) | |
540 cl.append('OUTPUT=%s' % pic.metricsOut) | |
541 cl.append('R=%s' % fakefasta) | |
542 cl.append('TMP_DIR=%s' % opts.tmpdir) | |
543 if not opts.assumesorted.lower() == 'true': # we need to sort input | |
544 fakeinput = '%s.sorted' % opts.input | |
545 s = pic.sortSam(opts.input, fakeinput, opts.outdir) | |
546 pic.delme.append(fakeinput) | |
547 cl.append('INPUT=%s' % fakeinput) | |
548 else: | |
549 cl.append('INPUT=%s' % os.path.abspath(opts.input)) | |
550 pic.runPic(opts.jar,cl) | |
551 | |
552 | |
553 elif pic.picname == 'CollectGcBiasMetrics': | |
554 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name | |
555 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. | |
556 # why? Dunno | |
557 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) | |
558 try: | |
559 os.symlink(ref_file_name,fakefasta) | |
560 except: | |
561 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' | |
562 info = s | |
563 shutil.copy(ref_file_name,fakefasta) | |
564 pic.delme.append(fakefasta) | |
565 x = 'rgPicardGCBiasMetrics' | |
566 pdfname = '%s.pdf' % x | |
567 jpgname = '%s.jpg' % x | |
568 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out') | |
569 temppdf = os.path.join(opts.outdir,pdfname) | |
570 cl.append('R=%s' % fakefasta) | |
571 cl.append('WINDOW_SIZE=%s' % opts.windowsize) | |
572 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac) | |
573 cl.append('INPUT=%s' % opts.input) | |
574 cl.append('OUTPUT=%s' % tempout) | |
575 cl.append('TMP_DIR=%s' % opts.tmpdir) | |
576 cl.append('CHART_OUTPUT=%s' % temppdf) | |
577 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut) | |
578 pic.runPic(opts.jar,cl) | |
579 if os.path.isfile(temppdf): | |
580 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find | |
581 s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) | |
582 else: | |
583 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf | |
584 lf = open(pic.log_filename,'a') | |
585 lf.write(s) | |
586 lf.write('\n') | |
587 lf.close() | |
588 | |
589 elif pic.picname == 'CollectInsertSizeMetrics': | |
590 isPDF = 'InsertSizeHist.pdf' | |
591 pdfpath = os.path.join(opts.outdir,isPDF) | |
592 histpdf = 'InsertSizeHist.pdf' | |
593 cl.append('I=%s' % opts.input) | |
594 cl.append('O=%s' % pic.metricsOut) | |
595 cl.append('HISTOGRAM_FILE=%s' % histpdf) | |
596 if opts.taillimit <> '0': | |
597 cl.append('TAIL_LIMIT=%s' % opts.taillimit) | |
598 if opts.histwidth <> '0': | |
599 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth) | |
600 if float( opts.minpct) > 0.0: | |
601 cl.append('MINIMUM_PCT=%s' % opts.minpct) | |
602 pic.runPic(opts.jar,cl) | |
603 if os.path.exists(pdfpath): # automake thumbnail - will be added to html | |
604 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath] | |
605 s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) | |
606 else: | |
607 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath | |
608 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n' | |
609 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n' | |
610 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>' | |
611 stdouts = '' | |
612 logging.info(s) | |
613 if len(stdouts) > 0: | |
614 logging.info(stdouts) | |
615 | |
616 elif pic.picname == 'MarkDuplicates': | |
617 # assume sorted even if header says otherwise | |
618 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted)) | |
619 # input | |
620 cl.append('INPUT=%s' % opts.input) | |
621 # outputs | |
622 cl.append('OUTPUT=%s' % opts.output) | |
623 cl.append('METRICS_FILE=%s' % pic.metricsOut ) | |
624 # remove or mark duplicates | |
625 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups) | |
626 # the regular expression to be used to parse reads in incoming SAM file | |
627 cl.append('READ_NAME_REGEX="%s"' % opts.readregex) | |
628 # maximum offset between two duplicate clusters | |
629 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) | |
630 pic.runPic(opts.jar, cl) | |
631 | |
632 elif pic.picname == 'FixMateInformation': | |
633 cl.append('I=%s' % opts.input) | |
634 cl.append('O=%s' % tempout) | |
635 cl.append('SORT_ORDER=%s' % opts.sortorder) | |
636 pic.runPic(opts.jar,cl) | |
637 haveTempout = True | |
638 | |
639 elif pic.picname == 'ReorderSam': | |
640 # input | |
641 cl.append('INPUT=%s' % opts.input) | |
642 # output | |
643 cl.append('OUTPUT=%s' % tempout) | |
644 # reference | |
645 cl.append('REFERENCE=%s' % ref_file_name) | |
646 # incomplete dict concordance | |
647 if opts.allow_inc_dict_concord == 'true': | |
648 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true') | |
649 # contig length discordance | |
650 if opts.allow_contig_len_discord == 'true': | |
651 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true') | |
652 pic.runPic(opts.jar, cl) | |
653 haveTempout = True | |
654 | |
655 elif pic.picname == 'ReplaceSamHeader': | |
656 cl.append('INPUT=%s' % opts.input) | |
657 cl.append('OUTPUT=%s' % tempout) | |
658 cl.append('HEADER=%s' % opts.header_file) | |
659 pic.runPic(opts.jar, cl) | |
660 haveTempout = True | |
661 | |
662 elif pic.picname == 'CalculateHsMetrics': | |
663 maxloglines = 100 | |
664 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait') | |
665 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target') | |
666 baitf = pic.makePicInterval(opts.baitbed,baitfname) | |
667 if opts.targetbed == opts.baitbed: # same file sometimes | |
668 targetf = baitf | |
669 else: | |
670 targetf = pic.makePicInterval(opts.targetbed,targetfname) | |
671 cl.append('BAIT_INTERVALS=%s' % baitf) | |
672 cl.append('TARGET_INTERVALS=%s' % targetf) | |
673 cl.append('INPUT=%s' % os.path.abspath(opts.input)) | |
674 cl.append('OUTPUT=%s' % pic.metricsOut) | |
675 cl.append('TMP_DIR=%s' % opts.tmpdir) | |
676 pic.runPic(opts.jar,cl) | |
677 | |
678 elif pic.picname == 'ValidateSamFile': | |
679 import pysam | |
680 doTranspose = False | |
681 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted') | |
682 stf = open(pic.log_filename,'w') | |
683 tlog = None | |
684 if opts.datatype == 'sam': # need to work with a bam | |
685 tlog,tempbam = pic.samToBam(opts.input,opts.outdir) | |
686 try: | |
687 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) | |
688 except: | |
689 print '## exception on sorting sam file %s' % opts.input | |
690 else: # is already bam | |
691 try: | |
692 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir) | |
693 except: # bug - [bam_sort_core] not being ignored - TODO fixme | |
694 print '## exception on sorting bam file %s' % opts.input | |
695 if tlog: | |
696 print '##tlog=',tlog | |
697 stf.write(tlog) | |
698 stf.write('\n') | |
699 sortedfile = '%s.bam' % sortedfile # samtools does that | |
700 cl.append('O=%s' % pic.metricsOut) | |
701 cl.append('TMP_DIR=%s' % opts.tmpdir) | |
702 cl.append('I=%s' % sortedfile) | |
703 opts.maxerrors = '99999999' | |
704 cl.append('MAX_OUTPUT=%s' % opts.maxerrors) | |
705 if opts.ignoreflags[0] <> 'None': # picard error values to ignore | |
706 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None'] | |
707 cl.append(' '.join(igs)) | |
708 if opts.bisulphite.lower() <> 'false': | |
709 cl.append('IS_BISULFITE_SEQUENCED=true') | |
710 if opts.ref <> None or opts.ref_file <> None: | |
711 cl.append('R=%s' % ref_file_name) | |
712 pic.runPic(opts.jar,cl) | |
713 if opts.datatype == 'sam': | |
714 pic.delme.append(tempbam) | |
715 newsam = opts.output | |
716 outformat = 'bam' | |
717 pe = open(pic.metricsOut,'r').readlines() | |
718 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat) | |
719 pic.delme.append(sortedfile) # not wanted | |
720 stf.close() | |
721 pic.cleanup() | |
722 else: | |
723 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname | |
724 sys.exit(1) | |
725 if haveTempout: | |
726 # Some Picard tools produced a potentially intermediate bam file. | |
727 # Either just move to final location or create sam | |
728 shutil.move(tempout, os.path.abspath(opts.output)) | |
729 | |
730 if opts.htmlout <> None or doFix: # return a pretty html page | |
731 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines) | |
732 | |
733 if __name__=="__main__": __main__() | |
734 |