comparison picard_wrapper.py @ 0:1cd7f3b42609

Uploaded tool.
author devteam
date Tue, 23 Oct 2012 13:14:29 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1cd7f3b42609
1 #!/usr/bin/env python
2 """
3 Originally written by Kelly Vincent
4 pretty output and additional picard wrappers by Ross Lazarus for rgenetics
5 Runs all available wrapped Picard tools.
6 usage: picard_wrapper.py [options]
7 code Ross wrote licensed under the LGPL
8 see http://www.gnu.org/copyleft/lesser.html
9 """
10
11 import optparse, os, sys, subprocess, tempfile, shutil, time, logging
12
13 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
14 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
15 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
16 <head>
17 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
18 <meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
19 <title></title>
20 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
21 </head>
22 <body>
23 <div class="document">
24 """
25 galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
26 galhtmlpostfix = """</div></body></html>\n"""
27
28
29 def stop_err( msg ):
30 sys.stderr.write( '%s\n' % msg )
31 sys.exit()
32
33
34 def timenow():
35 """return current time as a string
36 """
37 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
38
39
40 class PicardBase():
41 """
42 simple base class with some utilities for Picard
43 adapted and merged with Kelly Vincent's code april 2011 Ross
44 lots of changes...
45 """
46
47 def __init__(self, opts=None,arg0=None):
48 """ common stuff needed at init for a picard tool
49 """
50 assert opts <> None, 'PicardBase needs opts at init'
51 self.opts = opts
52 if self.opts.outdir == None:
53 self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
54 assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
55 self.picname = self.baseName(opts.jar)
56 if self.picname.startswith('picard'):
57 self.picname = opts.picard_cmd # special case for some tools like replaceheader?
58 self.progname = self.baseName(arg0)
59 self.version = '0.002'
60 self.delme = [] # list of files to destroy
61 self.title = opts.title
62 self.inputfile = opts.input
63 try:
64 os.makedirs(opts.outdir)
65 except:
66 pass
67 try:
68 os.makedirs(opts.tmpdir)
69 except:
70 pass
71 self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
72 self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
73 self.setLogging(logfname=self.log_filename)
74
75 def baseName(self,name=None):
76 return os.path.splitext(os.path.basename(name))[0]
77
78 def setLogging(self,logfname="picard_wrapper.log"):
79 """setup a logger
80 """
81 logging.basicConfig(level=logging.INFO,
82 filename=logfname,
83 filemode='a')
84
85
86 def readLarge(self,fname=None):
87 """ read a potentially huge file.
88 """
89 try:
90 # get stderr, allowing for case where it's very large
91 tmp = open( fname, 'rb' )
92 s = ''
93 buffsize = 1048576
94 try:
95 while True:
96 more = tmp.read( buffsize )
97 if len(more) > 0:
98 s += more
99 else:
100 break
101 except OverflowError:
102 pass
103 tmp.close()
104 except Exception, e:
105 stop_err( 'Read Large Exception : %s' % str( e ) )
106 return s
107
108 def runCL(self,cl=None,output_dir=None):
109 """ construct and run a command line
110 we have galaxy's temp path as opt.temp_dir so don't really need isolation
111 sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
112 """
113 assert cl <> None, 'PicardBase runCL needs a command line as cl'
114 if output_dir == None:
115 output_dir = self.opts.outdir
116 if type(cl) == type([]):
117 cl = ' '.join(cl)
118 fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
119 tlf = open(templog,'wb')
120 fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
121 tef = open(temperr,'wb')
122 process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
123 rval = process.wait()
124 tlf.close()
125 tef.close()
126 stderrs = self.readLarge(temperr)
127 stdouts = self.readLarge(templog)
128 if rval > 0:
129 s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
130 stdouts = '%s\n%s' % (stdouts,stderrs)
131 else:
132 s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
133 logging.info(s)
134 os.unlink(templog) # always
135 os.unlink(temperr) # always
136 return s, stdouts, rval # sometimes s is an output
137
138 def runPic(self, jar, cl):
139 """
140 cl should be everything after the jar file name in the command
141 """
142 runme = ['java -Xmx%s' % self.opts.maxjheap]
143 runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
144 runme.append('-jar %s' % jar)
145 runme += cl
146 s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
147 return stdouts,rval
148
149 def samToBam(self,infile=None,outdir=None):
150 """
151 use samtools view to convert sam to bam
152 """
153 fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
154 cl = ['samtools view -h -b -S -o ',tempbam,infile]
155 tlog,stdouts,rval = self.runCL(cl,outdir)
156 return tlog,tempbam,rval
157
158 def sortSam(self, infile=None,outfile=None,outdir=None):
159 """
160 """
161 print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
162 cl = ['samtools sort',infile,outfile]
163 tlog,stdouts,rval = self.runCL(cl,outdir)
164 return tlog
165
166 def cleanup(self):
167 for fname in self.delme:
168 try:
169 os.unlink(fname)
170 except:
171 pass
172
173 def prettyPicout(self,transpose,maxrows):
174 """organize picard outpouts into a report html page
175 """
176 res = []
177 try:
178 r = open(self.metricsOut,'r').readlines()
179 except:
180 r = []
181 if len(r) > 0:
182 res.append('<b>Picard on line resources</b><ul>\n')
183 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
184 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
185 if transpose:
186 res.append('<b>Picard output (transposed to make it easier to see)</b><hr/>\n')
187 else:
188 res.append('<b>Picard output</b><hr/>\n')
189 res.append('<table cellpadding="3" >\n')
190 dat = []
191 heads = []
192 lastr = len(r) - 1
193 # special case for estimate library complexity hist
194 thist = False
195 for i,row in enumerate(r):
196 if row.strip() > '':
197 srow = row.split('\t')
198 if row.startswith('#'):
199 heads.append(row.strip()) # want strings
200 else:
201 dat.append(srow) # want lists
202 if row.startswith('## HISTOGRAM'):
203 thist = True
204 if len(heads) > 0:
205 hres = ['<tr class="d%d"><td colspan="2">%s</td></tr>' % (i % 2,x) for i,x in enumerate(heads)]
206 res += hres
207 heads = []
208 if len(dat) > 0:
209 if transpose and not thist:
210 tdat = map(None,*dat) # transpose an arbitrary list of lists
211 tdat = ['<tr class="d%d"><td>%s</td><td>%s&nbsp;</td></tr>\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
212 else:
213 tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
214 tdat = ['<tr class="d%d"><td colspan="2">%s</td></tr>\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
215 res += tdat
216 dat = []
217 res.append('</table>\n')
218 return res
219
220 def fixPicardOutputs(self,transpose,maxloglines):
221 """
222 picard produces long hard to read tab header files
223 make them available but present them transposed for readability
224 """
225 logging.shutdown()
226 self.cleanup() # remove temp files stored in delme
227 rstyle="""<style type="text/css">
228 tr.d0 td {background-color: oldlace; color: black;}
229 tr.d1 td {background-color: aliceblue; color: black;}
230 </style>"""
231 res = [rstyle,]
232 res.append(galhtmlprefix % self.progname)
233 res.append(galhtmlattr % (self.picname,timenow()))
234 flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
235 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
236 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
237 for p in pdflist:
238 pbase = os.path.splitext(p)[0] # removes .pdf
239 imghref = '%s.jpg' % pbase
240 mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
241 if mimghref in flist:
242 imghref=mimghref # only one for thumbnail...it's a multi page pdf
243 res.append('<table cellpadding="10"><tr><td>\n')
244 res.append('<a href="%s"><img src="%s" title="Click image preview for a print quality PDF version" hspace="10" align="middle"></a>\n' % (p,imghref))
245 res.append('</tr></td></table>\n')
246 if len(flist) > 0:
247 res.append('<b>The following output files were created (click the filename to view/download a copy):</b><hr/>')
248 res.append('<table>\n')
249 for i,f in enumerate(flist):
250 fn = os.path.split(f)[-1]
251 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
252 res.append('</table><p/>\n')
253 pres = self.prettyPicout(transpose,maxloglines)
254 if len(pres) > 0:
255 res += pres
256 l = open(self.log_filename,'r').readlines()
257 llen = len(l)
258 if llen > 0:
259 res.append('<b>Picard Tool Run Log</b><hr/>\n')
260 rlog = ['<pre>',]
261 if llen > maxloglines:
262 n = min(50,int(maxloglines/2))
263 rlog += l[:n]
264 rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
265 rlog += l[-n:]
266 else:
267 rlog += l
268 rlog.append('</pre>')
269 if llen > maxloglines:
270 rlog.append('\n<b>## WARNING - %d log lines truncated - <a href="%s">%s</a> contains entire output</b>' % (llen - maxloglines,self.log_filename,self.log_filename))
271 res += rlog
272 else:
273 res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
274 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
275 res.append( 'generated all outputs reported here running as a <a href="http://getgalaxy.org">Galaxy</a> tool')
276 res.append(galhtmlpostfix)
277 outf = open(self.opts.htmlout,'w')
278 outf.write(''.join(res))
279 outf.write('\n')
280 outf.close()
281
282 def makePicInterval(self,inbed=None,outf=None):
283 """
284 picard wants bait and target files to have the same header length as the incoming bam/sam
285 a meaningful (ie accurate) representation will fail because of this - so this hack
286 it would be far better to be able to supply the original bed untouched
287 Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
288 """
289 assert inbed <> None
290 bed = open(inbed,'r').readlines()
291 sbed = [x.split('\t') for x in bed] # lengths MUST be 5
292 lens = [len(x) for x in sbed]
293 strands = [x[3] for x in sbed if not x[3] in ['+','-']]
294 maxl = max(lens)
295 minl = min(lens)
296 e = []
297 if maxl <> minl:
298 e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
299 if maxl <> 5:
300 e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
301 if len(strands) > 0:
302 e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
303 if len(e) > 0: # write to stderr and quit
304 print >> sys.stderr, '\n'.join(e)
305 sys.exit(1)
306 thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
307 if self.opts.datatype == 'sam':
308 cl = ['samtools view -H -S',self.opts.input,'>',thead]
309 else:
310 cl = ['samtools view -H',self.opts.input,'>',thead]
311 self.runCL(cl=cl,output_dir=self.opts.outdir)
312 head = open(thead,'r').readlines()
313 s = '## got %d rows of header\n' % (len(head))
314 logging.info(s)
315 o = open(outf,'w')
316 o.write(''.join(head))
317 o.write(''.join(bed))
318 o.close()
319 return outf
320
321 def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
322 """
323 interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
324 Do the work of removing all the error sequences
325 pysam is cool
326 infile = pysam.Samfile( "-", "r" )
327 outfile = pysam.Samfile( "-", "w", template = infile )
328 for s in infile: outfile.write(s)
329
330 errors from ValidateSameFile.jar look like
331 WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
332 ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
333 ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
334
335 """
336 assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
337 assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
338 removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
339 remDict = dict(zip(removeNames,range(len(removeNames))))
340 infile = pysam.Samfile(insam,'rb')
341 info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
342 if len(removeNames) > 0:
343 outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
344 i = 0
345 j = 0
346 for row in infile:
347 dropme = remDict.get(row.qname,None) # keep if None
348 if not dropme:
349 outfile.write(row)
350 j += 1
351 else: # discard
352 i += 1
353 info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
354 outfile.close()
355 infile.close()
356 else: # we really want a nullop or a simple pointer copy
357 infile.close()
358 if newsam:
359 shutil.copy(insam,newsam)
360 logging.info(info)
361
362
363
364 def __main__():
365 doFix = False # tools returning htmlfile don't need this
366 doTranspose = True # default
367 maxloglines = 100 # default
368 #Parse Command Line
369 op = optparse.OptionParser()
370 # All tools
371 op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
372 op.add_option('-e', '--inputext', default=None)
373 op.add_option('-o', '--output', default=None)
374 op.add_option('-n', '--title', default="Pick a Picard Tool")
375 op.add_option('-t', '--htmlout', default=None)
376 op.add_option('-d', '--outdir', default=None)
377 op.add_option('-x', '--maxjheap', default='4g')
378 op.add_option('-b', '--bisulphite', default='false')
379 op.add_option('-s', '--sortorder', default='query')
380 op.add_option('','--tmpdir', default='/tmp')
381 op.add_option('-j','--jar',default='')
382 op.add_option('','--picard-cmd',default=None)
383 # Many tools
384 op.add_option( '', '--output-format', dest='output_format', help='Output format' )
385 op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
386 op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
387 # CreateSequenceDictionary
388 op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
389 op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
390 op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
391 op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
392 # MarkDuplicates
393 op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
394 op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
395 # CollectInsertSizeMetrics
396 op.add_option('', '--taillimit', default="0")
397 op.add_option('', '--histwidth', default="0")
398 op.add_option('', '--minpct', default="0.01")
399 op.add_option('', '--malevel', default='')
400 op.add_option('', '--deviations', default="0.0")
401 # CollectAlignmentSummaryMetrics
402 op.add_option('', '--maxinsert', default="20")
403 op.add_option('', '--adaptors', default='')
404 # FixMateInformation and validate
405 # CollectGcBiasMetrics
406 op.add_option('', '--windowsize', default='100')
407 op.add_option('', '--mingenomefrac', default='0.00001')
408 # AddOrReplaceReadGroups
409 op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
410 op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
411 op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
412 op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
413 op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
414 op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
415 op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
416 op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
417 # ReorderSam
418 op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
419 op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
420 # ReplaceSamHeader
421 op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
422
423 op.add_option('','--assumesorted', default='true')
424 op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
425 #estimatelibrarycomplexity
426 op.add_option('','--minid', default="5")
427 op.add_option('','--maxdiff', default="0.03")
428 op.add_option('','--minmeanq', default="20")
429 #hsmetrics
430 op.add_option('','--baitbed', default=None)
431 op.add_option('','--targetbed', default=None)
432 #validate
433 op.add_option('','--ignoreflags', action='append', type="string")
434 op.add_option('','--maxerrors', default=None)
435 op.add_option('','--datatype', default=None)
436 op.add_option('','--bamout', default=None)
437 op.add_option('','--samout', default=None)
438
439 opts, args = op.parse_args()
440 opts.sortme = opts.assumesorted == 'false'
441 assert opts.input <> None
442 # need to add
443 # instance that does all the work
444 pic = PicardBase(opts,sys.argv[0])
445
446 tmp_dir = opts.outdir
447 haveTempout = False # we use this where sam output is an option
448 rval = 0
449 stdouts = 'Not run yet'
450 # set ref and dict files to use (create if necessary)
451 ref_file_name = opts.ref
452 if opts.ref_file <> None:
453 csd = 'CreateSequenceDictionary'
454 realjarpath = os.path.split(opts.jar)[0]
455 jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
456 tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
457 ref_file_name = '%s.fasta' % tmp_ref_name
458 # build dict
459 dict_file_name = '%s.dict' % tmp_ref_name
460 os.symlink( opts.ref_file, ref_file_name )
461 cl = ['REFERENCE=%s' % ref_file_name]
462 cl.append('OUTPUT=%s' % dict_file_name)
463 cl.append('URI=%s' % os.path.basename( opts.ref_file ))
464 cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
465 if opts.species_name:
466 cl.append('SPECIES=%s' % opts.species_name)
467 if opts.build_name:
468 cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
469 pic.delme.append(dict_file_name)
470 pic.delme.append(ref_file_name)
471 pic.delme.append(tmp_ref_name)
472 stdouts,rval = pic.runPic(jarpath, cl)
473 # run relevant command(s)
474
475 # define temporary output
476 # if output is sam, it must have that extension, otherwise bam will be produced
477 # specify sam or bam file with extension
478 if opts.output_format == 'sam':
479 suff = '.sam'
480 else:
481 suff = ''
482 tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
483
484 cl = ['VALIDATION_STRINGENCY=LENIENT',]
485
486 if pic.picname == 'AddOrReplaceReadGroups':
487 # sort order to match Galaxy's default
488 cl.append('SORT_ORDER=coordinate')
489 # input
490 cl.append('INPUT=%s' % opts.input)
491 # outputs
492 cl.append('OUTPUT=%s' % tempout)
493 # required read groups
494 cl.append('RGLB="%s"' % opts.rg_library)
495 cl.append('RGPL="%s"' % opts.rg_platform)
496 cl.append('RGPU="%s"' % opts.rg_plat_unit)
497 cl.append('RGSM="%s"' % opts.rg_sample)
498 if opts.rg_id:
499 cl.append('RGID="%s"' % opts.rg_id)
500 # optional read groups
501 if opts.rg_seq_center:
502 cl.append('RGCN="%s"' % opts.rg_seq_center)
503 if opts.rg_desc:
504 cl.append('RGDS="%s"' % opts.rg_desc)
505 stdouts,rval = pic.runPic(opts.jar, cl)
506 haveTempout = True
507
508 elif pic.picname == 'BamIndexStats':
509 tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
510 tmp_bam_name = '%s.bam' % tmp_name
511 tmp_bai_name = '%s.bai' % tmp_bam_name
512 os.symlink( opts.input, tmp_bam_name )
513 os.symlink( opts.bai_file, tmp_bai_name )
514 cl.append('INPUT=%s' % ( tmp_bam_name ))
515 pic.delme.append(tmp_bam_name)
516 pic.delme.append(tmp_bai_name)
517 pic.delme.append(tmp_name)
518 stdouts,rval = pic.runPic( opts.jar, cl )
519 f = open(pic.metricsOut,'a')
520 f.write(stdouts) # got this on stdout from runCl
521 f.write('\n')
522 f.close()
523 doTranspose = False # but not transposed
524
525 elif pic.picname == 'EstimateLibraryComplexity':
526 cl.append('I=%s' % opts.input)
527 cl.append('O=%s' % pic.metricsOut)
528 if float(opts.minid) > 0:
529 cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
530 if float(opts.maxdiff) > 0.0:
531 cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
532 if float(opts.minmeanq) > 0:
533 cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
534 if opts.readregex > '':
535 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
536 if float(opts.optdupdist) > 0:
537 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
538 stdouts,rval = pic.runPic(opts.jar, cl)
539
540 elif pic.picname == 'CollectAlignmentSummaryMetrics':
541 # Why do we do this fakefasta thing?
542 # Because we need NO fai to be available or picard barfs unless it matches the input data.
543 # why? Dunno Seems to work without complaining if the .bai file is AWOL....
544 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
545 try:
546 os.symlink(ref_file_name,fakefasta)
547 except:
548 s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
549 info = s
550 shutil.copy(ref_file_name,fakefasta)
551 pic.delme.append(fakefasta)
552 cl.append('ASSUME_SORTED=true')
553 adaptlist = opts.adaptors.split(',')
554 adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
555 cl += adaptorseqs
556 cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
557 cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
558 cl.append('OUTPUT=%s' % pic.metricsOut)
559 cl.append('R=%s' % fakefasta)
560 cl.append('TMP_DIR=%s' % opts.tmpdir)
561 if not opts.assumesorted.lower() == 'true': # we need to sort input
562 sortedfile = '%s.sorted' % os.path.basename(opts.input)
563 if opts.datatype == 'sam': # need to work with a bam
564 tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
565 pic.delme.append(tempbam)
566 try:
567 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
568 except:
569 print '## exception on sorting sam file %s' % opts.input
570 else: # is already bam
571 try:
572 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
573 except : # bug - [bam_sort_core] not being ignored - TODO fixme
574 print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
575 cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
576 pic.delme.append(os.path.join(opts.outdir,sortedfile))
577 else:
578 cl.append('INPUT=%s' % os.path.abspath(opts.input))
579 stdouts,rval = pic.runPic(opts.jar, cl)
580
581
582 elif pic.picname == 'CollectGcBiasMetrics':
583 assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
584 # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
585 # why? Dunno
586 fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
587 try:
588 os.symlink(ref_file_name,fakefasta)
589 except:
590 s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
591 info = s
592 shutil.copy(ref_file_name,fakefasta)
593 pic.delme.append(fakefasta)
594 x = 'rgPicardGCBiasMetrics'
595 pdfname = '%s.pdf' % x
596 jpgname = '%s.jpg' % x
597 tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
598 temppdf = os.path.join(opts.outdir,pdfname)
599 cl.append('R=%s' % fakefasta)
600 cl.append('WINDOW_SIZE=%s' % opts.windowsize)
601 cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
602 cl.append('INPUT=%s' % opts.input)
603 cl.append('OUTPUT=%s' % tempout)
604 cl.append('TMP_DIR=%s' % opts.tmpdir)
605 cl.append('CHART_OUTPUT=%s' % temppdf)
606 cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
607 stdouts,rval = pic.runPic(opts.jar, cl)
608 if os.path.isfile(temppdf):
609 cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
610 s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
611 else:
612 s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
613 lf = open(pic.log_filename,'a')
614 lf.write(s)
615 lf.write('\n')
616 lf.close()
617
618 elif pic.picname == 'CollectInsertSizeMetrics':
619 """ <command interpreter="python">
620 picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
621 --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
622 -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
623 </command>
624 """
625 isPDF = 'InsertSizeHist.pdf'
626 pdfpath = os.path.join(opts.outdir,isPDF)
627 histpdf = 'InsertSizeHist.pdf'
628 cl.append('I=%s' % opts.input)
629 cl.append('O=%s' % pic.metricsOut)
630 cl.append('HISTOGRAM_FILE=%s' % histpdf)
631 #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
632 # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
633 if opts.histwidth <> '0':
634 cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
635 if float( opts.minpct) > 0.0:
636 cl.append('MINIMUM_PCT=%s' % opts.minpct)
637 if float(opts.deviations) > 0.0:
638 cl.append('DEVIATIONS=%s' % opts.deviations)
639 if opts.malevel:
640 malists = opts.malevel.split(',')
641 malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
642 cl += malist
643 stdouts,rval = pic.runPic(opts.jar, cl)
644 if os.path.exists(pdfpath): # automake thumbnail - will be added to html
645 cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
646 pic.runCL(cl=cl2,output_dir=opts.outdir)
647 else:
648 s = 'Unable to find expected pdf file %s<br/>\n' % pdfpath
649 s += 'This <b>always happens if single ended data was provided</b> to this tool,\n'
650 s += 'so please double check that your input data really is paired-end NGS data.<br/>\n'
651 s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n<br/>'
652 logging.info(s)
653 if len(stdouts) > 0:
654 logging.info(stdouts)
655
656 elif pic.picname == 'MarkDuplicates':
657 # assume sorted even if header says otherwise
658 cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
659 # input
660 cl.append('INPUT=%s' % opts.input)
661 # outputs
662 cl.append('OUTPUT=%s' % opts.output)
663 cl.append('METRICS_FILE=%s' % pic.metricsOut )
664 # remove or mark duplicates
665 cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
666 # the regular expression to be used to parse reads in incoming SAM file
667 cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
668 # maximum offset between two duplicate clusters
669 cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
670 stdouts,rval = pic.runPic(opts.jar, cl)
671
672 elif pic.picname == 'FixMateInformation':
673 cl.append('I=%s' % opts.input)
674 cl.append('O=%s' % tempout)
675 cl.append('SORT_ORDER=%s' % opts.sortorder)
676 stdouts,rval = pic.runPic(opts.jar,cl)
677 haveTempout = True
678
679 elif pic.picname == 'ReorderSam':
680 # input
681 cl.append('INPUT=%s' % opts.input)
682 # output
683 cl.append('OUTPUT=%s' % tempout)
684 # reference
685 cl.append('REFERENCE=%s' % ref_file_name)
686 # incomplete dict concordance
687 if opts.allow_inc_dict_concord == 'true':
688 cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
689 # contig length discordance
690 if opts.allow_contig_len_discord == 'true':
691 cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
692 stdouts,rval = pic.runPic(opts.jar, cl)
693 haveTempout = True
694
695 elif pic.picname == 'ReplaceSamHeader':
696 cl.append('INPUT=%s' % opts.input)
697 cl.append('OUTPUT=%s' % tempout)
698 cl.append('HEADER=%s' % opts.header_file)
699 stdouts,rval = pic.runPic(opts.jar, cl)
700 haveTempout = True
701
702 elif pic.picname == 'CalculateHsMetrics':
703 maxloglines = 100
704 baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
705 targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
706 baitf = pic.makePicInterval(opts.baitbed,baitfname)
707 if opts.targetbed == opts.baitbed: # same file sometimes
708 targetf = baitf
709 else:
710 targetf = pic.makePicInterval(opts.targetbed,targetfname)
711 cl.append('BAIT_INTERVALS=%s' % baitf)
712 cl.append('TARGET_INTERVALS=%s' % targetf)
713 cl.append('INPUT=%s' % os.path.abspath(opts.input))
714 cl.append('OUTPUT=%s' % pic.metricsOut)
715 cl.append('TMP_DIR=%s' % opts.tmpdir)
716 stdouts,rval = pic.runPic(opts.jar,cl)
717
718 elif pic.picname == 'ValidateSamFile':
719 import pysam
720 doTranspose = False
721 sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
722 stf = open(pic.log_filename,'w')
723 tlog = None
724 if opts.datatype == 'sam': # need to work with a bam
725 tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
726 try:
727 tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
728 except:
729 print '## exception on sorting sam file %s' % opts.input
730 else: # is already bam
731 try:
732 tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
733 except: # bug - [bam_sort_core] not being ignored - TODO fixme
734 print '## exception on sorting bam file %s' % opts.input
735 if tlog:
736 print '##tlog=',tlog
737 stf.write(tlog)
738 stf.write('\n')
739 sortedfile = '%s.bam' % sortedfile # samtools does that
740 cl.append('O=%s' % pic.metricsOut)
741 cl.append('TMP_DIR=%s' % opts.tmpdir)
742 cl.append('I=%s' % sortedfile)
743 opts.maxerrors = '99999999'
744 cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
745 if opts.ignoreflags[0] <> 'None': # picard error values to ignore
746 igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
747 cl.append(' '.join(igs))
748 if opts.bisulphite.lower() <> 'false':
749 cl.append('IS_BISULFITE_SEQUENCED=true')
750 if opts.ref <> None or opts.ref_file <> None:
751 cl.append('R=%s' % ref_file_name)
752 stdouts,rval = pic.runPic(opts.jar,cl)
753 if opts.datatype == 'sam':
754 pic.delme.append(tempbam)
755 newsam = opts.output
756 outformat = 'bam'
757 pe = open(pic.metricsOut,'r').readlines()
758 pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
759 pic.delme.append(sortedfile) # not wanted
760 stf.close()
761 pic.cleanup()
762 else:
763 print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
764 sys.exit(1)
765 if haveTempout:
766 # Some Picard tools produced a potentially intermediate bam file.
767 # Either just move to final location or create sam
768 if os.path.exists(tempout):
769 shutil.move(tempout, os.path.abspath(opts.output))
770 if opts.htmlout <> None or doFix: # return a pretty html page
771 pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
772 if rval <> 0:
773 print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
774 # signal failure
775 if __name__=="__main__": __main__()
776