comparison rgFastQC.py @ 1:8fae48caaf06 draft

Uploaded from GH
author devteam
date Tue, 11 Nov 2014 12:46:27 -0500
parents e28c965eeed4
children 0b201de108b9
comparison
equal deleted inserted replaced
0:e28c965eeed4 1:8fae48caaf06
1 """ 1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused 2 Rewrite of rgFastQC.py for Version 0.11.2 of FastQC.
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7 3
8 called as 4 Changes implemented from tmcgowan at
9 <command interpreter="python"> 5 https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" 6 and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc
11 </command> 7 with minor changes and bug fixes
12 8
9 SYNOPSIS
13 10
11 rgFastQC.py -i input_file -j input_file.name -o output_html_file [-d output_directory]
12 [-f fastq|bam|sam] [-n job_name] [-c contaminant_file] [-e fastqc_executable]
14 13
15 Current release seems overly intolerant of sam/bam header strangeness 14 EXAMPLE (generated by Galaxy)
16 Author notified...
17 15
16 rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder
17 -f fastq -n FastQC -c path/dataset_2.dat -e fastqc
18 18
19 """ 19 """
20
20 import re 21 import re
21 import os 22 import os
22 import sys 23 import shutil
23 import subprocess 24 import subprocess
24 import optparse 25 import optparse
25 import shutil
26 import tempfile 26 import tempfile
27 import glob
28 import gzip
29 import bz2
27 import zipfile 30 import zipfile
28 import gzip
29 31
30 32 class FastQCRunner(object):
31 def getFileString(fpath, outpath):
32 """
33 format a nice file size string
34 """
35 size = ''
36 fp = os.path.join(outpath, fpath)
37 s = '? ?'
38 if os.path.isfile(fp):
39 n = float(os.path.getsize(fp))
40 if n > 2**20:
41 size = ' (%1.1f MB)' % (n/2**20)
42 elif n > 2**10:
43 size = ' (%1.1f KB)' % (n/2**10)
44 elif n > 0:
45 size = ' (%d B)' % (int(n))
46 s = '%s %s' % (fpath, size)
47 return s
48
49
50 class FastQC():
51 """wrapper
52 """
53
54 33
55 def __init__(self,opts=None): 34 def __init__(self,opts=None):
56 assert opts <> None 35 '''
36 Initializes an object to run FastQC in Galaxy. To start the process, use the function run_fastqc()
37 '''
38
39 # Check whether the options are specified and saves them into the object
40 assert opts != None
57 self.opts = opts 41 self.opts = opts
42
43 def prepare_command_line(self):
44 '''
45 Develops the Commandline to run FastQC in Galaxy
46 '''
58 47
59 48 # Check whether a given file compression format is valid
60 def run_fastqc(self): 49 # This prevents uncompression of already uncompressed files
61 """
62 In batch mode fastqc behaves not very nicely - will write to a new folder in
63 the same place as the infile called [infilebasename]_fastqc
64 rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
65 duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt
66 error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png
67 fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png
68
69 """
70 serr = ''
71 dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
72 sout = open(tlog, 'w')
73 fastq = os.path.basename(self.opts.input)
74 cl = [self.opts.executable,'--outdir=%s' % self.opts.outputdir]
75 if self.opts.informat in ['sam','bam']:
76 cl.append('--f=%s' % self.opts.informat)
77 if self.opts.contaminants <> None :
78 cl.append('--contaminants=%s' % self.opts.contaminants)
79 # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
80 # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
81 infname = self.opts.inputfilename 50 infname = self.opts.inputfilename
82 linf = infname.lower() 51 linf = infname.lower()
83 trimext = False 52 trimext = False
84 # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf 53 # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
85 # patched may 29 2013 until this is fixed properly 54 # patched may 29 2013 until this is fixed properly
86 if ( linf.endswith('.gz') or linf.endswith('.gzip') ): 55 if ( linf.endswith('.gz') or linf.endswith('.gzip') ):
87 f = gzip.open(self.opts.input) 56 f = gzip.open(self.opts.input)
88 try: 57 try:
89 testrow = f.readline() 58 f.readline()
90 except: 59 except:
91 trimext = True 60 trimext = True
92 f.close() 61 f.close()
93 elif linf.endswith('bz2'): 62 elif linf.endswith('bz2'):
94 f = bz2.open(self.opts.input,'rb') 63 f = bz2.open(self.opts.input,'rb')
99 f.close() 68 f.close()
100 elif linf.endswith('.zip'): 69 elif linf.endswith('.zip'):
101 if not zipfile.is_zipfile(self.opts.input): 70 if not zipfile.is_zipfile(self.opts.input):
102 trimext = True 71 trimext = True
103 if trimext: 72 if trimext:
104 infname = os.path.splitext(infname)[0] 73 f = open(self.opts.input)
105 fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname)) 74 try:
106 link_name = os.path.join(self.opts.outputdir, fastqinfilename) 75 f.readline()
107 os.symlink(self.opts.input, link_name) 76 except:
108 cl.append(link_name) 77 raise Exception("Input file corruption, could not identify the filetype")
109 sout.write('# FastQC cl = %s\n' % ' '.join(cl)) 78 infname = os.path.splitext(infname)[0]
110 sout.flush() 79
111 p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir) 80 # Replace unwanted or problematic characters in the input file name
112 retval = p.wait() 81 self.fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
82
83 # Build the Commandline from the given parameters
84 command_line = [opts.executable, '--outdir %s' % opts.outputdir]
85 if opts.contaminants != None:
86 command_line.append('--contaminants %s' % opts.contaminants)
87 if opts.limits != None:
88 command_line.append('--limits %s' % opts.limits)
89 command_line.append('--quiet')
90 command_line.append('--extract') # to access the output text file
91 command_line.append(self.fastqinfilename)
92 self.command_line = ' '.join(command_line)
93
94 def copy_output_file_to_dataset(self):
95 '''
96 Retrieves the output html and text files from the output directory and copies them to the Galaxy output files
97 '''
98
99 # retrieve html file
100 result_file = glob.glob(opts.outputdir + '/*html')
101 with open(result_file[0], 'rb') as fsrc:
102 with open(self.opts.htmloutput, 'wb') as fdest:
103 shutil.copyfileobj(fsrc, fdest)
104
105 # retrieve text file
106 text_file = glob.glob(opts.outputdir + '/*/fastqc_data.txt')
107 with open(text_file[0], 'rb') as fsrc:
108 with open(self.opts.textoutput, 'wb') as fdest:
109 shutil.copyfileobj(fsrc, fdest)
110
111 def run_fastqc(self):
112 '''
113 Executes FastQC. Make sure the mandatory import parameters input, inputfilename, outputdir and htmloutput have been specified in the options (opts)
114 '''
115
116 # Create a log file
117 dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
118 sout = open(tlog, 'w')
119
120 self.prepare_command_line()
121 sout.write(self.command_line)
122 sout.write('\n')
123 sout.write("Creating symlink\n") # between the input (.dat) file and the given input file name
124 os.symlink(self.opts.input, self.fastqinfilename)
125 sout.write("check_call\n")
126 subprocess.check_call(self.command_line, shell=True)
127 sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput))
128 self.copy_output_file_to_dataset()
129 sout.write("Finished")
113 sout.close() 130 sout.close()
114 runlog = open(tlog,'r').readlines()
115 os.unlink(link_name)
116 flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
117 odpath = None
118 for f in flist:
119 d = os.path.join(self.opts.outputdir,f)
120 if os.path.isdir(d):
121 if d.endswith('_fastqc'):
122 odpath = d
123 hpath = None
124 if odpath <> None:
125 try:
126 hpath = os.path.join(odpath,'fastqc_report.html')
127 rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
128 except:
129 pass
130 if hpath == None:
131 serr = '\n'.join(runlog)
132 res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
133 res += runlog
134 res += ['</pre>\n',
135 'Please read the above for clues<br/>\n',
136 'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
137 'It is also possible that the log shows that fastqc is not installed?<br/>\n',
138 'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
139 'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
140 return res,1,serr
141 self.fix_fastqcimages(odpath)
142 flist = os.listdir(self.opts.outputdir) # these have now been fixed
143 excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
144 flist = [x for x in flist if not x in excludefiles]
145 for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in latest fastqc code - ugh
146 rep[i] = rep[i].replace('Icons/','')
147 rep[i] = rep[i].replace('Images/','')
148
149 html = self.fix_fastqc(rep,flist,runlog)
150 return html,retval,serr
151
152
153
154 def fix_fastqc(self,rep=[],flist=[],runlog=[]):
155 """ add some of our stuff to the html
156 """
157 bodyindex = len(rep) -1 # hope they don't change this
158 footrow = bodyindex - 1
159 footer = rep[footrow]
160 rep = rep[:footrow] + rep[footrow+1:]
161 res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
162 flist.sort()
163 for i,f in enumerate(flist):
164 if not(os.path.isdir(f)):
165 fn = os.path.split(f)[-1]
166 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
167 res.append('</table>\n')
168 res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
169 res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
170 res.append(footer)
171 fixed = rep[:bodyindex] + res + rep[bodyindex:]
172 return fixed # with our additions
173
174
175 def fix_fastqcimages(self,odpath):
176 """ Galaxy wants everything in the same files_dir
177 """
178 icpath = os.path.join(odpath,'Icons')
179 impath = os.path.join(odpath,'Images')
180 for adir in [icpath,impath,odpath]:
181 if os.path.exists(adir):
182 flist = os.listdir(adir) # get all files created
183 for f in flist:
184 if not os.path.isdir(os.path.join(adir,f)):
185 sauce = os.path.join(adir,f)
186 dest = os.path.join(self.opts.outputdir,f)
187 shutil.move(sauce,dest)
188 os.rmdir(adir)
189
190
191 131
192 if __name__ == '__main__': 132 if __name__ == '__main__':
193 op = optparse.OptionParser() 133 op = optparse.OptionParser()
194 op.add_option('-i', '--input', default=None) 134 op.add_option('-i', '--input', default=None)
195 op.add_option('-j', '--inputfilename', default=None) 135 op.add_option('-j', '--inputfilename', default=None)
196 op.add_option('-o', '--htmloutput', default=None) 136 op.add_option('-o', '--htmloutput', default=None)
137 op.add_option('-t', '--textoutput', default=None)
197 op.add_option('-d', '--outputdir', default="/tmp/shortread") 138 op.add_option('-d', '--outputdir', default="/tmp/shortread")
198 op.add_option('-f', '--informat', default='fastq') 139 op.add_option('-f', '--informat', default='fastq')
199 op.add_option('-n', '--namejob', default='rgFastQC') 140 op.add_option('-n', '--namejob', default='rgFastQC')
200 op.add_option('-c', '--contaminants', default=None) 141 op.add_option('-c', '--contaminants', default=None)
142 op.add_option('-l', '--limits', default=None)
201 op.add_option('-e', '--executable', default='fastqc') 143 op.add_option('-e', '--executable', default='fastqc')
202 opts, args = op.parse_args() 144 opts, args = op.parse_args()
203 assert opts.input <> None 145
146 assert opts.input != None
147 assert opts.inputfilename != None
148 assert opts.htmloutput != None
204 assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable 149 assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
205 if not os.path.exists(opts.outputdir): 150 if not os.path.exists(opts.outputdir):
206 os.makedirs(opts.outputdir) 151 os.makedirs(opts.outputdir)
207 f = FastQC(opts)
208 html,retval,serr = f.run_fastqc()
209 f = open(opts.htmloutput, 'w')
210 f.write(''.join(html))
211 f.close()
212 if retval <> 0:
213 print >> sys.stderr, serr # indicate failure
214
215 152
216 153 fastqc_runner = FastQCRunner(opts)
154 fastqc_runner.run_fastqc()