| 1 | 1 """ | 
|  | 2 # May 2013 ross added check for bogus gz extension - fastqc gets confused | 
|  | 3 # added sanitizer for user supplied name | 
|  | 4 # removed shell and make cl a sequence for Popen call | 
|  | 5 # ross lazarus August 10 2012 in response to anon insecurity report | 
|  | 6 wrapper for fastqc | 
|  | 7 | 
|  | 8 called as | 
|  | 9   <command interpreter="python"> | 
|  | 10     rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" | 
|  | 11   </command> | 
|  | 12 | 
|  | 13 | 
|  | 14 | 
|  | 15 Current release seems overly intolerant of sam/bam header strangeness | 
|  | 16 Author notified... | 
|  | 17 | 
|  | 18 | 
|  | 19 """ | 
|  | 20 import re | 
|  | 21 import os | 
|  | 22 import sys | 
|  | 23 import subprocess | 
|  | 24 import optparse | 
|  | 25 import shutil | 
|  | 26 import tempfile | 
|  | 27 import zipfile | 
|  | 28 import gzip | 
|  | 29 | 
def pathfind(program):
    """Return the path of an executable called ``program``, or None.

    Original note (June 5 2013): this appears to have been added as a
    workaround for unreliable tool shed PATH handling.

    If ``program`` contains a directory component it is checked as given.
    Otherwise every entry of the ``PATH`` environment variable is tried in
    order and the first hit wins.

    :param program: executable name (e.g. ``fastqc``) or a path to it.
    :returns: ``program`` itself or the joined ``PATH`` candidate when it is
        an existing file with execute permission; ``None`` if nothing
        matches or ``PATH`` is not set.
    """
    def is_exe(fpath):
        # must be a regular file AND executable by the current user
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if os.path.dirname(program):
        # explicit location supplied - do not search PATH at all
        return program if is_exe(program) else None

    # .get() so that an unset PATH means "not found" rather than KeyError
    search_path = os.environ.get("PATH")
    if search_path is None:
        return None
    for entry in search_path.split(os.pathsep):
        # some environments leave literal double quotes around PATH entries
        candidate = os.path.join(entry.strip('"'), program)
        if is_exe(candidate):
            return candidate
    return None
|  | 48 | 
class FastQC(object):
    """Run the FastQC executable and rewrite its HTML report.

    The instance is driven by an options object (as produced by ``optparse``)
    from which these attributes are read: ``executable``, ``outputdir``,
    ``input``, ``inputfilename``, ``informat`` and ``contaminants``.

    The code uses only syntax that parses under both Python 2.7 and Python 3.
    """

    def __init__(self, opts=None):
        """Store ``opts`` and resolve the FastQC executable.

        :param opts: options object; ``opts.executable`` is looked up with
            ``pathfind``.
        :raises AssertionError: if ``opts`` is None or the executable cannot
            be found.
        """
        assert opts is not None
        self.opts = opts
        fastqcexe = pathfind(opts.executable)
        # .get() so that a missing PATH cannot mask the real error message
        assert (fastqcexe is not None), '##rgFastQC.py error - cannot find passed fastqc executable %s in path %s' % (opts.executable, os.environ.get('PATH'))
        self.fastqcexe = fastqcexe

    def getFileString(self, fpath, outpath):
        """Return ``fpath`` followed by a human readable size suffix.

        :param fpath: file name relative to ``outpath``.
        :param outpath: directory containing the file.
        :returns: ``'<fpath> <size>'`` where ``<size>`` is e.g. ``' (1.5 MB)'``,
            ``' (2.0 KB)'`` or ``' (10 B)'``; it is empty when the file does
            not exist or has zero length.
        """
        size = ''
        fp = os.path.join(outpath, fpath)
        if os.path.isfile(fp):
            n = float(os.path.getsize(fp))
            if n > 2**20:
                size = ' (%1.1f MB)' % (n / 2**20)
            elif n > 2**10:
                size = ' (%1.1f KB)' % (n / 2**10)
            elif n > 0:
                size = ' (%d B)' % (int(n))
        return '%s %s' % (fpath, size)

    @staticmethod
    def _has_bogus_compression_ext(path, name):
        """Tell whether ``name`` claims a compression that ``path`` lacks.

        Some upstream tools leave ``.gz`` style extensions on uncompressed
        files; FastQC then tries to decompress them and fails.

        :param path: real location of the data on disk.
        :param name: user visible file name whose extension is inspected
            (case-insensitively).
        :returns: True when the extension is ``.gz``/``.gzip``/``.bz2``/``.zip``
            but the content cannot be read as that format, else False.
        """
        lowered = name.lower()
        if lowered.endswith('.gz') or lowered.endswith('.gzip'):
            handle = gzip.open(path)
        elif lowered.endswith('.bz2'):
            # imported here because the module's import block lacks bz2;
            # BZ2File exists on both Python 2 and 3 (bz2.open is 3.3+ only)
            import bz2
            handle = bz2.BZ2File(path, 'rb')
        elif lowered.endswith('.zip'):
            return not zipfile.is_zipfile(path)
        else:
            return False
        try:
            handle.readline()
        except Exception:
            # any decode failure means the extension is lying
            return True
        finally:
            handle.close()
        return False

    def run_fastqc(self):
        """Run FastQC on ``opts.input`` and return the rewritten report.

        In batch mode FastQC writes to a new folder called
        ``[infilebasename]_fastqc``; with ``--outdir`` that folder is created
        below ``opts.outputdir``. Example content:

            duplication_levels.png  fastqc_icon.png  per_base_n_content.png
            per_sequence_gc_content.png  summary.txt  error.png
            fastqc_report.html  per_base_quality.png  per_sequence_quality.png
            tick.png  fastqc_data.txt  per_base_gc_content.png
            per_base_sequence_content.png  sequence_length_distribution.png
            warning.png

        The input is exposed to FastQC through a symlink named after the
        (sanitised) history file name so the report shows that name. The
        symlink is removed again after the run.

        :returns: tuple ``(html_lines, retval, serr)``. On success
            ``html_lines`` is the patched report, ``retval`` is FastQC's exit
            status and ``serr`` is ``''``. When no report could be read,
            ``html_lines`` is an explanatory page containing the run log,
            ``retval`` is 1 and ``serr`` holds the run log.
        """
        serr = ''
        logfd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=self.opts.outputdir)
        cl = [self.fastqcexe, '--outdir=%s' % self.opts.outputdir]
        if self.opts.informat in ['sam', 'bam']:
            cl.append('--f=%s' % self.opts.informat)
        if self.opts.contaminants is not None:
            cl.append('--contaminants=%s' % self.opts.contaminants)
        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
        # use a symlink so that the FastQC report reflects the history input file name.
        # Bogus compression extensions are trimmed (patched May 29 2013) because
        # FastQC would otherwise try to uncompress plain data.
        infname = self.opts.inputfilename
        if infname is None:
            # -j/--inputfilename is optional; fall back to the real file name
            infname = os.path.basename(self.opts.input)
        if self._has_bogus_compression_ext(self.opts.input, infname):
            infname = os.path.splitext(infname)[0]
        fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        link_name = os.path.join(self.opts.outputdir, fastqinfilename)
        cl.append(link_name)
        # reuse the descriptor returned by mkstemp instead of leaking it
        sout = os.fdopen(logfd, 'w')
        try:
            if os.path.islink(link_name):
                os.unlink(link_name)  # stale link left by an interrupted run
            os.symlink(self.opts.input, link_name)
            try:
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
                retval = p.wait()
            finally:
                os.unlink(link_name)
        finally:
            sout.close()
        with open(tlog, 'r') as logfile:
            runlog = logfile.readlines()
        # fastqc plays games with its output directory name, so search for it;
        # if several match, the last one listed wins
        odpath = None
        for entry in os.listdir(self.opts.outputdir):
            d = os.path.join(self.opts.outputdir, entry)
            if os.path.isdir(d) and d.endswith('_fastqc'):
                odpath = d
        rep = None
        report_path = None
        if odpath is not None:
            report_path = os.path.join(odpath, 'fastqc_report.html')
            try:
                with open(report_path, 'r') as report:
                    rep = report.readlines()
            except (IOError, OSError):
                rep = None  # unreadable report is handled as "no output" below
        if rep is None:
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, report_path), ]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n', ]
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        # list again: the files have now been moved into outputdir
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        flist = [x for x in os.listdir(self.opts.outputdir) if x not in excludefiles]
        # everything now lives in one directory, so drop the sub-directory
        # prefixes from links in the report
        rep = [line.replace('Icons/', '').replace('Images/', '') for line in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Insert a table of the created files into the FastQC report.

        The table is placed directly before the second-to-last line of
        ``rep`` (called the footer in this code), so the last two report lines
        keep their position at the end of the document. Reports with fewer
        than two lines simply get the table appended.

        :param rep: lines of the FastQC HTML report; not modified.
        :param flist: file names inside ``opts.outputdir`` to list; not
            modified. Names that are directories there are skipped.
        :param runlog: accepted for interface compatibility; not used.
        :returns: new list of HTML lines including the additions.
        """
        rep = list(rep) if rep is not None else []
        names = sorted(flist) if flist is not None else []
        if len(rep) >= 2:
            head, footer, tail = rep[:-2], [rep[-2]], rep[-1:]
        else:
            head, footer, tail = rep, [], []
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in names:
            # resolve against outputdir - a bare name would be tested
            # against the current working directory
            if not os.path.isdir(os.path.join(self.opts.outputdir, f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, self.getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
        return head + res + footer + tail  # with our additions

    def fix_fastqcimages(self, odpath):
        """Flatten the FastQC output folder into ``opts.outputdir``.

        Galaxy wants everything in the same files_dir, so all plain files in
        ``odpath/Icons``, ``odpath/Images`` and ``odpath`` itself are moved to
        ``opts.outputdir`` and each of those directories is then removed.

        :param odpath: the ``*_fastqc`` directory written by FastQC.
        :raises OSError: from ``os.rmdir`` if a directory still has content
            (e.g. an unexpected sub-directory) after its files were moved.
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        # odpath comes last: its sub-directories must be gone before rmdir
        for adir in [icpath, impath, odpath]:
            if not os.path.exists(adir):
                continue
            for f in os.listdir(adir):  # all entries created by fastqc
                sauce = os.path.join(adir, f)
                if not os.path.isdir(sauce):
                    shutil.move(sauce, os.path.join(self.opts.outputdir, f))
            os.rmdir(adir)
|  | 208 | 
|  | 209 | 
if __name__ == '__main__':
    # Command line entry point: parse options, run FastQC through the wrapper
    # class and write the patched HTML report to --htmloutput.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # validate explicitly: assert statements vanish under "python -O", and a
    # missing output path should be reported before FastQC is run, not after
    if opts.input is None:
        op.error('an input file is required (-i/--input)')
    if opts.htmloutput is None:
        op.error('an html output path is required (-o/--htmloutput)')
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as htmlfile:
        htmlfile.write(''.join(html))
    if retval != 0:
        # the original comment says that writing to stderr indicates failure;
        # the process exit status is deliberately left unchanged
        sys.stderr.write('%s\n' % serr)
|  | 231 | 
|  | 232 | 
|  | 233 |