| 0 | 1 """ | 
|  | 2 # May 2013 ross added check for bogus gz extension - fastqc gets confused | 
|  | 3 # added sanitizer for user supplied name | 
|  | 4 # removed shell and make cl a sequence for Popen call | 
|  | 5 # ross lazarus August 10 2012 in response to anon insecurity report | 
|  | 6 wrapper for fastqc | 
|  | 7 | 
|  | 8 called as | 
|  | 9   <command interpreter="python"> | 
|  | 10     rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" | 
|  | 11   </command> | 
|  | 12 | 
|  | 13 | 
|  | 14 | 
|  | 15 Current release seems overly intolerant of sam/bam header strangeness | 
|  | 16 Author notified... | 
|  | 17 | 
|  | 18 | 
|  | 19 """ | 
|  | 20 import re | 
|  | 21 import os | 
|  | 22 import sys | 
|  | 23 import subprocess | 
|  | 24 import optparse | 
|  | 25 import shutil | 
|  | 26 import tempfile | 
|  | 27 import zipfile | 
|  | 28 import gzip | 
|  | 29 import magic | 
|  | 30 | 
|  | 31 | 
def getFileString(fpath, outpath):
    """
    Build a "<name> <size>" label for a file that lives under outpath.

    fpath is joined onto outpath to locate the file. When that path is not
    a regular file the placeholder '? ?' is returned. Otherwise the label is
    fpath, a space, and a size suffix in MB, KB or B; an empty file gets an
    empty suffix.
    """
    located = os.path.join(outpath, fpath)
    if not os.path.isfile(located):
        return '? ?'
    one_kb = 2**10
    one_mb = 2**20
    nbytes = float(os.path.getsize(located))
    if nbytes > one_mb:
        suffix = ' (%1.1f MB)' % (nbytes/one_mb)
    elif nbytes > one_kb:
        suffix = ' (%1.1f KB)' % (nbytes/one_kb)
    elif nbytes > 0:
        suffix = ' (%d B)' % (int(nbytes))
    else:
        suffix = ''
    return '%s %s' % (fpath, suffix)
|  | 49 | 
|  | 50 | 
class FastQC(object):
    """Run the fastqc executable on one input and rework its report.

    The instance is driven entirely by an options object (as produced by
    optparse). The attributes read from it are: input, inputfilename,
    outputdir, informat, contaminants and executable.
    """

    def __init__(self, opts=None):
        """Store the parsed options; opts must not be None."""
        assert opts is not None
        self.opts = opts

    @staticmethod
    def _first_line_readable(handle):
        """Return True if one line can be read from handle; always close it."""
        try:
            handle.readline()
        except Exception:
            # Any decoding failure means the data is not really in the format
            # the handle expects; the caller treats that as "bogus extension".
            return False
        finally:
            handle.close()
        return True

    def _has_bogus_compression_ext(self, path, lowered_name, input_magic):
        """Tell whether the display name claims a compression the data lacks.

        path is the real file to probe, lowered_name the lower-cased display
        name and input_magic the libmagic description of the file. Returns
        True when the name ends in a gz/gzip, bz2 or zip extension (or, for
        gzip, the magic string mentions gzip) but the content cannot be read
        as that format.
        """
        # Local import: the module never imported bz2 at file level, so the
        # bz2 branch used to fail with a NameError.
        import bz2
        if lowered_name.endswith(('.gz', '.gzip')) or 'gzip' in input_magic:
            return not self._first_line_readable(gzip.open(path))
        if lowered_name.endswith('bz2'):
            # BZ2File exists on both Python 2 and 3 (bz2.open is 3.3+ only).
            return not self._first_line_readable(bz2.BZ2File(path, 'rb'))
        if lowered_name.endswith('.zip'):
            return not zipfile.is_zipfile(path)
        return False

    def run_fastqc(self):
        """
        Run fastqc and return (html_lines, retval, serr).

        In batch mode fastqc behaves not very nicely - it writes to a new
        folder called [infilebasename]_fastqc holding fastqc_report.html,
        fastqc_data.txt, summary.txt and the png images and icons.

        The input is exposed to fastqc through a symlink in outputdir named
        after the sanitised inputfilename. On success html_lines is the
        patched report and retval is the fastqc exit status. If no readable
        report is found, html_lines is an error page built from the run log,
        retval is 1 and serr carries the run log text.
        """
        serr = ''
        log_fd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=self.opts.outputdir)
        os.close(log_fd)  # only the path is needed; do not leak the descriptor
        cl = [self.opts.executable, '--outdir=%s' % self.opts.outputdir]
        if self.opts.informat in ['sam', 'bam']:
            cl.append('--format=%s' % self.opts.informat)
        if self.opts.contaminants is not None:
            cl.append('--contaminants=%s' % self.opts.contaminants)
        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
        # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
        infname = self.opts.inputfilename
        # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
        # patched may 29 2013 until this is fixed properly
        input_magic = magic.from_file(self.opts.input)
        if self._has_bogus_compression_ext(self.opts.input, infname.lower(), input_magic):
            infname = os.path.splitext(infname)[0]
        # sanitise the user supplied name before it becomes a file name
        fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        link_name = os.path.join(self.opts.outputdir, fastqinfilename)
        os.symlink(self.opts.input, link_name)
        cl.append(link_name)
        try:
            with open(tlog, 'w') as sout:
                if 'gzip' in input_magic:
                    sout.write('# File magic = %s\n' % input_magic)
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
                retval = p.wait()
        finally:
            # remove the symlink even when the executable could not be started
            os.unlink(link_name)
        with open(tlog, 'r') as logfile:
            runlog = logfile.readlines()
        # fastqc plays games with its output directory name. eesh
        odpath = None
        for entry in os.listdir(self.opts.outputdir):
            candidate = os.path.join(self.opts.outputdir, entry)
            if os.path.isdir(candidate) and candidate.endswith('_fastqc'):
                odpath = candidate
        hpath = None
        rep = None
        if odpath is not None:
            hpath = os.path.join(odpath, 'fastqc_report.html')
            try:
                with open(hpath, 'r') as report:
                    rep = report.readlines()
            except (IOError, OSError):
                # an unreadable report is handled like a missing one below
                rep = None
        if rep is None:
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, hpath), ]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n', ]
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        # list again: the files have now been moved into outputdir
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        flist = [x for x in os.listdir(self.opts.outputdir) if x not in excludefiles]
        # links into the Icons and Images subdirectories must point at the flattened layout
        rep = [line.replace('Icons/', '').replace('Images/', '') for line in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Add a "Files created by FastQC" module to the report lines.

        rep is the list of report lines, flist the file names (relative to
        outputdir) to link to, and runlog is accepted but not used. Neither
        rep nor flist is modified. Returns a new list of lines.

        For a report of two or more lines the second-to-last line is taken
        as the footer and moved to the very end, after the added module.
        Shorter reports simply get the module appended.
        """
        rep = [] if rep is None else list(rep)
        names = [] if flist is None else sorted(flist)
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in names:
            # flist entries are relative to outputdir, so test them there
            # rather than against the current working directory
            if not os.path.isdir(os.path.join(self.opts.outputdir, f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>')
        if len(rep) < 2:
            # too short to contain a separate footer line; nothing to move
            return rep + res
        footer = rep[-2]
        # NOTE(review): the added module lands after the report's last line,
        # exactly as before; confirm whether it was meant to go before it.
        return rep[:-2] + rep[-1:] + res + [footer]

    def fix_fastqcimages(self, odpath):
        """Move every file from the fastqc folder up into outputdir.

        Galaxy wants everything in the same files_dir, so files found in
        odpath/Icons, odpath/Images and odpath itself are moved to
        self.opts.outputdir, and each of those directories is then removed.
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        # odpath goes last so its subdirectories are already gone at rmdir time
        for adir in [icpath, impath, odpath]:
            if os.path.exists(adir):
                for f in os.listdir(adir):
                    sauce = os.path.join(adir, f)
                    if not os.path.isdir(sauce):
                        shutil.move(sauce, os.path.join(self.opts.outputdir, f))
                os.rmdir(adir)
|  | 193 | 
|  | 194 | 
|  | 195 | 
if __name__ == '__main__':
    # Command line entry point: parse the options, run fastqc through the
    # FastQC wrapper and write the resulting html to the -o path.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # Validate explicitly: assert statements are stripped under python -O.
    if opts.input is None:
        op.error('##rgFastQC.py error - an input file (-i) is required')
    if opts.htmloutput is None:
        # fail before running fastqc rather than at the final open()
        op.error('##rgFastQC.py error - an html output path (-o) is required')
    if not os.path.isfile(opts.executable):
        op.error('##rgFastQC.py error - cannot find executable %s' % opts.executable)
    if opts.inputfilename is None:
        # run_fastqc lower-cases and sanitises this name, so it must be a
        # string; fall back to the input's own file name when -j is omitted
        opts.inputfilename = os.path.basename(opts.input)
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as report:
        report.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
|  | 218 | 
|  | 219 | 
|  | 220 |