Mercurial > repos > devteam > fastqc
diff rgFastQC.py @ 1:8fae48caaf06 draft
Uploaded from GH
author | devteam |
---|---|
date | Tue, 11 Nov 2014 12:46:27 -0500 |
parents | e28c965eeed4 |
children | 0b201de108b9 |
line wrap: on
line diff
--- a/rgFastQC.py Mon Jan 27 09:29:14 2014 -0500 +++ b/rgFastQC.py Tue Nov 11 12:46:27 2014 -0500 @@ -1,83 +1,52 @@ """ -# May 2013 ross added check for bogus gz extension - fastqc gets confused -# added sanitizer for user supplied name -# removed shell and make cl a sequence for Popen call -# ross lazarus August 10 2012 in response to anon insecurity report -wrapper for fastqc +Rewrite of rgFastQC.py for Version 0.11.2 of FastQC. + +Changes implemented from tmcgowan at +https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc +and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc +with minor changes and bug fixes -called as - <command interpreter="python"> - rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" - </command> +SYNOPSIS + + rgFastQC.py -i input_file -j input_file.name -o output_html_file [-d output_directory] + [-f fastq|bam|sam] [-n job_name] [-c contaminant_file] [-e fastqc_executable] - +EXAMPLE (generated by Galaxy) -Current release seems overly intolerant of sam/bam header strangeness -Author notified... - + rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder + -f fastq -n FastQC -c path/dataset_2.dat -e fastqc """ + import re import os -import sys +import shutil import subprocess import optparse -import shutil import tempfile +import glob +import gzip +import bz2 import zipfile -import gzip - -def getFileString(fpath, outpath): - """ - format a nice file size string - """ - size = '' - fp = os.path.join(outpath, fpath) - s = '? ?' 
- if os.path.isfile(fp): - n = float(os.path.getsize(fp)) - if n > 2**20: - size = ' (%1.1f MB)' % (n/2**20) - elif n > 2**10: - size = ' (%1.1f KB)' % (n/2**10) - elif n > 0: - size = ' (%d B)' % (int(n)) - s = '%s %s' % (fpath, size) - return s - - -class FastQC(): - """wrapper - """ - +class FastQCRunner(object): def __init__(self,opts=None): - assert opts <> None - self.opts = opts - + ''' + Initializes an object to run FastQC in Galaxy. To start the process, use the function run_fastqc() + ''' - def run_fastqc(self): - """ - In batch mode fastqc behaves not very nicely - will write to a new folder in - the same place as the infile called [infilebasename]_fastqc - rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc - duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt - error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png - fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png + # Check whether the options are specified and saves them into the object + assert opts != None + self.opts = opts - """ - serr = '' - dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir) - sout = open(tlog, 'w') - fastq = os.path.basename(self.opts.input) - cl = [self.opts.executable,'--outdir=%s' % self.opts.outputdir] - if self.opts.informat in ['sam','bam']: - cl.append('--f=%s' % self.opts.informat) - if self.opts.contaminants <> None : - cl.append('--contaminants=%s' % self.opts.contaminants) - # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30 - # use a symlink in a temporary directory so that the FastQC report reflects the history input file name + def prepare_command_line(self): + ''' + Develops the Commandline to run FastQC in Galaxy + ''' + + # Check whether a given file compression format is valid + # This prevents uncompression of already uncompressed files 
infname = self.opts.inputfilename linf = infname.lower() trimext = False @@ -86,7 +55,7 @@ if ( linf.endswith('.gz') or linf.endswith('.gzip') ): f = gzip.open(self.opts.input) try: - testrow = f.readline() + f.readline() except: trimext = True f.close() @@ -101,116 +70,85 @@ if not zipfile.is_zipfile(self.opts.input): trimext = True if trimext: - infname = os.path.splitext(infname)[0] - fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname)) - link_name = os.path.join(self.opts.outputdir, fastqinfilename) - os.symlink(self.opts.input, link_name) - cl.append(link_name) - sout.write('# FastQC cl = %s\n' % ' '.join(cl)) - sout.flush() - p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir) - retval = p.wait() + f = open(self.opts.input) + try: + f.readline() + except: + raise Exception("Input file corruption, could not identify the filetype") + infname = os.path.splitext(infname)[0] + + # Replace unwanted or problematic charaters in the input file name + self.fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname)) + + # Build the Commandline from the given parameters + command_line = [opts.executable, '--outdir %s' % opts.outputdir] + if opts.contaminants != None: + command_line.append('--contaminants %s' % opts.contaminants) + if opts.limits != None: + command_line.append('--limits %s' % opts.limits) + command_line.append('--quiet') + command_line.append('--extract') # to access the output text file + command_line.append(self.fastqinfilename) + self.command_line = ' '.join(command_line) + + def copy_output_file_to_dataset(self): + ''' + Retrieves the output html and text files from the output directory and copies them to the Galaxy output files + ''' + + # retrieve html file + result_file = glob.glob(opts.outputdir + '/*html') + with open(result_file[0], 'rb') as fsrc: + with open(self.opts.htmloutput, 'wb') as fdest: + shutil.copyfileobj(fsrc, fdest) + + # retrieve text file + 
text_file = glob.glob(opts.outputdir + '/*/fastqc_data.txt') + with open(text_file[0], 'rb') as fsrc: + with open(self.opts.textoutput, 'wb') as fdest: + shutil.copyfileobj(fsrc, fdest) + + def run_fastqc(self): + ''' + Executes FastQC. Make sure the mandatory import parameters input, inputfilename, outputdir and htmloutput have been specified in the options (opts) + ''' + + # Create a log file + dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir) + sout = open(tlog, 'w') + + self.prepare_command_line() + sout.write(self.command_line) + sout.write('\n') + sout.write("Creating symlink\n") # between the input (.dat) file and the given input file name + os.symlink(self.opts.input, self.fastqinfilename) + sout.write("check_call\n") + subprocess.check_call(self.command_line, shell=True) + sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput)) + self.copy_output_file_to_dataset() + sout.write("Finished") sout.close() - runlog = open(tlog,'r').readlines() - os.unlink(link_name) - flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh - odpath = None - for f in flist: - d = os.path.join(self.opts.outputdir,f) - if os.path.isdir(d): - if d.endswith('_fastqc'): - odpath = d - hpath = None - if odpath <> None: - try: - hpath = os.path.join(odpath,'fastqc_report.html') - rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag - except: - pass - if hpath == None: - serr = '\n'.join(runlog) - res = ['## odpath=%s: No output found in %s. 
Output for the run was:<pre>\n' % (odpath,hpath),] - res += runlog - res += ['</pre>\n', - 'Please read the above for clues<br/>\n', - 'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n', - 'It is also possible that the log shows that fastqc is not installed?<br/>\n', - 'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n', - 'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',] - return res,1,serr - self.fix_fastqcimages(odpath) - flist = os.listdir(self.opts.outputdir) # these have now been fixed - excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png'] - flist = [x for x in flist if not x in excludefiles] - for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh - rep[i] = rep[i].replace('Icons/','') - rep[i] = rep[i].replace('Images/','') - - html = self.fix_fastqc(rep,flist,runlog) - return html,retval,serr - - - - def fix_fastqc(self,rep=[],flist=[],runlog=[]): - """ add some of our stuff to the html - """ - bodyindex = len(rep) -1 # hope they don't change this - footrow = bodyindex - 1 - footer = rep[footrow] - rep = rep[:footrow] + rep[footrow+1:] - res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n'] - flist.sort() - for i,f in enumerate(flist): - if not(os.path.isdir(f)): - fn = os.path.split(f)[-1] - res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir))) - res.append('</table>\n') - res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n') - res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://bitbucket.org/rgenetics for details and licensing\n</div>') - res.append(footer) - fixed = rep[:bodyindex] + res + rep[bodyindex:] - return fixed # with our additions - 
- - def fix_fastqcimages(self,odpath): - """ Galaxy wants everything in the same files_dir - """ - icpath = os.path.join(odpath,'Icons') - impath = os.path.join(odpath,'Images') - for adir in [icpath,impath,odpath]: - if os.path.exists(adir): - flist = os.listdir(adir) # get all files created - for f in flist: - if not os.path.isdir(os.path.join(adir,f)): - sauce = os.path.join(adir,f) - dest = os.path.join(self.opts.outputdir,f) - shutil.move(sauce,dest) - os.rmdir(adir) - - if __name__ == '__main__': op = optparse.OptionParser() op.add_option('-i', '--input', default=None) - op.add_option('-j', '--inputfilename', default=None) + op.add_option('-j', '--inputfilename', default=None) op.add_option('-o', '--htmloutput', default=None) + op.add_option('-t', '--textoutput', default=None) op.add_option('-d', '--outputdir', default="/tmp/shortread") op.add_option('-f', '--informat', default='fastq') op.add_option('-n', '--namejob', default='rgFastQC') op.add_option('-c', '--contaminants', default=None) + op.add_option('-l', '--limits', default=None) op.add_option('-e', '--executable', default='fastqc') opts, args = op.parse_args() - assert opts.input <> None + + assert opts.input != None + assert opts.inputfilename != None + assert opts.htmloutput != None assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable if not os.path.exists(opts.outputdir): os.makedirs(opts.outputdir) - f = FastQC(opts) - html,retval,serr = f.run_fastqc() - f = open(opts.htmloutput, 'w') - f.write(''.join(html)) - f.close() - if retval <> 0: - print >> sys.stderr, serr # indicate failure - - + fastqc_runner = FastQCRunner(opts) + fastqc_runner.run_fastqc() \ No newline at end of file