# HG changeset patch # User jjohnson # Date 1307481725 14400 # Node ID 1d373f219445a58df5b2416739dd5040b86074c0 Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository diff -r 000000000000 -r 1d373f219445 fastqc/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqc/README Tue Jun 07 17:22:05 2011 -0400 @@ -0,0 +1,39 @@ + +FastQC +------ + +From the FastQC website http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ + +Function A quality control tool for high throughput sequence data. +Language Java +Requirements A suitable Java Runtime Environment + The Picard BAM/SAM Libraries (included in download) +Code Maturity Stable. Mature code, but feedback is appreciated. +Code Released Yes, under GPL v3 or later. +Initial Contact Simon Andrews + +FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. + +The main functions of FastQC are: + +- Import of data from BAM, SAM or FastQ files (any variant) +- Providing a quick overview to tell you in which areas there may be problems +- Summary graphs and tables to quickly assess your data +- Export of results to an HTML based permanent report +- Offline operation to allow automated generation of reports without running the interactive application + +Download and installation information is at: http://www.bioinformatics.bbsrc.ac.uk/projects/download.html#fastqc + + +Galaxy Tool Wrapper +------------------- + +The galaxy tool wrapper for FastQC requires version: FastQC v0.7.2 + +FastQC should be downloaded and installed on the system on which it will be executed. +The PATH environment variable should include the directory in which the fastqc script resides. 
+ +The fastqc.py wrapper invokes the fastqc script provided in FastQC download, +and converts the FastQC results into a Galaxy html formatted dataset. + + diff -r 000000000000 -r 1d373f219445 fastqc/fastqc.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqc/fastqc.py Tue Jun 07 17:22:05 2011 -0400 @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +""" +Runs FastQC on a fastq file; +TODO: more documentation + +usage: fastqc.py [options] + -i, --input=i: The fastq input file + -n, --name=n: The fastq input name + -c, --contaminants=c: A contaminants file + -r, --report=r: The html summary report file + -D, --dir=D: The dir for report files + -d, --data=d: The data output text file +""" + +import optparse, os, shutil, subprocess, sys, tempfile, re, string + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-i', '--input', dest='input', help='The sequence input file' ) + parser.add_option( '-f', '--format', dest='format', help='The sequence input file format' ) + parser.add_option( '-n', '--name', dest='name', help='The fastq input name' ) + parser.add_option( '-c', '--contaminants', dest='contaminants', help='A contaminants file' ) + parser.add_option( '-r', '--report', dest='report', help='The HTML report' ) + parser.add_option( '-D', '--dir', dest='outdir', help='The dir for report files' ) + parser.add_option( '-d', '--data', dest='data', help='The output data text file' ) + (options, args) = parser.parse_args() + if options.input == None: + stop_err("Misssing option --input") + params = [] + #params.append('-Xmx250m') + params.append('-Djava.awt.headless=true') + name = 'input' + format = 'fastq' + if options.outdir != None: + os.makedirs(options.outdir) + if options.contaminants != None and options.contaminants != 'None': + params.append("-c %s" % options.contaminants) + if options.name != None and options.name != 'None': + name = 
re.sub('[^a-zA-Z0-9_.-]','_',options.name) + if options.format != None and options.format != 'None': + format = options.format + params.append("-f %s" % options.format) + # FastQC relies on the extension to determine file format .sam .bam or .fastq + if not name.endswith('.'+format): + name = '.'.join((name,format)) + # make temp directory + buffsize = 1048576 + tmp_dir = tempfile.mkdtemp() + params.append("-o %s" % tmp_dir) + # print("tmp_dir %s" % tmp_dir) + try: + # make a link to the input fastq in the tmp_dir + # FastQC will generate output in the same dir that it finds its input + fastq = os.path.join(tmp_dir,name) + os.symlink( options.input, fastq) + # generate commandline + cmd = 'fastqc %s %s' % (' '.join(params),fastq) + # need to nest try-except in try-finally to handle 2.4 + try: + try: + tmp_stderr_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.err' ).name + tmp_stderr = open( tmp_stderr_name, 'wb' ) + tmp_stdout_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.out' ).name + tmp_stdout = open( tmp_stdout_name, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_stderr_name, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error executing FastQC. 
' + str( e ) + # remove the input file symlink so it does get copied + os.remove(fastq) + # remove the stdout and stderr files so they do not get copied + os.remove(tmp_stderr_name) + os.remove(tmp_stdout_name) + # array to retrieve results of each test so that it gets displayed in the tool info + tests = [] + # move result to outdir + # Need to flatten the dir hierachy in order for galaxy to serve the href links + for root, dirs, files in os.walk(tmp_dir): + for fname in files: + path = os.path.join(root,fname) + # print("%s" % fname) + if re.match('.+\.zip',fname): + pass + elif fname == 'fastqc_report.html': + if options.outdir != None: + fsrc = open(path,'r') + # fdst = open(os.path.join(options.outdir,fname),'w') + fdst = open(options.report,'w') + try: + for line in fsrc: + if line.find('footer') > 0: + # add extra links in case someone prefers raw text + fdst.write('

FastQC Summary text report') + fdst.write('

FastQC Report Data') + # copy lines removing subdirs from links + fdst.write(re.sub('Icons/|Images/','',line)) + finally: + fsrc.close() + fdst.close() + else: + if options.outdir != None: + shutil.copy(path,options.outdir) + if fname == 'summary.txt': + # Use the contents of this file to put stdout info into the HistoryDataset panel + fsrc = open(path,'r') + try: + for line in fsrc: + (grade,test,seq) = string.split(line,' ') + tests.append("%s %s" % ('+' if grade == 'PASS' else '-',re.sub('equence','eq',test))) + finally: + fsrc.close() + elif fname == 'fastqc_data.txt': + if options.data != None: + # copy the fastqc_data.txt file to the dataset data + shutil.copy(path,options.data) + cnt = '?' + flen = '?' + gc = '?' + fsrc = open(path,'r') + try: + for line in fsrc: + m = re.match('^Total Sequences (\d+)',line) + if m: + cnt = m.groups()[-1] + m = re.match('^Sequence length (\d+)',line) + if m: + flen = m.groups()[-1] + m = re.match('^%GC (\d+)',line) + if m: + gc = m.groups()[-1] + finally: + fsrc.close() + #print to stdout so that this appears in the tool dataset info + print("Seqs %s, Len %s, GC %s" %(cnt,flen,gc)) + #print to stdout so that this appears in the tool dataset info + print("%s" % '\n'.join(tests)) + except Exception, e: + stop_err( 'Fastq failed.\n' + str( e ) ) + finally: + # clean up temp dir, put in a try block so we don't fail on stale nfs handles + try: + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + except Exception, e: + pass + +if __name__=="__main__": __main__() diff -r 000000000000 -r 1d373f219445 fastqc/fastqc.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqc/fastqc.xml Tue Jun 07 17:22:05 2011 -0400 @@ -0,0 +1,94 @@ + + quality control checks on raw sequence data + fastqc.py + #if $input.extension.startswith( "fastq"): + --format=fastq + #else + --format=$input.extension + #end if + --input='$input' + --name='$input.name' + --dir='$report.extra_files_path' + --report='$report' + #if $contaminants != None and 
$contaminants != "None" and $contaminants != "": + --contaminants=$contaminants + #end if + + + + + + + + + + + + +**What it does** + +FastQC_ is a product of the Bioinformatics Group at the Babraham Institute. FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. + +The main functions of FastQC are:: + + - Import of data from BAM, SAM or FastQ files (any variant) + - Providing a quick overview to tell you in which areas there may be problems + - Summary graphs and tables to quickly assess your data + - Export of results to an HTML based permanent report + - Offline operation to allow automated generation of reports without running the interactive application + + +.. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ + +----- + +**Input format** + +Any fastq file, for example:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758 + GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA + +HWI-EAS91_1_30788AAXX:7:21:1542:1758 + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR + +**Contaminants format** + +An optional contaminant file (otherwise FastQC will use the default):: + + # This file contains a list of potential contaminants which are + # frequently found in high throughput sequencing reactions. These + # are mostly sequences of adapters / primers used in the various + # sequencing chemistries. + # + # You can add more sequences to the file by putting one line per entry + # and specifying a name[tab]sequence. 
If the contaminant you add is + # likely to be of use to others please consider sending it to the FastQ + # authors, either via a bug report at www.bioinformatics.bbsrc.ac.uk/bugzilla/ + # or by directly emailing simon.andrews@bbsrc.ac.uk so other users of + # the program can benefit. + Illumina Single End Apapter 1 ACACTCTTTCCCTACACGACGCTGTTCCATCT + Illumina Single End Apapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT + Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT + Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT + Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT + + +----- + +**Outputs** + +An HTML file with links to:: + + - fastqc_report.html + - summary.txt + - fastqc_data.txt + + +