changeset 0:ef5dd11c01e6 default tip

pyrocleaner v1.2
author g2cmnty@test-web1.g2.bx.psu.edu
date Thu, 09 Jun 2011 06:09:09 -0400
parents
children
files pyrocleaner_galaxy_tool_V1.2/README pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py pyrocleaner_galaxy_tool_V1.2/pyrocleaner.xml pyrocleaner_galaxy_tool_V1.2/pyrocleaner_wrapper.py
diffstat 4 files changed, 1154 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrocleaner_galaxy_tool_V1.2/README	Thu Jun 09 06:09:09 2011 -0400
@@ -0,0 +1,77 @@
+#
+# Pyrocleaner 
+# Copyright (C) 2009 INRA
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+1. WHAT IS PYROCLEANER
+
+PyroCleaner is intended to clean reads coming from pyrosequencing in order to ease the assembly
+process. It filters out multi-copy reads and applies other criteria such as length, complexity and
+the number of undetermined bases, which has been shown to correlate with poor quality. It can also
+clean sff files of paired-end sequences, generating on one side an sff file with the validated
+paired-ends and on the other the shotgun reads.
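+
+A typical invocation, taken from the script's built-in help, combines several cleaning criteria:
+
+python pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends \
+    --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win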
+
+
+2. DEPENDENCIES
+
+Pyrocleaner has the following dependencies:
+  - BioPython, http://www.biopython.org/wiki/Download
+  - NCBI_blast (megablast and formatdb), http://www.ncbi.nlm.nih.gov/BLAST/download.shtml
+  - sff_extract.py, http://bioinf.comav.upv.es/sff_extract/index.html
+  - cross_match from the Phrap package; this software is only used with the pair-end cleaning option.
+  - sfffile from Newbler, provided by Roche. This software is only used to return an sff file as output. If it is not installed, the output will be generated in fastq format.
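+
+# Optional: check that the dependencies resolve from your PATH (sfffile may
+# legitimately be absent, see above):
+for tool in formatdb megablast sff_extract.py cross_match sfffile; do which $tool; done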
+
+
+3. INSTALLATION GUIDELINES
+
+# Download the source from the svn repository 
+svn checkout svn://scm.mulcyber.toulouse.inra.fr/svnroot/pyrocleaner
+
+# Then, to make sure your installation succeeded, check that the pyrocleaner help is displayed when typing the following command:
+python pyrocleaner.py 
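+# If all dependencies are found, the help begins with the usage line:
+#    Usage: pyrocleaner.py -i file -o output -f format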
+
+
+4. GALAXY
+
+If you're using the Galaxy tool, once pyrocleaner.py is installed, add the shebang line as follows:
+
+#!/usr/bin/python
+# Pyrocleaner 
+# Copyright (C) 2009 INRA
+...
+
+# Then make a symbolic link to the script, without the .py extension, in a directory on your PATH so it can be called by the galaxy wrapper:
+ln -s /path/to/pyrocleaner.py /path/to/bin/dir/pyrocleaner
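+
+# Optional sanity check, assuming the link directory is on your PATH;
+# this should print "pyrocleaner 1.2":
+pyrocleaner --version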
+
+
+5. ERGATIS
+
+If you're using the Ergatis component, once pyrocleaner.py is installed, add the shebang line as follows:
+
+#!/usr/bin/python
+# Pyrocleaner 
+# Copyright (C) 2009 INRA
+...
+
+# Then make a symbolic link to the script, without the .py extension, in a directory on your PATH so it can be called by the ergatis component:
+ln -s /path/to/pyrocleaner.py /path/to/bin/dir/pyrocleaner
+
+# Edit the ergatis config file (/path/to/ergatis/software.config) and add the following lines:
+[component pyrocleaner]
+$;PYROCLEANER_EXEC$;=/path/to/pyrocleaner
+
+# To iterate through multiple files, the script create_2files_iterator_list.pl is required. Copy this file into the ergatis bin dir and create its wrapper:
+cp create_2files_iterator_list.pl /path/to/ergatis/bin/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py	Thu Jun 09 06:09:09 2011 -0400
@@ -0,0 +1,793 @@
+#
+# Pyrocleaner 
+# Copyright (C) 2009 INRA
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# ChangeLog
+# v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the
+#                  --clean-quality option
+# v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality
+#                  Add the --aggressive option in order to keep only one read per cluster
+#                  Modify the duplication strategy so the longer read is kept
+# v1.0 (09/2009) : Pyrocleaner first version
+#
+
+__author__ = 'Plateforme bioinformatique Midi Pyrenees'
+__copyright__ = 'Copyright (C) 2009 INRA'
+__license__ = 'GNU General Public License'
+__version__ = '1.2'
+__email__ = 'support.genopole@toulouse.inra.fr'
+__status__ = 'beta'
+
+from Bio import SeqIO
+from igraph import *
+from optparse import *
+import os, math, datetime, zlib, sys, re, glob, string, gzip
+
+
+def create_roche_pairends_spacer_file(out_dir):
+    """
+    Create the Roche pairends fasta file
+      @param out_dir : the directory where the fasta file will be written
+    """
+    spacer_file = open(os.path.join(out_dir, "roche_spacers.fna"), 'w')
+    spacer_file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n")
+    spacer_file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n")
+    spacer_file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n")
+    spacer_file.close()
+    return spacer_file.name
+
+
+def version_string ():
+    """
+    Return the pyrocleaner version
+    """
+    return "pyrocleaner " + __version__
+
+
+def reads_to_sff (sff_file, seqs, output_file):
+    """
+    Extract seqs reads from the sff_file
+      @dependencies sfffile : the sfffile software is required to execute this function
+      @param sff_file      : the input sff file
+      @param seqs          : table of seqs to extract   
+      @param output_file   : the name of the output sff file
+    """
+    # First create the file listing the reads to keep
+    tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt")
+    to_write_file = open(tmp_file, 'w')
+    for read in seqs:
+        to_write_file.write(read.id + "\n")
+    to_write_file.close()
+    # Use the sfffile tools
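+    # (sfffile's -i option restricts the output sff to the accessions listed
+    # in the given file, one per line, matching the read ids written above)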
+    cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file
+    os.system(cmd)
+    # Clean temporary files
+    try: os.remove(tmp_file)
+    except: pass
+
+    
+def filter_reads (seqs, options):
+    """
+    Filter input seqs by length, Ns, complexity and quality according to the user options
+      @param seqs    : table of seqs to filter   
+      @param options : the options asked by the user
+    """
+    reads_id = []
+    reads_length = []
+    reads_ok = []
+    del_by_ns = 0
+    del_by_complexity = 0
+    del_by_length = 0
+    del_by_quality = 0
+    log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n")
+    
+    # Go through all sequences
+    for i, reads_record in enumerate(seqs) :
+        reads_id.append(reads_record.id)
+        reads_ok.append(0)
+        
+        # If asked to clean sequences by length using the standard
+        # deviation, save the length to compute some statistics
+        if options.clean_length_std :
+            reads_length.append(len(reads_record))
+        
+        # If asked to clean sequences by length using a window
+        # and the sequence has not been flagged as deleted
+        if options.clean_length_win and reads_ok[i] == 0:
+            # Check if the sequence is longer than the requested minimum; if not, flag it as deleted
+            if len(reads_record) < int(options.min):
+                reads_ok[i] = 1
+                del_by_length += 1
+                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n")
+            # Check if the sequence is shorter than the requested maximum; if not, flag it as deleted
+            elif len(reads_record) > int(options.max):
+                reads_ok[i] = 1
+                del_by_length += 1
+                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n")
+        
+        # If asked to clean sequences with too many Ns
+        # and the sequence has not been flagged as deleted
+        if options.clean_ns and reads_ok[i] == 0:
+            # Compute the rate of Ns into the current sequence
+            nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100)
+            # If the rate is higher than the threshold, flag the sequence as deleted
+            if nb_n > float(options.ns_percent) :
+                reads_ok[i] = 1
+                del_by_ns += 1
+                log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n")
+        
+        # If asked to clean low-complexity sequences using a sliding window
+        # and the sequence has not been flagged as deleted
+        if options.clean_complexity_win and reads_ok[i] == 0:
+            is_complex = False
+            # For each window
+            for win in range(0, len(reads_record)-options.window, options.step):
+                start = win
+                stop = start + options.window
+                # Compute the window complexity
+                w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100)
+                # If the window complexity reaches the threshold, flag the whole sequence as complex
+                if w_cplx >= float(options.complexity):
+                    is_complex = True
+                    break
+            # If no window has been flagged as complex, flag the sequence as deleted
+            if not is_complex:
+                reads_ok[i] = 1
+                del_by_complexity += 1
+                log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n")
+        
+        # If asked to clean low-complexity sequences working on the whole sequence
+        # and the sequence has not been flagged as deleted
+        if options.clean_complexity_full and reads_ok[i] == 0:
+            # Compute the complexity on the whole sequence
+            cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100)
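+            # Illustration: a low-complexity read (e.g. a long homopolymer)
+            # compresses very well and yields a small compressed/raw ratio,
+            # while a complex read compresses poorly and keeps a high ratio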
+            # If the complexity is lower than the threshold, flag the sequence as deleted
+            if cplx < float(options.complexity) :
+                reads_ok[i] = 1
+                del_by_complexity += 1
+                log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n")
+    
+        # If asked to clean low-quality sequences and quality information is available
+        # and the sequence has not been flagged as deleted
+        if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"):
+            # Keep the read only if at least one base has a quality score equal to or higher than the threshold
+            max_qual = max(reads_record.letter_annotations["phred_quality"])
+            if max_qual < int(options.quality_threshold) :
+                reads_ok[i] = 1
+                del_by_quality += 1
+                log.write(reads_id[i] + " deleted -> Quality ( Reads maximum quality " + str(max_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n")
+    
+    # If asked to clean sequences by length using the standard deviation
+    if options.clean_length_std :
+        # Compute the mean and the standard deviation
+        mean = sum(reads_length) / len(reads_length)
+        mq = mean**2
+        s = sum([ x**2 for x in reads_length])
+        var = s/len(reads_length) - mq
+        etype = math.sqrt(var)
+        # For each sequence
+        for i, id in enumerate(reads_id):
+            # If the sequence has not been flagged as deleted
+            if  reads_ok[i] == 0 :
+                # If the sequence length is higher than the upper threshold, flag the sequence as deleted
+                if reads_length[i] > mean + options.std*etype:
+                    reads_ok[i] = 1
+                    del_by_length += 1
+                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n")
+                # If the sequence length is smaller than the lower threshold, flag the sequence as deleted
+                elif reads_length[i] < mean - options.std*etype:
+                    reads_ok[i] = 1
+                    del_by_length += 1
+                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "-" + str(options.std) + "*" + str(etype) + " )\n")
+    
+    seqs_to_return = []
+    # Then get only sequences not flagged to be deleted
+    for i, reads_record in enumerate(seqs) :
+        if reads_ok[i] == 0:
+            seqs_to_return.append(reads_record)
+    
+    # Return values
+    return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality]
+
+
+def filter_same_reads (seqs, options):
+    """
+    Filter duplicated input seqs; if sequences are too similar, keep only one to represent the cluster
+      @param seqs    : table of seqs to filter   
+      @param options : the options asked by the user
+    """
+    
+    megablast_input = os.path.join(options.output, "megablast_input.fasta")
+    log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n")
+    log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n")
+    
+    # First write down seqs
+    fasta = open(megablast_input, "w")
+    SeqIO.write(seqs, fasta, "fasta")
+    fasta.close()
+    
+    # Then formatdb the fasta file (no formatdb utils in Biopython)
+    cmd = "formatdb -i %s -p F" % megablast_input
+    os.system(cmd)    
+    log.write("## megablast (" + str(datetime.datetime.now()) + ")\n")
+    
+    # In case of pair-ends, use words of 50 so megablast cannot link reads through the spacers
+    opts = ""
+    if options.clean_pairends:
+        opts = " -W 50 -H 10 "
+        
+    # Megablast the fasta file versus itself
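+    # The tab-delimited output of -D 3 follows the blast tabular column order
+    # (query, subject, %identity, align length, mismatches, gaps, q.start, q.end,
+    # s.start, s.end, e-value, bit score); the perl filter keeps hits between
+    # distinct reads whose query and subject alignments both start at position 1,
+    # capped at 30 hits per query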
+    cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res" 
+    os.system(cmd)
+    
+    # Let's get the reads length with the fasta file and creates the graph
+    log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n")
+    gseqs = {}
+    vertices = []
+    for reads_record in seqs :
+        vertices.append({'name': reads_record.id})
+        gseqs[reads_record.id] = len(reads_record)
+
+    # Connect reads from hits starting at 1 and with ends close to each other
+    log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n")
+    edges = []
+    for read in open(megablast_input + ".res", 'rU').readlines() :
+        parts = read.rstrip().split()
+        len1 = gseqs[parts[0]]
+        len2 = gseqs[parts[1]]
+        if options.clean_aggressive :
+            edges.append({'source': parts[0], 'target': parts[1]})
+        elif math.fabs(len1-len2) < options.duplication_limit:
+            if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit:
+                # These alignments are really similar -> may be the same read -> link them in the graph
+                edges.append({'source': parts[0], 'target': parts[1]})
+    
+    # Then get connected components and pick one read from each as the cluster leader
+    log.write("## Compute connected components (" + str(datetime.datetime.now()) + ")\n")
+    gr = Graph.DictList(vertices, edges)
+    connex = gr.clusters().membership
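+    # membership[i] is the id of the connected component holding vertices[i];
+    # reads falling in the same component are treated as duplicates of one another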
+    
+    clusters = {}
+    for i, vertex in enumerate(vertices):
+        cluster_id = connex[i]
+        clusters.setdefault(cluster_id, []).append(seqs[i])
+
+    del_by_duplicat = 0
+    # Write down into the log the composition of each cluster
+    clusters_stats = {}
+    seqs_to_return = []
+    for cluster_id in clusters:
+        cl_elts = ""
+        del_by_duplicat += len(clusters[cluster_id]) - 1
+        longest_value = 0
+        cluster_leader = None
+        for seq_record in clusters[cluster_id]:
+            # Find out which sequence is the longest: it becomes the cluster leader
+            if len(seq_record.seq) > longest_value:
+                longest_value = len(seq_record.seq)
+                cluster_leader = seq_record
+                
+        seqs_to_return.append(cluster_leader)
+        for seq_record in clusters[cluster_id]:
+            if seq_record.id != cluster_leader.id:
+                log.write(seq_record.id + " deleted -> Duplicated ( flagged as duplicate of " + cluster_leader.id + " )\n")
+            cl_elts += seq_record.id + " "
+        log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n")
+        
+        clusters_stats[len(clusters[cluster_id])] = clusters_stats.get(len(clusters[cluster_id]), 0) + 1
+    
+    # Write down a summary of what has been done
+    log_header = "## header (duplicated) : "
+    log_summary = "## summary (duplicated) : "
+    for stat in sorted(clusters_stats.keys()):
+        log_header += str(stat) + "\t"
+        log_summary += str(clusters_stats[stat]) + "\t"
+    log.write(log_header + "\n")
+    log.write(log_summary + "\n")
+    
+    # Clean temporary files
+    try:
+        os.remove(megablast_input)
+        os.remove(megablast_input+".nhr")
+        os.remove(megablast_input+".nin")
+        os.remove(megablast_input+".nsq")
+        os.remove(megablast_input+".res")
+        os.remove("formatdb.log")
+        os.remove("error.log")
+    except:
+        pass
+    
+    # Returns results
+    return [seqs_to_return, del_by_duplicat]
+
+
+def filter_pairends(seqs, options):
+    """
+    Filter pair-end sequences and write sequences without a valid pair to a shotgun fasta file
+      @param seqs    : the table of sequences to filter
+      @param options : the options asked by the user
+    """
+    
+    # Setup output files
+    shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta")
+    shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual")
+    crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta")
+    split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta")
+    split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual")
+    
+    # First write down seqs
+    fasta = open(crossmatch_input, "w")
+    SeqIO.write(seqs, fasta, "fasta")
+    fasta.close()
+    
+    # Write down the qual file for cross_match if possible
+    try:
+        qual = open(crossmatch_input+".qual", "w")
+        SeqIO.write(seqs, qual, "qual")
+        qual.close()
+    except:
+        os.remove(crossmatch_input+".qual")
+
+    # Cross_match reverse match pattern
+    rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
+    # Cross_match forward match pattern
+    fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")
+    
+    # Write the spacer file and execute cross_match against the input sequences
+    spacers_file = create_roche_pairends_spacer_file(options.output)
+    cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res"
+    os.system(cmd)
+       
+    # Parse the cross_match file
+    cross_match_tab = {}
+    block_found = False
+    for line in open(crossmatch_input + ".cross_match.res", 'r'):
+        save_line = False
+        rm = rev_regex.match(line)
+        fm = fwd_regex.match(line)
+        if rm != None: # If it's a reverse match
+            block_found = True
+            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8)))
+            save_line = True
+        elif fm != None: # If it's a forward match
+            block_found = True
+            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8)))
+            save_line = True
+        else : save_line = False
+
+        if line.startswith("Discrepancy summary:"): # This is the end of the section
+            break
+        
+        # Save the line (one entry per spacer match found on the read)
+        if save_line:
+            if not cross_match_tab.has_key(primary_match):
+                cross_match_tab[primary_match] = []
+            cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)])
+    
+    # Then get the spacers_length
+    spacers_length = {}
+    for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") :
+        spacers_length[seq_record.id] = len(seq_record.seq.tostring())
+    
+    # Finally go through all sequences
+    seqs_to_return = []
+    shotgun_seqs = []
+    del_by_pairends = 0
+    pe_splited = []
+        
+    for i, seq_record in enumerate(seqs) :
+        if not cross_match_tab.has_key(seq_record.id):
+            # No spacers found -> just add it to the shotgun_seqs tab
+            shotgun_seqs.append(seq_record)
+            log.write(seq_record.id + " shotgun -> no spacer found\n")
+        elif len(cross_match_tab[seq_record.id]) > 1:
+            # If multiple spacers -> delete the whole sequence
+            log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n")
+            del_by_pairends += 1
+        elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)):
+            if (cross_match_tab[seq_record.id][0][1] == 1):
+                shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
+                log.write(seq_record.id + " shotgun -> spacer found at the begining\n")
+            elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)):
+                shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
+                log.write(seq_record.id + " shotgun -> spacer found at the end\n")
+            else :
+                log.write(seq_record.id + " deleted -> partiel spacer found in the middle of the read \n")
+                del_by_pairends += 1
+        elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit:
+            seqs_to_return.append(seq_record)
+            if options.split_pairends:
+                sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:]
+                sub_seq1.id = seq_record.id + ".r"
+                pe_splited.append(sub_seq1)   
+                sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]]
+                sub_seq2.id = seq_record.id + ".f"
+                pe_splited.append(sub_seq2)
+        elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
+            log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n")
+            del_by_pairends += 1
+        elif cross_match_tab[seq_record.id][0][1] < options.border_limit:
+            shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
+            log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n")
+        elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
+            shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
+            log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n")
+    
+    # If required, write down the split pair-ends files
+    if options.split_pairends:
+        handle = open(split_pairends_fasta, "w")
+        SeqIO.write(pe_splited, handle, "fasta")
+        handle.close()
+        handle = open(split_pairends_qual, "w")
+        SeqIO.write(pe_splited, handle, "qual")
+        handle.close()
+    
+    # Finally clean up the new shotgun seqs
+    [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options)
+    handle = open(shotgun_ffile, "w")
+    SeqIO.write(shotgun_seqs_clean, handle, "fasta")
+    handle.close()
+    in_shot_gun = 0
+    try:
+        handle = open(shotgun_qfile, "w")
+        SeqIO.write(shotgun_seqs_clean, handle, "qual")
+        handle.close()
+        in_shot_gun = len(shotgun_seqs_clean)
+    except:
+        pass
+    
+    log.write("## header (pairends) : pairends\ttotal shotgun\n")
+    log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n")
+    
+    # Clean temporary files
+    try:
+        os.remove(crossmatch_input)
+        os.remove(spacers_file)
+        os.remove(crossmatch_input+".cross_match.res")
+        os.remove(crossmatch_input+".log")
+        os.remove(crossmatch_input+".qual")
+    except:
+        pass
+    
+    # Then return the pair-end reads
+    return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends]
+
+
+def get_seqs (options):
+    """
+    Convert the input file into a table of BioPython SeqRecord objects
+      @param options : the options asked by the user
+    """
+    
+    # First get the fasta and/or qual input files
+    qual_file = ""
+    if options.format == "sff":
+        sff_file = options.input_file
+        
+        if sff_file.endswith(".gz"):
+            # Gunzip the given file into the output directory
+            r_file = gzip.GzipFile(sff_file, 'r')
+            write_file = os.path.join(options.output, os.path.basename(sff_file)[:-len('.gz')])
+            w_file = open(write_file, 'w')
+            w_file.write(r_file.read())
+            w_file.close()
+            r_file.close()
+            sff_file = write_file
+            
+        base = os.path.basename(sff_file)
+        fasta_file = os.path.join(options.output, base + '.fasta')
+        qual_file = os.path.join(options.output, base + '.qual')
+        xml_file = os.path.join(options.output, base + '.xml')
+        format = "fasta"
+        if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
+            # First extract the sff file to fasta, qual and xml files
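+            # (per sff_extract usage: -c applies the clipping information, and
+            # -s/-q/-x set the fasta, qual and xml output file names)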
+            cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
+            os.system(cmd)
+        else :
+            log = open(os.path.join(options.output, options.log_file), "a+")
+            log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
+            log.close()
+    elif options.format == "fasta" or options.format == "fastq":
+        format = options.format
+        fasta_file = options.input_file
+    else :
+        print "Error : " + options.format + " is not a supported format!"
+        sys.exit(1)
+
+    if options.qual_file:
+        qual_file = options.qual_file
+
+    # If we got a quality file
+    if qual_file :
+        quals = {}
+        # Open the (possibly gzipped) quality file and index qualities by read id
+        if qual_file.endswith(".gz"):
+            qual_handle = gzip.open(qual_file)
+        else :
+            qual_handle = open(qual_file)
+        for qual in SeqIO.parse(qual_handle, "qual") :
+            quals[qual.id] = qual
+    
+    # Open the (possibly gzipped) fasta/fastq file and attach qualities when available
+    if fasta_file.endswith(".gz"):
+        fasta_handle = gzip.open(fasta_file)
+    else :
+        fasta_handle = open(fasta_file)
+    seqs = []
+    for record in SeqIO.parse(fasta_handle, format) :
+        if qual_file :
+            record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
+        seqs.append(record)
+    
+    # Clean temporary files
+    if options.format == "sff":
+        try:
+            os.remove(fasta_file)
+            os.remove(qual_file)
+            os.remove(xml_file)
+        except:
+            pass
+    
+    # Returns the table of sequences
+    return [seqs]
+
+def which (program):
+    """
+    Return the full path of the program if it is found on the user's PATH, None otherwise
+      @param program : the program name or path to look for
+    """
+    def is_exe(fpath):
+        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
+    fpath, fname = os.path.split(program)
+    if fpath:
+        if is_exe(program):
+            return program
+    else:
+        for path in os.environ["PATH"].split(os.pathsep):
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return exe_file
+    return None
+
+def pyrocleaner_depts_error(options):
+    """
+    Return the list of dependencies missing to run the pyrocleaner
+    """
+    error = ""
+    if which("formatdb") == None:
+        error += " - formatdb is missing\n" 
+    if which("megablast") == None:
+        error += " - megablast is missing\n"
+    if which("sff_extract.py") == None:
+        error += " - sff_extract.py\n" 
+    if which("cross_match") == None and options.clean_pairends != None:
+        error += " - cross_match\n"
+    if error != "":
+        error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!"
+    return error
+    
+
+if __name__ == "__main__":
+
+    usage = "usage: %prog -i file -o output -f format"
+    desc = " Clean sequences from 454 SFF files \n"\
+           " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win"
+    parser = OptionParser(usage = usage, version = version_string(), description = desc)
+
+    igroup = OptionGroup(parser, "Input files options","")
+    igroup.add_option("-i", "--in", dest="input_file",
+                      help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
+    igroup.add_option("-q", "--qual", dest="qual_file",
+                      help="The quality file to use if input file is fasta", metavar="FILE")
+    igroup.add_option("-f", "--format", dest="format",
+                      help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff")
+    parser.add_option_group(igroup)
+    
+    ogroup = OptionGroup(parser, "Output files options","")
+    ogroup.add_option("-o", "--out", dest="output",
+                      help="The output folder where to store results")
+    ogroup.add_option("-g", "--log", dest="log_file",
+                      help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log")
+    ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends", 
+                      default=False, help="Write splited pairends sequences into a fasta file")
+    parser.add_option_group(ogroup)
+
+    cgroup = OptionGroup(parser, "Cleaning options","")
+    cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends", 
+                      default=False, help="Clean pairends")
+    cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std", 
+                      default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation")
+    cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win", 
+                      default=False, help="Filter reads with a legnth in between [x:y]")
+    cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", 
+                      default=False, help="Filter reads with too much N")
+    cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads", 
+                      default=False, help="Filter duplicated reads")
+    cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win", 
+                      default=False, help="Filter low complexity reads computed on a sliding window")
+    cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full", 
+                      default=False, help="Filter low complexity reads computed on the whole sequence")        
+    cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality", 
+                      default=False, help="Filter low quality reads")  
+    parser.add_option_group(cgroup)
+    
+    pgroup = OptionGroup(parser, "Processing options","")
+    pgroup.add_option("-a", "--acpus", dest="nb_cpus",
+                      help="Number of processors to use", type="int", default=1)
+    pgroup.add_option("-r", "--recursion", dest="recursion",
+                      help="Recursion limit when computing duplicated reads", type="int", default=1000)
+    parser.add_option_group(pgroup)
+    
+    cpgroup = OptionGroup(parser, "Cleaning parameters","")
+    cpgroup.add_option("-b", "--border-limit", dest="border_limit",
+                      help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70)
+    cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive", 
+                      default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)")  
+    cpgroup.add_option("-e", "--missmatch", dest="missmatch",
+                      help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10)
+    cpgroup.add_option("-j", "--std", dest="std",
+                      help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2)
+    cpgroup.add_option("-1", "--min", dest="min",
+                      help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200)
+    cpgroup.add_option("-2", "--max", dest="max",
+                       help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600)        
+    cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold",
+                       help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35) 
+    cpgroup.add_option("-s", "--ns_percent", dest="ns_percent",
+                      help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4)        
+    cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit",
+                      help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70)        
+    cpgroup.add_option("-v", "--window", dest="window",
+                      help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100)
+    cpgroup.add_option("-x", "--step", dest="step",
+                      help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5)
+    cpgroup.add_option("-y", "--complexity", dest="complexity",
+                      help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40)
+    parser.add_option_group(cpgroup)        
+
+    (options, args) = parser.parse_args()
+    
+    sys.setrecursionlimit(options.recursion)
+
+    if not pyrocleaner_depts_error(options):
+        
+        if options.input_file == None or options.output == None:
+            parser.print_help()
+            sys.exit(1)
+        elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality :
+            print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help"
+            sys.exit(1)         
+        else:
+
+            # Create output directory if doesn't exist
+            if not os.path.isdir(options.output) : os.mkdir(options.output )
+            
+            global log
+            log = open(options.output+"/"+options.log_file, "w")
+            log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
+            log.write("## with the following options: \n")
+            opts = ""
+            if options.clean_complexity_full:
+                opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n" 
+            if options.clean_complexity_win:
+                opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n" 
+            if options.clean_duplicated_reads and not options.clean_aggressive:
+                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates.\n"
+            elif options.clean_duplicated_reads and options.clean_aggressive :
+                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates and keep only one read per cluster.\n" 
+            if options.clean_length_std:
+                opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviations and reads longer than reads length mean plus " + str(options.std) + " standard deviations.\n" 
+            if options.clean_length_win:
+                opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n" 
+            if options.clean_ns:
+                opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" 
+            if options.clean_pairends:
+                opts += " - Clean pair-end reads, keeping them only if the sequence between the spacer and the read beginning/end is longer than " + str(options.border_limit) + " nucleotides, allowing " + str(options.missmatch) + " nucleotides of mismatch with the spacer.\n"
+            if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) :
+                opts += " - Clean reads if no base quality score is equal to or higher than " + str(options.quality_threshold) + ".\n"
+            elif options.clean_quality :
+                print "No quality check will be performed because no quality file was provided."
+            log.write(opts)  
+            log.close()
+            
+            # 1 - First get inputs from options
+            [seqs] = get_seqs(options)
+            before_cleaning = len(seqs)
+            del_by_length = 0
+            del_by_ns = 0
+            del_by_complexity = 0
+            del_by_duplicat = 0
+            del_by_pairends = 0
+            del_by_quality = 0
+            in_shot_gun = 0
+            
+            log = open(options.output+"/"+options.log_file, "a+")
+            # 2 - clean reads
+            # 2.1 - Clean reads by length, Ns, complexity and quality
+            if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality:
+                [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options)
+                    
+            # 2.2 - Clean reads by duplicates
+            if options.clean_duplicated_reads:
+                [seqs, del_by_duplicat] = filter_same_reads(seqs, options)
+            
+            # 3 - Clean pairends
+            if options.clean_pairends:
+                [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options)
+                del_by_length += del_by_length_pe
+                del_by_ns += del_by_ns_pe
+                del_by_complexity += del_by_complexity_pe
+                del_by_quality += del_by_quality_pe
+            
+            # 4 - Create the output in the same format as the input
+            after_cleaning = len(seqs) + in_shot_gun
+            if options.input_file.endswith(".gz"):
+                ifile = options.input_file[:-len('.gz')]
+            else : ifile = options.input_file
+            if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean")
+            else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean")
+            output += "." + options.format
+            if options.format == "sff" :
+                if options.input_file.endswith(".gz"):
+                    sfffile = os.path.join(options.output, os.path.basename(options.input_file)[:-len('.gz')])
+                else :
+                    sfffile = options.input_file
+                if which("sfffile") == None:
+                    fastq = open(output, "w")
+                    SeqIO.write(seqs, fastq, "fastq")
+                    fastq.close()
+                else : 
+                    reads_to_sff(sfffile, seqs, output)
+                    # Clean temporary files
+                    if options.input_file.endswith(".gz"):
+                        os.remove(sfffile)
+                        
+            elif options.format == "fasta" or options.format == "fastq":
+                fasta = open(output, "w")
+                SeqIO.write(seqs, fasta, options.format)
+                fasta.close()
+                if options.format == "fasta" and options.qual_file != None:
+                    qual = open(os.path.splitext(output)[0]+".qual", "w")
+                    SeqIO.write(seqs, qual, "qual")
+                    qual.close()
+            
+            # 5 - Write down the analysis summary
+            log.write("## header (global) : nb reads at beginning\tnb reads after clean up\tlength\tns\tcomplexity\tduplicates\tpairends\tquality\n")
+            log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning)  + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n")
+            log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
+            log.close()           
+            sys.exit(0)
+    else : 
+        print pyrocleaner_depts_error(options)
+        sys.exit(1)
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrocleaner_galaxy_tool_V1.2/pyrocleaner.xml	Thu Jun 09 06:09:09 2011 -0400
@@ -0,0 +1,196 @@
+<tool id="pyrocleaner" name="pyrocleaner" version="1.2">
+	<description>454 sequence cleaning</description>
+    <requirements>
+        <requirement type="binary">megablast</requirement>
+        <requirement type="binary">formatdb</requirement>
+        <requirement type="binary">cross_match</requirement>
+        <requirement type="binary">sfffile</requirement>
+        <requirement type="binary">sff_extract.py</requirement>
+        <requirement type="python-module">Bio</requirement>
+        <requirement type="python-module">igraph</requirement>
+    </requirements>
+	<command interpreter="python">pyrocleaner_wrapper.py
+		#set $options = " --in='%s' " % ($input)
+		#if $clean_pairends.clean_pairends_select=="y":
+			--out-pe-qual="$out_pe_qual"
+			--out-pe-fasta="$out_pe_fasta"
+			#set $options = $options + " --clean-pairends --border-limit='%s' --missmatch='%s'" % ($clean_pairends.border_limit,$clean_pairends.missmatch)
+    	#end if
+		#if $clean_length_std.clean_length_std_select=="y":
+			#set $options = $options + " --clean-length-std --std='%s'" % ($clean_length_std.std)
+    	#end if
+		#if $clean_length_win.clean_length_win_select=="y":
+			#set $options = $options + " --clean-length-win --min='%s' --max='%s'" % ($clean_length_win.min,$clean_length_win.max)
+    	#end if
+		#if $clean_ns.clean_ns_select=="y":
+			#set $options = $options + " --clean-ns --ns_percent='%s'" % ($clean_ns.ns_percent)
+    	#end if
+		#if $clean_duplicated_reads.clean_duplicated_reads_select=="y":
+			#set $options = $options + " --clean-duplicated-reads --duplication_limit='%s'" % ($clean_duplicated_reads.duplication_limit)
+			#if $str($clean_duplicated_reads.aggressive) != 'None':
+				#set $options = $options + " --aggressive"
+			#end if
+    	#end if
+		#if $clean_complexity_win.clean_complexity_win_select=="y":
+			#set $options = $options + " --clean-complexity-win --complexity='%s' --window='%s' --step='%s'" % ($clean_complexity_win.complexity,$clean_complexity_win.window,$clean_complexity_win.step)
+    	#end if
+		#if $clean_complexity_full.clean_complexity_full_select=="y":
+			#set $options = $options + " --clean-complexity-full --complexity='%s'" % ($clean_complexity_full.complexity)
+    	#end if
+		#if $clean_quality.clean_quality_select=="y":
+			#set $options = $options + " --clean-quality --quality-threshold='%s'" % ($clean_quality.quality_threshold)
+    	#end if
+    	--options="$options" --log="$log" --output="$output" --out-dir="$output.extra_files_path" --format="$input.extension"
+	</command>
+	<inputs>
+		<param name="input" type="data" format="sff,fastq" label="454 reads" />
+		<conditional name="clean_pairends">
+    		<param name="clean_pairends_select" type="select" label="Clean pairends">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+				<param name="border_limit" type="integer" size="10" value="70" label="Minimal length between the spacer and the read extremity"/>
+				<param name="missmatch" type="integer" size="10" value="10" label="Limit of missmatch"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional>
+		<conditional name="clean_length_std">
+    		<param name="clean_length_std_select" type="select" label="Filter reads shorter than mean less x*standard deviation and reads longer than mean plus x*standard deviation">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+			   <param name="std" type="integer" size="10" value="2" label="Number of standard deviation"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional>  
+		<conditional name="clean_length_win">
+    		<param name="clean_length_win_select" type="select" label="Filter reads with a length within specified values">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+				<param name="min" type="integer" size="10" value="200" label="Minimal length"/>
+				<param name="max" type="integer" size="10" value="600" label="Maximal length"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional>  
+		<conditional name="clean_ns">
+    		<param name="clean_ns_select" type="select" label="Filter reads with too many N">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+		    	<param name="ns_percent" type="integer" size="10" value="4" label="Percentage of N to use to filter reads"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional> 
+		<conditional name="clean_duplicated_reads">
+    		<param name="clean_duplicated_reads_select" type="select" label="Filter duplicated reads">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+	      		<param name="aggressive" type="select" display="checkboxes" multiple="True" label="Keep only one read per cluster"> 
+	        		<option value="--aggressive">Yes</option>
+	      		</param>
+		    	<param name="duplication_limit" type="integer" size="10" value="70" label="Limit size difference to use when cleaning duplicated reads"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional> 
+		<conditional name="clean_complexity_win">
+    		<param name="clean_complexity_win_select" type="select" label="Filter low complexity reads computed on a sliding window">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+				<param name="window" type="integer" size="10" value="100" label="The window size to use when filtering reads based on their complexity"/>
+   				<param name="step" type="integer" size="10" value="5" label="The window step to use when filtering reads based on their complexity"/>
+   				<param name="complexity" type="integer" size="10" value="40" label="Minimal complexity/length ratio to use when filtering reads based on their complexity"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional> 
+		<conditional name="clean_complexity_full">
+    		<param name="clean_complexity_full_select" type="select" label="Filter low complexity reads computed on the whole sequence">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+   				<param name="complexity" type="integer" size="10" value="40" label="Minimal complexity/length ratio to use when filtering reads based on their complexity"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional>  
+		<conditional name="clean_quality">
+    		<param name="clean_quality_select" type="select" label="Filter low quality reads">
+        		<option value="y">Yes</option>
+        		<option selected="true" value="n">No</option>
+    		</param>
+		    <when value="y">
+		    	<param name="quality_threshold" type="integer" size="10" value="35" label="At least one base pair has to be equal or higher than this value when filtering reads considering their quality"/>
+		    </when>
+		    <when value="n">
+		    </when>
+		</conditional>  
+	</inputs>
+  <outputs>
+    <data name="log" format="txt" label="${tool.name} on ${on_string}: log file"/>
+    <data name="output" format="input" label="${tool.name} on ${on_string}: cleaned reads"/>
+   	<data name="out_pe_fasta" format="fasta" label="${tool.name} on ${on_string}: shotgun reads">
+   		<filter>clean_pairends['clean_pairends_select'] == "y"</filter>
+   	</data>
+   	<data name="out_pe_qual" format="qual" label="${tool.name} on ${on_string}: shotgun reads">
+   		<filter>clean_pairends['clean_pairends_select'] == "y"</filter>
+   	</data>
+  </outputs>
+  <tests>
+    <!--
+    <test>
+      <param name="input1_file" value="3.fastqsanger" ftype="fastqsanger" />
+      <output name="output1_file" file="split_pair_reads_1.fastqsanger" />
+      <output name="output2_file" file="split_pair_reads_2.fastqsanger" />
+    </test>
+    -->
+  </tests>
+  <help>
+**What it does**
+
+Pyrocleaner is a product of the Genotoul bioinformatics platform.
+
+PyroCleaner is intended to clean reads coming from pyrosequencing in order to ease the assembly
+process. It filters out multi-copy reads and applies other criteria such as length, complexity and
+the number of undetermined bases, which has been shown to correlate with poor quality. It can also
+clean sff files of paired-end sequences, generating on one side an sff file with the validated
+paired-ends and on the other the shotgun reads.
+
+
+.. _pyrocleaner: https://mulcyber.toulouse.inra.fr/plugins/mediawiki/wiki/pyrocleaner/index.php/Main_Page
+
+-----
+
+**Input format**
+
+Any sff, fasta or fastq file. For example, a fastq file::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+-----
+
+**Outputs**
+
+A cleaned file in the same format as the input.
+If the user asks to clean pair-end sequences, additional fasta and qual files will be created
+containing the shotgun sequences trimmed of the spacer.
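+
+For example, for an input named reads.sff cleaned with the pair-end option, the outputs would
+typically be named (indicative, following the script's naming scheme)::
+
+    reads.pairends.clean.sff     validated paired-end reads
+    reads.shotgun.clean.fasta    shotgun reads
+    reads.shotgun.clean.qual     shotgun read qualities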
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrocleaner_galaxy_tool_V1.2/pyrocleaner_wrapper.py	Thu Jun 09 06:09:09 2011 -0400
@@ -0,0 +1,88 @@
+#
+# Pyrocleaner wrapper for galaxy
+# Copyright (C) 2009 INRA
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#
+
+__author__ = 'Plateforme bioinformatique Midi Pyrenees'
+__copyright__ = 'Copyright (C) 2009 INRA'
+__license__ = 'GNU General Public License'
+__version__ = '1.2'
+__email__ = 'support.genopole@toulouse.inra.fr'
+__status__ = 'beta'
+
+import optparse, os, shutil, subprocess, sys, tempfile, re, string
+    
+if __name__ == "__main__":
+    
+    parser = optparse.OptionParser()
+    
+    parser.add_option( '-s', '--options',      dest='options',      help='The pyrocleaner options' )
+    parser.add_option( '-l', '--log',          dest='log',          help='Path to the pyrocleaner log' )
+    parser.add_option( '-f', '--format',       dest='format',       help='The input file format (sff|fastq)' )
+    parser.add_option( '-o', '--output',       dest='output',       help='The output file path' )
+    parser.add_option( '-a', '--out-pe-qual',  dest='out_pe_qual',  help='The output pairends qual file path' )
+    parser.add_option( '-b', '--out-pe-fasta', dest='out_pe_fasta', help='The output pairends fasta file path' )
+    parser.add_option( '-d', '--out-dir',      dest='out_dir',      help='The output dir where to process the data' )
+    
+    (options, args) = parser.parse_args()
+    
+    buffsize = 1048576
+    tmp_dir = tempfile.mkdtemp()
+    
+    try:
+    
+        if options.format == "sff": format = "sff"
+        else : format = "fastq"
+        
+        tmp_stderr_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.err' ).name
+        tmp_stderr = open( tmp_stderr_name, 'wb' )
+        tmp_stdout_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.out' ).name
+        tmp_stdout = open( tmp_stdout_name, 'wb' )
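+        # Build the command line; 'pyrocleaner' is the extensionless symlink to
+        # pyrocleaner.py described in section 4 of the README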
+        cmd = 'pyrocleaner --format %s %s --out %s' % (format, options.options, options.out_dir)
+        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp_stderr_name, 'rb' )
+        stderr = ''
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+        
+        # Move the log file to the right place
+        shutil.move(os.path.join(options.out_dir, "pyrocleaner.log"), options.log)
+        
+        for file in os.listdir(options.out_dir):
+            # If the file has the input file format and is not a shotgun file
+            if file.endswith(format):
+                shutil.move(os.path.join(options.out_dir, file), options.output)
+            # If it's a quality file from shotgun
+            if file.endswith(".qual") :
+                shutil.move(os.path.join(options.out_dir, file), options.out_pe_qual)
+            # If it's a fasta file from shotgun
+            elif file.endswith(".fasta") :
+                shutil.move(os.path.join(options.out_dir, file), options.out_pe_fasta)
+
+    except Exception, e:
+        raise Exception, 'Error executing pyrocleaner: ' + str(e)