Mercurial > repos > jmariette > pyrocleaner
view pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py @ 0:ef5dd11c01e6 default tip
pyrocleaner v1.2
author | g2cmnty@test-web1.g2.bx.psu.edu |
---|---|
date | Thu, 09 Jun 2011 06:09:09 -0400 |
parents | |
children |
line wrap: on
line source
# # Pyrocleaner # Copyright (C) 2009 INRA # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # # ChangeLog # v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the # --clean-quality option # v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality # Add the --aggressive option in order to keep only one read per cluster # Modify the duplication strategy so the longer read is keept # v1.0 (09/2009) : Pyrocleaner first version # __author__ = 'Plateforme bioinformatique Midi Pyrenees' __copyright__ = 'Copyright (C) 2009 INRA' __license__ = 'GNU General Public License' __version__ = '1.2' __email__ = 'support.genopole@toulouse.inra.fr' __status__ = 'beta' from Bio import SeqIO from igraph import * from optparse import * import os, math, datetime, zlib, sys, re, glob, string, gzip def create_roche_pairends_spacer_file(out_dir): """ Create the Roche pairends fasta file @param out_dir : the directory where will be writen the fasta file """ file = open(os.path.join(out_dir, "roche_spacers.fna"), 'wr') file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n") file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n") file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n") file.close() return file.name def version_string (): """ Return the pyrocleaner version """ return "pyrocleaner " + __version__ def reads_to_sff (sff_file, seqs, output_file): """ Extract seqs reads from the sff_file @dependences sfffile : the sff software is required to execute this function @param sff_file : the input sff file @param seqs : table of seqs to extract @param output_file : the name of the output sff file """ # First creates the to_write file tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt") to_write_file = open(tmp_file, 'wr') for read in seqs: to_write_file.write(read.id + "\n") to_write_file.close() # Use the sfffile tools cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file os.system(cmd) # Clean temporary files try: os.remove(tmp_file) except: pass def filter_reads (seqs, options): """ Filter input seqs by length, ns and complexity if options are asked by user @param seqs : table of seqs to filter @param options : the options asked by the user """ reads_id = [] reads_length = [] reads_ok = [] del_by_ns = 0 del_by_complexity = 0 del_by_length = 0 del_by_quality = 0 log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n") # Go throught all sequences for i, reads_record in enumerate(seqs) : reads_id.append(reads_record.id) reads_ok.append(0) # If is asked to clean sequences by length using the standard # deviation, save length to compute some statistics if options.clean_length_std : reads_length.append(len(reads_record)) # If is asked to clean sequences by length using a window # and the sequence has not been flagged as deleted if options.clean_length_win and reads_ok[i] == 0: # Check if the sequence is longer than the min asked, if not flagged it as deleted if len(reads_record) < int(options.min): reads_ok[i] = 1 del_by_length += 1 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n") # Check if the sequence is smaller than the max asked, if not flagged it as deleted elif len(reads_record) > int(options.max): reads_ok[i] = 1 del_by_length += 1 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n") # If is asked to clean sequences with too much Ns # and the sequence has not been flagged as deleted if options.clean_ns and reads_ok[i] == 0: # Compute the rate of Ns into the current sequence nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100) # If the rate is higher than the threshold flagged it as deleted if nb_n > float(options.ns_percent) : reads_ok[i] = 1 del_by_ns += 1 log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n") # If is asked to clean sequences with low complexity using a sliding window # and the sequence has not been flagged as deleted if options.clean_complexity_win and reads_ok[i] == 0: is_complex = False # For each window for win in range(0, len(reads_record)-options.window, options.step): start = win stop = start + options.window # Compute the window complexity w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100) # if the window complexity is higher to the threshold, flag the whole sequence as complex if w_cplx >= float(options.complexity): is_complex = True break # If no window has been flagged as complex, then flagg the sequence as deleted if not is_complex: reads_ok[i] = 1 del_by_complexity += 1 log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n") # If is asked to clean sequences with low complexity working on the whole sequence # and the sequence has not been flagged as deleted if options.clean_complexity_full and reads_ok[i] == 0: # Compute the complexity on the whole sequence cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100) # If the complexity is higher to the threshold, flag the sequence as deleted if cplx < float(options.complexity) : reads_ok[i] = 1 del_by_complexity += 1 log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n") # If is asked to clean sequences with low quality and quality information is available # and the sequence has not been flagged as deleted if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"): # If at least one base has a quality score higher than the threashold maw_qual = max(reads_record.letter_annotations["phred_quality"]) if maw_qual < int(options.quality_threshold) : reads_ok[i] = 1 del_by_quality += 1 log.write(reads_id[i] + " deleted -> Quality ( Reads minimum quality " + str(maw_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n") # If is asked to clean sequences by length using the standard deviation if options.clean_length_std : # Compute the mean and the standard deviation mean = sum(reads_length) / len(reads_length) mq = mean**2 s = sum([ x**2 for x in reads_length]) var = s/len(reads_length) - mq etype = math.sqrt(var) # For each sequences for i, id in enumerate(reads_id): # If the sequence has not been flagged as deleted if reads_ok[i] == 0 : # If the sequence length is higher than the upper threshold, flag the sequence as deleted if reads_length[i] > mean + options.std*etype: reads_ok[i] = 1 del_by_length += 1 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") # If the sequence length is smaller than the lower threshold, flag the sequence as deleted elif reads_length[i] < mean - options.std*etype: reads_ok[i] = 1 del_by_length += 1 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") seqs_to_return = [] # Then get only sequences not flagged to be deleted for i, reads_record in enumerate(seqs) : if reads_ok[i] == 0: seqs_to_return.append(reads_record) # Return values return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality] def filter_same_reads (seqs, options): """ Filter input seqs by duplicat, if sequences are too similar keep only one to represent the cluster @param seqs : table of seqs to filter @param options : the options asked by the user """ megablast_input = os.path.join(options.output, "megablast_input.fasta") log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n") log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n") # First write down seqs fasta = open(megablast_input, "w") SeqIO.write(seqs, fasta, "fasta") fasta.close() # Then formatdb the fasta file (no formatdb utils in Biopython) cmd = "formatdb -i %s -p F" % megablast_input os.system(cmd) log.write("## megablast (" + str(datetime.datetime.now()) + ")\n") # In case of pairends, use words of 50, so megablast cannot connect with spacers opts = "" if options.clean_pairends: opts = " -W 50 -H 10 " # Megablast the fasta file versus itself cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res" os.system(cmd) # Let's get the reads length with the fasta file and creates the graph log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n") gseqs = {} vertices = [] for reads_record in seqs : vertices.append({'name': reads_record.id}) gseqs[reads_record.id] = len(reads_record) # Connects reads from hits starting at 1 and with ends closed to each others log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n") edges = [] for read in open(megablast_input + ".res", 'rU').readlines() : parts = read.rstrip().split() len1 = gseqs[parts[0]] len2 = gseqs[parts[1]] if options.clean_aggressive : edges.append({'source': parts[0], 'target': parts[1]}) elif math.fabs(len1-len2) < options.duplication_limit: if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit: # This alignments are realy similar -> may be the same -> link them into the graph edges.append({'source': parts[0], 'target': parts[1]}) # Then get connected components and extract one of them as cluster leader log.write("## Comput connected components (" + str(datetime.datetime.now()) + ")\n") gr = Graph.DictList(vertices, edges) connex = gr.clusters().membership clusters = {} for i, vertex in enumerate(vertices): cluster_id = connex[i] try: clusters[cluster_id].append(seqs[i]) except: clusters[cluster_id] = [] clusters[cluster_id].append(seqs[i]) del_by_duplicat = 0 # Write down into the log the composition of each cluster clusters_stats = {} seqs_to_return = [] for cluster_id in clusters: cl_elts = "" del_by_duplicat += len(clusters[cluster_id]) - 1 longest_value = 0 cluster_leader = None for seq_record in clusters[cluster_id]: # Find out which sequence is the longest if len(seq_record.seq) > longest_value: cluster_leader = seq_record seqs_to_return.append(cluster_leader) for seq_record in clusters[cluster_id]: if seq_record.id != cluster_leader.id: log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicat )\n") cl_elts += seq_record.id + " " log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n") try : clusters_stats[len(clusters[cluster_id])] += 1 except: clusters_stats[len(clusters[cluster_id])] = 1 # Write down a summary of what has been done log_header = "## header (duplicated) : " log_summary = "## summary (duplicated) : " for stat in sorted(clusters_stats.keys()): log_header += str(stat) + "\t" log_summary += str(clusters_stats[stat]) + "\t" log.write(log_header + "\n") log.write(log_summary + "\n") # Clean temporary files try: os.remove(megablast_input) os.remove(megablast_input+".nhr") os.remove(megablast_input+".nin") os.remove(megablast_input+".nsq") os.remove(megablast_input+".res") os.remove("formatdb.log") os.remove("error.log") except: pass # Returns results return [seqs_to_return, del_by_duplicat] def filter_pairends(seqs, options): """ Filter pairends sequences and split sequences without pairends into a fasta file @param seqs : the table of sequences to filter @param options : the options asked by the user """ # Setup output files shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta") shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual") crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta") split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta") split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual") # First write down seqs fasta = open(crossmatch_input, "w") SeqIO.write(seqs, fasta, "fasta") fasta.close() # Write down the qual file for cross_match if possible try: qual = open(crossmatch_input+".qual", "w") SeqIO.write(seqs, qual, "qual") qual.close() except: os.remove(crossmatch_input+".qual") # Cross_match reverse matches pattern rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)") # Cross_match forward matches pattern fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+") # Write the spacer file and execute cross_match against the input sequences spacers_file = create_roche_pairends_spacer_file(options.output) cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res" os.system(cmd) # Parse the cross_match file cross_match_tab = {} block_found = False for line in open(crossmatch_input + ".cross_match.res", 'r'): save_line = False rm = rev_regex.match(line) fm = fwd_regex.match(line) if rm != None: # If it's a reverse matches block_found = True (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8))) save_line = True elif fm != None: # If it's a forward matches block_found = True (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8))) save_line = True else : save_line = False if line.startswith("Discrepancy summary:"): # This is the end of the section break # Save the line if save_line: try: cross_match_tab[primary_match][secondary_match].append([startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch]) except: cross_match_tab[primary_match] = [] cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]) # Then get the spacers_length spacers_length = {} for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") : spacers_length[seq_record.id] = len(seq_record.seq.tostring()) # Finaly go throught all sequences seqs_to_return = [] shotgun_seqs = [] del_by_pairends = 1 pe_splited = [] for i, seq_record in enumerate(seqs) : if not cross_match_tab.has_key(seq_record.id): # No spacers found -> just add it to the shotgun_seqs tab shotgun_seqs.append(seq_record) log.write(seq_record.id + " shotgun -> no spacer found\n") elif len(cross_match_tab[seq_record.id]) > 1: # If multiple linker -> delete the whole sequence log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n") del_by_pairends += 1 elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)): if (cross_match_tab[seq_record.id][0][1] == 1): shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) log.write(seq_record.id + " shotgun -> spacer found at the begining\n") elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)): shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) log.write(seq_record.id + " shotgun -> spacer found at the end\n") else : log.write(seq_record.id + " deleted -> partiel spacer found in the middle of the read \n") del_by_pairends += 1 elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit: seqs_to_return.append(seq_record) if options.split_pairends: sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:] sub_seq1.id = seq_record.id + ".r" pe_splited.append(sub_seq1) sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]] sub_seq2.id = seq_record.id + ".f" pe_splited.append(sub_seq2) elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n") del_by_pairends += 1 elif cross_match_tab[seq_record.id][0][1] < options.border_limit: shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n") elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n") # Write down if required the splited pairends file if options.split_pairends: handle = open(split_pairends_fasta, "w") SeqIO.write(pe_splited, handle, "fasta") handle.close() handle = open(split_pairends_qual, "w") SeqIO.write(pe_splited, handle, "qual") handle.close() # Finaly clean up new shotguns seqs [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options) handle = open(shotgun_ffile, "w") SeqIO.write(shotgun_seqs_clean, handle, "fasta") handle.close() in_shot_gun = 0 try: handle = open(shotgun_qfile, "w") SeqIO.write(shotgun_seqs_clean, handle, "qual") handle.close() in_shot_gun = len(shotgun_seqs_clean) except: pass log.write("## header (pairends) : pairends\ttotal shotgun\n") log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n") # Clean temporary files try: os.remove(crossmatch_input) os.remove(spacers_file) os.remove(crossmatch_input+".cross_match.res") os.remove(crossmatch_input+".log") os.remove(crossmatch_input+".qual") except: pass # Then returned pairends ones return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends] def get_seqs (options): """ Converts input seqs in a BioPython seq table @param options : the options asked by the user """ # First get fasta or/and qual input files qual_file = "" if options.format == "sff": sff_file = options.input_file if sff_file.endswith(".gz"): '''Gunzip the given file and then remove the file.''' r_file = gzip.GzipFile(sff_file, 'r') write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz')) w_file = open(write_file, 'w') w_file.write(r_file.read()) w_file.close() r_file.close() sff_file = write_file base = os.path.basename(sff_file) fasta_file = os.path.join(options.output, base + '.fasta') qual_file = os.path.join(options.output, base + '.qual') xml_file = os.path.join(options.output, base + '.xml') format = "fasta" if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file): #First extract the sff file to fasta, qual and xml file cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file os.system(cmd) else : log = open(options.log_file, "a+") log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n") log.close() elif options.format == "fasta" or options.format == "fastq": format = options.format fasta_file = options.input_file else : print "Error : " + options.format + " is not a supported format!" sys.exit(1) if options.qual_file: qual_file = options.qual_file # If we got a quality file if qual_file : quals = {} # If the file is gziped if qual_file.endswith(".gz"): for qual in SeqIO.parse(gzip.open(qual_file), "qual") : quals[qual.id] = qual else : for qual in SeqIO.parse(open(qual_file), "qual") : quals[qual.id] = qual # If the fasta file is gziped if fasta_file.endswith(".gz"): seqs = [] for record in SeqIO.parse(gzip.open(fasta_file), format) : if qual_file : record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] seqs.append(record) else : seqs = [] for record in SeqIO.parse(open(fasta_file), format) : if qual_file : record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] seqs.append(record) # Clean temporary files if options.format == "sff": try: os.remove(fasta_file) os.remove(qual_file) os.remove(xml_file) except: pass # Returns the table of sequences return [seqs] def which (program): """ Return if the asked program exist in the user path @param options : the options asked by the user """ import os def is_exe(fpath): return os.path.exists(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ["PATH"].split(os.pathsep): exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None def pyrocleaner_depts_error(options): """ Return the list of dependencies missing to run the pyrocleaner """ error = "" if which("formatdb") == None: error += " - formatdb is missing\n" if which("megablast") == None: error += " - megablast is missing\n" if which("sff_extract.py") == None: error += " - sff_extract.py\n" if which("cross_match") == None and options.clean_pairends != None: error += " - cross_match\n" if error != "": error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!" return error if __name__ == "__main__": parser = OptionParser(usage="Usage: pyrocleaner.py -i FILE [options]") usage = "usage: %prog -i file -o output -f format" desc = " Clean sequences from 454 SFF files \n"\ " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win" parser = OptionParser(usage = usage, version = version_string(), description = desc) igroup = OptionGroup(parser, "Input files options","") igroup.add_option("-i", "--in", dest="input_file", help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE") igroup.add_option("-q", "--qual", dest="qual_file", help="The quality file to use if input file is fasta", metavar="FILE") igroup.add_option("-f", "--format", dest="format", help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff") parser.add_option_group(igroup) ogroup = OptionGroup(parser, "Output files options","") ogroup.add_option("-o", "--out", dest="output", help="The output folder where to store results") ogroup.add_option("-g", "--log", dest="log_file", help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log") ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends", default=False, help="Write splited pairends sequences into a fasta file") parser.add_option_group(ogroup) cgroup = OptionGroup(parser, "Cleaning options","") cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends", default=False, help="Clean pairends") cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std", default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation") cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win", default=False, help="Filter reads with a legnth in between [x:y]") cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", default=False, help="Filter reads with too much N") cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads", default=False, help="Filter duplicated reads") cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win", default=False, help="Filter low complexity reads computed on a sliding window") cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full", default=False, help="Filter low complexity reads computed on the whole sequence") cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality", default=False, help="Filter low quality reads") parser.add_option_group(cgroup) pgroup = OptionGroup(parser, "Processing options","") pgroup.add_option("-a", "--acpus", dest="nb_cpus", help="Number of processors to use", type="int", default=1) pgroup.add_option("-r", "--recursion", dest="recursion", help="Recursion limit when computing duplicated reads", type="int", default=1000) parser.add_option_group(pgroup) cpgroup = OptionGroup(parser, "Cleaning parameters","") cpgroup.add_option("-b", "--border-limit", dest="border_limit", help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70) cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive", default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)") cpgroup.add_option("-e", "--missmatch", dest="missmatch", help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10) cpgroup.add_option("-j", "--std", dest="std", help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2) cpgroup.add_option("-1", "--min", dest="min", help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200) cpgroup.add_option("-2", "--max", dest="max", help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600) cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold", help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35) cpgroup.add_option("-s", "--ns_percent", dest="ns_percent", help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4) cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit", help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70) cpgroup.add_option("-v", "--window", dest="window", help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100) cpgroup.add_option("-x", "--step", dest="step", help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5) cpgroup.add_option("-y", "--complexity", dest="complexity", help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40) parser.add_option_group(cpgroup) (options, args) = parser.parse_args() sys.setrecursionlimit(options.recursion) if not pyrocleaner_depts_error(options): if options.input_file == None or options.output == None and options.format == None: parser.print_help() sys.exit(1) elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality : print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help" sys.exit(1) else: # Create output directory if doesn't exist if not os.path.isdir(options.output) : os.mkdir(options.output ) global log log = open(options.output+"/"+options.log_file, "w") log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n") log.write("## with the following options: \n") opts = "" if options.clean_complexity_full: opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n" if options.clean_complexity_win: opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n" if options.clean_duplicated_reads and not options.clean_aggressive: opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat.\n" elif options.clean_duplicated_reads and options.clean_aggressive : opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat and keep only one read per cluster.\n" if options.clean_length_std: opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviation and reads longer than reads length mean plus " + str(options.std) + " standard deviation.\n" if options.clean_length_win: opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n" if options.clean_ns: opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" if options.clean_pairends: opts += " - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than " + str(options.border_limit) + " nucleaotides or if " + str(options.missmatch) + " nucleotides missmatch with the spacer.\n" if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) : opts += " - Clean reads if no bases quality has a score higher than " + str(options.quality_threshold) + ".\n" elif options.clean_quality : print "No quality check will be performed beacause there is no quality file provided." log.write(opts) log.close() # 1 - First get inputs from options [seqs] = get_seqs(options) before_cleaning = len(seqs) del_by_length = 0 del_by_ns = 0 del_by_complexity = 0 del_by_duplicat = 0 del_by_pairends = 0 del_by_quality = 0 in_shot_gun = 0 log = open(options.output+"/"+options.log_file, "a+") # 2 - clean reads # 2.1 - Clean reads by length, ns, complexity if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality: if options.format == "sff" : [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) elif options.format == "fasta" or options.format == "fastq": [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) # 2.2 - Clean reads by duplicat if options.clean_duplicated_reads: [seqs, del_by_duplicat] = filter_same_reads(seqs, options) # 3 - Clean pairends if options.clean_pairends: [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options) del_by_length += del_by_length_pe del_by_ns += del_by_ns_pe del_by_complexity += del_by_complexity_pe del_by_quality += del_by_quality_pe # 4 - Create the output in the same format than the input after_cleaning = len(seqs) + in_shot_gun if options.input_file.endswith(".gz"): ifile = string.rstrip(options.input_file, '.gz') else : ifile = options.input_file if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean") else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean") output += "." + options.format if options.format == "sff" : if options.input_file.endswith(".gz"): sfffile = os.path.join(options.output, string.rstrip(os.path.basename(options.input_file), '.gz')) else : sfffile = options.input_file if which("sfffile") == None: fastq = open(output, "w") SeqIO.write(seqs, fastq, "fastq") fastq.close() else : reads_to_sff(sfffile, seqs, output) # Clean temporary files if options.input_file.endswith(".gz"): os.remove(sfffile) elif options.format == "fasta" or options.format == "fastq": fasta = open(output, "w") SeqIO.write(seqs, fasta, options.format) fasta.close() if options.format == "fasta" and options.qual_file != None: qual = open(os.path.splitext(output)[0]+".qual", "w") SeqIO.write(seqs, qual, "qual") qual.close() # 5 - Write down the analyze sumary log.write("## header (global) : nb reads at begining\tnb reads after clean up\tlength\tns\tcomplexity\tduplicat\tpairends\tquality\n") log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning) + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n") log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n") log.close() sys.exit(0) else : print pyrocleaner_depts_error(options) sys.exit(1)