Mercurial > repos > jmariette > pyrocleaner
diff pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py @ 0:ef5dd11c01e6 default tip
pyrocleaner v1.2
| author | g2cmnty@test-web1.g2.bx.psu.edu | 
|---|---|
| date | Thu, 09 Jun 2011 06:09:09 -0400 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py Thu Jun 09 06:09:09 2011 -0400 @@ -0,0 +1,793 @@ +# +# Pyrocleaner +# Copyright (C) 2009 INRA +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# ChangeLog +# v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the +# --clean-quality option +# v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality +# Add the --aggressive option in order to keep only one read per cluster +# Modify the duplication strategy so the longer read is keept +# v1.0 (09/2009) : Pyrocleaner first version +# + +__author__ = 'Plateforme bioinformatique Midi Pyrenees' +__copyright__ = 'Copyright (C) 2009 INRA' +__license__ = 'GNU General Public License' +__version__ = '1.2' +__email__ = 'support.genopole@toulouse.inra.fr' +__status__ = 'beta' + +from Bio import SeqIO +from igraph import * +from optparse import * +import os, math, datetime, zlib, sys, re, glob, string, gzip + + +def create_roche_pairends_spacer_file(out_dir): + """ + Create the Roche pairends fasta file + @param out_dir : the directory where will be writen the fasta file + """ + file = open(os.path.join(out_dir, "roche_spacers.fna"), 'wr') + file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n") + file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n") + file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n") + file.close() + return file.name + + +def version_string (): + """ + Return the pyrocleaner version + """ + return "pyrocleaner " + __version__ + + +def reads_to_sff (sff_file, seqs, output_file): + """ + Extract seqs reads from the sff_file + @dependences sfffile : the sff software is required to execute this function + @param sff_file : the input sff file + @param seqs : table of seqs to extract + @param output_file : the name of the output sff file + """ + # First creates the to_write file + tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt") + to_write_file = open(tmp_file, 'wr') + for read in seqs: + to_write_file.write(read.id + "\n") + to_write_file.close() + # Use the sfffile tools + cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file + os.system(cmd) + # Clean temporary files + try: os.remove(tmp_file) + except: pass + + +def filter_reads (seqs, options): + """ + Filter input seqs by length, ns and complexity if options are asked by user + @param seqs : table of seqs to filter + @param options : the options asked by the user + """ + reads_id = [] + reads_length = [] + reads_ok = [] + del_by_ns = 0 + del_by_complexity = 0 + del_by_length = 0 + del_by_quality = 0 + log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n") + + # Go throught all sequences + for i, reads_record in enumerate(seqs) : + reads_id.append(reads_record.id) + reads_ok.append(0) + + # If is asked to clean sequences by length using the standard + # deviation, save length to compute some statistics + if options.clean_length_std : + reads_length.append(len(reads_record)) + + # If is asked to clean sequences by length using a window + # and the sequence has not been flagged as deleted + if options.clean_length_win and reads_ok[i] == 0: + # Check if the sequence is longer than the min asked, if not flagged it as deleted + if len(reads_record) < int(options.min): + reads_ok[i] = 1 + del_by_length += 1 + log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n") + # Check if the sequence is smaller than the max asked, if not flagged it as deleted + elif len(reads_record) > int(options.max): + reads_ok[i] = 1 + del_by_length += 1 + log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n") + + # If is asked to clean sequences with too much Ns + # and the sequence has not been flagged as deleted + if options.clean_ns and reads_ok[i] == 0: + # Compute the rate of Ns into the current sequence + nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100) + # If the rate is higher than the threshold flagged it as deleted + if nb_n > float(options.ns_percent) : + reads_ok[i] = 1 + del_by_ns += 1 + log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n") + + # If is asked to clean sequences with low complexity using a sliding window + # and the sequence has not been flagged as deleted + if options.clean_complexity_win and reads_ok[i] == 0: + is_complex = False + # For each window + for win in range(0, len(reads_record)-options.window, options.step): + start = win + stop = start + options.window + # Compute the window complexity + w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100) + # if the window complexity is higher to the threshold, flag the whole sequence as complex + if w_cplx >= float(options.complexity): + is_complex = True + break + # If no window has been flagged as complex, then flagg the sequence as deleted + if not is_complex: + reads_ok[i] = 1 + del_by_complexity += 1 + log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n") + + # If is asked to clean sequences with low complexity working on the whole sequence + # and the sequence has not been flagged as deleted + if options.clean_complexity_full and reads_ok[i] == 0: + # Compute the complexity on the whole sequence + cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100) + # If the complexity is higher to the threshold, flag the sequence as deleted + if cplx < float(options.complexity) : + reads_ok[i] = 1 + del_by_complexity += 1 + log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n") + + # If is asked to clean sequences with low quality and quality information is available + # and the sequence has not been flagged as deleted + if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"): + # If at least one base has a quality score higher than the threashold + maw_qual = max(reads_record.letter_annotations["phred_quality"]) + if maw_qual < int(options.quality_threshold) : + reads_ok[i] = 1 + del_by_quality += 1 + log.write(reads_id[i] + " deleted -> Quality ( Reads minimum quality " + str(maw_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n") + + # If is asked to clean sequences by length using the standard deviation + if options.clean_length_std : + # Compute the mean and the standard deviation + mean = sum(reads_length) / len(reads_length) + mq = mean**2 + s = sum([ x**2 for x in reads_length]) + var = s/len(reads_length) - mq + etype = math.sqrt(var) + # For each sequences + for i, id in enumerate(reads_id): + # If the sequence has not been flagged as deleted + if reads_ok[i] == 0 : + # If the sequence length is higher than the upper threshold, flag the sequence as deleted + if reads_length[i] > mean + options.std*etype: + reads_ok[i] = 1 + del_by_length += 1 + log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") + # If the sequence length is smaller than the lower threshold, flag the sequence as deleted + elif reads_length[i] < mean - options.std*etype: + reads_ok[i] = 1 + del_by_length += 1 + log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") + + seqs_to_return = [] + # Then get only sequences not flagged to be deleted + for i, reads_record in enumerate(seqs) : + if reads_ok[i] == 0: + seqs_to_return.append(reads_record) + + # Return values + return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality] + + +def filter_same_reads (seqs, options): + """ + Filter input seqs by duplicat, if sequences are too similar keep only one to represent the cluster + @param seqs : table of seqs to filter + @param options : the options asked by the user + """ + + megablast_input = os.path.join(options.output, "megablast_input.fasta") + log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n") + log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n") + + # First write down seqs + fasta = open(megablast_input, "w") + SeqIO.write(seqs, fasta, "fasta") + fasta.close() + + # Then formatdb the fasta file (no formatdb utils in Biopython) + cmd = "formatdb -i %s -p F" % megablast_input + os.system(cmd) + log.write("## megablast (" + str(datetime.datetime.now()) + ")\n") + + # In case of pairends, use words of 50, so megablast cannot connect with spacers + opts = "" + if options.clean_pairends: + opts = " -W 50 -H 10 " + + # Megablast the fasta file versus itself + cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res" + os.system(cmd) + + # Let's get the reads length with the fasta file and creates the graph + log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n") + gseqs = {} + vertices = [] + for reads_record in seqs : + vertices.append({'name': reads_record.id}) + gseqs[reads_record.id] = len(reads_record) + + # Connects reads from hits starting at 1 and with ends closed to each others + log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n") + edges = [] + for read in open(megablast_input + ".res", 'rU').readlines() : + parts = read.rstrip().split() + len1 = gseqs[parts[0]] + len2 = gseqs[parts[1]] + if options.clean_aggressive : + edges.append({'source': parts[0], 'target': parts[1]}) + elif math.fabs(len1-len2) < options.duplication_limit: + if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit: + # This alignments are realy similar -> may be the same -> link them into the graph + edges.append({'source': parts[0], 'target': parts[1]}) + + # Then get connected components and extract one of them as cluster leader + log.write("## Comput connected components (" + str(datetime.datetime.now()) + ")\n") + gr = Graph.DictList(vertices, edges) + connex = gr.clusters().membership + + clusters = {} + for i, vertex in enumerate(vertices): + cluster_id = connex[i] + try: + clusters[cluster_id].append(seqs[i]) + except: + clusters[cluster_id] = [] + clusters[cluster_id].append(seqs[i]) + + del_by_duplicat = 0 + # Write down into the log the composition of each cluster + clusters_stats = {} + seqs_to_return = [] + for cluster_id in clusters: + cl_elts = "" + del_by_duplicat += len(clusters[cluster_id]) - 1 + longest_value = 0 + cluster_leader = None + for seq_record in clusters[cluster_id]: + # Find out which sequence is the longest + if len(seq_record.seq) > longest_value: + cluster_leader = seq_record + + seqs_to_return.append(cluster_leader) + for seq_record in clusters[cluster_id]: + if seq_record.id != cluster_leader.id: + log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicat )\n") + cl_elts += seq_record.id + " " + log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n") + + try : + clusters_stats[len(clusters[cluster_id])] += 1 + except: + clusters_stats[len(clusters[cluster_id])] = 1 + + # Write down a summary of what has been done + log_header = "## header (duplicated) : " + log_summary = "## summary (duplicated) : " + for stat in sorted(clusters_stats.keys()): + log_header += str(stat) + "\t" + log_summary += str(clusters_stats[stat]) + "\t" + log.write(log_header + "\n") + log.write(log_summary + "\n") + + # Clean temporary files + try: + os.remove(megablast_input) + os.remove(megablast_input+".nhr") + os.remove(megablast_input+".nin") + os.remove(megablast_input+".nsq") + os.remove(megablast_input+".res") + os.remove("formatdb.log") + os.remove("error.log") + except: + pass + + # Returns results + return [seqs_to_return, del_by_duplicat] + + +def filter_pairends(seqs, options): + """ + Filter pairends sequences and split sequences without pairends into a fasta file + @param seqs : the table of sequences to filter + @param options : the options asked by the user + """ + + # Setup output files + shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta") + shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual") + crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta") + split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta") + split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual") + + # First write down seqs + fasta = open(crossmatch_input, "w") + SeqIO.write(seqs, fasta, "fasta") + fasta.close() + + # Write down the qual file for cross_match if possible + try: + qual = open(crossmatch_input+".qual", "w") + SeqIO.write(seqs, qual, "qual") + qual.close() + except: + os.remove(crossmatch_input+".qual") + + # Cross_match reverse matches pattern + rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)") + # Cross_match forward matches pattern + fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+") + + # Write the spacer file and execute cross_match against the input sequences + spacers_file = create_roche_pairends_spacer_file(options.output) + cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res" + os.system(cmd) + + # Parse the cross_match file + cross_match_tab = {} + block_found = False + for line in open(crossmatch_input + ".cross_match.res", 'r'): + save_line = False + rm = rev_regex.match(line) + fm = fwd_regex.match(line) + if rm != None: # If it's a reverse matches + block_found = True + (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8))) + save_line = True + elif fm != None: # If it's a forward matches + block_found = True + (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8))) + save_line = True + else : save_line = False + + if line.startswith("Discrepancy summary:"): # This is the end of the section + break + + # Save the line + if save_line: + try: + cross_match_tab[primary_match][secondary_match].append([startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch]) + except: + cross_match_tab[primary_match] = [] + cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]) + + # Then get the spacers_length + spacers_length = {} + for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") : + spacers_length[seq_record.id] = len(seq_record.seq.tostring()) + + # Finaly go throught all sequences + seqs_to_return = [] + shotgun_seqs = [] + del_by_pairends = 1 + pe_splited = [] + + for i, seq_record in enumerate(seqs) : + if not cross_match_tab.has_key(seq_record.id): + # No spacers found -> just add it to the shotgun_seqs tab + shotgun_seqs.append(seq_record) + log.write(seq_record.id + " shotgun -> no spacer found\n") + elif len(cross_match_tab[seq_record.id]) > 1: + # If multiple linker -> delete the whole sequence + log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n") + del_by_pairends += 1 + elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)): + if (cross_match_tab[seq_record.id][0][1] == 1): + shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) + log.write(seq_record.id + " shotgun -> spacer found at the begining\n") + elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)): + shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) + log.write(seq_record.id + " shotgun -> spacer found at the end\n") + else : + log.write(seq_record.id + " deleted -> partiel spacer found in the middle of the read \n") + del_by_pairends += 1 + elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit: + seqs_to_return.append(seq_record) + if options.split_pairends: + sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:] + sub_seq1.id = seq_record.id + ".r" + pe_splited.append(sub_seq1) + sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]] + sub_seq2.id = seq_record.id + ".f" + pe_splited.append(sub_seq2) + elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: + log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n") + del_by_pairends += 1 + elif cross_match_tab[seq_record.id][0][1] < options.border_limit: + shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) + log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n") + elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: + shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) + log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n") + + # Write down if required the splited pairends file + if options.split_pairends: + handle = open(split_pairends_fasta, "w") + SeqIO.write(pe_splited, handle, "fasta") + handle.close() + handle = open(split_pairends_qual, "w") + SeqIO.write(pe_splited, handle, "qual") + handle.close() + + # Finaly clean up new shotguns seqs + [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options) + handle = open(shotgun_ffile, "w") + SeqIO.write(shotgun_seqs_clean, handle, "fasta") + handle.close() + in_shot_gun = 0 + try: + handle = open(shotgun_qfile, "w") + SeqIO.write(shotgun_seqs_clean, handle, "qual") + handle.close() + in_shot_gun = len(shotgun_seqs_clean) + except: + pass + + log.write("## header (pairends) : pairends\ttotal shotgun\n") + log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n") + + # Clean temporary files + try: + os.remove(crossmatch_input) + os.remove(spacers_file) + os.remove(crossmatch_input+".cross_match.res") + os.remove(crossmatch_input+".log") + os.remove(crossmatch_input+".qual") + except: + pass + + # Then returned pairends ones + return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends] + + +def get_seqs (options): + """ + Converts input seqs in a BioPython seq table + @param options : the options asked by the user + """ + + # First get fasta or/and qual input files + qual_file = "" + if options.format == "sff": + sff_file = options.input_file + + if sff_file.endswith(".gz"): + '''Gunzip the given file and then remove the file.''' + r_file = gzip.GzipFile(sff_file, 'r') + write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz')) + w_file = open(write_file, 'w') + w_file.write(r_file.read()) + w_file.close() + r_file.close() + sff_file = write_file + + base = os.path.basename(sff_file) + fasta_file = os.path.join(options.output, base + '.fasta') + qual_file = os.path.join(options.output, base + '.qual') + xml_file = os.path.join(options.output, base + '.xml') + format = "fasta" + if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file): + #First extract the sff file to fasta, qual and xml file + cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file + os.system(cmd) + else : + log = open(options.log_file, "a+") + log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n") + log.close() + elif options.format == "fasta" or options.format == "fastq": + format = options.format + fasta_file = options.input_file + else : + print "Error : " + options.format + " is not a supported format!" + sys.exit(1) + + if options.qual_file: + qual_file = options.qual_file + + # If we got a quality file + if qual_file : + quals = {} + # If the file is gziped + if qual_file.endswith(".gz"): + for qual in SeqIO.parse(gzip.open(qual_file), "qual") : + quals[qual.id] = qual + else : + for qual in SeqIO.parse(open(qual_file), "qual") : + quals[qual.id] = qual + + # If the fasta file is gziped + if fasta_file.endswith(".gz"): + seqs = [] + for record in SeqIO.parse(gzip.open(fasta_file), format) : + if qual_file : + record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] + seqs.append(record) + else : + seqs = [] + for record in SeqIO.parse(open(fasta_file), format) : + if qual_file : + record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] + seqs.append(record) + + # Clean temporary files + if options.format == "sff": + try: + os.remove(fasta_file) + os.remove(qual_file) + os.remove(xml_file) + except: + pass + + # Returns the table of sequences + return [seqs] + +def which (program): + """ + Return if the asked program exist in the user path + @param options : the options asked by the user + """ + import os + def is_exe(fpath): + return os.path.exists(fpath) and os.access(fpath, os.X_OK) + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + +def pyrocleaner_depts_error(options): + """ + Return the list of dependencies missing to run the pyrocleaner + """ + error = "" + if which("formatdb") == None: + error += " - formatdb is missing\n" + if which("megablast") == None: + error += " - megablast is missing\n" + if which("sff_extract.py") == None: + error += " - sff_extract.py\n" + if which("cross_match") == None and options.clean_pairends != None: + error += " - cross_match\n" + if error != "": + error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!" + return error + + +if __name__ == "__main__": + + parser = OptionParser(usage="Usage: pyrocleaner.py -i FILE [options]") + + usage = "usage: %prog -i file -o output -f format" + desc = " Clean sequences from 454 SFF files \n"\ + " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win" + parser = OptionParser(usage = usage, version = version_string(), description = desc) + + igroup = OptionGroup(parser, "Input files options","") + igroup.add_option("-i", "--in", dest="input_file", + help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE") + igroup.add_option("-q", "--qual", dest="qual_file", + help="The quality file to use if input file is fasta", metavar="FILE") + igroup.add_option("-f", "--format", dest="format", + help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff") + parser.add_option_group(igroup) + + ogroup = OptionGroup(parser, "Output files options","") + ogroup.add_option("-o", "--out", dest="output", + help="The output folder where to store results") + ogroup.add_option("-g", "--log", dest="log_file", + help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log") + ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends", + default=False, help="Write splited pairends sequences into a fasta file") + parser.add_option_group(ogroup) + + cgroup = OptionGroup(parser, "Cleaning options","") + cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends", + default=False, help="Clean pairends") + cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std", + default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation") + cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win", + default=False, help="Filter reads with a legnth in between [x:y]") + cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", + default=False, help="Filter reads with too much N") + cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads", + default=False, help="Filter duplicated reads") + cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win", + default=False, help="Filter low complexity reads computed on a sliding window") + cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full", + default=False, help="Filter low complexity reads computed on the whole sequence") + cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality", + default=False, help="Filter low quality reads") + parser.add_option_group(cgroup) + + pgroup = OptionGroup(parser, "Processing options","") + pgroup.add_option("-a", "--acpus", dest="nb_cpus", + help="Number of processors to use", type="int", default=1) + pgroup.add_option("-r", "--recursion", dest="recursion", + help="Recursion limit when computing duplicated reads", type="int", default=1000) + parser.add_option_group(pgroup) + + cpgroup = OptionGroup(parser, "Cleaning parameters","") + cpgroup.add_option("-b", "--border-limit", dest="border_limit", + help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70) + cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive", + default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)") + cpgroup.add_option("-e", "--missmatch", dest="missmatch", + help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10) + cpgroup.add_option("-j", "--std", dest="std", + help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2) + cpgroup.add_option("-1", "--min", dest="min", + help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200) + cpgroup.add_option("-2", "--max", dest="max", + help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600) + cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold", + help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35) + cpgroup.add_option("-s", "--ns_percent", dest="ns_percent", + help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4) + cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit", + help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70) + cpgroup.add_option("-v", "--window", dest="window", + help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100) + cpgroup.add_option("-x", "--step", dest="step", + help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5) + cpgroup.add_option("-y", "--complexity", dest="complexity", + help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40) + parser.add_option_group(cpgroup) + + (options, args) = parser.parse_args() + + sys.setrecursionlimit(options.recursion) + + if not pyrocleaner_depts_error(options): + + if options.input_file == None or options.output == None and options.format == None: + parser.print_help() + sys.exit(1) + elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality : + print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help" + sys.exit(1) + else: + + # Create output directory if doesn't exist + if not os.path.isdir(options.output) : os.mkdir(options.output ) + + global log + log = open(options.output+"/"+options.log_file, "w") + log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n") + log.write("## with the following options: \n") + opts = "" + if options.clean_complexity_full: + opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n" + if options.clean_complexity_win: + opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n" + if options.clean_duplicated_reads and not options.clean_aggressive: + opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat.\n" + elif options.clean_duplicated_reads and options.clean_aggressive : + opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat and keep only one read per cluster.\n" + if options.clean_length_std: + opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviation and reads longer than reads length mean plus " + str(options.std) + " standard deviation.\n" + if options.clean_length_win: + opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n" + if options.clean_ns: + opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" + if options.clean_pairends: + opts += " - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than " + str(options.border_limit) + " nucleaotides or if " + str(options.missmatch) + " nucleotides missmatch with the spacer.\n" + if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) : + opts += " - Clean reads if no bases quality has a score higher than " + str(options.quality_threshold) + ".\n" + elif options.clean_quality : + print "No quality check will be performed beacause there is no quality file provided." + log.write(opts) + log.close() + + # 1 - First get inputs from options + [seqs] = get_seqs(options) + before_cleaning = len(seqs) + del_by_length = 0 + del_by_ns = 0 + del_by_complexity = 0 + del_by_duplicat = 0 + del_by_pairends = 0 + del_by_quality = 0 + in_shot_gun = 0 + + log = open(options.output+"/"+options.log_file, "a+") + # 2 - clean reads + # 2.1 - Clean reads by length, ns, complexity + if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality: + if options.format == "sff" : + [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) + elif options.format == "fasta" or options.format == "fastq": + [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) + + # 2.2 - Clean reads by duplicat + if options.clean_duplicated_reads: + [seqs, del_by_duplicat] = filter_same_reads(seqs, options) + + # 3 - Clean pairends + if options.clean_pairends: + [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options) + del_by_length += del_by_length_pe + del_by_ns += del_by_ns_pe + del_by_complexity += del_by_complexity_pe + del_by_quality += del_by_quality_pe + + # 4 - Create the output in the same format than the input + after_cleaning = len(seqs) + in_shot_gun + if options.input_file.endswith(".gz"): + ifile = string.rstrip(options.input_file, '.gz') + else : ifile = options.input_file + if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean") + else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean") + output += "." + options.format + if options.format == "sff" : + if options.input_file.endswith(".gz"): + sfffile = os.path.join(options.output, string.rstrip(os.path.basename(options.input_file), '.gz')) + else : + sfffile = options.input_file + if which("sfffile") == None: + fastq = open(output, "w") + SeqIO.write(seqs, fastq, "fastq") + fastq.close() + else : + reads_to_sff(sfffile, seqs, output) + # Clean temporary files + if options.input_file.endswith(".gz"): + os.remove(sfffile) + + elif options.format == "fasta" or options.format == "fastq": + fasta = open(output, "w") + SeqIO.write(seqs, fasta, options.format) + fasta.close() + if options.format == "fasta" and options.qual_file != None: + qual = open(os.path.splitext(output)[0]+".qual", "w") + SeqIO.write(seqs, qual, "qual") + qual.close() + + # 5 - Write down the analyze sumary + log.write("## header (global) : nb reads at begining\tnb reads after clean up\tlength\tns\tcomplexity\tduplicat\tpairends\tquality\n") + log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning) + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n") + log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n") + log.close() + sys.exit(0) + else : + print pyrocleaner_depts_error(options) + sys.exit(1) + \ No newline at end of file
