| 0 | 1 # | 
|  | 2 # Pyrocleaner | 
|  | 3 # Copyright (C) 2009 INRA | 
|  | 4 # | 
|  | 5 # This program is free software: you can redistribute it and/or modify | 
|  | 6 # it under the terms of the GNU General Public License as published by | 
|  | 7 # the Free Software Foundation, either version 3 of the License, or | 
|  | 8 # (at your option) any later version. | 
|  | 9 # | 
|  | 10 # This program is distributed in the hope that it will be useful, | 
|  | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 13 # GNU General Public License for more details. | 
|  | 14 # | 
|  | 15 # You should have received a copy of the GNU General Public License | 
|  | 16 # along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 17 # | 
|  | 18 # ChangeLog | 
|  | 19 # v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the | 
|  | 20 #                  --clean-quality option | 
|  | 21 # v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality | 
|  | 22 #                  Add the --aggressive option in order to keep only one read per cluster | 
|  | 23 #                  Modify the duplication strategy so the longer read is keept | 
|  | 24 # v1.0 (09/2009) : Pyrocleaner first version | 
|  | 25 # | 
|  | 26 | 
|  | 27 __author__ = 'Plateforme bioinformatique Midi Pyrenees' | 
|  | 28 __copyright__ = 'Copyright (C) 2009 INRA' | 
|  | 29 __license__ = 'GNU General Public License' | 
|  | 30 __version__ = '1.2' | 
|  | 31 __email__ = 'support.genopole@toulouse.inra.fr' | 
|  | 32 __status__ = 'beta' | 
|  | 33 | 
|  | 34 from Bio import SeqIO | 
|  | 35 from igraph import * | 
|  | 36 from optparse import * | 
|  | 37 import os, math, datetime, zlib, sys, re, glob, string, gzip | 
|  | 38 | 
|  | 39 | 
|  | 40 def create_roche_pairends_spacer_file(out_dir): | 
|  | 41     """ | 
|  | 42     Create the Roche pairends fasta file | 
|  | 43       @param out_dir : the directory where will be writen the fasta file | 
|  | 44     """ | 
|  | 45     file = open(os.path.join(out_dir, "roche_spacers.fna"), 'wr') | 
|  | 46     file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n") | 
|  | 47     file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n") | 
|  | 48     file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n") | 
|  | 49     file.close() | 
|  | 50     return file.name | 
|  | 51 | 
|  | 52 | 
|  | 53 def version_string (): | 
|  | 54     """ | 
|  | 55     Return the pyrocleaner version | 
|  | 56     """ | 
|  | 57     return "pyrocleaner " + __version__ | 
|  | 58 | 
|  | 59 | 
|  | 60 def reads_to_sff (sff_file, seqs, output_file): | 
|  | 61     """ | 
|  | 62     Extract seqs reads from the sff_file | 
|  | 63       @dependences sfffile : the sff software is required to execute this function | 
|  | 64       @param sff_file      : the input sff file | 
|  | 65       @param seqs          : table of seqs to extract | 
|  | 66       @param output_file   : the name of the output sff file | 
|  | 67     """ | 
|  | 68     # First creates the to_write file | 
|  | 69     tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt") | 
|  | 70     to_write_file = open(tmp_file, 'wr') | 
|  | 71     for read in seqs: | 
|  | 72         to_write_file.write(read.id + "\n") | 
|  | 73     to_write_file.close() | 
|  | 74     # Use the sfffile tools | 
|  | 75     cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file | 
|  | 76     os.system(cmd) | 
|  | 77     # Clean temporary files | 
|  | 78     try: os.remove(tmp_file) | 
|  | 79     except: pass | 
|  | 80 | 
|  | 81 | 
|  | 82 def filter_reads (seqs, options): | 
|  | 83     """ | 
|  | 84     Filter input seqs by length, ns and complexity if options are asked by user | 
|  | 85       @param seqs    : table of seqs to filter | 
|  | 86       @param options : the options asked by the user | 
|  | 87     """ | 
|  | 88     reads_id = [] | 
|  | 89     reads_length = [] | 
|  | 90     reads_ok = [] | 
|  | 91     del_by_ns = 0 | 
|  | 92     del_by_complexity = 0 | 
|  | 93     del_by_length = 0 | 
|  | 94     del_by_quality = 0 | 
|  | 95     log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n") | 
|  | 96 | 
|  | 97     # Go throught all sequences | 
|  | 98     for i, reads_record in enumerate(seqs) : | 
|  | 99         reads_id.append(reads_record.id) | 
|  | 100         reads_ok.append(0) | 
|  | 101 | 
|  | 102         # If is asked to clean sequences by length using the standard | 
|  | 103         # deviation, save length to compute some statistics | 
|  | 104         if options.clean_length_std : | 
|  | 105             reads_length.append(len(reads_record)) | 
|  | 106 | 
|  | 107         # If is asked to clean sequences by length using a window | 
|  | 108         # and the sequence has not been flagged as deleted | 
|  | 109         if options.clean_length_win and reads_ok[i] == 0: | 
|  | 110             # Check if the sequence is longer than the min asked, if not flagged it as deleted | 
|  | 111             if len(reads_record) < int(options.min): | 
|  | 112                 reads_ok[i] = 1 | 
|  | 113                 del_by_length += 1 | 
|  | 114                 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n") | 
|  | 115             # Check if the sequence is smaller than the max asked, if not flagged it as deleted | 
|  | 116             elif len(reads_record) > int(options.max): | 
|  | 117                 reads_ok[i] = 1 | 
|  | 118                 del_by_length += 1 | 
|  | 119                 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n") | 
|  | 120 | 
|  | 121         # If is asked to clean sequences with too much Ns | 
|  | 122         # and the sequence has not been flagged as deleted | 
|  | 123         if options.clean_ns and reads_ok[i] == 0: | 
|  | 124             # Compute the rate of Ns into the current sequence | 
|  | 125             nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100) | 
|  | 126             # If the rate is higher than the threshold flagged it as deleted | 
|  | 127             if nb_n > float(options.ns_percent) : | 
|  | 128                 reads_ok[i] = 1 | 
|  | 129                 del_by_ns += 1 | 
|  | 130                 log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n") | 
|  | 131 | 
|  | 132         # If is asked to clean sequences with low complexity using a sliding window | 
|  | 133         # and the sequence has not been flagged as deleted | 
|  | 134         if options.clean_complexity_win and reads_ok[i] == 0: | 
|  | 135             is_complex = False | 
|  | 136             # For each window | 
|  | 137             for win in range(0, len(reads_record)-options.window, options.step): | 
|  | 138                 start = win | 
|  | 139                 stop = start + options.window | 
|  | 140                 # Compute the window complexity | 
|  | 141                 w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100) | 
|  | 142                 # if the window complexity is higher to the threshold, flag the whole sequence as complex | 
|  | 143                 if w_cplx >= float(options.complexity): | 
|  | 144                     is_complex = True | 
|  | 145                     break | 
|  | 146             # If no window has been flagged as complex, then flagg the sequence as deleted | 
|  | 147             if not is_complex: | 
|  | 148                 reads_ok[i] = 1 | 
|  | 149                 del_by_complexity += 1 | 
|  | 150                 log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n") | 
|  | 151 | 
|  | 152         # If is asked to clean sequences with low complexity working on the whole sequence | 
|  | 153         # and the sequence has not been flagged as deleted | 
|  | 154         if options.clean_complexity_full and reads_ok[i] == 0: | 
|  | 155             # Compute the complexity on the whole sequence | 
|  | 156             cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100) | 
|  | 157             # If the complexity is higher to the threshold, flag the sequence as deleted | 
|  | 158             if cplx < float(options.complexity) : | 
|  | 159                 reads_ok[i] = 1 | 
|  | 160                 del_by_complexity += 1 | 
|  | 161                 log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n") | 
|  | 162 | 
|  | 163         # If is asked to clean sequences with low quality and quality information is available | 
|  | 164         # and the sequence has not been flagged as deleted | 
|  | 165         if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"): | 
|  | 166             # If at least one base has a quality score higher than the threashold | 
|  | 167             maw_qual = max(reads_record.letter_annotations["phred_quality"]) | 
|  | 168             if maw_qual < int(options.quality_threshold) : | 
|  | 169                 reads_ok[i] = 1 | 
|  | 170                 del_by_quality += 1 | 
|  | 171                 log.write(reads_id[i] + " deleted -> Quality ( Reads minimum quality " + str(maw_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n") | 
|  | 172 | 
|  | 173     # If is asked to clean sequences by length using the standard deviation | 
|  | 174     if options.clean_length_std : | 
|  | 175         # Compute the mean and the standard deviation | 
|  | 176         mean = sum(reads_length) / len(reads_length) | 
|  | 177         mq = mean**2 | 
|  | 178         s = sum([ x**2 for x in reads_length]) | 
|  | 179         var = s/len(reads_length) - mq | 
|  | 180         etype = math.sqrt(var) | 
|  | 181         # For each sequences | 
|  | 182         for i, id in enumerate(reads_id): | 
|  | 183             # If the sequence has not been flagged as deleted | 
|  | 184             if  reads_ok[i] == 0 : | 
|  | 185                 # If the sequence length is higher than the upper threshold, flag the sequence as deleted | 
|  | 186                 if reads_length[i] > mean + options.std*etype: | 
|  | 187                     reads_ok[i] = 1 | 
|  | 188                     del_by_length += 1 | 
|  | 189                     log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") | 
|  | 190                 # If the sequence length is smaller than the lower threshold, flag the sequence as deleted | 
|  | 191                 elif reads_length[i] < mean - options.std*etype: | 
|  | 192                     reads_ok[i] = 1 | 
|  | 193                     del_by_length += 1 | 
|  | 194                     log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") | 
|  | 195 | 
|  | 196     seqs_to_return = [] | 
|  | 197     # Then get only sequences not flagged to be deleted | 
|  | 198     for i, reads_record in enumerate(seqs) : | 
|  | 199         if reads_ok[i] == 0: | 
|  | 200             seqs_to_return.append(reads_record) | 
|  | 201 | 
|  | 202     # Return values | 
|  | 203     return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality] | 
|  | 204 | 
|  | 205 | 
|  | 206 def filter_same_reads (seqs, options): | 
|  | 207     """ | 
|  | 208     Filter input seqs by duplicat, if sequences are too similar keep only one to represent the cluster | 
|  | 209       @param seqs    : table of seqs to filter | 
|  | 210       @param options : the options asked by the user | 
|  | 211     """ | 
|  | 212 | 
|  | 213     megablast_input = os.path.join(options.output, "megablast_input.fasta") | 
|  | 214     log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n") | 
|  | 215     log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n") | 
|  | 216 | 
|  | 217     # First write down seqs | 
|  | 218     fasta = open(megablast_input, "w") | 
|  | 219     SeqIO.write(seqs, fasta, "fasta") | 
|  | 220     fasta.close() | 
|  | 221 | 
|  | 222     # Then formatdb the fasta file (no formatdb utils in Biopython) | 
|  | 223     cmd = "formatdb -i %s -p F" % megablast_input | 
|  | 224     os.system(cmd) | 
|  | 225     log.write("## megablast (" + str(datetime.datetime.now()) + ")\n") | 
|  | 226 | 
|  | 227     # In case of pairends, use words of 50, so megablast cannot connect with spacers | 
|  | 228     opts = "" | 
|  | 229     if options.clean_pairends: | 
|  | 230         opts = " -W 50 -H 10 " | 
|  | 231 | 
|  | 232     # Megablast the fasta file versus itself | 
|  | 233     cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res" | 
|  | 234     os.system(cmd) | 
|  | 235 | 
|  | 236     # Let's get the reads length with the fasta file and creates the graph | 
|  | 237     log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n") | 
|  | 238     gseqs = {} | 
|  | 239     vertices = [] | 
|  | 240     for reads_record in seqs : | 
|  | 241         vertices.append({'name': reads_record.id}) | 
|  | 242         gseqs[reads_record.id] = len(reads_record) | 
|  | 243 | 
|  | 244     #  Connects reads from hits starting at 1 and with ends closed to each others | 
|  | 245     log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n") | 
|  | 246     edges = [] | 
|  | 247     for read in open(megablast_input + ".res", 'rU').readlines() : | 
|  | 248         parts = read.rstrip().split() | 
|  | 249         len1 = gseqs[parts[0]] | 
|  | 250         len2 = gseqs[parts[1]] | 
|  | 251         if options.clean_aggressive : | 
|  | 252             edges.append({'source': parts[0], 'target': parts[1]}) | 
|  | 253         elif math.fabs(len1-len2) < options.duplication_limit: | 
|  | 254             if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit: | 
|  | 255                 # This alignments are realy similar -> may be the same -> link them into the graph | 
|  | 256                 edges.append({'source': parts[0], 'target': parts[1]}) | 
|  | 257 | 
|  | 258     # Then get connected components and extract one of them as cluster leader | 
|  | 259     log.write("## Comput connected components (" + str(datetime.datetime.now()) + ")\n") | 
|  | 260     gr = Graph.DictList(vertices, edges) | 
|  | 261     connex = gr.clusters().membership | 
|  | 262 | 
|  | 263     clusters = {} | 
|  | 264     for i, vertex in enumerate(vertices): | 
|  | 265         cluster_id = connex[i] | 
|  | 266         try: | 
|  | 267             clusters[cluster_id].append(seqs[i]) | 
|  | 268         except: | 
|  | 269             clusters[cluster_id] = [] | 
|  | 270             clusters[cluster_id].append(seqs[i]) | 
|  | 271 | 
|  | 272     del_by_duplicat = 0 | 
|  | 273     # Write down into the log the composition of each cluster | 
|  | 274     clusters_stats = {} | 
|  | 275     seqs_to_return = [] | 
|  | 276     for cluster_id in clusters: | 
|  | 277         cl_elts = "" | 
|  | 278         del_by_duplicat += len(clusters[cluster_id]) - 1 | 
|  | 279         longest_value = 0 | 
|  | 280         cluster_leader = None | 
|  | 281         for seq_record in clusters[cluster_id]: | 
|  | 282             # Find out which sequence is the longest | 
|  | 283             if len(seq_record.seq) > longest_value: | 
|  | 284                 cluster_leader = seq_record | 
|  | 285 | 
|  | 286         seqs_to_return.append(cluster_leader) | 
|  | 287         for seq_record in clusters[cluster_id]: | 
|  | 288             if seq_record.id != cluster_leader.id: | 
|  | 289                 log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicat )\n") | 
|  | 290             cl_elts += seq_record.id + " " | 
|  | 291         log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n") | 
|  | 292 | 
|  | 293         try : | 
|  | 294             clusters_stats[len(clusters[cluster_id])] += 1 | 
|  | 295         except: | 
|  | 296             clusters_stats[len(clusters[cluster_id])] = 1 | 
|  | 297 | 
|  | 298     # Write down a summary of what has been done | 
|  | 299     log_header = "## header (duplicated) : " | 
|  | 300     log_summary = "## summary (duplicated) : " | 
|  | 301     for stat in sorted(clusters_stats.keys()): | 
|  | 302         log_header += str(stat) + "\t" | 
|  | 303         log_summary += str(clusters_stats[stat]) + "\t" | 
|  | 304     log.write(log_header + "\n") | 
|  | 305     log.write(log_summary + "\n") | 
|  | 306 | 
|  | 307     # Clean temporary files | 
|  | 308     try: | 
|  | 309         os.remove(megablast_input) | 
|  | 310         os.remove(megablast_input+".nhr") | 
|  | 311         os.remove(megablast_input+".nin") | 
|  | 312         os.remove(megablast_input+".nsq") | 
|  | 313         os.remove(megablast_input+".res") | 
|  | 314         os.remove("formatdb.log") | 
|  | 315         os.remove("error.log") | 
|  | 316     except: | 
|  | 317         pass | 
|  | 318 | 
|  | 319     # Returns results | 
|  | 320     return [seqs_to_return, del_by_duplicat] | 
|  | 321 | 
|  | 322 | 
|  | 323 def filter_pairends(seqs, options): | 
|  | 324     """ | 
|  | 325     Filter pairends sequences and split sequences without pairends into a fasta file | 
|  | 326       @param seqs    : the table of sequences to filter | 
|  | 327       @param options : the options asked by the user | 
|  | 328     """ | 
|  | 329 | 
|  | 330     # Setup output files | 
|  | 331     shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta") | 
|  | 332     shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual") | 
|  | 333     crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta") | 
|  | 334     split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta") | 
|  | 335     split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual") | 
|  | 336 | 
|  | 337     # First write down seqs | 
|  | 338     fasta = open(crossmatch_input, "w") | 
|  | 339     SeqIO.write(seqs, fasta, "fasta") | 
|  | 340     fasta.close() | 
|  | 341 | 
|  | 342     # Write down the qual file for cross_match if possible | 
|  | 343     try: | 
|  | 344         qual = open(crossmatch_input+".qual", "w") | 
|  | 345         SeqIO.write(seqs, qual, "qual") | 
|  | 346         qual.close() | 
|  | 347     except: | 
|  | 348         os.remove(crossmatch_input+".qual") | 
|  | 349 | 
|  | 350     # Cross_match reverse matches pattern | 
|  | 351     rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)") | 
|  | 352     # Cross_match forward matches pattern | 
|  | 353     fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+") | 
|  | 354 | 
|  | 355     # Write the spacer file and execute cross_match against the input sequences | 
|  | 356     spacers_file = create_roche_pairends_spacer_file(options.output) | 
|  | 357     cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res" | 
|  | 358     os.system(cmd) | 
|  | 359 | 
|  | 360     # Parse the cross_match file | 
|  | 361     cross_match_tab = {} | 
|  | 362     block_found = False | 
|  | 363     for line in open(crossmatch_input + ".cross_match.res", 'r'): | 
|  | 364         save_line = False | 
|  | 365         rm = rev_regex.match(line) | 
|  | 366         fm = fwd_regex.match(line) | 
|  | 367         if rm != None: # If it's a reverse matches | 
|  | 368             block_found = True | 
|  | 369             (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8))) | 
|  | 370             save_line = True | 
|  | 371         elif fm != None: # If it's a forward matches | 
|  | 372             block_found = True | 
|  | 373             (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8))) | 
|  | 374             save_line = True | 
|  | 375         else : save_line = False | 
|  | 376 | 
|  | 377         if line.startswith("Discrepancy summary:"): # This is the end of the section | 
|  | 378             break | 
|  | 379 | 
|  | 380         # Save the line | 
|  | 381         if save_line: | 
|  | 382             try: | 
|  | 383                 cross_match_tab[primary_match][secondary_match].append([startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch]) | 
|  | 384             except: | 
|  | 385                 cross_match_tab[primary_match] = [] | 
|  | 386                 cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]) | 
|  | 387 | 
|  | 388     # Then get the spacers_length | 
|  | 389     spacers_length = {} | 
|  | 390     for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") : | 
|  | 391         spacers_length[seq_record.id] = len(seq_record.seq.tostring()) | 
|  | 392 | 
|  | 393     # Finaly go throught all sequences | 
|  | 394     seqs_to_return = [] | 
|  | 395     shotgun_seqs = [] | 
|  | 396     del_by_pairends = 1 | 
|  | 397     pe_splited = [] | 
|  | 398 | 
|  | 399     for i, seq_record in enumerate(seqs) : | 
|  | 400         if not cross_match_tab.has_key(seq_record.id): | 
|  | 401             # No spacers found -> just add it to the shotgun_seqs tab | 
|  | 402             shotgun_seqs.append(seq_record) | 
|  | 403             log.write(seq_record.id + " shotgun -> no spacer found\n") | 
|  | 404         elif len(cross_match_tab[seq_record.id]) > 1: | 
|  | 405             # If multiple linker -> delete the whole sequence | 
|  | 406             log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n") | 
|  | 407             del_by_pairends += 1 | 
|  | 408         elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)): | 
|  | 409             if (cross_match_tab[seq_record.id][0][1] == 1): | 
|  | 410                 shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) | 
|  | 411                 log.write(seq_record.id + " shotgun -> spacer found at the begining\n") | 
|  | 412             elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)): | 
|  | 413                 shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) | 
|  | 414                 log.write(seq_record.id + " shotgun -> spacer found at the end\n") | 
|  | 415             else : | 
|  | 416                 log.write(seq_record.id + " deleted -> partiel spacer found in the middle of the read \n") | 
|  | 417                 del_by_pairends += 1 | 
|  | 418         elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit: | 
|  | 419             seqs_to_return.append(seq_record) | 
|  | 420             if options.split_pairends: | 
|  | 421                 sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:] | 
|  | 422                 sub_seq1.id = seq_record.id + ".r" | 
|  | 423                 pe_splited.append(sub_seq1) | 
|  | 424                 sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]] | 
|  | 425                 sub_seq2.id = seq_record.id + ".f" | 
|  | 426                 pe_splited.append(sub_seq2) | 
|  | 427         elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: | 
|  | 428             log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n") | 
|  | 429             del_by_pairends += 1 | 
|  | 430         elif cross_match_tab[seq_record.id][0][1] < options.border_limit: | 
|  | 431             shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) | 
|  | 432             log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n") | 
|  | 433         elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: | 
|  | 434             shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) | 
|  | 435             log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n") | 
|  | 436 | 
|  | 437     # Write down if required the splited pairends file | 
|  | 438     if options.split_pairends: | 
|  | 439         handle = open(split_pairends_fasta, "w") | 
|  | 440         SeqIO.write(pe_splited, handle, "fasta") | 
|  | 441         handle.close() | 
|  | 442         handle = open(split_pairends_qual, "w") | 
|  | 443         SeqIO.write(pe_splited, handle, "qual") | 
|  | 444         handle.close() | 
|  | 445 | 
|  | 446     # Finaly clean up new shotguns seqs | 
|  | 447     [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options) | 
|  | 448     handle = open(shotgun_ffile, "w") | 
|  | 449     SeqIO.write(shotgun_seqs_clean, handle, "fasta") | 
|  | 450     handle.close() | 
|  | 451     in_shot_gun = 0 | 
|  | 452     try: | 
|  | 453         handle = open(shotgun_qfile, "w") | 
|  | 454         SeqIO.write(shotgun_seqs_clean, handle, "qual") | 
|  | 455         handle.close() | 
|  | 456         in_shot_gun = len(shotgun_seqs_clean) | 
|  | 457     except: | 
|  | 458         pass | 
|  | 459 | 
|  | 460     log.write("## header (pairends) : pairends\ttotal shotgun\n") | 
|  | 461     log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n") | 
|  | 462 | 
|  | 463     # Clean temporary files | 
|  | 464     try: | 
|  | 465         os.remove(crossmatch_input) | 
|  | 466         os.remove(spacers_file) | 
|  | 467         os.remove(crossmatch_input+".cross_match.res") | 
|  | 468         os.remove(crossmatch_input+".log") | 
|  | 469         os.remove(crossmatch_input+".qual") | 
|  | 470     except: | 
|  | 471         pass | 
|  | 472 | 
|  | 473     # Then returned pairends ones | 
|  | 474     return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends] | 
|  | 475 | 
|  | 476 | 
|  | 477 def get_seqs (options): | 
|  | 478     """ | 
|  | 479     Converts input seqs in a BioPython seq table | 
|  | 480       @param options : the options asked by the user | 
|  | 481     """ | 
|  | 482 | 
|  | 483     # First get fasta or/and qual input files | 
|  | 484     qual_file = "" | 
|  | 485     if options.format == "sff": | 
|  | 486         sff_file = options.input_file | 
|  | 487 | 
|  | 488         if sff_file.endswith(".gz"): | 
|  | 489             '''Gunzip the given file and then remove the file.''' | 
|  | 490             r_file = gzip.GzipFile(sff_file, 'r') | 
|  | 491             write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz')) | 
|  | 492             w_file = open(write_file, 'w') | 
|  | 493             w_file.write(r_file.read()) | 
|  | 494             w_file.close() | 
|  | 495             r_file.close() | 
|  | 496             sff_file = write_file | 
|  | 497 | 
|  | 498         base = os.path.basename(sff_file) | 
|  | 499         fasta_file = os.path.join(options.output, base + '.fasta') | 
|  | 500         qual_file = os.path.join(options.output, base + '.qual') | 
|  | 501         xml_file = os.path.join(options.output, base + '.xml') | 
|  | 502         format = "fasta" | 
|  | 503         if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file): | 
|  | 504             #First extract the sff file to fasta, qual and xml file | 
|  | 505             cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file | 
|  | 506             os.system(cmd) | 
|  | 507         else : | 
|  | 508             log = open(options.log_file, "a+") | 
|  | 509             log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n") | 
|  | 510             log.close() | 
|  | 511     elif options.format == "fasta" or options.format == "fastq": | 
|  | 512         format = options.format | 
|  | 513         fasta_file = options.input_file | 
|  | 514     else : | 
|  | 515         print "Error : " + options.format + " is not a supported format!" | 
|  | 516         sys.exit(1) | 
|  | 517 | 
|  | 518     if options.qual_file: | 
|  | 519         qual_file = options.qual_file | 
|  | 520 | 
|  | 521     # If we got a quality file | 
|  | 522     if qual_file : | 
|  | 523         quals = {} | 
|  | 524         # If the file is gziped | 
|  | 525         if qual_file.endswith(".gz"): | 
|  | 526             for qual in SeqIO.parse(gzip.open(qual_file), "qual") : | 
|  | 527                 quals[qual.id] = qual | 
|  | 528         else : | 
|  | 529             for qual in SeqIO.parse(open(qual_file), "qual") : | 
|  | 530                 quals[qual.id] = qual | 
|  | 531 | 
|  | 532     # If the fasta file is gziped | 
|  | 533     if fasta_file.endswith(".gz"): | 
|  | 534         seqs = [] | 
|  | 535         for record in SeqIO.parse(gzip.open(fasta_file), format) : | 
|  | 536             if qual_file : | 
|  | 537                 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] | 
|  | 538             seqs.append(record) | 
|  | 539     else : | 
|  | 540         seqs = [] | 
|  | 541         for record in SeqIO.parse(open(fasta_file), format) : | 
|  | 542             if qual_file : | 
|  | 543                 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] | 
|  | 544             seqs.append(record) | 
|  | 545 | 
|  | 546     # Clean temporary files | 
|  | 547     if options.format == "sff": | 
|  | 548         try: | 
|  | 549             os.remove(fasta_file) | 
|  | 550             os.remove(qual_file) | 
|  | 551             os.remove(xml_file) | 
|  | 552         except: | 
|  | 553             pass | 
|  | 554 | 
|  | 555     # Returns the table of sequences | 
|  | 556     return [seqs] | 
|  | 557 | 
|  | 558 def which (program): | 
|  | 559     """ | 
|  | 560     Return if the asked program exist in the user path | 
|  | 561       @param options : the options asked by the user | 
|  | 562     """ | 
|  | 563     import os | 
|  | 564     def is_exe(fpath): | 
|  | 565         return os.path.exists(fpath) and os.access(fpath, os.X_OK) | 
|  | 566     fpath, fname = os.path.split(program) | 
|  | 567     if fpath: | 
|  | 568         if is_exe(program): | 
|  | 569             return program | 
|  | 570     else: | 
|  | 571         for path in os.environ["PATH"].split(os.pathsep): | 
|  | 572             exe_file = os.path.join(path, program) | 
|  | 573             if is_exe(exe_file): | 
|  | 574                 return exe_file | 
|  | 575     return None | 
|  | 576 | 
|  | 577 def pyrocleaner_depts_error(options): | 
|  | 578     """ | 
|  | 579     Return the list of dependencies missing to run the pyrocleaner | 
|  | 580     """ | 
|  | 581     error = "" | 
|  | 582     if which("formatdb") == None: | 
|  | 583         error += " - formatdb is missing\n" | 
|  | 584     if which("megablast") == None: | 
|  | 585         error += " - megablast is missing\n" | 
|  | 586     if which("sff_extract.py") == None: | 
|  | 587         error += " - sff_extract.py\n" | 
|  | 588     if which("cross_match") == None and options.clean_pairends != None: | 
|  | 589         error += " - cross_match\n" | 
|  | 590     if error != "": | 
|  | 591         error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!" | 
|  | 592     return error | 
|  | 593 | 
|  | 594 | 
|  | 595 if __name__ == "__main__": | 
|  | 596 | 
|  | 597     parser = OptionParser(usage="Usage: pyrocleaner.py -i FILE [options]") | 
|  | 598 | 
|  | 599     usage = "usage: %prog -i file -o output -f format" | 
|  | 600     desc = " Clean sequences from 454 SFF files \n"\ | 
|  | 601            " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win" | 
|  | 602     parser = OptionParser(usage = usage, version = version_string(), description = desc) | 
|  | 603 | 
|  | 604     igroup = OptionGroup(parser, "Input files options","") | 
|  | 605     igroup.add_option("-i", "--in", dest="input_file", | 
|  | 606                       help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE") | 
|  | 607     igroup.add_option("-q", "--qual", dest="qual_file", | 
|  | 608                       help="The quality file to use if input file is fasta", metavar="FILE") | 
|  | 609     igroup.add_option("-f", "--format", dest="format", | 
|  | 610                       help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff") | 
|  | 611     parser.add_option_group(igroup) | 
|  | 612 | 
|  | 613     ogroup = OptionGroup(parser, "Output files options","") | 
|  | 614     ogroup.add_option("-o", "--out", dest="output", | 
|  | 615                       help="The output folder where to store results") | 
|  | 616     ogroup.add_option("-g", "--log", dest="log_file", | 
|  | 617                       help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log") | 
|  | 618     ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends", | 
|  | 619                       default=False, help="Write splited pairends sequences into a fasta file") | 
|  | 620     parser.add_option_group(ogroup) | 
|  | 621 | 
|  | 622     cgroup = OptionGroup(parser, "Cleaning options","") | 
|  | 623     cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends", | 
|  | 624                       default=False, help="Clean pairends") | 
|  | 625     cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std", | 
|  | 626                       default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation") | 
|  | 627     cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win", | 
|  | 628                       default=False, help="Filter reads with a legnth in between [x:y]") | 
|  | 629     cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", | 
|  | 630                       default=False, help="Filter reads with too much N") | 
|  | 631     cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads", | 
|  | 632                       default=False, help="Filter duplicated reads") | 
|  | 633     cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win", | 
|  | 634                       default=False, help="Filter low complexity reads computed on a sliding window") | 
|  | 635     cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full", | 
|  | 636                       default=False, help="Filter low complexity reads computed on the whole sequence") | 
|  | 637     cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality", | 
|  | 638                       default=False, help="Filter low quality reads") | 
|  | 639     parser.add_option_group(cgroup) | 
|  | 640 | 
|  | 641     pgroup = OptionGroup(parser, "Processing options","") | 
|  | 642     pgroup.add_option("-a", "--acpus", dest="nb_cpus", | 
|  | 643                       help="Number of processors to use", type="int", default=1) | 
|  | 644     pgroup.add_option("-r", "--recursion", dest="recursion", | 
|  | 645                       help="Recursion limit when computing duplicated reads", type="int", default=1000) | 
|  | 646     parser.add_option_group(pgroup) | 
|  | 647 | 
|  | 648     cpgroup = OptionGroup(parser, "Cleaning parameters","") | 
|  | 649     cpgroup.add_option("-b", "--border-limit", dest="border_limit", | 
|  | 650                       help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70) | 
|  | 651     cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive", | 
|  | 652                       default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)") | 
|  | 653     cpgroup.add_option("-e", "--missmatch", dest="missmatch", | 
|  | 654                       help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10) | 
|  | 655     cpgroup.add_option("-j", "--std", dest="std", | 
|  | 656                       help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2) | 
|  | 657     cpgroup.add_option("-1", "--min", dest="min", | 
|  | 658                       help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200) | 
|  | 659     cpgroup.add_option("-2", "--max", dest="max", | 
|  | 660                        help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600) | 
|  | 661     cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold", | 
|  | 662                        help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35) | 
|  | 663     cpgroup.add_option("-s", "--ns_percent", dest="ns_percent", | 
|  | 664                       help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4) | 
|  | 665     cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit", | 
|  | 666                       help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70) | 
|  | 667     cpgroup.add_option("-v", "--window", dest="window", | 
|  | 668                       help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100) | 
|  | 669     cpgroup.add_option("-x", "--step", dest="step", | 
|  | 670                       help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5) | 
|  | 671     cpgroup.add_option("-y", "--complexity", dest="complexity", | 
|  | 672                       help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40) | 
|  | 673     parser.add_option_group(cpgroup) | 
|  | 674 | 
|  | 675     (options, args) = parser.parse_args() | 
|  | 676 | 
|  | 677     sys.setrecursionlimit(options.recursion) | 
|  | 678 | 
|  | 679     if not pyrocleaner_depts_error(options): | 
|  | 680 | 
|  | 681         if options.input_file == None or options.output == None and options.format == None: | 
|  | 682             parser.print_help() | 
|  | 683             sys.exit(1) | 
|  | 684         elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality : | 
|  | 685             print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help" | 
|  | 686             sys.exit(1) | 
|  | 687         else: | 
|  | 688 | 
|  | 689             # Create output directory if doesn't exist | 
|  | 690             if not os.path.isdir(options.output) : os.mkdir(options.output ) | 
|  | 691 | 
|  | 692             global log | 
|  | 693             log = open(options.output+"/"+options.log_file, "w") | 
|  | 694             log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n") | 
|  | 695             log.write("## with the following options: \n") | 
|  | 696             opts = "" | 
|  | 697             if options.clean_complexity_full: | 
|  | 698                 opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n" | 
|  | 699             if options.clean_complexity_win: | 
|  | 700                 opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n" | 
|  | 701             if options.clean_duplicated_reads and not options.clean_aggressive: | 
|  | 702                 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat.\n" | 
|  | 703             elif options.clean_duplicated_reads and options.clean_aggressive : | 
|  | 704                 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat and keep only one read per cluster.\n" | 
|  | 705             if options.clean_length_std: | 
|  | 706                 opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviation and reads longer than reads length mean plus " + str(options.std) + " standard deviation.\n" | 
|  | 707             if options.clean_length_win: | 
|  | 708                 opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n" | 
|  | 709             if options.clean_ns: | 
|  | 710                 opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" | 
|  | 711             if options.clean_pairends: | 
|  | 712                 opts += " - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than " + str(options.border_limit) + " nucleaotides or if " + str(options.missmatch) + " nucleotides missmatch with the spacer.\n" | 
|  | 713             if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) : | 
|  | 714                 opts += " - Clean reads if no bases quality has a score higher than " + str(options.quality_threshold) + ".\n" | 
|  | 715             elif options.clean_quality : | 
|  | 716                 print "No quality check will be performed beacause there is no quality file provided." | 
|  | 717             log.write(opts) | 
|  | 718             log.close() | 
|  | 719 | 
|  | 720             # 1 - First get inputs from options | 
|  | 721             [seqs] = get_seqs(options) | 
|  | 722             before_cleaning = len(seqs) | 
|  | 723             del_by_length = 0 | 
|  | 724             del_by_ns = 0 | 
|  | 725             del_by_complexity = 0 | 
|  | 726             del_by_duplicat = 0 | 
|  | 727             del_by_pairends = 0 | 
|  | 728             del_by_quality = 0 | 
|  | 729             in_shot_gun = 0 | 
|  | 730 | 
|  | 731             log = open(options.output+"/"+options.log_file, "a+") | 
|  | 732             # 2 - clean reads | 
|  | 733             # 2.1 - Clean reads by length, ns, complexity | 
|  | 734             if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality: | 
|  | 735                 if options.format == "sff" : | 
|  | 736                     [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) | 
|  | 737                 elif options.format == "fasta" or options.format == "fastq": | 
|  | 738                     [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) | 
|  | 739 | 
|  | 740             # 2.2 - Clean reads by duplicat | 
|  | 741             if options.clean_duplicated_reads: | 
|  | 742                 [seqs, del_by_duplicat] = filter_same_reads(seqs, options) | 
|  | 743 | 
|  | 744             # 3 - Clean pairends | 
|  | 745             if options.clean_pairends: | 
|  | 746                 [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options) | 
|  | 747                 del_by_length += del_by_length_pe | 
|  | 748                 del_by_ns += del_by_ns_pe | 
|  | 749                 del_by_complexity += del_by_complexity_pe | 
|  | 750                 del_by_quality += del_by_quality_pe | 
|  | 751 | 
|  | 752             # 4 - Create the output in the same format than the input | 
|  | 753             after_cleaning = len(seqs) + in_shot_gun | 
|  | 754             if options.input_file.endswith(".gz"): | 
|  | 755                 ifile = string.rstrip(options.input_file, '.gz') | 
|  | 756             else : ifile = options.input_file | 
|  | 757             if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean") | 
|  | 758             else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean") | 
|  | 759             output += "." + options.format | 
|  | 760             if options.format == "sff" : | 
|  | 761                 if options.input_file.endswith(".gz"): | 
|  | 762                     sfffile = os.path.join(options.output, string.rstrip(os.path.basename(options.input_file), '.gz')) | 
|  | 763                 else : | 
|  | 764                     sfffile = options.input_file | 
|  | 765                 if which("sfffile") == None: | 
|  | 766                     fastq = open(output, "w") | 
|  | 767                     SeqIO.write(seqs, fastq, "fastq") | 
|  | 768                     fastq.close() | 
|  | 769                 else : | 
|  | 770                     reads_to_sff(sfffile, seqs, output) | 
|  | 771                     # Clean temporary files | 
|  | 772                     if options.input_file.endswith(".gz"): | 
|  | 773                         os.remove(sfffile) | 
|  | 774 | 
|  | 775             elif options.format == "fasta" or options.format == "fastq": | 
|  | 776                 fasta = open(output, "w") | 
|  | 777                 SeqIO.write(seqs, fasta, options.format) | 
|  | 778                 fasta.close() | 
|  | 779                 if options.format == "fasta" and options.qual_file != None: | 
|  | 780                     qual = open(os.path.splitext(output)[0]+".qual", "w") | 
|  | 781                     SeqIO.write(seqs, qual, "qual") | 
|  | 782                     qual.close() | 
|  | 783 | 
|  | 784             # 5 - Write down the analyze sumary | 
|  | 785             log.write("## header (global) : nb reads at begining\tnb reads after clean up\tlength\tns\tcomplexity\tduplicat\tpairends\tquality\n") | 
|  | 786             log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning)  + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n") | 
|  | 787             log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n") | 
|  | 788             log.close() | 
|  | 789             sys.exit(0) | 
|  | 790     else : | 
|  | 791         print pyrocleaner_depts_error(options) | 
|  | 792         sys.exit(1) | 
|  | 793 |