Mercurial > repos > jmariette > pyrocleaner
comparison pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py @ 0:ef5dd11c01e6 default tip
pyrocleaner v1.2
| author | g2cmnty@test-web1.g2.bx.psu.edu |
|---|---|
| date | Thu, 09 Jun 2011 06:09:09 -0400 |
| parents | |
| children | |
| 1 # | |
| 2 # Pyrocleaner | |
| 3 # Copyright (C) 2009 INRA | |
| 4 # | |
| 5 # This program is free software: you can redistribute it and/or modify | |
| 6 # it under the terms of the GNU General Public License as published by | |
| 7 # the Free Software Foundation, either version 3 of the License, or | |
| 8 # (at your option) any later version. | |
| 9 # | |
| 10 # This program is distributed in the hope that it will be useful, | |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 # GNU General Public License for more details. | |
| 14 # | |
| 15 # You should have received a copy of the GNU General Public License | |
| 16 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 17 # | |
| 18 # ChangeLog | |
| 19 # v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the | |
| 20 # --clean-quality option | |
| 21 # v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality | |
| 22 # Add the --aggressive option in order to keep only one read per cluster | |
| 23 # Modify the duplication strategy so the longest read is kept | |
| 24 # v1.0 (09/2009) : Pyrocleaner first version | |
| 25 # | |
| 26 | |
| 27 __author__ = 'Plateforme bioinformatique Midi Pyrenees' | |
| 28 __copyright__ = 'Copyright (C) 2009 INRA' | |
| 29 __license__ = 'GNU General Public License' | |
| 30 __version__ = '1.2' | |
| 31 __email__ = 'support.genopole@toulouse.inra.fr' | |
| 32 __status__ = 'beta' | |
| 33 | |
| 34 from Bio import SeqIO | |
| 35 from igraph import * | |
| 36 from optparse import * | |
| 37 import os, math, datetime, zlib, sys, re, glob, string, gzip | |
| 38 | |
| 39 | |
| 40 def create_roche_pairends_spacer_file(out_dir): | |
| 41 """ | |
| 42 Create the Roche pairends fasta file | |
| 43 @param out_dir : the directory where the fasta file will be written | |
| 44 """ | |
| 45 file = open(os.path.join(out_dir, "roche_spacers.fna"), 'w') | |
| 46 file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n") | |
| 47 file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n") | |
| 48 file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n") | |
| 49 file.close() | |
| 50 return file.name | |
| 51 | |
| 52 | |
| 53 def version_string (): | |
| 54 """ | |
| 55 Return the pyrocleaner version | |
| 56 """ | |
| 57 return "pyrocleaner " + __version__ | |
| 58 | |
| 59 | |
| 60 def reads_to_sff (sff_file, seqs, output_file): | |
| 61 """ | |
| 62 Extract seqs reads from the sff_file | |
| 63 @dependencies sfffile : the sfffile program is required to execute this function | |
| 64 @param sff_file : the input sff file | |
| 65 @param seqs : table of seqs to extract | |
| 66 @param output_file : the name of the output sff file | |
| 67 """ | |
| 68 # First creates the to_write file | |
| 69 tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt") | |
| 70 to_write_file = open(tmp_file, 'w') | |
| 71 for read in seqs: | |
| 72 to_write_file.write(read.id + "\n") | |
| 73 to_write_file.close() | |
| 74 # Use the sfffile tools | |
| 75 cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file | |
| 76 os.system(cmd) | |
| 77 # Clean temporary files | |
| 78 try: os.remove(tmp_file) | |
| 79 except: pass | |
| 80 | |
| 81 | |
| 82 def filter_reads (seqs, options): | |
| 83 """ | |
| 84 Filter input seqs by length, Ns, complexity and quality depending on the options asked by the user | |
| 85 @param seqs : table of seqs to filter | |
| 86 @param options : the options asked by the user | |
| 87 """ | |
| 88 reads_id = [] | |
| 89 reads_length = [] | |
| 90 reads_ok = [] | |
| 91 del_by_ns = 0 | |
| 92 del_by_complexity = 0 | |
| 93 del_by_length = 0 | |
| 94 del_by_quality = 0 | |
| 95 log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n") | |
| 96 | |
| 97 # Go through all sequences | |
| 98 for i, reads_record in enumerate(seqs) : | |
| 99 reads_id.append(reads_record.id) | |
| 100 reads_ok.append(0) | |
| 101 | |
| 102 # If asked to clean sequences by length using the standard | |
| 103 # deviation, save length to compute some statistics | |
| 104 if options.clean_length_std : | |
| 105 reads_length.append(len(reads_record)) | |
| 106 | |
| 107 # If asked to clean sequences by length using a window | |
| 108 # and the sequence has not been flagged as deleted | |
| 109 if options.clean_length_win and reads_ok[i] == 0: | |
| 110 # Check if the sequence is longer than the min asked, if not, flag it as deleted | |
| 111 if len(reads_record) < int(options.min): | |
| 112 reads_ok[i] = 1 | |
| 113 del_by_length += 1 | |
| 114 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n") | |
| 115 # Check if the sequence is shorter than the max asked, if not, flag it as deleted | |
| 116 elif len(reads_record) > int(options.max): | |
| 117 reads_ok[i] = 1 | |
| 118 del_by_length += 1 | |
| 119 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n") | |
| 120 | |
| 121 # If asked to clean sequences with too many Ns | |
| 122 # and the sequence has not been flagged as deleted | |
| 123 if options.clean_ns and reads_ok[i] == 0: | |
| 124 # Compute the rate of Ns into the current sequence | |
| 125 nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100) | |
| 126 # If the rate is higher than the threshold, flag it as deleted | |
| 127 if nb_n > float(options.ns_percent) : | |
| 128 reads_ok[i] = 1 | |
| 129 del_by_ns += 1 | |
| 130 log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n") | |
| 131 | |
| 132 # If asked to clean sequences with low complexity using a sliding window | |
| 133 # and the sequence has not been flagged as deleted | |
| 134 if options.clean_complexity_win and reads_ok[i] == 0: | |
| 135 is_complex = False | |
| 136 # For each window | |
| 137 for win in range(0, len(reads_record)-options.window, options.step): | |
| 138 start = win | |
| 139 stop = start + options.window | |
| 140 # Compute the window complexity | |
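| # The complexity measure is the zlib compression ratio: (compressed size / window length) * 100; repetitive, low-complexity windows compress well and give a low value | |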
| 141 w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100) | |
| 142 # If the window complexity is higher than the threshold, flag the whole sequence as complex | |
| 143 if w_cplx >= float(options.complexity): | |
| 144 is_complex = True | |
| 145 break | |
| 146 # If no window has been flagged as complex, flag the sequence as deleted | |
| 147 if not is_complex: | |
| 148 reads_ok[i] = 1 | |
| 149 del_by_complexity += 1 | |
| 150 log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n") | |
| 151 | |
| 152 # If asked to clean sequences with low complexity working on the whole sequence | |
| 153 # and the sequence has not been flagged as deleted | |
| 154 if options.clean_complexity_full and reads_ok[i] == 0: | |
| 155 # Compute the complexity on the whole sequence | |
| 156 cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100) | |
| 157 # If the complexity is lower than the threshold, flag the sequence as deleted | |
| 158 if cplx < float(options.complexity) : | |
| 159 reads_ok[i] = 1 | |
| 160 del_by_complexity += 1 | |
| 161 log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n") | |
| 162 | |
| 163 # If asked to clean sequences with low quality and quality information is available | |
| 164 # and the sequence has not been flagged as deleted | |
| 165 if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"): | |
| 166 # Delete the read if no base has a quality score equal to or higher than the threshold | |
| 167 max_qual = max(reads_record.letter_annotations["phred_quality"]) | |
| 168 if max_qual < int(options.quality_threshold) : | |
| 169 reads_ok[i] = 1 | |
| 170 del_by_quality += 1 | |
| 171 log.write(reads_id[i] + " deleted -> Quality ( Reads maximum quality " + str(max_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n") | |
| 172 | |
| 173 # If asked to clean sequences by length using the standard deviation | |
| 174 if options.clean_length_std : | |
| 175 # Compute the mean and the standard deviation | |
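| # Variance computed with the shortcut E[X^2] - (E[X])^2; the standard deviation (etype) is its square root | |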
| 176 mean = float(sum(reads_length)) / len(reads_length) | |
| 177 mq = mean**2 | |
| 178 s = sum([ x**2 for x in reads_length]) | |
| 179 var = float(s)/len(reads_length) - mq | |
| 180 etype = math.sqrt(var) | |
| 181 # For each sequence | |
| 182 for i, id in enumerate(reads_id): | |
| 183 # If the sequence has not been flagged as deleted | |
| 184 if reads_ok[i] == 0 : | |
| 185 # If the sequence length is higher than the upper threshold, flag the sequence as deleted | |
| 186 if reads_length[i] > mean + options.std*etype: | |
| 187 reads_ok[i] = 1 | |
| 188 del_by_length += 1 | |
| 189 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n") | |
| 190 # If the sequence length is smaller than the lower threshold, flag the sequence as deleted | |
| 191 elif reads_length[i] < mean - options.std*etype: | |
| 192 reads_ok[i] = 1 | |
| 193 del_by_length += 1 | |
| 194 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "-" + str(options.std) + "*" + str(etype) + " )\n") | |
| 195 | |
| 196 seqs_to_return = [] | |
| 197 # Then get only sequences not flagged to be deleted | |
| 198 for i, reads_record in enumerate(seqs) : | |
| 199 if reads_ok[i] == 0: | |
| 200 seqs_to_return.append(reads_record) | |
| 201 | |
| 202 # Return values | |
| 203 return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality] | |
| 204 | |
| 205 | |
| 206 def filter_same_reads (seqs, options): | |
| 207 """ | |
| 208 Filter duplicated input seqs: if sequences are too similar, keep only one to represent the cluster | |
| 209 @param seqs : table of seqs to filter | |
| 210 @param options : the options asked by the user | |
| 211 """ | |
| 212 | |
| 213 megablast_input = os.path.join(options.output, "megablast_input.fasta") | |
| 214 log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n") | |
| 215 log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n") | |
| 216 | |
| 217 # First write down seqs | |
| 218 fasta = open(megablast_input, "w") | |
| 219 SeqIO.write(seqs, fasta, "fasta") | |
| 220 fasta.close() | |
| 221 | |
| 222 # Then formatdb the fasta file (no formatdb utils in Biopython) | |
| 223 cmd = "formatdb -i %s -p F" % megablast_input | |
| 224 os.system(cmd) | |
| 225 log.write("## megablast (" + str(datetime.datetime.now()) + ")\n") | |
| 226 | |
| 227 # In case of pairends, use a word size of 50 so megablast cannot seed alignments on the spacers | |
| 228 opts = "" | |
| 229 if options.clean_pairends: | |
| 230 opts = " -W 50 -H 10 " | |
| 231 | |
| 232 # Megablast the fasta file versus itself | |
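| # The -D 3 tabular output is filtered by the inline perl: keep hits between two different reads whose alignment starts at base 1 on both query and subject, with at most 30 hits kept per query | |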
| 233 cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res" | |
| 234 os.system(cmd) | |
| 235 | |
| 236 # Let's get the reads length with the fasta file and creates the graph | |
| 237 log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n") | |
| 238 gseqs = {} | |
| 239 vertices = [] | |
| 240 for reads_record in seqs : | |
| 241 vertices.append({'name': reads_record.id}) | |
| 242 gseqs[reads_record.id] = len(reads_record) | |
| 243 | |
| 244 # Connect reads from hits starting at 1 and with ends close to each other | |
| 245 log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n") | |
| 246 edges = [] | |
| 247 for read in open(megablast_input + ".res", 'rU').readlines() : | |
| 248 parts = read.rstrip().split() | |
| 249 len1 = gseqs[parts[0]] | |
| 250 len2 = gseqs[parts[1]] | |
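| # In the megablast tabular output parts[7] and parts[9] are the query and subject alignment end positions | |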
| 251 if options.clean_aggressive : | |
| 252 edges.append({'source': parts[0], 'target': parts[1]}) | |
| 253 elif math.fabs(len1-len2) < options.duplication_limit: | |
| 254 if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit: | |
| 255 # These alignments are really similar -> probably the same read -> link them in the graph | |
| 256 edges.append({'source': parts[0], 'target': parts[1]}) | |
| 257 | |
| 258 # Then get connected components and extract one of them as cluster leader | |
| 259 log.write("## Comput connected components (" + str(datetime.datetime.now()) + ")\n") | |
| 260 gr = Graph.DictList(vertices, edges) | |
| 261 connex = gr.clusters().membership | |
| 262 | |
| 263 clusters = {} | |
| 264 for i, vertex in enumerate(vertices): | |
| 265 cluster_id = connex[i] | |
| 266 try: | |
| 267 clusters[cluster_id].append(seqs[i]) | |
| 268 except: | |
| 269 clusters[cluster_id] = [] | |
| 270 clusters[cluster_id].append(seqs[i]) | |
| 271 | |
| 272 del_by_duplicat = 0 | |
| 273 # Write down into the log the composition of each cluster | |
| 274 clusters_stats = {} | |
| 275 seqs_to_return = [] | |
| 276 for cluster_id in clusters: | |
| 277 cl_elts = "" | |
| 278 del_by_duplicat += len(clusters[cluster_id]) - 1 | |
| 279 longest_value = 0 | |
| 280 cluster_leader = None | |
| 281 for seq_record in clusters[cluster_id]: | |
| 282 # Find out which sequence is the longest and keep it as cluster leader | |
| 283 if len(seq_record.seq) > longest_value: | |
| longest_value = len(seq_record.seq) | |
| 284 cluster_leader = seq_record | |
| 285 | |
| 286 seqs_to_return.append(cluster_leader) | |
| 287 for seq_record in clusters[cluster_id]: | |
| 288 if seq_record.id != cluster_leader.id: | |
| 289 log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicate )\n") | |
| 290 cl_elts += seq_record.id + " " | |
| 291 log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n") | |
| 292 | |
| 293 try : | |
| 294 clusters_stats[len(clusters[cluster_id])] += 1 | |
| 295 except: | |
| 296 clusters_stats[len(clusters[cluster_id])] = 1 | |
| 297 | |
| 298 # Write down a summary of what has been done | |
| 299 log_header = "## header (duplicated) : " | |
| 300 log_summary = "## summary (duplicated) : " | |
| 301 for stat in sorted(clusters_stats.keys()): | |
| 302 log_header += str(stat) + "\t" | |
| 303 log_summary += str(clusters_stats[stat]) + "\t" | |
| 304 log.write(log_header + "\n") | |
| 305 log.write(log_summary + "\n") | |
| 306 | |
| 307 # Clean temporary files | |
| 308 try: | |
| 309 os.remove(megablast_input) | |
| 310 os.remove(megablast_input+".nhr") | |
| 311 os.remove(megablast_input+".nin") | |
| 312 os.remove(megablast_input+".nsq") | |
| 313 os.remove(megablast_input+".res") | |
| 314 os.remove("formatdb.log") | |
| 315 os.remove("error.log") | |
| 316 except: | |
| 317 pass | |
| 318 | |
| 319 # Returns results | |
| 320 return [seqs_to_return, del_by_duplicat] | |
| 321 | |
| 322 | |
| 323 def filter_pairends(seqs, options): | |
| 324 """ | |
| 325 Filter pairend sequences and write the reads without a spacer into a separate shotgun fasta file | |
| 326 @param seqs : the table of sequences to filter | |
| 327 @param options : the options asked by the user | |
| 328 """ | |
| 329 | |
| 330 # Setup output files | |
| 331 shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta") | |
| 332 shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual") | |
| 333 crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta") | |
| 334 split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta") | |
| 335 split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual") | |
| 336 | |
| 337 # First write down seqs | |
| 338 fasta = open(crossmatch_input, "w") | |
| 339 SeqIO.write(seqs, fasta, "fasta") | |
| 340 fasta.close() | |
| 341 | |
| 342 # Write down the qual file for cross_match if possible | |
| 343 try: | |
| 344 qual = open(crossmatch_input+".qual", "w") | |
| 345 SeqIO.write(seqs, qual, "qual") | |
| 346 qual.close() | |
| 347 except: | |
| 348 os.remove(crossmatch_input+".qual") | |
| 349 | |
| 350 # Cross_match reverse matches pattern | |
| 351 rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)") | |
| 352 # Cross_match forward matches pattern | |
| 353 fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+") | |
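| # In both patterns group 2 is the percentage of substitutions, group 3 the read id, groups 4-5 the spacer match coordinates on the read, and groups 6-8 the spacer id and its own match coordinates | |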
| 354 | |
| 355 # Write the spacer file and execute cross_match against the input sequences | |
| 356 spacers_file = create_roche_pairends_spacer_file(options.output) | |
| 357 cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res" | |
| 358 os.system(cmd) | |
| 359 | |
| 360 # Parse the cross_match file | |
| 361 cross_match_tab = {} | |
| 362 block_found = False | |
| 363 for line in open(crossmatch_input + ".cross_match.res", 'r'): | |
| 364 save_line = False | |
| 365 rm = rev_regex.match(line) | |
| 366 fm = fwd_regex.match(line) | |
| 367 if rm != None: # If it's a reverse match | |
| 368 block_found = True | |
| 369 (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8))) | |
| 370 save_line = True | |
| 371 elif fm != None: # If it's a forward match | |
| 372 block_found = True | |
| 373 (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8))) | |
| 374 save_line = True | |
| 375 else : save_line = False | |
| 376 | |
| 377 if line.startswith("Discrepancy summary:"): # This is the end of the section | |
| 378 break | |
| 379 | |
| 380 # Save the line | |
| 381 if save_line: | |
| 382 try: | |
| 383 cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]) | |
| 384 except KeyError: | |
| 385 cross_match_tab[primary_match] = [] | |
| 386 cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]) | |
| 387 | |
| 388 # Then get the spacers_length | |
| 389 spacers_length = {} | |
| 390 for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") : | |
| 391 spacers_length[seq_record.id] = len(seq_record.seq.tostring()) | |
| 392 | |
| 393 # Finally go through all sequences | |
| 394 seqs_to_return = [] | |
| 395 shotgun_seqs = [] | |
| 396 del_by_pairends = 0 | |
| 397 pe_splited = [] | |
| 398 | |
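| # Each entry of cross_match_tab[read id] is [spacer id, match start on the read, match end on the read, match start on the spacer, match end on the spacer]; depending on where the spacer lies and on the flanking lengths, the read is kept as a pairend, trimmed to a shotgun read, or deleted | |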
| 399 for i, seq_record in enumerate(seqs) : | |
| 400 if not cross_match_tab.has_key(seq_record.id): | |
| 401 # No spacers found -> just add it to the shotgun_seqs tab | |
| 402 shotgun_seqs.append(seq_record) | |
| 403 log.write(seq_record.id + " shotgun -> no spacer found\n") | |
| 404 elif len(cross_match_tab[seq_record.id]) > 1: | |
| 405 # If multiple spacers were found -> delete the whole sequence | |
| 406 log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n") | |
| 407 del_by_pairends += 1 | |
| 408 elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)): | |
| 409 if (cross_match_tab[seq_record.id][0][1] == 1): | |
| 410 shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) | |
| 411 log.write(seq_record.id + " shotgun -> spacer found at the beginning\n") | |
| 412 elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)): | |
| 413 shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) | |
| 414 log.write(seq_record.id + " shotgun -> spacer found at the end\n") | |
| 415 else : | |
| 416 log.write(seq_record.id + " deleted -> partial spacer found in the middle of the read \n") | |
| 417 del_by_pairends += 1 | |
| 418 elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit: | |
| 419 seqs_to_return.append(seq_record) | |
| 420 if options.split_pairends: | |
| 421 sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:] | |
| 422 sub_seq1.id = seq_record.id + ".r" | |
| 423 pe_splited.append(sub_seq1) | |
| 424 sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]] | |
| 425 sub_seq2.id = seq_record.id + ".f" | |
| 426 pe_splited.append(sub_seq2) | |
| 427 elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: | |
| 428 log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n") | |
| 429 del_by_pairends += 1 | |
| 430 elif cross_match_tab[seq_record.id][0][1] < options.border_limit: | |
| 431 shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:]) | |
| 432 log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n") | |
| 433 elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit: | |
| 434 shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]]) | |
| 435 log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n") | |
| 436 | |
| 437 # If required, write down the split pairends files | |
| 438 if options.split_pairends: | |
| 439 handle = open(split_pairends_fasta, "w") | |
| 440 SeqIO.write(pe_splited, handle, "fasta") | |
| 441 handle.close() | |
| 442 handle = open(split_pairends_qual, "w") | |
| 443 SeqIO.write(pe_splited, handle, "qual") | |
| 444 handle.close() | |
| 445 | |
| 446 # Finally clean up the new shotgun seqs | |
| 447 [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options) | |
| 448 handle = open(shotgun_ffile, "w") | |
| 449 SeqIO.write(shotgun_seqs_clean, handle, "fasta") | |
| 450 handle.close() | |
| 451 in_shot_gun = len(shotgun_seqs_clean) | |
| 452 try: | |
| 453 handle = open(shotgun_qfile, "w") | |
| 454 SeqIO.write(shotgun_seqs_clean, handle, "qual") | |
| 455 handle.close() | |
| 456 except: | |
| 457 pass | |
| 459 | |
| 460 log.write("## header (pairends) : pairends\ttotal shotgun\n") | |
| 461 log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n") | |
| 462 | |
| 463 # Clean temporary files | |
| 464 try: | |
| 465 os.remove(crossmatch_input) | |
| 466 os.remove(spacers_file) | |
| 467 os.remove(crossmatch_input+".cross_match.res") | |
| 468 os.remove(crossmatch_input+".log") | |
| 469 os.remove(crossmatch_input+".qual") | |
| 470 except: | |
| 471 pass | |
| 472 | |
| 473 # Then return the pairend reads | |
| 474 return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends] | |
| 475 | |
| 476 | |
| 477 def get_seqs (options): | |
| 478 """ | |
| 479 Convert the input file into a table of BioPython SeqRecord objects | |
| 480 @param options : the options asked by the user | |
| 481 """ | |
| 482 | |
| 483 # First get fasta or/and qual input files | |
| 484 qual_file = "" | |
| 485 if options.format == "sff": | |
| 486 sff_file = options.input_file | |
| 487 | |
| 488 if sff_file.endswith(".gz"): | |
| 489 # Gunzip the given file into the output directory (the uncompressed copy is removed once processing is done) | |
| 490 r_file = gzip.GzipFile(sff_file, 'r') | |
| 491 write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz')) | |
| 492 w_file = open(write_file, 'w') | |
| 493 w_file.write(r_file.read()) | |
| 494 w_file.close() | |
| 495 r_file.close() | |
| 496 sff_file = write_file | |
| 497 | |
| 498 base = os.path.basename(sff_file) | |
| 499 fasta_file = os.path.join(options.output, base + '.fasta') | |
| 500 qual_file = os.path.join(options.output, base + '.qual') | |
| 501 xml_file = os.path.join(options.output, base + '.xml') | |
| 502 format = "fasta" | |
| 503 if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file): | |
| 504 #First extract the sff file to fasta, qual and xml file | |
| 505 cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file | |
| 506 os.system(cmd) | |
| 507 else : | |
| 508 log = open(os.path.join(options.output, options.log_file), "a+") | |
| 509 log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n") | |
| 510 log.close() | |
| 511 elif options.format == "fasta" or options.format == "fastq": | |
| 512 format = options.format | |
| 513 fasta_file = options.input_file | |
| 514 else : | |
| 515 print "Error : " + options.format + " is not a supported format!" | |
| 516 sys.exit(1) | |
| 517 | |
| 518 if options.qual_file: | |
| 519 qual_file = options.qual_file | |
| 520 | |
| 521 # If we got a quality file | |
| 522 if qual_file : | |
| 523 quals = {} | |
| 524 # If the quality file is gzipped | |
| 525 if qual_file.endswith(".gz"): | |
| 526 for qual in SeqIO.parse(gzip.open(qual_file), "qual") : | |
| 527 quals[qual.id] = qual | |
| 528 else : | |
| 529 for qual in SeqIO.parse(open(qual_file), "qual") : | |
| 530 quals[qual.id] = qual | |
| 531 | |
| 532 # If the fasta file is gzipped | |
| 533 if fasta_file.endswith(".gz"): | |
| 534 seqs = [] | |
| 535 for record in SeqIO.parse(gzip.open(fasta_file), format) : | |
| 536 if qual_file : | |
| 537 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] | |
| 538 seqs.append(record) | |
| 539 else : | |
| 540 seqs = [] | |
| 541 for record in SeqIO.parse(open(fasta_file), format) : | |
| 542 if qual_file : | |
| 543 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"] | |
| 544 seqs.append(record) | |
| 545 | |
| 546 # Clean temporary files | |
| 547 if options.format == "sff": | |
| 548 try: | |
| 549 os.remove(fasta_file) | |
| 550 os.remove(qual_file) | |
| 551 os.remove(xml_file) | |
| 552 except: | |
| 553 pass | |
| 554 | |
| 555 # Returns the table of sequences | |
| 556 return [seqs] | |
| 557 | |
| 558 def which (program): | |
| 559 """ | |
| 560 Return the full path of the given program if it is available in the user PATH, None otherwise | |
| 561 @param program : the program to look for | |
| 562 """ | |
| 563 import os | |
| 564 def is_exe(fpath): | |
| 565 return os.path.exists(fpath) and os.access(fpath, os.X_OK) | |
| 566 fpath, fname = os.path.split(program) | |
| 567 if fpath: | |
| 568 if is_exe(program): | |
| 569 return program | |
| 570 else: | |
| 571 for path in os.environ["PATH"].split(os.pathsep): | |
| 572 exe_file = os.path.join(path, program) | |
| 573 if is_exe(exe_file): | |
| 574 return exe_file | |
| 575 return None | |
| 576 | |
| 577 def pyrocleaner_depts_error(options): | |
| 578 """ | |
| 579 Return the list of missing dependencies required to run the pyrocleaner | |
| 580 """ | |
| 581 error = "" | |
| 582 if which("formatdb") == None: | |
| 583 error += " - formatdb is missing\n" | |
| 584 if which("megablast") == None: | |
| 585 error += " - megablast is missing\n" | |
| 586 if which("sff_extract.py") == None: | |
| 587 error += " - sff_extract.py\n" | |
| 588 if which("cross_match") == None and options.clean_pairends != None: | |
| 589 error += " - cross_match\n" | |
| 590 if error != "": | |
| 591 error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!" | |
| 592 return error | |
| 593 | |
| 594 | |
| 595 if __name__ == "__main__": | |
| 596 | |
| 597 parser = OptionParser(usage="Usage: pyrocleaner.py -i FILE [options]") | |
| 598 | |
| 599 usage = "usage: %prog -i file -o output -f format" | |
| 600 desc = " Clean sequences from 454 SFF files \n"\ | |
| 601 " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win" | |
| 602 parser = OptionParser(usage = usage, version = version_string(), description = desc) | |
| 603 | |
| 604 igroup = OptionGroup(parser, "Input files options","") | |
| 605 igroup.add_option("-i", "--in", dest="input_file", | |
| 606 help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE") | |
| 607 igroup.add_option("-q", "--qual", dest="qual_file", | |
| 608 help="The quality file to use if input file is fasta", metavar="FILE") | |
| 609 igroup.add_option("-f", "--format", dest="format", | |
| 610 help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff") | |
| 611 parser.add_option_group(igroup) | |
| 612 | |
| 613 ogroup = OptionGroup(parser, "Output files options","") | |
| 614 ogroup.add_option("-o", "--out", dest="output", | |
| 615 help="The output folder where to store results") | |
| 616 ogroup.add_option("-g", "--log", dest="log_file", | |
| 617 help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log") | |
| 618 ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends", | |
| 619 default=False, help="Write splited pairends sequences into a fasta file") | |
| 620 parser.add_option_group(ogroup) | |
| 621 | |
| 622 cgroup = OptionGroup(parser, "Cleaning options","") | |
| 623 cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends", | |
| 624 default=False, help="Clean pairends") | |
| 625 cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std", | |
| 626 default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation") | |
| 627 cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win", | |
| 628 default=False, help="Filter reads with a legnth in between [x:y]") | |
| 629 cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns", | |
| 630 default=False, help="Filter reads with too much N") | |
| 631 cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads", | |
| 632 default=False, help="Filter duplicated reads") | |
| 633 cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win", | |
| 634 default=False, help="Filter low complexity reads computed on a sliding window") | |
| 635 cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full", | |
| 636 default=False, help="Filter low complexity reads computed on the whole sequence") | |
| 637 cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality", | |
| 638 default=False, help="Filter low quality reads") | |
| 639 parser.add_option_group(cgroup) | |
| 640 | |
| 641 pgroup = OptionGroup(parser, "Processing options","") | |
| 642 pgroup.add_option("-a", "--acpus", dest="nb_cpus", | |
| 643 help="Number of processors to use", type="int", default=1) | |
| 644 pgroup.add_option("-r", "--recursion", dest="recursion", | |
| 645 help="Recursion limit when computing duplicated reads", type="int", default=1000) | |
| 646 parser.add_option_group(pgroup) | |
| 647 | |
| 648 cpgroup = OptionGroup(parser, "Cleaning parameters","") | |
| 649 cpgroup.add_option("-b", "--border-limit", dest="border_limit", | |
| 650 help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70) | |
| 651 cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive", | |
| 652 default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)") | |
| 653 cpgroup.add_option("-e", "--missmatch", dest="missmatch", | |
| 654 help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10) | |
| 655 cpgroup.add_option("-j", "--std", dest="std", | |
| 656 help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2) | |
| 657 cpgroup.add_option("-1", "--min", dest="min", | |
| 658 help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200) | |
| 659 cpgroup.add_option("-2", "--max", dest="max", | |
| 660 help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600) | |
| 661 cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold", | |
| 662 help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35) | |
| 663 cpgroup.add_option("-s", "--ns_percent", dest="ns_percent", | |
| 664 help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4) | |
| 665 cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit", | |
| 666 help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70) | |
| 667 cpgroup.add_option("-v", "--window", dest="window", | |
| 668 help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100) | |
| 669 cpgroup.add_option("-x", "--step", dest="step", | |
| 670 help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5) | |
| 671 cpgroup.add_option("-y", "--complexity", dest="complexity", | |
| 672 help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40) | |
| 673 parser.add_option_group(cpgroup) | |
| 674 | |
| 675 (options, args) = parser.parse_args() | |
| 676 | |
| 677 sys.setrecursionlimit(options.recursion) | |
| 678 | |
| 679 if not pyrocleaner_depts_error(options): | |
| 680 | |
| 681 if options.input_file == None or options.output == None or options.format == None: | |
| 682 parser.print_help() | |
| 683 sys.exit(1) | |
| 684 elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality : | |
| 685 print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help" | |
| 686 sys.exit(1) | |
| 687 else: | |
| 688 | |
| 689 # Create the output directory if it doesn't exist | |
| 690 if not os.path.isdir(options.output) : os.mkdir(options.output) | |
| 691 | |
| 692 global log | |
| 693 log = open(options.output+"/"+options.log_file, "w") | |
| 694 log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n") | |
| 695 log.write("## with the following options: \n") | |
| 696 opts = "" | |
| 697 if options.clean_complexity_full: | |
| 698 opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n" | |
| 699 if options.clean_complexity_win: | |
| 700 opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n" | |
| 701 if options.clean_duplicated_reads and not options.clean_aggressive: | |
| 702 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates.\n" | |
| 703 elif options.clean_duplicated_reads and options.clean_aggressive : | |
| 704 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates and keep only one read per cluster.\n" | |
| 705 if options.clean_length_std: | |
| 706 opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviation and reads longer than reads length mean plus " + str(options.std) + " standard deviation.\n" | |
| 707 if options.clean_length_win: | |
| 708 opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n" | |
| 709 if options.clean_ns: | |
| 710 opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n" | |
| 711 if options.clean_pairends: | |
| 712 opts += " - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than " + str(options.border_limit) + " nucleaotides or if " + str(options.missmatch) + " nucleotides missmatch with the spacer.\n" | |
| 713 if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) : | |
| 714 opts += " - Clean reads if no bases quality has a score higher than " + str(options.quality_threshold) + ".\n" | |
| 715 elif options.clean_quality : | |
| 716 print "No quality check will be performed beacause there is no quality file provided." | |
| 717 log.write(opts) | |
| 718 log.close() | |
| 719 | |
| 720 # 1 - First get inputs from options | |
| 721 [seqs] = get_seqs(options) | |
| 722 before_cleaning = len(seqs) | |
| 723 del_by_length = 0 | |
| 724 del_by_ns = 0 | |
| 725 del_by_complexity = 0 | |
| 726 del_by_duplicat = 0 | |
| 727 del_by_pairends = 0 | |
| 728 del_by_quality = 0 | |
| 729 in_shot_gun = 0 | |
| 730 | |
| 731 log = open(options.output+"/"+options.log_file, "a+") | |
| 732 # 2 - clean reads | |
| 733 # 2.1 - Clean reads by length, ns, complexity | |
| 734 if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality: | |
| 735 [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options) | |
| 739 | |
| 740 # 2.2 - Clean reads by duplicat | |
| 741 if options.clean_duplicated_reads: | |
| 742 [seqs, del_by_duplicat] = filter_same_reads(seqs, options) | |
| 743 | |
| 744 # 3 - Clean pairends | |
| 745 if options.clean_pairends: | |
| 746 [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options) | |
| 747 del_by_length += del_by_length_pe | |
| 748 del_by_ns += del_by_ns_pe | |
| 749 del_by_complexity += del_by_complexity_pe | |
| 750 del_by_quality += del_by_quality_pe | |
| 751 | |
| 752 # 4 - Create the output in the same format as the input | |
| 753 after_cleaning = len(seqs) + in_shot_gun | |
| 754 if options.input_file.endswith(".gz"): | |
| 755 ifile = string.rstrip(options.input_file, '.gz') | |
| 756 else : ifile = options.input_file | |
| 757 if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean") | |
| 758 else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean") | |
| 759 output += "." + options.format | |
| 760 if options.format == "sff" : | |
| 761 if options.input_file.endswith(".gz"): | |
| 762 sfffile = os.path.join(options.output, string.rstrip(os.path.basename(options.input_file), '.gz')) | |
| 763 else : | |
| 764 sfffile = options.input_file | |
| 765 if which("sfffile") == None: | |
| 766 fastq = open(output, "w") | |
| 767 SeqIO.write(seqs, fastq, "fastq") | |
| 768 fastq.close() | |
| 769 else : | |
| 770 reads_to_sff(sfffile, seqs, output) | |
| 771 # Clean temporary files | |
| 772 if options.input_file.endswith(".gz"): | |
| 773 os.remove(sfffile) | |
| 774 | |
| 775 elif options.format == "fasta" or options.format == "fastq": | |
| 776 fasta = open(output, "w") | |
| 777 SeqIO.write(seqs, fasta, options.format) | |
| 778 fasta.close() | |
| 779 if options.format == "fasta" and options.qual_file != None: | |
| 780 qual = open(os.path.splitext(output)[0]+".qual", "w") | |
| 781 SeqIO.write(seqs, qual, "qual") | |
| 782 qual.close() | |
| 783 | |
| 784 # 5 - Write down the analysis summary | |
| 785 log.write("## header (global) : nb reads at begining\tnb reads after clean up\tlength\tns\tcomplexity\tduplicat\tpairends\tquality\n") | |
| 786 log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning) + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n") | |
| 787 log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n") | |
| 788 log.close() | |
| 789 sys.exit(0) | |
| 790 else : | |
| 791 print pyrocleaner_depts_error(options) | |
| 792 sys.exit(1) | |
| 793 |
