pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py @ 0:ef5dd11c01e6 (default, tip)
pyrocleaner v1.2

author: g2cmnty@test-web1.g2.bx.psu.edu
date:   Thu, 09 Jun 2011 06:09:09 -0400
#
# Pyrocleaner
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ChangeLog
# v1.2 (05/2011) : Corrects a bug with the --clean-pairends option introduced when adding the
#                  --clean-quality option
# v1.1 (02/2011) : Adds the --clean-quality option to clean reads based on their base qualities
#                  Adds the --aggressive option in order to keep only one read per cluster
#                  Modifies the duplication strategy so that the longest read is kept
# v1.0 (09/2009) : Pyrocleaner first version
#

__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.2'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'

from Bio import SeqIO
from igraph import Graph
from optparse import OptionParser, OptionGroup
import os, math, datetime, zlib, sys, re, glob, string, gzip


def create_roche_pairends_spacer_file(out_dir):
    """
    Create the Roche pairends spacers fasta file
    @param out_dir : the directory where the fasta file will be written
    """
    spacer_file = open(os.path.join(out_dir, "roche_spacers.fna"), 'w')
    spacer_file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n")
    spacer_file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n")
    spacer_file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n")
    spacer_file.close()
    return spacer_file.name

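# Illustrative usage (a sketch, not part of the tool's execution path;
# "/tmp/out" is a hypothetical, already existing directory):
#
#     spacer_path = create_roche_pairends_spacer_file("/tmp/out")
#     # -> "/tmp/out/roche_spacers.fna", a fasta file holding the flx,
#     #    titanium1 and titanium2 pairend spacer sequences
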
def version_string ():
    """
    Return the pyrocleaner version
    """
    return "pyrocleaner " + __version__


def reads_to_sff (sff_file, seqs, output_file):
    """
    Extract seqs reads from the sff_file
    @dependencies sfffile : the Roche sfffile program is required to execute this function
    @param sff_file    : the input sff file
    @param seqs        : table of seqs to extract
    @param output_file : the name of the output sff file
    """
    # First create the file listing the reads to keep
    tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt")
    to_write_file = open(tmp_file, 'w')
    for read in seqs:
        to_write_file.write(read.id + "\n")
    to_write_file.close()
    # Use the sfffile tool
    cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file
    os.system(cmd)
    # Clean temporary files
    try: os.remove(tmp_file)
    except: pass

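# A minimal sketch of what reads_to_sff produces and runs (the command comes
# from the code above; the accession names are hypothetical):
#
#     reads_to_sff.txt            # one read accession per line
#         FX5ZSLB01A2B3C
#         FX5ZSLB01D4E5F
#
#     sfffile -i reads_to_sff.txt -o clean.sff input.sff
#
# sfffile then writes clean.sff containing only the listed reads.
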
def filter_reads (seqs, options):
    """
    Filter input seqs by length, Ns, complexity and quality, if asked by the user
    @param seqs    : table of seqs to filter
    @param options : the options asked by the user
    """
    reads_id = []
    reads_length = []
    reads_ok = []
    del_by_ns = 0
    del_by_complexity = 0
    del_by_length = 0
    del_by_quality = 0
    log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n")

    # Go through all sequences
    for i, reads_record in enumerate(seqs) :
        reads_id.append(reads_record.id)
        reads_ok.append(0)

        # If asked to clean sequences by length using the standard
        # deviation, save the length to compute some statistics
        if options.clean_length_std :
            reads_length.append(len(reads_record))

        # If asked to clean sequences by length using a window
        # and the sequence has not been flagged as deleted
        if options.clean_length_win and reads_ok[i] == 0:
            # Check if the sequence is longer than the requested min, if not flag it as deleted
            if len(reads_record) < int(options.min):
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n")
            # Check if the sequence is shorter than the requested max, if not flag it as deleted
            elif len(reads_record) > int(options.max):
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n")

        # If asked to clean sequences with too many Ns
        # and the sequence has not been flagged as deleted
        if options.clean_ns and reads_ok[i] == 0:
            # Compute the rate of Ns in the current sequence
            nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100)
            # If the rate is higher than the threshold, flag the sequence as deleted
            if nb_n > float(options.ns_percent) :
                reads_ok[i] = 1
                del_by_ns += 1
                log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n")

        # If asked to clean low complexity sequences using a sliding window
        # and the sequence has not been flagged as deleted
        if options.clean_complexity_win and reads_ok[i] == 0:
            is_complex = False
            # For each window
            for win in range(0, len(reads_record)-options.window, options.step):
                start = win
                stop = start + options.window
                # Compute the window complexity as a zlib compression ratio (in %)
                w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100)
                # If the window complexity reaches the threshold, flag the whole sequence as complex
                if w_cplx >= float(options.complexity):
                    is_complex = True
                    break
            # If no window has been flagged as complex, flag the sequence as deleted
            if not is_complex:
                reads_ok[i] = 1
                del_by_complexity += 1
                log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n")

        # If asked to clean low complexity sequences working on the whole sequence
        # and the sequence has not been flagged as deleted
        if options.clean_complexity_full and reads_ok[i] == 0:
            # Compute the complexity on the whole sequence
            cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100)
            # If the complexity is lower than the threshold, flag the sequence as deleted
            if cplx < float(options.complexity) :
                reads_ok[i] = 1
                del_by_complexity += 1
                log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n")

        # If asked to clean low quality sequences, quality information is available
        # and the sequence has not been flagged as deleted
        if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"):
            # Keep the read only if at least one base reaches the quality threshold
            max_qual = max(reads_record.letter_annotations["phred_quality"])
            if max_qual < int(options.quality_threshold) :
                reads_ok[i] = 1
                del_by_quality += 1
                log.write(reads_id[i] + " deleted -> Quality ( Reads maximum quality " + str(max_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n")

    # If asked to clean sequences by length using the standard deviation
    if options.clean_length_std :
        # Compute the mean and the standard deviation (etype)
        mean = float(sum(reads_length)) / len(reads_length)
        mq = mean**2
        s = sum([ x**2 for x in reads_length])
        var = float(s)/len(reads_length) - mq
        etype = math.sqrt(var)
        # For each sequence
        for i, id in enumerate(reads_id):
            # If the sequence has not been flagged as deleted
            if reads_ok[i] == 0 :
                # If the sequence length is higher than the upper threshold, flag the sequence as deleted
                if reads_length[i] > mean + options.std*etype:
                    reads_ok[i] = 1
                    del_by_length += 1
                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n")
                # If the sequence length is smaller than the lower threshold, flag the sequence as deleted
                elif reads_length[i] < mean - options.std*etype:
                    reads_ok[i] = 1
                    del_by_length += 1
                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "-" + str(options.std) + "*" + str(etype) + " )\n")

    seqs_to_return = []
    # Then keep only sequences not flagged as deleted
    for i, reads_record in enumerate(seqs) :
        if reads_ok[i] == 0:
            seqs_to_return.append(reads_record)

    # Return values
    return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality]

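# The complexity test above is a compression heuristic: a stretch of sequence
# is scored as 100 * len(zlib.compress(seq)) / len(seq), so repetitive text
# scores low and diverse text scores high. A self-contained sketch (the
# sequences are hypothetical and the exact compressed sizes depend on the
# zlib build, but the qualitative behaviour holds):
#
#     import zlib
#     ratio = lambda s: 100.0 * len(zlib.compress(s)) / len(s)
#     low  = "A" * 100    # homopolymer, compresses very well -> ratio ~10%
#     high = "ATGCGTACTGACTGATCGTACGATCGGATCCTAGCTAGCATCGATGCATGCAACGTGGCT"
#     # ratio(low) falls far below the default threshold of 40, while
#     # ratio(high) is typically well above it, so only `low` is deleted.
#
# In the sliding-window variant a read survives as soon as ONE window reaches
# the threshold; under Python 2 str and bytes coincide, so
# zlib.compress(str(record.seq)) works directly on the sequence text.
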
def filter_same_reads (seqs, options):
    """
    Filter duplicated input seqs : if sequences are too similar, keep only one to represent the cluster
    @param seqs    : table of seqs to filter
    @param options : the options asked by the user
    """

    megablast_input = os.path.join(options.output, "megablast_input.fasta")
    log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n")
    log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n")

    # First write down seqs
    fasta = open(megablast_input, "w")
    SeqIO.write(seqs, fasta, "fasta")
    fasta.close()

    # Then run formatdb on the fasta file (there is no formatdb util in Biopython)
    cmd = "formatdb -i %s -p F" % megablast_input
    os.system(cmd)
    log.write("## megablast (" + str(datetime.datetime.now()) + ")\n")

    # In case of pairends, use words of 50 so megablast cannot connect reads through the spacers
    opts = ""
    if options.clean_pairends:
        opts = " -W 50 -H 10 "

    # Megablast the fasta file against itself
    cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res"
    os.system(cmd)

    # Let's get the reads length with the fasta file and create the graph
    log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n")
    gseqs = {}
    vertices = []
    for reads_record in seqs :
        vertices.append({'name': reads_record.id})
        gseqs[reads_record.id] = len(reads_record)

    # Connect reads from hits starting at 1 and with ends close to each other
    log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n")
    edges = []
    for read in open(megablast_input + ".res", 'rU').readlines() :
        parts = read.rstrip().split()
        len1 = gseqs[parts[0]]
        len2 = gseqs[parts[1]]
        if options.clean_aggressive :
            edges.append({'source': parts[0], 'target': parts[1]})
        elif math.fabs(len1-len2) < options.duplication_limit:
            if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit:
                # These alignments are really similar -> may be the same read -> link them in the graph
                edges.append({'source': parts[0], 'target': parts[1]})

    # Then get connected components and extract one read per component as cluster leader
    log.write("## Compute connected components (" + str(datetime.datetime.now()) + ")\n")
    gr = Graph.DictList(vertices, edges)
    connex = gr.clusters().membership

    clusters = {}
    for i, vertex in enumerate(vertices):
        cluster_id = connex[i]
        try:
            clusters[cluster_id].append(seqs[i])
        except KeyError:
            clusters[cluster_id] = [seqs[i]]

    del_by_duplicat = 0
    # Write down into the log the composition of each cluster
    clusters_stats = {}
    seqs_to_return = []
    for cluster_id in clusters:
        cl_elts = ""
        del_by_duplicat += len(clusters[cluster_id]) - 1
        longest_value = 0
        cluster_leader = None
        for seq_record in clusters[cluster_id]:
            # Find out which sequence is the longest
            if len(seq_record.seq) > longest_value:
                longest_value = len(seq_record.seq)
                cluster_leader = seq_record

        seqs_to_return.append(cluster_leader)
        for seq_record in clusters[cluster_id]:
            if seq_record.id != cluster_leader.id:
                log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicate )\n")
                cl_elts += seq_record.id + " "
        log.write("## cluster leader: " + cluster_leader.id + " of cluster composed of : " + cl_elts + "\n")

        try :
            clusters_stats[len(clusters[cluster_id])] += 1
        except KeyError:
            clusters_stats[len(clusters[cluster_id])] = 1

    # Write down a summary of what has been done
    log_header = "## header (duplicated) : "
    log_summary = "## summary (duplicated) : "
    for stat in sorted(clusters_stats.keys()):
        log_header += str(stat) + "\t"
        log_summary += str(clusters_stats[stat]) + "\t"
    log.write(log_header + "\n")
    log.write(log_summary + "\n")

    # Clean temporary files
    try:
        os.remove(megablast_input)
        os.remove(megablast_input+".nhr")
        os.remove(megablast_input+".nin")
        os.remove(megablast_input+".nsq")
        os.remove(megablast_input+".res")
        os.remove("formatdb.log")
        os.remove("error.log")
    except:
        pass

    # Return results
    return [seqs_to_return, del_by_duplicat]

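# The .res file parsed above is standard BLAST tabular output (-D 3), one hit
# per line. An illustrative line (read names and values are hypothetical):
#
#     # qid    sid      %id   len  mm gap qstart qend sstart send evalue score
#     read_42  read_87  99.1  401  3  0   1      401  1      401  1e-180 780
#
# Columns 0-1 are the query/subject ids, 6-7 the query start/end and 8-9 the
# subject start/end: the perl filter keeps hits whose query and subject both
# start at 1, and the python code then links two reads when both alignment
# ends lie within options.duplication_limit of the read lengths. Connected
# components come from python-igraph: Graph.DictList(vertices, edges) builds
# the graph from the dict lists, and gr.clusters().membership maps each
# vertex index to its component id.
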
def filter_pairends(seqs, options):
    """
    Filter pairends sequences and split sequences without pairends into a fasta file
    @param seqs    : the table of sequences to filter
    @param options : the options asked by the user
    """

    # Setup output files
    shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta")
    shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual")
    crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta")
    split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta")
    split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual")

    # First write down seqs
    fasta = open(crossmatch_input, "w")
    SeqIO.write(seqs, fasta, "fasta")
    fasta.close()

    # Write down the qual file for cross_match if possible
    try:
        qual = open(crossmatch_input+".qual", "w")
        SeqIO.write(seqs, qual, "qual")
        qual.close()
    except:
        os.remove(crossmatch_input+".qual")

    # Cross_match reverse match pattern
    rev_regex = re.compile(r"(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
    # Cross_match forward match pattern
    fwd_regex = re.compile(r"(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")

    # Write the spacer file and execute cross_match against the input sequences
    spacers_file = create_roche_pairends_spacer_file(options.output)
    cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res"
    os.system(cmd)

    # Parse the cross_match file
    cross_match_tab = {}
    block_found = False
    for line in open(crossmatch_input + ".cross_match.res", 'r'):
        save_line = False
        rm = rev_regex.match(line)
        fm = fwd_regex.match(line)
        if rm != None: # If it's a reverse match
            block_found = True
            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch) = (float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8)))
            save_line = True
        elif fm != None: # If it's a forward match
            block_found = True
            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch) = (float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8)))
            save_line = True
        else : save_line = False

        if line.startswith("Discrepancy summary:"): # This is the end of the section
            break

        # Save the line, accumulating all matches of a read so multiple spacers can be detected
        if save_line:
            entry = [secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)]
            try:
                cross_match_tab[primary_match].append(entry)
            except KeyError:
                cross_match_tab[primary_match] = [entry]

    # Then get the spacers length
    spacers_length = {}
    for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") :
        spacers_length[seq_record.id] = len(seq_record.seq.tostring())

    # Finally go through all sequences
    seqs_to_return = []
    shotgun_seqs = []
    del_by_pairends = 0
    pe_splited = []

    for i, seq_record in enumerate(seqs) :
        if not cross_match_tab.has_key(seq_record.id):
            # No spacer found -> just add it to the shotgun_seqs tab
            shotgun_seqs.append(seq_record)
            log.write(seq_record.id + " shotgun -> no spacer found\n")
        elif len(cross_match_tab[seq_record.id]) > 1:
            # If multiple spacers -> delete the whole sequence
            log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n")
            del_by_pairends += 1
        elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)):
            # Only a partial spacer matched
            if (cross_match_tab[seq_record.id][0][1] == 1):
                shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
                log.write(seq_record.id + " shotgun -> spacer found at the beginning\n")
            elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)):
                shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
                log.write(seq_record.id + " shotgun -> spacer found at the end\n")
            else :
                log.write(seq_record.id + " deleted -> partial spacer found in the middle of the read\n")
                del_by_pairends += 1
        elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit:
            # Both borders are long enough -> keep the read as a pairend
            seqs_to_return.append(seq_record)
            if options.split_pairends:
                sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:]
                sub_seq1.id = seq_record.id + ".r"
                pe_splited.append(sub_seq1)
                sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]]
                sub_seq2.id = seq_record.id + ".f"
                pe_splited.append(sub_seq2)
        elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
            log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n")
            del_by_pairends += 1
        elif cross_match_tab[seq_record.id][0][1] < options.border_limit:
            shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
            log.write(seq_record.id + " shotgun -> spacer found : left border too short, deleted\n")
        elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
            shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
            log.write(seq_record.id + " shotgun -> spacer found : right border too short, deleted\n")

    # If required, write down the split pairends files
    if options.split_pairends:
        handle = open(split_pairends_fasta, "w")
        SeqIO.write(pe_splited, handle, "fasta")
        handle.close()
        handle = open(split_pairends_qual, "w")
        SeqIO.write(pe_splited, handle, "qual")
        handle.close()

    # Finally clean up the new shotgun seqs
    [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options)
    handle = open(shotgun_ffile, "w")
    SeqIO.write(shotgun_seqs_clean, handle, "fasta")
    handle.close()
    in_shot_gun = 0
    try:
        handle = open(shotgun_qfile, "w")
        SeqIO.write(shotgun_seqs_clean, handle, "qual")
        handle.close()
        in_shot_gun = len(shotgun_seqs_clean)
    except:
        pass

    log.write("## header (pairends) : pairends\ttotal shotgun\n")
    log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n")

    # Clean temporary files
    try:
        os.remove(crossmatch_input)
        os.remove(spacers_file)
        os.remove(crossmatch_input+".cross_match.res")
        os.remove(crossmatch_input+".log")
        os.remove(crossmatch_input+".qual")
    except:
        pass

    # Then return the pairend ones
    return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends]

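# The two regexes above target cross_match's alignment lines. Illustrative
# forward and reverse-complement ('C') hits of a read against a spacer (read
# name, scores and coordinates are hypothetical; real spacing differs):
#
#     45  2.38 0.00 0.00  GFLX0042  101  144 (310)    flx        1   44 (0)
#     43  2.38 0.00 0.00  GFLX0042  101  142 (312)  C titanium1 (0)  42   1
#
# The captured fields are: percent substitutions, read id, read start/end,
# spacer id and spacer start/end; the 'C' marks a reverse-complement hit,
# which is why the reverse regex swaps the last two captured coordinates.
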
def get_seqs (options):
    """
    Convert the input file into a table of Biopython seq records
    @param options : the options asked by the user
    """

    # First get fasta or/and qual input files
    qual_file = ""
    if options.format == "sff":
        sff_file = options.input_file

        if sff_file.endswith(".gz"):
            # Gunzip the given file into the output directory
            r_file = gzip.GzipFile(sff_file, 'r')
            write_file = os.path.join(options.output, os.path.basename(sff_file)[:-len('.gz')])
            w_file = open(write_file, 'w')
            w_file.write(r_file.read())
            w_file.close()
            r_file.close()
            sff_file = write_file

        base = os.path.basename(sff_file)
        fasta_file = os.path.join(options.output, base + '.fasta')
        qual_file = os.path.join(options.output, base + '.qual')
        xml_file = os.path.join(options.output, base + '.xml')
        format = "fasta"
        if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
            # First extract the sff file to fasta, qual and xml files
            cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
            os.system(cmd)
        else :
            log = open(os.path.join(options.output, options.log_file), "a+")
            log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
            log.close()
    elif options.format == "fasta" or options.format == "fastq":
        format = options.format
        fasta_file = options.input_file
    else :
        print "Error : " + options.format + " is not a supported format!"
        sys.exit(1)

    if options.qual_file:
        qual_file = options.qual_file

    # If we got a quality file, index qualities by read id (handling gziped files too)
    if qual_file :
        quals = {}
        if qual_file.endswith(".gz"):
            qual_handle = gzip.open(qual_file)
        else :
            qual_handle = open(qual_file)
        for qual in SeqIO.parse(qual_handle, "qual") :
            quals[qual.id] = qual

    # Parse the (possibly gziped) fasta/fastq file and attach the qualities
    if fasta_file.endswith(".gz"):
        seq_handle = gzip.open(fasta_file)
    else :
        seq_handle = open(fasta_file)
    seqs = []
    for record in SeqIO.parse(seq_handle, format) :
        if qual_file :
            record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
        seqs.append(record)

    # Clean temporary files
    if options.format == "sff":
        try:
            os.remove(fasta_file)
            os.remove(qual_file)
            os.remove(xml_file)
        except:
            pass

    # Return the table of sequences
    return [seqs]

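# A minimal sketch of the fasta + qual pairing done above (file names are
# hypothetical; with old Biopython releases the helper
# Bio.SeqIO.QualityIO.PairedFastaQualIterator offers the same pairing in one call):
#
#     from Bio import SeqIO
#     quals = dict((q.id, q) for q in SeqIO.parse(open("reads.qual"), "qual"))
#     for rec in SeqIO.parse(open("reads.fasta"), "fasta"):
#         rec.letter_annotations["phred_quality"] = \
#             quals[rec.id].letter_annotations["phred_quality"]
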
def which (program):
    """
    Return the full path of the asked program if it exists in the user PATH, None otherwise
    @param program : the program name or path to look for
    """
    def is_exe(fpath):
        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None

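# Illustrative lookups (returned paths depend on the local install):
#
#     which("megablast")       # -> e.g. "/usr/bin/megablast" if on the PATH
#     which("/opt/bin/foo")    # -> "/opt/bin/foo" only if it is executable
#     which("not_installed")   # -> None
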
def pyrocleaner_depts_error(options):
    """
    Return the list of dependencies missing to run the pyrocleaner
    """
    error = ""
    if which("formatdb") == None:
        error += " - formatdb is missing\n"
    if which("megablast") == None:
        error += " - megablast is missing\n"
    if which("sff_extract.py") == None:
        error += " - sff_extract.py is missing\n"
    if which("cross_match") == None and options.clean_pairends != None:
        error += " - cross_match is missing\n"
    if error != "":
        error = "Cannot execute pyrocleaner, the following dependencies are missing :\n" + error + "Please install them before running the pyrocleaner!"
    return error


if __name__ == "__main__":

    usage = "usage: %prog -i file -o output -f format"
    desc = " Clean sequences from 454 SFF files \n"\
           " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win"
    parser = OptionParser(usage = usage, version = version_string(), description = desc)

    igroup = OptionGroup(parser, "Input files options","")
    igroup.add_option("-i", "--in", dest="input_file",
                      help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
    igroup.add_option("-q", "--qual", dest="qual_file",
                      help="The quality file to use if the input file is fasta", metavar="FILE")
    igroup.add_option("-f", "--format", dest="format",
                      help="The format of the input file [sff|fastq|fasta], default is sff", type="string", default="sff")
    parser.add_option_group(igroup)

    ogroup = OptionGroup(parser, "Output files options","")
    ogroup.add_option("-o", "--out", dest="output",
                      help="The output folder where to store results")
    ogroup.add_option("-g", "--log", dest="log_file",
                      help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log")
    ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends",
                      default=False, help="Write split pairends sequences into a fasta file")
    parser.add_option_group(ogroup)

    cgroup = OptionGroup(parser, "Cleaning options","")
    cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends",
                      default=False, help="Clean pairends")
    cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std",
                      default=False, help="Filter reads shorter than the mean minus x standard deviations or longer than the mean plus x standard deviations")
    cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win",
                      default=False, help="Filter reads with a length outside [min:max]")
    cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns",
                      default=False, help="Filter reads with too many Ns")
    cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads",
                      default=False, help="Filter duplicated reads")
    cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win",
                      default=False, help="Filter low complexity reads computed on a sliding window")
    cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full",
                      default=False, help="Filter low complexity reads computed on the whole sequence")
    cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality",
                      default=False, help="Filter low quality reads")
    parser.add_option_group(cgroup)

    pgroup = OptionGroup(parser, "Processing options","")
    pgroup.add_option("-a", "--acpus", dest="nb_cpus",
                      help="Number of processors to use", type="int", default=1)
    pgroup.add_option("-r", "--recursion", dest="recursion",
                      help="Recursion limit when computing duplicated reads", type="int", default=1000)
    parser.add_option_group(pgroup)

    cpgroup = OptionGroup(parser, "Cleaning parameters","")
    cpgroup.add_option("-b", "--border-limit", dest="border_limit",
                       help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70)
    cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive",
                       default=False, help="Filter all duplicated reads gathered in a cluster to keep only one (used with --clean-duplicated-reads, default=False)")
    cpgroup.add_option("-e", "--missmatch", dest="missmatch",
                       help="Maximum number of mismatched nucleotides (used with --clean-pairends option, default=10)", type="int", default=10)
    cpgroup.add_option("-j", "--std", dest="std",
                       help="Number of standard deviations to use (used with --clean-length-std option, default=2)", type="int", default=2)
    cpgroup.add_option("-1", "--min", dest="min",
                       help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200)
    cpgroup.add_option("-2", "--max", dest="max",
                       help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600)
    cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold",
                       help="At least one base has to have a quality equal to or higher than this value (used with --clean-quality, default=35)", type="int", default=35)
    cpgroup.add_option("-s", "--ns_percent", dest="ns_percent",
                       help="Percentage of Ns above which reads are filtered (used with --clean-ns option, default=4)", type="int", default=4)
    cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit",
                       help="Size difference limit (used with --clean-duplicated-reads, default=70)", type="int", default=70)
    cpgroup.add_option("-v", "--window", dest="window",
                       help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100)
    cpgroup.add_option("-x", "--step", dest="step",
                       help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5)
    cpgroup.add_option("-y", "--complexity", dest="complexity",
                       help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40)
    parser.add_option_group(cpgroup)

    (options, args) = parser.parse_args()

    sys.setrecursionlimit(options.recursion)

    if not pyrocleaner_depts_error(options):

        if options.input_file == None or options.output == None or options.format == None:
            parser.print_help()
            sys.exit(1)
        elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality :
            print "Please select at least one cleaning option among [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality]. For help use the option -h or --help."
            sys.exit(1)
        else:

            # Create the output directory if it doesn't exist
            if not os.path.isdir(options.output) : os.mkdir(options.output)

            global log
            log = open(options.output+"/"+options.log_file, "w")
            log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
            log.write("## with the following options: \n")
            opts = ""
            if options.clean_complexity_full:
                opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n"
            if options.clean_complexity_win:
                opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n"
            if options.clean_duplicated_reads and not options.clean_aggressive:
                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between read ends to consider them as duplicates.\n"
            elif options.clean_duplicated_reads and options.clean_aggressive :
                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between read ends to consider them as duplicates, and keep only one read per cluster.\n"
            if options.clean_length_std:
                opts += " - Clean reads shorter than the read length mean minus " + str(options.std) + " standard deviations and reads longer than the read length mean plus " + str(options.std) + " standard deviations.\n"
            if options.clean_length_win:
                opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n"
            if options.clean_ns:
                opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n"
            if options.clean_pairends:
                opts += " - Clean pairend reads if the sequence size between the spacer and the read beginning/end is higher than " + str(options.border_limit) + " nucleotides or if " + str(options.missmatch) + " nucleotides mismatch with the spacer.\n"
            if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) :
                opts += " - Clean reads if no base has a quality score equal to or higher than " + str(options.quality_threshold) + ".\n"
            elif options.clean_quality :
                print "No quality check will be performed because no quality file was provided."
            log.write(opts)
            log.close()

            # 1 - First get inputs from options
            [seqs] = get_seqs(options)
            before_cleaning = len(seqs)
            del_by_length = 0
            del_by_ns = 0
            del_by_complexity = 0
            del_by_duplicat = 0
            del_by_pairends = 0
            del_by_quality = 0
            in_shot_gun = 0

            log = open(options.output+"/"+options.log_file, "a+")
            # 2 - Clean reads
            # 2.1 - Clean reads by length, Ns, complexity and quality
            if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality:
                [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options)

            # 2.2 - Clean duplicated reads
            if options.clean_duplicated_reads:
                [seqs, del_by_duplicat] = filter_same_reads(seqs, options)

            # 3 - Clean pairends
            if options.clean_pairends:
                [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options)
                del_by_length += del_by_length_pe
                del_by_ns += del_by_ns_pe
                del_by_complexity += del_by_complexity_pe
                del_by_quality += del_by_quality_pe

            # 4 - Create the output in the same format as the input
            after_cleaning = len(seqs) + in_shot_gun
            if options.input_file.endswith(".gz"):
                ifile = options.input_file[:-len('.gz')]
            else : ifile = options.input_file
            if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean")
            else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean")
            output += "." + options.format
            if options.format == "sff" :
                if options.input_file.endswith(".gz"):
                    sfffile = os.path.join(options.output, os.path.basename(options.input_file)[:-len('.gz')])
                else :
                    sfffile = options.input_file
                if which("sfffile") == None:
                    # Without the sfffile tool, fall back to writing the kept reads as fastq
                    fastq = open(output, "w")
                    SeqIO.write(seqs, fastq, "fastq")
                    fastq.close()
                else :
                    reads_to_sff(sfffile, seqs, output)
                # Clean temporary files
                if options.input_file.endswith(".gz"):
                    os.remove(sfffile)

            elif options.format == "fasta" or options.format == "fastq":
                fasta = open(output, "w")
                SeqIO.write(seqs, fasta, options.format)
                fasta.close()
                if options.format == "fasta" and options.qual_file != None:
                    qual = open(os.path.splitext(output)[0]+".qual", "w")
                    SeqIO.write(seqs, qual, "qual")
                    qual.close()

            # 5 - Write down the analysis summary
            log.write("## header (global) : nb reads at beginning\tnb reads after clean up\tlength\tns\tcomplexity\tduplicate\tpairends\tquality\n")
            log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning) + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n")
            log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
            log.close()
            sys.exit(0)
    else :
        print pyrocleaner_depts_error(options)
        sys.exit(1)

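# Illustrative invocations (paths and file names are hypothetical; the sff
# example is the one from the tool's own description above):
#
#     pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends \
#         --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win
#
#     pyrocleaner.py -i reads.fasta -q reads.qual -f fasta -o cleaned/ \
#         --clean-length-win --min 150 --max 500 --clean-quality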