Mercurial > repos > peterjc > fastq_paired_unpaired
comparison tools/fastq/fastq_paired_unpaired.py @ 0:72e9fcaec61f
Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:21:17 -0400 |
parents | |
children | 7ed81e36fc1c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:72e9fcaec61f |
---|---|
1 #!/usr/bin/env python | |
2 """Divides a FASTQ into paired and single (orphan reads) as separate files. | |
3 | |
4 The input file should be a valid FASTQ file which has been sorted so that | |
5 any partner forward+reverse reads are consecutive. The output files all | |
6 preserve this sort order. Pairing are recognised based on standard name | |
7 suffices. See below or run the tool with no arguments for more details. | |
8 | |
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even | |
10 Color Space should all work equally well). | |
11 | |
12 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | |
13 See accompanying text file for licence details (MIT/BSD style). | |
14 | |
15 This is version 0.0.4 of the script. | |
16 """ | |
17 import os | |
18 import sys | |
19 import re | |
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
21 | |
22 def stop_err(msg, err=1): | |
23 sys.stderr.write(msg.rstrip() + "\n") | |
24 sys.exit(err) | |
25 | |
26 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. | |
27 | |
28 If you want two output files, use four arguments: | |
29 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) | |
30 - Sorted input FASTQ filename, | |
31 - Output paired FASTQ filename (forward then reverse interleaved), | |
32 - Output singles FASTQ filename (orphan reads) | |
33 | |
34 If you want three output files, use five arguments: | |
35 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) | |
36 - Sorted input FASTQ filename, | |
37 - Output forward paired FASTQ filename, | |
38 - Output reverse paired FASTQ filename, | |
39 - Output singles FASTQ filename (orphan reads) | |
40 | |
41 The input file should be a valid FASTQ file which has been sorted so that | |
42 any partner forward+reverse reads are consecutive. The output files all | |
43 preserve this sort order. | |
44 | |
45 Any reads where the forward/reverse naming suffix used is not recognised | |
46 are treated as orphan reads. The tool supports the /1 and /2 convention | |
47 used by Illumina, the .f and .r convention, and the Sanger convention | |
48 (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). | |
49 | |
50 Note that this does support multiple forward and reverse reads per template | |
51 (which is quite common with Sanger sequencing), e.g. this which is sorted | |
52 alphabetically: | |
53 | |
54 WTSI_1055_4p17.p1kapIBF | |
55 WTSI_1055_4p17.p1kpIBF | |
56 WTSI_1055_4p17.q1kapIBR | |
57 WTSI_1055_4p17.q1kpIBR | |
58 | |
59 or this where the reads already come in pairs: | |
60 | |
61 WTSI_1055_4p17.p1kapIBF | |
62 WTSI_1055_4p17.q1kapIBR | |
63 WTSI_1055_4p17.p1kpIBF | |
64 WTSI_1055_4p17.q1kpIBR | |
65 | |
66 both become: | |
67 | |
68 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR | |
69 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR | |
70 """ | |
71 | |
72 if len(sys.argv) == 5: | |
73 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] | |
74 elif len(sys.argv) == 6: | |
75 pairs_fastq = None | |
76 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] | |
77 else: | |
78 stop_err(msg) | |
79 | |
80 format = format.replace("fastq", "").lower() | |
81 if not format: | |
82 format="sanger" #safe default | |
83 elif format not in ["sanger","solexa","illumina","cssanger"]: | |
84 stop_err("Unrecognised format %s" % format) | |
85 | |
86 def f_match(name): | |
87 if name.endswith("/1") or name.endswith(".f"): | |
88 return True | |
89 | |
90 #Cope with three widely used suffix naming convensions, | |
91 #Illumina: /1 or /2 | |
92 #Forward/revered: .f or .r | |
93 #Sanger, e.g. .p1k and .q1k | |
94 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | |
95 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | |
96 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | |
97 | |
98 #assert re_f.match("demo/1") | |
99 assert re_f.search("demo.f") | |
100 assert re_f.search("demo.s1") | |
101 assert re_f.search("demo.f1k") | |
102 assert re_f.search("demo.p1") | |
103 assert re_f.search("demo.p1k") | |
104 assert re_f.search("demo.p1lk") | |
105 assert re_r.search("demo/2") | |
106 assert re_r.search("demo.r") | |
107 assert re_r.search("demo.q1") | |
108 assert re_r.search("demo.q1lk") | |
109 assert not re_r.search("demo/1") | |
110 assert not re_r.search("demo.f") | |
111 assert not re_r.search("demo.p") | |
112 assert not re_f.search("demo/2") | |
113 assert not re_f.search("demo.r") | |
114 assert not re_f.search("demo.q") | |
115 | |
116 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 | |
117 in_handle = open(input_fastq) | |
118 if pairs_fastq: | |
119 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) | |
120 pairs_r_writer = pairs_f_writer | |
121 else: | |
122 pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) | |
123 pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) | |
124 singles_writer = fastqWriter(open(singles_fastq, "w"), format) | |
125 last_template, buffered_reads = None, [] | |
126 | |
127 for record in fastqReader(in_handle, format): | |
128 count += 1 | |
129 name = record.identifier.split(None,1)[0] | |
130 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser | |
131 suffix = re_f.search(name) | |
132 if suffix: | |
133 #============ | |
134 #Forward read | |
135 #============ | |
136 template = name[:suffix.start()] | |
137 #print name, "forward", template | |
138 forward += 1 | |
139 if last_template == template: | |
140 buffered_reads.append(record) | |
141 else: | |
142 #Any old buffered reads are orphans | |
143 for old in buffered_reads: | |
144 singles_writer.write(old) | |
145 singles += 1 | |
146 #Save this read in buffer | |
147 buffered_reads = [record] | |
148 last_template = template | |
149 else: | |
150 suffix = re_r.search(name) | |
151 if suffix: | |
152 #============ | |
153 #Reverse read | |
154 #============ | |
155 template = name[:suffix.start()] | |
156 #print name, "reverse", template | |
157 reverse += 1 | |
158 if last_template == template and buffered_reads: | |
159 #We have a pair! | |
160 #If there are multiple buffered forward reads, want to pick | |
161 #the first one (although we could try and do something more | |
162 #clever looking at the suffix to match them up...) | |
163 old = buffered_reads.pop(0) | |
164 pairs_f_writer.write(old) | |
165 pairs_r_writer.write(record) | |
166 pairs += 2 | |
167 else: | |
168 #As this is a reverse read, this and any buffered read(s) are | |
169 #all orphans | |
170 for old in buffered_reads: | |
171 singles_writer.write(old) | |
172 singles += 1 | |
173 buffered_reads = [] | |
174 singles_writer.write(record) | |
175 singles += 1 | |
176 last_template = None | |
177 else: | |
178 #=========================== | |
179 #Neither forward nor reverse | |
180 #=========================== | |
181 singles_writer.write(record) | |
182 singles += 1 | |
183 neither += 1 | |
184 for old in buffered_reads: | |
185 singles_writer.write(old) | |
186 singles += 1 | |
187 buffered_reads = [] | |
188 last_template = None | |
189 if last_template: | |
190 #Left over singles... | |
191 for old in buffered_reads: | |
192 singles_writer.write(old) | |
193 singles += 1 | |
194 in_handle.close | |
195 singles_writer.close() | |
196 if pairs_fastq: | |
197 pairs_f_writer.close() | |
198 assert pairs_r_writer.file.closed | |
199 else: | |
200 pairs_f_writer.close() | |
201 pairs_r_writer.close() | |
202 | |
203 if neither: | |
204 print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ | |
205 % (count, forward, reverse, neither, pairs, singles) | |
206 else: | |
207 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ | |
208 % (count, forward, reverse, pairs, singles) | |
209 | |
210 assert count == pairs + singles == forward + reverse + neither, \ | |
211 "%i vs %i+%i=%i vs %i+%i=%i" \ | |
212 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) |