comparison tools/fastq/fastq_paired_unpaired.py @ 0:72e9fcaec61f

Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 17:21:17 -0400
parents
children 7ed81e36fc1c
comparison
equal deleted inserted replaced
-1:000000000000 0:72e9fcaec61f
1 #!/usr/bin/env python
2 """Divides a FASTQ into paired and single (orphan reads) as separate files.
3
4 The input file should be a valid FASTQ file which has been sorted so that
5 any partner forward+reverse reads are consecutive. The output files all
6 preserve this sort order. Pairing are recognised based on standard name
7 suffices. See below or run the tool with no arguments for more details.
8
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
10 Color Space should all work equally well).
11
12 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
13 See accompanying text file for licence details (MIT/BSD style).
14
15 This is version 0.0.4 of the script.
16 """
17 import os
18 import sys
19 import re
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
21
22 def stop_err(msg, err=1):
23 sys.stderr.write(msg.rstrip() + "\n")
24 sys.exit(err)
25
26 msg = """Expect either 3 or 4 arguments, all FASTQ filenames.
27
28 If you want two output files, use four arguments:
29 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
30 - Sorted input FASTQ filename,
31 - Output paired FASTQ filename (forward then reverse interleaved),
32 - Output singles FASTQ filename (orphan reads)
33
34 If you want three output files, use five arguments:
35 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
36 - Sorted input FASTQ filename,
37 - Output forward paired FASTQ filename,
38 - Output reverse paired FASTQ filename,
39 - Output singles FASTQ filename (orphan reads)
40
41 The input file should be a valid FASTQ file which has been sorted so that
42 any partner forward+reverse reads are consecutive. The output files all
43 preserve this sort order.
44
45 Any reads where the forward/reverse naming suffix used is not recognised
46 are treated as orphan reads. The tool supports the /1 and /2 convention
47 used by Illumina, the .f and .r convention, and the Sanger convention
48 (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details).
49
50 Note that this does support multiple forward and reverse reads per template
51 (which is quite common with Sanger sequencing), e.g. this which is sorted
52 alphabetically:
53
54 WTSI_1055_4p17.p1kapIBF
55 WTSI_1055_4p17.p1kpIBF
56 WTSI_1055_4p17.q1kapIBR
57 WTSI_1055_4p17.q1kpIBR
58
59 or this where the reads already come in pairs:
60
61 WTSI_1055_4p17.p1kapIBF
62 WTSI_1055_4p17.q1kapIBR
63 WTSI_1055_4p17.p1kpIBF
64 WTSI_1055_4p17.q1kpIBR
65
66 both become:
67
68 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
69 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
70 """
71
72 if len(sys.argv) == 5:
73 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:]
74 elif len(sys.argv) == 6:
75 pairs_fastq = None
76 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:]
77 else:
78 stop_err(msg)
79
80 format = format.replace("fastq", "").lower()
81 if not format:
82 format="sanger" #safe default
83 elif format not in ["sanger","solexa","illumina","cssanger"]:
84 stop_err("Unrecognised format %s" % format)
85
86 def f_match(name):
87 if name.endswith("/1") or name.endswith(".f"):
88 return True
89
90 #Cope with three widely used suffix naming convensions,
91 #Illumina: /1 or /2
92 #Forward/revered: .f or .r
93 #Sanger, e.g. .p1k and .q1k
94 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html
95 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
96 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
97
98 #assert re_f.match("demo/1")
99 assert re_f.search("demo.f")
100 assert re_f.search("demo.s1")
101 assert re_f.search("demo.f1k")
102 assert re_f.search("demo.p1")
103 assert re_f.search("demo.p1k")
104 assert re_f.search("demo.p1lk")
105 assert re_r.search("demo/2")
106 assert re_r.search("demo.r")
107 assert re_r.search("demo.q1")
108 assert re_r.search("demo.q1lk")
109 assert not re_r.search("demo/1")
110 assert not re_r.search("demo.f")
111 assert not re_r.search("demo.p")
112 assert not re_f.search("demo/2")
113 assert not re_f.search("demo.r")
114 assert not re_f.search("demo.q")
115
116 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
117 in_handle = open(input_fastq)
118 if pairs_fastq:
119 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
120 pairs_r_writer = pairs_f_writer
121 else:
122 pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
123 pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
124 singles_writer = fastqWriter(open(singles_fastq, "w"), format)
125 last_template, buffered_reads = None, []
126
127 for record in fastqReader(in_handle, format):
128 count += 1
129 name = record.identifier.split(None,1)[0]
130 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
131 suffix = re_f.search(name)
132 if suffix:
133 #============
134 #Forward read
135 #============
136 template = name[:suffix.start()]
137 #print name, "forward", template
138 forward += 1
139 if last_template == template:
140 buffered_reads.append(record)
141 else:
142 #Any old buffered reads are orphans
143 for old in buffered_reads:
144 singles_writer.write(old)
145 singles += 1
146 #Save this read in buffer
147 buffered_reads = [record]
148 last_template = template
149 else:
150 suffix = re_r.search(name)
151 if suffix:
152 #============
153 #Reverse read
154 #============
155 template = name[:suffix.start()]
156 #print name, "reverse", template
157 reverse += 1
158 if last_template == template and buffered_reads:
159 #We have a pair!
160 #If there are multiple buffered forward reads, want to pick
161 #the first one (although we could try and do something more
162 #clever looking at the suffix to match them up...)
163 old = buffered_reads.pop(0)
164 pairs_f_writer.write(old)
165 pairs_r_writer.write(record)
166 pairs += 2
167 else:
168 #As this is a reverse read, this and any buffered read(s) are
169 #all orphans
170 for old in buffered_reads:
171 singles_writer.write(old)
172 singles += 1
173 buffered_reads = []
174 singles_writer.write(record)
175 singles += 1
176 last_template = None
177 else:
178 #===========================
179 #Neither forward nor reverse
180 #===========================
181 singles_writer.write(record)
182 singles += 1
183 neither += 1
184 for old in buffered_reads:
185 singles_writer.write(old)
186 singles += 1
187 buffered_reads = []
188 last_template = None
189 if last_template:
190 #Left over singles...
191 for old in buffered_reads:
192 singles_writer.write(old)
193 singles += 1
194 in_handle.close
195 singles_writer.close()
196 if pairs_fastq:
197 pairs_f_writer.close()
198 assert pairs_r_writer.file.closed
199 else:
200 pairs_f_writer.close()
201 pairs_r_writer.close()
202
203 if neither:
204 print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \
205 % (count, forward, reverse, neither, pairs, singles)
206 else:
207 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \
208 % (count, forward, reverse, pairs, singles)
209
210 assert count == pairs + singles == forward + reverse + neither, \
211 "%i vs %i+%i=%i vs %i+%i=%i" \
212 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)