comparison tools/fastq/fastq_paired_unpaired.py @ 1:7ed81e36fc1c

Uploaded v0.0.5 which handles Illumina 1.8 style pair naming.
author peterjc
date Mon, 12 Dec 2011 11:33:10 -0500
parents 72e9fcaec61f
children 95a632a71951
comparison
equal deleted inserted replaced
0:72e9fcaec61f 1:7ed81e36fc1c
7 suffices. See below or run the tool with no arguments for more details. 7 suffices. See below or run the tool with no arguments for more details.
8 8
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even 9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
10 Color Space should all work equally well). 10 Color Space should all work equally well).
11 11
12 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. 12 This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute
13 (formerly SCRI), Scotland, UK. All rights reserved.
14
13 See accompanying text file for licence details (MIT/BSD style). 15 See accompanying text file for licence details (MIT/BSD style).
14
15 This is version 0.0.4 of the script.
16 """ 16 """
17 import os 17 import os
18 import sys 18 import sys
19 import re 19 import re
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter 20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
21
22 if "--version" in sys.argv[1:]:
23 print "Version 0.0.5"
24 sys.exit(0)
21 25
22 def stop_err(msg, err=1): 26 def stop_err(msg, err=1):
23 sys.stderr.write(msg.rstrip() + "\n") 27 sys.stderr.write(msg.rstrip() + "\n")
24 sys.exit(err) 28 sys.exit(err)
25 29
42 any partner forward+reverse reads are consecutive. The output files all 46 any partner forward+reverse reads are consecutive. The output files all
43 preserve this sort order. 47 preserve this sort order.
44 48
45 Any reads where the forward/reverse naming suffix used is not recognised 49 Any reads where the forward/reverse naming suffix used is not recognised
46 are treated as orphan reads. The tool supports the /1 and /2 convention 50 are treated as orphan reads. The tool supports the /1 and /2 convention
47 used by Illumina, the .f and .r convention, and the Sanger convention 51 originally used by Illumina, the .f and .r convention, and the Sanger
48 (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html
53 for details), and the new Illumina convention where the reads have the
54 same identifier with the fragment at the start of the description, e.g.
55
56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA
49 58
50 Note that this does support multiple forward and reverse reads per template 59 Note that this does support multiple forward and reverse reads per template
51 (which is quite common with Sanger sequencing), e.g. this which is sorted 60 (which is quite common with Sanger sequencing), e.g. this which is sorted
52 alphabetically: 61 alphabetically:
53 62
111 assert not re_r.search("demo.p") 120 assert not re_r.search("demo.p")
112 assert not re_f.search("demo/2") 121 assert not re_f.search("demo/2")
113 assert not re_f.search("demo.r") 122 assert not re_f.search("demo.r")
114 assert not re_f.search("demo.q") 123 assert not re_f.search("demo.q")
115 124
125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
131
132
116 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
117 in_handle = open(input_fastq) 134 in_handle = open(input_fastq)
118 if pairs_fastq: 135 if pairs_fastq:
119 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) 136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
120 pairs_r_writer = pairs_f_writer 137 pairs_r_writer = pairs_f_writer
126 143
127 for record in fastqReader(in_handle, format): 144 for record in fastqReader(in_handle, format):
128 count += 1 145 count += 1
129 name = record.identifier.split(None,1)[0] 146 name = record.identifier.split(None,1)[0]
130 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser 147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
148 is_forward = False
131 suffix = re_f.search(name) 149 suffix = re_f.search(name)
132 if suffix: 150 if suffix:
133 #============ 151 #============
134 #Forward read 152 #Forward read
135 #============ 153 #============
136 template = name[:suffix.start()] 154 template = name[:suffix.start()]
155 is_forward = True
156 elif re_illumina_f.match(record.identifier):
157 template = name #No suffix
158 is_forward = True
159 if is_forward:
137 #print name, "forward", template 160 #print name, "forward", template
138 forward += 1 161 forward += 1
139 if last_template == template: 162 if last_template == template:
140 buffered_reads.append(record) 163 buffered_reads.append(record)
141 else: 164 else:
143 for old in buffered_reads: 166 for old in buffered_reads:
144 singles_writer.write(old) 167 singles_writer.write(old)
145 singles += 1 168 singles += 1
146 #Save this read in buffer 169 #Save this read in buffer
147 buffered_reads = [record] 170 buffered_reads = [record]
148 last_template = template 171 last_template = template
149 else: 172 else:
173 is_reverse = False
150 suffix = re_r.search(name) 174 suffix = re_r.search(name)
151 if suffix: 175 if suffix:
152 #============ 176 #============
153 #Reverse read 177 #Reverse read
154 #============ 178 #============
155 template = name[:suffix.start()] 179 template = name[:suffix.start()]
180 is_reverse = True
181 elif re_illumina_r.match(record.identifier):
182 template = name #No suffix
183 is_reverse = True
184 if is_reverse:
156 #print name, "reverse", template 185 #print name, "reverse", template
157 reverse += 1 186 reverse += 1
158 if last_template == template and buffered_reads: 187 if last_template == template and buffered_reads:
159 #We have a pair! 188 #We have a pair!
160 #If there are multiple buffered forward reads, want to pick 189 #If there are multiple buffered forward reads, want to pick
206 else: 235 else:
207 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ 236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \
208 % (count, forward, reverse, pairs, singles) 237 % (count, forward, reverse, pairs, singles)
209 238
210 assert count == pairs + singles == forward + reverse + neither, \ 239 assert count == pairs + singles == forward + reverse + neither, \
211 "%i vs %i+%i=%i vs %i+%i=%i" \ 240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \
212 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) 241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)