Mercurial > repos > peterjc > fastq_paired_unpaired
comparison tools/fastq/fastq_paired_unpaired.py @ 1:7ed81e36fc1c
Uploaded v0.0.5 which handles Illumina 1.8 style pair naming.
author | peterjc |
---|---|
date | Mon, 12 Dec 2011 11:33:10 -0500 |
parents | 72e9fcaec61f |
children | 95a632a71951 |
comparison
equal
deleted
inserted
replaced
0:72e9fcaec61f | 1:7ed81e36fc1c |
---|---|
7 suffices. See below or run the tool with no arguments for more details. | 7 suffices. See below or run the tool with no arguments for more details. |
8 | 8 |
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even | 9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even |
10 Color Space should all work equally well). | 10 Color Space should all work equally well). |
11 | 11 |
12 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | 12 This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute |
13 (formerly SCRI), Scotland, UK. All rights reserved. | |
14 | |
13 See accompanying text file for licence details (MIT/BSD style). | 15 See accompanying text file for licence details (MIT/BSD style). |
14 | |
15 This is version 0.0.4 of the script. | |
16 """ | 16 """ |
17 import os | 17 import os |
18 import sys | 18 import sys |
19 import re | 19 import re |
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
21 | |
22 if "--version" in sys.argv[1:]: | |
23 print "Version 0.0.5" | |
24 sys.exit(0) | |
21 | 25 |
22 def stop_err(msg, err=1): | 26 def stop_err(msg, err=1): |
23 sys.stderr.write(msg.rstrip() + "\n") | 27 sys.stderr.write(msg.rstrip() + "\n") |
24 sys.exit(err) | 28 sys.exit(err) |
25 | 29 |
42 any partner forward+reverse reads are consecutive. The output files all | 46 any partner forward+reverse reads are consecutive. The output files all |
43 preserve this sort order. | 47 preserve this sort order. |
44 | 48 |
45 Any reads where the forward/reverse naming suffix used is not recognised | 49 Any reads where the forward/reverse naming suffix used is not recognised |
46 are treated as orphan reads. The tool supports the /1 and /2 convention | 50 are treated as orphan reads. The tool supports the /1 and /2 convention |
47 used by Illumina, the .f and .r convention, and the Sanger convention | 51 originally used by Illumina, the .f and .r convention, and the Sanger |
48 (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). | 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html |
53 for details), and the new Illumina convention where the reads have the | |
54 same identifier with the fragment at the start of the description, e.g. | |
55 | |
56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA | |
57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA | |
49 | 58 |
50 Note that this does support multiple forward and reverse reads per template | 59 Note that this does support multiple forward and reverse reads per template |
51 (which is quite common with Sanger sequencing), e.g. this which is sorted | 60 (which is quite common with Sanger sequencing), e.g. this which is sorted |
52 alphabetically: | 61 alphabetically: |
53 | 62 |
111 assert not re_r.search("demo.p") | 120 assert not re_r.search("demo.p") |
112 assert not re_f.search("demo/2") | 121 assert not re_f.search("demo/2") |
113 assert not re_f.search("demo.r") | 122 assert not re_f.search("demo.r") |
114 assert not re_f.search("demo.q") | 123 assert not re_f.search("demo.q") |
115 | 124 |
125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") | |
126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") | |
127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") | |
130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") | |
131 | |
132 | |
116 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 | 133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 |
117 in_handle = open(input_fastq) | 134 in_handle = open(input_fastq) |
118 if pairs_fastq: | 135 if pairs_fastq: |
119 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) | 136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) |
120 pairs_r_writer = pairs_f_writer | 137 pairs_r_writer = pairs_f_writer |
126 | 143 |
127 for record in fastqReader(in_handle, format): | 144 for record in fastqReader(in_handle, format): |
128 count += 1 | 145 count += 1 |
129 name = record.identifier.split(None,1)[0] | 146 name = record.identifier.split(None,1)[0] |
130 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser | 147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser |
148 is_forward = False | |
131 suffix = re_f.search(name) | 149 suffix = re_f.search(name) |
132 if suffix: | 150 if suffix: |
133 #============ | 151 #============ |
134 #Forward read | 152 #Forward read |
135 #============ | 153 #============ |
136 template = name[:suffix.start()] | 154 template = name[:suffix.start()] |
155 is_forward = True | |
156 elif re_illumina_f.match(record.identifier): | |
157 template = name #No suffix | |
158 is_forward = True | |
159 if is_forward: | |
137 #print name, "forward", template | 160 #print name, "forward", template |
138 forward += 1 | 161 forward += 1 |
139 if last_template == template: | 162 if last_template == template: |
140 buffered_reads.append(record) | 163 buffered_reads.append(record) |
141 else: | 164 else: |
143 for old in buffered_reads: | 166 for old in buffered_reads: |
144 singles_writer.write(old) | 167 singles_writer.write(old) |
145 singles += 1 | 168 singles += 1 |
146 #Save this read in buffer | 169 #Save this read in buffer |
147 buffered_reads = [record] | 170 buffered_reads = [record] |
148 last_template = template | 171 last_template = template |
149 else: | 172 else: |
173 is_reverse = False | |
150 suffix = re_r.search(name) | 174 suffix = re_r.search(name) |
151 if suffix: | 175 if suffix: |
152 #============ | 176 #============ |
153 #Reverse read | 177 #Reverse read |
154 #============ | 178 #============ |
155 template = name[:suffix.start()] | 179 template = name[:suffix.start()] |
180 is_reverse = True | |
181 elif re_illumina_r.match(record.identifier): | |
182 template = name #No suffix | |
183 is_reverse = True | |
184 if is_reverse: | |
156 #print name, "reverse", template | 185 #print name, "reverse", template |
157 reverse += 1 | 186 reverse += 1 |
158 if last_template == template and buffered_reads: | 187 if last_template == template and buffered_reads: |
159 #We have a pair! | 188 #We have a pair! |
160 #If there are multiple buffered forward reads, want to pick | 189 #If there are multiple buffered forward reads, want to pick |
206 else: | 235 else: |
207 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ | 236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ |
208 % (count, forward, reverse, pairs, singles) | 237 % (count, forward, reverse, pairs, singles) |
209 | 238 |
210 assert count == pairs + singles == forward + reverse + neither, \ | 239 assert count == pairs + singles == forward + reverse + neither, \ |
211 "%i vs %i+%i=%i vs %i+%i=%i" \ | 240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ |
212 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) | 241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) |