Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/FilterReads.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import sys | |
4 import os | |
5 import os.path | |
6 import re | |
7 | |
8 # Define the Dictionary & the Reverse Dictionary for barcode | |
9 | |
10 # =========================================== | |
11 # | |
12 def FilterFastq (infile, outfile) : | |
13 try: | |
14 INFILE = open(infile, 'r') | |
15 except IOError: | |
16 print "\n[Error]:\n\t File cannot be open: ", infile; | |
17 exit(-1) | |
18 try: | |
19 OUTFILE = open(outfile, 'w') | |
20 except IOError: | |
21 print "\n[Error]:\n\t File cannot be open: ", outfile; | |
22 exit(-1) | |
23 Dict = {} | |
24 count = 0 | |
25 total = 0 | |
26 while 1 : | |
27 line1 = INFILE.readline() | |
28 if not line1 : | |
29 break | |
30 line1 = line1 | |
31 line2 = INFILE.readline() | |
32 line3 = INFILE.readline() | |
33 line4 = INFILE.readline() | |
34 total += 1 | |
35 # print line2 | |
36 if ( line2 in Dict ) : | |
37 Dict[line2] += 1 | |
38 else : | |
39 Dict[line2] = 1 | |
40 OUTFILE.write(line1) | |
41 OUTFILE.write(line2) | |
42 OUTFILE.write(line3) | |
43 OUTFILE.write(line4) | |
44 count += 1 | |
45 INFILE.close() | |
46 OUTFILE.close() | |
47 print "Count of total raw reads: ", total | |
48 print "Count of reads left: ", count | |
49 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | |
50 | |
51 | |
52 # =============================== | |
53 # | |
54 | |
55 def FilterSequence (infile, outfile) : | |
56 try: | |
57 INFILE = open(infile, 'r') | |
58 except IOError: | |
59 print "\n[Error]:\n\t File cannot be open: ", infile; | |
60 exit(-1) | |
61 try: | |
62 OUTFILE = open(outfile, 'w') | |
63 except IOError: | |
64 print "\n[Error]:\n\t File cannot be open: ", outfile; | |
65 exit(-1) | |
66 Dict = {} | |
67 count = 0 | |
68 total = 0 | |
69 for line in INFILE: | |
70 line = line.strip() | |
71 total += 1 | |
72 if ( line in Dict ) : | |
73 Dict[line] += 1 | |
74 else : | |
75 Dict[line] = 1 | |
76 count += 1 | |
77 OUTFILE.write(line + "\n") | |
78 INFILE.close() | |
79 OUTFILE.close() | |
80 print "Count of total raw reads: ", total | |
81 print "Count of reads left: ", count | |
82 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | |
83 | |
84 # =============================== | |
85 # SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. @SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0 | |
86 # | |
87 def FilterQseq (infile, outfile, keepquality) : | |
88 if keepquality : | |
89 print "User specified '-k' and read quality will not be considered" | |
90 else : | |
91 print "Reads with PF=0 will be filtered" | |
92 | |
93 try: | |
94 INFILE = open(infile, 'r') | |
95 except IOError: | |
96 print "\n[Error]:\n\t File cannot be open: ", infile; | |
97 exit(-1) | |
98 try: | |
99 OUTFILE = open(outfile, 'w') | |
100 except IOError: | |
101 print "\n[Error]:\n\t File cannot be open: ", outfile; | |
102 exit(-1) | |
103 Dict = {} | |
104 count = 0 | |
105 total = 0 | |
106 for line in INFILE: | |
107 tokens = line.strip().split() | |
108 total += 1 | |
109 if ( (not keepquality and tokens[10] == "1") or keepquality ) : | |
110 if ( tokens[8] in Dict ) : | |
111 Dict[tokens[8]] += 1 | |
112 else : | |
113 Dict[tokens[8]] = 1 | |
114 count += 1 | |
115 OUTFILE.write(line) | |
116 | |
117 INFILE.close() | |
118 OUTFILE.close() | |
119 print "Count of total raw reads: ", total | |
120 print "Count of reads left: ", count | |
121 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | |
122 | |
123 | |
124 # =============================== | |
125 # | |
126 | |
127 def FilterFasta (infile, outfile) : | |
128 try: | |
129 INFILE = open(infile, 'r') | |
130 except IOError: | |
131 print "\n[Error]:\n\t File cannot be open: ", infile; | |
132 exit(-1) | |
133 try: | |
134 OUTFILE = open(outfile, 'w') | |
135 except IOError: | |
136 print "\n[Error]:\n\t File cannot be open: ", outfile; | |
137 exit(-1) | |
138 Dict = {} | |
139 count = 0 | |
140 total = 0 | |
141 name = "" | |
142 read = "" | |
143 for line in INFILE: | |
144 line = line.strip() | |
145 if (line[0] == '>') : | |
146 total += 1 | |
147 if ( name != "" ) : | |
148 if ( read in Dict ) : | |
149 Dict[read] += 1 | |
150 else : | |
151 Dict[read] = 1 | |
152 count += 1 | |
153 OUTFILE.write(">" + name + "\n") | |
154 OUTFILE.write(read + "\n") | |
155 name = line[1:] | |
156 read = "" | |
157 else : | |
158 read = read + line | |
159 if (name != "") : | |
160 if ( read in Dict ) : | |
161 Dict[read] += 1 | |
162 else : | |
163 Dict[read] = 1 | |
164 count += 1 | |
165 OUTFILE.write(">" + name + "\n") | |
166 OUTFILE.write(read + "\n") | |
167 print "Count of total raw reads: ", total | |
168 print "Count of reads left: ", count | |
169 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | |
170 | |
171 | |
172 | |
173 # ============================== | |
174 # Decide branch functions for different format | |
175 | |
176 def FilterReads (infile, outfile, keepquality): | |
177 try: | |
178 INFILE = open(infile, 'r') | |
179 except IOError: | |
180 print "\n[Error]:\n\t File cannot be open: ", infile; | |
181 exit(-1) | |
182 i = 0 | |
183 # Count the numbers of barcodes in first 10000 lines | |
184 line = INFILE.readline() | |
185 tokens = line.strip().split() | |
186 input_format=""; | |
187 if line[0]=="@" : | |
188 input_format = "FastQ" | |
189 n_fastq = 0 | |
190 elif len(tokens) == 1 and line[0] != ">": | |
191 input_format = "sequence" | |
192 elif len(tokens) == 11: | |
193 input_format = "qseq" | |
194 elif line[0]==">" : | |
195 input_format = "fasta" | |
196 INFILE.close() | |
197 | |
198 print "Input file format detected: ", input_format | |
199 | |
200 if input_format == "FastQ" : | |
201 FilterFastq(infile, outfile) | |
202 elif input_format == "sequence" : | |
203 FilterSequence(infile, outfile) | |
204 elif input_format == "qseq" : | |
205 FilterQseq(infile, outfile, keepquality) | |
206 elif input_format == "fasta" : | |
207 FilterFasta(infile, outfile) | |
208 | |
209 | |
210 from optparse import OptionParser | |
211 | |
212 # =========================================== | |
213 def main(): | |
214 usage = "Usage: %prog -i <input> -o <output> [-k]\n" \ | |
215 "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \ | |
216 "Last Update: 2013-04-01" \ | |
217 "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \ | |
218 " and filter low quality reads in qseq file." | |
219 parser = OptionParser(usage) | |
220 parser.add_option("-i", dest="infile", | |
221 help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE") | |
222 parser.add_option("-o", dest="outfile", | |
223 help="Name of the output file", metavar="FILE") | |
224 parser.add_option("-k", dest="keepquality", default = False, action = "store_true", | |
225 help="Would not filter low quality reads if specified") | |
226 (options, args) = parser.parse_args() | |
227 | |
228 if (options.infile is None) or (options.outfile is None) : | |
229 print parser.print_help() | |
230 exit(-1) | |
231 FilterReads(options.infile, options.outfile, options.keepquality) | |
232 | |
233 | |
234 # =========================================== | |
235 if __name__ == "__main__": | |
236 main() | |
237 |