1
|
1 #!/usr/bin/env python
|
0
|
2
|
|
3 import sys
|
|
4 import os
|
|
5 import os.path
|
|
6 import re
|
|
7
|
|
8 # Define the Dictionary & the Reverse Dictionary for barcode
|
|
9
|
|
10 # ===========================================
|
|
11 #
|
|
12 def FilterFastq (infile, outfile) :
|
|
13 try:
|
|
14 INFILE = open(infile, 'r')
|
|
15 except IOError:
|
|
16 print "\n[Error]:\n\t File cannot be open: ", infile;
|
|
17 exit(-1)
|
|
18 try:
|
|
19 OUTFILE = open(outfile, 'w')
|
|
20 except IOError:
|
|
21 print "\n[Error]:\n\t File cannot be open: ", outfile;
|
|
22 exit(-1)
|
|
23 Dict = {}
|
|
24 count = 0
|
|
25 total = 0
|
|
26 while 1 :
|
|
27 line1 = INFILE.readline()
|
|
28 if not line1 :
|
|
29 break
|
|
30 line1 = line1
|
|
31 line2 = INFILE.readline()
|
|
32 line3 = INFILE.readline()
|
|
33 line4 = INFILE.readline()
|
|
34 total += 1
|
|
35 # print line2
|
|
36 if ( line2 in Dict ) :
|
|
37 Dict[line2] += 1
|
|
38 else :
|
|
39 Dict[line2] = 1
|
|
40 OUTFILE.write(line1)
|
|
41 OUTFILE.write(line2)
|
|
42 OUTFILE.write(line3)
|
|
43 OUTFILE.write(line4)
|
|
44 count += 1
|
|
45 INFILE.close()
|
|
46 OUTFILE.close()
|
|
47 print "Count of total raw reads: ", total
|
|
48 print "Count of reads left: ", count
|
|
49 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
|
|
50
|
|
51
|
|
52 # ===============================
|
|
53 #
|
|
54
|
|
55 def FilterSequence (infile, outfile) :
|
|
56 try:
|
|
57 INFILE = open(infile, 'r')
|
|
58 except IOError:
|
|
59 print "\n[Error]:\n\t File cannot be open: ", infile;
|
|
60 exit(-1)
|
|
61 try:
|
|
62 OUTFILE = open(outfile, 'w')
|
|
63 except IOError:
|
|
64 print "\n[Error]:\n\t File cannot be open: ", outfile;
|
|
65 exit(-1)
|
|
66 Dict = {}
|
|
67 count = 0
|
|
68 total = 0
|
|
69 for line in INFILE:
|
|
70 line = line.strip()
|
|
71 total += 1
|
|
72 if ( line in Dict ) :
|
|
73 Dict[line] += 1
|
|
74 else :
|
|
75 Dict[line] = 1
|
|
76 count += 1
|
|
77 OUTFILE.write(line + "\n")
|
|
78 INFILE.close()
|
|
79 OUTFILE.close()
|
|
80 print "Count of total raw reads: ", total
|
|
81 print "Count of reads left: ", count
|
|
82 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
|
|
83
|
|
84 # ===============================
|
|
85 # SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. @SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0
|
|
86 #
|
|
87 def FilterQseq (infile, outfile, keepquality) :
|
|
88 if keepquality :
|
|
89 print "User specified '-k' and read quality will not be considered"
|
|
90 else :
|
|
91 print "Reads with PF=0 will be filtered"
|
|
92
|
|
93 try:
|
|
94 INFILE = open(infile, 'r')
|
|
95 except IOError:
|
|
96 print "\n[Error]:\n\t File cannot be open: ", infile;
|
|
97 exit(-1)
|
|
98 try:
|
|
99 OUTFILE = open(outfile, 'w')
|
|
100 except IOError:
|
|
101 print "\n[Error]:\n\t File cannot be open: ", outfile;
|
|
102 exit(-1)
|
|
103 Dict = {}
|
|
104 count = 0
|
|
105 total = 0
|
|
106 for line in INFILE:
|
|
107 tokens = line.strip().split()
|
|
108 total += 1
|
|
109 if ( (not keepquality and tokens[10] == "1") or keepquality ) :
|
|
110 if ( tokens[8] in Dict ) :
|
|
111 Dict[tokens[8]] += 1
|
|
112 else :
|
|
113 Dict[tokens[8]] = 1
|
|
114 count += 1
|
|
115 OUTFILE.write(line)
|
|
116
|
|
117 INFILE.close()
|
|
118 OUTFILE.close()
|
|
119 print "Count of total raw reads: ", total
|
|
120 print "Count of reads left: ", count
|
|
121 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
|
|
122
|
|
123
|
|
124 # ===============================
|
|
125 #
|
|
126
|
|
127 def FilterFasta (infile, outfile) :
|
|
128 try:
|
|
129 INFILE = open(infile, 'r')
|
|
130 except IOError:
|
|
131 print "\n[Error]:\n\t File cannot be open: ", infile;
|
|
132 exit(-1)
|
|
133 try:
|
|
134 OUTFILE = open(outfile, 'w')
|
|
135 except IOError:
|
|
136 print "\n[Error]:\n\t File cannot be open: ", outfile;
|
|
137 exit(-1)
|
|
138 Dict = {}
|
|
139 count = 0
|
|
140 total = 0
|
|
141 name = ""
|
|
142 read = ""
|
|
143 for line in INFILE:
|
|
144 line = line.strip()
|
|
145 if (line[0] == '>') :
|
|
146 total += 1
|
|
147 if ( name != "" ) :
|
|
148 if ( read in Dict ) :
|
|
149 Dict[read] += 1
|
|
150 else :
|
|
151 Dict[read] = 1
|
|
152 count += 1
|
|
153 OUTFILE.write(">" + name + "\n")
|
|
154 OUTFILE.write(read + "\n")
|
|
155 name = line[1:]
|
|
156 read = ""
|
|
157 else :
|
|
158 read = read + line
|
|
159 if (name != "") :
|
|
160 if ( read in Dict ) :
|
|
161 Dict[read] += 1
|
|
162 else :
|
|
163 Dict[read] = 1
|
|
164 count += 1
|
|
165 OUTFILE.write(">" + name + "\n")
|
|
166 OUTFILE.write(read + "\n")
|
|
167 print "Count of total raw reads: ", total
|
|
168 print "Count of reads left: ", count
|
|
169 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
|
|
170
|
|
171
|
|
172
|
|
173 # ==============================
|
|
174 # Decide branch functions for different format
|
|
175
|
|
176 def FilterReads (infile, outfile, keepquality):
|
|
177 try:
|
|
178 INFILE = open(infile, 'r')
|
|
179 except IOError:
|
|
180 print "\n[Error]:\n\t File cannot be open: ", infile;
|
|
181 exit(-1)
|
|
182 i = 0
|
|
183 # Count the numbers of barcodes in first 10000 lines
|
|
184 line = INFILE.readline()
|
|
185 tokens = line.strip().split()
|
|
186 input_format="";
|
|
187 if line[0]=="@" :
|
|
188 input_format = "FastQ"
|
|
189 n_fastq = 0
|
|
190 elif len(tokens) == 1 and line[0] != ">":
|
|
191 input_format = "sequence"
|
|
192 elif len(tokens) == 11:
|
|
193 input_format = "qseq"
|
|
194 elif line[0]==">" :
|
|
195 input_format = "fasta"
|
|
196 INFILE.close()
|
|
197
|
|
198 print "Input file format detected: ", input_format
|
|
199
|
|
200 if input_format == "FastQ" :
|
|
201 FilterFastq(infile, outfile)
|
|
202 elif input_format == "sequence" :
|
|
203 FilterSequence(infile, outfile)
|
|
204 elif input_format == "qseq" :
|
|
205 FilterQseq(infile, outfile, keepquality)
|
|
206 elif input_format == "fasta" :
|
|
207 FilterFasta(infile, outfile)
|
|
208
|
|
209
|
|
210 from optparse import OptionParser
|
|
211
|
|
212 # ===========================================
|
|
213 def main():
|
|
214 usage = "Usage: %prog -i <input> -o <output> [-k]\n" \
|
|
215 "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \
|
|
216 "Last Update: 2013-04-01" \
|
|
217 "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \
|
|
218 " and filter low quality reads in qseq file."
|
|
219 parser = OptionParser(usage)
|
|
220 parser.add_option("-i", dest="infile",
|
|
221 help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE")
|
|
222 parser.add_option("-o", dest="outfile",
|
|
223 help="Name of the output file", metavar="FILE")
|
|
224 parser.add_option("-k", dest="keepquality", default = False, action = "store_true",
|
|
225 help="Would not filter low quality reads if specified")
|
|
226 (options, args) = parser.parse_args()
|
|
227
|
|
228 if (options.infile is None) or (options.outfile is None) :
|
1
|
229 parser.print_help()
|
0
|
230 exit(-1)
|
|
231 FilterReads(options.infile, options.outfile, options.keepquality)
|
|
232
|
|
233
|
|
234 # ===========================================
|
|
235 if __name__ == "__main__":
|
|
236 main()
|
|
237
|