Mercurial > repos > xuebing > sharplabtool
comparison tools/next_gen_conversion/fastq_gen_conv.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 """ | |
2 Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary. | |
3 | |
4 usage: %prog [options] | |
5 -i, --input=i: Input FASTQ candidate file | |
6 -r, --origType=r: Original type | |
7 -a, --allOrNot=a: Whether or not to check all blocks | |
8 -b, --blocks=b: Number of blocks to check | |
9 -o, --output=o: Output file | |
10 | |
11 usage: %prog input_file oroutput_file | |
12 """ | |
13 | |
14 import math, sys | |
15 from galaxy import eggs | |
16 import pkg_resources; pkg_resources.require( "bx-python" ) | |
17 from bx.cookbook import doc_optparse | |
18 | |
19 def stop_err( msg ): | |
20 sys.stderr.write( "%s\n" % msg ) | |
21 sys.exit() | |
22 | |
23 def all_bases_valid(seq): | |
24 """Confirm that the sequence contains only bases""" | |
25 valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N'] | |
26 for base in seq: | |
27 if base not in valid_bases: | |
28 return False | |
29 return True | |
30 | |
31 def __main__(): | |
32 #Parse Command Line | |
33 options, args = doc_optparse.parse( __doc__ ) | |
34 orig_type = options.origType | |
35 if orig_type == 'sanger' and options.allOrNot == 'not': | |
36 max_blocks = int(options.blocks) | |
37 else: | |
38 max_blocks = -1 | |
39 fin = file(options.input, 'r') | |
40 fout = file(options.output, 'w') | |
41 range_min = 1000 | |
42 range_max = -5 | |
43 block_num = 0 | |
44 bad_blocks = 0 | |
45 base_len = -1 | |
46 line_count = 0 | |
47 lines = [] | |
48 line = fin.readline() | |
49 while line: | |
50 if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks: | |
51 fout.write(line) | |
52 if line_count % 4 == 0: | |
53 block_num += 1 | |
54 line_count += 1 | |
55 elif line.strip(): | |
56 # the line that starts a block, with a name | |
57 if line_count % 4 == 0 and line.startswith('@'): | |
58 lines.append(line) | |
59 else: | |
60 # if we expect a sequence of bases | |
61 if line_count % 4 == 1 and all_bases_valid(line.strip()): | |
62 lines.append(line) | |
63 base_len = len(line.strip()) | |
64 # if we expect the second name line | |
65 elif line_count % 4 == 2 and line.startswith('+'): | |
66 lines.append(line) | |
67 # if we expect a sequence of qualities and it's the expected length | |
68 elif line_count % 4 == 3: | |
69 split_line = line.strip().split() | |
70 # decimal qualities | |
71 if len(split_line) == base_len: | |
72 # convert | |
73 phred_list = [] | |
74 for ch in split_line: | |
75 int_ch = int(ch) | |
76 if int_ch < range_min: | |
77 range_min = int_ch | |
78 if int_ch > range_max: | |
79 range_max = int_ch | |
80 if int_ch >= 0 and int_ch <= 93: | |
81 phred_list.append(chr(int_ch + 33)) | |
82 # make sure we haven't lost any quality values | |
83 if len(phred_list) == base_len: | |
84 # print first three lines | |
85 for l in lines: | |
86 fout.write(l) | |
87 # print converted quality line | |
88 fout.write(''.join(phred_list)) | |
89 # reset | |
90 lines = [] | |
91 base_len = -1 | |
92 # abort if so | |
93 else: | |
94 bad_blocks += 1 | |
95 lines = [] | |
96 base_len = -1 | |
97 # ascii qualities | |
98 elif len(split_line[0]) == base_len: | |
99 qualities = [] | |
100 # print converted quality line | |
101 if orig_type == 'illumina': | |
102 for c in line.strip(): | |
103 if ord(c) - 64 < range_min: | |
104 range_min = ord(c) - 64 | |
105 if ord(c) - 64 > range_max: | |
106 range_max = ord(c) - 64 | |
107 if ord(c) < 64 or ord(c) > 126: | |
108 bad_blocks += 1 | |
109 base_len = -1 | |
110 lines = [] | |
111 break | |
112 else: | |
113 qualities.append( chr( ord(c) - 31 ) ) | |
114 quals = ''.join(qualities) | |
115 elif orig_type == 'solexa': | |
116 for c in line.strip(): | |
117 if ord(c) - 64 < range_min: | |
118 range_min = ord(c) - 64 | |
119 if ord(c) - 64 > range_max: | |
120 range_max = ord(c) - 64 | |
121 if ord(c) < 59 or ord(c) > 126: | |
122 bad_blocks += 1 | |
123 base_len = -1 | |
124 lines = [] | |
125 break | |
126 else: | |
127 p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) ) | |
128 qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) ) | |
129 quals = ''.join(qualities) | |
130 else: # 'sanger' | |
131 for c in line.strip(): | |
132 if ord(c) - 33 < range_min: | |
133 range_min = ord(c) - 33 | |
134 if ord(c) - 33 > range_max: | |
135 range_max = ord(c) - 33 | |
136 if ord(c) < 33 or ord(c) > 126: | |
137 bad_blocks += 1 | |
138 base_len = -1 | |
139 lines = [] | |
140 break | |
141 else: | |
142 qualities.append(c) | |
143 quals = ''.join(qualities) | |
144 # make sure we don't have bad qualities | |
145 if len(quals) == base_len: | |
146 # print first three lines | |
147 for l in lines: | |
148 fout.write(l) | |
149 # print out quality line | |
150 fout.write(quals+'\n') | |
151 # reset | |
152 lines = [] | |
153 base_len = -1 | |
154 else: | |
155 bad_blocks += 1 | |
156 base_len = -1 | |
157 lines = [] | |
158 # mark the successful end of a block | |
159 block_num += 1 | |
160 line_count += 1 | |
161 line = fin.readline() | |
162 fout.close() | |
163 fin.close() | |
164 if range_min != 1000 and range_min != -5: | |
165 outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max) | |
166 else: | |
167 outmsg = '' | |
168 if bad_blocks > 0: | |
169 outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks) | |
170 sys.stdout.write(outmsg) | |
171 | |
172 if __name__=="__main__": __main__() |