comparison tools/next_gen_conversion/fastq_gen_conv.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
3
4 usage: %prog [options]
5 -i, --input=i: Input FASTQ candidate file
6 -r, --origType=r: Original type
7 -a, --allOrNot=a: Whether or not to check all blocks
8 -b, --blocks=b: Number of blocks to check
9 -o, --output=o: Output file
10
11 usage: %prog input_file oroutput_file
12 """
13
14 import math, sys
15 from galaxy import eggs
16 import pkg_resources; pkg_resources.require( "bx-python" )
17 from bx.cookbook import doc_optparse
18
19 def stop_err( msg ):
20 sys.stderr.write( "%s\n" % msg )
21 sys.exit()
22
23 def all_bases_valid(seq):
24 """Confirm that the sequence contains only bases"""
25 valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
26 for base in seq:
27 if base not in valid_bases:
28 return False
29 return True
30
31 def __main__():
32 #Parse Command Line
33 options, args = doc_optparse.parse( __doc__ )
34 orig_type = options.origType
35 if orig_type == 'sanger' and options.allOrNot == 'not':
36 max_blocks = int(options.blocks)
37 else:
38 max_blocks = -1
39 fin = file(options.input, 'r')
40 fout = file(options.output, 'w')
41 range_min = 1000
42 range_max = -5
43 block_num = 0
44 bad_blocks = 0
45 base_len = -1
46 line_count = 0
47 lines = []
48 line = fin.readline()
49 while line:
50 if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
51 fout.write(line)
52 if line_count % 4 == 0:
53 block_num += 1
54 line_count += 1
55 elif line.strip():
56 # the line that starts a block, with a name
57 if line_count % 4 == 0 and line.startswith('@'):
58 lines.append(line)
59 else:
60 # if we expect a sequence of bases
61 if line_count % 4 == 1 and all_bases_valid(line.strip()):
62 lines.append(line)
63 base_len = len(line.strip())
64 # if we expect the second name line
65 elif line_count % 4 == 2 and line.startswith('+'):
66 lines.append(line)
67 # if we expect a sequence of qualities and it's the expected length
68 elif line_count % 4 == 3:
69 split_line = line.strip().split()
70 # decimal qualities
71 if len(split_line) == base_len:
72 # convert
73 phred_list = []
74 for ch in split_line:
75 int_ch = int(ch)
76 if int_ch < range_min:
77 range_min = int_ch
78 if int_ch > range_max:
79 range_max = int_ch
80 if int_ch >= 0 and int_ch <= 93:
81 phred_list.append(chr(int_ch + 33))
82 # make sure we haven't lost any quality values
83 if len(phred_list) == base_len:
84 # print first three lines
85 for l in lines:
86 fout.write(l)
87 # print converted quality line
88 fout.write(''.join(phred_list))
89 # reset
90 lines = []
91 base_len = -1
92 # abort if so
93 else:
94 bad_blocks += 1
95 lines = []
96 base_len = -1
97 # ascii qualities
98 elif len(split_line[0]) == base_len:
99 qualities = []
100 # print converted quality line
101 if orig_type == 'illumina':
102 for c in line.strip():
103 if ord(c) - 64 < range_min:
104 range_min = ord(c) - 64
105 if ord(c) - 64 > range_max:
106 range_max = ord(c) - 64
107 if ord(c) < 64 or ord(c) > 126:
108 bad_blocks += 1
109 base_len = -1
110 lines = []
111 break
112 else:
113 qualities.append( chr( ord(c) - 31 ) )
114 quals = ''.join(qualities)
115 elif orig_type == 'solexa':
116 for c in line.strip():
117 if ord(c) - 64 < range_min:
118 range_min = ord(c) - 64
119 if ord(c) - 64 > range_max:
120 range_max = ord(c) - 64
121 if ord(c) < 59 or ord(c) > 126:
122 bad_blocks += 1
123 base_len = -1
124 lines = []
125 break
126 else:
127 p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
128 qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
129 quals = ''.join(qualities)
130 else: # 'sanger'
131 for c in line.strip():
132 if ord(c) - 33 < range_min:
133 range_min = ord(c) - 33
134 if ord(c) - 33 > range_max:
135 range_max = ord(c) - 33
136 if ord(c) < 33 or ord(c) > 126:
137 bad_blocks += 1
138 base_len = -1
139 lines = []
140 break
141 else:
142 qualities.append(c)
143 quals = ''.join(qualities)
144 # make sure we don't have bad qualities
145 if len(quals) == base_len:
146 # print first three lines
147 for l in lines:
148 fout.write(l)
149 # print out quality line
150 fout.write(quals+'\n')
151 # reset
152 lines = []
153 base_len = -1
154 else:
155 bad_blocks += 1
156 base_len = -1
157 lines = []
158 # mark the successful end of a block
159 block_num += 1
160 line_count += 1
161 line = fin.readline()
162 fout.close()
163 fin.close()
164 if range_min != 1000 and range_min != -5:
165 outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
166 else:
167 outmsg = ''
168 if bad_blocks > 0:
169 outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
170 sys.stdout.write(outmsg)
171
172 if __name__=="__main__": __main__()