annotate tools/next_gen_conversion/fastq_gen_conv.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 usage: %prog [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 -i, --input=i: Input FASTQ candidate file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 -r, --origType=r: Original type
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 -a, --allOrNot=a: Whether or not to check all blocks
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 -b, --blocks=b: Number of blocks to check
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 -o, --output=o: Output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 usage: %prog input_file oroutput_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 import math, sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 import pkg_resources; pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 from bx.cookbook import doc_optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 sys.stderr.write( "%s\n" % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 def all_bases_valid(seq):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 """Confirm that the sequence contains only bases"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 for base in seq:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 if base not in valid_bases:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 return False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 options, args = doc_optparse.parse( __doc__ )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 orig_type = options.origType
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 if orig_type == 'sanger' and options.allOrNot == 'not':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 max_blocks = int(options.blocks)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 max_blocks = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 fin = file(options.input, 'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 fout = file(options.output, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 range_min = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 range_max = -5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 block_num = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 bad_blocks = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 line_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 line = fin.readline()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 while line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 fout.write(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 if line_count % 4 == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 block_num += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 line_count += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 elif line.strip():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 # the line that starts a block, with a name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 if line_count % 4 == 0 and line.startswith('@'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 lines.append(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 # if we expect a sequence of bases
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 if line_count % 4 == 1 and all_bases_valid(line.strip()):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 lines.append(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 base_len = len(line.strip())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 # if we expect the second name line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 elif line_count % 4 == 2 and line.startswith('+'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 lines.append(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 # if we expect a sequence of qualities and it's the expected length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 elif line_count % 4 == 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 split_line = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 # decimal qualities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 if len(split_line) == base_len:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 # convert
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 phred_list = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 for ch in split_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 int_ch = int(ch)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 if int_ch < range_min:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 range_min = int_ch
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 if int_ch > range_max:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 range_max = int_ch
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 if int_ch >= 0 and int_ch <= 93:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 phred_list.append(chr(int_ch + 33))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 # make sure we haven't lost any quality values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 if len(phred_list) == base_len:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 # print first three lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 for l in lines:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 fout.write(l)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 # print converted quality line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 fout.write(''.join(phred_list))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 # reset
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 # abort if so
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 bad_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 # ascii qualities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 elif len(split_line[0]) == base_len:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 qualities = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 # print converted quality line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 if orig_type == 'illumina':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 for c in line.strip():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 if ord(c) - 64 < range_min:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 range_min = ord(c) - 64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 if ord(c) - 64 > range_max:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 range_max = ord(c) - 64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 if ord(c) < 64 or ord(c) > 126:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 bad_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 qualities.append( chr( ord(c) - 31 ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 quals = ''.join(qualities)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 elif orig_type == 'solexa':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 for c in line.strip():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if ord(c) - 64 < range_min:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 range_min = ord(c) - 64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 if ord(c) - 64 > range_max:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 range_max = ord(c) - 64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 if ord(c) < 59 or ord(c) > 126:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 bad_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 quals = ''.join(qualities)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 else: # 'sanger'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 for c in line.strip():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if ord(c) - 33 < range_min:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 range_min = ord(c) - 33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 if ord(c) - 33 > range_max:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 range_max = ord(c) - 33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 if ord(c) < 33 or ord(c) > 126:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 bad_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 qualities.append(c)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 quals = ''.join(qualities)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 # make sure we don't have bad qualities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 if len(quals) == base_len:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 # print first three lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 for l in lines:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 fout.write(l)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 # print out quality line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 fout.write(quals+'\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 # reset
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 bad_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 base_len = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 # mark the successful end of a block
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 block_num += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 line_count += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 line = fin.readline()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 fout.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 fin.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 if range_min != 1000 and range_min != -5:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 outmsg = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 if bad_blocks > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 sys.stdout.write(outmsg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 if __name__=="__main__": __main__()