0
|
1 """
|
|
2 Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
|
|
3
|
|
4 usage: %prog [options]
|
|
5 -i, --input=i: Input FASTQ candidate file
|
|
6 -r, --origType=r: Original type
|
|
7 -a, --allOrNot=a: Whether or not to check all blocks
|
|
8 -b, --blocks=b: Number of blocks to check
|
|
9 -o, --output=o: Output file
|
|
10
|
|
11 usage: %prog input_file oroutput_file
|
|
12 """
|
|
13
|
|
14 import math, sys
|
|
15 from galaxy import eggs
|
|
16 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
17 from bx.cookbook import doc_optparse
|
|
18
|
|
19 def stop_err( msg ):
|
|
20 sys.stderr.write( "%s\n" % msg )
|
|
21 sys.exit()
|
|
22
|
|
23 def all_bases_valid(seq):
|
|
24 """Confirm that the sequence contains only bases"""
|
|
25 valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
|
|
26 for base in seq:
|
|
27 if base not in valid_bases:
|
|
28 return False
|
|
29 return True
|
|
30
|
|
31 def __main__():
|
|
32 #Parse Command Line
|
|
33 options, args = doc_optparse.parse( __doc__ )
|
|
34 orig_type = options.origType
|
|
35 if orig_type == 'sanger' and options.allOrNot == 'not':
|
|
36 max_blocks = int(options.blocks)
|
|
37 else:
|
|
38 max_blocks = -1
|
|
39 fin = file(options.input, 'r')
|
|
40 fout = file(options.output, 'w')
|
|
41 range_min = 1000
|
|
42 range_max = -5
|
|
43 block_num = 0
|
|
44 bad_blocks = 0
|
|
45 base_len = -1
|
|
46 line_count = 0
|
|
47 lines = []
|
|
48 line = fin.readline()
|
|
49 while line:
|
|
50 if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
|
|
51 fout.write(line)
|
|
52 if line_count % 4 == 0:
|
|
53 block_num += 1
|
|
54 line_count += 1
|
|
55 elif line.strip():
|
|
56 # the line that starts a block, with a name
|
|
57 if line_count % 4 == 0 and line.startswith('@'):
|
|
58 lines.append(line)
|
|
59 else:
|
|
60 # if we expect a sequence of bases
|
|
61 if line_count % 4 == 1 and all_bases_valid(line.strip()):
|
|
62 lines.append(line)
|
|
63 base_len = len(line.strip())
|
|
64 # if we expect the second name line
|
|
65 elif line_count % 4 == 2 and line.startswith('+'):
|
|
66 lines.append(line)
|
|
67 # if we expect a sequence of qualities and it's the expected length
|
|
68 elif line_count % 4 == 3:
|
|
69 split_line = line.strip().split()
|
|
70 # decimal qualities
|
|
71 if len(split_line) == base_len:
|
|
72 # convert
|
|
73 phred_list = []
|
|
74 for ch in split_line:
|
|
75 int_ch = int(ch)
|
|
76 if int_ch < range_min:
|
|
77 range_min = int_ch
|
|
78 if int_ch > range_max:
|
|
79 range_max = int_ch
|
|
80 if int_ch >= 0 and int_ch <= 93:
|
|
81 phred_list.append(chr(int_ch + 33))
|
|
82 # make sure we haven't lost any quality values
|
|
83 if len(phred_list) == base_len:
|
|
84 # print first three lines
|
|
85 for l in lines:
|
|
86 fout.write(l)
|
|
87 # print converted quality line
|
|
88 fout.write(''.join(phred_list))
|
|
89 # reset
|
|
90 lines = []
|
|
91 base_len = -1
|
|
92 # abort if so
|
|
93 else:
|
|
94 bad_blocks += 1
|
|
95 lines = []
|
|
96 base_len = -1
|
|
97 # ascii qualities
|
|
98 elif len(split_line[0]) == base_len:
|
|
99 qualities = []
|
|
100 # print converted quality line
|
|
101 if orig_type == 'illumina':
|
|
102 for c in line.strip():
|
|
103 if ord(c) - 64 < range_min:
|
|
104 range_min = ord(c) - 64
|
|
105 if ord(c) - 64 > range_max:
|
|
106 range_max = ord(c) - 64
|
|
107 if ord(c) < 64 or ord(c) > 126:
|
|
108 bad_blocks += 1
|
|
109 base_len = -1
|
|
110 lines = []
|
|
111 break
|
|
112 else:
|
|
113 qualities.append( chr( ord(c) - 31 ) )
|
|
114 quals = ''.join(qualities)
|
|
115 elif orig_type == 'solexa':
|
|
116 for c in line.strip():
|
|
117 if ord(c) - 64 < range_min:
|
|
118 range_min = ord(c) - 64
|
|
119 if ord(c) - 64 > range_max:
|
|
120 range_max = ord(c) - 64
|
|
121 if ord(c) < 59 or ord(c) > 126:
|
|
122 bad_blocks += 1
|
|
123 base_len = -1
|
|
124 lines = []
|
|
125 break
|
|
126 else:
|
|
127 p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
|
|
128 qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
|
|
129 quals = ''.join(qualities)
|
|
130 else: # 'sanger'
|
|
131 for c in line.strip():
|
|
132 if ord(c) - 33 < range_min:
|
|
133 range_min = ord(c) - 33
|
|
134 if ord(c) - 33 > range_max:
|
|
135 range_max = ord(c) - 33
|
|
136 if ord(c) < 33 or ord(c) > 126:
|
|
137 bad_blocks += 1
|
|
138 base_len = -1
|
|
139 lines = []
|
|
140 break
|
|
141 else:
|
|
142 qualities.append(c)
|
|
143 quals = ''.join(qualities)
|
|
144 # make sure we don't have bad qualities
|
|
145 if len(quals) == base_len:
|
|
146 # print first three lines
|
|
147 for l in lines:
|
|
148 fout.write(l)
|
|
149 # print out quality line
|
|
150 fout.write(quals+'\n')
|
|
151 # reset
|
|
152 lines = []
|
|
153 base_len = -1
|
|
154 else:
|
|
155 bad_blocks += 1
|
|
156 base_len = -1
|
|
157 lines = []
|
|
158 # mark the successful end of a block
|
|
159 block_num += 1
|
|
160 line_count += 1
|
|
161 line = fin.readline()
|
|
162 fout.close()
|
|
163 fin.close()
|
|
164 if range_min != 1000 and range_min != -5:
|
|
165 outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
|
|
166 else:
|
|
167 outmsg = ''
|
|
168 if bad_blocks > 0:
|
|
169 outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
|
|
170 sys.stdout.write(outmsg)
|
|
171
|
|
172 if __name__=="__main__": __main__() |