Mercurial > repos > xuebing > sharplabtool
comparison tools/next_gen_conversion/solid2fastq.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import string | |
5 import optparse | |
6 import tempfile | |
7 import sqlite3 | |
8 | |
9 def stop_err( msg ): | |
10 sys.stderr.write( msg ) | |
11 sys.exit() | |
12 | |
13 def solid2sanger( quality_string, min_qual = 0 ): | |
14 sanger = "" | |
15 quality_string = quality_string.rstrip( " " ) | |
16 for qv in quality_string.split(" "): | |
17 try: | |
18 if int( qv ) < 0: | |
19 qv = '0' | |
20 if int( qv ) < min_qual: | |
21 return False | |
22 break | |
23 sanger += chr( int( qv ) + 33 ) | |
24 except: | |
25 pass | |
26 return sanger | |
27 | |
28 def Translator(frm='', to='', delete='', keep=None): | |
29 allchars = string.maketrans('','') | |
30 if len(to) == 1: | |
31 to = to * len(frm) | |
32 trans = string.maketrans(frm, to) | |
33 if keep is not None: | |
34 delete = allchars.translate(allchars, keep.translate(allchars, delete)) | |
35 def callable(s): | |
36 return s.translate(trans, delete) | |
37 return callable | |
38 | |
39 def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): | |
40 | |
41 # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, | |
42 # which can have three values: fastq, txt, and db | |
43 # fastq = fastq format | |
44 # txt = space delimited format with defline, reads, and qvs | |
45 # dp = dump data into sqlite3 db. | |
46 # IMPORTNAT! If out = db two optins must be provided: | |
47 # 1. f_out must be a db connection object initialized with sqlite3.connect() | |
48 # 2. table_name must be provided | |
49 | |
50 if out == 'db': | |
51 cursor = f_out.cursor() | |
52 sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name | |
53 cursor.execute(sql) | |
54 | |
55 lines = [] | |
56 line = " " | |
57 while line: | |
58 for f in [ f_reads, f_qual ]: | |
59 line = f.readline().rstrip( '\n\r' ) | |
60 while line.startswith( '#' ): | |
61 line = f.readline().rstrip( '\n\r' ) | |
62 lines.append( line ) | |
63 | |
64 | |
65 if lines[0].startswith( '>' ) and lines[1].startswith( '>' ): | |
66 | |
67 if lines[0] != lines[1]: | |
68 stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data') | |
69 | |
70 defline = lines[0][1:] | |
71 if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): | |
72 defline = defline[ : len( defline )-3 ] | |
73 | |
74 elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ): | |
75 | |
76 if trim_first_base: | |
77 lines[0] = lines[0][1:] | |
78 if double_encode: | |
79 de = Translator(frm="0123.", to="ACGTN") | |
80 lines[0] = de(lines[0]) | |
81 qual = solid2sanger( lines[1], int( min_qual ) ) | |
82 if qual: | |
83 if out == 'fastq': | |
84 f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) | |
85 if out == 'txt': | |
86 f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) | |
87 if out == 'db': | |
88 cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) | |
89 lines = [] | |
90 | |
91 def main(): | |
92 | |
93 usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" | |
94 parser = optparse.OptionParser(usage=usage) | |
95 | |
96 | |
97 parser.add_option( | |
98 '--fr','--f_reads', | |
99 metavar="F3_CSFASTA_FILE", | |
100 dest='fr', | |
101 help='Name of F3 file with color space reads') | |
102 | |
103 parser.add_option( | |
104 '--fq','--f_qual', | |
105 metavar="F3_QUAL_FILE", | |
106 dest='fq', | |
107 help='Name of F3 file with color quality values') | |
108 | |
109 parser.add_option( | |
110 '--fout','--f3_fastq_output', | |
111 metavar="F3_OUTPUT", | |
112 dest='fout', | |
113 help='Name for F3 output file') | |
114 | |
115 parser.add_option( | |
116 '--rr','--r_reads', | |
117 metavar="R3_CSFASTA_FILE", | |
118 dest='rr', | |
119 default = False, | |
120 help='Name of R3 file with color space reads') | |
121 | |
122 parser.add_option( | |
123 '--rq','--r_qual', | |
124 metavar="R3_QUAL_FILE", | |
125 dest='rq', | |
126 default = False, | |
127 help='Name of R3 file with color quality values') | |
128 | |
129 parser.add_option( | |
130 '--rout', | |
131 metavar="R3_OUTPUT", | |
132 dest='rout', | |
133 help='Name for F3 output file') | |
134 | |
135 parser.add_option( | |
136 '-q','--min_qual', | |
137 dest='min_qual', | |
138 default = '-1000', | |
139 help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') | |
140 | |
141 parser.add_option( | |
142 '-t','--trim_name', | |
143 dest='trim_name', | |
144 action='store_true', | |
145 default = False, | |
146 help='Trim _R3 and _F3 off read names. Default is False') | |
147 | |
148 parser.add_option( | |
149 '-f','--trim_first_base', | |
150 dest='trim_first_base', | |
151 action='store_true', | |
152 default = False, | |
153 help='Remove the first base of reads in color-space. Default is False') | |
154 | |
155 parser.add_option( | |
156 '-d','--double_encode', | |
157 dest='de', | |
158 action='store_true', | |
159 default = False, | |
160 help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') | |
161 | |
162 options, args = parser.parse_args() | |
163 | |
164 if not ( options.fout and options.fr and options.fq ): | |
165 parser.error(""" | |
166 One or more of the three required paremetrs is missing: | |
167 (1) --fr F3.csfasta file | |
168 (2) --fq F3.qual file | |
169 (3) --fout name of output file | |
170 Use --help for more info | |
171 """) | |
172 | |
173 fr = open ( options.fr , 'r' ) | |
174 fq = open ( options.fq , 'r' ) | |
175 f_out = open ( options.fout , 'w' ) | |
176 | |
177 if options.rr and options.rq: | |
178 rr = open ( options.rr , 'r' ) | |
179 rq = open ( options.rq , 'r' ) | |
180 if not options.rout: | |
181 parser.error("Provide the name for f3 output using --rout option. Use --help for more info") | |
182 r_out = open ( options.rout, 'w' ) | |
183 | |
184 db = tempfile.NamedTemporaryFile() | |
185 | |
186 try: | |
187 con = sqlite3.connect(db.name) | |
188 cur = con.cursor() | |
189 except: | |
190 stop_err('Cannot connect to %s\n') % db.name | |
191 | |
192 | |
193 merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) | |
194 merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) | |
195 cur.execute('create index f3_name on f3( name )') | |
196 cur.execute('create index r3_name on r3( name )') | |
197 | |
198 cur.execute('select * from f3,r3 where f3.name = r3.name') | |
199 for item in cur: | |
200 f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) | |
201 r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) | |
202 | |
203 | |
204 else: | |
205 merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) | |
206 | |
207 | |
208 | |
209 f_out.close() | |
210 | |
211 if __name__ == "__main__": | |
212 main() | |
213 | |
214 |