comparison tools/next_gen_conversion/solid2fastq.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2
3 import sys
4 import string
5 import optparse
6 import tempfile
7 import sqlite3
8
9 def stop_err( msg ):
10 sys.stderr.write( msg )
11 sys.exit()
12
13 def solid2sanger( quality_string, min_qual = 0 ):
14 sanger = ""
15 quality_string = quality_string.rstrip( " " )
16 for qv in quality_string.split(" "):
17 try:
18 if int( qv ) < 0:
19 qv = '0'
20 if int( qv ) < min_qual:
21 return False
22 break
23 sanger += chr( int( qv ) + 33 )
24 except:
25 pass
26 return sanger
27
28 def Translator(frm='', to='', delete='', keep=None):
29 allchars = string.maketrans('','')
30 if len(to) == 1:
31 to = to * len(frm)
32 trans = string.maketrans(frm, to)
33 if keep is not None:
34 delete = allchars.translate(allchars, keep.translate(allchars, delete))
35 def callable(s):
36 return s.translate(trans, delete)
37 return callable
38
39 def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
40
41 # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
42 # which can have three values: fastq, txt, and db
43 # fastq = fastq format
44 # txt = space delimited format with defline, reads, and qvs
45 # dp = dump data into sqlite3 db.
46 # IMPORTNAT! If out = db two optins must be provided:
47 # 1. f_out must be a db connection object initialized with sqlite3.connect()
48 # 2. table_name must be provided
49
50 if out == 'db':
51 cursor = f_out.cursor()
52 sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
53 cursor.execute(sql)
54
55 lines = []
56 line = " "
57 while line:
58 for f in [ f_reads, f_qual ]:
59 line = f.readline().rstrip( '\n\r' )
60 while line.startswith( '#' ):
61 line = f.readline().rstrip( '\n\r' )
62 lines.append( line )
63
64
65 if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
66
67 if lines[0] != lines[1]:
68 stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
69
70 defline = lines[0][1:]
71 if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
72 defline = defline[ : len( defline )-3 ]
73
74 elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
75
76 if trim_first_base:
77 lines[0] = lines[0][1:]
78 if double_encode:
79 de = Translator(frm="0123.", to="ACGTN")
80 lines[0] = de(lines[0])
81 qual = solid2sanger( lines[1], int( min_qual ) )
82 if qual:
83 if out == 'fastq':
84 f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
85 if out == 'txt':
86 f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
87 if out == 'db':
88 cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )
89 lines = []
90
91 def main():
92
93 usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
94 parser = optparse.OptionParser(usage=usage)
95
96
97 parser.add_option(
98 '--fr','--f_reads',
99 metavar="F3_CSFASTA_FILE",
100 dest='fr',
101 help='Name of F3 file with color space reads')
102
103 parser.add_option(
104 '--fq','--f_qual',
105 metavar="F3_QUAL_FILE",
106 dest='fq',
107 help='Name of F3 file with color quality values')
108
109 parser.add_option(
110 '--fout','--f3_fastq_output',
111 metavar="F3_OUTPUT",
112 dest='fout',
113 help='Name for F3 output file')
114
115 parser.add_option(
116 '--rr','--r_reads',
117 metavar="R3_CSFASTA_FILE",
118 dest='rr',
119 default = False,
120 help='Name of R3 file with color space reads')
121
122 parser.add_option(
123 '--rq','--r_qual',
124 metavar="R3_QUAL_FILE",
125 dest='rq',
126 default = False,
127 help='Name of R3 file with color quality values')
128
129 parser.add_option(
130 '--rout',
131 metavar="R3_OUTPUT",
132 dest='rout',
133 help='Name for F3 output file')
134
135 parser.add_option(
136 '-q','--min_qual',
137 dest='min_qual',
138 default = '-1000',
139 help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
140
141 parser.add_option(
142 '-t','--trim_name',
143 dest='trim_name',
144 action='store_true',
145 default = False,
146 help='Trim _R3 and _F3 off read names. Default is False')
147
148 parser.add_option(
149 '-f','--trim_first_base',
150 dest='trim_first_base',
151 action='store_true',
152 default = False,
153 help='Remove the first base of reads in color-space. Default is False')
154
155 parser.add_option(
156 '-d','--double_encode',
157 dest='de',
158 action='store_true',
159 default = False,
160 help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
161
162 options, args = parser.parse_args()
163
164 if not ( options.fout and options.fr and options.fq ):
165 parser.error("""
166 One or more of the three required paremetrs is missing:
167 (1) --fr F3.csfasta file
168 (2) --fq F3.qual file
169 (3) --fout name of output file
170 Use --help for more info
171 """)
172
173 fr = open ( options.fr , 'r' )
174 fq = open ( options.fq , 'r' )
175 f_out = open ( options.fout , 'w' )
176
177 if options.rr and options.rq:
178 rr = open ( options.rr , 'r' )
179 rq = open ( options.rq , 'r' )
180 if not options.rout:
181 parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
182 r_out = open ( options.rout, 'w' )
183
184 db = tempfile.NamedTemporaryFile()
185
186 try:
187 con = sqlite3.connect(db.name)
188 cur = con.cursor()
189 except:
190 stop_err('Cannot connect to %s\n') % db.name
191
192
193 merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
194 merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
195 cur.execute('create index f3_name on f3( name )')
196 cur.execute('create index r3_name on r3( name )')
197
198 cur.execute('select * from f3,r3 where f3.name = r3.name')
199 for item in cur:
200 f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
201 r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
202
203
204 else:
205 merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
206
207
208
209 f_out.close()
210
211 if __name__ == "__main__":
212 main()
213
214