annotate tools/next_gen_conversion/solid2fastq.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 import optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 import tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 import sqlite3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err(msg):
    """Write *msg* to standard error and terminate the program.

    The message is written verbatim (no newline is appended).  The process
    exits with status 1 so that callers and shells can tell that the run
    failed; a bare ``sys.exit()`` would report success (status 0).
    """
    sys.stderr.write(msg)
    sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def solid2sanger(quality_string, min_qual=0):
    """Convert space-separated integer quality values to a Sanger string.

    Each value is clamped to a minimum of 0 and encoded as the character
    ``chr(value + 33)``.  Tokens that are not integers (for example the
    empty strings produced by repeated spaces) are skipped.

    Parameters:
        quality_string: quality values separated by single spaces.
        min_qual: if any (clamped) value is below this threshold the whole
            read is rejected.

    Returns:
        The encoded quality string, or ``False`` as soon as one value falls
        below ``min_qual``.
    """
    encoded = []
    for token in quality_string.rstrip(" ").split(" "):
        try:
            value = int(token)
        except ValueError:
            # Blank or non-numeric token: ignore it rather than abort.
            continue
        if value < 0:
            value = 0
        if value < min_qual:
            return False
        try:
            encoded.append(chr(value + 33))
        except (ValueError, OverflowError):
            # Value has no character representation; skip it.
            continue
    return "".join(encoded)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def Translator(frm='', to='', delete='', keep=None):
    """Build and return a function that translates/filters a string.

    Parameters:
        frm: characters to be replaced.
        to: replacement characters, position by position; a single
            character is repeated to match the length of ``frm``.
        delete: characters to remove from the input.
        keep: if given, only characters listed here (and not also listed in
            ``delete``) survive; everything else is removed.

    Returns:
        A one-argument function ``f(s)`` returning the processed string.
        Removal is decided on the original characters, then the survivors
        are mapped ``frm`` -> ``to``.

    Works with both the Python 2 (``string.maketrans``) and the Python 3
    (``str.maketrans``) translation APIs.
    """
    if len(to) == 1:
        to = to * len(frm)

    if hasattr(string, 'maketrans'):
        # Python 2: 256-entry byte tables plus a separate delete string.
        allchars = string.maketrans('', '')
        trans = string.maketrans(frm, to)
        if keep is not None:
            # Everything that is not (keep minus delete) gets deleted.
            delete = allchars.translate(allchars, keep.translate(allchars, delete))

        def translate(s):
            return s.translate(trans, delete)
        return translate

    # Python 3: one mapping of code point -> code point, or None to delete.
    class _DropUnlisted(dict):
        """Translation table that deletes every character it does not list."""

        def __missing__(self, code):
            return None

    table = str.maketrans(frm, to)
    if keep is None:
        for ch in delete:
            table[ord(ch)] = None
    else:
        survivors = set(keep) - set(delete)
        table = _DropUnlisted(
            (ord(ch), table.get(ord(ch), ord(ch))) for ch in survivors
        )

    def translate(s):
        return s.translate(table)
    return translate
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def merge_reads_qual(f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode=False, trim_first_base=False, pair_end_flag='', min_qual=0, table_name=None):
    """Merge a reads file and a quality-value file into one output.

    The two inputs are read in lock step, one line from each at a time
    (lines starting with ``#`` are skipped).  A pair of ``>`` lines is a
    record header and must be identical in both files; a pair of non-header
    lines is the read and its quality values.  Reading stops at the first
    empty line / end of the quality file.

    Parameters:
        f_reads: open file with the reads.
        f_qual: open file with the quality values.
        f_out: destination; an open writable file for ``fastq``/``txt``.
            IMPORTANT: for ``out='db'`` it must be a db connection object
            initialized with ``sqlite3.connect()``.
        trim_name: strip a trailing ``_F3`` / ``_R3`` from read names.
        out: output format, one of
            ``fastq`` - fastq format,
            ``txt``   - space delimited format with defline, read, and qvs,
            ``db``    - dump data into a sqlite3 table.
        double_encode: translate ``0123.`` in the read to ``ACGTN``.
        trim_first_base: drop the first character of each read.
        pair_end_flag: suffix appended to the read name in fastq output.
        min_qual: reads containing a quality value below this are dropped.
        table_name: name of the table to create; required when ``out='db'``.
    """
    cursor = None
    insert_sql = None
    if out == 'db':
        cursor = f_out.cursor()
        cursor.execute("create table %s (name varchar(50) not null, read blob, qv blob)" % table_name)
        # Values are bound as parameters: the quality string may legitimately
        # contain '"' (QV 1), which would break a string-formatted statement.
        insert_sql = 'insert into %s values(?,?,?)' % table_name

    # Loop invariants: convert the threshold and build the translator once.
    min_qual = int(min_qual)
    encode = None
    if double_encode:
        encode = Translator(frm="0123.", to="ACGTN")

    defline = None
    while True:
        read_line = _next_data_line(f_reads)
        qual_line = _next_data_line(f_qual)
        if not qual_line:
            # End of the quality file (or a blank line in it) ends the merge.
            break

        read_is_header = read_line.startswith('>')
        qual_is_header = qual_line.startswith('>')

        if read_is_header and qual_is_header:
            if read_line != qual_line:
                stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
            defline = read_line[1:]
            if trim_name and defline.endswith(("_F3", "_R3")):
                defline = defline[:-3]

        elif not read_is_header and not qual_is_header and read_line:
            if defline is None:
                stop_err('Read data found before any header line. Please, check your input data\n')
            if trim_first_base:
                read_line = read_line[1:]
            if encode is not None:
                read_line = encode(read_line)
            qual = solid2sanger(qual_line, min_qual)
            if not qual:
                # Rejected by the quality filter (or no usable values).
                continue
            if out == 'fastq':
                f_out.write("@%s%s\n%s\n+\n%s\n" % (defline, pair_end_flag, read_line, qual))
            elif out == 'txt':
                f_out.write('%s %s %s\n' % (defline, read_line, qual))
            elif out == 'db':
                cursor.execute(insert_sql, (defline, read_line, qual))


def _next_data_line(handle):
    """Return the next line of *handle* that is not a ``#`` comment, without its line ending."""
    line = handle.readline().rstrip('\n\r')
    while line.startswith('#'):
        line = handle.readline().rstrip('\n\r')
    return line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def main():
    """Command-line entry point: convert SOLiD csfasta + qual files to fastq.

    With only the F3 inputs (``--fr``, ``--fq``, ``--fout``) every read that
    passes the quality filter is written to the output file.  When both R3
    inputs (``--rr`` and ``--rq``) are also given, ``--rout`` is required;
    both mates are loaded into a temporary SQLite database and only reads
    whose names occur in both are written, with ``/1`` and ``/2`` suffixes.
    """
    usage = "%prog --fr F3.csfasta --fq F3.qual --fout fastq_output_file [option]"
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '--fr', '--f_reads',
        metavar="F3_CSFASTA_FILE",
        dest='fr',
        help='Name of F3 file with color space reads')

    parser.add_option(
        '--fq', '--f_qual',
        metavar="F3_QUAL_FILE",
        dest='fq',
        help='Name of F3 file with color quality values')

    parser.add_option(
        '--fout', '--f3_fastq_output',
        metavar="F3_OUTPUT",
        dest='fout',
        help='Name for F3 output file')

    parser.add_option(
        '--rr', '--r_reads',
        metavar="R3_CSFASTA_FILE",
        dest='rr',
        default=False,
        help='Name of R3 file with color space reads')

    parser.add_option(
        '--rq', '--r_qual',
        metavar="R3_QUAL_FILE",
        dest='rq',
        default=False,
        help='Name of R3 file with color quality values')

    parser.add_option(
        '--rout',
        metavar="R3_OUTPUT",
        dest='rout',
        help='Name for R3 output file')

    parser.add_option(
        '-q', '--min_qual',
        dest='min_qual',
        default='-1000',
        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')

    parser.add_option(
        '-t', '--trim_name',
        dest='trim_name',
        action='store_true',
        default=False,
        help='Trim _R3 and _F3 off read names. Default is False')

    parser.add_option(
        '-f', '--trim_first_base',
        dest='trim_first_base',
        action='store_true',
        default=False,
        help='Remove the first base of reads in color-space. Default is False')

    parser.add_option(
        '-d', '--double_encode',
        dest='de',
        action='store_true',
        default=False,
        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')

    options, args = parser.parse_args()

    if not (options.fout and options.fr and options.fq):
        parser.error("""
One or more of the three required parameters is missing:
(1) --fr F3.csfasta file
(2) --fq F3.qual file
(3) --fout name of output file
Use --help for more info
""")

    paired = bool(options.rr and options.rq)
    # Validate before opening anything so a usage error leaves no open files.
    if paired and not options.rout:
        parser.error("Provide the name for r3 output using --rout option. Use --help for more info")

    handles = []
    try:
        fr = open(options.fr, 'r')
        handles.append(fr)
        fq = open(options.fq, 'r')
        handles.append(fq)
        f_out = open(options.fout, 'w')
        handles.append(f_out)

        if paired:
            rr = open(options.rr, 'r')
            handles.append(rr)
            rq = open(options.rq, 'r')
            handles.append(rq)
            r_out = open(options.rout, 'w')
            handles.append(r_out)
            _write_paired_fastq(options, fr, fq, rr, rq, f_out, r_out)
        else:
            merge_reads_qual(fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual)
    finally:
        # Close every file that was opened, including on error/exit paths.
        for handle in handles:
            handle.close()


def _write_paired_fastq(options, fr, fq, rr, rq, f_out, r_out):
    """Load F3 and R3 reads into a temporary SQLite db and write the name-matched pairs as fastq."""
    db = tempfile.NamedTemporaryFile()
    try:
        try:
            con = sqlite3.connect(db.name)
            cur = con.cursor()
        except sqlite3.Error:
            stop_err('Cannot connect to %s\n' % db.name)

        try:
            merge_reads_qual(fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3")
            merge_reads_qual(rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3")
            cur.execute('create index f3_name on f3( name )')
            cur.execute('create index r3_name on r3( name )')

            # Inner join on the read name: columns 0-2 come from f3, 3-5 from r3.
            cur.execute('select * from f3,r3 where f3.name = r3.name')
            for item in cur:
                f_out.write("@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]))
                r_out.write("@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]))
        finally:
            con.close()
    finally:
        db.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214