0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 import string
|
|
5 import optparse
|
|
6 import tempfile
|
|
7 import sqlite3
|
|
8
|
|
9 def stop_err( msg ):
|
|
10 sys.stderr.write( msg )
|
|
11 sys.exit()
|
|
12
|
|
13 def solid2sanger( quality_string, min_qual = 0 ):
|
|
14 sanger = ""
|
|
15 quality_string = quality_string.rstrip( " " )
|
|
16 for qv in quality_string.split(" "):
|
|
17 try:
|
|
18 if int( qv ) < 0:
|
|
19 qv = '0'
|
|
20 if int( qv ) < min_qual:
|
|
21 return False
|
|
22 break
|
|
23 sanger += chr( int( qv ) + 33 )
|
|
24 except:
|
|
25 pass
|
|
26 return sanger
|
|
27
|
|
28 def Translator(frm='', to='', delete='', keep=None):
|
|
29 allchars = string.maketrans('','')
|
|
30 if len(to) == 1:
|
|
31 to = to * len(frm)
|
|
32 trans = string.maketrans(frm, to)
|
|
33 if keep is not None:
|
|
34 delete = allchars.translate(allchars, keep.translate(allchars, delete))
|
|
35 def callable(s):
|
|
36 return s.translate(trans, delete)
|
|
37 return callable
|
|
38
|
|
39 def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
|
|
40
|
|
41 # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
|
|
42 # which can have three values: fastq, txt, and db
|
|
43 # fastq = fastq format
|
|
44 # txt = space delimited format with defline, reads, and qvs
|
|
45 # dp = dump data into sqlite3 db.
|
|
46 # IMPORTNAT! If out = db two optins must be provided:
|
|
47 # 1. f_out must be a db connection object initialized with sqlite3.connect()
|
|
48 # 2. table_name must be provided
|
|
49
|
|
50 if out == 'db':
|
|
51 cursor = f_out.cursor()
|
|
52 sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
|
|
53 cursor.execute(sql)
|
|
54
|
|
55 lines = []
|
|
56 line = " "
|
|
57 while line:
|
|
58 for f in [ f_reads, f_qual ]:
|
|
59 line = f.readline().rstrip( '\n\r' )
|
|
60 while line.startswith( '#' ):
|
|
61 line = f.readline().rstrip( '\n\r' )
|
|
62 lines.append( line )
|
|
63
|
|
64
|
|
65 if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
|
|
66
|
|
67 if lines[0] != lines[1]:
|
|
68 stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
|
|
69
|
|
70 defline = lines[0][1:]
|
|
71 if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
|
|
72 defline = defline[ : len( defline )-3 ]
|
|
73
|
|
74 elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
|
|
75
|
|
76 if trim_first_base:
|
|
77 lines[0] = lines[0][1:]
|
|
78 if double_encode:
|
|
79 de = Translator(frm="0123.", to="ACGTN")
|
|
80 lines[0] = de(lines[0])
|
|
81 qual = solid2sanger( lines[1], int( min_qual ) )
|
|
82 if qual:
|
|
83 if out == 'fastq':
|
|
84 f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
|
|
85 if out == 'txt':
|
|
86 f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
|
|
87 if out == 'db':
|
|
88 cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )
|
|
89 lines = []
|
|
90
|
|
91 def main():
|
|
92
|
|
93 usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
|
|
94 parser = optparse.OptionParser(usage=usage)
|
|
95
|
|
96
|
|
97 parser.add_option(
|
|
98 '--fr','--f_reads',
|
|
99 metavar="F3_CSFASTA_FILE",
|
|
100 dest='fr',
|
|
101 help='Name of F3 file with color space reads')
|
|
102
|
|
103 parser.add_option(
|
|
104 '--fq','--f_qual',
|
|
105 metavar="F3_QUAL_FILE",
|
|
106 dest='fq',
|
|
107 help='Name of F3 file with color quality values')
|
|
108
|
|
109 parser.add_option(
|
|
110 '--fout','--f3_fastq_output',
|
|
111 metavar="F3_OUTPUT",
|
|
112 dest='fout',
|
|
113 help='Name for F3 output file')
|
|
114
|
|
115 parser.add_option(
|
|
116 '--rr','--r_reads',
|
|
117 metavar="R3_CSFASTA_FILE",
|
|
118 dest='rr',
|
|
119 default = False,
|
|
120 help='Name of R3 file with color space reads')
|
|
121
|
|
122 parser.add_option(
|
|
123 '--rq','--r_qual',
|
|
124 metavar="R3_QUAL_FILE",
|
|
125 dest='rq',
|
|
126 default = False,
|
|
127 help='Name of R3 file with color quality values')
|
|
128
|
|
129 parser.add_option(
|
|
130 '--rout',
|
|
131 metavar="R3_OUTPUT",
|
|
132 dest='rout',
|
|
133 help='Name for F3 output file')
|
|
134
|
|
135 parser.add_option(
|
|
136 '-q','--min_qual',
|
|
137 dest='min_qual',
|
|
138 default = '-1000',
|
|
139 help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
|
|
140
|
|
141 parser.add_option(
|
|
142 '-t','--trim_name',
|
|
143 dest='trim_name',
|
|
144 action='store_true',
|
|
145 default = False,
|
|
146 help='Trim _R3 and _F3 off read names. Default is False')
|
|
147
|
|
148 parser.add_option(
|
|
149 '-f','--trim_first_base',
|
|
150 dest='trim_first_base',
|
|
151 action='store_true',
|
|
152 default = False,
|
|
153 help='Remove the first base of reads in color-space. Default is False')
|
|
154
|
|
155 parser.add_option(
|
|
156 '-d','--double_encode',
|
|
157 dest='de',
|
|
158 action='store_true',
|
|
159 default = False,
|
|
160 help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
|
|
161
|
|
162 options, args = parser.parse_args()
|
|
163
|
|
164 if not ( options.fout and options.fr and options.fq ):
|
|
165 parser.error("""
|
|
166 One or more of the three required paremetrs is missing:
|
|
167 (1) --fr F3.csfasta file
|
|
168 (2) --fq F3.qual file
|
|
169 (3) --fout name of output file
|
|
170 Use --help for more info
|
|
171 """)
|
|
172
|
|
173 fr = open ( options.fr , 'r' )
|
|
174 fq = open ( options.fq , 'r' )
|
|
175 f_out = open ( options.fout , 'w' )
|
|
176
|
|
177 if options.rr and options.rq:
|
|
178 rr = open ( options.rr , 'r' )
|
|
179 rq = open ( options.rq , 'r' )
|
|
180 if not options.rout:
|
|
181 parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
|
|
182 r_out = open ( options.rout, 'w' )
|
|
183
|
|
184 db = tempfile.NamedTemporaryFile()
|
|
185
|
|
186 try:
|
|
187 con = sqlite3.connect(db.name)
|
|
188 cur = con.cursor()
|
|
189 except:
|
|
190 stop_err('Cannot connect to %s\n') % db.name
|
|
191
|
|
192
|
|
193 merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
|
|
194 merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
|
|
195 cur.execute('create index f3_name on f3( name )')
|
|
196 cur.execute('create index r3_name on r3( name )')
|
|
197
|
|
198 cur.execute('select * from f3,r3 where f3.name = r3.name')
|
|
199 for item in cur:
|
|
200 f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
|
|
201 r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
|
|
202
|
|
203
|
|
204 else:
|
|
205 merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
|
|
206
|
|
207
|
|
208
|
|
209 f_out.close()
|
|
210
|
|
211 if __name__ == "__main__":
|
|
212 main()
|
|
213
|
|
214 |