comparison tools/solid_tools/maq_cs_wrapper.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3 #MAQ mapper for SOLiD colourspace-reads
4
5 import sys, os, zipfile, tempfile, subprocess
6
7 def stop_err( msg ):
8 sys.stderr.write( "%s\n" % msg )
9 sys.exit()
10
11 def __main__():
12
13 out_fname = sys.argv[1].strip()
14 out_f2 = open(sys.argv[2].strip(),'r+')
15 ref_fname = sys.argv[3].strip()
16 f3_read_fname = sys.argv[4].strip()
17 f3_qual_fname = sys.argv[5].strip()
18 paired = sys.argv[6]
19 if paired == 'yes':
20 r3_read_fname = sys.argv[7].strip()
21 r3_qual_fname = sys.argv[8].strip()
22 min_mapqual = int(sys.argv[9].strip())
23 max_mismatch = int(sys.argv[10].strip())
24 out_f3name = sys.argv[11].strip()
25 subprocess_dict = {}
26
27 ref_csfa = tempfile.NamedTemporaryFile()
28 ref_bfa = tempfile.NamedTemporaryFile()
29 ref_csbfa = tempfile.NamedTemporaryFile()
30 cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name)
31 cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name)
32 cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name)
33 try:
34 os.system(cmd2_1)
35 os.system(cmd2_2)
36 os.system(cmd2_3)
37 except Exception, erf:
38 stop_err(str(erf)+"Error processing reference sequence")
39
40 if paired == 'yes': #paired end reads
41 tmpf = tempfile.NamedTemporaryFile() #forward reads
42 tmpr = tempfile.NamedTemporaryFile() #reverse reads
43 tmps = tempfile.NamedTemporaryFile() #single reads
44 tmpffastq = tempfile.NamedTemporaryFile()
45 tmprfastq = tempfile.NamedTemporaryFile()
46 tmpsfastq = tempfile.NamedTemporaryFile()
47
48 cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname)
49 try:
50 os.system(cmd1)
51 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name))
52 os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name))
53 os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name))
54
55 except Exception, eq:
56 stop_err("Error converting data to fastq format." + str(eq))
57
58 #make a temp directory where the split fastq files will be stored
59 try:
60 split_dir = tempfile.mkdtemp()
61 split_file_prefix_f = tempfile.mktemp(dir=split_dir)
62 split_file_prefix_r = tempfile.mktemp(dir=split_dir)
63 splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads
64 splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads
65
66 os.system(splitcmd_f)
67 os.system(splitcmd_r)
68 os.chdir(split_dir)
69 ii = 0
70 for fastq in os.listdir(split_dir):
71 if not fastq.startswith(split_file_prefix_f.split("/")[-1]):
72 continue
73 fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to formward strand fastq
74 tmpbfq_f = tempfile.NamedTemporaryFile()
75 tmpbfq_r = tempfile.NamedTemporaryFile()
76 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq)
77 subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
78 ii += 1
79 while True:
80 all_done = True
81 for j,k in enumerate(subprocess_dict.keys()):
82 if subprocess_dict['sp'+str(j+1)].wait() != 0:
83 err = subprocess_dict['sp'+str(j+1)].communicate()[1]
84 if err != None:
85 stop_err("Mapping error: %s" %err)
86 all_done = False
87 if all_done:
88 break
89 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
90 os.system(cmdout)
91
92 tmpcsmap = tempfile.NamedTemporaryFile()
93 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
94 os.system(cmd_cat_csmap)
95
96 tmppileup = tempfile.NamedTemporaryFile()
97 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
98 os.system(cmdpileup)
99 tmppileup.seek(0)
100 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
101 for line in file(tmppileup.name):
102 elems = line.strip().split()
103 ref_nt = elems[2].capitalize()
104 read_nt = elems[4]
105 coverage = int(elems[3])
106 a,t,g,c = 0,0,0,0
107 ref_nt_count = 0
108 for ch in read_nt:
109 ch = ch.capitalize()
110 if ch not in ['A','T','G','C',',','.']:
111 continue
112 if ch in [',','.']:
113 ch = ref_nt
114 ref_nt_count += 1
115 try:
116 nt_ind = ['A','T','G','C'].index(ch)
117 if nt_ind == 0:
118 a+=1
119 elif nt_ind == 1:
120 t+=1
121 elif nt_ind == 2:
122 g+=1
123 else:
124 c+=1
125 except ValueError, we:
126 print >>sys.stderr, we
127 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
128 except Exception, er2:
129 stop_err("Encountered error while mapping: %s" %(str(er2)))
130
131
132 else: #single end reads
133 tmpf = tempfile.NamedTemporaryFile()
134 tmpfastq = tempfile.NamedTemporaryFile()
135 cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,None,None,f3_read_fname,f3_qual_fname,None,None)
136 try:
137 os.system(cmd1)
138 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpfastq.name))
139 tmpf.close()
140 except:
141 stop_err("Error converting data to fastq format.")
142
143 #make a temp directory where the split fastq files will be stored
144 try:
145 split_dir = tempfile.mkdtemp()
146 split_file_prefix = tempfile.mktemp(dir=split_dir)
147 splitcmd = 'split -a 2 -l %d %s %s' %(32000000,tmpfastq.name,split_file_prefix) #32M lines correspond to 8M reads
148 os.system(splitcmd)
149 os.chdir(split_dir)
150 for i,fastq in enumerate(os.listdir(split_dir)):
151 tmpbfq = tempfile.NamedTemporaryFile()
152 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq.name,fastq,ref_csbfa.name,tmpbfq.name,fastq,fastq)
153 subprocess_dict['sp'+str(i+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
154
155 while True:
156 all_done = True
157 for j,k in enumerate(subprocess_dict.keys()):
158 if subprocess_dict['sp'+str(j+1)].wait() != 0:
159 err = subprocess_dict['sp'+str(j+1)].communicate()[1]
160 if err != None:
161 stop_err("Mapping error: %s" %err)
162 all_done = False
163 if all_done:
164 break
165
166 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
167 os.system(cmdout)
168
169 tmpcsmap = tempfile.NamedTemporaryFile()
170 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
171 os.system(cmd_cat_csmap)
172
173 tmppileup = tempfile.NamedTemporaryFile()
174 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
175 os.system(cmdpileup)
176 tmppileup.seek(0)
177 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
178 for line in file(tmppileup.name):
179 elems = line.strip().split()
180 ref_nt = elems[2].capitalize()
181 read_nt = elems[4]
182 coverage = int(elems[3])
183 a,t,g,c = 0,0,0,0
184 ref_nt_count = 0
185 for ch in read_nt:
186 ch = ch.capitalize()
187 if ch not in ['A','T','G','C',',','.']:
188 continue
189 if ch in [',','.']:
190 ch = ref_nt
191 ref_nt_count += 1
192 try:
193 nt_ind = ['A','T','G','C'].index(ch)
194 if nt_ind == 0:
195 a+=1
196 elif nt_ind == 1:
197 t+=1
198 elif nt_ind == 2:
199 g+=1
200 else:
201 c+=1
202 except:
203 pass
204 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
205 except Exception, er2:
206 stop_err("Encountered error while mapping: %s" %(str(er2)))
207
208 #Build custom track from pileup
209 chr_list=[]
210 out_f2.seek(0)
211 fcov = tempfile.NamedTemporaryFile()
212 fout_a = tempfile.NamedTemporaryFile()
213 fout_t = tempfile.NamedTemporaryFile()
214 fout_g = tempfile.NamedTemporaryFile()
215 fout_c = tempfile.NamedTemporaryFile()
216 fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''')
217 fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''')
218 fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''')
219 fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''')
220 fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''')
221
222 for line in out_f2:
223 if line.startswith("#"):
224 continue
225 elems = line.split()
226 chr = elems[0]
227
228 if chr not in chr_list:
229 chr_list.append(chr)
230 if not (chr.startswith('chr') or chr.startswith('scaffold')):
231 chr = 'chr'
232 header = "variableStep chrom=%s" %(chr)
233 fcov.write("%s\n" %(header))
234 fout_a.write("%s\n" %(header))
235 fout_t.write("%s\n" %(header))
236 fout_g.write("%s\n" %(header))
237 fout_c.write("%s\n" %(header))
238 try:
239 pos = int(elems[1])
240 cov = int(elems[3])
241 a = int(elems[5])
242 t = int(elems[6])
243 g = int(elems[7])
244 c = int(elems[8])
245 except:
246 continue
247 fcov.write("%s\t%s\n" %(pos,cov))
248 try:
249 a_freq = a*100./cov
250 t_freq = t*100./cov
251 g_freq = g*100./cov
252 c_freq = c*100./cov
253 except ZeroDivisionError:
254 a_freq=t_freq=g_freq=c_freq=0
255 fout_a.write("%s\t%s\n" %(pos,a_freq))
256 fout_t.write("%s\t%s\n" %(pos,t_freq))
257 fout_g.write("%s\t%s\n" %(pos,g_freq))
258 fout_c.write("%s\t%s\n" %(pos,c_freq))
259
260 fcov.seek(0)
261 fout_a.seek(0)
262 fout_g.seek(0)
263 fout_t.seek(0)
264 fout_c.seek(0)
265 os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name))
266
267 if __name__=="__main__":
268 __main__()
269
270