Mercurial > repos > xuebing > sharplabtool
comparison tools/solid_tools/maq_cs_wrapper.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 #MAQ mapper for SOLiD colourspace-reads | |
4 | |
5 import sys, os, zipfile, tempfile, subprocess | |
6 | |
7 def stop_err( msg ): | |
8 sys.stderr.write( "%s\n" % msg ) | |
9 sys.exit() | |
10 | |
11 def __main__(): | |
12 | |
13 out_fname = sys.argv[1].strip() | |
14 out_f2 = open(sys.argv[2].strip(),'r+') | |
15 ref_fname = sys.argv[3].strip() | |
16 f3_read_fname = sys.argv[4].strip() | |
17 f3_qual_fname = sys.argv[5].strip() | |
18 paired = sys.argv[6] | |
19 if paired == 'yes': | |
20 r3_read_fname = sys.argv[7].strip() | |
21 r3_qual_fname = sys.argv[8].strip() | |
22 min_mapqual = int(sys.argv[9].strip()) | |
23 max_mismatch = int(sys.argv[10].strip()) | |
24 out_f3name = sys.argv[11].strip() | |
25 subprocess_dict = {} | |
26 | |
27 ref_csfa = tempfile.NamedTemporaryFile() | |
28 ref_bfa = tempfile.NamedTemporaryFile() | |
29 ref_csbfa = tempfile.NamedTemporaryFile() | |
30 cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name) | |
31 cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name) | |
32 cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name) | |
33 try: | |
34 os.system(cmd2_1) | |
35 os.system(cmd2_2) | |
36 os.system(cmd2_3) | |
37 except Exception, erf: | |
38 stop_err(str(erf)+"Error processing reference sequence") | |
39 | |
40 if paired == 'yes': #paired end reads | |
41 tmpf = tempfile.NamedTemporaryFile() #forward reads | |
42 tmpr = tempfile.NamedTemporaryFile() #reverse reads | |
43 tmps = tempfile.NamedTemporaryFile() #single reads | |
44 tmpffastq = tempfile.NamedTemporaryFile() | |
45 tmprfastq = tempfile.NamedTemporaryFile() | |
46 tmpsfastq = tempfile.NamedTemporaryFile() | |
47 | |
48 cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname) | |
49 try: | |
50 os.system(cmd1) | |
51 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name)) | |
52 os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name)) | |
53 os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name)) | |
54 | |
55 except Exception, eq: | |
56 stop_err("Error converting data to fastq format." + str(eq)) | |
57 | |
58 #make a temp directory where the split fastq files will be stored | |
59 try: | |
60 split_dir = tempfile.mkdtemp() | |
61 split_file_prefix_f = tempfile.mktemp(dir=split_dir) | |
62 split_file_prefix_r = tempfile.mktemp(dir=split_dir) | |
63 splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads | |
64 splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads | |
65 | |
66 os.system(splitcmd_f) | |
67 os.system(splitcmd_r) | |
68 os.chdir(split_dir) | |
69 ii = 0 | |
70 for fastq in os.listdir(split_dir): | |
71 if not fastq.startswith(split_file_prefix_f.split("/")[-1]): | |
72 continue | |
73 fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to formward strand fastq | |
74 tmpbfq_f = tempfile.NamedTemporaryFile() | |
75 tmpbfq_r = tempfile.NamedTemporaryFile() | |
76 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq) | |
77 subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE) | |
78 ii += 1 | |
79 while True: | |
80 all_done = True | |
81 for j,k in enumerate(subprocess_dict.keys()): | |
82 if subprocess_dict['sp'+str(j+1)].wait() != 0: | |
83 err = subprocess_dict['sp'+str(j+1)].communicate()[1] | |
84 if err != None: | |
85 stop_err("Mapping error: %s" %err) | |
86 all_done = False | |
87 if all_done: | |
88 break | |
89 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname) | |
90 os.system(cmdout) | |
91 | |
92 tmpcsmap = tempfile.NamedTemporaryFile() | |
93 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name) | |
94 os.system(cmd_cat_csmap) | |
95 | |
96 tmppileup = tempfile.NamedTemporaryFile() | |
97 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name) | |
98 os.system(cmdpileup) | |
99 tmppileup.seek(0) | |
100 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count" | |
101 for line in file(tmppileup.name): | |
102 elems = line.strip().split() | |
103 ref_nt = elems[2].capitalize() | |
104 read_nt = elems[4] | |
105 coverage = int(elems[3]) | |
106 a,t,g,c = 0,0,0,0 | |
107 ref_nt_count = 0 | |
108 for ch in read_nt: | |
109 ch = ch.capitalize() | |
110 if ch not in ['A','T','G','C',',','.']: | |
111 continue | |
112 if ch in [',','.']: | |
113 ch = ref_nt | |
114 ref_nt_count += 1 | |
115 try: | |
116 nt_ind = ['A','T','G','C'].index(ch) | |
117 if nt_ind == 0: | |
118 a+=1 | |
119 elif nt_ind == 1: | |
120 t+=1 | |
121 elif nt_ind == 2: | |
122 g+=1 | |
123 else: | |
124 c+=1 | |
125 except ValueError, we: | |
126 print >>sys.stderr, we | |
127 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c) | |
128 except Exception, er2: | |
129 stop_err("Encountered error while mapping: %s" %(str(er2))) | |
130 | |
131 | |
132 else: #single end reads | |
133 tmpf = tempfile.NamedTemporaryFile() | |
134 tmpfastq = tempfile.NamedTemporaryFile() | |
135 cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,None,None,f3_read_fname,f3_qual_fname,None,None) | |
136 try: | |
137 os.system(cmd1) | |
138 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpfastq.name)) | |
139 tmpf.close() | |
140 except: | |
141 stop_err("Error converting data to fastq format.") | |
142 | |
143 #make a temp directory where the split fastq files will be stored | |
144 try: | |
145 split_dir = tempfile.mkdtemp() | |
146 split_file_prefix = tempfile.mktemp(dir=split_dir) | |
147 splitcmd = 'split -a 2 -l %d %s %s' %(32000000,tmpfastq.name,split_file_prefix) #32M lines correspond to 8M reads | |
148 os.system(splitcmd) | |
149 os.chdir(split_dir) | |
150 for i,fastq in enumerate(os.listdir(split_dir)): | |
151 tmpbfq = tempfile.NamedTemporaryFile() | |
152 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq.name,fastq,ref_csbfa.name,tmpbfq.name,fastq,fastq) | |
153 subprocess_dict['sp'+str(i+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE) | |
154 | |
155 while True: | |
156 all_done = True | |
157 for j,k in enumerate(subprocess_dict.keys()): | |
158 if subprocess_dict['sp'+str(j+1)].wait() != 0: | |
159 err = subprocess_dict['sp'+str(j+1)].communicate()[1] | |
160 if err != None: | |
161 stop_err("Mapping error: %s" %err) | |
162 all_done = False | |
163 if all_done: | |
164 break | |
165 | |
166 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname) | |
167 os.system(cmdout) | |
168 | |
169 tmpcsmap = tempfile.NamedTemporaryFile() | |
170 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name) | |
171 os.system(cmd_cat_csmap) | |
172 | |
173 tmppileup = tempfile.NamedTemporaryFile() | |
174 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name) | |
175 os.system(cmdpileup) | |
176 tmppileup.seek(0) | |
177 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count" | |
178 for line in file(tmppileup.name): | |
179 elems = line.strip().split() | |
180 ref_nt = elems[2].capitalize() | |
181 read_nt = elems[4] | |
182 coverage = int(elems[3]) | |
183 a,t,g,c = 0,0,0,0 | |
184 ref_nt_count = 0 | |
185 for ch in read_nt: | |
186 ch = ch.capitalize() | |
187 if ch not in ['A','T','G','C',',','.']: | |
188 continue | |
189 if ch in [',','.']: | |
190 ch = ref_nt | |
191 ref_nt_count += 1 | |
192 try: | |
193 nt_ind = ['A','T','G','C'].index(ch) | |
194 if nt_ind == 0: | |
195 a+=1 | |
196 elif nt_ind == 1: | |
197 t+=1 | |
198 elif nt_ind == 2: | |
199 g+=1 | |
200 else: | |
201 c+=1 | |
202 except: | |
203 pass | |
204 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c) | |
205 except Exception, er2: | |
206 stop_err("Encountered error while mapping: %s" %(str(er2))) | |
207 | |
208 #Build custom track from pileup | |
209 chr_list=[] | |
210 out_f2.seek(0) | |
211 fcov = tempfile.NamedTemporaryFile() | |
212 fout_a = tempfile.NamedTemporaryFile() | |
213 fout_t = tempfile.NamedTemporaryFile() | |
214 fout_g = tempfile.NamedTemporaryFile() | |
215 fout_c = tempfile.NamedTemporaryFile() | |
216 fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''') | |
217 fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''') | |
218 fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''') | |
219 fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''') | |
220 fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''') | |
221 | |
222 for line in out_f2: | |
223 if line.startswith("#"): | |
224 continue | |
225 elems = line.split() | |
226 chr = elems[0] | |
227 | |
228 if chr not in chr_list: | |
229 chr_list.append(chr) | |
230 if not (chr.startswith('chr') or chr.startswith('scaffold')): | |
231 chr = 'chr' | |
232 header = "variableStep chrom=%s" %(chr) | |
233 fcov.write("%s\n" %(header)) | |
234 fout_a.write("%s\n" %(header)) | |
235 fout_t.write("%s\n" %(header)) | |
236 fout_g.write("%s\n" %(header)) | |
237 fout_c.write("%s\n" %(header)) | |
238 try: | |
239 pos = int(elems[1]) | |
240 cov = int(elems[3]) | |
241 a = int(elems[5]) | |
242 t = int(elems[6]) | |
243 g = int(elems[7]) | |
244 c = int(elems[8]) | |
245 except: | |
246 continue | |
247 fcov.write("%s\t%s\n" %(pos,cov)) | |
248 try: | |
249 a_freq = a*100./cov | |
250 t_freq = t*100./cov | |
251 g_freq = g*100./cov | |
252 c_freq = c*100./cov | |
253 except ZeroDivisionError: | |
254 a_freq=t_freq=g_freq=c_freq=0 | |
255 fout_a.write("%s\t%s\n" %(pos,a_freq)) | |
256 fout_t.write("%s\t%s\n" %(pos,t_freq)) | |
257 fout_g.write("%s\t%s\n" %(pos,g_freq)) | |
258 fout_c.write("%s\t%s\n" %(pos,c_freq)) | |
259 | |
260 fcov.seek(0) | |
261 fout_a.seek(0) | |
262 fout_g.seek(0) | |
263 fout_t.seek(0) | |
264 fout_c.seek(0) | |
265 os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name)) | |
266 | |
267 if __name__=="__main__": | |
268 __main__() | |
269 | |
270 |