0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3 #MAQ mapper for SOLiD colourspace-reads
|
|
4
|
|
5 import sys, os, zipfile, tempfile, subprocess
|
|
6
|
|
7 def stop_err( msg ):
|
|
8 sys.stderr.write( "%s\n" % msg )
|
|
9 sys.exit()
|
|
10
|
|
11 def __main__():
|
|
12
|
|
13 out_fname = sys.argv[1].strip()
|
|
14 out_f2 = open(sys.argv[2].strip(),'r+')
|
|
15 ref_fname = sys.argv[3].strip()
|
|
16 f3_read_fname = sys.argv[4].strip()
|
|
17 f3_qual_fname = sys.argv[5].strip()
|
|
18 paired = sys.argv[6]
|
|
19 if paired == 'yes':
|
|
20 r3_read_fname = sys.argv[7].strip()
|
|
21 r3_qual_fname = sys.argv[8].strip()
|
|
22 min_mapqual = int(sys.argv[9].strip())
|
|
23 max_mismatch = int(sys.argv[10].strip())
|
|
24 out_f3name = sys.argv[11].strip()
|
|
25 subprocess_dict = {}
|
|
26
|
|
27 ref_csfa = tempfile.NamedTemporaryFile()
|
|
28 ref_bfa = tempfile.NamedTemporaryFile()
|
|
29 ref_csbfa = tempfile.NamedTemporaryFile()
|
|
30 cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name)
|
|
31 cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name)
|
|
32 cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name)
|
|
33 try:
|
|
34 os.system(cmd2_1)
|
|
35 os.system(cmd2_2)
|
|
36 os.system(cmd2_3)
|
|
37 except Exception, erf:
|
|
38 stop_err(str(erf)+"Error processing reference sequence")
|
|
39
|
|
40 if paired == 'yes': #paired end reads
|
|
41 tmpf = tempfile.NamedTemporaryFile() #forward reads
|
|
42 tmpr = tempfile.NamedTemporaryFile() #reverse reads
|
|
43 tmps = tempfile.NamedTemporaryFile() #single reads
|
|
44 tmpffastq = tempfile.NamedTemporaryFile()
|
|
45 tmprfastq = tempfile.NamedTemporaryFile()
|
|
46 tmpsfastq = tempfile.NamedTemporaryFile()
|
|
47
|
|
48 cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname)
|
|
49 try:
|
|
50 os.system(cmd1)
|
|
51 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name))
|
|
52 os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name))
|
|
53 os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name))
|
|
54
|
|
55 except Exception, eq:
|
|
56 stop_err("Error converting data to fastq format." + str(eq))
|
|
57
|
|
58 #make a temp directory where the split fastq files will be stored
|
|
59 try:
|
|
60 split_dir = tempfile.mkdtemp()
|
|
61 split_file_prefix_f = tempfile.mktemp(dir=split_dir)
|
|
62 split_file_prefix_r = tempfile.mktemp(dir=split_dir)
|
|
63 splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads
|
|
64 splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads
|
|
65
|
|
66 os.system(splitcmd_f)
|
|
67 os.system(splitcmd_r)
|
|
68 os.chdir(split_dir)
|
|
69 ii = 0
|
|
70 for fastq in os.listdir(split_dir):
|
|
71 if not fastq.startswith(split_file_prefix_f.split("/")[-1]):
|
|
72 continue
|
|
73 fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to formward strand fastq
|
|
74 tmpbfq_f = tempfile.NamedTemporaryFile()
|
|
75 tmpbfq_r = tempfile.NamedTemporaryFile()
|
|
76 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq)
|
|
77 subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
|
|
78 ii += 1
|
|
79 while True:
|
|
80 all_done = True
|
|
81 for j,k in enumerate(subprocess_dict.keys()):
|
|
82 if subprocess_dict['sp'+str(j+1)].wait() != 0:
|
|
83 err = subprocess_dict['sp'+str(j+1)].communicate()[1]
|
|
84 if err != None:
|
|
85 stop_err("Mapping error: %s" %err)
|
|
86 all_done = False
|
|
87 if all_done:
|
|
88 break
|
|
89 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
|
|
90 os.system(cmdout)
|
|
91
|
|
92 tmpcsmap = tempfile.NamedTemporaryFile()
|
|
93 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
|
|
94 os.system(cmd_cat_csmap)
|
|
95
|
|
96 tmppileup = tempfile.NamedTemporaryFile()
|
|
97 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
|
|
98 os.system(cmdpileup)
|
|
99 tmppileup.seek(0)
|
|
100 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
|
|
101 for line in file(tmppileup.name):
|
|
102 elems = line.strip().split()
|
|
103 ref_nt = elems[2].capitalize()
|
|
104 read_nt = elems[4]
|
|
105 coverage = int(elems[3])
|
|
106 a,t,g,c = 0,0,0,0
|
|
107 ref_nt_count = 0
|
|
108 for ch in read_nt:
|
|
109 ch = ch.capitalize()
|
|
110 if ch not in ['A','T','G','C',',','.']:
|
|
111 continue
|
|
112 if ch in [',','.']:
|
|
113 ch = ref_nt
|
|
114 ref_nt_count += 1
|
|
115 try:
|
|
116 nt_ind = ['A','T','G','C'].index(ch)
|
|
117 if nt_ind == 0:
|
|
118 a+=1
|
|
119 elif nt_ind == 1:
|
|
120 t+=1
|
|
121 elif nt_ind == 2:
|
|
122 g+=1
|
|
123 else:
|
|
124 c+=1
|
|
125 except ValueError, we:
|
|
126 print >>sys.stderr, we
|
|
127 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
|
|
128 except Exception, er2:
|
|
129 stop_err("Encountered error while mapping: %s" %(str(er2)))
|
|
130
|
|
131
|
|
132 else: #single end reads
|
|
133 tmpf = tempfile.NamedTemporaryFile()
|
|
134 tmpfastq = tempfile.NamedTemporaryFile()
|
|
135 cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,None,None,f3_read_fname,f3_qual_fname,None,None)
|
|
136 try:
|
|
137 os.system(cmd1)
|
|
138 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpfastq.name))
|
|
139 tmpf.close()
|
|
140 except:
|
|
141 stop_err("Error converting data to fastq format.")
|
|
142
|
|
143 #make a temp directory where the split fastq files will be stored
|
|
144 try:
|
|
145 split_dir = tempfile.mkdtemp()
|
|
146 split_file_prefix = tempfile.mktemp(dir=split_dir)
|
|
147 splitcmd = 'split -a 2 -l %d %s %s' %(32000000,tmpfastq.name,split_file_prefix) #32M lines correspond to 8M reads
|
|
148 os.system(splitcmd)
|
|
149 os.chdir(split_dir)
|
|
150 for i,fastq in enumerate(os.listdir(split_dir)):
|
|
151 tmpbfq = tempfile.NamedTemporaryFile()
|
|
152 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq.name,fastq,ref_csbfa.name,tmpbfq.name,fastq,fastq)
|
|
153 subprocess_dict['sp'+str(i+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
|
|
154
|
|
155 while True:
|
|
156 all_done = True
|
|
157 for j,k in enumerate(subprocess_dict.keys()):
|
|
158 if subprocess_dict['sp'+str(j+1)].wait() != 0:
|
|
159 err = subprocess_dict['sp'+str(j+1)].communicate()[1]
|
|
160 if err != None:
|
|
161 stop_err("Mapping error: %s" %err)
|
|
162 all_done = False
|
|
163 if all_done:
|
|
164 break
|
|
165
|
|
166 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
|
|
167 os.system(cmdout)
|
|
168
|
|
169 tmpcsmap = tempfile.NamedTemporaryFile()
|
|
170 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
|
|
171 os.system(cmd_cat_csmap)
|
|
172
|
|
173 tmppileup = tempfile.NamedTemporaryFile()
|
|
174 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
|
|
175 os.system(cmdpileup)
|
|
176 tmppileup.seek(0)
|
|
177 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
|
|
178 for line in file(tmppileup.name):
|
|
179 elems = line.strip().split()
|
|
180 ref_nt = elems[2].capitalize()
|
|
181 read_nt = elems[4]
|
|
182 coverage = int(elems[3])
|
|
183 a,t,g,c = 0,0,0,0
|
|
184 ref_nt_count = 0
|
|
185 for ch in read_nt:
|
|
186 ch = ch.capitalize()
|
|
187 if ch not in ['A','T','G','C',',','.']:
|
|
188 continue
|
|
189 if ch in [',','.']:
|
|
190 ch = ref_nt
|
|
191 ref_nt_count += 1
|
|
192 try:
|
|
193 nt_ind = ['A','T','G','C'].index(ch)
|
|
194 if nt_ind == 0:
|
|
195 a+=1
|
|
196 elif nt_ind == 1:
|
|
197 t+=1
|
|
198 elif nt_ind == 2:
|
|
199 g+=1
|
|
200 else:
|
|
201 c+=1
|
|
202 except:
|
|
203 pass
|
|
204 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
|
|
205 except Exception, er2:
|
|
206 stop_err("Encountered error while mapping: %s" %(str(er2)))
|
|
207
|
|
208 #Build custom track from pileup
|
|
209 chr_list=[]
|
|
210 out_f2.seek(0)
|
|
211 fcov = tempfile.NamedTemporaryFile()
|
|
212 fout_a = tempfile.NamedTemporaryFile()
|
|
213 fout_t = tempfile.NamedTemporaryFile()
|
|
214 fout_g = tempfile.NamedTemporaryFile()
|
|
215 fout_c = tempfile.NamedTemporaryFile()
|
|
216 fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''')
|
|
217 fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''')
|
|
218 fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''')
|
|
219 fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''')
|
|
220 fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''')
|
|
221
|
|
222 for line in out_f2:
|
|
223 if line.startswith("#"):
|
|
224 continue
|
|
225 elems = line.split()
|
|
226 chr = elems[0]
|
|
227
|
|
228 if chr not in chr_list:
|
|
229 chr_list.append(chr)
|
|
230 if not (chr.startswith('chr') or chr.startswith('scaffold')):
|
|
231 chr = 'chr'
|
|
232 header = "variableStep chrom=%s" %(chr)
|
|
233 fcov.write("%s\n" %(header))
|
|
234 fout_a.write("%s\n" %(header))
|
|
235 fout_t.write("%s\n" %(header))
|
|
236 fout_g.write("%s\n" %(header))
|
|
237 fout_c.write("%s\n" %(header))
|
|
238 try:
|
|
239 pos = int(elems[1])
|
|
240 cov = int(elems[3])
|
|
241 a = int(elems[5])
|
|
242 t = int(elems[6])
|
|
243 g = int(elems[7])
|
|
244 c = int(elems[8])
|
|
245 except:
|
|
246 continue
|
|
247 fcov.write("%s\t%s\n" %(pos,cov))
|
|
248 try:
|
|
249 a_freq = a*100./cov
|
|
250 t_freq = t*100./cov
|
|
251 g_freq = g*100./cov
|
|
252 c_freq = c*100./cov
|
|
253 except ZeroDivisionError:
|
|
254 a_freq=t_freq=g_freq=c_freq=0
|
|
255 fout_a.write("%s\t%s\n" %(pos,a_freq))
|
|
256 fout_t.write("%s\t%s\n" %(pos,t_freq))
|
|
257 fout_g.write("%s\t%s\n" %(pos,g_freq))
|
|
258 fout_c.write("%s\t%s\n" %(pos,c_freq))
|
|
259
|
|
260 fcov.seek(0)
|
|
261 fout_a.seek(0)
|
|
262 fout_g.seek(0)
|
|
263 fout_t.seek(0)
|
|
264 fout_c.seek(0)
|
|
265 os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name))
|
|
266
|
|
267 if __name__=="__main__":
|
|
268 __main__()
|
|
269
|
|
270
|