Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_single_end.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 import fileinput, os, time, random, math | |
2 from bs_utils.utils import * | |
3 from bs_align_utils import * | |
4 | |
#----------------------------------------------------------------
# Read from the mapped results, return lists of unique / multiple-hit reads
# The function supposes at most 2 hits will be reported in a single file
def extract_mapping(ali_file):
    """Parse aligner output and split reads into unique and multiple hits.

    ali_file : path of one aligner output file; records are iterated via
               process_aligner_output() and are grouped by read id (header),
               with all hits of one read appearing on consecutive records.

    Returns (unique_hits, non_unique_hits):
      unique_hits     : read id -> (no_mismatch, chr, location, cigar) for the
                        single best hit of the read
      non_unique_hits : read id -> minimal mismatch count, for reads whose two
                        best hits tie on mismatches (ambiguous mapping)
    """
    unique_hits = {}
    non_unique_hits = {}

    def _classify(header, hits):
        # Classify the accumulated hits of one read id into the result dicts.
        # (Previously this logic was duplicated inside and after the loop.)
        if len(hits) == 1:
            unique_hits[header] = hits[0]  # (no_mismatch, chr, location, cigar)
        elif len(hits) > 1:
            min_hit = min(hits, key = lambda x: x[0])
            max_hit = max(hits, key = lambda x: x[0])
            if min_hit[0] < max_hit[0]:
                # strictly fewer mismatches than any competitor -> unique
                unique_hits[header] = min_hit
            else:
                # best hits tie on mismatch count -> multiple hit
                non_unique_hits[header] = min_hit[0]

    header0 = ""
    lst = []

    for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
        if header != header0:
            # new read id: first classify the previous read's hits
            # (no-op on the very first record, when lst is empty)
            _classify(header0, lst)
            header0 = header
            lst = [(no_mismatch, chr, location, cigar)]
        else: # header == header0, same header (read id)
            lst.append((no_mismatch, chr, location, cigar))

    # classify the hits of the last read in the file
    _classify(header0, lst)

    return unique_hits, non_unique_hits
48 | |
49 | |
50 def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines, | |
51 max_mismatch_no, aligner_command, db_path, tmp_path, outfile, | |
52 XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False): | |
53 #---------------------------------------------------------------- | |
54 # adapter : strand-specific or not | |
55 adapter="" | |
56 adapter_fw="" | |
57 adapter_rc="" | |
58 if adapter_file !="": | |
59 try : | |
60 adapter_inf=open(adapter_file,"r") | |
61 except IOError: | |
62 print "[Error] Cannot open adapter file : %s" % adapter_file | |
63 exit(-1) | |
64 if asktag == "N": #<--- directional library | |
65 adapter=adapter_inf.readline() | |
66 adapter_inf.close() | |
67 adapter=adapter.rstrip("\n") | |
68 elif asktag == "Y":#<--- un-directional library | |
69 adapter_fw=adapter_inf.readline() | |
70 adapter_rc=adapter_inf.readline() | |
71 adapter_inf.close() | |
72 adapter_fw=adapter_fw.rstrip("\n") | |
73 adapter_rc=adapter_rc.rstrip("\n") | |
74 adapter_inf.close() | |
75 #---------------------------------------------------------------- | |
76 | |
77 | |
78 | |
79 #---------------------------------------------------------------- | |
80 logm("Read filename: %s"% main_read_file ) | |
81 logm("Un-directional library: %s" % asktag ) | |
82 logm("The first base (for mapping): %d" % cut1) | |
83 logm("The last base (for mapping): %d" % cut2) | |
84 logm("Max. lines per mapping: %d"% no_small_lines) | |
85 logm("Aligner: %s" % aligner_command) | |
86 logm("Reference genome library path: %s" % db_path ) | |
87 logm("Number of mismatches allowed: %s" % max_mismatch_no ) | |
88 if adapter_file !="": | |
89 if asktag=="N": | |
90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n"))) | |
91 elif asktag=="Y": | |
92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) ) | |
93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) ) | |
94 #---------------------------------------------------------------- | |
95 | |
96 # helper method to join fname with tmp_path | |
97 tmp_d = lambda fname: os.path.join(tmp_path, fname) | |
98 | |
99 db_d = lambda fname: os.path.join(db_path, fname) | |
100 | |
101 #---------------------------------------------------------------- | |
102 # splitting the big read file | |
103 | |
104 input_fname = os.path.split(main_read_file)[1] | |
105 | |
106 # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines) | |
107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) | |
108 # if splitted_file.startswith("%s-s-" % input_fname)) | |
109 | |
110 #---- Stats ------------------------------------------------------------ | |
111 all_raw_reads=0 | |
112 all_trimed=0 | |
113 all_mapped=0 | |
114 all_mapped_passed=0 | |
115 | |
116 numbers_premapped_lst=[0,0,0,0] | |
117 numbers_mapped_lst=[0,0,0,0] | |
118 | |
119 mC_lst=[0,0,0] | |
120 uC_lst=[0,0,0] | |
121 | |
122 | |
123 no_my_files=0 | |
124 | |
125 #---------------------------------------------------------------- | |
126 logm("== Start mapping ==") | |
127 | |
128 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): | |
129 # for read_file in my_files: | |
130 original_bs_reads = {} | |
131 no_my_files+=1 | |
132 random_id = ".tmp-"+str(random.randint(1000000,9999999)) | |
133 | |
134 #------------------------------------------------------------------- | |
135 # undirectional sequencing | |
136 #------------------------------------------------------------------- | |
137 if asktag=="Y": | |
138 | |
139 #---------------------------------------------------------------- | |
140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) | |
141 outfile3=tmp_d('Trimmed_G2A.fa'+random_id) | |
142 | |
143 outf2=open(outfile2,'w') | |
144 outf3=open(outfile3,'w') | |
145 | |
146 #---------------------------------------------------------------- | |
147 # detect format of input file | |
148 try : | |
149 read_inf=open(read_file,"r") | |
150 except IOError : | |
151 print "[Error] Cannot open input file : %s" % read_file | |
152 exit(-1) | |
153 | |
154 oneline=read_inf.readline() | |
155 l=oneline.split() | |
156 input_format="" | |
157 if oneline[0]=="@": # fastq | |
158 input_format="fastq" | |
159 n_fastq=0 | |
160 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
161 input_format="seq" | |
162 elif len(l)==11: # qseq | |
163 input_format="qseq" | |
164 elif oneline[0]==">": # fasta | |
165 input_format="fasta" | |
166 n_fasta=0 | |
167 read_inf.close() | |
168 | |
169 #---------------------------------------------------------------- | |
170 # read sequence, remove adapter and convert | |
171 read_id="" | |
172 seq="" | |
173 seq_ready="N" | |
174 for line in fileinput.input(read_file): | |
175 l=line.split() | |
176 | |
177 if input_format=="seq": | |
178 all_raw_reads+=1 | |
179 read_id=str(all_raw_reads) | |
180 read_id=read_id.zfill(12) | |
181 seq=l[0] | |
182 seq_ready="Y" | |
183 elif input_format=="fastq": | |
184 m_fastq=math.fmod(n_fastq,4) | |
185 n_fastq+=1 | |
186 seq_ready="N" | |
187 if m_fastq==0: | |
188 all_raw_reads+=1 | |
189 read_id=str(all_raw_reads) | |
190 read_id=read_id.zfill(12) | |
191 seq="" | |
192 elif m_fastq==1: | |
193 seq=l[0] | |
194 seq_ready="Y" | |
195 else: | |
196 seq="" | |
197 elif input_format=="qseq": | |
198 all_raw_reads+=1 | |
199 read_id=str(all_raw_reads) | |
200 read_id=read_id.zfill(12) | |
201 seq=l[8] | |
202 seq_ready="Y" | |
203 elif input_format=="fasta": | |
204 m_fasta=math.fmod(n_fasta,2) | |
205 n_fasta+=1 | |
206 seq_ready="N" | |
207 if m_fasta==0: | |
208 all_raw_reads+=1 | |
209 #read_id=str(all_raw_reads) | |
210 read_id=l[0][1:] | |
211 seq="" | |
212 elif m_fasta==1: | |
213 seq=l[0] | |
214 seq_ready="Y" | |
215 else: | |
216 seq="" | |
217 | |
218 #---------------------------------------------------------------- | |
219 if seq_ready=="Y": | |
220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 | |
221 seq=seq.upper() | |
222 seq=seq.replace(".","N") | |
223 | |
224 # striping BS adapter from 3' read | |
225 if (adapter_fw !="") and (adapter_rc !="") : | |
226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) | |
227 new_read = Remove_5end_Adapter(new_read, adapter_rc) | |
228 if len(new_read) < len(seq) : | |
229 all_trimed += 1 | |
230 seq = new_read | |
231 | |
232 if len(seq)<=4: | |
233 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) | |
234 | |
235 #--------- trimmed_raw_BS_read ------------------ | |
236 original_bs_reads[read_id] = seq | |
237 | |
238 #--------- FW_C2T ------------------ | |
239 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) | |
240 #--------- RC_G2A ------------------ | |
241 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) | |
242 | |
243 fileinput.close() | |
244 | |
245 outf2.close() | |
246 outf3.close() | |
247 | |
248 delete_files(read_file) | |
249 | |
250 #-------------------------------------------------------------------------------- | |
251 # Bowtie mapping | |
252 #------------------------------------------------------------------------------- | |
253 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
254 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
255 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) | |
256 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) | |
257 | |
258 # print aligner_command % {'int_no_mismatches' : int_no_mismatches, | |
259 # 'reference_genome' : os.path.join(db_path,'W_C2T'), | |
260 # 'input_file' : outfile2, | |
261 # 'output_file' : WC2T} | |
262 | |
263 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
264 'input_file' : outfile2, | |
265 'output_file' : WC2T}, | |
266 | |
267 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
268 'input_file' : outfile2, | |
269 'output_file' : CC2T}, | |
270 | |
271 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'), | |
272 'input_file' : outfile3, | |
273 'output_file' : WG2A}, | |
274 | |
275 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'), | |
276 'input_file' : outfile3, | |
277 'output_file' : CG2A} ]) | |
278 | |
279 | |
280 delete_files(outfile2, outfile3) | |
281 | |
282 | |
283 #-------------------------------------------------------------------------------- | |
284 # Post processing | |
285 #-------------------------------------------------------------------------------- | |
286 | |
287 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) | |
288 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) | |
289 | |
290 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) | |
291 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) | |
292 | |
293 #---------------------------------------------------------------- | |
294 # get unique-hit reads | |
295 #---------------------------------------------------------------- | |
296 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) | |
297 | |
298 Unique_FW_C2T=set() # + | |
299 Unique_RC_G2A=set() # + | |
300 Unique_FW_G2A=set() # - | |
301 Unique_RC_C2T=set() # - | |
302 Multiple_hits=set() | |
303 | |
304 | |
305 for x in Union_set: | |
306 _list=[] | |
307 for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: | |
308 mis_lst=d.get(x,[99]) | |
309 mis=int(mis_lst[0]) | |
310 _list.append(mis) | |
311 for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: | |
312 mis=d.get(x,99) | |
313 _list.append(mis) | |
314 mini=min(_list) | |
315 if _list.count(mini) == 1: | |
316 mini_index=_list.index(mini) | |
317 if mini_index == 0: | |
318 Unique_FW_C2T.add(x) | |
319 elif mini_index == 1: | |
320 Unique_RC_G2A.add(x) | |
321 elif mini_index == 2: | |
322 Unique_FW_G2A.add(x) | |
323 elif mini_index == 3: | |
324 Unique_RC_C2T.add(x) | |
325 # if mini_index = 4,5,6,7, indicating multiple hits | |
326 else : | |
327 Multiple_hits.add(x) | |
328 else : | |
329 Multiple_hits.add(x) | |
330 # write reads rejected by Multiple Hits to file | |
331 if show_multiple_hit : | |
332 outf_MH=open("Multiple_hit.fa",'w') | |
333 for i in Multiple_hits : | |
334 outf_MH.write(">%s\n" % i) | |
335 outf_MH.write("%s\n" % original_bs_reads[i]) | |
336 outf_MH.close() | |
337 | |
338 del Union_set | |
339 del FW_C2T_R | |
340 del FW_G2A_R | |
341 del RC_C2T_R | |
342 del RC_G2A_R | |
343 | |
344 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | |
345 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] | |
346 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | |
347 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] | |
348 FW_C2T_uniq_lst.sort() | |
349 RC_C2T_uniq_lst.sort() | |
350 FW_G2A_uniq_lst.sort() | |
351 RC_G2A_uniq_lst.sort() | |
352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | |
353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | |
354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] | |
355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] | |
356 | |
357 del Unique_FW_C2T | |
358 del Unique_FW_G2A | |
359 del Unique_RC_C2T | |
360 del Unique_RC_G2A | |
361 | |
362 #---------------------------------------------------------------- | |
363 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
364 numbers_premapped_lst[1] += len(Unique_RC_G2A) | |
365 numbers_premapped_lst[2] += len(Unique_FW_G2A) | |
366 numbers_premapped_lst[3] += len(Unique_RC_C2T) | |
367 | |
368 | |
369 #---------------------------------------------------------------- | |
370 | |
371 nn=0 | |
372 gseq = dict() | |
373 chr_length = dict() | |
374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), | |
375 (RC_G2A_uniq_lst,RC_G2A_U), | |
376 (FW_G2A_uniq_lst,FW_G2A_U), | |
377 (RC_C2T_uniq_lst,RC_C2T_U)]: | |
378 nn += 1 | |
379 mapped_chr0 = "" | |
380 | |
381 for header in ali_unique_lst: | |
382 | |
383 _, mapped_chr, mapped_location, cigar = ali_dic[header] | |
384 | |
385 original_BS = original_bs_reads[header] | |
386 #------------------------------------- | |
387 if mapped_chr not in gseq: | |
388 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
389 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
390 | |
391 if nn == 2 or nn == 3: | |
392 cigar = list(reversed(cigar)) | |
393 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
394 | |
395 | |
396 all_mapped += 1 | |
397 | |
398 if nn == 1: # +FW mapped to + strand: | |
399 FR = "+FW" | |
400 mapped_strand="+" | |
401 | |
402 elif nn == 2: # +RC mapped to + strand: | |
403 FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+) | |
404 mapped_location = chr_length[mapped_chr] - mapped_location - g_len | |
405 mapped_strand = "+" | |
406 original_BS = reverse_compl_seq(original_BS) # for RC reads | |
407 | |
408 elif nn == 3: # -RC mapped to - strand: | |
409 mapped_strand = "-" | |
410 FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-) | |
411 original_BS = reverse_compl_seq(original_BS) # for RC reads | |
412 | |
413 elif nn == 4: # -FW mapped to - strand: | |
414 mapped_strand = "-" | |
415 FR = "-FW" | |
416 mapped_location = chr_length[mapped_chr] - mapped_location - g_len | |
417 | |
418 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) | |
419 | |
420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
421 | |
422 | |
423 if len(r_aln)==len(g_aln): | |
424 N_mismatch = N_MIS(r_aln, g_aln) | |
425 if N_mismatch <= int(max_mismatch_no): | |
426 numbers_mapped_lst[nn-1] += 1 | |
427 all_mapped_passed += 1 | |
428 methy = methy_seq(r_aln, g_aln + next) | |
429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
430 | |
431 #---XS FILTER---------------- | |
432 XS = 0 | |
433 nCH = methy.count('y') + methy.count('z') | |
434 nmCH = methy.count('Y') + methy.count('Z') | |
435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
436 XS = 1 | |
437 | |
438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | |
439 | |
440 #---------------------------------------------------------------- | |
441 logm("--> %s (%d) "%(read_file, no_my_files)) | |
442 delete_files(WC2T, WG2A, CC2T, CG2A) | |
443 | |
444 | |
445 | |
446 #-------------------------------------------------------------------- | |
447 # directional sequencing | |
448 #-------------------------------------------------------------------- | |
449 | |
450 if asktag=="N": | |
451 #---------------------------------------------------------------- | |
452 outfile2=tmp_d('Trimed_C2T.fa'+random_id) | |
453 outf2=open(outfile2,'w') | |
454 | |
455 n=0 | |
456 #---------------------------------------------------------------- | |
457 try : | |
458 read_inf=open(read_file,"r") | |
459 except IOError : | |
460 print "[Error] Cannot open input file : %s" % read_file | |
461 exit(-1) | |
462 | |
463 oneline=read_inf.readline() | |
464 l=oneline.split() | |
465 input_format="" | |
466 if oneline[0]=="@": # FastQ | |
467 input_format="fastq" | |
468 n_fastq=0 | |
469 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
470 input_format="seq" | |
471 elif len(l)==11: # Illumina GAII qseq file | |
472 input_format="qseq" | |
473 elif oneline[0]==">": # fasta | |
474 input_format="fasta" | |
475 n_fasta=0 | |
476 read_inf.close() | |
477 #print "detected data format: %s"%(input_format) | |
478 #---------------------------------------------------------------- | |
479 read_id="" | |
480 seq="" | |
481 seq_ready="N" | |
482 for line in fileinput.input(read_file): | |
483 l=line.split() | |
484 if input_format=="seq": | |
485 all_raw_reads+=1 | |
486 read_id=str(all_raw_reads) | |
487 read_id=read_id.zfill(12) | |
488 seq=l[0] | |
489 seq_ready="Y" | |
490 elif input_format=="fastq": | |
491 m_fastq=math.fmod(n_fastq,4) | |
492 n_fastq+=1 | |
493 seq_ready="N" | |
494 if m_fastq==0: | |
495 all_raw_reads+=1 | |
496 read_id=str(all_raw_reads) | |
497 read_id=read_id.zfill(12) | |
498 seq="" | |
499 elif m_fastq==1: | |
500 seq=l[0] | |
501 seq_ready="Y" | |
502 else: | |
503 seq="" | |
504 elif input_format=="qseq": | |
505 all_raw_reads+=1 | |
506 read_id=str(all_raw_reads) | |
507 read_id=read_id.zfill(12) | |
508 seq=l[8] | |
509 seq_ready="Y" | |
510 elif input_format=="fasta": | |
511 m_fasta=math.fmod(n_fasta,2) | |
512 n_fasta+=1 | |
513 seq_ready="N" | |
514 if m_fasta==0: | |
515 all_raw_reads+=1 | |
516 read_id=l[0][1:] | |
517 seq="" | |
518 elif m_fasta==1: | |
519 seq=l[0] | |
520 seq_ready="Y" | |
521 else: | |
522 seq="" | |
523 | |
524 #-------------------------------- | |
525 if seq_ready=="Y": | |
526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 | |
527 seq=seq.upper() | |
528 seq=seq.replace(".","N") | |
529 | |
530 #--striping adapter from 3' read ------- | |
531 if adapter != "": | |
532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) | |
533 if len(new_read) < len(seq) : | |
534 all_trimed += 1 | |
535 seq = new_read | |
536 | |
537 if len(seq)<=4: | |
538 seq = "N" * (cut2-cut1+1) | |
539 | |
540 #--------- trimmed_raw_BS_read ------------------ | |
541 original_bs_reads[read_id] = seq | |
542 | |
543 | |
544 #--------- FW_C2T ------------------ | |
545 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) | |
546 | |
547 fileinput.close() | |
548 | |
549 outf2.close() | |
550 delete_files(read_file) | |
551 | |
552 #-------------------------------------------------------------------------------- | |
553 # Bowtie mapping | |
554 #-------------------------------------------------------------------------------- | |
555 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
556 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
557 | |
558 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
559 'input_file' : outfile2, | |
560 'output_file' : WC2T}, | |
561 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
562 'input_file' : outfile2, | |
563 'output_file' : CC2T} ]) | |
564 | |
565 delete_files(outfile2) | |
566 | |
567 #-------------------------------------------------------------------------------- | |
568 # Post processing | |
569 #-------------------------------------------------------------------------------- | |
570 | |
571 | |
572 FW_C2T_U, FW_C2T_R = extract_mapping(WC2T) | |
573 RC_C2T_U, RC_C2T_R = extract_mapping(CC2T) | |
574 | |
575 #---------------------------------------------------------------- | |
576 # get uniq-hit reads | |
577 #---------------------------------------------------------------- | |
578 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys()) | |
579 | |
580 Unique_FW_C2T = set() # + | |
581 Unique_RC_C2T = set() # - | |
582 Multiple_hits=set() | |
583 # write reads rejected by Multiple Hits to file | |
584 | |
585 for x in Union_set: | |
586 _list=[] | |
587 for d in [FW_C2T_U,RC_C2T_U]: | |
588 mis_lst=d.get(x,[99]) | |
589 mis=int(mis_lst[0]) | |
590 _list.append(mis) | |
591 for d in [FW_C2T_R,RC_C2T_R]: | |
592 mis=d.get(x,99) | |
593 _list.append(mis) | |
594 mini=min(_list) | |
595 #print _list | |
596 if _list.count(mini)==1: | |
597 mini_index=_list.index(mini) | |
598 if mini_index==0: | |
599 Unique_FW_C2T.add(x) | |
600 elif mini_index==1: | |
601 Unique_RC_C2T.add(x) | |
602 else: | |
603 Multiple_hits.add(x) | |
604 else : | |
605 Multiple_hits.add(x) | |
606 # write reads rejected by Multiple Hits to file | |
607 if show_multiple_hit : | |
608 outf_MH=open("Multiple_hit.fa",'w') | |
609 for i in Multiple_hits : | |
610 outf_MH.write(">%s\n" % i) | |
611 outf_MH.write("%s\n" % original_bs_reads[i]) | |
612 outf_MH.close() | |
613 | |
614 | |
615 | |
616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | |
617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | |
618 FW_C2T_uniq_lst.sort() | |
619 RC_C2T_uniq_lst.sort() | |
620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | |
621 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | |
622 | |
623 | |
624 #---------------------------------------------------------------- | |
625 | |
626 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
627 numbers_premapped_lst[1] += len(Unique_RC_C2T) | |
628 | |
629 #---------------------------------------------------------------- | |
630 | |
631 nn = 0 | |
632 gseq = dict() | |
633 chr_length = dict() | |
634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: | |
635 nn += 1 | |
636 mapped_chr0 = "" | |
637 for header in ali_unique_lst: | |
638 _, mapped_chr, mapped_location, cigar = ali_dic[header] | |
639 original_BS = original_bs_reads[header] | |
640 #------------------------------------- | |
641 if mapped_chr not in gseq : | |
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
643 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
644 #if mapped_chr != mapped_chr0: | |
645 # my_gseq = deserialize(db_d(mapped_chr)) | |
646 # chr_length = len(my_gseq) | |
647 # mapped_chr0 = mapped_chr | |
648 #------------------------------------- | |
649 | |
650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
651 | |
652 all_mapped+=1 | |
653 if nn == 1: # +FW mapped to + strand: | |
654 FR = "+FW" | |
655 mapped_strand = "+" | |
656 elif nn == 2: # -FW mapped to - strand: | |
657 mapped_strand = "-" | |
658 FR = "-FW" | |
659 mapped_location = chr_length[mapped_chr] - mapped_location - g_len | |
660 | |
661 | |
662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) | |
663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
664 | |
665 if len(r_aln) == len(g_aln): | |
666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides | |
667 if N_mismatch <= int(max_mismatch_no): | |
668 numbers_mapped_lst[nn-1] += 1 | |
669 all_mapped_passed += 1 | |
670 methy = methy_seq(r_aln, g_aln+next) | |
671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
672 | |
673 #---XS FILTER---------------- | |
674 XS = 0 | |
675 nCH = methy.count('y') + methy.count('z') | |
676 nmCH = methy.count('Y') + methy.count('Z') | |
677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
678 XS = 1 | |
679 | |
680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | |
681 | |
682 #---------------------------------------------------------------- | |
683 logm("--> %s (%d) "%(read_file,no_my_files)) | |
684 delete_files(WC2T, CC2T) | |
685 | |
686 | |
687 #---------------------------------------------------------------- | |
688 | |
689 # outf.close() | |
690 delete_files(tmp_path) | |
691 | |
692 logm("Number of raw reads: %d \n"% all_raw_reads) | |
693 if all_raw_reads >0: | |
694 logm("Number of reads having adapter removed: %d \n" % all_trimed ) | |
695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | |
696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped) | |
697 if asktag=="Y": | |
698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | |
699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | |
700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) | |
701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) | |
702 elif asktag=="N": | |
703 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | |
704 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | |
705 | |
706 logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) | |
707 if asktag=="Y": | |
708 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) | |
709 logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) ) | |
710 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) ) | |
711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) | |
712 elif asktag=="N": | |
713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) | |
714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) | |
715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) | |
716 | |
717 n_CG=mC_lst[0]+uC_lst[0] | |
718 n_CHG=mC_lst[1]+uC_lst[1] | |
719 n_CHH=mC_lst[2]+uC_lst[2] | |
720 | |
721 logm("----------------------------------------------" ) | |
722 logm("Methylated C in mapped reads ") | |
723 | |
724 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0)) | |
725 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0)) | |
726 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0)) | |
727 | |
728 logm("------------------- END --------------------" ) | |
729 elapsed("=== END %s ===" % main_read_file) | |
730 | |
731 | |
732 |