comparison BSseeker2/bs_align/bs_pair_end.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
comparison
equal deleted inserted replaced
-1:000000000000 0:e6df770c0e58
1 import fileinput, os, random, math
2 from bs_utils.utils import *
3 from bs_align_utils import *
4 #----------------------------------------------------------------
def extract_mapping(ali_file):
    """
    Parse paired-end aligner output and classify each read pair.

    Consecutive hits sharing a read id (header), as yielded by
    process_aligner_output(), are grouped into a "family"; per read:

      unique_hits[header]     = (no_mismatch, chr, location1, cigar1,
                                 location2, cigar2) -- the single best hit
      non_unique_hits[header] = best mismatch count -- ambiguous best hit

    Returns (unique_hits, non_unique_hits).
    """
    unique_hits = {}
    non_unique_hits = {}

    def _flush(header, family):
        # Classify one read's accumulated hits. A hit is unique only when
        # exactly one alignment achieves the minimum mismatch count.
        # (The previous min/max comparison mislabelled ties such as
        # [2, 2, 5] as unique because min < max.)
        if len(family) == 1:
            unique_hits[header] = family[0]
        elif len(family) > 1:
            best = min(family, key = lambda x: x[0])
            n_best = sum(1 for hit in family if hit[0] == best[0])
            if n_best == 1:
                unique_hits[header] = best
            else:
                non_unique_hits[header] = best[0]

    header0 = ""
    family = []
    for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
        if header != header0:
            _flush(header0, family)   # no-op for the initial empty group
            header0 = header
            family = []
        family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
    _flush(header0, family)           # emit the final read group

    return unique_hits, non_unique_hits
42
43 def _extract_mapping(ali_file):
44 U = {}
45 R = {}
46 header0 = ""
47 family = []
48 for line in fileinput.input(ali_file):
49 l = line.split()
50 header = l[0][:-2]
51 chr = str(l[1])
52 location = int(l[2])
53 #no_hits=int(l[4])
54 #-------- mismatches -----------
55 if len(l) == 4:
56 no_mismatch = 0
57 elif len(l) == 5:
58 no_mismatch = l[4].count(":")
59 else:
60 print l
61 #------------------------
62 if header != header0:
63 #--------------------
64 if header0 != "":
65 # --- output ----
66 if len(family) == 1:
67 U[header0] = family[0]
68 else:
69 if family[0][0] < family[1][0]:
70 U[header0] = family[0]
71 elif family[1][0] < family[0][0]:
72 U[header0] = family[1]
73 else:
74 R[header0] = family[0][0]
75 family=[]
76 # ---------------
77 header0 = header
78 family = [[no_mismatch, chr, location]]
79 member = 1
80 elif header == header0:
81 if member == 1:
82 family[-1][0] += no_mismatch
83 family[-1].append(location)
84 member = 2
85 elif member == 2:
86 family.append([no_mismatch, chr, location])
87 member = 1
88 #------------------------------
89
90 fileinput.close()
91 return U, R
92
93
94 #----------------------------------------------------------------
95
def bs_pair_end(main_read_file_1,
                main_read_file_2,
                asktag,
                adapter_file,
                cut1,
                cut2, # add cut3 and cut4?
                no_small_lines,
                max_mismatch_no,
                aligner_command,
                db_path,
                tmp_path,
                outfile, XS_pct, XS_count, show_multiple_hit=False):
    """Align a paired-end bisulfite library and store passing pairs in `outfile`.

    The two read files are split into chunks, C->T / G->A converted, aligned
    against the pre-built converted genomes (W_C2T / C_C2T) with the external
    aligner, and the per-read best unique hits are post-filtered on mismatch
    count before being written via outfile.store().

    Parameters (as used below):
      main_read_file_1/2 : paths to the end-1 / end-2 read files
      asktag             : "Y" = un-directional library (4 mapping modes),
                           "N" = directional library (2 mapping modes)
      adapter_file       : optional file with 1 ("N") or 2 ("Y") adapter lines
      cut1, cut2         : 1-based first/last base of each read kept for mapping
      no_small_lines     : lines per split chunk
      max_mismatch_no    : mismatch cap (string; compared via int())
      aligner_command    : %-template run once per converted-genome/mate layout
      db_path, tmp_path  : converted-genome dir and scratch dir
      outfile            : writer object providing .store(...)
      XS_pct, XS_count   : thresholds for the XS (incomplete-conversion) flag
      show_multiple_hit  : if True, dump multi-hit read ids to Multiple_hit.fa

    Side effects: creates/deletes temp files under tmp_path, writes progress
    to stdout and summary statistics through logm()/elapsed().
    """

    #----------------------------------------------------------------
    # Load adapter sequence(s): one adapter for a directional library ("N"),
    # two (A and B) for an un-directional one ("Y").
    adapter=""
    adapterA=""
    adapterB=""
    if adapter_file !="":
        try :
            adapter_inf=open(adapter_file,"r")
            if asktag=="N": #<--- directional library
                adapter=adapter_inf.readline()
                adapter_inf.close()
                adapter=adapter.rstrip("\n")
            elif asktag=="Y":#<--- un-directional library
                adapterA=adapter_inf.readline()
                adapterB=adapter_inf.readline()
                adapter_inf.close()
                adapterA=adapterA.rstrip("\n")
                adapterB=adapterB.rstrip("\n")
        except IOError :
            print "[Error] Cannot find adapter file : %s" % adapter_file
            exit(-1)


    #----------------------------------------------------------------
    # Run-parameter banner.
    logm("End 1 filename: %s"% main_read_file_1 )
    logm("End 2 filename: %s"% main_read_file_2 )
    logm("The first base (for mapping): %d"% cut1 )
    logm("The last base (for mapping): %d"% cut2 )

    logm("-------------------------------- " )
    logm("Un-directional library: %s" % asktag )
    logm("Path for short reads aligner: %s"% aligner_command + '\n')
    logm("Reference genome library path: %s"% db_path )
    logm("Number of mismatches allowed: %s"% max_mismatch_no )

    if adapter_file !="":
        if asktag=="Y":
            logm("Adapters to be removed from 3' of the reads:" )
            logm("-- A: %s" % adapterA )
            logm("-- B: %s" % adapterB )
        elif asktag=="N":
            logm("Adapter to be removed from 3' of the reads:" )
            logm("-- %s" % adapter )

    logm("-------------------------------- " )

    #----------------------------------------------------------------

    # helper method to join fname with tmp_path
    tmp_d = lambda fname: os.path.join(tmp_path, fname)

    db_d = lambda fname: os.path.join(db_path, fname)


    #----------------------------------------------------------------
    # splitting the 2 big read files

    input_fname1 = os.path.split(main_read_file_1)[1]
    input_fname2 = os.path.split(main_read_file_2)[1]

    # TODO: run these in parallel with a subprocess
    split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
    split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)

    # Pair each end-1 chunk with the same-numbered end-2 chunk (sorted names).
    dirList=os.listdir(tmp_path)
    my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
                   sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))



    #---- Stats ------------------------------------------------------------
    all_raw_reads=0        # total input read pairs seen
    all_trimmed=0          # read ends trimmed for adapter
    all_mapped=0           # unique hits before the mismatch post-filter
    all_mapped_passed=0    # unique hits passing the mismatch post-filter

    all_unmapped=0

    numbers_premapped_lst=[0,0,0,0]   # per mapping mode, before post-filter
    numbers_mapped_lst=[0,0,0,0]      # per mapping mode, after post-filter

    mC_lst=[0,0,0]         # methylated C counts: [CG, CHG, CHH]
    uC_lst=[0,0,0]         # unmethylated C counts: [CG, CHG, CHH]

    no_my_files=0

    #----------------------------------------------------------------
    print "== Start mapping =="

    for read_file_1, read_file_2 in my_files:

        no_my_files+=1

        random_id=".tmp-"+str(random.randint(1000000,9999999))

        # Trimmed raw reads kept in memory, keyed by zero-filled seq id;
        # entries are deleted once a pair maps, so what remains is unmapped.
        original_bs_reads_1 = {}
        original_bs_reads_2 = {}
        original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]


        if asktag == "Y" :

            #----------------------------------------------------------------
            # Un-directional: both C->T (FCT) and G->A (RCT) conversions per end.
            outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
            outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
            outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id)
            outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)

            try :
                read_inf = open(tmp_d(read_file_1),"r")
            except IOError :
                print "[Error] Cannot open file : %s", tmp_d(read_file_1)
                exit(-1)
            # Sniff the input format from the first line of the end-1 chunk;
            # the same format is assumed for both ends.
            oneline = read_inf.readline()
            l = oneline.split()
            input_format = ""

            if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
                input_format="fastq"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # Illumina GAII qseq file
                input_format="qseq"
            elif oneline[0]==">": # fasta
                input_format="fasta"
                n_fasta=0
            read_inf.close()

            print "Detected data format: %s" % input_format

            #----------------------------------------------------------------
            read_file_list = [read_file_1, read_file_2]
            outfile_FCT_list = [outfile_1FCT, outfile_2FCT]
            outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
            n_list = [0, 0]


            for f in range(2):
                read_file = read_file_list[f]
                outf_FCT = open(outfile_FCT_list[f], 'w')
                outf_RCT = open(outfile_RCT_list[f], 'w')
                original_bs_reads = original_bs_reads_lst[f]
                n = n_list[f]

                seq_id = ""
                seq = ""
                seq_ready = "N"
                for line in fileinput.input(tmp_d(read_file)):
                    l=line.split()
                    if input_format=="seq":
                        n+=1
                        seq_id=str(n)
                        seq_id=seq_id.zfill(12)
                        seq=l[0]
                        seq_ready="Y"
                    elif input_format=="fastq":
                        # 4-line records; only line 2 (the sequence) is used.
                        m_fastq=math.fmod(n_fastq,4)
                        n_fastq+=1
                        seq_ready="N"
                        if m_fastq==0:
                            n+=1
                            seq_id=str(n)
                            seq_id=seq_id.zfill(12)
                            seq=""
                        elif m_fastq==1:
                            seq=l[0]
                            seq_ready="Y"
                        else:
                            seq=""
                    elif input_format=="qseq":
                        n+=1
                        seq_id=str(n)
                        seq_id=seq_id.zfill(12)
                        seq=l[8]
                        seq_ready="Y"
                    elif input_format=="fasta":
                        # 2-line records: header then sequence.
                        m_fasta=math.fmod(n_fasta,2)
                        n_fasta+=1
                        seq_ready="N"
                        if m_fasta==0:
                            n+=1
                            seq_id=l[0][1:]
                            seq_id=seq_id.zfill(17)
                            seq=""
                        elif m_fasta==1:
                            seq=l[0]
                            seq_ready="Y"
                        else:
                            seq=""
                    #----------------------------------------------------------------
                    if seq_ready=="Y":
                        seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
                        seq=seq.upper()
                        seq=seq.replace(".","N")

                        #--striping BS adapter from 3' read --------------------------------------------------------------
                        # A hit of the adapter's first 6 bases anchors the cut;
                        # the read is truncated at that position.
                        if (adapterA !="") and (adapterB !=""):
                            signature=adapterA[:6]
                            if signature in seq:
                                signature_pos=seq.index(signature)
                                if seq[signature_pos:] in adapterA:
                                    seq=seq[:signature_pos]
                                    all_trimmed+=1
                            else:
                                signature=adapterB[:6]
                                if signature in seq:
                                    signature_pos=seq.index(signature)
                                    if seq[signature_pos:] in adapterB:
                                        seq=seq[:signature_pos]
                                        all_trimmed+=1

                        # Too-short reads are masked to all-N so they cannot map.
                        if len(seq) <= 4:
                            seq = "N" * (cut2-cut1+1)
                        #--------- trimmed_raw_BS_read ------------------
                        original_bs_reads[seq_id] = seq

                        #--------- FW_C2T ------------------
                        outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T")))
                        #--------- RC_G2A ------------------
                        outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))

                n_list[f]=n

                outf_FCT.close()
                outf_RCT.close()

                fileinput.close()


            # NOTE(review): n here is the count from the last end processed
            # (end 2); presumably both ends have equal counts -- confirm.
            all_raw_reads+=n

            #--------------------------------------------------------------------------------
            # Bowtie mapping: 4 runs covering FW/RC mate layouts on both
            # converted genomes (W_C2T, C_C2T).
            #--------------------------------------------------------------------------------
            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
            WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
            CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                'input_file_1' : outfile_1FCT,
                                                'input_file_2' : outfile_2RCT,
                                                'output_file' : WC2T_fr},

                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                'input_file_1' : outfile_1FCT,
                                                'input_file_2' : outfile_2RCT,
                                                'output_file' : CC2T_fr},

                             aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                'input_file_1' : outfile_2FCT,
                                                'input_file_2' : outfile_1RCT,
                                                'output_file' : WC2T_rf},

                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                'input_file_1' : outfile_2FCT,
                                                'input_file_2' : outfile_1RCT,
                                                'output_file' : CC2T_rf}])


            delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)


            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
            FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
            RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)

            delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf)

            #----------------------------------------------------------------
            # get uniq-hit reads: a read is kept for exactly one mapping mode
            # when that mode holds the strictly smallest mismatch count
            # across all 8 candidate sources (4 unique + 4 non-unique dicts).
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())

            Unique_FW_fr_C2T=set() # +
            Unique_FW_rf_C2T=set() # +
            Unique_RC_fr_C2T=set() # -
            Unique_RC_rf_C2T=set() # -
            Multiple_hits=set()


            for x in Union_set:
                _list=[]
                for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]:
                    mis_lst=d.get(x,[99])      # 99 = sentinel "no hit"
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini)==1:
                    mini_index=_list.index(mini)
                    if mini_index==0:
                        Unique_FW_fr_C2T.add(x)
                    elif mini_index==1:
                        Unique_FW_rf_C2T.add(x)
                    elif mini_index==2:
                        Unique_RC_fr_C2T.add(x)
                    elif mini_index==3:
                        Unique_RC_rf_C2T.add(x)
                    # indexes 4-7 are non-unique sources
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)

            # write reads rejected by Multiple Hits to file
            # NOTE(review): original_bs_reads still refers to the last dict
            # bound in the f-loop (end 2), and the file is reopened (not
            # appended) per chunk -- confirm this is intended.
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_fr_R
            del FW_C2T_rf_R
            del RC_C2T_fr_R
            del RC_C2T_rf_R

            # Sort read ids by chromosome (element [1] of the hit tuple) so
            # the per-chromosome genome sequence is deserialized only once.
            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
            FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
            RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]

            FW_C2T_fr_uniq_lst.sort()
            FW_C2T_rf_uniq_lst.sort()
            RC_C2T_fr_uniq_lst.sort()
            RC_C2T_rf_uniq_lst.sort()
            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
            FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst]
            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
            RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst]
            #----------------------------------------------------------------

            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
            numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T)
            numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T)
            numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T)

            del Unique_FW_fr_C2T
            del Unique_FW_rf_C2T
            del Unique_RC_fr_C2T
            del Unique_RC_rf_C2T

            #----------------------------------------------------------------
            # Resolve each unique hit against the genome, filter on mismatch
            # count, and store. nn encodes the mapping mode: 1=+FR, 2=+RF,
            # 3=-FR, 4=-RF.

            nn = 0
            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
                                            (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
                                            (RC_C2T_fr_uniq_lst,RC_C2T_fr_U),
                                            (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
                nn += 1

                mapped_chr0 = ""
                for header in ali_unique_lst:

                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]

                    #-------------------------------------
                    # Load the chromosome sequence lazily (list is chr-sorted).
                    if mapped_chr != mapped_chr0:
                        my_gseq=deserialize(db_d(mapped_chr))
                        chr_length=len(my_gseq)
                        mapped_chr0=mapped_chr
                    #-------------------------------------
                    # Modes 1/3 aligned end1-FCT + end2-RCT; modes 2/4 the
                    # swapped layout, so the original reads swap roles too.
                    if nn == 1 or nn == 3:
                        original_BS_1 = original_bs_reads_1[header]
                        original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
                    else:
                        original_BS_1 = original_bs_reads_2[header]
                        original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])

                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)

                    all_mapped += 1

                    if nn == 1: # FW-RC mapped to + strand:
                        FR = "+FR"
                        mapped_strand_1 = "+"
                        mapped_strand_2 = "+"

                    elif nn==2: # RC-FW mapped to + strand:
                        FR = "+RF"
                        mapped_strand_1 = "+"
                        mapped_strand_2 = "+"

                    elif nn==3: # FW-RC mapped to - strand:
                        FR = "-FR"
                        # Convert a hit on the reversed genome back to Watson
                        # coordinates.
                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
                        mapped_strand_1 = "-"
                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
                        mapped_strand_2 = "-"

                    elif nn==4: # RC-FW mapped to - strand:
                        FR = "-RF"
                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
                        mapped_strand_1 = "-"
                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
                        mapped_strand_2 = "-"

                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)

                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)


                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) # mismatches in the alignment
                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) # mismatches in the alignment


                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        numbers_mapped_lst[nn-1] += 1
                        #---- unmapped -------------------------
                        # Stored pairs are removed; leftovers are the unmapped.
                        del original_bs_reads_1[header]
                        del original_bs_reads_2[header]
                        #---------------------------------------

                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)

                        mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
                        mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        # Flag reads whose CH methylation fraction exceeds the
                        # thresholds (likely incomplete bisulfite conversion).
                        XS_1 = 0
                        nCH_1 = methy_1.count('y') + methy_1.count('z')
                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
                            XS_1 = 1
                        XS_2 = 0
                        nCH_2 = methy_2.count('y') + methy_2.count('z')
                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
                            XS_2 = 1


                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
                                      output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
                                      output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)

            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
            #----------------------------------------------------------------
            # output unmapped pairs
            #----------------------------------------------------------------

            unmapped_lst=original_bs_reads_1.keys()
            unmapped_lst.sort()

            all_unmapped += len(unmapped_lst)


        if asktag=="N":

            #----------------------------------------------------------------
            # Directional: only the C->T conversion per end (end 2 is
            # reverse-complemented before conversion).
            outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id)
            outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id)

            try :
                read_inf=open(tmp_d(read_file_1),"r")
            except IOError :
                print "[Error] Cannot open file : %s", tmp_d(read_file_1)
                exit(-1)

            # Sniff the input format from the first line of the end-1 chunk.
            oneline=read_inf.readline()
            l=oneline.split()
            input_format=""

            if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
                input_format="FastQ"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="_list of sequences"
            elif len(l)==11: # Illumina GAII qseq file
                input_format="Illumina GAII qseq file"
            elif oneline[0]==">": # fasta
                input_format="fasta"
                n_fasta=0

            read_inf.close()

            print "Detected data format: %s" % input_format

            #----------------------------------------------------------------
            read_file_list=[read_file_1,read_file_2]
            outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
            n_list=[0,0]

            for f in range(2):
                read_file=read_file_list[f]
                outf_FCT=open(outfile_FCT_list[f],'w')
                original_bs_reads = original_bs_reads_lst[f]
                n=n_list[f]

                seq_id=""
                seq=""
                seq_ready="N"
                for line in fileinput.input(tmp_d(read_file)):
                    l=line.split()
                    if input_format=="old Solexa Seq file":
                        n+=1
                        seq_id=str(n)
                        seq_id=seq_id.zfill(12)
                        seq=l[4]
                        seq_ready="Y"
                    elif input_format=="_list of sequences":
                        n+=1
                        seq_id=str(n)
                        seq_id=seq_id.zfill(12)
                        seq=l[0]
                        seq_ready="Y"
                    elif input_format=="FastQ":
                        # 4-line records; only line 2 (the sequence) is used.
                        m_fastq=math.fmod(n_fastq,4)
                        n_fastq+=1
                        seq_ready="N"
                        if m_fastq==0:
                            n+=1
                            seq_id=str(n)
                            seq_id=seq_id.zfill(12)
                            seq=""
                        elif m_fastq==1:
                            seq=l[0]
                            seq_ready="Y"
                        else:
                            seq=""
                    elif input_format=="Illumina GAII qseq file":
                        n+=1
                        seq_id=str(n)
                        seq_id=seq_id.zfill(12)
                        seq=l[8]
                        seq_ready="Y"
                    elif input_format=="fasta":
                        # 2-line records: header then sequence.
                        m_fasta=math.fmod(n_fasta,2)
                        n_fasta+=1
                        seq_ready="N"
                        if m_fasta==0:
                            n+=1
                            seq_id=l[0][1:]
                            seq_id=seq_id.zfill(17)
                            seq=""
                        elif m_fasta==1:
                            seq=l[0]
                            seq_ready="Y"
                        else:
                            seq=""
                    #----------------------------------------------------------------
                    if seq_ready=="Y":
                        seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
                        seq=seq.upper()
                        seq=seq.replace(".","N")

                        #--striping BS adapter from 3' read --------------------------------------------------------------
                        # NOTE(review): directional mode loads `adapter`, but
                        # this check uses adapterA/adapterB, which are empty
                        # when asktag=="N" -- so no trimming ever happens
                        # here; presumably `adapter` was intended. Confirm.
                        if (adapterA !="") and (adapterB !=""):
                            signature=adapterA[:6]
                            if signature in seq:
                                signature_pos=seq.index(signature)
                                if seq[signature_pos:] in adapterA:
                                    seq=seq[:signature_pos]
                                    all_trimmed+=1
                            else:
                                signature=adapterB[:6]
                                if signature in seq:
                                    signature_pos=seq.index(signature)
                                    if seq[signature_pos:] in adapterB:
                                        seq=seq[:signature_pos]
                                        all_trimmed+=1

                        # Too-short reads are masked to all-N so they cannot map.
                        if len(seq) <= 4:
                            seq = "N" * (cut2-cut1+1)
                        #--------- trimmed_raw_BS_read ------------------
                        original_bs_reads[seq_id] = seq

                        #--------- FW_C2T ------------------
                        if f==0:
                            outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
                        elif f==1:
                            outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))


                n_list[f]=n
                outf_FCT.close()

                fileinput.close()


            # NOTE(review): n is the count from the last end processed
            # (end 2); presumably both ends have equal counts -- confirm.
            all_raw_reads+=n

            #--------------------------------------------------------------------------------
            # Bowtie mapping: 2 runs (Watson and Crick converted genomes).
            #--------------------------------------------------------------------------------
            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file_1' : outfile_1FCT,
                                                 'input_file_2' : outfile_2FCT,
                                                 'output_file' : WC2T_fr},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file_1' : outfile_1FCT,
                                                 'input_file_2' : outfile_2FCT,
                                                 'output_file' : CC2T_fr} ])


            delete_files(outfile_1FCT, outfile_2FCT)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)

            #----------------------------------------------------------------
            # get uniq-hit reads: keep a read for the mode with the strictly
            # smallest mismatch count across the 4 candidate sources.
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())

            Unique_FW_fr_C2T = set() # +
            Unique_RC_fr_C2T = set() # -
            Multiple_hits=set()


            for x in Union_set:
                _list = []
                for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
                    mis_lst = d.get(x,[99])    # 99 = sentinel "no hit"
                    mis = int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
                    mis = d.get(x,99)
                    _list.append(mis)
                mini = min(_list)
                if _list.count(mini) == 1:
                    mini_index = _list.index(mini)
                    if mini_index == 0:
                        Unique_FW_fr_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_fr_C2T.add(x)
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)

            # write reads rejected by Multiple Hits to file
            # NOTE(review): same caveats as the "Y" branch -- original_bs_reads
            # is the end-2 dict and the file is rewritten per chunk.
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            # Sort read ids by chromosome so each genome is loaded only once.
            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]

            FW_C2T_fr_uniq_lst.sort()
            RC_C2T_fr_uniq_lst.sort()
            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
            #----------------------------------------------------------------

            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
            numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)


            #----------------------------------------------------------------
            # Resolve hits against the genome, filter, and store.
            # nn encodes the mode: 1 = +FR (Watson), 2 = -FR (Crick).

            nn = 0
            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]:
                nn += 1
                mapped_chr0 = ""
                for header in ali_unique_lst:
                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]


                    #-------------------------------------
                    # Load the chromosome sequence lazily (list is chr-sorted).
                    if mapped_chr != mapped_chr0:
                        my_gseq = deserialize(db_d(mapped_chr))
                        chr_length = len(my_gseq)
                        mapped_chr0 = mapped_chr
                    #-------------------------------------

                    original_BS_1 = original_bs_reads_1[header]
                    original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])

                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)

                    all_mapped += 1

                    if nn == 1: # FW-RC mapped to + strand:
                        FR = "+FR"
                        mapped_strand_1 = "+"
                        mapped_strand_2 = "+"

                    elif nn == 2: # FW-RC mapped to - strand:

                        FR="-FR"
                        # Convert a hit on the reversed genome back to Watson
                        # coordinates.
                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
                        mapped_strand_1 = "-"

                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
                        mapped_strand_2 = "-"



                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)


                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)

                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) # mismatches in the alignment
                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) # mismatches in the alignment

                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):

                        numbers_mapped_lst[nn-1] += 1
                        all_mapped_passed += 1

                        #---- unmapped -------------------------
                        # Stored pairs are removed; leftovers are the unmapped.
                        del original_bs_reads_1[header]
                        del original_bs_reads_2[header]

                        #---------------------------------------
                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
                        mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
                        mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        # Flag likely incomplete bisulfite conversion.
                        XS_1 = 0
                        nCH_1 = methy_1.count('y') + methy_1.count('z')
                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
                            XS_1 = 1
                        XS_2 = 0
                        nCH_2 = methy_2.count('y') + methy_2.count('z')
                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
                            XS_2 = 1


                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
                                      output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
                                      output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)

            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
            #----------------------------------------------------------------
            # output unmapped pairs
            #----------------------------------------------------------------

            unmapped_lst=original_bs_reads_1.keys()
            unmapped_lst.sort()

            all_unmapped+=len(unmapped_lst)


    #==================================================================================================
    # Final summary.
    delete_files(tmp_path)

    logm("-------------------------------- " )
    logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )
    logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n")

    if all_raw_reads >0:

        logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n")
        if asktag=="Y":
            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
            logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
        elif asktag=="N":
            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )


        # NOTE(review): Multiple_hits here is only the set from the *last*
        # chunk processed, not a running total -- confirm intended.
        logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
        logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
        if asktag=="Y":
            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
            logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
        elif asktag=="N":
            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )

        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
        logm("O Unmapped read pairs: %d"% all_unmapped+"\n")


        n_CG=mC_lst[0]+uC_lst[0]
        n_CHG=mC_lst[1]+uC_lst[1]
        n_CHH=mC_lst[2]+uC_lst[2]

        logm("-------------------------------- " )
        logm("Methylated C in mapped reads " )
        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )

    logm("----------------------------------------------" )
    logm("------------------- END ----------------------" )
    elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))