Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_pair_end.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 import fileinput, os, random, math | |
2 from bs_utils.utils import * | |
3 from bs_align_utils import * | |
4 #---------------------------------------------------------------- | |
def extract_mapping(ali_file):
    """Collect best pair-end alignments from an aligner output file.

    Iterates the parsed aligner records (grouped by read header) and, for
    each read pair, keeps the alignment with the fewest mismatches.

    Parameters:
        ali_file : path of the aligner output file, handed to
            process_aligner_output(..., pair_end=True).

    Returns:
        (unique_hits, non_unique_hits) where
        unique_hits maps header -> (no_mismatch, chr, location1, cigar1,
                                    location2, cigar2) for reads with a
        single best alignment, and
        non_unique_hits maps header -> no_mismatch for reads whose best
        mismatch count is tied (ambiguous hits).
    """
    unique_hits = {}
    non_unique_hits = {}

    def _store_family(header, family):
        # Resolve all candidate alignments collected for one read header.
        if len(family) == 1:
            unique_hits[header] = family[0]
        elif len(family) > 1:
            min_lst = min(family, key=lambda x: x[0])
            max_lst = max(family, key=lambda x: x[0])
            # NOTE(review): only the two extremes are compared, so with 3+
            # candidates a tie *at the minimum* (e.g. mismatches [1,1,2]) is
            # still reported as unique — behavior preserved from the original.
            if min_lst[0] < max_lst[0]:
                unique_hits[header] = min_lst
            else:
                non_unique_hits[header] = min_lst[0]

    header0 = ""
    family = []
    for header, chrom, no_mismatch, location1, cigar1, location2, cigar2 in \
            process_aligner_output(ali_file, pair_end=True):
        if header != header0:
            # New read header: resolve everything gathered for the previous
            # one (no-op on the very first record, when family is empty).
            _store_family(header0, family)
            header0 = header
            family = []
        family.append((no_mismatch, chrom, location1, cigar1, location2, cigar2))

    # Flush the final read header, which the loop body never reaches.
    _store_family(header0, family)

    return unique_hits, non_unique_hits
42 | |
def _extract_mapping(ali_file):
    """Legacy parser for raw whitespace-separated bowtie hit lines.

    Superseded by extract_mapping(); kept for reference.  Consecutive lines
    sharing a read id (after stripping the 2-character mate suffix) are
    paired up as the two mates of one candidate alignment; a further line
    with the same id starts an alternative candidate for that pair.

    Parameters:
        ali_file : file name (or list of names) passed to fileinput.input().

    Returns:
        (U, R) where U maps read id -> [total_mismatches, chr, loc1, loc2]
        for pairs with a unique best candidate, and R maps read id ->
        mismatch count when the two candidates are tied.
    """
    U = {}
    R = {}

    def _store(header, family):
        # Resolve the candidate pair alignments collected for one read id.
        # NOTE(review): only the first two candidates are compared; any
        # further ones are ignored — behavior preserved from the original.
        if len(family) == 1:
            U[header] = family[0]
        elif family[0][0] < family[1][0]:
            U[header] = family[0]
        elif family[1][0] < family[0][0]:
            U[header] = family[1]
        else:
            R[header] = family[0][0]

    header0 = ""
    family = []
    member = 0
    for line in fileinput.input(ali_file):
        l = line.split()
        header = l[0][:-2]        # strip the 2-char mate suffix (e.g. "/1")
        chrom = str(l[1])
        location = int(l[2])
        #-------- mismatches -----------
        if len(l) == 4:
            no_mismatch = 0
        elif len(l) == 5:
            no_mismatch = l[4].count(":")   # one ":" per reported mismatch
        else:
            # Malformed line: report and skip it.  (The original fell
            # through here with a stale/undefined mismatch count.)
            print(l)
            continue
        #------------------------
        if header != header0:
            if header0 != "":
                _store(header0, family)
            header0 = header
            family = [[no_mismatch, chrom, location]]
            member = 1
        elif member == 1:
            # Second mate of the current candidate: accumulate mismatches
            # and record its location alongside the first mate's.
            family[-1][0] += no_mismatch
            family[-1].append(location)
            member = 2
        else:
            # Same read id again: start an alternative candidate alignment.
            family.append([no_mismatch, chrom, location])
            member = 1

    # Flush the final read id — the original version silently dropped it
    # (extract_mapping, the active variant, does perform this flush).
    if header0 != "":
        _store(header0, family)

    fileinput.close()
    return U, R
92 | |
93 | |
94 #---------------------------------------------------------------- | |
95 | |
96 def bs_pair_end(main_read_file_1, | |
97 main_read_file_2, | |
98 asktag, | |
99 adapter_file, | |
100 cut1, | |
101 cut2, # add cut3 and cut4? | |
102 no_small_lines, | |
103 max_mismatch_no, | |
104 aligner_command, | |
105 db_path, | |
106 tmp_path, | |
107 outfile, XS_pct, XS_count, show_multiple_hit=False): | |
108 | |
109 | |
110 #---------------------------------------------------------------- | |
111 adapter="" | |
112 adapterA="" | |
113 adapterB="" | |
114 if adapter_file !="": | |
115 try : | |
116 adapter_inf=open(adapter_file,"r") | |
117 if asktag=="N": #<--- directional library | |
118 adapter=adapter_inf.readline() | |
119 adapter_inf.close() | |
120 adapter=adapter.rstrip("\n") | |
121 elif asktag=="Y":#<--- un-directional library | |
122 adapterA=adapter_inf.readline() | |
123 adapterB=adapter_inf.readline() | |
124 adapter_inf.close() | |
125 adapterA=adapterA.rstrip("\n") | |
126 adapterB=adapterB.rstrip("\n") | |
127 except IOError : | |
128 print "[Error] Cannot find adapter file : %s" % adapter_file | |
129 exit(-1) | |
130 | |
131 | |
132 #---------------------------------------------------------------- | |
133 | |
134 logm("End 1 filename: %s"% main_read_file_1 ) | |
135 logm("End 2 filename: %s"% main_read_file_2 ) | |
136 logm("The first base (for mapping): %d"% cut1 ) | |
137 logm("The last base (for mapping): %d"% cut2 ) | |
138 | |
139 logm("-------------------------------- " ) | |
140 logm("Un-directional library: %s" % asktag ) | |
141 logm("Path for short reads aligner: %s"% aligner_command + '\n') | |
142 logm("Reference genome library path: %s"% db_path ) | |
143 logm("Number of mismatches allowed: %s"% max_mismatch_no ) | |
144 | |
145 if adapter_file !="": | |
146 if asktag=="Y": | |
147 logm("Adapters to be removed from 3' of the reads:" ) | |
148 logm("-- A: %s" % adapterA ) | |
149 logm("-- B: %s" % adapterB ) | |
150 elif asktag=="N": | |
151 logm("Adapter to be removed from 3' of the reads:" ) | |
152 logm("-- %s" % adapter ) | |
153 | |
154 logm("-------------------------------- " ) | |
155 | |
156 #---------------------------------------------------------------- | |
157 | |
158 # helper method to join fname with tmp_path | |
159 tmp_d = lambda fname: os.path.join(tmp_path, fname) | |
160 | |
161 db_d = lambda fname: os.path.join(db_path, fname) | |
162 | |
163 | |
164 #---------------------------------------------------------------- | |
165 # splitting the 2 big read files | |
166 | |
167 input_fname1 = os.path.split(main_read_file_1)[1] | |
168 input_fname2 = os.path.split(main_read_file_2)[1] | |
169 | |
170 # TODO: run these in parallel with a subprocess | |
171 split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines) | |
172 split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines) | |
173 | |
174 dirList=os.listdir(tmp_path) | |
175 my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)), | |
176 sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList))) | |
177 | |
178 | |
179 | |
180 #---- Stats ------------------------------------------------------------ | |
181 all_raw_reads=0 | |
182 all_trimmed=0 | |
183 all_mapped=0 | |
184 all_mapped_passed=0 | |
185 | |
186 all_unmapped=0 | |
187 | |
188 numbers_premapped_lst=[0,0,0,0] | |
189 numbers_mapped_lst=[0,0,0,0] | |
190 | |
191 mC_lst=[0,0,0] | |
192 uC_lst=[0,0,0] | |
193 | |
194 no_my_files=0 | |
195 | |
196 #---------------------------------------------------------------- | |
197 print "== Start mapping ==" | |
198 | |
199 for read_file_1, read_file_2 in my_files: | |
200 | |
201 no_my_files+=1 | |
202 | |
203 random_id=".tmp-"+str(random.randint(1000000,9999999)) | |
204 | |
205 original_bs_reads_1 = {} | |
206 original_bs_reads_2 = {} | |
207 original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2] | |
208 | |
209 | |
210 if asktag == "Y" : | |
211 | |
212 #---------------------------------------------------------------- | |
213 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) | |
214 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) | |
215 outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id) | |
216 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) | |
217 | |
218 try : | |
219 read_inf = open(tmp_d(read_file_1),"r") | |
220 except IOError : | |
221 print "[Error] Cannot open file : %s", tmp_d(read_file_1) | |
222 exit(-1) | |
223 oneline = read_inf.readline() | |
224 l = oneline.split() | |
225 input_format = "" | |
226 | |
227 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) | |
228 input_format="fastq" | |
229 n_fastq=0 | |
230 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
231 input_format="seq" | |
232 elif len(l)==11: # Illumina GAII qseq file | |
233 input_format="qseq" | |
234 elif oneline[0]==">": # fasta | |
235 input_format="fasta" | |
236 n_fasta=0 | |
237 read_inf.close() | |
238 | |
239 print "Detected data format: %s" % input_format | |
240 | |
241 #---------------------------------------------------------------- | |
242 read_file_list = [read_file_1, read_file_2] | |
243 outfile_FCT_list = [outfile_1FCT, outfile_2FCT] | |
244 outfile_RCT_list = [outfile_1RCT, outfile_2RCT] | |
245 n_list = [0, 0] | |
246 | |
247 | |
248 for f in range(2): | |
249 read_file = read_file_list[f] | |
250 outf_FCT = open(outfile_FCT_list[f], 'w') | |
251 outf_RCT = open(outfile_RCT_list[f], 'w') | |
252 original_bs_reads = original_bs_reads_lst[f] | |
253 n = n_list[f] | |
254 | |
255 seq_id = "" | |
256 seq = "" | |
257 seq_ready = "N" | |
258 for line in fileinput.input(tmp_d(read_file)): | |
259 l=line.split() | |
260 if input_format=="seq": | |
261 n+=1 | |
262 seq_id=str(n) | |
263 seq_id=seq_id.zfill(12) | |
264 seq=l[0] | |
265 seq_ready="Y" | |
266 elif input_format=="fastq": | |
267 m_fastq=math.fmod(n_fastq,4) | |
268 n_fastq+=1 | |
269 seq_ready="N" | |
270 if m_fastq==0: | |
271 n+=1 | |
272 seq_id=str(n) | |
273 seq_id=seq_id.zfill(12) | |
274 seq="" | |
275 elif m_fastq==1: | |
276 seq=l[0] | |
277 seq_ready="Y" | |
278 else: | |
279 seq="" | |
280 elif input_format=="qseq": | |
281 n+=1 | |
282 seq_id=str(n) | |
283 seq_id=seq_id.zfill(12) | |
284 seq=l[8] | |
285 seq_ready="Y" | |
286 elif input_format=="fasta": | |
287 m_fasta=math.fmod(n_fasta,2) | |
288 n_fasta+=1 | |
289 seq_ready="N" | |
290 if m_fasta==0: | |
291 n+=1 | |
292 seq_id=l[0][1:] | |
293 seq_id=seq_id.zfill(17) | |
294 seq="" | |
295 elif m_fasta==1: | |
296 seq=l[0] | |
297 seq_ready="Y" | |
298 else: | |
299 seq="" | |
300 #---------------------------------------------------------------- | |
301 if seq_ready=="Y": | |
302 seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 | |
303 seq=seq.upper() | |
304 seq=seq.replace(".","N") | |
305 | |
306 #--striping BS adapter from 3' read -------------------------------------------------------------- | |
307 if (adapterA !="") and (adapterB !=""): | |
308 signature=adapterA[:6] | |
309 if signature in seq: | |
310 signature_pos=seq.index(signature) | |
311 if seq[signature_pos:] in adapterA: | |
312 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
313 all_trimmed+=1 | |
314 else: | |
315 signature=adapterB[:6] | |
316 if signature in seq: | |
317 #print seq_id,seq,signature; | |
318 signature_pos=seq.index(signature) | |
319 if seq[signature_pos:] in adapterB: | |
320 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
321 all_trimmed+=1 | |
322 | |
323 if len(seq) <= 4: | |
324 seq = "N" * (cut2-cut1+1) | |
325 #--------- trimmed_raw_BS_read ------------------ | |
326 original_bs_reads[seq_id] = seq | |
327 | |
328 #--------- FW_C2T ------------------ | |
329 outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T"))) | |
330 #--------- RC_G2A ------------------ | |
331 outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) | |
332 | |
333 n_list[f]=n | |
334 | |
335 outf_FCT.close() | |
336 outf_RCT.close() | |
337 | |
338 fileinput.close() | |
339 | |
340 | |
341 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); | |
342 all_raw_reads+=n | |
343 | |
344 #-------------------------------------------------------------------------------- | |
345 # Bowtie mapping | |
346 #-------------------------------------------------------------------------------- | |
347 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
348 WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) | |
349 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
350 CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) | |
351 | |
352 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
353 'input_file_1' : outfile_1FCT, | |
354 'input_file_2' : outfile_2RCT, | |
355 'output_file' : WC2T_fr}, | |
356 | |
357 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
358 'input_file_1' : outfile_1FCT, | |
359 'input_file_2' : outfile_2RCT, | |
360 'output_file' : CC2T_fr}, | |
361 | |
362 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
363 'input_file_1' : outfile_2FCT, | |
364 'input_file_2' : outfile_1RCT, | |
365 'output_file' : WC2T_rf}, | |
366 | |
367 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
368 'input_file_1' : outfile_2FCT, | |
369 'input_file_2' : outfile_1RCT, | |
370 'output_file' : CC2T_rf}]) | |
371 | |
372 | |
373 delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT) | |
374 | |
375 | |
376 #-------------------------------------------------------------------------------- | |
377 # Post processing | |
378 #-------------------------------------------------------------------------------- | |
379 | |
380 | |
381 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) | |
382 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) | |
383 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) | |
384 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) | |
385 | |
386 delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf) | |
387 | |
388 #---------------------------------------------------------------- | |
389 # get uniq-hit reads | |
390 #---------------------------------------------------------------- | |
391 Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) | |
392 | |
393 Unique_FW_fr_C2T=set() # + | |
394 Unique_FW_rf_C2T=set() # + | |
395 Unique_RC_fr_C2T=set() # - | |
396 Unique_RC_rf_C2T=set() # - | |
397 Multiple_hits=set() | |
398 | |
399 | |
400 for x in Union_set: | |
401 _list=[] | |
402 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]: | |
403 mis_lst=d.get(x,[99]) | |
404 mis=int(mis_lst[0]) | |
405 _list.append(mis) | |
406 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]: | |
407 mis=d.get(x,99) | |
408 _list.append(mis) | |
409 mini=min(_list) | |
410 if _list.count(mini)==1: | |
411 mini_index=_list.index(mini) | |
412 if mini_index==0: | |
413 Unique_FW_fr_C2T.add(x) | |
414 elif mini_index==1: | |
415 Unique_FW_rf_C2T.add(x) | |
416 elif mini_index==2: | |
417 Unique_RC_fr_C2T.add(x) | |
418 elif mini_index==3: | |
419 Unique_RC_rf_C2T.add(x) | |
420 else : | |
421 Multiple_hits.add(x) | |
422 else : | |
423 Multiple_hits.add(x) | |
424 | |
425 # write reads rejected by Multiple Hits to file | |
426 if show_multiple_hit : | |
427 outf_MH=open("Multiple_hit.fa",'w') | |
428 for i in Multiple_hits : | |
429 outf_MH.write(">%s\n" % i) | |
430 outf_MH.write("%s\n" % original_bs_reads[i]) | |
431 outf_MH.close() | |
432 | |
433 del Union_set | |
434 del FW_C2T_fr_R | |
435 del FW_C2T_rf_R | |
436 del RC_C2T_fr_R | |
437 del RC_C2T_rf_R | |
438 | |
439 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] | |
440 FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] | |
441 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] | |
442 RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] | |
443 | |
444 FW_C2T_fr_uniq_lst.sort() | |
445 FW_C2T_rf_uniq_lst.sort() | |
446 RC_C2T_fr_uniq_lst.sort() | |
447 RC_C2T_rf_uniq_lst.sort() | |
448 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] | |
449 FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst] | |
450 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] | |
451 RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst] | |
452 #---------------------------------------------------------------- | |
453 | |
454 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) | |
455 numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T) | |
456 numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T) | |
457 numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T) | |
458 | |
459 del Unique_FW_fr_C2T | |
460 del Unique_FW_rf_C2T | |
461 del Unique_RC_fr_C2T | |
462 del Unique_RC_rf_C2T | |
463 | |
464 #---------------------------------------------------------------- | |
465 | |
466 nn = 0 | |
467 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), | |
468 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), | |
469 (RC_C2T_fr_uniq_lst,RC_C2T_fr_U), | |
470 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: | |
471 nn += 1 | |
472 | |
473 mapped_chr0 = "" | |
474 for header in ali_unique_lst: | |
475 | |
476 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] | |
477 | |
478 #------------------------------------- | |
479 if mapped_chr != mapped_chr0: | |
480 my_gseq=deserialize(db_d(mapped_chr)) | |
481 chr_length=len(my_gseq) | |
482 mapped_chr0=mapped_chr | |
483 #------------------------------------- | |
484 if nn == 1 or nn == 3: | |
485 original_BS_1 = original_bs_reads_1[header] | |
486 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) | |
487 else: | |
488 original_BS_1 = original_bs_reads_2[header] | |
489 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) | |
490 | |
491 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) | |
492 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) | |
493 | |
494 all_mapped += 1 | |
495 | |
496 if nn == 1: # FW-RC mapped to + strand: | |
497 | |
498 FR = "+FR" | |
499 | |
500 # mapped_location_1 += 1 | |
501 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
502 # origin_genome_1 = origin_genome_long_1[2:-2] | |
503 mapped_strand_1 = "+" | |
504 | |
505 # mapped_location_2 += 1 | |
506 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
507 # origin_genome_2 = origin_genome_long_2[2:-2] | |
508 mapped_strand_2 = "+" | |
509 | |
510 elif nn==2: # RC-FW mapped to + strand: | |
511 | |
512 # original_BS_1 = original_bs_reads_2[header] | |
513 # original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) | |
514 FR = "+RF" | |
515 # mapped_location_1 += 1 | |
516 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
517 # origin_genome_1 = origin_genome_long_1[2:-2] | |
518 mapped_strand_1 = "+" | |
519 | |
520 # mapped_location_2 += 1 | |
521 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
522 # origin_genome_2 = origin_genome_long_2[2:-2] | |
523 mapped_strand_2 = "+" | |
524 | |
525 | |
526 elif nn==3: # FW-RC mapped to - strand: | |
527 # original_BS_1=original_bs_reads_1[header] | |
528 # original_BS_2=reverse_compl_seq(original_bs_reads_2[header]) | |
529 | |
530 FR = "-FR" | |
531 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | |
532 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
533 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
534 # origin_genome_1 = origin_genome_long_1[2:-2] | |
535 mapped_strand_1 = "-" | |
536 | |
537 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | |
538 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ]) | |
539 # origin_genome_2 = origin_genome_long_2[2:-2] | |
540 mapped_strand_2 = "-" | |
541 | |
542 elif nn==4: # RC-FW mapped to - strand: | |
543 # original_BS_1=original_bs_reads_2[header] | |
544 # original_BS_2=reverse_compl_seq(original_bs_reads_1[header]) | |
545 | |
546 FR = "-RF" | |
547 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | |
548 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
549 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
550 # origin_genome_1 = origin_genome_long_1[2:-2] | |
551 mapped_strand_1 = "-" | |
552 | |
553 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | |
554 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) | |
555 # origin_genome_2 = origin_genome_long_2[2:-2] | |
556 mapped_strand_2 = "-" | |
557 | |
558 | |
559 | |
560 | |
561 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) | |
562 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) | |
563 | |
564 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) | |
565 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) | |
566 | |
567 | |
568 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides | |
569 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides | |
570 | |
571 | |
572 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) : | |
573 all_mapped_passed += 1 | |
574 numbers_mapped_lst[nn-1] += 1 | |
575 #---- unmapped ------------------------- | |
576 del original_bs_reads_1[header] | |
577 del original_bs_reads_2[header] | |
578 #--------------------------------------- | |
579 | |
580 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] | |
581 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] | |
582 | |
583 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) | |
584 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) | |
585 | |
586 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) | |
587 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) | |
588 | |
589 | |
590 # | |
591 #---XS FILTER---------------- | |
592 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
593 XS_1 = 0 | |
594 nCH_1 = methy_1.count('y') + methy_1.count('z') | |
595 nmCH_1 = methy_1.count('Y') + methy_1.count('Z') | |
596 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : | |
597 XS_1 = 1 | |
598 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
599 XS_2 = 0 | |
600 nCH_2 = methy_2.count('y') + methy_2.count('z') | |
601 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') | |
602 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : | |
603 XS_2 = 1 | |
604 | |
605 | |
606 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, | |
607 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) | |
608 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | |
609 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) | |
610 | |
611 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) | |
612 #---------------------------------------------------------------- | |
613 # output unmapped pairs | |
614 #---------------------------------------------------------------- | |
615 | |
616 unmapped_lst=original_bs_reads_1.keys() | |
617 unmapped_lst.sort() | |
618 | |
619 # for u in unmapped_lst: | |
620 # outf_u1.write("%s\n"%original_bs_reads_1[u]) | |
621 # outf_u2.write("%s\n"%original_bs_reads_2[u]) | |
622 | |
623 all_unmapped += len(unmapped_lst) | |
624 | |
625 | |
626 if asktag=="N": | |
627 | |
628 #---------------------------------------------------------------- | |
629 outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id) | |
630 outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id) | |
631 | |
632 try : | |
633 read_inf=open(tmp_d(read_file_1),"r") | |
634 except IOError : | |
635 print "[Error] Cannot open file : %s", tmp_d(read_file_1) | |
636 exit(-1) | |
637 | |
638 oneline=read_inf.readline() | |
639 l=oneline.split() | |
640 input_format="" | |
641 | |
642 #if len(l)==5: # old solexa format | |
643 # input_format="old Solexa Seq file" | |
644 | |
645 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) | |
646 input_format="FastQ" | |
647 n_fastq=0 | |
648 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
649 input_format="_list of sequences" | |
650 elif len(l)==11: # Illumina GAII qseq file | |
651 input_format="Illumina GAII qseq file" | |
652 elif oneline[0]==">": # fasta | |
653 input_format="fasta" | |
654 n_fasta=0 | |
655 | |
656 read_inf.close() | |
657 | |
658 print "Detected data format: %s" % input_format | |
659 | |
660 #---------------------------------------------------------------- | |
661 read_file_list=[read_file_1,read_file_2] | |
662 outfile_FCT_list=[outfile_1FCT,outfile_2FCT] | |
663 n_list=[0,0] | |
664 | |
665 for f in range(2): | |
666 read_file=read_file_list[f] | |
667 outf_FCT=open(outfile_FCT_list[f],'w') | |
668 original_bs_reads = original_bs_reads_lst[f] | |
669 n=n_list[f] | |
670 | |
671 seq_id="" | |
672 seq="" | |
673 seq_ready="N" | |
674 for line in fileinput.input(tmp_d(read_file)): | |
675 l=line.split() | |
676 if input_format=="old Solexa Seq file": | |
677 n+=1 | |
678 seq_id=str(n) | |
679 seq_id=seq_id.zfill(12) | |
680 seq=l[4] | |
681 seq_ready="Y" | |
682 elif input_format=="_list of sequences": | |
683 n+=1 | |
684 seq_id=str(n) | |
685 seq_id=seq_id.zfill(12) | |
686 seq=l[0] | |
687 seq_ready="Y" | |
688 elif input_format=="FastQ": | |
689 m_fastq=math.fmod(n_fastq,4) | |
690 n_fastq+=1 | |
691 seq_ready="N" | |
692 if m_fastq==0: | |
693 n+=1 | |
694 seq_id=str(n) | |
695 seq_id=seq_id.zfill(12) | |
696 seq="" | |
697 elif m_fastq==1: | |
698 seq=l[0] | |
699 seq_ready="Y" | |
700 else: | |
701 seq="" | |
702 elif input_format=="Illumina GAII qseq file": | |
703 n+=1 | |
704 seq_id=str(n) | |
705 seq_id=seq_id.zfill(12) | |
706 seq=l[8] | |
707 seq_ready="Y" | |
708 elif input_format=="fasta": | |
709 m_fasta=math.fmod(n_fasta,2) | |
710 n_fasta+=1 | |
711 seq_ready="N" | |
712 if m_fasta==0: | |
713 n+=1 | |
714 seq_id=l[0][1:] | |
715 seq_id=seq_id.zfill(17) | |
716 seq="" | |
717 elif m_fasta==1: | |
718 seq=l[0] | |
719 seq_ready="Y" | |
720 else: | |
721 seq="" | |
722 #---------------------------------------------------------------- | |
723 if seq_ready=="Y": | |
724 seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 | |
725 seq=seq.upper() | |
726 seq=seq.replace(".","N") | |
727 | |
728 #--striping BS adapter from 3' read -------------------------------------------------------------- | |
729 if (adapterA !="") and (adapterB !=""): | |
730 signature=adapterA[:6] | |
731 if signature in seq: | |
732 signature_pos=seq.index(signature) | |
733 if seq[signature_pos:] in adapterA: | |
734 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
735 all_trimmed+=1 | |
736 else: | |
737 signature=adapterB[:6] | |
738 if signature in seq: | |
739 #print seq_id,seq,signature; | |
740 signature_pos=seq.index(signature) | |
741 if seq[signature_pos:] in adapterB: | |
742 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
743 all_trimmed+=1 | |
744 | |
745 if len(seq) <= 4: | |
746 seq = "N" * (cut2-cut1+1) | |
747 #--------- trimmed_raw_BS_read ------------------ | |
748 original_bs_reads[seq_id] = seq | |
749 | |
750 #--------- FW_C2T ------------------ | |
751 if f==0: | |
752 outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T"))) | |
753 elif f==1: | |
754 outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T"))) | |
755 | |
756 | |
757 n_list[f]=n | |
758 outf_FCT.close() | |
759 | |
760 fileinput.close() | |
761 | |
762 | |
763 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); | |
764 all_raw_reads+=n | |
765 | |
766 #-------------------------------------------------------------------------------- | |
767 # Bowtie mapping | |
768 #-------------------------------------------------------------------------------- | |
769 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
770 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
771 | |
772 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
773 'input_file_1' : outfile_1FCT, | |
774 'input_file_2' : outfile_2FCT, | |
775 'output_file' : WC2T_fr}, | |
776 | |
777 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
778 'input_file_1' : outfile_1FCT, | |
779 'input_file_2' : outfile_2FCT, | |
780 'output_file' : CC2T_fr} ]) | |
781 | |
782 | |
783 delete_files(outfile_1FCT, outfile_2FCT) | |
784 | |
785 #-------------------------------------------------------------------------------- | |
786 # Post processing | |
787 #-------------------------------------------------------------------------------- | |
788 | |
789 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) | |
790 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) | |
791 | |
792 #---------------------------------------------------------------- | |
793 # get uniq-hit reads | |
794 #---------------------------------------------------------------- | |
795 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | |
796 | |
797 Unique_FW_fr_C2T = set() # + | |
798 Unique_RC_fr_C2T = set() # - | |
799 Multiple_hits=set() | |
800 | |
801 | |
802 for x in Union_set: | |
803 _list = [] | |
804 for d in [FW_C2T_fr_U, RC_C2T_fr_U]: | |
805 mis_lst = d.get(x,[99]) | |
806 mis = int(mis_lst[0]) | |
807 _list.append(mis) | |
808 for d in [FW_C2T_fr_R, RC_C2T_fr_R]: | |
809 mis = d.get(x,99) | |
810 _list.append(mis) | |
811 mini = min(_list) | |
812 if _list.count(mini) == 1: | |
813 mini_index = _list.index(mini) | |
814 if mini_index == 0: | |
815 Unique_FW_fr_C2T.add(x) | |
816 elif mini_index == 1: | |
817 Unique_RC_fr_C2T.add(x) | |
818 else : | |
819 Multiple_hits.add(x) | |
820 else : | |
821 Multiple_hits.add(x) | |
822 | |
823 # write reads rejected by Multiple Hits to file | |
824 if show_multiple_hit : | |
825 outf_MH=open("Multiple_hit.fa",'w') | |
826 for i in Multiple_hits : | |
827 outf_MH.write(">%s\n" % i) | |
828 outf_MH.write("%s\n" % original_bs_reads[i]) | |
829 outf_MH.close() | |
830 | |
831 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] | |
832 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] | |
833 | |
834 FW_C2T_fr_uniq_lst.sort() | |
835 RC_C2T_fr_uniq_lst.sort() | |
836 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] | |
837 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] | |
838 #---------------------------------------------------------------- | |
839 | |
840 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) | |
841 numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T) | |
842 | |
843 | |
844 #---------------------------------------------------------------- | |
845 | |
# Post-filtering pass over the uniquely premapped pairs, once per mapping
# direction.  nn tells the directions apart: nn == 1 is the FW-RC pair on
# the Watson (+) strand, nn == 2 the FW-RC pair on the Crick (-) strand.
846 nn = 0 | |
847 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]: | |
848 nn += 1 | |
849 mapped_chr0 = "" | |
850 for header in ali_unique_lst: | |
# Unique-hit tuple: (no_mismatch, chr, location1, cigar1, location2, cigar2).
851 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] | |
852 | |
853 | |
854 #------------------------------------- | |
# Lazily (re)load the chromosome sequence; the header lists were sorted
# by chromosome above, so this triggers once per chromosome per list.
855 if mapped_chr != mapped_chr0: | |
856 my_gseq = deserialize(db_d(mapped_chr)) | |
857 chr_length = len(my_gseq) | |
858 mapped_chr0 = mapped_chr | |
859 #------------------------------------- | |
860 | |
# Mate 2 is reverse-complemented so both mates are compared against the
# same reference orientation.
861 original_BS_1 = original_bs_reads_1[header] | |
862 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) | |
863 | |
864 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) | |
865 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) | |
866 | |
867 all_mapped += 1 | |
868 | |
869 if nn == 1: # FW-RC mapped to + strand: | |
870 FR = "+FR" | |
871 # mapped_location_1 += 1 | |
872 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
873 # origin_genome_1 = origin_genome_long_1[2:-2] | |
874 mapped_strand_1 = "+" | |
875 | |
876 # mapped_location_2 += 1 | |
877 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
878 # origin_genome_2 = origin_genome_long_2[2:-2] | |
879 mapped_strand_2 = "+" | |
880 | |
881 elif nn == 2: # FW-RC mapped to - strand: | |
882 | |
883 FR="-FR" | |
# Crick-strand hits were found on the reverse-complemented genome;
# convert each coordinate back into forward-genome space:
# chr_length - location - aligned_genome_length.
884 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | |
885 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
886 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
887 # origin_genome_1 = origin_genome_long_1[2:-2] | |
888 mapped_strand_1 = "-" | |
889 | |
890 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | |
891 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) | |
892 # origin_genome_2 = origin_genome_long_2[2:-2] | |
893 mapped_strand_2 = "-" | |
894 | |
895 | |
896 | |
# Fetch the reference slice for each mate; "next_*" is presumably the
# genomic context following the alignment, used below for methylation
# calling — confirm against get_genomic_sequence in bs_align_utils.
897 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) | |
898 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) | |
899 | |
900 | |
# Expand each CIGAR into aligned read/genome strings, then count
# mismatches (the commented-out term would also have penalised
# soft-clipped bases).
901 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) | |
902 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) | |
903 | |
904 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides | |
905 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides | |
906 | |
# Keep the pair only if BOTH ends pass the mismatch ceiling.  The int()
# cast suggests max_mismatch_no may arrive as a string — TODO confirm.
907 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): | |
908 | |
909 numbers_mapped_lst[nn-1] += 1 | |
910 all_mapped_passed += 1 | |
911 | |
#---- unmapped -------------------------
# Pairs that pass are removed here; whatever is left in
# original_bs_reads_1/_2 afterwards is counted as unmapped below.
913 del original_bs_reads_1[header] | |
914 del original_bs_reads_2[header] | |
915 | |
916 #--------------------------------------- | |
917 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] | |
918 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] | |
# Call the per-base methylation string for each mate and fold the
# methylated/unmethylated cytosine counts into the global tallies.
919 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) | |
920 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) | |
921 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) | |
922 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst) | |
923 | |
924 # | |
925 #---XS FILTER---------------- | |
# XS flag per mate: set to 1 when the count of methylated non-CpG
# calls (Y/Z) exceeds XS_count AND their fraction exceeds XS_pct.
# The short-circuit `and` keeps the denominator positive (assumes
# XS_count >= 0 — TODO confirm).
926 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
927 XS_1 = 0 | |
928 nCH_1 = methy_1.count('y') + methy_1.count('z') | |
929 nmCH_1 = methy_1.count('Y') + methy_1.count('Z') | |
930 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : | |
931 XS_1 = 1 | |
932 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
933 XS_2 = 0 | |
934 nCH_2 = methy_2.count('y') + methy_2.count('z') | |
935 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') | |
936 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : | |
937 XS_2 = 1 | |
938 | |
939 | |
# Emit both mates; rnext/pnext cross-reference the mate's chromosome
# and position, SAM-style.
940 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, | |
941 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) | |
942 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | |
943 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) | |
944 | |
# Per-batch progress line (Python 2 print statement).
945 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) | |
946 #---------------------------------------------------------------- | |
947 # output unmapped pairs | |
948 #---------------------------------------------------------------- | |
949 | |
# Everything still present in original_bs_reads_1 failed the mismatch
# filter (passing pairs were deleted in the loop above).  dict.keys()
# returns a plain list in Python 2, so in-place sort works.
950 unmapped_lst=original_bs_reads_1.keys() | |
951 unmapped_lst.sort() | |
952 | |
953 # for u in unmapped_lst: | |
954 # outf_u1.write("%s\n"%(original_bs_reads_1[u])) | |
955 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) | |
956 | |
957 all_unmapped+=len(unmapped_lst) | |
959 | |
960 #================================================================================================== | |
961 # outf.close() | |
962 # | |
963 # outf_u1.close() | |
964 # outf_u2.close() | |
# Remove the temporary working directory/files before reporting.
965 delete_files(tmp_path) | |
966 | |
# Final run statistics.  Note the pattern  logm("...%d" % x + "\n"):
# the % operator binds tighter than +, so the newline is appended after
# formatting — intentional, if unusual.
967 logm("-------------------------------- " ) | |
968 logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) ) | |
969 logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n") | |
970 | |
# Guard: skip ratio reporting entirely when no reads were processed.
971 if all_raw_reads >0: | |
972 | |
973 logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n") | |
# asktag "Y": four direction counters (directional + non-directional
# libraries); asktag "N": only the two FW-RC counters are populated.
974 if asktag=="Y": | |
975 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | |
976 logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | |
977 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) | |
978 logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) | |
979 elif asktag=="N": | |
980 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | |
981 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | |
982 | |
983 | |
984 logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | |
985 logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) | |
986 if asktag=="Y": | |
987 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) | |
988 logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) | |
989 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) | |
990 logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) | |
991 elif asktag=="N": | |
992 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) | |
993 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) | |
994 | |
995 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) | |
996 logm("O Unmapped read pairs: %d"% all_unmapped+"\n") | |
997 | |
998 | |
# Totals per cytosine context (index 0=CpG, 1=CHG, 2=CHH, as used by
# mcounts); percentages below guard each denominator against zero.
999 n_CG=mC_lst[0]+uC_lst[0] | |
1000 n_CHG=mC_lst[1]+uC_lst[1] | |
1001 n_CHH=mC_lst[2]+uC_lst[2] | |
1002 | |
1003 logm("-------------------------------- " ) | |
1004 logm("Methylated C in mapped reads " ) | |
1005 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) ) | |
1006 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) ) | |
1007 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) ) | |
1008 | |
1009 logm("----------------------------------------------" ) | |
1010 logm("------------------- END ----------------------" ) | |
# Final timing line for this read-file pair.
1011 elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2)) | |