Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc draft default tip
V2.0.5
author | weilong-guo |
---|---|
date | Tue, 05 Nov 2013 01:55:39 -0500 |
parents | e6df770c0e58 |
children |
comparison
equal
deleted
inserted
replaced
0:e6df770c0e58 | 1:8b26adf64adc |
---|---|
40 show_multiple_hit=False): | 40 show_multiple_hit=False): |
41 #---------------------------------------------------------------- | 41 #---------------------------------------------------------------- |
42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC" | 42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC" |
43 #cut_context = re.sub("-", "", cut_format) | 43 #cut_context = re.sub("-", "", cut_format) |
44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC" | 44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC" |
45 """ | 45 |
46 | |
47 :param main_read_file: | |
48 :param asktag: | |
49 :param adapter_file: | |
50 :param cut_s: | |
51 :param cut_e: | |
52 :param no_small_lines: | |
53 :param max_mismatch_no: | |
54 :param aligner_command: | |
55 :param db_path: | |
56 :param tmp_path: | |
57 :param outfile: | |
58 :param XS_pct: | |
59 :param XS_count: | |
60 :param adapter_mismatch: | |
61 :param cut_format: | |
62 """ | |
63 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC'] | 46 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC'] |
64 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC'] | 47 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC'] |
65 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G'] | 48 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G'] |
66 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC'] | 49 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC'] |
67 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5] | 50 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5] |
104 adapter_inf.close() | 87 adapter_inf.close() |
105 except IOError: | 88 except IOError: |
106 print "[Error] Cannot find adapter file : %s !" % adapter_file | 89 print "[Error] Cannot find adapter file : %s !" % adapter_file |
107 exit(-1) | 90 exit(-1) |
108 | 91 |
109 logm("I Read filename: %s" % main_read_file) | 92 logm("Read filename: %s" % main_read_file) |
110 logm("I The last cycle (for mapping): %d" % cut_e ) | 93 logm("The first base (for mapping): %d"% cut_s ) |
111 logm("I Bowtie path: %s" % aligner_command ) | 94 logm("The last base (for mapping): %d"% cut_e ) |
112 logm("I Reference genome library path: %s" % db_path ) | 95 |
113 logm("I Number of mismatches allowed: %s" % max_mismatch_no) | 96 logm("Path for short reads aligner: %s"% aligner_command + '\n') |
114 logm("I Adapter seq: %s" % whole_adapter_seq) | 97 logm("Reference genome library path: %s"% db_path ) |
115 logm("----------------------------------------------") | 98 |
99 if asktag == "Y" : | |
100 logm("Un-directional library" ) | |
101 else : | |
102 logm("Directional library") | |
103 | |
104 logm("Number of mismatches allowed: %s"% max_mismatch_no ) | |
105 | |
106 if adapter_file !="": | |
107 logm("Adapter seq: %s" % whole_adapter_seq) | |
108 logm("-------------------------------- " ) | |
116 | 109 |
117 #---------------------------------------------------------------- | 110 #---------------------------------------------------------------- |
118 all_raw_reads=0 | 111 all_raw_reads=0 |
119 all_tagged=0 | 112 all_tagged=0 |
120 all_tagged_trimmed=0 | 113 all_tagged_trimmed=0 |
114 all_base_before_trim=0 | |
115 all_base_after_trim=0 | |
116 all_base_mapped=0 | |
121 all_mapped=0 | 117 all_mapped=0 |
122 all_mapped_passed=0 | 118 all_mapped_passed=0 |
123 n_cut_tag_lst={} | 119 n_cut_tag_lst={} |
124 #print cut3_tag_lst | 120 #print cut3_tag_lst |
125 for x in cut3_tag_lst: | 121 for x in cut3_tag_lst: |
133 num_mapped_FW_C2T = 0 | 129 num_mapped_FW_C2T = 0 |
134 num_mapped_RC_C2T = 0 | 130 num_mapped_RC_C2T = 0 |
135 num_mapped_FW_G2A = 0 | 131 num_mapped_FW_G2A = 0 |
136 num_mapped_RC_G2A = 0 | 132 num_mapped_RC_G2A = 0 |
137 | 133 |
134 # Count of nucleotides, which should be cut before the adapters | |
135 Extra_base_cut_5end_adapter = max([ abs(len(i)-len(j)) for i,j in zip(cut5_context, cut3_context)]) | |
136 | |
138 #=============================================== | 137 #=============================================== |
139 # directional sequencing | 138 # directional sequencing |
140 #=============================================== | 139 #=============================================== |
141 | 140 |
142 if asktag=="N" : | 141 if asktag=="N" : |
154 | 153 |
155 outf2=open(outfile2,'w') | 154 outf2=open(outfile2,'w') |
156 | 155 |
157 #--- Checking input format ------------------------------------------ | 156 #--- Checking input format ------------------------------------------ |
158 try : | 157 try : |
159 read_inf=open(read_file,"r") | 158 if read_file.endswith(".gz") : # support input file ending with ".gz" |
159 read_inf = gzip.open(read_file, "rb") | |
160 else : | |
161 read_inf=open(read_file,"r") | |
160 except IOError: | 162 except IOError: |
161 print "[Error] Cannot open input file : %s" % read_file | 163 print "[Error] Cannot open input file : %s" % read_file |
162 exit(-1) | 164 exit(-1) |
163 | 165 |
164 oneline=read_inf.readline() | 166 oneline = read_inf.readline() |
165 l=oneline.split() | 167 l = oneline.split() |
166 n_fastq=0 | 168 input_format = "" |
167 n_fasta=0 | 169 if oneline[0]=="@": |
168 input_format="" | 170 input_format = "fastq" |
169 if oneline[0]=="@": # FastQ | 171 elif len(l)==1 and oneline[0]!=">" : |
170 input_format="fastq" | 172 input_format = "seq" |
171 elif len(l)==1 and oneline[0]!=">": # pure sequences | 173 elif len(l)==11: |
172 input_format="seq" | 174 input_format = "qseq" |
173 elif len(l)==11: # Illumina qseq | 175 elif oneline[0]==">" : |
174 input_format="qseq" | 176 input_format = "fasta" |
175 elif oneline[0]==">": # fasta | |
176 input_format="fasta" | |
177 read_inf.close() | 177 read_inf.close() |
178 | 178 |
179 | |
179 #---------------------------------------------------------------- | 180 #---------------------------------------------------------------- |
180 seq_id="" | 181 read_id = "" |
181 seq="" | 182 seq = "" |
182 seq_ready=0 | 183 seq_ready = 0 |
183 for line in fileinput.input(read_file): | 184 line_no = 0 |
184 l=line.split() | 185 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): |
185 | 186 l = line.split() |
187 line_no += 1 | |
186 if input_format=="seq": | 188 if input_format=="seq": |
187 all_raw_reads+=1 | 189 all_raw_reads += 1 |
188 seq_id=str(all_raw_reads) | 190 read_id = str(all_raw_reads) |
189 seq_id=seq_id.zfill(12) | 191 read_id = read_id.zfill(12) |
190 seq=l[0] | 192 seq = l[0] |
191 seq_ready="Y" | 193 seq_ready = "Y" |
192 elif input_format=="fastq": | 194 elif input_format=="fastq": |
193 m_fastq=math.fmod(n_fastq,4) | 195 l_fastq = math.fmod(line_no, 4) |
194 n_fastq+=1 | 196 if l_fastq == 1 : |
195 seq_ready="N" | 197 all_raw_reads += 1 |
196 if m_fastq==0: | 198 read_id = l[0][1:] |
197 all_raw_reads+=1 | 199 seq_ready = "N" |
198 seq_id=str(all_raw_reads) | 200 elif l_fastq == 2 : |
199 seq_id=seq_id.zfill(12) | 201 seq = l[0] |
200 seq="" | 202 seq_ready = "Y" |
201 elif m_fastq==1: | 203 else : |
202 seq=l[0] | 204 seq = "" |
203 seq_ready="Y" | 205 seq_ready = "N" |
204 else: | |
205 seq="" | |
206 elif input_format=="qseq": | 206 elif input_format=="qseq": |
207 all_raw_reads+=1 | 207 all_raw_reads += 1 |
208 seq_id=str(all_raw_reads) | 208 read_id = str(all_raw_reads) |
209 seq_id=seq_id.zfill(12) | 209 read_id = read_id.zfill(12) |
210 seq=l[8] | 210 seq = l[8] |
211 seq_ready="Y" | 211 seq_ready = "Y" |
212 elif input_format=="fasta": | 212 elif input_format=="fasta" : |
213 m_fasta=math.fmod(n_fasta,2) | 213 l_fasta = math.fmod(line_no,2) |
214 n_fasta+=1 | 214 if l_fasta==1: |
215 seq_ready="N" | 215 all_raw_reads += 1 |
216 if m_fasta==0: | 216 read_id = l[0][1:] |
217 all_raw_reads+=1 | 217 seq = "" |
218 seq_id=l[0][1:] | 218 seq_ready = "N" |
219 seq="" | 219 elif l_fasta==0 : |
220 elif m_fasta==1: | 220 seq = l[0] |
221 seq=l[0] | 221 seq_ready = "Y" |
222 seq_ready="Y" | 222 |
223 else: | |
224 seq="" | |
225 #--------------------------------------------------------------- | 223 #--------------------------------------------------------------- |
226 if seq_ready=="Y": | 224 if seq_ready=="Y": |
227 # Normalize the characters | 225 # Normalize the characters |
228 seq=seq.upper().replace(".","N") | 226 seq=seq.upper().replace(".","N") |
229 | 227 |
234 n_cut_tag_lst[i] += 1 | 232 n_cut_tag_lst[i] += 1 |
235 | 233 |
236 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default | 234 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default |
237 | 235 |
238 #-- Trimming adapter sequence --- | 236 #-- Trimming adapter sequence --- |
237 | |
238 all_base_before_trim += len(seq) | |
239 if adapter_seq != "" : | 239 if adapter_seq != "" : |
240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) | 240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter) |
241 if len(new_read) < len(seq) : | 241 if len(new_read) < len(seq) : |
242 all_tagged_trimmed += 1 | 242 all_tagged_trimmed += 1 |
243 seq = new_read | 243 seq = new_read |
244 all_base_after_trim += len(seq) | |
244 if len(seq) <= 4 : | 245 if len(seq) <= 4 : |
245 seq = "N" * (cut_e - cut_s) | 246 seq = "N" * (cut_e - cut_s) |
246 | 247 |
247 # all reads will be considered, regardless of tags | 248 # all reads will be considered, regardless of tags |
248 #--------- trimmed_raw_BS_read and qscore ------------------ | 249 #--------- trimmed_raw_BS_read and qscore ------------------ |
249 original_bs_reads[seq_id] = seq | 250 original_bs_reads[read_id] = seq |
250 #--------- FW_C2T ------------------ | 251 #--------- FW_C2T ------------------ |
251 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) | 252 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T'))) |
252 fileinput.close() | 253 fileinput.close() |
253 | 254 |
254 outf2.close() | 255 outf2.close() |
255 | 256 |
256 delete_files(read_file) | 257 delete_files(read_file) |
382 # print "[For debug]: +FW read still can not find fragment serial" | 383 # print "[For debug]: +FW read still can not find fragment serial" |
383 # Tip: sometimes "my_region_serial" is still 0 ... | 384 # Tip: sometimes "my_region_serial" is still 0 ... |
384 | 385 |
385 | 386 |
386 N_mismatch = N_MIS(r_aln, g_aln) | 387 N_mismatch = N_MIS(r_aln, g_aln) |
387 if N_mismatch <= int(max_mismatch_no) : | 388 # if N_mismatch <= int(max_mismatch_no) : |
389 mm_no=float(max_mismatch_no) | |
390 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
388 all_mapped_passed += 1 | 391 all_mapped_passed += 1 |
389 methy = methy_seq(r_aln, g_aln + next2bp) | 392 methy = methy_seq(r_aln, g_aln + next2bp) |
390 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 393 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
391 #---XS FILTER---------------- | 394 #---XS FILTER---------------- |
392 XS = 0 | 395 XS = 0 |
400 output_genome = output_genome, | 403 output_genome = output_genome, |
401 rrbs = True, | 404 rrbs = True, |
402 my_region_serial = my_region_serial, | 405 my_region_serial = my_region_serial, |
403 my_region_start = my_region_start, | 406 my_region_start = my_region_start, |
404 my_region_end = my_region_end) | 407 my_region_end = my_region_end) |
408 all_base_mapped += len(original_BS) | |
405 else : | 409 else : |
406 print "[For debug]: reads not in same lengths" | 410 print "[For debug]: reads not in same lengths" |
407 | 411 |
408 #print "start RC" | 412 #print "start RC" |
409 # ---- RC ---- | 413 # ---- RC ---- |
451 # print "[For debug]: chr=", mapped_chr | 455 # print "[For debug]: chr=", mapped_chr |
452 # print "[For debug]: -FW read still cannot find fragment serial" | 456 # print "[For debug]: -FW read still cannot find fragment serial" |
453 | 457 |
454 | 458 |
455 N_mismatch = N_MIS(r_aln, g_aln) | 459 N_mismatch = N_MIS(r_aln, g_aln) |
456 if N_mismatch <= int(max_mismatch_no) : | 460 # if N_mismatch <= int(max_mismatch_no) : |
461 mm_no=float(max_mismatch_no) | |
462 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
457 all_mapped_passed += 1 | 463 all_mapped_passed += 1 |
458 methy = methy_seq(r_aln, g_aln + next2bp) | 464 methy = methy_seq(r_aln, g_aln + next2bp) |
459 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 465 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
460 #---XS FILTER---------------- | 466 #---XS FILTER---------------- |
461 XS = 0 | 467 XS = 0 |
469 output_genome = output_genome, | 475 output_genome = output_genome, |
470 rrbs = True, | 476 rrbs = True, |
471 my_region_serial = my_region_serial, | 477 my_region_serial = my_region_serial, |
472 my_region_start = my_region_start, | 478 my_region_start = my_region_start, |
473 my_region_end = my_region_end) | 479 my_region_end = my_region_end) |
480 all_base_mapped += len(original_BS) | |
474 else : | 481 else : |
475 print "[For debug]: reads not in same lengths" | 482 print "[For debug]: reads not in same lengths" |
476 | 483 |
477 | 484 |
478 # Finished both FW and RC | 485 # Finished both FW and RC |
505 outf2=open(outfile2,'w') | 512 outf2=open(outfile2,'w') |
506 outf3=open(outfile3,'w') | 513 outf3=open(outfile3,'w') |
507 | 514 |
508 #--- Checking input format ------------------------------------------ | 515 #--- Checking input format ------------------------------------------ |
509 try : | 516 try : |
510 read_inf=open(read_file,"r") | 517 if read_file.endswith(".gz") : # support input file ending with ".gz" |
518 read_inf = gzip.open(read_file, "rb") | |
519 else : | |
520 read_inf=open(read_file,"r") | |
511 except IOError: | 521 except IOError: |
512 print "[Error] Cannot open input file : %s" % read_file | 522 print "[Error] Cannot open input file : %s" % read_file |
513 exit(-1) | 523 exit(-1) |
514 | 524 |
515 oneline=read_inf.readline() | 525 oneline = read_inf.readline() |
516 l=oneline.split() | 526 l = oneline.split() |
517 n_fastq=0 | 527 input_format = "" |
518 n_fasta=0 | 528 if oneline[0]=="@": |
519 input_format="" | 529 input_format = "fastq" |
520 if oneline[0]=="@": # FastQ | 530 elif len(l)==1 and oneline[0]!=">" : |
521 input_format="fastq" | 531 input_format = "seq" |
522 elif len(l)==1 and oneline[0]!=">": # pure sequences | 532 elif len(l)==11: |
523 input_format="seq" | 533 input_format = "qseq" |
524 elif len(l)==11: # Illumina qseq | 534 elif oneline[0]==">" : |
525 input_format="qseq" | 535 input_format = "fasta" |
526 elif oneline[0]==">": # fasta | |
527 input_format="fasta" | |
528 read_inf.close() | 536 read_inf.close() |
529 | 537 |
530 #---------------------------------------------------------------- | 538 #---------------------------------------------------------------- |
531 seq_id = "" | 539 read_id = "" |
532 seq = "" | 540 seq = "" |
533 seq_ready=0 | 541 seq_ready = 0 |
534 for line in fileinput.input(read_file): | 542 line_no = 0 |
535 l=line.split() | 543 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): |
536 | 544 l = line.split() |
537 if input_format == "seq": | 545 line_no += 1 |
538 all_raw_reads+=1 | 546 if input_format=="seq": |
539 seq_id=str(all_raw_reads) | 547 all_raw_reads += 1 |
540 seq_id=seq_id.zfill(12) | 548 read_id = str(all_raw_reads) |
541 seq=l[0] | 549 read_id = read_id.zfill(12) |
542 seq_ready="Y" | 550 seq = l[0] |
551 seq_ready = "Y" | |
543 elif input_format=="fastq": | 552 elif input_format=="fastq": |
544 m_fastq=math.fmod(n_fastq,4) | 553 l_fastq = math.fmod(line_no, 4) |
545 n_fastq+=1 | 554 if l_fastq == 1 : |
546 seq_ready="N" | 555 all_raw_reads += 1 |
547 if m_fastq==0: | 556 read_id = l[0][1:] |
548 all_raw_reads+=1 | 557 seq_ready = "N" |
549 seq_id=str(all_raw_reads) | 558 elif l_fastq == 2 : |
550 seq_id=seq_id.zfill(12) | 559 seq = l[0] |
551 seq="" | 560 seq_ready = "Y" |
552 elif m_fastq==1: | 561 else : |
553 seq=l[0] | 562 seq = "" |
554 seq_ready="Y" | 563 seq_ready = "N" |
555 else: | |
556 seq="" | |
557 elif input_format=="qseq": | 564 elif input_format=="qseq": |
558 all_raw_reads+=1 | 565 all_raw_reads += 1 |
559 seq_id=str(all_raw_reads) | 566 read_id = str(all_raw_reads) |
560 seq_id=seq_id.zfill(12) | 567 read_id = read_id.zfill(12) |
561 seq=l[8] | 568 seq = l[8] |
562 seq_ready="Y" | 569 seq_ready = "Y" |
563 elif input_format=="fasta": | 570 elif input_format=="fasta" : |
564 m_fasta=math.fmod(n_fasta,2) | 571 l_fasta = math.fmod(line_no,2) |
565 n_fasta+=1 | 572 if l_fasta==1: |
566 seq_ready="N" | 573 all_raw_reads += 1 |
567 if m_fasta==0: | 574 read_id = l[0][1:] |
568 all_raw_reads+=1 | 575 seq = "" |
569 seq_id=l[0][1:] | 576 seq_ready = "N" |
570 seq="" | 577 elif l_fasta==0 : |
571 elif m_fasta==1: | 578 seq = l[0] |
572 seq=l[0] | 579 seq_ready = "Y" |
573 seq_ready="Y" | 580 |
574 else: | 581 #--------------------------------------------------------------- |
575 seq="" | |
576 #--------------------------------------------------------------- | |
577 if seq_ready=="Y": | 582 if seq_ready=="Y": |
578 # Normalize the characters | 583 # Normalize the characters |
579 seq=seq.upper().replace(".","N") | 584 seq = seq.upper().replace(".","N") |
580 | 585 |
581 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] | 586 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] |
582 if len(read_tag) > 0 : | 587 if len(read_tag) > 0 : |
583 all_tagged += 1 | 588 all_tagged += 1 |
584 for i in read_tag : | 589 for i in read_tag : |
585 n_cut_tag_lst[i] += 1 | 590 n_cut_tag_lst[i] += 1 |
586 | 591 |
587 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default | 592 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default |
588 | 593 |
589 #-- Trimming adapter sequence --- | 594 #-- Trimming adapter sequence --- |
595 all_base_before_trim += len(seq) | |
590 if adapter_seq != "" : | 596 if adapter_seq != "" : |
591 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) | 597 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter) |
592 if len(new_read) < len(seq) : | 598 if len(new_read) < len(seq) : |
593 all_tagged_trimmed += 1 | 599 all_tagged_trimmed += 1 |
594 seq = new_read | 600 seq = new_read |
601 all_base_after_trim += len(seq) | |
595 if len(seq) <= 4 : | 602 if len(seq) <= 4 : |
596 seq = "N" * (cut_e - cut_s) | 603 seq = "N" * (cut_e - cut_s) |
597 | 604 |
598 # all reads will be considered, regardless of tags | 605 # all reads will be considered, regardless of tags |
599 #--------- trimmed_raw_BS_read and qscore ------------------ | 606 #--------- trimmed_raw_BS_read and qscore ------------------ |
600 original_bs_reads[seq_id] = seq | 607 original_bs_reads[read_id] = seq |
601 #--------- FW_C2T ------------------ | 608 #--------- FW_C2T ------------------ |
602 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) | 609 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T'))) |
603 #--------- RC_G2A ------------------ | 610 #--------- RC_G2A ------------------ |
604 outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) | 611 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) |
605 fileinput.close() | 612 fileinput.close() |
606 | 613 |
607 outf2.close() | 614 outf2.close() |
608 | 615 |
609 delete_files(read_file) | 616 delete_files(read_file) |
610 logm("Processing input is done") | 617 logm("Processing input is done") |
611 #-------------------------------------------------------------------------------- | 618 #-------------------------------------------------------------------------------- |
612 | 619 |
613 # mapping | 620 # mapping |
614 #-------------------------------------------------------------------------------- | 621 #-------------------------------------------------------------------------------- |
615 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) | 622 WC2T = tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) |
616 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) | 623 CC2T = tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) |
617 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) | 624 WG2A = tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) |
618 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) | 625 CG2A = tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) |
619 | 626 |
620 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | 627 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), |
621 'input_file' : outfile2, | 628 'input_file' : outfile2, |
622 'output_file' : WC2T}, | 629 'output_file' : WC2T}, |
623 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | 630 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), |
636 | 643 |
637 #-------------------------------------------------------------------------------- | 644 #-------------------------------------------------------------------------------- |
638 # Post processing | 645 # Post processing |
639 #-------------------------------------------------------------------------------- | 646 #-------------------------------------------------------------------------------- |
640 | 647 |
641 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) | 648 FW_C2T_U,FW_C2T_R = extract_mapping(WC2T) |
642 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) | 649 RC_G2A_U,RC_G2A_R = extract_mapping(CG2A) |
643 | 650 |
644 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) | 651 FW_G2A_U,FW_G2A_R = extract_mapping(WG2A) |
645 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) | 652 RC_C2T_U,RC_C2T_R = extract_mapping(CC2T) |
646 | 653 |
647 logm("Extracting alignments is done") | 654 logm("Extracting alignments is done") |
648 | 655 |
649 #---------------------------------------------------------------- | 656 #---------------------------------------------------------------- |
650 # get unique-hit reads | 657 # get unique-hit reads |
651 #---------------------------------------------------------------- | 658 #---------------------------------------------------------------- |
652 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) | 659 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) |
653 | 660 |
654 Unique_FW_C2T=set() # + | 661 Unique_FW_C2T = set() # + |
655 Unique_RC_G2A=set() # + | 662 Unique_RC_G2A = set() # + |
656 Unique_FW_G2A=set() # - | 663 Unique_FW_G2A = set() # - |
657 Unique_RC_C2T=set() # - | 664 Unique_RC_C2T = set() # - |
658 Multiple_hits=set() | 665 Multiple_hits = set() |
659 | 666 |
660 for x in Union_set: | 667 for x in Union_set : |
661 _list=[] | 668 _list = [] |
662 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: | 669 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: |
663 mis_lst=dx.get(x,[99]) | 670 mis_lst = dx.get(x,[99]) |
664 mis=int(mis_lst[0]) | 671 mis = int(mis_lst[0]) |
665 _list.append(mis) | 672 _list.append(mis) |
666 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: | 673 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: |
667 mis=dx.get(x,99) | 674 mis = dx.get(x,99) |
668 _list.append(mis) | 675 _list.append(mis) |
669 mini=min(_list) | 676 mini = min(_list) |
670 if _list.count(mini) == 1: | 677 if _list.count(mini) == 1: |
671 mini_index=_list.index(mini) | 678 mini_index = _list.index(mini) |
672 if mini_index == 0: | 679 if mini_index == 0: |
673 Unique_FW_C2T.add(x) | 680 Unique_FW_C2T.add(x) |
674 elif mini_index == 1: | 681 elif mini_index == 1: |
675 Unique_RC_G2A.add(x) | 682 Unique_RC_G2A.add(x) |
676 elif mini_index == 2: | 683 elif mini_index == 2: |
681 Multiple_hits.add(x) | 688 Multiple_hits.add(x) |
682 else : | 689 else : |
683 Multiple_hits.add(x) | 690 Multiple_hits.add(x) |
684 # write reads rejected by Multiple Hits to file | 691 # write reads rejected by Multiple Hits to file |
685 if show_multiple_hit : | 692 if show_multiple_hit : |
686 outf_MH=open("Multiple_hit.fa",'w') | 693 outf_MH = open("Multiple_hit.fa",'w') |
687 for i in Multiple_hits : | 694 for i in Multiple_hits : |
688 outf_MH.write(">%s\n" % i) | 695 outf_MH.write(">%s\n" % i) |
689 outf_MH.write("%s\n" % original_bs_reads[i]) | 696 outf_MH.write("%s\n" % original_bs_reads[i]) |
690 outf_MH.close() | 697 outf_MH.close() |
691 | 698 |
693 del FW_C2T_R | 700 del FW_C2T_R |
694 del FW_G2A_R | 701 del FW_G2A_R |
695 del RC_C2T_R | 702 del RC_C2T_R |
696 del RC_G2A_R | 703 del RC_G2A_R |
697 | 704 |
698 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | 705 FW_C2T_uniq_lst = [[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] |
699 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] | 706 FW_G2A_uniq_lst = [[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] |
700 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | 707 RC_C2T_uniq_lst = [[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] |
701 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] | 708 RC_G2A_uniq_lst = [[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] |
702 FW_C2T_uniq_lst.sort() | 709 FW_C2T_uniq_lst.sort() |
703 RC_C2T_uniq_lst.sort() | 710 RC_C2T_uniq_lst.sort() |
704 FW_G2A_uniq_lst.sort() | 711 FW_G2A_uniq_lst.sort() |
705 RC_G2A_uniq_lst.sort() | 712 RC_G2A_uniq_lst.sort() |
706 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | 713 FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst] |
707 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | 714 RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst] |
708 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] | 715 FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst] |
709 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] | 716 RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst] |
710 | 717 |
711 del Unique_FW_C2T | 718 del Unique_FW_C2T |
712 del Unique_FW_G2A | 719 del Unique_FW_G2A |
713 del Unique_RC_C2T | 720 del Unique_RC_C2T |
714 del Unique_RC_G2A | 721 del Unique_RC_G2A |
715 | 722 |
716 | 723 |
717 #---------------------------------------------------------------- | 724 #---------------------------------------------------------------- |
718 # Post-filtering reads | 725 # Post-filtering reads |
719 # ---- FW_C2T ---- undirectional | 726 # ---- FW_C2T ---- un-directional |
720 FW_regions = dict() | 727 FW_regions = dict() |
721 gseq = dict() | 728 gseq = dict() |
722 chr_length = dict() | 729 chr_length = dict() |
723 for header in FW_C2T_uniq_lst : | 730 for header in FW_C2T_uniq_lst : |
724 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] | 731 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] |
756 try_pos, FR) | 763 try_pos, FR) |
757 try_pos -= 1 | 764 try_pos -= 1 |
758 try_count += 1 | 765 try_count += 1 |
759 | 766 |
760 N_mismatch = N_MIS(r_aln, g_aln) | 767 N_mismatch = N_MIS(r_aln, g_aln) |
761 if N_mismatch <= int(max_mismatch_no) : | 768 # if N_mismatch <= int(max_mismatch_no) : |
769 mm_no=float(max_mismatch_no) | |
770 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
762 all_mapped_passed += 1 | 771 all_mapped_passed += 1 |
763 methy = methy_seq(r_aln, g_aln + next2bp) | 772 methy = methy_seq(r_aln, g_aln + next2bp) |
764 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 773 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
765 #---XS FILTER---------------- | 774 #---XS FILTER---------------- |
766 XS = 0 | 775 XS = 0 |
774 output_genome = output_genome, | 783 output_genome = output_genome, |
775 rrbs = True, | 784 rrbs = True, |
776 my_region_serial = my_region_serial, | 785 my_region_serial = my_region_serial, |
777 my_region_start = my_region_start, | 786 my_region_start = my_region_start, |
778 my_region_end = my_region_end) | 787 my_region_end = my_region_end) |
788 all_base_mapped += len(original_BS) | |
779 else : | 789 else : |
780 print "[For debug]: reads not in same lengths" | 790 print "[For debug]: reads not in same lengths" |
781 | 791 |
782 | 792 |
783 # ---- RC_C2T ---- undirectional | 793 # ---- RC_C2T ---- un-directional |
784 RC_regions = dict() | 794 RC_regions = dict() |
785 for header in RC_C2T_uniq_lst : | 795 for header in RC_C2T_uniq_lst : |
786 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] | 796 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] |
787 original_BS = original_bs_reads[header] | 797 original_BS = original_bs_reads[header] |
788 if mapped_chr not in RC_regions : | 798 if mapped_chr not in RC_regions : |
819 try_pos, FR) | 829 try_pos, FR) |
820 try_pos += 1 | 830 try_pos += 1 |
821 try_count += 1 | 831 try_count += 1 |
822 | 832 |
823 N_mismatch = N_MIS(r_aln, g_aln) | 833 N_mismatch = N_MIS(r_aln, g_aln) |
824 if N_mismatch <= int(max_mismatch_no) : | 834 # if N_mismatch <= int(max_mismatch_no) : |
835 mm_no=float(max_mismatch_no) | |
836 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
825 all_mapped_passed += 1 | 837 all_mapped_passed += 1 |
826 methy = methy_seq(r_aln, g_aln + next2bp) | 838 methy = methy_seq(r_aln, g_aln + next2bp) |
827 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 839 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
828 #---XS FILTER---------------- | 840 #---XS FILTER---------------- |
829 XS = 0 | 841 XS = 0 |
837 output_genome = output_genome, | 849 output_genome = output_genome, |
838 rrbs = True, | 850 rrbs = True, |
839 my_region_serial = my_region_serial, | 851 my_region_serial = my_region_serial, |
840 my_region_start = my_region_start, | 852 my_region_start = my_region_start, |
841 my_region_end = my_region_end) | 853 my_region_end = my_region_end) |
854 all_base_mapped += len(original_BS) | |
842 | 855 |
843 else : | 856 else : |
844 print "[For debug]: reads not in same lengths" | 857 print "[For debug]: reads not in same lengths" |
845 | 858 |
846 | 859 |
847 # ---- FW_G2A ---- undirectional | 860 # ---- FW_G2A ---- un-directional |
848 FW_regions = dict() | 861 FW_regions = dict() |
849 gseq = dict() | 862 gseq = dict() |
850 chr_length = dict() | 863 chr_length = dict() |
851 for header in FW_G2A_uniq_lst : | 864 for header in FW_G2A_uniq_lst : |
852 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header] | 865 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header] |
889 #if my_region_serial == 0 : | 902 #if my_region_serial == 0 : |
890 # print "[For debug]: chr=", mapped_chr | 903 # print "[For debug]: chr=", mapped_chr |
891 # print "[For debug]: FW_G2A read still can not find fragment serial" | 904 # print "[For debug]: FW_G2A read still can not find fragment serial" |
892 # Tip: sometimes "my_region_serial" is still 0 ... | 905 # Tip: sometimes "my_region_serial" is still 0 ... |
893 | 906 |
894 | |
895 N_mismatch = N_MIS(r_aln, g_aln) | 907 N_mismatch = N_MIS(r_aln, g_aln) |
896 if N_mismatch <= int(max_mismatch_no) : | 908 # if N_mismatch <= int(max_mismatch_no) : |
909 max_mismatch_no=float(max_mismatch_no) | |
910 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
897 all_mapped_passed += 1 | 911 all_mapped_passed += 1 |
898 methy = methy_seq(r_aln, g_aln + next2bp) | 912 methy = methy_seq(r_aln, g_aln + next2bp) |
899 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 913 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
900 #---XS FILTER---------------- | 914 #---XS FILTER---------------- |
901 XS = 0 | 915 XS = 0 |
909 output_genome = output_genome, | 923 output_genome = output_genome, |
910 rrbs = True, | 924 rrbs = True, |
911 my_region_serial = my_region_serial, | 925 my_region_serial = my_region_serial, |
912 my_region_start = my_region_start, | 926 my_region_start = my_region_start, |
913 my_region_end = my_region_end) | 927 my_region_end = my_region_end) |
928 all_base_mapped += len(original_BS) | |
914 else : | 929 else : |
915 print "[For debug]: reads not in same lengths" | 930 print "[For debug]: reads not in same lengths" |
916 | 931 |
917 | 932 |
918 # ---- RC_G2A ---- undirectional | 933 # ---- RC_G2A ---- un-directional |
919 RC_regions = dict() | 934 RC_regions = dict() |
920 for header in RC_G2A_uniq_lst : | 935 for header in RC_G2A_uniq_lst : |
921 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header] | 936 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header] |
922 original_BS = original_bs_reads[header] | 937 original_BS = original_bs_reads[header] |
923 if mapped_chr not in RC_regions : | 938 if mapped_chr not in RC_regions : |
959 | 974 |
960 #if my_region_serial == 0 : | 975 #if my_region_serial == 0 : |
961 # print "[For debug]: chr=", mapped_chr | 976 # print "[For debug]: chr=", mapped_chr |
962 # print "[For debug]: RC_C2A read still cannot find fragment serial" | 977 # print "[For debug]: RC_C2A read still cannot find fragment serial" |
963 | 978 |
964 | |
965 N_mismatch = N_MIS(r_aln, g_aln) | 979 N_mismatch = N_MIS(r_aln, g_aln) |
966 if N_mismatch <= int(max_mismatch_no) : | 980 # if N_mismatch <= int(max_mismatch_no) : |
981 mm_no=float(max_mismatch_no) | |
982 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
967 all_mapped_passed += 1 | 983 all_mapped_passed += 1 |
968 methy = methy_seq(r_aln, g_aln + next2bp) | 984 methy = methy_seq(r_aln, g_aln + next2bp) |
969 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 985 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
970 #---XS FILTER---------------- | 986 #---XS FILTER---------------- |
971 XS = 0 | 987 XS = 0 |
979 output_genome = output_genome, | 995 output_genome = output_genome, |
980 rrbs = True, | 996 rrbs = True, |
981 my_region_serial = my_region_serial, | 997 my_region_serial = my_region_serial, |
982 my_region_start = my_region_start, | 998 my_region_start = my_region_start, |
983 my_region_end = my_region_end) | 999 my_region_end = my_region_end) |
1000 all_base_mapped += len(original_BS) | |
984 else : | 1001 else : |
985 print "[For debug]: reads not in same lengths" | 1002 print "[For debug]: reads not in same lengths" |
986 | |
987 | |
988 | 1003 |
989 # Finished both FW and RC | 1004 # Finished both FW and RC |
990 logm("Done: %s (%d) \n" % (read_file, no_my_files)) | 1005 logm("Done: %s (%d) \n" % (read_file, no_my_files)) |
991 print "--> %s (%d) "%(read_file, no_my_files) | 1006 print "--> %s (%d) "%(read_file, no_my_files) |
992 del original_bs_reads | 1007 del original_bs_reads |
993 delete_files(WC2T, CC2T, WG2A, CG2A) | 1008 delete_files(WC2T, CC2T, WG2A, CG2A) |
994 | |
995 | |
996 | |
997 # End of un-directional library | 1009 # End of un-directional library |
998 | 1010 |
999 delete_files(tmp_path) | 1011 delete_files(tmp_path) |
1000 | 1012 |
1001 | 1013 |
1002 logm("O Number of raw reads: %d "% all_raw_reads) | 1014 logm("Number of raw reads: %d "% all_raw_reads) |
1003 if all_raw_reads >0: | 1015 if all_raw_reads>0: |
1004 logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads)) | 1016 logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads)) |
1005 for kk in range(len(n_cut_tag_lst)): | 1017 for kk in range(len(n_cut_tag_lst)): |
1006 logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads)) | 1018 logm("Number of raw reads with tag %s: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads)) |
1007 logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed) | 1019 if adapter_seq!="" : |
1008 logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | 1020 logm("Number of reads having adapter removed: %d "%all_tagged_trimmed) |
1009 logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped) | 1021 logm("Number of bases in total: %d "%all_base_before_trim) |
1010 | 1022 if adapter_seq!="" : |
1011 logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no)) | 1023 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) ) |
1012 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads)) | 1024 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) ) |
1013 | 1025 logm("Number of unique-hits reads (before post-filtering): %d"%all_mapped) |
1014 if asktag=="Y": # undiretional | 1026 |
1027 logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no)) | |
1028 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads)) | |
1029 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) | |
1030 | |
1031 if asktag=="Y": # un-diretional | |
1015 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) | 1032 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) |
1016 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) ) | 1033 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) ) |
1017 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) | 1034 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) |
1018 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) ) | 1035 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) ) |
1019 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration | 1036 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration |
1020 # according to literal meaning | 1037 # according to literal meaning |
1021 elif asktag=="N": # directional | 1038 elif asktag=="N": # directional |
1022 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) | 1039 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) |
1023 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) | 1040 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) |
1024 | 1041 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) |
1025 n_CG=mC_lst[0]+uC_lst[0] | 1042 |
1026 n_CHG=mC_lst[1]+uC_lst[1] | 1043 n_CG = mC_lst[0] + uC_lst[0] |
1027 n_CHH=mC_lst[2]+uC_lst[2] | 1044 n_CHG = mC_lst[1] + uC_lst[1] |
1045 n_CHH = mC_lst[2] + uC_lst[2] | |
1028 | 1046 |
1029 logm("----------------------------------------------") | 1047 logm("----------------------------------------------") |
1030 logm("M Methylated C in mapped reads ") | 1048 logm("Methylated C in mapped reads ") |
1031 logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0)) | 1049 logm(" mCG %1.3f%%"%( (100 * float(mC_lst[0]) / n_CG) if n_CG!=0 else 0)) |
1032 logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0)) | 1050 logm(" mCHG %1.3f%%"%( (100 * float(mC_lst[1]) / n_CHG) if n_CHG!=0 else 0)) |
1033 logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0)) | 1051 logm(" mCHH %1.3f%%"%( (100 * float(mC_lst[2]) / n_CHH) if n_CHH!=0 else 0)) |
1034 logm("----------------------------------------------") | 1052 logm("----------------------------------------------") |
1035 logm("------------------- END ----------------------") | 1053 logm("------------------- END ----------------------") |
1036 | 1054 |
1037 elapsed(main_read_file) | 1055 elapsed(main_read_file) |
1038 | 1056 |