comparison BSseeker2/bs_align/bs_single_end.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
comparison
equal deleted inserted replaced
0:e6df770c0e58 1:8b26adf64adc
1 import fileinput, os, time, random, math 1 import fileinput, os, time, random, math
2 from bs_utils.utils import * 2 from bs_utils.utils import *
3 from bs_align_utils import * 3 from bs_align_utils import *
4 import gzip
4 5
5 #---------------------------------------------------------------- 6 #----------------------------------------------------------------
6 # Read from the mapped results, return lists of unique / multiple-hit reads 7 # Read from the mapped results, return lists of unique / multiple-hit reads
7 # The function suppose at most 2 hits will be reported in single file 8 # The function suppose at most 2 hits will be reported in single file
8 def extract_mapping(ali_file): 9 def extract_mapping(ali_file):
75 #---------------------------------------------------------------- 76 #----------------------------------------------------------------
76 77
77 78
78 79
79 #---------------------------------------------------------------- 80 #----------------------------------------------------------------
80 logm("Read filename: %s"% main_read_file ) 81 logm("Read filename: %s" % main_read_file)
81 logm("Un-directional library: %s" % asktag ) 82 logm("The first base (for mapping): %d"% cut1 )
82 logm("The first base (for mapping): %d" % cut1) 83 logm("The last base (for mapping): %d"% cut2 )
83 logm("The last base (for mapping): %d" % cut2) 84
84 logm("Max. lines per mapping: %d"% no_small_lines) 85 logm("Path for short reads aligner: %s"% aligner_command + '\n')
85 logm("Aligner: %s" % aligner_command) 86 logm("Reference genome library path: %s"% db_path )
86 logm("Reference genome library path: %s" % db_path ) 87
87 logm("Number of mismatches allowed: %s" % max_mismatch_no ) 88 if asktag == "Y" :
89 logm("Un-directional library" )
90 else :
91 logm("Directional library")
92
93 logm("Number of mismatches allowed: %s"% max_mismatch_no )
94
88 if adapter_file !="": 95 if adapter_file !="":
89 if asktag=="N": 96 logm("Adapter seq: %s" % adapter_fw)
90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n"))) 97 logm("-------------------------------- " )
91 elif asktag=="Y":
92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
94 #---------------------------------------------------------------- 98 #----------------------------------------------------------------
95 99
96 # helper method to join fname with tmp_path 100 # helper method to join fname with tmp_path
97 tmp_d = lambda fname: os.path.join(tmp_path, fname) 101 tmp_d = lambda fname: os.path.join(tmp_path, fname)
98 102
107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) 111 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
108 # if splitted_file.startswith("%s-s-" % input_fname)) 112 # if splitted_file.startswith("%s-s-" % input_fname))
109 113
110 #---- Stats ------------------------------------------------------------ 114 #---- Stats ------------------------------------------------------------
111 all_raw_reads=0 115 all_raw_reads=0
112 all_trimed=0 116 all_trimmed=0
113 all_mapped=0 117 all_mapped=0
114 all_mapped_passed=0 118 all_mapped_passed=0
119 all_base_before_trim=0
120 all_base_after_trim=0
121 all_base_mapped=0
115 122
116 numbers_premapped_lst=[0,0,0,0] 123 numbers_premapped_lst=[0,0,0,0]
117 numbers_mapped_lst=[0,0,0,0] 124 numbers_mapped_lst=[0,0,0,0]
118 125
119 mC_lst=[0,0,0] 126 mC_lst=[0,0,0]
120 uC_lst=[0,0,0] 127 uC_lst=[0,0,0]
121
122 128
123 no_my_files=0 129 no_my_files=0
124 130
125 #---------------------------------------------------------------- 131 #----------------------------------------------------------------
126 logm("== Start mapping ==") 132 logm("== Start mapping ==")
130 original_bs_reads = {} 136 original_bs_reads = {}
131 no_my_files+=1 137 no_my_files+=1
132 random_id = ".tmp-"+str(random.randint(1000000,9999999)) 138 random_id = ".tmp-"+str(random.randint(1000000,9999999))
133 139
134 #------------------------------------------------------------------- 140 #-------------------------------------------------------------------
135 # undirectional sequencing 141 # un-directional sequencing
136 #------------------------------------------------------------------- 142 #-------------------------------------------------------------------
137 if asktag=="Y": 143 if asktag=="Y":
138 144
139 #---------------------------------------------------------------- 145 #----------------------------------------------------------------
140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) 146 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
144 outf3=open(outfile3,'w') 150 outf3=open(outfile3,'w')
145 151
146 #---------------------------------------------------------------- 152 #----------------------------------------------------------------
147 # detect format of input file 153 # detect format of input file
148 try : 154 try :
149 read_inf=open(read_file,"r") 155 if read_file.endswith(".gz") : # support input file ending with ".gz"
156 read_inf = gzip.open(read_file, "rb")
157 else :
158 read_inf=open(read_file,"r")
150 except IOError : 159 except IOError :
151 print "[Error] Cannot open input file : %s" % read_file 160 print "[Error] Cannot open input file : %s" % read_file
152 exit(-1) 161 exit(-1)
153 162
154 oneline=read_inf.readline() 163 oneline = read_inf.readline()
155 l=oneline.split() 164 l = oneline.split()
156 input_format="" 165 input_format = ""
157 if oneline[0]=="@": # fastq 166 if oneline[0]=="@":
158 input_format="fastq" 167 input_format = "fastq"
159 n_fastq=0 168 elif len(l)==1 and oneline[0]!=">":
160 elif len(l)==1 and oneline[0]!=">": # pure sequences 169 input_format = "seq"
161 input_format="seq" 170 elif len(l)==11:
162 elif len(l)==11: # qseq 171 input_format = "qseq"
163 input_format="qseq" 172 elif oneline[0]==">":
164 elif oneline[0]==">": # fasta 173 input_format = "fasta"
165 input_format="fasta"
166 n_fasta=0
167 read_inf.close() 174 read_inf.close()
168 175
169 #---------------------------------------------------------------- 176 #----------------------------------------------------------------
170 # read sequence, remove adapter and convert 177 # read sequence, remove adapter and convert
171 read_id="" 178 read_id = ""
172 seq="" 179 seq = ""
173 seq_ready="N" 180 seq_ready = "N"
174 for line in fileinput.input(read_file): 181 line_no = 0
175 l=line.split() 182 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz
176 183 l = line.split()
184 line_no += 1
177 if input_format=="seq": 185 if input_format=="seq":
178 all_raw_reads+=1 186 all_raw_reads += 1
179 read_id=str(all_raw_reads) 187 read_id = str(all_raw_reads)
180 read_id=read_id.zfill(12) 188 read_id = read_id.zfill(12)
181 seq=l[0] 189 seq = l[0]
182 seq_ready="Y" 190 seq_ready = "Y"
183 elif input_format=="fastq": 191 elif input_format=="fastq":
184 m_fastq=math.fmod(n_fastq,4) 192 l_fastq = math.fmod(line_no, 4)
185 n_fastq+=1 193 if l_fastq == 1 :
186 seq_ready="N" 194 all_raw_reads += 1
187 if m_fastq==0: 195 read_id = l[0][1:]
188 all_raw_reads+=1 196 seq_ready = "N"
189 read_id=str(all_raw_reads) 197 elif l_fastq == 2 :
190 read_id=read_id.zfill(12) 198 seq = l[0]
191 seq="" 199 seq_ready = "Y"
192 elif m_fastq==1: 200 else :
193 seq=l[0] 201 seq = ""
194 seq_ready="Y" 202 seq_ready = "N"
195 else:
196 seq=""
197 elif input_format=="qseq": 203 elif input_format=="qseq":
198 all_raw_reads+=1 204 all_raw_reads += 1
199 read_id=str(all_raw_reads) 205 read_id = str(all_raw_reads)
200 read_id=read_id.zfill(12) 206 read_id = read_id.zfill(12)
201 seq=l[8] 207 seq = l[8]
202 seq_ready="Y" 208 seq_ready = "Y"
203 elif input_format=="fasta": 209 elif input_format=="fasta" :
204 m_fasta=math.fmod(n_fasta,2) 210 l_fasta = math.fmod(line_no,2)
205 n_fasta+=1 211 if l_fasta==1:
206 seq_ready="N" 212 all_raw_reads += 1
207 if m_fasta==0: 213 read_id = l[0][1:]
208 all_raw_reads+=1 214 seq = ""
209 #read_id=str(all_raw_reads) 215 seq_ready = "N"
210 read_id=l[0][1:] 216 elif l_fasta==0 :
211 seq="" 217 seq = l[0]
212 elif m_fasta==1: 218 seq_ready = "Y"
213 seq=l[0]
214 seq_ready="Y"
215 else:
216 seq=""
217 219
218 #---------------------------------------------------------------- 220 #----------------------------------------------------------------
219 if seq_ready=="Y": 221 if seq_ready=="Y":
220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 222 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
221 seq=seq.upper() 223 seq=seq.upper()
222 seq=seq.replace(".","N") 224 seq=seq.replace(".","N")
223 225
224 # striping BS adapter from 3' read 226 # striping BS adapter from 3' read
227 all_base_before_trim += len(seq)
225 if (adapter_fw !="") and (adapter_rc !="") : 228 if (adapter_fw !="") and (adapter_rc !="") :
226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) 229 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
227 new_read = Remove_5end_Adapter(new_read, adapter_rc) 230 new_read = Remove_5end_Adapter(new_read, adapter_rc)
228 if len(new_read) < len(seq) : 231 if len(new_read) < len(seq) :
229 all_trimed += 1 232 all_trimmed += 1
230 seq = new_read 233 seq = new_read
231 234 all_base_after_trim += len(seq)
232 if len(seq)<=4: 235 if len(seq)<=4:
233 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) 236 seq=''.join(["N" for x in xrange(cut2-cut1+1)])
234 237
235 #--------- trimmed_raw_BS_read ------------------ 238 #--------- trimmed_raw_BS_read ------------------
236 original_bs_reads[read_id] = seq 239 original_bs_reads[read_id] = seq
351 RC_G2A_uniq_lst.sort() 354 RC_G2A_uniq_lst.sort()
352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] 355 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] 356 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] 357 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] 358 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
359 #----------------------------------------------------------------
360
361 numbers_premapped_lst[0] += len(Unique_FW_C2T)
362 numbers_premapped_lst[1] += len(Unique_RC_G2A)
363 numbers_premapped_lst[2] += len(Unique_FW_G2A)
364 numbers_premapped_lst[3] += len(Unique_RC_C2T)
356 365
357 del Unique_FW_C2T 366 del Unique_FW_C2T
358 del Unique_FW_G2A 367 del Unique_FW_G2A
359 del Unique_RC_C2T 368 del Unique_RC_C2T
360 del Unique_RC_G2A 369 del Unique_RC_G2A
361
362 #----------------------------------------------------------------
363 numbers_premapped_lst[0] += len(Unique_FW_C2T)
364 numbers_premapped_lst[1] += len(Unique_RC_G2A)
365 numbers_premapped_lst[2] += len(Unique_FW_G2A)
366 numbers_premapped_lst[3] += len(Unique_RC_C2T)
367 370
368 371
369 #---------------------------------------------------------------- 372 #----------------------------------------------------------------
370 373
371 nn=0 374 nn=0
374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), 377 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
375 (RC_G2A_uniq_lst,RC_G2A_U), 378 (RC_G2A_uniq_lst,RC_G2A_U),
376 (FW_G2A_uniq_lst,FW_G2A_U), 379 (FW_G2A_uniq_lst,FW_G2A_U),
377 (RC_C2T_uniq_lst,RC_C2T_U)]: 380 (RC_C2T_uniq_lst,RC_C2T_U)]:
378 nn += 1 381 nn += 1
379 mapped_chr0 = ""
380 382
381 for header in ali_unique_lst: 383 for header in ali_unique_lst:
382 384
383 _, mapped_chr, mapped_location, cigar = ali_dic[header] 385 _, mapped_chr, mapped_location, cigar = ali_dic[header]
384 386
420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) 422 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
421 423
422 424
423 if len(r_aln)==len(g_aln): 425 if len(r_aln)==len(g_aln):
424 N_mismatch = N_MIS(r_aln, g_aln) 426 N_mismatch = N_MIS(r_aln, g_aln)
425 if N_mismatch <= int(max_mismatch_no): 427 # if N_mismatch <= int(max_mismatch_no):
428 mm_no=float(max_mismatch_no)
429 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
426 numbers_mapped_lst[nn-1] += 1 430 numbers_mapped_lst[nn-1] += 1
427 all_mapped_passed += 1 431 all_mapped_passed += 1
428 methy = methy_seq(r_aln, g_aln + next) 432 methy = methy_seq(r_aln, g_aln + next)
429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 433 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
430 434
434 nmCH = methy.count('Y') + methy.count('Z') 438 nmCH = methy.count('Y') + methy.count('Z')
435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : 439 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
436 XS = 1 440 XS = 1
437 441
438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) 442 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
443 all_base_mapped += len(original_BS)
439 444
440 #---------------------------------------------------------------- 445 #----------------------------------------------------------------
441 logm("--> %s (%d) "%(read_file, no_my_files)) 446 logm("--> %s (%d) "%(read_file, no_my_files))
442 delete_files(WC2T, WG2A, CC2T, CG2A) 447 delete_files(WC2T, WG2A, CC2T, CG2A)
443 448
447 # directional sequencing 452 # directional sequencing
448 #-------------------------------------------------------------------- 453 #--------------------------------------------------------------------
449 454
450 if asktag=="N": 455 if asktag=="N":
451 #---------------------------------------------------------------- 456 #----------------------------------------------------------------
452 outfile2=tmp_d('Trimed_C2T.fa'+random_id) 457 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
453 outf2=open(outfile2,'w') 458 outf2=open(outfile2,'w')
454
455 n=0
456 #---------------------------------------------------------------- 459 #----------------------------------------------------------------
457 try : 460 try :
458 read_inf=open(read_file,"r") 461 if read_file.endswith(".gz") : # support input file ending with ".gz"
462 read_inf = gzip.open(read_file, "rb")
463 else :
464 read_inf=open(read_file,"r")
459 except IOError : 465 except IOError :
460 print "[Error] Cannot open input file : %s" % read_file 466 print "[Error] Cannot open input file : %s" % read_file
461 exit(-1) 467 exit(-1)
462 468
463 oneline=read_inf.readline() 469 oneline = read_inf.readline()
464 l=oneline.split() 470 l = oneline.split()
465 input_format="" 471 input_format = ""
466 if oneline[0]=="@": # FastQ 472 if oneline[0]=="@":
467 input_format="fastq" 473 input_format = "fastq"
468 n_fastq=0 474 elif len(l)==1 and oneline[0]!=">":
469 elif len(l)==1 and oneline[0]!=">": # pure sequences 475 input_format = "seq"
470 input_format="seq" 476 elif len(l)==11:
471 elif len(l)==11: # Illumina GAII qseq file 477 input_format = "qseq"
472 input_format="qseq" 478 elif oneline[0]==">":
473 elif oneline[0]==">": # fasta 479 input_format = "fasta"
474 input_format="fasta"
475 n_fasta=0
476 read_inf.close() 480 read_inf.close()
481
477 #print "detected data format: %s"%(input_format) 482 #print "detected data format: %s"%(input_format)
478 #---------------------------------------------------------------- 483 #----------------------------------------------------------------
479 read_id="" 484 read_id=""
480 seq="" 485 seq=""
481 seq_ready="N" 486 seq_ready="N"
482 for line in fileinput.input(read_file): 487 line_no = 0
483 l=line.split() 488 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
489 l = line.split()
490 line_no += 1
484 if input_format=="seq": 491 if input_format=="seq":
485 all_raw_reads+=1 492 all_raw_reads += 1
486 read_id=str(all_raw_reads) 493 read_id = str(all_raw_reads)
487 read_id=read_id.zfill(12) 494 read_id = read_id.zfill(12)
488 seq=l[0] 495 seq = l[0]
489 seq_ready="Y" 496 seq_ready = "Y"
490 elif input_format=="fastq": 497 elif input_format=="fastq":
491 m_fastq=math.fmod(n_fastq,4) 498 l_fastq = math.fmod(line_no, 4)
492 n_fastq+=1 499 if l_fastq == 1 :
493 seq_ready="N" 500 all_raw_reads += 1
494 if m_fastq==0: 501 read_id = l[0][1:]
495 all_raw_reads+=1 502 seq_ready = "N"
496 read_id=str(all_raw_reads) 503 elif l_fastq == 2 :
497 read_id=read_id.zfill(12) 504 seq = l[0]
498 seq="" 505 seq_ready = "Y"
499 elif m_fastq==1: 506 else :
500 seq=l[0] 507 seq = ""
501 seq_ready="Y" 508 seq_ready = "N"
502 else:
503 seq=""
504 elif input_format=="qseq": 509 elif input_format=="qseq":
505 all_raw_reads+=1 510 all_raw_reads += 1
506 read_id=str(all_raw_reads) 511 read_id = str(all_raw_reads)
507 read_id=read_id.zfill(12) 512 read_id = read_id.zfill(12)
508 seq=l[8] 513 seq = l[8]
509 seq_ready="Y" 514 seq_ready = "Y"
510 elif input_format=="fasta": 515 elif input_format=="fasta" :
511 m_fasta=math.fmod(n_fasta,2) 516 l_fasta = math.fmod(line_no,2)
512 n_fasta+=1 517 if l_fasta==1:
513 seq_ready="N" 518 all_raw_reads += 1
514 if m_fasta==0: 519 read_id = l[0][1:]
515 all_raw_reads+=1 520 seq = ""
516 read_id=l[0][1:] 521 seq_ready = "N"
517 seq="" 522 elif l_fasta==0 :
518 elif m_fasta==1: 523 seq = l[0]
519 seq=l[0] 524 seq_ready = "Y"
520 seq_ready="Y"
521 else:
522 seq=""
523
524 #-------------------------------- 525 #--------------------------------
525 if seq_ready=="Y": 526 if seq_ready=="Y":
526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 527 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52
527 seq=seq.upper() 528 seq=seq.upper()
528 seq=seq.replace(".","N") 529 seq=seq.replace(".","N")
529 530
530 #--striping adapter from 3' read ------- 531 #--striping adapter from 3' read -------
532 all_base_before_trim += len(seq)
531 if adapter != "": 533 if adapter != "":
532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) 534 new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
533 if len(new_read) < len(seq) : 535 if len(new_read) < len(seq) :
534 all_trimed += 1 536 all_trimmed += 1
535 seq = new_read 537 seq = new_read
536 538 all_base_after_trim += len(seq)
537 if len(seq)<=4: 539 if len(seq)<=4:
538 seq = "N" * (cut2-cut1+1) 540 seq = "N" * (cut2-cut1+1)
539 541
540 #--------- trimmed_raw_BS_read ------------------ 542 #--------- trimmed_raw_BS_read ------------------
541 original_bs_reads[read_id] = seq 543 original_bs_reads[read_id] = seq
610 outf_MH.write(">%s\n" % i) 612 outf_MH.write(">%s\n" % i)
611 outf_MH.write("%s\n" % original_bs_reads[i]) 613 outf_MH.write("%s\n" % original_bs_reads[i])
612 outf_MH.close() 614 outf_MH.close()
613 615
614 616
615
616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] 617 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] 618 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
618 FW_C2T_uniq_lst.sort() 619 FW_C2T_uniq_lst.sort()
619 RC_C2T_uniq_lst.sort() 620 RC_C2T_uniq_lst.sort()
620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] 621 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
631 nn = 0 632 nn = 0
632 gseq = dict() 633 gseq = dict()
633 chr_length = dict() 634 chr_length = dict()
634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: 635 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
635 nn += 1 636 nn += 1
636 mapped_chr0 = ""
637 for header in ali_unique_lst: 637 for header in ali_unique_lst:
638 _, mapped_chr, mapped_location, cigar = ali_dic[header] 638 _, mapped_chr, mapped_location, cigar = ali_dic[header]
639 original_BS = original_bs_reads[header] 639 original_BS = original_bs_reads[header]
640 #------------------------------------- 640 #-------------------------------------
641 if mapped_chr not in gseq : 641 if mapped_chr not in gseq :
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) 642 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
643 chr_length[mapped_chr] = len(gseq[mapped_chr]) 643 chr_length[mapped_chr] = len(gseq[mapped_chr])
644 #if mapped_chr != mapped_chr0:
645 # my_gseq = deserialize(db_d(mapped_chr))
646 # chr_length = len(my_gseq)
647 # mapped_chr0 = mapped_chr
648 #-------------------------------------
649 644
650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) 645 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
651 646
652 all_mapped+=1 647 all_mapped+=1
653 if nn == 1: # +FW mapped to + strand: 648 if nn == 1: # +FW mapped to + strand:
662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) 657 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) 658 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
664 659
665 if len(r_aln) == len(g_aln): 660 if len(r_aln) == len(g_aln):
666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides 661 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
667 if N_mismatch <= int(max_mismatch_no): 662 mm_no=float(max_mismatch_no)
663 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
668 numbers_mapped_lst[nn-1] += 1 664 numbers_mapped_lst[nn-1] += 1
669 all_mapped_passed += 1 665 all_mapped_passed += 1
670 methy = methy_seq(r_aln, g_aln+next) 666 methy = methy_seq(r_aln, g_aln+next)
671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 667 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
672 668
676 nmCH = methy.count('Y') + methy.count('Z') 672 nmCH = methy.count('Y') + methy.count('Z')
677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : 673 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
678 XS = 1 674 XS = 1
679 675
680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) 676 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
677 all_base_mapped += len(original_BS)
681 678
682 #---------------------------------------------------------------- 679 #----------------------------------------------------------------
683 logm("--> %s (%d) "%(read_file,no_my_files)) 680 logm("--> %s (%d) "%(read_file,no_my_files))
684 delete_files(WC2T, CC2T) 681 delete_files(WC2T, CC2T)
685 682
686 683
687 #---------------------------------------------------------------- 684 #----------------------------------------------------------------
688 685
689 # outf.close()
690 delete_files(tmp_path) 686 delete_files(tmp_path)
691 687
692 logm("Number of raw reads: %d \n"% all_raw_reads) 688 logm("Number of raw reads: %d \n"% all_raw_reads)
693 if all_raw_reads >0: 689 if all_raw_reads >0:
694 logm("Number of reads having adapter removed: %d \n" % all_trimed ) 690 logm("Number of bases in total: %d "%all_base_before_trim)
695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) 691 if adapter != "" :
696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped) 692 logm("Number of reads having adapter removed: %d \n" % all_trimmed )
693 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
694 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
695 logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped)
697 if asktag=="Y": 696 if asktag=="Y":
698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) 697 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) 698 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) 699 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) 700 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) 710 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
712 elif asktag=="N": 711 elif asktag=="N":
713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) 712 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) 713 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) 714 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
715 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
716
716 717
717 n_CG=mC_lst[0]+uC_lst[0] 718 n_CG=mC_lst[0]+uC_lst[0]
718 n_CHG=mC_lst[1]+uC_lst[1] 719 n_CHG=mC_lst[1]+uC_lst[1]
719 n_CHH=mC_lst[2]+uC_lst[2] 720 n_CHH=mC_lst[2]+uC_lst[2]
720 721