Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_single_end.py @ 1:8b26adf64adc draft default tip
V2.0.5
author | weilong-guo |
---|---|
date | Tue, 05 Nov 2013 01:55:39 -0500 |
parents | e6df770c0e58 |
children |
comparison
equal
deleted
inserted
replaced
0:e6df770c0e58 | 1:8b26adf64adc |
---|---|
1 import fileinput, os, time, random, math | 1 import fileinput, os, time, random, math |
2 from bs_utils.utils import * | 2 from bs_utils.utils import * |
3 from bs_align_utils import * | 3 from bs_align_utils import * |
4 import gzip | |
4 | 5 |
5 #---------------------------------------------------------------- | 6 #---------------------------------------------------------------- |
6 # Read from the mapped results, return lists of unique / multiple-hit reads | 7 # Read from the mapped results, return lists of unique / multiple-hit reads |
7 # The function suppose at most 2 hits will be reported in single file | 8 # The function suppose at most 2 hits will be reported in single file |
8 def extract_mapping(ali_file): | 9 def extract_mapping(ali_file): |
75 #---------------------------------------------------------------- | 76 #---------------------------------------------------------------- |
76 | 77 |
77 | 78 |
78 | 79 |
79 #---------------------------------------------------------------- | 80 #---------------------------------------------------------------- |
80 logm("Read filename: %s"% main_read_file ) | 81 logm("Read filename: %s" % main_read_file) |
81 logm("Un-directional library: %s" % asktag ) | 82 logm("The first base (for mapping): %d"% cut1 ) |
82 logm("The first base (for mapping): %d" % cut1) | 83 logm("The last base (for mapping): %d"% cut2 ) |
83 logm("The last base (for mapping): %d" % cut2) | 84 |
84 logm("Max. lines per mapping: %d"% no_small_lines) | 85 logm("Path for short reads aligner: %s"% aligner_command + '\n') |
85 logm("Aligner: %s" % aligner_command) | 86 logm("Reference genome library path: %s"% db_path ) |
86 logm("Reference genome library path: %s" % db_path ) | 87 |
87 logm("Number of mismatches allowed: %s" % max_mismatch_no ) | 88 if asktag == "Y" : |
89 logm("Un-directional library" ) | |
90 else : | |
91 logm("Directional library") | |
92 | |
93 logm("Number of mismatches allowed: %s"% max_mismatch_no ) | |
94 | |
88 if adapter_file !="": | 95 if adapter_file !="": |
89 if asktag=="N": | 96 logm("Adapter seq: %s" % adapter_fw) |
90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n"))) | 97 logm("-------------------------------- " ) |
91 elif asktag=="Y": | |
92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) ) | |
93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) ) | |
94 #---------------------------------------------------------------- | 98 #---------------------------------------------------------------- |
95 | 99 |
96 # helper method to join fname with tmp_path | 100 # helper method to join fname with tmp_path |
97 tmp_d = lambda fname: os.path.join(tmp_path, fname) | 101 tmp_d = lambda fname: os.path.join(tmp_path, fname) |
98 | 102 |
107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) | 111 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) |
108 # if splitted_file.startswith("%s-s-" % input_fname)) | 112 # if splitted_file.startswith("%s-s-" % input_fname)) |
109 | 113 |
110 #---- Stats ------------------------------------------------------------ | 114 #---- Stats ------------------------------------------------------------ |
111 all_raw_reads=0 | 115 all_raw_reads=0 |
112 all_trimed=0 | 116 all_trimmed=0 |
113 all_mapped=0 | 117 all_mapped=0 |
114 all_mapped_passed=0 | 118 all_mapped_passed=0 |
119 all_base_before_trim=0 | |
120 all_base_after_trim=0 | |
121 all_base_mapped=0 | |
115 | 122 |
116 numbers_premapped_lst=[0,0,0,0] | 123 numbers_premapped_lst=[0,0,0,0] |
117 numbers_mapped_lst=[0,0,0,0] | 124 numbers_mapped_lst=[0,0,0,0] |
118 | 125 |
119 mC_lst=[0,0,0] | 126 mC_lst=[0,0,0] |
120 uC_lst=[0,0,0] | 127 uC_lst=[0,0,0] |
121 | |
122 | 128 |
123 no_my_files=0 | 129 no_my_files=0 |
124 | 130 |
125 #---------------------------------------------------------------- | 131 #---------------------------------------------------------------- |
126 logm("== Start mapping ==") | 132 logm("== Start mapping ==") |
130 original_bs_reads = {} | 136 original_bs_reads = {} |
131 no_my_files+=1 | 137 no_my_files+=1 |
132 random_id = ".tmp-"+str(random.randint(1000000,9999999)) | 138 random_id = ".tmp-"+str(random.randint(1000000,9999999)) |
133 | 139 |
134 #------------------------------------------------------------------- | 140 #------------------------------------------------------------------- |
135 # undirectional sequencing | 141 # un-directional sequencing |
136 #------------------------------------------------------------------- | 142 #------------------------------------------------------------------- |
137 if asktag=="Y": | 143 if asktag=="Y": |
138 | 144 |
139 #---------------------------------------------------------------- | 145 #---------------------------------------------------------------- |
140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) | 146 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) |
144 outf3=open(outfile3,'w') | 150 outf3=open(outfile3,'w') |
145 | 151 |
146 #---------------------------------------------------------------- | 152 #---------------------------------------------------------------- |
147 # detect format of input file | 153 # detect format of input file |
148 try : | 154 try : |
149 read_inf=open(read_file,"r") | 155 if read_file.endswith(".gz") : # support input file ending with ".gz" |
156 read_inf = gzip.open(read_file, "rb") | |
157 else : | |
158 read_inf=open(read_file,"r") | |
150 except IOError : | 159 except IOError : |
151 print "[Error] Cannot open input file : %s" % read_file | 160 print "[Error] Cannot open input file : %s" % read_file |
152 exit(-1) | 161 exit(-1) |
153 | 162 |
154 oneline=read_inf.readline() | 163 oneline = read_inf.readline() |
155 l=oneline.split() | 164 l = oneline.split() |
156 input_format="" | 165 input_format = "" |
157 if oneline[0]=="@": # fastq | 166 if oneline[0]=="@": |
158 input_format="fastq" | 167 input_format = "fastq" |
159 n_fastq=0 | 168 elif len(l)==1 and oneline[0]!=">": |
160 elif len(l)==1 and oneline[0]!=">": # pure sequences | 169 input_format = "seq" |
161 input_format="seq" | 170 elif len(l)==11: |
162 elif len(l)==11: # qseq | 171 input_format = "qseq" |
163 input_format="qseq" | 172 elif oneline[0]==">": |
164 elif oneline[0]==">": # fasta | 173 input_format = "fasta" |
165 input_format="fasta" | |
166 n_fasta=0 | |
167 read_inf.close() | 174 read_inf.close() |
168 | 175 |
169 #---------------------------------------------------------------- | 176 #---------------------------------------------------------------- |
170 # read sequence, remove adapter and convert | 177 # read sequence, remove adapter and convert |
171 read_id="" | 178 read_id = "" |
172 seq="" | 179 seq = "" |
173 seq_ready="N" | 180 seq_ready = "N" |
174 for line in fileinput.input(read_file): | 181 line_no = 0 |
175 l=line.split() | 182 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz |
176 | 183 l = line.split() |
184 line_no += 1 | |
177 if input_format=="seq": | 185 if input_format=="seq": |
178 all_raw_reads+=1 | 186 all_raw_reads += 1 |
179 read_id=str(all_raw_reads) | 187 read_id = str(all_raw_reads) |
180 read_id=read_id.zfill(12) | 188 read_id = read_id.zfill(12) |
181 seq=l[0] | 189 seq = l[0] |
182 seq_ready="Y" | 190 seq_ready = "Y" |
183 elif input_format=="fastq": | 191 elif input_format=="fastq": |
184 m_fastq=math.fmod(n_fastq,4) | 192 l_fastq = math.fmod(line_no, 4) |
185 n_fastq+=1 | 193 if l_fastq == 1 : |
186 seq_ready="N" | 194 all_raw_reads += 1 |
187 if m_fastq==0: | 195 read_id = l[0][1:] |
188 all_raw_reads+=1 | 196 seq_ready = "N" |
189 read_id=str(all_raw_reads) | 197 elif l_fastq == 2 : |
190 read_id=read_id.zfill(12) | 198 seq = l[0] |
191 seq="" | 199 seq_ready = "Y" |
192 elif m_fastq==1: | 200 else : |
193 seq=l[0] | 201 seq = "" |
194 seq_ready="Y" | 202 seq_ready = "N" |
195 else: | |
196 seq="" | |
197 elif input_format=="qseq": | 203 elif input_format=="qseq": |
198 all_raw_reads+=1 | 204 all_raw_reads += 1 |
199 read_id=str(all_raw_reads) | 205 read_id = str(all_raw_reads) |
200 read_id=read_id.zfill(12) | 206 read_id = read_id.zfill(12) |
201 seq=l[8] | 207 seq = l[8] |
202 seq_ready="Y" | 208 seq_ready = "Y" |
203 elif input_format=="fasta": | 209 elif input_format=="fasta" : |
204 m_fasta=math.fmod(n_fasta,2) | 210 l_fasta = math.fmod(line_no,2) |
205 n_fasta+=1 | 211 if l_fasta==1: |
206 seq_ready="N" | 212 all_raw_reads += 1 |
207 if m_fasta==0: | 213 read_id = l[0][1:] |
208 all_raw_reads+=1 | 214 seq = "" |
209 #read_id=str(all_raw_reads) | 215 seq_ready = "N" |
210 read_id=l[0][1:] | 216 elif l_fasta==0 : |
211 seq="" | 217 seq = l[0] |
212 elif m_fasta==1: | 218 seq_ready = "Y" |
213 seq=l[0] | |
214 seq_ready="Y" | |
215 else: | |
216 seq="" | |
217 | 219 |
218 #---------------------------------------------------------------- | 220 #---------------------------------------------------------------- |
219 if seq_ready=="Y": | 221 if seq_ready=="Y": |
220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 | 222 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 |
221 seq=seq.upper() | 223 seq=seq.upper() |
222 seq=seq.replace(".","N") | 224 seq=seq.replace(".","N") |
223 | 225 |
224 # striping BS adapter from 3' read | 226 # striping BS adapter from 3' read |
227 all_base_before_trim += len(seq) | |
225 if (adapter_fw !="") and (adapter_rc !="") : | 228 if (adapter_fw !="") and (adapter_rc !="") : |
226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) | 229 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) |
227 new_read = Remove_5end_Adapter(new_read, adapter_rc) | 230 new_read = Remove_5end_Adapter(new_read, adapter_rc) |
228 if len(new_read) < len(seq) : | 231 if len(new_read) < len(seq) : |
229 all_trimed += 1 | 232 all_trimmed += 1 |
230 seq = new_read | 233 seq = new_read |
231 | 234 all_base_after_trim += len(seq) |
232 if len(seq)<=4: | 235 if len(seq)<=4: |
233 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) | 236 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) |
234 | 237 |
235 #--------- trimmed_raw_BS_read ------------------ | 238 #--------- trimmed_raw_BS_read ------------------ |
236 original_bs_reads[read_id] = seq | 239 original_bs_reads[read_id] = seq |
351 RC_G2A_uniq_lst.sort() | 354 RC_G2A_uniq_lst.sort() |
352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | 355 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] |
353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | 356 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] |
354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] | 357 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] |
355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] | 358 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] |
359 #---------------------------------------------------------------- | |
360 | |
361 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
362 numbers_premapped_lst[1] += len(Unique_RC_G2A) | |
363 numbers_premapped_lst[2] += len(Unique_FW_G2A) | |
364 numbers_premapped_lst[3] += len(Unique_RC_C2T) | |
356 | 365 |
357 del Unique_FW_C2T | 366 del Unique_FW_C2T |
358 del Unique_FW_G2A | 367 del Unique_FW_G2A |
359 del Unique_RC_C2T | 368 del Unique_RC_C2T |
360 del Unique_RC_G2A | 369 del Unique_RC_G2A |
361 | |
362 #---------------------------------------------------------------- | |
363 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
364 numbers_premapped_lst[1] += len(Unique_RC_G2A) | |
365 numbers_premapped_lst[2] += len(Unique_FW_G2A) | |
366 numbers_premapped_lst[3] += len(Unique_RC_C2T) | |
367 | 370 |
368 | 371 |
369 #---------------------------------------------------------------- | 372 #---------------------------------------------------------------- |
370 | 373 |
371 nn=0 | 374 nn=0 |
374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), | 377 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), |
375 (RC_G2A_uniq_lst,RC_G2A_U), | 378 (RC_G2A_uniq_lst,RC_G2A_U), |
376 (FW_G2A_uniq_lst,FW_G2A_U), | 379 (FW_G2A_uniq_lst,FW_G2A_U), |
377 (RC_C2T_uniq_lst,RC_C2T_U)]: | 380 (RC_C2T_uniq_lst,RC_C2T_U)]: |
378 nn += 1 | 381 nn += 1 |
379 mapped_chr0 = "" | |
380 | 382 |
381 for header in ali_unique_lst: | 383 for header in ali_unique_lst: |
382 | 384 |
383 _, mapped_chr, mapped_location, cigar = ali_dic[header] | 385 _, mapped_chr, mapped_location, cigar = ali_dic[header] |
384 | 386 |
420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | 422 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) |
421 | 423 |
422 | 424 |
423 if len(r_aln)==len(g_aln): | 425 if len(r_aln)==len(g_aln): |
424 N_mismatch = N_MIS(r_aln, g_aln) | 426 N_mismatch = N_MIS(r_aln, g_aln) |
425 if N_mismatch <= int(max_mismatch_no): | 427 # if N_mismatch <= int(max_mismatch_no): |
428 mm_no=float(max_mismatch_no) | |
429 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
426 numbers_mapped_lst[nn-1] += 1 | 430 numbers_mapped_lst[nn-1] += 1 |
427 all_mapped_passed += 1 | 431 all_mapped_passed += 1 |
428 methy = methy_seq(r_aln, g_aln + next) | 432 methy = methy_seq(r_aln, g_aln + next) |
429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 433 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
430 | 434 |
434 nmCH = methy.count('Y') + methy.count('Z') | 438 nmCH = methy.count('Y') + methy.count('Z') |
435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | 439 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : |
436 XS = 1 | 440 XS = 1 |
437 | 441 |
438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | 442 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) |
443 all_base_mapped += len(original_BS) | |
439 | 444 |
440 #---------------------------------------------------------------- | 445 #---------------------------------------------------------------- |
441 logm("--> %s (%d) "%(read_file, no_my_files)) | 446 logm("--> %s (%d) "%(read_file, no_my_files)) |
442 delete_files(WC2T, WG2A, CC2T, CG2A) | 447 delete_files(WC2T, WG2A, CC2T, CG2A) |
443 | 448 |
447 # directional sequencing | 452 # directional sequencing |
448 #-------------------------------------------------------------------- | 453 #-------------------------------------------------------------------- |
449 | 454 |
450 if asktag=="N": | 455 if asktag=="N": |
451 #---------------------------------------------------------------- | 456 #---------------------------------------------------------------- |
452 outfile2=tmp_d('Trimed_C2T.fa'+random_id) | 457 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) |
453 outf2=open(outfile2,'w') | 458 outf2=open(outfile2,'w') |
454 | |
455 n=0 | |
456 #---------------------------------------------------------------- | 459 #---------------------------------------------------------------- |
457 try : | 460 try : |
458 read_inf=open(read_file,"r") | 461 if read_file.endswith(".gz") : # support input file ending with ".gz" |
462 read_inf = gzip.open(read_file, "rb") | |
463 else : | |
464 read_inf=open(read_file,"r") | |
459 except IOError : | 465 except IOError : |
460 print "[Error] Cannot open input file : %s" % read_file | 466 print "[Error] Cannot open input file : %s" % read_file |
461 exit(-1) | 467 exit(-1) |
462 | 468 |
463 oneline=read_inf.readline() | 469 oneline = read_inf.readline() |
464 l=oneline.split() | 470 l = oneline.split() |
465 input_format="" | 471 input_format = "" |
466 if oneline[0]=="@": # FastQ | 472 if oneline[0]=="@": |
467 input_format="fastq" | 473 input_format = "fastq" |
468 n_fastq=0 | 474 elif len(l)==1 and oneline[0]!=">": |
469 elif len(l)==1 and oneline[0]!=">": # pure sequences | 475 input_format = "seq" |
470 input_format="seq" | 476 elif len(l)==11: |
471 elif len(l)==11: # Illumina GAII qseq file | 477 input_format = "qseq" |
472 input_format="qseq" | 478 elif oneline[0]==">": |
473 elif oneline[0]==">": # fasta | 479 input_format = "fasta" |
474 input_format="fasta" | |
475 n_fasta=0 | |
476 read_inf.close() | 480 read_inf.close() |
481 | |
477 #print "detected data format: %s"%(input_format) | 482 #print "detected data format: %s"%(input_format) |
478 #---------------------------------------------------------------- | 483 #---------------------------------------------------------------- |
479 read_id="" | 484 read_id="" |
480 seq="" | 485 seq="" |
481 seq_ready="N" | 486 seq_ready="N" |
482 for line in fileinput.input(read_file): | 487 line_no = 0 |
483 l=line.split() | 488 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): |
489 l = line.split() | |
490 line_no += 1 | |
484 if input_format=="seq": | 491 if input_format=="seq": |
485 all_raw_reads+=1 | 492 all_raw_reads += 1 |
486 read_id=str(all_raw_reads) | 493 read_id = str(all_raw_reads) |
487 read_id=read_id.zfill(12) | 494 read_id = read_id.zfill(12) |
488 seq=l[0] | 495 seq = l[0] |
489 seq_ready="Y" | 496 seq_ready = "Y" |
490 elif input_format=="fastq": | 497 elif input_format=="fastq": |
491 m_fastq=math.fmod(n_fastq,4) | 498 l_fastq = math.fmod(line_no, 4) |
492 n_fastq+=1 | 499 if l_fastq == 1 : |
493 seq_ready="N" | 500 all_raw_reads += 1 |
494 if m_fastq==0: | 501 read_id = l[0][1:] |
495 all_raw_reads+=1 | 502 seq_ready = "N" |
496 read_id=str(all_raw_reads) | 503 elif l_fastq == 2 : |
497 read_id=read_id.zfill(12) | 504 seq = l[0] |
498 seq="" | 505 seq_ready = "Y" |
499 elif m_fastq==1: | 506 else : |
500 seq=l[0] | 507 seq = "" |
501 seq_ready="Y" | 508 seq_ready = "N" |
502 else: | |
503 seq="" | |
504 elif input_format=="qseq": | 509 elif input_format=="qseq": |
505 all_raw_reads+=1 | 510 all_raw_reads += 1 |
506 read_id=str(all_raw_reads) | 511 read_id = str(all_raw_reads) |
507 read_id=read_id.zfill(12) | 512 read_id = read_id.zfill(12) |
508 seq=l[8] | 513 seq = l[8] |
509 seq_ready="Y" | 514 seq_ready = "Y" |
510 elif input_format=="fasta": | 515 elif input_format=="fasta" : |
511 m_fasta=math.fmod(n_fasta,2) | 516 l_fasta = math.fmod(line_no,2) |
512 n_fasta+=1 | 517 if l_fasta==1: |
513 seq_ready="N" | 518 all_raw_reads += 1 |
514 if m_fasta==0: | 519 read_id = l[0][1:] |
515 all_raw_reads+=1 | 520 seq = "" |
516 read_id=l[0][1:] | 521 seq_ready = "N" |
517 seq="" | 522 elif l_fasta==0 : |
518 elif m_fasta==1: | 523 seq = l[0] |
519 seq=l[0] | 524 seq_ready = "Y" |
520 seq_ready="Y" | |
521 else: | |
522 seq="" | |
523 | |
524 #-------------------------------- | 525 #-------------------------------- |
525 if seq_ready=="Y": | 526 if seq_ready=="Y": |
526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 | 527 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 |
527 seq=seq.upper() | 528 seq=seq.upper() |
528 seq=seq.replace(".","N") | 529 seq=seq.replace(".","N") |
529 | 530 |
530 #--striping adapter from 3' read ------- | 531 #--striping adapter from 3' read ------- |
532 all_base_before_trim += len(seq) | |
531 if adapter != "": | 533 if adapter != "": |
532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) | 534 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) |
533 if len(new_read) < len(seq) : | 535 if len(new_read) < len(seq) : |
534 all_trimed += 1 | 536 all_trimmed += 1 |
535 seq = new_read | 537 seq = new_read |
536 | 538 all_base_after_trim += len(seq) |
537 if len(seq)<=4: | 539 if len(seq)<=4: |
538 seq = "N" * (cut2-cut1+1) | 540 seq = "N" * (cut2-cut1+1) |
539 | 541 |
540 #--------- trimmed_raw_BS_read ------------------ | 542 #--------- trimmed_raw_BS_read ------------------ |
541 original_bs_reads[read_id] = seq | 543 original_bs_reads[read_id] = seq |
610 outf_MH.write(">%s\n" % i) | 612 outf_MH.write(">%s\n" % i) |
611 outf_MH.write("%s\n" % original_bs_reads[i]) | 613 outf_MH.write("%s\n" % original_bs_reads[i]) |
612 outf_MH.close() | 614 outf_MH.close() |
613 | 615 |
614 | 616 |
615 | |
616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | 617 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] |
617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | 618 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] |
618 FW_C2T_uniq_lst.sort() | 619 FW_C2T_uniq_lst.sort() |
619 RC_C2T_uniq_lst.sort() | 620 RC_C2T_uniq_lst.sort() |
620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | 621 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] |
631 nn = 0 | 632 nn = 0 |
632 gseq = dict() | 633 gseq = dict() |
633 chr_length = dict() | 634 chr_length = dict() |
634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: | 635 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: |
635 nn += 1 | 636 nn += 1 |
636 mapped_chr0 = "" | |
637 for header in ali_unique_lst: | 637 for header in ali_unique_lst: |
638 _, mapped_chr, mapped_location, cigar = ali_dic[header] | 638 _, mapped_chr, mapped_location, cigar = ali_dic[header] |
639 original_BS = original_bs_reads[header] | 639 original_BS = original_bs_reads[header] |
640 #------------------------------------- | 640 #------------------------------------- |
641 if mapped_chr not in gseq : | 641 if mapped_chr not in gseq : |
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | 642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) |
643 chr_length[mapped_chr] = len(gseq[mapped_chr]) | 643 chr_length[mapped_chr] = len(gseq[mapped_chr]) |
644 #if mapped_chr != mapped_chr0: | |
645 # my_gseq = deserialize(db_d(mapped_chr)) | |
646 # chr_length = len(my_gseq) | |
647 # mapped_chr0 = mapped_chr | |
648 #------------------------------------- | |
649 | 644 |
650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | 645 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) |
651 | 646 |
652 all_mapped+=1 | 647 all_mapped+=1 |
653 if nn == 1: # +FW mapped to + strand: | 648 if nn == 1: # +FW mapped to + strand: |
662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) | 657 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) |
663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | 658 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) |
664 | 659 |
665 if len(r_aln) == len(g_aln): | 660 if len(r_aln) == len(g_aln): |
666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides | 661 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides |
667 if N_mismatch <= int(max_mismatch_no): | 662 mm_no=float(max_mismatch_no) |
663 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
668 numbers_mapped_lst[nn-1] += 1 | 664 numbers_mapped_lst[nn-1] += 1 |
669 all_mapped_passed += 1 | 665 all_mapped_passed += 1 |
670 methy = methy_seq(r_aln, g_aln+next) | 666 methy = methy_seq(r_aln, g_aln+next) |
671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 667 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
672 | 668 |
676 nmCH = methy.count('Y') + methy.count('Z') | 672 nmCH = methy.count('Y') + methy.count('Z') |
677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | 673 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : |
678 XS = 1 | 674 XS = 1 |
679 | 675 |
680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | 676 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) |
677 all_base_mapped += len(original_BS) | |
681 | 678 |
682 #---------------------------------------------------------------- | 679 #---------------------------------------------------------------- |
683 logm("--> %s (%d) "%(read_file,no_my_files)) | 680 logm("--> %s (%d) "%(read_file,no_my_files)) |
684 delete_files(WC2T, CC2T) | 681 delete_files(WC2T, CC2T) |
685 | 682 |
686 | 683 |
687 #---------------------------------------------------------------- | 684 #---------------------------------------------------------------- |
688 | 685 |
689 # outf.close() | |
690 delete_files(tmp_path) | 686 delete_files(tmp_path) |
691 | 687 |
692 logm("Number of raw reads: %d \n"% all_raw_reads) | 688 logm("Number of raw reads: %d \n"% all_raw_reads) |
693 if all_raw_reads >0: | 689 if all_raw_reads >0: |
694 logm("Number of reads having adapter removed: %d \n" % all_trimed ) | 690 logm("Number of bases in total: %d "%all_base_before_trim) |
695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | 691 if adapter != "" : |
696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped) | 692 logm("Number of reads having adapter removed: %d \n" % all_trimmed ) |
693 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) ) | |
694 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | |
695 logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped) | |
697 if asktag=="Y": | 696 if asktag=="Y": |
698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | 697 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) |
699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | 698 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) |
700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) | 699 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) |
701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) | 700 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) |
711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) | 710 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) |
712 elif asktag=="N": | 711 elif asktag=="N": |
713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) | 712 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) |
714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) | 713 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) |
715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) | 714 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) |
715 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) | |
716 | |
716 | 717 |
717 n_CG=mC_lst[0]+uC_lst[0] | 718 n_CG=mC_lst[0]+uC_lst[0] |
718 n_CHG=mC_lst[1]+uC_lst[1] | 719 n_CHG=mC_lst[1]+uC_lst[1] |
719 n_CHH=mC_lst[2]+uC_lst[2] | 720 n_CHH=mC_lst[2]+uC_lst[2] |
720 | 721 |