Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_single_end.py @ 1:8b26adf64adc draft default tip
V2.0.5
| author | weilong-guo |
|---|---|
| date | Tue, 05 Nov 2013 01:55:39 -0500 |
| parents | e6df770c0e58 |
| children |
comparison
equal
deleted
inserted
replaced
| 0:e6df770c0e58 | 1:8b26adf64adc |
|---|---|
| 1 import fileinput, os, time, random, math | 1 import fileinput, os, time, random, math |
| 2 from bs_utils.utils import * | 2 from bs_utils.utils import * |
| 3 from bs_align_utils import * | 3 from bs_align_utils import * |
| 4 import gzip | |
| 4 | 5 |
| 5 #---------------------------------------------------------------- | 6 #---------------------------------------------------------------- |
| 6 # Read from the mapped results, return lists of unique / multiple-hit reads | 7 # Read from the mapped results, return lists of unique / multiple-hit reads |
| 7 # The function suppose at most 2 hits will be reported in single file | 8 # The function suppose at most 2 hits will be reported in single file |
| 8 def extract_mapping(ali_file): | 9 def extract_mapping(ali_file): |
| 75 #---------------------------------------------------------------- | 76 #---------------------------------------------------------------- |
| 76 | 77 |
| 77 | 78 |
| 78 | 79 |
| 79 #---------------------------------------------------------------- | 80 #---------------------------------------------------------------- |
| 80 logm("Read filename: %s"% main_read_file ) | 81 logm("Read filename: %s" % main_read_file) |
| 81 logm("Un-directional library: %s" % asktag ) | 82 logm("The first base (for mapping): %d"% cut1 ) |
| 82 logm("The first base (for mapping): %d" % cut1) | 83 logm("The last base (for mapping): %d"% cut2 ) |
| 83 logm("The last base (for mapping): %d" % cut2) | 84 |
| 84 logm("Max. lines per mapping: %d"% no_small_lines) | 85 logm("Path for short reads aligner: %s"% aligner_command + '\n') |
| 85 logm("Aligner: %s" % aligner_command) | 86 logm("Reference genome library path: %s"% db_path ) |
| 86 logm("Reference genome library path: %s" % db_path ) | 87 |
| 87 logm("Number of mismatches allowed: %s" % max_mismatch_no ) | 88 if asktag == "Y" : |
| 89 logm("Un-directional library" ) | |
| 90 else : | |
| 91 logm("Directional library") | |
| 92 | |
| 93 logm("Number of mismatches allowed: %s"% max_mismatch_no ) | |
| 94 | |
| 88 if adapter_file !="": | 95 if adapter_file !="": |
| 89 if asktag=="N": | 96 logm("Adapter seq: %s" % adapter_fw) |
| 90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n"))) | 97 logm("-------------------------------- " ) |
| 91 elif asktag=="Y": | |
| 92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) ) | |
| 93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) ) | |
| 94 #---------------------------------------------------------------- | 98 #---------------------------------------------------------------- |
| 95 | 99 |
| 96 # helper method to join fname with tmp_path | 100 # helper method to join fname with tmp_path |
| 97 tmp_d = lambda fname: os.path.join(tmp_path, fname) | 101 tmp_d = lambda fname: os.path.join(tmp_path, fname) |
| 98 | 102 |
| 107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) | 111 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) |
| 108 # if splitted_file.startswith("%s-s-" % input_fname)) | 112 # if splitted_file.startswith("%s-s-" % input_fname)) |
| 109 | 113 |
| 110 #---- Stats ------------------------------------------------------------ | 114 #---- Stats ------------------------------------------------------------ |
| 111 all_raw_reads=0 | 115 all_raw_reads=0 |
| 112 all_trimed=0 | 116 all_trimmed=0 |
| 113 all_mapped=0 | 117 all_mapped=0 |
| 114 all_mapped_passed=0 | 118 all_mapped_passed=0 |
| 119 all_base_before_trim=0 | |
| 120 all_base_after_trim=0 | |
| 121 all_base_mapped=0 | |
| 115 | 122 |
| 116 numbers_premapped_lst=[0,0,0,0] | 123 numbers_premapped_lst=[0,0,0,0] |
| 117 numbers_mapped_lst=[0,0,0,0] | 124 numbers_mapped_lst=[0,0,0,0] |
| 118 | 125 |
| 119 mC_lst=[0,0,0] | 126 mC_lst=[0,0,0] |
| 120 uC_lst=[0,0,0] | 127 uC_lst=[0,0,0] |
| 121 | |
| 122 | 128 |
| 123 no_my_files=0 | 129 no_my_files=0 |
| 124 | 130 |
| 125 #---------------------------------------------------------------- | 131 #---------------------------------------------------------------- |
| 126 logm("== Start mapping ==") | 132 logm("== Start mapping ==") |
| 130 original_bs_reads = {} | 136 original_bs_reads = {} |
| 131 no_my_files+=1 | 137 no_my_files+=1 |
| 132 random_id = ".tmp-"+str(random.randint(1000000,9999999)) | 138 random_id = ".tmp-"+str(random.randint(1000000,9999999)) |
| 133 | 139 |
| 134 #------------------------------------------------------------------- | 140 #------------------------------------------------------------------- |
| 135 # undirectional sequencing | 141 # un-directional sequencing |
| 136 #------------------------------------------------------------------- | 142 #------------------------------------------------------------------- |
| 137 if asktag=="Y": | 143 if asktag=="Y": |
| 138 | 144 |
| 139 #---------------------------------------------------------------- | 145 #---------------------------------------------------------------- |
| 140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) | 146 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) |
| 144 outf3=open(outfile3,'w') | 150 outf3=open(outfile3,'w') |
| 145 | 151 |
| 146 #---------------------------------------------------------------- | 152 #---------------------------------------------------------------- |
| 147 # detect format of input file | 153 # detect format of input file |
| 148 try : | 154 try : |
| 149 read_inf=open(read_file,"r") | 155 if read_file.endswith(".gz") : # support input file ending with ".gz" |
| 156 read_inf = gzip.open(read_file, "rb") | |
| 157 else : | |
| 158 read_inf=open(read_file,"r") | |
| 150 except IOError : | 159 except IOError : |
| 151 print "[Error] Cannot open input file : %s" % read_file | 160 print "[Error] Cannot open input file : %s" % read_file |
| 152 exit(-1) | 161 exit(-1) |
| 153 | 162 |
| 154 oneline=read_inf.readline() | 163 oneline = read_inf.readline() |
| 155 l=oneline.split() | 164 l = oneline.split() |
| 156 input_format="" | 165 input_format = "" |
| 157 if oneline[0]=="@": # fastq | 166 if oneline[0]=="@": |
| 158 input_format="fastq" | 167 input_format = "fastq" |
| 159 n_fastq=0 | 168 elif len(l)==1 and oneline[0]!=">": |
| 160 elif len(l)==1 and oneline[0]!=">": # pure sequences | 169 input_format = "seq" |
| 161 input_format="seq" | 170 elif len(l)==11: |
| 162 elif len(l)==11: # qseq | 171 input_format = "qseq" |
| 163 input_format="qseq" | 172 elif oneline[0]==">": |
| 164 elif oneline[0]==">": # fasta | 173 input_format = "fasta" |
| 165 input_format="fasta" | |
| 166 n_fasta=0 | |
| 167 read_inf.close() | 174 read_inf.close() |
| 168 | 175 |
| 169 #---------------------------------------------------------------- | 176 #---------------------------------------------------------------- |
| 170 # read sequence, remove adapter and convert | 177 # read sequence, remove adapter and convert |
| 171 read_id="" | 178 read_id = "" |
| 172 seq="" | 179 seq = "" |
| 173 seq_ready="N" | 180 seq_ready = "N" |
| 174 for line in fileinput.input(read_file): | 181 line_no = 0 |
| 175 l=line.split() | 182 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz |
| 176 | 183 l = line.split() |
| 184 line_no += 1 | |
| 177 if input_format=="seq": | 185 if input_format=="seq": |
| 178 all_raw_reads+=1 | 186 all_raw_reads += 1 |
| 179 read_id=str(all_raw_reads) | 187 read_id = str(all_raw_reads) |
| 180 read_id=read_id.zfill(12) | 188 read_id = read_id.zfill(12) |
| 181 seq=l[0] | 189 seq = l[0] |
| 182 seq_ready="Y" | 190 seq_ready = "Y" |
| 183 elif input_format=="fastq": | 191 elif input_format=="fastq": |
| 184 m_fastq=math.fmod(n_fastq,4) | 192 l_fastq = math.fmod(line_no, 4) |
| 185 n_fastq+=1 | 193 if l_fastq == 1 : |
| 186 seq_ready="N" | 194 all_raw_reads += 1 |
| 187 if m_fastq==0: | 195 read_id = l[0][1:] |
| 188 all_raw_reads+=1 | 196 seq_ready = "N" |
| 189 read_id=str(all_raw_reads) | 197 elif l_fastq == 2 : |
| 190 read_id=read_id.zfill(12) | 198 seq = l[0] |
| 191 seq="" | 199 seq_ready = "Y" |
| 192 elif m_fastq==1: | 200 else : |
| 193 seq=l[0] | 201 seq = "" |
| 194 seq_ready="Y" | 202 seq_ready = "N" |
| 195 else: | |
| 196 seq="" | |
| 197 elif input_format=="qseq": | 203 elif input_format=="qseq": |
| 198 all_raw_reads+=1 | 204 all_raw_reads += 1 |
| 199 read_id=str(all_raw_reads) | 205 read_id = str(all_raw_reads) |
| 200 read_id=read_id.zfill(12) | 206 read_id = read_id.zfill(12) |
| 201 seq=l[8] | 207 seq = l[8] |
| 202 seq_ready="Y" | 208 seq_ready = "Y" |
| 203 elif input_format=="fasta": | 209 elif input_format=="fasta" : |
| 204 m_fasta=math.fmod(n_fasta,2) | 210 l_fasta = math.fmod(line_no,2) |
| 205 n_fasta+=1 | 211 if l_fasta==1: |
| 206 seq_ready="N" | 212 all_raw_reads += 1 |
| 207 if m_fasta==0: | 213 read_id = l[0][1:] |
| 208 all_raw_reads+=1 | 214 seq = "" |
| 209 #read_id=str(all_raw_reads) | 215 seq_ready = "N" |
| 210 read_id=l[0][1:] | 216 elif l_fasta==0 : |
| 211 seq="" | 217 seq = l[0] |
| 212 elif m_fasta==1: | 218 seq_ready = "Y" |
| 213 seq=l[0] | |
| 214 seq_ready="Y" | |
| 215 else: | |
| 216 seq="" | |
| 217 | 219 |
| 218 #---------------------------------------------------------------- | 220 #---------------------------------------------------------------- |
| 219 if seq_ready=="Y": | 221 if seq_ready=="Y": |
| 220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 | 222 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 |
| 221 seq=seq.upper() | 223 seq=seq.upper() |
| 222 seq=seq.replace(".","N") | 224 seq=seq.replace(".","N") |
| 223 | 225 |
| 224 # striping BS adapter from 3' read | 226 # striping BS adapter from 3' read |
| 227 all_base_before_trim += len(seq) | |
| 225 if (adapter_fw !="") and (adapter_rc !="") : | 228 if (adapter_fw !="") and (adapter_rc !="") : |
| 226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) | 229 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) |
| 227 new_read = Remove_5end_Adapter(new_read, adapter_rc) | 230 new_read = Remove_5end_Adapter(new_read, adapter_rc) |
| 228 if len(new_read) < len(seq) : | 231 if len(new_read) < len(seq) : |
| 229 all_trimed += 1 | 232 all_trimmed += 1 |
| 230 seq = new_read | 233 seq = new_read |
| 231 | 234 all_base_after_trim += len(seq) |
| 232 if len(seq)<=4: | 235 if len(seq)<=4: |
| 233 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) | 236 seq=''.join(["N" for x in xrange(cut2-cut1+1)]) |
| 234 | 237 |
| 235 #--------- trimmed_raw_BS_read ------------------ | 238 #--------- trimmed_raw_BS_read ------------------ |
| 236 original_bs_reads[read_id] = seq | 239 original_bs_reads[read_id] = seq |
| 351 RC_G2A_uniq_lst.sort() | 354 RC_G2A_uniq_lst.sort() |
| 352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | 355 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] |
| 353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | 356 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] |
| 354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] | 357 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] |
| 355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] | 358 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] |
| 359 #---------------------------------------------------------------- | |
| 360 | |
| 361 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
| 362 numbers_premapped_lst[1] += len(Unique_RC_G2A) | |
| 363 numbers_premapped_lst[2] += len(Unique_FW_G2A) | |
| 364 numbers_premapped_lst[3] += len(Unique_RC_C2T) | |
| 356 | 365 |
| 357 del Unique_FW_C2T | 366 del Unique_FW_C2T |
| 358 del Unique_FW_G2A | 367 del Unique_FW_G2A |
| 359 del Unique_RC_C2T | 368 del Unique_RC_C2T |
| 360 del Unique_RC_G2A | 369 del Unique_RC_G2A |
| 361 | |
| 362 #---------------------------------------------------------------- | |
| 363 numbers_premapped_lst[0] += len(Unique_FW_C2T) | |
| 364 numbers_premapped_lst[1] += len(Unique_RC_G2A) | |
| 365 numbers_premapped_lst[2] += len(Unique_FW_G2A) | |
| 366 numbers_premapped_lst[3] += len(Unique_RC_C2T) | |
| 367 | 370 |
| 368 | 371 |
| 369 #---------------------------------------------------------------- | 372 #---------------------------------------------------------------- |
| 370 | 373 |
| 371 nn=0 | 374 nn=0 |
| 374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), | 377 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), |
| 375 (RC_G2A_uniq_lst,RC_G2A_U), | 378 (RC_G2A_uniq_lst,RC_G2A_U), |
| 376 (FW_G2A_uniq_lst,FW_G2A_U), | 379 (FW_G2A_uniq_lst,FW_G2A_U), |
| 377 (RC_C2T_uniq_lst,RC_C2T_U)]: | 380 (RC_C2T_uniq_lst,RC_C2T_U)]: |
| 378 nn += 1 | 381 nn += 1 |
| 379 mapped_chr0 = "" | |
| 380 | 382 |
| 381 for header in ali_unique_lst: | 383 for header in ali_unique_lst: |
| 382 | 384 |
| 383 _, mapped_chr, mapped_location, cigar = ali_dic[header] | 385 _, mapped_chr, mapped_location, cigar = ali_dic[header] |
| 384 | 386 |
| 420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | 422 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) |
| 421 | 423 |
| 422 | 424 |
| 423 if len(r_aln)==len(g_aln): | 425 if len(r_aln)==len(g_aln): |
| 424 N_mismatch = N_MIS(r_aln, g_aln) | 426 N_mismatch = N_MIS(r_aln, g_aln) |
| 425 if N_mismatch <= int(max_mismatch_no): | 427 # if N_mismatch <= int(max_mismatch_no): |
| 428 mm_no=float(max_mismatch_no) | |
| 429 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
| 426 numbers_mapped_lst[nn-1] += 1 | 430 numbers_mapped_lst[nn-1] += 1 |
| 427 all_mapped_passed += 1 | 431 all_mapped_passed += 1 |
| 428 methy = methy_seq(r_aln, g_aln + next) | 432 methy = methy_seq(r_aln, g_aln + next) |
| 429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 433 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
| 430 | 434 |
| 434 nmCH = methy.count('Y') + methy.count('Z') | 438 nmCH = methy.count('Y') + methy.count('Z') |
| 435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | 439 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : |
| 436 XS = 1 | 440 XS = 1 |
| 437 | 441 |
| 438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | 442 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) |
| 443 all_base_mapped += len(original_BS) | |
| 439 | 444 |
| 440 #---------------------------------------------------------------- | 445 #---------------------------------------------------------------- |
| 441 logm("--> %s (%d) "%(read_file, no_my_files)) | 446 logm("--> %s (%d) "%(read_file, no_my_files)) |
| 442 delete_files(WC2T, WG2A, CC2T, CG2A) | 447 delete_files(WC2T, WG2A, CC2T, CG2A) |
| 443 | 448 |
| 447 # directional sequencing | 452 # directional sequencing |
| 448 #-------------------------------------------------------------------- | 453 #-------------------------------------------------------------------- |
| 449 | 454 |
| 450 if asktag=="N": | 455 if asktag=="N": |
| 451 #---------------------------------------------------------------- | 456 #---------------------------------------------------------------- |
| 452 outfile2=tmp_d('Trimed_C2T.fa'+random_id) | 457 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) |
| 453 outf2=open(outfile2,'w') | 458 outf2=open(outfile2,'w') |
| 454 | |
| 455 n=0 | |
| 456 #---------------------------------------------------------------- | 459 #---------------------------------------------------------------- |
| 457 try : | 460 try : |
| 458 read_inf=open(read_file,"r") | 461 if read_file.endswith(".gz") : # support input file ending with ".gz" |
| 462 read_inf = gzip.open(read_file, "rb") | |
| 463 else : | |
| 464 read_inf=open(read_file,"r") | |
| 459 except IOError : | 465 except IOError : |
| 460 print "[Error] Cannot open input file : %s" % read_file | 466 print "[Error] Cannot open input file : %s" % read_file |
| 461 exit(-1) | 467 exit(-1) |
| 462 | 468 |
| 463 oneline=read_inf.readline() | 469 oneline = read_inf.readline() |
| 464 l=oneline.split() | 470 l = oneline.split() |
| 465 input_format="" | 471 input_format = "" |
| 466 if oneline[0]=="@": # FastQ | 472 if oneline[0]=="@": |
| 467 input_format="fastq" | 473 input_format = "fastq" |
| 468 n_fastq=0 | 474 elif len(l)==1 and oneline[0]!=">": |
| 469 elif len(l)==1 and oneline[0]!=">": # pure sequences | 475 input_format = "seq" |
| 470 input_format="seq" | 476 elif len(l)==11: |
| 471 elif len(l)==11: # Illumina GAII qseq file | 477 input_format = "qseq" |
| 472 input_format="qseq" | 478 elif oneline[0]==">": |
| 473 elif oneline[0]==">": # fasta | 479 input_format = "fasta" |
| 474 input_format="fasta" | |
| 475 n_fasta=0 | |
| 476 read_inf.close() | 480 read_inf.close() |
| 481 | |
| 477 #print "detected data format: %s"%(input_format) | 482 #print "detected data format: %s"%(input_format) |
| 478 #---------------------------------------------------------------- | 483 #---------------------------------------------------------------- |
| 479 read_id="" | 484 read_id="" |
| 480 seq="" | 485 seq="" |
| 481 seq_ready="N" | 486 seq_ready="N" |
| 482 for line in fileinput.input(read_file): | 487 line_no = 0 |
| 483 l=line.split() | 488 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): |
| 489 l = line.split() | |
| 490 line_no += 1 | |
| 484 if input_format=="seq": | 491 if input_format=="seq": |
| 485 all_raw_reads+=1 | 492 all_raw_reads += 1 |
| 486 read_id=str(all_raw_reads) | 493 read_id = str(all_raw_reads) |
| 487 read_id=read_id.zfill(12) | 494 read_id = read_id.zfill(12) |
| 488 seq=l[0] | 495 seq = l[0] |
| 489 seq_ready="Y" | 496 seq_ready = "Y" |
| 490 elif input_format=="fastq": | 497 elif input_format=="fastq": |
| 491 m_fastq=math.fmod(n_fastq,4) | 498 l_fastq = math.fmod(line_no, 4) |
| 492 n_fastq+=1 | 499 if l_fastq == 1 : |
| 493 seq_ready="N" | 500 all_raw_reads += 1 |
| 494 if m_fastq==0: | 501 read_id = l[0][1:] |
| 495 all_raw_reads+=1 | 502 seq_ready = "N" |
| 496 read_id=str(all_raw_reads) | 503 elif l_fastq == 2 : |
| 497 read_id=read_id.zfill(12) | 504 seq = l[0] |
| 498 seq="" | 505 seq_ready = "Y" |
| 499 elif m_fastq==1: | 506 else : |
| 500 seq=l[0] | 507 seq = "" |
| 501 seq_ready="Y" | 508 seq_ready = "N" |
| 502 else: | |
| 503 seq="" | |
| 504 elif input_format=="qseq": | 509 elif input_format=="qseq": |
| 505 all_raw_reads+=1 | 510 all_raw_reads += 1 |
| 506 read_id=str(all_raw_reads) | 511 read_id = str(all_raw_reads) |
| 507 read_id=read_id.zfill(12) | 512 read_id = read_id.zfill(12) |
| 508 seq=l[8] | 513 seq = l[8] |
| 509 seq_ready="Y" | 514 seq_ready = "Y" |
| 510 elif input_format=="fasta": | 515 elif input_format=="fasta" : |
| 511 m_fasta=math.fmod(n_fasta,2) | 516 l_fasta = math.fmod(line_no,2) |
| 512 n_fasta+=1 | 517 if l_fasta==1: |
| 513 seq_ready="N" | 518 all_raw_reads += 1 |
| 514 if m_fasta==0: | 519 read_id = l[0][1:] |
| 515 all_raw_reads+=1 | 520 seq = "" |
| 516 read_id=l[0][1:] | 521 seq_ready = "N" |
| 517 seq="" | 522 elif l_fasta==0 : |
| 518 elif m_fasta==1: | 523 seq = l[0] |
| 519 seq=l[0] | 524 seq_ready = "Y" |
| 520 seq_ready="Y" | |
| 521 else: | |
| 522 seq="" | |
| 523 | |
| 524 #-------------------------------- | 525 #-------------------------------- |
| 525 if seq_ready=="Y": | 526 if seq_ready=="Y": |
| 526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 | 527 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 |
| 527 seq=seq.upper() | 528 seq=seq.upper() |
| 528 seq=seq.replace(".","N") | 529 seq=seq.replace(".","N") |
| 529 | 530 |
| 530 #--striping adapter from 3' read ------- | 531 #--striping adapter from 3' read ------- |
| 532 all_base_before_trim += len(seq) | |
| 531 if adapter != "": | 533 if adapter != "": |
| 532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) | 534 new_read = RemoveAdapter(seq, adapter, adapter_mismatch) |
| 533 if len(new_read) < len(seq) : | 535 if len(new_read) < len(seq) : |
| 534 all_trimed += 1 | 536 all_trimmed += 1 |
| 535 seq = new_read | 537 seq = new_read |
| 536 | 538 all_base_after_trim += len(seq) |
| 537 if len(seq)<=4: | 539 if len(seq)<=4: |
| 538 seq = "N" * (cut2-cut1+1) | 540 seq = "N" * (cut2-cut1+1) |
| 539 | 541 |
| 540 #--------- trimmed_raw_BS_read ------------------ | 542 #--------- trimmed_raw_BS_read ------------------ |
| 541 original_bs_reads[read_id] = seq | 543 original_bs_reads[read_id] = seq |
| 610 outf_MH.write(">%s\n" % i) | 612 outf_MH.write(">%s\n" % i) |
| 611 outf_MH.write("%s\n" % original_bs_reads[i]) | 613 outf_MH.write("%s\n" % original_bs_reads[i]) |
| 612 outf_MH.close() | 614 outf_MH.close() |
| 613 | 615 |
| 614 | 616 |
| 615 | |
| 616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | 617 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] |
| 617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | 618 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] |
| 618 FW_C2T_uniq_lst.sort() | 619 FW_C2T_uniq_lst.sort() |
| 619 RC_C2T_uniq_lst.sort() | 620 RC_C2T_uniq_lst.sort() |
| 620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | 621 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] |
| 631 nn = 0 | 632 nn = 0 |
| 632 gseq = dict() | 633 gseq = dict() |
| 633 chr_length = dict() | 634 chr_length = dict() |
| 634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: | 635 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: |
| 635 nn += 1 | 636 nn += 1 |
| 636 mapped_chr0 = "" | |
| 637 for header in ali_unique_lst: | 637 for header in ali_unique_lst: |
| 638 _, mapped_chr, mapped_location, cigar = ali_dic[header] | 638 _, mapped_chr, mapped_location, cigar = ali_dic[header] |
| 639 original_BS = original_bs_reads[header] | 639 original_BS = original_bs_reads[header] |
| 640 #------------------------------------- | 640 #------------------------------------- |
| 641 if mapped_chr not in gseq : | 641 if mapped_chr not in gseq : |
| 642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | 642 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) |
| 643 chr_length[mapped_chr] = len(gseq[mapped_chr]) | 643 chr_length[mapped_chr] = len(gseq[mapped_chr]) |
| 644 #if mapped_chr != mapped_chr0: | |
| 645 # my_gseq = deserialize(db_d(mapped_chr)) | |
| 646 # chr_length = len(my_gseq) | |
| 647 # mapped_chr0 = mapped_chr | |
| 648 #------------------------------------- | |
| 649 | 644 |
| 650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | 645 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) |
| 651 | 646 |
| 652 all_mapped+=1 | 647 all_mapped+=1 |
| 653 if nn == 1: # +FW mapped to + strand: | 648 if nn == 1: # +FW mapped to + strand: |
| 662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) | 657 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) |
| 663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | 658 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) |
| 664 | 659 |
| 665 if len(r_aln) == len(g_aln): | 660 if len(r_aln) == len(g_aln): |
| 666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides | 661 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides |
| 667 if N_mismatch <= int(max_mismatch_no): | 662 mm_no=float(max_mismatch_no) |
| 663 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ): | |
| 668 numbers_mapped_lst[nn-1] += 1 | 664 numbers_mapped_lst[nn-1] += 1 |
| 669 all_mapped_passed += 1 | 665 all_mapped_passed += 1 |
| 670 methy = methy_seq(r_aln, g_aln+next) | 666 methy = methy_seq(r_aln, g_aln+next) |
| 671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | 667 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) |
| 672 | 668 |
| 676 nmCH = methy.count('Y') + methy.count('Z') | 672 nmCH = methy.count('Y') + methy.count('Z') |
| 677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | 673 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : |
| 678 XS = 1 | 674 XS = 1 |
| 679 | 675 |
| 680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) | 676 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) |
| 677 all_base_mapped += len(original_BS) | |
| 681 | 678 |
| 682 #---------------------------------------------------------------- | 679 #---------------------------------------------------------------- |
| 683 logm("--> %s (%d) "%(read_file,no_my_files)) | 680 logm("--> %s (%d) "%(read_file,no_my_files)) |
| 684 delete_files(WC2T, CC2T) | 681 delete_files(WC2T, CC2T) |
| 685 | 682 |
| 686 | 683 |
| 687 #---------------------------------------------------------------- | 684 #---------------------------------------------------------------- |
| 688 | 685 |
| 689 # outf.close() | |
| 690 delete_files(tmp_path) | 686 delete_files(tmp_path) |
| 691 | 687 |
| 692 logm("Number of raw reads: %d \n"% all_raw_reads) | 688 logm("Number of raw reads: %d \n"% all_raw_reads) |
| 693 if all_raw_reads >0: | 689 if all_raw_reads >0: |
| 694 logm("Number of reads having adapter removed: %d \n" % all_trimed ) | 690 logm("Number of bases in total: %d "%all_base_before_trim) |
| 695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | 691 if adapter != "" : |
| 696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped) | 692 logm("Number of reads having adapter removed: %d \n" % all_trimmed ) |
| 693 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) ) | |
| 694 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | |
| 695 logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped) | |
| 697 if asktag=="Y": | 696 if asktag=="Y": |
| 698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | 697 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) |
| 699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | 698 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) |
| 700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) | 699 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) |
| 701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) | 700 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) |
| 711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) | 710 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) |
| 712 elif asktag=="N": | 711 elif asktag=="N": |
| 713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) | 712 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) |
| 714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) | 713 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) |
| 715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) | 714 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) |
| 715 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) | |
| 716 | |
| 716 | 717 |
| 717 n_CG=mC_lst[0]+uC_lst[0] | 718 n_CG=mC_lst[0]+uC_lst[0] |
| 718 n_CHG=mC_lst[1]+uC_lst[1] | 719 n_CHG=mC_lst[1]+uC_lst[1] |
| 719 n_CHH=mC_lst[2]+uC_lst[2] | 720 n_CHH=mC_lst[2]+uC_lst[2] |
| 720 | 721 |
