Mercurial > repos > weilong-guo > bs_seeker2
view BSseeker2/bs_align/bs_pair_end.py @ 1:8b26adf64adc draft default tip
V2.0.5
| author | weilong-guo |
|---|---|
| date | Tue, 05 Nov 2013 01:55:39 -0500 |
| parents | e6df770c0e58 |
| children | |
line wrap: on
line source
import fileinput, os, random, math from bs_utils.utils import * from bs_align_utils import * #---------------------------------------------------------------- def extract_mapping(ali_file): unique_hits = {} non_unique_hits = {} header0 = "" family = [] for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True): #------------------------ if header != header0: # --- output ---- if len(family) == 1: unique_hits[header0] = family[0] elif len(family) > 1: min_lst = min(family, key = lambda x: x[0]) max_lst = max(family, key = lambda x: x[0]) if min_lst[0] < max_lst[0]: unique_hits[header0] = min_lst else: non_unique_hits[header0] = min_lst[0] header0 = header family = [] family.append((no_mismatch, chr, location1, cigar1, location2, cigar2)) if len(family) == 1: unique_hits[header0] = family[0] elif len(family) > 1: min_lst = min(family, key = lambda x: x[0]) max_lst = max(family, key = lambda x: x[0]) if min_lst[0] < max_lst[0]: unique_hits[header0] = min_lst else: non_unique_hits[header0] = min_lst[0] return unique_hits, non_unique_hits def _extract_mapping(ali_file): U = {} R = {} header0 = "" family = [] for line in fileinput.input(ali_file): l = line.split() header = l[0][:-2] chr = str(l[1]) location = int(l[2]) #no_hits=int(l[4]) #-------- mismatches ----------- if len(l) == 4: no_mismatch = 0 elif len(l) == 5: no_mismatch = l[4].count(":") else: print l #------------------------ if header != header0: #-------------------- if header0 != "": # --- output ---- if len(family) == 1: U[header0] = family[0] else: if family[0][0] < family[1][0]: U[header0] = family[0] elif family[1][0] < family[0][0]: U[header0] = family[1] else: R[header0] = family[0][0] family=[] # --------------- header0 = header family = [[no_mismatch, chr, location]] member = 1 elif header == header0: if member == 1: family[-1][0] += no_mismatch family[-1].append(location) member = 2 elif member == 2: family.append([no_mismatch, chr, 
location]) member = 1 #------------------------------ fileinput.close() return U, R #---------------------------------------------------------------- def bs_pair_end(main_read_file_1, main_read_file_2, asktag, adapter_file, cut1, cut2, # add cut3 and cut4? no_small_lines, max_mismatch_no, aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False): #---------------------------------------------------------------- adapter = "" adapterA = "" adapterB = "" if adapter_file !="": try : adapter_inf=open(adapter_file,"r") if asktag=="N": #<--- directional library adapter=adapter_inf.readline() adapter_inf.close() adapter=adapter.rstrip("\n") elif asktag=="Y":#<--- un-directional library adapterA=adapter_inf.readline() adapterB=adapter_inf.readline() adapter_inf.close() adapterA=adapterA.rstrip("\n") adapterB=adapterB.rstrip("\n") except IOError : print "[Error] Cannot find adapter file : %s" % adapter_file exit(-1) #---------------------------------------------------------------- logm("End 1 filename: %s"% main_read_file_1 ) logm("End 2 filename: %s"% main_read_file_2 ) logm("The first base (for mapping): %d"% cut1 ) logm("The last base (for mapping): %d"% cut2 ) logm("Path for short reads aligner: %s"% aligner_command + '\n') logm("Reference genome library path: %s"% db_path ) if asktag == "Y" : logm("Un-directional library" ) else : logm("Directional library") logm("Number of mismatches allowed: %s"% max_mismatch_no ) if adapter_file !="": if asktag=="Y": logm("Adapters to be removed from 3' of the reads:" ) logm("-- A: %s" % adapterA ) logm("-- B: %s" % adapterB ) elif asktag=="N": logm("Adapter to be removed from 3' of the reads:" ) logm("-- %s" % adapter ) logm("-------------------------------- " ) #---------------------------------------------------------------- # helper method to join fname with tmp_path tmp_d = lambda fname: os.path.join(tmp_path, fname) db_d = lambda fname: os.path.join(db_path, fname) 
#---------------------------------------------------------------- # splitting the 2 big read files input_fname1 = os.path.split(main_read_file_1)[1] input_fname2 = os.path.split(main_read_file_2)[1] # TODO: run these in parallel with a subprocess split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines) split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines) dirList=os.listdir(tmp_path) my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)), sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList))) #---- Stats ------------------------------------------------------------ all_raw_reads=0 all_trimmed=0 all_mapped=0 all_mapped_passed=0 all_base_before_trim=0 all_base_after_trim=0 all_base_mapped=0 all_unmapped=0 numbers_premapped_lst=[0,0,0,0] numbers_mapped_lst=[0,0,0,0] mC_lst=[0,0,0] uC_lst=[0,0,0] no_my_files=0 read_id_lst_1 = dict() read_id_lst_2 = dict() #---------------------------------------------------------------- print "== Start mapping ==" for read_file_1, read_file_2 in my_files: no_my_files+=1 random_id=".tmp-"+str(random.randint(1000000,9999999)) original_bs_reads_1 = {} original_bs_reads_2 = {} original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2] if asktag == "Y" : #---------------------------------------------------------------- outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id) outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) try : if read_file_1.endswith(".gz") : # support input file ending with ".gz" read_inf = gzip.open(tmp_d(read_file_1), "rb") else : read_inf = open(tmp_d(read_file_1),"r") except IOError : print "[Error] Cannot open file : %s", tmp_d(read_file_1) exit(-1) oneline = read_inf.readline() l = oneline.split() input_format = "" if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) input_format="fastq" elif 
len(l)==1 and oneline[0]!=">": # pure sequences input_format="seq" elif len(l)==11: # Illumina GAII qseq file input_format="qseq" elif oneline[0]==">": # fasta input_format="fasta" read_inf.close() print "Detected data format: %s" % input_format #---------------------------------------------------------------- read_file_list = [read_file_1, read_file_2] outfile_FCT_list = [outfile_1FCT, outfile_2RGA] outfile_RCT_list = [outfile_1RCT, outfile_2RCT] n_list = [0, 0] for f in range(2): read_file = read_file_list[f] outf_FCT = open(outfile_FCT_list[f], 'w') outf_RGA = open(outfile_RCT_list[f], 'w') original_bs_reads = original_bs_reads_lst[f] n = n_list[f] read_id = "" seq = "" seq_ready = "N" line_no = 0 for line in fileinput.input(tmp_d(read_file)): l = line.split() line_no += 1 if input_format=="seq": n += 1 read_id = str(n) read_id = read_id.zfill(12) if f == 1 : read_id_lst_1[read_id] = read_id else : read_id_lst_2[read_id] = read_id seq = l[0] seq_ready = "Y" elif input_format=="fastq": l_fastq = math.fmod(line_no, 4) if l_fastq==1 : all_raw_reads += 1 read_id = str(line_no/4+1).zfill(12) if f==1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq_ready="N" elif l_fastq==2 : seq = l[0] seq_ready = "Y" else : seq = "" seq_ready = "N" elif input_format=="qseq": all_raw_reads += 1 read_id = str(line_no) read_id = read_id.zfill(12) if f == 1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq = l[8] seq_ready = "Y" elif input_format=="fasta": l_fasta = math.fmod(line_no,2) seq_ready = "N" if l_fasta==1: all_raw_reads += 1 read_id = str(line_no/2+1).zfill(12) if f==1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq = "" elif l_fasta==0 : seq = l[0] seq_ready = "Y" #---------------------------------------------------------------- if seq_ready=="Y": seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 seq = seq.upper() seq = seq.replace(".","N") #--striping BS adapter 
from 3' read -------------------------------------------------------------- all_base_before_trim += len(seq) if f==0 and adapterA!="" : new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) if len(new_read)<len(seq) : all_trimmed += 1 seq = new_read if f==1 and adapterB!="" : new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) if len(new_read)<len(seq) : all_trimmed += 1 seq = new_read all_base_after_trim += len(seq) if len(seq)<=4: seq = "N" * (cut2-cut1+1) #--------- trimmed_raw_BS_read ------------------ original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) #--------- RC_G2A ------------------ outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) n_list[f] = n outf_FCT.close() outf_RGA.close() fileinput.close() #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); all_raw_reads += n #-------------------------------------------------------------------------------- # Bowtie mapping #-------------------------------------------------------------------------------- WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 'input_file_1' : outfile_1FCT, 'input_file_2' : outfile_2RCT, 'output_file' : WC2T_fr}, aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 'input_file_1' : outfile_1FCT, 'input_file_2' : outfile_2RCT, 'output_file' : CG2A_fr}, aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 'input_file_1' : outfile_2RGA, 'input_file_2' : outfile_1RCT, 'output_file' : WC2T_rf}, aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 'input_file_1' : outfile_2RGA, 'input_file_2' : outfile_1RCT, 'output_file' : 
CC2T_rf}]) delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf) #---------------------------------------------------------------- # get uniq-hit reads #---------------------------------------------------------------- Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) Unique_FW_fr_C2T = set() # + Unique_FW_rf_C2T = set() # + Unique_RC_fr_G2A = set() # - Unique_RC_rf_C2T = set() # - Multiple_hits = set() for x in Union_set: _list = [] for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]: mis_lst = d.get(x,[99]) mis = int(mis_lst[0]) _list.append(mis) for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]: mis = d.get(x,99) _list.append(mis) mini = min(_list) if _list.count(mini)==1: mini_index = _list.index(mini) if mini_index==0: Unique_FW_fr_C2T.add(x) elif mini_index==1: Unique_FW_rf_C2T.add(x) elif mini_index==2: Unique_RC_fr_G2A.add(x) elif mini_index==3: Unique_RC_rf_C2T.add(x) else : Multiple_hits.add(x) else : Multiple_hits.add(x) # write reads rejected by Multiple Hits to file if show_multiple_hit : outf_MH = open("Multiple_hit.fa",'w') for i in Multiple_hits : outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) outf_MH.close() del Union_set del FW_C2T_fr_R del FW_C2T_rf_R del RC_G2A_fr_R del RC_C2T_rf_R FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in 
Unique_RC_fr_G2A] RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] FW_C2T_fr_uniq_lst.sort() FW_C2T_rf_uniq_lst.sort() RC_C2T_fr_uniq_lst.sort() RC_C2T_rf_uniq_lst.sort() FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst] RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst] #---------------------------------------------------------------- numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) numbers_premapped_lst[1] += len(Unique_FW_rf_C2T) numbers_premapped_lst[2] += len(Unique_RC_fr_G2A) numbers_premapped_lst[3] += len(Unique_RC_rf_C2T) del Unique_FW_fr_C2T del Unique_FW_rf_C2T del Unique_RC_fr_G2A del Unique_RC_rf_C2T #---------------------------------------------------------------- nn = 0 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U), (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: nn += 1 mapped_chr0 = "" for header in ali_unique_lst: _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] #------------------------------------- if mapped_chr!=mapped_chr0: my_gseq = deserialize(db_d(mapped_chr)) chr_length = len(my_gseq) mapped_chr0 = mapped_chr #------------------------------------- if nn==1 or nn==3 : original_BS_1 = original_bs_reads_1[header] original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) else : original_BS_1 = original_bs_reads_2[header] original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) all_mapped += 1 if nn==1 : # FW-RC mapped to + strand: FR = "+FR" mapped_strand_1 = "+" mapped_strand_2 = "+" elif nn==2 : # RC-FW mapped to + strand: FR = "+RF" mapped_strand_1 = "+" mapped_strand_2 = "+" elif nn==3 : # FW-RC mapped to - strand: FR = "-FR" 
mapped_location_1 = chr_length - mapped_location_1 - g_len_1 mapped_strand_1 = "-" mapped_location_2 = chr_length - mapped_location_2 - g_len_2 mapped_strand_2 = "-" elif nn==4 : # RC-FW mapped to - strand: FR = "-RF" mapped_location_1 = chr_length - mapped_location_1 - g_len_1 mapped_strand_1 = "-" mapped_location_2 = chr_length - mapped_location_2 - g_len_2 mapped_strand_2 = "-" origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides N_mismatch = max(N_mismatch_1, N_mismatch_2) mm_no=float(max_mismatch_no) if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) and N_mismatch_2<=mm_no*len(original_BS_2)): all_mapped_passed += 1 numbers_mapped_lst[nn-1] += 1 #---- unmapped ------------------------- del original_bs_reads_1[header] del original_bs_reads_2[header] #--------------------------------------- methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1) methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2) mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) #---XS FILTER---------------- XS_1 = 0 nCH_1 = methy_1.count('y') + methy_1.count('z') nmCH_1 = methy_1.count('Y') + methy_1.count('Z') if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : XS_1 = 1 XS_2 = 0 nCH_2 = methy_2.count('y') + methy_2.count('z') nmCH_2 = 
methy_2.count('Y') + methy_2.count('Z') if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : XS_2 = 1 if mapped_strand_1=='+' : flag_1 = 67 # 1000011 : 1st read, + strand flag_2 = 131 #10000011 : 2nd read, + strand else : flag_1 = 115 # 1110011 : 1st read, - strand flag_2 = 179 # 10110011 : 2nd read, - strand outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) all_base_mapped += len(original_BS_1) outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) all_base_mapped += len(original_BS_2) print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) #---------------------------------------------------------------- # output unmapped pairs #---------------------------------------------------------------- unmapped_lst = original_bs_reads_1.keys() unmapped_lst.sort() # for u in unmapped_lst: # outf_u1.write("%s\n"%original_bs_reads_1[u]) # outf_u2.write("%s\n"%original_bs_reads_2[u]) all_unmapped += len(unmapped_lst) # Directional library if asktag=="N": #---------------------------------------------------------------- outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id) outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id) try : if read_file_1.endswith(".gz") : # support input file ending with ".gz" read_inf = gzip.open(tmp_d(read_file_1), "rb") else : read_inf = open(tmp_d(read_file_1),"r") except IOError : print "[Error] Cannot open file : %s", tmp_d(read_file_1) exit(-1) oneline = read_inf.readline() l = oneline.split() input_format = "" if oneline[0]=="@" : input_format = "fastq" elif len(l)==1 and oneline[0]!=">" : input_format = "seq" elif len(l)==11 : input_format = "qseq" elif 
oneline[0]==">" : input_format = "fasta" read_inf.close() print "Detected data format: %s" % input_format #---------------------------------------------------------------- read_file_list = [read_file_1,read_file_2] outfile_FCT_list = [outfile_1FCT,outfile_2RGA] n_list = [0,0] for f in range(2): read_file = read_file_list[f] outf_FCT = open(outfile_FCT_list[f],'w') original_bs_reads = original_bs_reads_lst[f] n = n_list[f] read_id = "" seq = "" seq_ready = "N" line_no = 0 for line in fileinput.input(tmp_d(read_file)): l = line.split() line_no += 1 if input_format=="seq": n += 1 read_id = str(n) read_id = read_id.zfill(12) if f == 1 : read_id_lst_1[read_id] = read_id else : read_id_lst_2[read_id] = read_id seq = l[0] seq_ready = "Y" elif input_format=="fastq": l_fastq = math.fmod(line_no, 4) if l_fastq==1 : all_raw_reads += 1 read_id = str(line_no/4+1).zfill(12) if f==1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq_ready = "N" elif l_fastq==2 : seq = l[0] seq_ready = "Y" else : seq = "" seq_ready = "N" elif input_format=="qseq": all_raw_reads += 1 read_id = str(line_no) read_id = read_id.zfill(12) if f == 1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq = l[8] seq_ready = "Y" elif input_format=="fasta": l_fasta = math.fmod(line_no,2) if l_fasta==1: seq_ready = "N" all_raw_reads += 1 read_id = str(line_no/2+1).zfill(12) if f == 1 : read_id_lst_1[read_id] = l[0][1:] else : read_id_lst_2[read_id] = l[0][1:] seq = "" elif l_fasta==0 : seq = l[0] seq_ready = "Y" #---------------------------------------------------------------- if seq_ready=="Y": seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 seq = seq.upper() seq = seq.replace(".","N") #--striping BS adapter from 3' read -------------------------------------------------------------- all_base_before_trim += len(seq) if f==0 and adapterA!="" : new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) if len(new_read) < 
len(seq) : all_trimmed += 1 seq = new_read if f==1 and adapterB!="" : new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) if len(new_read)<len(seq) : all_trimmed += 1 seq = new_read all_base_after_trim += len(seq) if len(seq)<=4: seq = "N" * (cut2-cut1+1) #--------- trimmed_raw_BS_read ------------------ original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ if f==0: outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T"))) elif f==1: outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A"))) n_list[f] = n outf_FCT.close() fileinput.close() #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); all_raw_reads += n #-------------------------------------------------------------------------------- # Bowtie mapping #-------------------------------------------------------------------------------- WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 'input_file_1' : outfile_1FCT, 'input_file_2' : outfile_2RGA, 'output_file' : WC2T_fr}, aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 'input_file_1' : outfile_1FCT, 'input_file_2' : outfile_2RGA, 'output_file' : CG2A_fr} ]) delete_files(outfile_1FCT, outfile_2RGA) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) #---------------------------------------------------------------- # get uniq-hit reads #---------------------------------------------------------------- Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) Unique_FW_fr_C2T = set() # + Unique_RC_fr_G2A = set() # - Multiple_hits=set() for x in Union_set: _list = [] for d in [FW_C2T_fr_U, 
RC_G2A_fr_U]: mis_lst = d.get(x,[99]) mis = int(mis_lst[0]) _list.append(mis) for d in [FW_C2T_fr_R, RC_G2A_fr_R]: mis = d.get(x,99) _list.append(mis) mini = min(_list) if _list.count(mini) == 1: mini_index = _list.index(mini) if mini_index == 0: Unique_FW_fr_C2T.add(x) elif mini_index == 1: Unique_RC_fr_G2A.add(x) else : Multiple_hits.add(x) else : Multiple_hits.add(x) # write reads rejected by Multiple Hits to file if show_multiple_hit : outf_MH = open("Multiple_hit.fa",'w') for i in Multiple_hits : outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) outf_MH.close() FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A] FW_C2T_fr_uniq_lst.sort() RC_C2T_fr_uniq_lst.sort() FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] #---------------------------------------------------------------- numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) numbers_premapped_lst[1] += len(Unique_RC_fr_G2A) #---------------------------------------------------------------- nn = 0 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]: nn += 1 mapped_chr0 = "" for header in ali_unique_lst: _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] #------------------------------------- if mapped_chr != mapped_chr0: my_gseq = deserialize(db_d(mapped_chr)) chr_length = len(my_gseq) mapped_chr0 = mapped_chr #------------------------------------- original_BS_1 = original_bs_reads_1[header] original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) #original_BS_2 = original_bs_reads_2[header] r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) all_mapped += 1 if nn == 1: # FW-RC mapped to + strand: FR = "+FR" mapped_strand_1 = "+" mapped_strand_2 = "+" elif 
nn == 2: # FW-RC mapped to - strand: FR="-FR" mapped_location_1 = chr_length - mapped_location_1 - g_len_1 mapped_strand_1 = "-" mapped_location_2 = chr_length - mapped_location_2 - g_len_2 mapped_strand_2 = "-" origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides # if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): # if N_mismatch <= int(max_mismatch_no) : N_mismatch = max(N_mismatch_1, N_mismatch_2) mm_no=float(max_mismatch_no) if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) and N_mismatch_2<=mm_no*len(original_BS_2)): numbers_mapped_lst[nn-1] += 1 all_mapped_passed += 1 #---- unmapped ------------------------- del original_bs_reads_1[header] del original_bs_reads_2[header] #--------------------------------------- methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst) # #---XS FILTER---------------- #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 XS_1 = 0 nCH_1 = methy_1.count('y') + methy_1.count('z') nmCH_1 = methy_1.count('Y') + methy_1.count('Z') if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : XS_1 = 1 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 XS_2 = 0 nCH_2 = methy_2.count('y') + 
methy_2.count('z') nmCH_2 = methy_2.count('Y') + methy_2.count('Z') if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : XS_2 = 1 if mapped_strand_1 == '+' : flag_1 = 67 # 1000011 : 1st read, + strand flag_2 = 131 # 10000011 : 2nd read, + strand else : flag_1 = 115 # 1110011 : 1st read, - strand flag_2 = 179 # 10110011 : 2nd read, - strand outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) all_base_mapped += len(original_BS_1) outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) all_base_mapped += len(original_BS_2) print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) #---------------------------------------------------------------- # output unmapped pairs #---------------------------------------------------------------- unmapped_lst=original_bs_reads_1.keys() unmapped_lst.sort() # for u in unmapped_lst: # outf_u1.write("%s\n"%(original_bs_reads_1[u])) # outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) all_unmapped+=len(unmapped_lst) #================================================================================================== delete_files(tmp_path) logm("-------------------------------- " ) logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) ) if adapterA != "" or adapterB != "" : logm("Number of reads having adapter removed: %d \n"% all_trimmed) logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) ) if all_raw_reads >0: logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped 
+ "\n") if asktag=="Y": logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) elif asktag=="N": logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) if asktag=="Y": logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) elif asktag=="N": logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) ) logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) logm("Unmapped read pairs: %d"% all_unmapped+"\n") n_CG=mC_lst[0]+uC_lst[0] n_CHG=mC_lst[1]+uC_lst[1] n_CHH=mC_lst[2]+uC_lst[2] logm("-------------------------------- " ) logm("Methylated C in mapped reads " ) logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) ) logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) ) logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) ) logm("----------------------------------------------" ) logm("------------------- END ----------------------" ) elapsed("=== END %s %s ===" % 
(main_read_file_1, main_read_file_2))