Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_align/bs_pair_end.py @ 0:e6df770c0e58 draft
Initial upload
author:   weilong-guo
date:     Fri, 12 Jul 2013 18:47:28 -0400
parents:  (none)
children: 8b26adf64adc
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_pair_end.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,1011 @@ +import fileinput, os, random, math +from bs_utils.utils import * +from bs_align_utils import * +#---------------------------------------------------------------- +def extract_mapping(ali_file): + unique_hits = {} + non_unique_hits = {} + header0 = "" + family = [] + for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True): + #------------------------ + if header != header0: + + # --- output ---- + if len(family) == 1: + unique_hits[header0] = family[0] + elif len(family) > 1: + min_lst = min(family, key = lambda x: x[0]) + max_lst = max(family, key = lambda x: x[0]) + + if min_lst[0] < max_lst[0]: + unique_hits[header0] = min_lst + else: + non_unique_hits[header0] = min_lst[0] + + header0 = header + family = [] + family.append((no_mismatch, chr, location1, cigar1, location2, cigar2)) + + if len(family) == 1: + unique_hits[header0] = family[0] + elif len(family) > 1: + min_lst = min(family, key = lambda x: x[0]) + max_lst = max(family, key = lambda x: x[0]) + + if min_lst[0] < max_lst[0]: + unique_hits[header0] = min_lst + else: + non_unique_hits[header0] = min_lst[0] + + return unique_hits, non_unique_hits + +def _extract_mapping(ali_file): + U = {} + R = {} + header0 = "" + family = [] + for line in fileinput.input(ali_file): + l = line.split() + header = l[0][:-2] + chr = str(l[1]) + location = int(l[2]) + #no_hits=int(l[4]) + #-------- mismatches ----------- + if len(l) == 4: + no_mismatch = 0 + elif len(l) == 5: + no_mismatch = l[4].count(":") + else: + print l + #------------------------ + if header != header0: + #-------------------- + if header0 != "": + # --- output ---- + if len(family) == 1: + U[header0] = family[0] + else: + if family[0][0] < family[1][0]: + U[header0] = family[0] + elif family[1][0] < family[0][0]: + U[header0] = family[1] + else: + 
R[header0] = family[0][0] + family=[] + # --------------- + header0 = header + family = [[no_mismatch, chr, location]] + member = 1 + elif header == header0: + if member == 1: + family[-1][0] += no_mismatch + family[-1].append(location) + member = 2 + elif member == 2: + family.append([no_mismatch, chr, location]) + member = 1 + #------------------------------ + + fileinput.close() + return U, R + + +#---------------------------------------------------------------- + +def bs_pair_end(main_read_file_1, + main_read_file_2, + asktag, + adapter_file, + cut1, + cut2, # add cut3 and cut4? + no_small_lines, + max_mismatch_no, + aligner_command, + db_path, + tmp_path, + outfile, XS_pct, XS_count, show_multiple_hit=False): + + + #---------------------------------------------------------------- + adapter="" + adapterA="" + adapterB="" + if adapter_file !="": + try : + adapter_inf=open(adapter_file,"r") + if asktag=="N": #<--- directional library + adapter=adapter_inf.readline() + adapter_inf.close() + adapter=adapter.rstrip("\n") + elif asktag=="Y":#<--- un-directional library + adapterA=adapter_inf.readline() + adapterB=adapter_inf.readline() + adapter_inf.close() + adapterA=adapterA.rstrip("\n") + adapterB=adapterB.rstrip("\n") + except IOError : + print "[Error] Cannot find adapter file : %s" % adapter_file + exit(-1) + + + #---------------------------------------------------------------- + + logm("End 1 filename: %s"% main_read_file_1 ) + logm("End 2 filename: %s"% main_read_file_2 ) + logm("The first base (for mapping): %d"% cut1 ) + logm("The last base (for mapping): %d"% cut2 ) + + logm("-------------------------------- " ) + logm("Un-directional library: %s" % asktag ) + logm("Path for short reads aligner: %s"% aligner_command + '\n') + logm("Reference genome library path: %s"% db_path ) + logm("Number of mismatches allowed: %s"% max_mismatch_no ) + + if adapter_file !="": + if asktag=="Y": + logm("Adapters to be removed from 3' of the reads:" ) + logm("-- A: %s" % 
adapterA ) + logm("-- B: %s" % adapterB ) + elif asktag=="N": + logm("Adapter to be removed from 3' of the reads:" ) + logm("-- %s" % adapter ) + + logm("-------------------------------- " ) + + #---------------------------------------------------------------- + + # helper method to join fname with tmp_path + tmp_d = lambda fname: os.path.join(tmp_path, fname) + + db_d = lambda fname: os.path.join(db_path, fname) + + + #---------------------------------------------------------------- + # splitting the 2 big read files + + input_fname1 = os.path.split(main_read_file_1)[1] + input_fname2 = os.path.split(main_read_file_2)[1] + + # TODO: run these in parallel with a subprocess + split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines) + split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines) + + dirList=os.listdir(tmp_path) + my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)), + sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList))) + + + + #---- Stats ------------------------------------------------------------ + all_raw_reads=0 + all_trimmed=0 + all_mapped=0 + all_mapped_passed=0 + + all_unmapped=0 + + numbers_premapped_lst=[0,0,0,0] + numbers_mapped_lst=[0,0,0,0] + + mC_lst=[0,0,0] + uC_lst=[0,0,0] + + no_my_files=0 + + #---------------------------------------------------------------- + print "== Start mapping ==" + + for read_file_1, read_file_2 in my_files: + + no_my_files+=1 + + random_id=".tmp-"+str(random.randint(1000000,9999999)) + + original_bs_reads_1 = {} + original_bs_reads_2 = {} + original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2] + + + if asktag == "Y" : + + #---------------------------------------------------------------- + outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) + outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) + outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id) + outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) + + 
try : + read_inf = open(tmp_d(read_file_1),"r") + except IOError : + print "[Error] Cannot open file : %s", tmp_d(read_file_1) + exit(-1) + oneline = read_inf.readline() + l = oneline.split() + input_format = "" + + if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) + input_format="fastq" + n_fastq=0 + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="seq" + elif len(l)==11: # Illumina GAII qseq file + input_format="qseq" + elif oneline[0]==">": # fasta + input_format="fasta" + n_fasta=0 + read_inf.close() + + print "Detected data format: %s" % input_format + + #---------------------------------------------------------------- + read_file_list = [read_file_1, read_file_2] + outfile_FCT_list = [outfile_1FCT, outfile_2FCT] + outfile_RCT_list = [outfile_1RCT, outfile_2RCT] + n_list = [0, 0] + + + for f in range(2): + read_file = read_file_list[f] + outf_FCT = open(outfile_FCT_list[f], 'w') + outf_RCT = open(outfile_RCT_list[f], 'w') + original_bs_reads = original_bs_reads_lst[f] + n = n_list[f] + + seq_id = "" + seq = "" + seq_ready = "N" + for line in fileinput.input(tmp_d(read_file)): + l=line.split() + if input_format=="seq": + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="fastq": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if m_fastq==0: + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif input_format=="qseq": + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + n+=1 + seq_id=l[0][1:] + seq_id=seq_id.zfill(17) + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + #---------------------------------------------------------------- + if seq_ready=="Y": + seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 + seq=seq.upper() + 
seq=seq.replace(".","N") + + #--striping BS adapter from 3' read -------------------------------------------------------------- + if (adapterA !="") and (adapterB !=""): + signature=adapterA[:6] + if signature in seq: + signature_pos=seq.index(signature) + if seq[signature_pos:] in adapterA: + seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) + all_trimmed+=1 + else: + signature=adapterB[:6] + if signature in seq: + #print seq_id,seq,signature; + signature_pos=seq.index(signature) + if seq[signature_pos:] in adapterB: + seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) + all_trimmed+=1 + + if len(seq) <= 4: + seq = "N" * (cut2-cut1+1) + #--------- trimmed_raw_BS_read ------------------ + original_bs_reads[seq_id] = seq + + #--------- FW_C2T ------------------ + outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T"))) + #--------- RC_G2A ------------------ + outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) + + n_list[f]=n + + outf_FCT.close() + outf_RCT.close() + + fileinput.close() + + + #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); + all_raw_reads+=n + + #-------------------------------------------------------------------------------- + # Bowtie mapping + #-------------------------------------------------------------------------------- + WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) + CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) + + run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file_1' : outfile_1FCT, + 'input_file_2' : outfile_2RCT, + 'output_file' : WC2T_fr}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file_1' : outfile_1FCT, + 'input_file_2' : outfile_2RCT, + 'output_file' : CC2T_fr}, + + 
aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file_1' : outfile_2FCT, + 'input_file_2' : outfile_1RCT, + 'output_file' : WC2T_rf}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file_1' : outfile_2FCT, + 'input_file_2' : outfile_1RCT, + 'output_file' : CC2T_rf}]) + + + delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT) + + + #-------------------------------------------------------------------------------- + # Post processing + #-------------------------------------------------------------------------------- + + + FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) + FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) + RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) + RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) + + delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf) + + #---------------------------------------------------------------- + # get uniq-hit reads + #---------------------------------------------------------------- + Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) + + Unique_FW_fr_C2T=set() # + + Unique_FW_rf_C2T=set() # + + Unique_RC_fr_C2T=set() # - + Unique_RC_rf_C2T=set() # - + Multiple_hits=set() + + + for x in Union_set: + _list=[] + for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]: + mis_lst=d.get(x,[99]) + mis=int(mis_lst[0]) + _list.append(mis) + for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]: + mis=d.get(x,99) + _list.append(mis) + mini=min(_list) + if _list.count(mini)==1: + mini_index=_list.index(mini) + if mini_index==0: + Unique_FW_fr_C2T.add(x) + elif mini_index==1: + Unique_FW_rf_C2T.add(x) + elif mini_index==2: + Unique_RC_fr_C2T.add(x) + elif mini_index==3: + Unique_RC_rf_C2T.add(x) + else : + Multiple_hits.add(x) + else : + Multiple_hits.add(x) + + # write reads rejected by Multiple Hits to file + if show_multiple_hit : + 
outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + del Union_set + del FW_C2T_fr_R + del FW_C2T_rf_R + del RC_C2T_fr_R + del RC_C2T_rf_R + + FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] + FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] + RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] + RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] + + FW_C2T_fr_uniq_lst.sort() + FW_C2T_rf_uniq_lst.sort() + RC_C2T_fr_uniq_lst.sort() + RC_C2T_rf_uniq_lst.sort() + FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] + FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst] + RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] + RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst] + #---------------------------------------------------------------- + + numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) + numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T) + numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T) + numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T) + + del Unique_FW_fr_C2T + del Unique_FW_rf_C2T + del Unique_RC_fr_C2T + del Unique_RC_rf_C2T + + #---------------------------------------------------------------- + + nn = 0 + for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), + (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), + (RC_C2T_fr_uniq_lst,RC_C2T_fr_U), + (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: + nn += 1 + + mapped_chr0 = "" + for header in ali_unique_lst: + + _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] + + #------------------------------------- + if mapped_chr != mapped_chr0: + my_gseq=deserialize(db_d(mapped_chr)) + chr_length=len(my_gseq) + mapped_chr0=mapped_chr + #------------------------------------- + if nn == 1 or nn == 3: + original_BS_1 = original_bs_reads_1[header] + original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) + else: + 
original_BS_1 = original_bs_reads_2[header] + original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) + + r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) + r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) + + all_mapped += 1 + + if nn == 1: # FW-RC mapped to + strand: + + FR = "+FR" + +# mapped_location_1 += 1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "+" + +# mapped_location_2 += 1 +# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] +# origin_genome_2 = origin_genome_long_2[2:-2] + mapped_strand_2 = "+" + + elif nn==2: # RC-FW mapped to + strand: + +# original_BS_1 = original_bs_reads_2[header] +# original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) + FR = "+RF" +# mapped_location_1 += 1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "+" + +# mapped_location_2 += 1 +# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] +# origin_genome_2 = origin_genome_long_2[2:-2] + mapped_strand_2 = "+" + + + elif nn==3: # FW-RC mapped to - strand: +# original_BS_1=original_bs_reads_1[header] +# original_BS_2=reverse_compl_seq(original_bs_reads_2[header]) + + FR = "-FR" + mapped_location_1 = chr_length - mapped_location_1 - g_len_1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "-" + + mapped_location_2 = chr_length - mapped_location_2 - g_len_2 +# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ]) +# origin_genome_2 = 
origin_genome_long_2[2:-2] + mapped_strand_2 = "-" + + elif nn==4: # RC-FW mapped to - strand: +# original_BS_1=original_bs_reads_2[header] +# original_BS_2=reverse_compl_seq(original_bs_reads_1[header]) + + FR = "-RF" + mapped_location_1 = chr_length - mapped_location_1 - g_len_1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "-" + + mapped_location_2 = chr_length - mapped_location_2 - g_len_2 +# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) +# origin_genome_2 = origin_genome_long_2[2:-2] + mapped_strand_2 = "-" + + + + + origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) + origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) + + r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) + r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) + + + N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides + N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides + + + if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) : + all_mapped_passed += 1 + numbers_mapped_lst[nn-1] += 1 + #---- unmapped ------------------------- + del original_bs_reads_1[header] + del original_bs_reads_2[header] + #--------------------------------------- + +# output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] +# output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + 
origin_genome_long_2[-2:] + + methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) + methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) + + mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) + mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) + + + # + #---XS FILTER---------------- + #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 + XS_1 = 0 + nCH_1 = methy_1.count('y') + methy_1.count('z') + nmCH_1 = methy_1.count('Y') + methy_1.count('Z') + if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : + XS_1 = 1 + #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 + XS_2 = 0 + nCH_2 = methy_2.count('y') + methy_2.count('z') + nmCH_2 = methy_2.count('Y') + methy_2.count('Z') + if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : + XS_2 = 1 + + + outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, + output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) + outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, + output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) + + print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) + #---------------------------------------------------------------- + # output unmapped pairs + #---------------------------------------------------------------- + + unmapped_lst=original_bs_reads_1.keys() + unmapped_lst.sort() + +# for u in unmapped_lst: +# outf_u1.write("%s\n"%original_bs_reads_1[u]) +# outf_u2.write("%s\n"%original_bs_reads_2[u]) + + all_unmapped += len(unmapped_lst) + + + if asktag=="N": + + #---------------------------------------------------------------- + outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id) + outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id) + + try : + read_inf=open(tmp_d(read_file_1),"r") + except IOError : + print "[Error] Cannot open file : %s", tmp_d(read_file_1) + exit(-1) + + 
oneline=read_inf.readline() + l=oneline.split() + input_format="" + + #if len(l)==5: # old solexa format + # input_format="old Solexa Seq file" + + if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) + input_format="FastQ" + n_fastq=0 + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="_list of sequences" + elif len(l)==11: # Illumina GAII qseq file + input_format="Illumina GAII qseq file" + elif oneline[0]==">": # fasta + input_format="fasta" + n_fasta=0 + + read_inf.close() + + print "Detected data format: %s" % input_format + + #---------------------------------------------------------------- + read_file_list=[read_file_1,read_file_2] + outfile_FCT_list=[outfile_1FCT,outfile_2FCT] + n_list=[0,0] + + for f in range(2): + read_file=read_file_list[f] + outf_FCT=open(outfile_FCT_list[f],'w') + original_bs_reads = original_bs_reads_lst[f] + n=n_list[f] + + seq_id="" + seq="" + seq_ready="N" + for line in fileinput.input(tmp_d(read_file)): + l=line.split() + if input_format=="old Solexa Seq file": + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq=l[4] + seq_ready="Y" + elif input_format=="_list of sequences": + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="FastQ": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if m_fastq==0: + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif input_format=="Illumina GAII qseq file": + n+=1 + seq_id=str(n) + seq_id=seq_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + n+=1 + seq_id=l[0][1:] + seq_id=seq_id.zfill(17) + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + #---------------------------------------------------------------- + if seq_ready=="Y": + seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 + 
seq=seq.upper() + seq=seq.replace(".","N") + + #--striping BS adapter from 3' read -------------------------------------------------------------- + if (adapterA !="") and (adapterB !=""): + signature=adapterA[:6] + if signature in seq: + signature_pos=seq.index(signature) + if seq[signature_pos:] in adapterA: + seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) + all_trimmed+=1 + else: + signature=adapterB[:6] + if signature in seq: + #print seq_id,seq,signature; + signature_pos=seq.index(signature) + if seq[signature_pos:] in adapterB: + seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) + all_trimmed+=1 + + if len(seq) <= 4: + seq = "N" * (cut2-cut1+1) + #--------- trimmed_raw_BS_read ------------------ + original_bs_reads[seq_id] = seq + + #--------- FW_C2T ------------------ + if f==0: + outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T"))) + elif f==1: + outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T"))) + + + n_list[f]=n + outf_FCT.close() + + fileinput.close() + + + #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); + all_raw_reads+=n + + #-------------------------------------------------------------------------------- + # Bowtie mapping + #-------------------------------------------------------------------------------- + WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + + run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file_1' : outfile_1FCT, + 'input_file_2' : outfile_2FCT, + 'output_file' : WC2T_fr}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file_1' : outfile_1FCT, + 'input_file_2' : outfile_2FCT, + 'output_file' : CC2T_fr} ]) + + + delete_files(outfile_1FCT, outfile_2FCT) + + #-------------------------------------------------------------------------------- + # Post 
processing + #-------------------------------------------------------------------------------- + + FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) + RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) + + #---------------------------------------------------------------- + # get uniq-hit reads + #---------------------------------------------------------------- + Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) + + Unique_FW_fr_C2T = set() # + + Unique_RC_fr_C2T = set() # - + Multiple_hits=set() + + + for x in Union_set: + _list = [] + for d in [FW_C2T_fr_U, RC_C2T_fr_U]: + mis_lst = d.get(x,[99]) + mis = int(mis_lst[0]) + _list.append(mis) + for d in [FW_C2T_fr_R, RC_C2T_fr_R]: + mis = d.get(x,99) + _list.append(mis) + mini = min(_list) + if _list.count(mini) == 1: + mini_index = _list.index(mini) + if mini_index == 0: + Unique_FW_fr_C2T.add(x) + elif mini_index == 1: + Unique_RC_fr_C2T.add(x) + else : + Multiple_hits.add(x) + else : + Multiple_hits.add(x) + + # write reads rejected by Multiple Hits to file + if show_multiple_hit : + outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] + RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] + + FW_C2T_fr_uniq_lst.sort() + RC_C2T_fr_uniq_lst.sort() + FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] + RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] + #---------------------------------------------------------------- + + numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) + numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T) + + + #---------------------------------------------------------------- + + nn = 0 + for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]: + nn += 1 + mapped_chr0 = "" + for header in ali_unique_lst: + _, mapped_chr, mapped_location_1, 
cigar1, mapped_location_2, cigar2 = ali_dic[header] + + + #------------------------------------- + if mapped_chr != mapped_chr0: + my_gseq = deserialize(db_d(mapped_chr)) + chr_length = len(my_gseq) + mapped_chr0 = mapped_chr + #------------------------------------- + + original_BS_1 = original_bs_reads_1[header] + original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) + + r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) + r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) + + all_mapped += 1 + + if nn == 1: # FW-RC mapped to + strand: + FR = "+FR" +# mapped_location_1 += 1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "+" + +# mapped_location_2 += 1 +# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] +# origin_genome_2 = origin_genome_long_2[2:-2] + mapped_strand_2 = "+" + + elif nn == 2: # FW-RC mapped to - strand: + + FR="-FR" + mapped_location_1 = chr_length - mapped_location_1 - g_len_1 +# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] +# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) +# origin_genome_1 = origin_genome_long_1[2:-2] + mapped_strand_1 = "-" + + mapped_location_2 = chr_length - mapped_location_2 - g_len_2 +# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) +# origin_genome_2 = origin_genome_long_2[2:-2] + mapped_strand_2 = "-" + + + + origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) + origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) + + + r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) + 
r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) + + N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides + N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides + + if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): + + numbers_mapped_lst[nn-1] += 1 + all_mapped_passed += 1 + + #---- unmapped ------------------------- + del original_bs_reads_1[header] + del original_bs_reads_2[header] + + #--------------------------------------- +# output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] +# output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] + methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) + methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) + mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) + mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst) + + # + #---XS FILTER---------------- + #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 + XS_1 = 0 + nCH_1 = methy_1.count('y') + methy_1.count('z') + nmCH_1 = methy_1.count('Y') + methy_1.count('Z') + if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : + XS_1 = 1 + #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 + XS_2 = 0 + nCH_2 = methy_2.count('y') + methy_2.count('z') + nmCH_2 = methy_2.count('Y') + methy_2.count('Z') + if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : + XS_2 = 1 + + + outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, + output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) + outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, + output_genome = output_genome_2, rnext = mapped_chr, pnext = 
mapped_location_1) + + print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) + #---------------------------------------------------------------- + # output unmapped pairs + #---------------------------------------------------------------- + + unmapped_lst=original_bs_reads_1.keys() + unmapped_lst.sort() + +# for u in unmapped_lst: +# outf_u1.write("%s\n"%(original_bs_reads_1[u])) +# outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) + + all_unmapped+=len(unmapped_lst) + + + #================================================================================================== +# outf.close() +# +# outf_u1.close() +# outf_u2.close() + delete_files(tmp_path) + + logm("-------------------------------- " ) + logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) ) + logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n") + + if all_raw_reads >0: + + logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n") + if asktag=="Y": + logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) + logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) + elif asktag=="N": + logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + + + logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) + logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) + if asktag=="Y": + logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) + 
logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) + logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) + logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) + elif asktag=="N": + logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) + logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) + + logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) + logm("O Unmapped read pairs: %d"% all_unmapped+"\n") + + + n_CG=mC_lst[0]+uC_lst[0] + n_CHG=mC_lst[1]+uC_lst[1] + n_CHH=mC_lst[2]+uC_lst[2] + + logm("-------------------------------- " ) + logm("Methylated C in mapped reads " ) + logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) ) + logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) ) + logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) ) + + logm("----------------------------------------------" ) + logm("------------------- END ----------------------" ) + elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))