Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_align/bs_rrbs.py @ 0:e6df770c0e58 draft
Initial upload
author: weilong-guo
date: Fri, 12 Jul 2013 18:47:28 -0400
parents: (none)
children: 8b26adf64adc
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_rrbs.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,1041 @@ +import fileinput, random, math, os.path +from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS +from bs_utils.utils import * + +from bs_align.bs_single_end import extract_mapping +from bs_align_utils import * + +def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence + #print len(chr_regions) + out_serial = 0 + out_start = -1 + out_end = -1 + #print "mapped_location:", mapped_location + if FR == "+FW" or FR == "-RC": + my_location = str(mapped_location) + if my_location in chr_regions: + my_lst = chr_regions[my_location] + out_start = int(my_location) + out_end = my_lst[0] + out_serial = my_lst[1] + #else : + # print "[For debug]: +FW location %s cannot be found" % my_location + elif FR == "-FW" or FR == "+RC": + my_location = str(mapped_location) + if my_location in chr_regions: + my_lst = chr_regions[my_location] + out_end = int(my_location) + out_start = my_lst[0] + out_serial = my_lst[1] + #else : + # print "[For debug]: -FW location %s cannot be found" % my_location + + return out_serial, out_start, out_end + + +#---------------------------------------------------------------- + +def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no, + aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG", + show_multiple_hit=False): + #---------------------------------------------------------------- + # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC" + #cut_context = re.sub("-", "", cut_format) + # Ex. 
cut_format="C-CGG,AT-CG,G-CWGC" + """ + + :param main_read_file: + :param asktag: + :param adapter_file: + :param cut_s: + :param cut_e: + :param no_small_lines: + :param max_mismatch_no: + :param aligner_command: + :param db_path: + :param tmp_path: + :param outfile: + :param XS_pct: + :param XS_count: + :param adapter_mismatch: + :param cut_format: + """ + cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC'] + cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC'] + cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G'] + cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC'] + cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5] + min_cut5_len = min([len(i) for i in cut5_context]) + #print cut_format_lst + #print cut_format + #print cut5_context + + cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT'] + cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst] + cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst] + check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ] + + #print "=======" + #print cut_tag_lst + #print cut3_tag_lst + #print cut5_tag_lst + #print check_pattern + + # set region[gx,gy] for checking_genome_context + gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG] + gy = [ 3+len(i) for i in cut3_tag_lst ] + + + #---------------------------------------------------------------- + + # helper method to join fname with tmp_path + tmp_d = lambda fname: os.path.join(tmp_path, fname) + db_d = lambda fname: os.path.join(db_path, fname) + + MAX_TRY = 500 # For finding the serial_no + whole_adapter_seq = "" + 
#---------------------------------------------------------------- + adapter_seq="" + if adapter_file: + try : + adapter_inf = open(adapter_file,"r") + whole_adapter_seq = adapter_inf.readline().strip() + adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter + adapter_inf.close() + except IOError: + print "[Error] Cannot find adapter file : %s !" % adapter_file + exit(-1) + + logm("I Read filename: %s" % main_read_file) + logm("I The last cycle (for mapping): %d" % cut_e ) + logm("I Bowtie path: %s" % aligner_command ) + logm("I Reference genome library path: %s" % db_path ) + logm("I Number of mismatches allowed: %s" % max_mismatch_no) + logm("I Adapter seq: %s" % whole_adapter_seq) + logm("----------------------------------------------") + + #---------------------------------------------------------------- + all_raw_reads=0 + all_tagged=0 + all_tagged_trimmed=0 + all_mapped=0 + all_mapped_passed=0 + n_cut_tag_lst={} + #print cut3_tag_lst + for x in cut3_tag_lst: + n_cut_tag_lst[x]=0 + + mC_lst=[0,0,0] + uC_lst=[0,0,0] + + no_my_files=0 + + num_mapped_FW_C2T = 0 + num_mapped_RC_C2T = 0 + num_mapped_FW_G2A = 0 + num_mapped_RC_G2A = 0 + + #=============================================== + # directional sequencing + #=============================================== + + if asktag=="N" : + #---------------------------------------------------------------- + logm("== Start mapping ==") + + input_fname = os.path.split(main_read_file)[1] + for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): + + logm("Processing read file: %s" % read_file) + original_bs_reads = {} + no_my_files+=1 + random_id = ".tmp-"+str(random.randint(1000000,9999999)) + outfile2=tmp_d('Trimmed_C2T.fa'+random_id) + + outf2=open(outfile2,'w') + + #--- Checking input format ------------------------------------------ + try : + read_inf=open(read_file,"r") + except IOError: + print "[Error] Cannot open input file : %s" % read_file + exit(-1) + + 
oneline=read_inf.readline() + l=oneline.split() + n_fastq=0 + n_fasta=0 + input_format="" + if oneline[0]=="@": # FastQ + input_format="fastq" + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="seq" + elif len(l)==11: # Illumina qseq + input_format="qseq" + elif oneline[0]==">": # fasta + input_format="fasta" + read_inf.close() + + #---------------------------------------------------------------- + seq_id="" + seq="" + seq_ready=0 + for line in fileinput.input(read_file): + l=line.split() + + if input_format=="seq": + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="fastq": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if m_fastq==0: + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif input_format=="qseq": + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + all_raw_reads+=1 + seq_id=l[0][1:] + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + #--------------------------------------------------------------- + if seq_ready=="Y": + # Normalize the characters + seq=seq.upper().replace(".","N") + + read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] + if len(read_tag) > 0 : + all_tagged += 1 + for i in read_tag : + n_cut_tag_lst[i] += 1 + + seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default + + #-- Trimming adapter sequence --- + if adapter_seq != "" : + new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) + if len(new_read) < len(seq) : + all_tagged_trimmed += 1 + seq = new_read + if len(seq) <= 4 : + seq = "N" * (cut_e - cut_s) + + # all reads will be considered, regardless of tags + #--------- trimmed_raw_BS_read and qscore 
------------------ + original_bs_reads[seq_id] = seq + #--------- FW_C2T ------------------ + outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) + fileinput.close() + + outf2.close() + + delete_files(read_file) + logm("Processing input is done") + #-------------------------------------------------------------------------------- + + # mapping + #-------------------------------------------------------------------------------- + WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) + CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) + + run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file' : outfile2, + 'output_file' : WC2T}, + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file' : outfile2, + 'output_file' : CC2T} ]) + + logm("Aligning reads is done") + + delete_files(outfile2) + + #-------------------------------------------------------------------------------- + # Post processing + #-------------------------------------------------------------------------------- + + FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) + RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) + logm("Extracting alignments is done") + + #---------------------------------------------------------------- + # get uniq-hit reads + #---------------------------------------------------------------- + Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys()) + + Unique_FW_C2T=set() # + + Unique_RC_C2T=set() # - + Multiple_hits=set() + + + for x in Union_set: + _list=[] + for dx in [FW_C2T_U, RC_C2T_U]: + mis_lst=dx.get(x,[99]) + mis=int(mis_lst[0]) + _list.append(mis) + for dx in [FW_C2T_R, RC_C2T_R]: + mis=dx.get(x,99) + _list.append(mis) + mini=min(_list) + if _list.count(mini)==1: + mini_index=_list.index(mini) + if mini_index==0: + Unique_FW_C2T.add(x) + elif mini_index==1: + Unique_RC_C2T.add(x) + else : + Multiple_hits.add(x) + else : + Multiple_hits.add(x) + # write reads rejected by Multiple Hits to file + 
if show_multiple_hit : + outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + del Union_set + del FW_C2T_R + del RC_C2T_R + + FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] + RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] + FW_uniq_lst.sort() + RC_uniq_lst.sort() + FW_uniq_lst=[x[1] for x in FW_uniq_lst] + RC_uniq_lst=[x[1] for x in RC_uniq_lst] + + del Unique_FW_C2T + del Unique_RC_C2T + + #---------------------------------------------------------------- + # Post-filtering reads + + # ---- FW ---- + FW_regions = dict() + gseq = dict() + chr_length = dict() + for header in FW_uniq_lst : + _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in FW_regions : + FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + all_mapped+=1 + FR = "+FW" + mapped_strand = "+" + origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + # print "[For debug]: FW read 
has no tags" + try_count = 0 + try_pos = mapped_location - min_cut5_len + 1 + while my_region_serial == 0 and try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + try_pos -= 1 + try_count += 1 + + #if my_region_serial == 0 : + # print "[For debug]: chr=", mapped_chr + # print "[For debug]: +FW read still can not find fragment serial" + # Tip: sometimes "my_region_serial" is still 0 ... + + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_FW_C2T += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + else : + print "[For debug]: reads not in same lengths" + + #print "start RC" + # ---- RC ---- + RC_regions = dict() + for header in RC_uniq_lst : + _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in RC_regions : + RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + all_mapped+=1 + FR = "-FW" + mapped_strand = "-" + origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + 
#checking_genome_context = (output_genome[gx:gy] == check_pattern) + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : # and checking_genome_context: + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + try_pos , FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + #print "[For debug]: RC Read has no tags" + try_count = 0 + try_pos = mapped_location + g_len + min_cut5_len - 2 + while my_region_serial == 0 and try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + try_pos, FR) + try_pos += 1 + try_count += 1 + + #if my_region_serial == 0 : + # print "[For debug]: chr=", mapped_chr + # print "[For debug]: -FW read still cannot find fragment serial" + + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_RC_C2T += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + else : + print "[For debug]: reads not in same lengths" + + + # Finished both FW and RC + 
logm("Done: %s (%d) \n" % (read_file, no_my_files)) + print "--> %s (%d) "%(read_file, no_my_files) + del original_bs_reads + delete_files(WC2T, CC2T) + + # End of directional library + + + # ==================================================== + # un-directional library + # ==================================================== + + elif asktag=="Y" : + #---------------------------------------------------------------- + logm("== Start mapping ==") + + input_fname = os.path.split(main_read_file)[1] + for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): + + logm("Processing read file: %s" % read_file) + original_bs_reads = {} + no_my_files+=1 + random_id = ".tmp-"+str(random.randint(1000000,9999999)) + outfile2=tmp_d('Trimmed_C2T.fa'+random_id) + outfile3=tmp_d('Trimmed_G2A.fa'+random_id) + + outf2=open(outfile2,'w') + outf3=open(outfile3,'w') + + #--- Checking input format ------------------------------------------ + try : + read_inf=open(read_file,"r") + except IOError: + print "[Error] Cannot open input file : %s" % read_file + exit(-1) + + oneline=read_inf.readline() + l=oneline.split() + n_fastq=0 + n_fasta=0 + input_format="" + if oneline[0]=="@": # FastQ + input_format="fastq" + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="seq" + elif len(l)==11: # Illumina qseq + input_format="qseq" + elif oneline[0]==">": # fasta + input_format="fasta" + read_inf.close() + + #---------------------------------------------------------------- + seq_id = "" + seq = "" + seq_ready=0 + for line in fileinput.input(read_file): + l=line.split() + + if input_format == "seq": + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="fastq": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if m_fastq==0: + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif 
input_format=="qseq": + all_raw_reads+=1 + seq_id=str(all_raw_reads) + seq_id=seq_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + all_raw_reads+=1 + seq_id=l[0][1:] + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + #--------------------------------------------------------------- + if seq_ready=="Y": + # Normalize the characters + seq=seq.upper().replace(".","N") + + read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] + if len(read_tag) > 0 : + all_tagged += 1 + for i in read_tag : + n_cut_tag_lst[i] += 1 + + seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default + + #-- Trimming adapter sequence --- + if adapter_seq != "" : + new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) + if len(new_read) < len(seq) : + all_tagged_trimmed += 1 + seq = new_read + if len(seq) <= 4 : + seq = "N" * (cut_e - cut_s) + + # all reads will be considered, regardless of tags + #--------- trimmed_raw_BS_read and qscore ------------------ + original_bs_reads[seq_id] = seq + #--------- FW_C2T ------------------ + outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) + #--------- RC_G2A ------------------ + outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) + fileinput.close() + + outf2.close() + + delete_files(read_file) + logm("Processing input is done") + #-------------------------------------------------------------------------------- + + # mapping + #-------------------------------------------------------------------------------- + WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) + CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) + WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) + CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) + + run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file' : outfile2, + 
'output_file' : WC2T}, + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file' : outfile2, + 'output_file' : CC2T}, + aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'), + 'input_file' : outfile3, + 'output_file' : WG2A}, + aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'), + 'input_file' : outfile3, + 'output_file' : CG2A} ]) + + logm("Aligning reads is done") + + delete_files(outfile2) + + #-------------------------------------------------------------------------------- + # Post processing + #-------------------------------------------------------------------------------- + + FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) + RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) + + FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) + RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) + + logm("Extracting alignments is done") + + #---------------------------------------------------------------- + # get unique-hit reads + #---------------------------------------------------------------- + Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) + + Unique_FW_C2T=set() # + + Unique_RC_G2A=set() # + + Unique_FW_G2A=set() # - + Unique_RC_C2T=set() # - + Multiple_hits=set() + + for x in Union_set: + _list=[] + for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: + mis_lst=dx.get(x,[99]) + mis=int(mis_lst[0]) + _list.append(mis) + for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: + mis=dx.get(x,99) + _list.append(mis) + mini=min(_list) + if _list.count(mini) == 1: + mini_index=_list.index(mini) + if mini_index == 0: + Unique_FW_C2T.add(x) + elif mini_index == 1: + Unique_RC_G2A.add(x) + elif mini_index == 2: + Unique_FW_G2A.add(x) + elif mini_index == 3: + Unique_RC_C2T.add(x) + else : + Multiple_hits.add(x) + else : + Multiple_hits.add(x) + # write reads rejected by Multiple Hits to file + if show_multiple_hit : + outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + 
outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + del Union_set + del FW_C2T_R + del FW_G2A_R + del RC_C2T_R + del RC_G2A_R + + FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] + FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] + RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] + RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] + FW_C2T_uniq_lst.sort() + RC_C2T_uniq_lst.sort() + FW_G2A_uniq_lst.sort() + RC_G2A_uniq_lst.sort() + FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] + RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] + FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] + RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] + + del Unique_FW_C2T + del Unique_FW_G2A + del Unique_RC_C2T + del Unique_RC_G2A + + + #---------------------------------------------------------------- + # Post-filtering reads + # ---- FW_C2T ---- undirectional + FW_regions = dict() + gseq = dict() + chr_length = dict() + for header in FW_C2T_uniq_lst : + _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in FW_regions : + FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + all_mapped+=1 + FR = "+FW" + mapped_strand = "+" + origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location - len(i) for i,j in 
zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + # print "[For debug]: FW read has no tags" + try_count = 0 + try_pos = mapped_location - min_cut5_len + 1 + while my_region_serial == 0 and try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + try_pos -= 1 + try_count += 1 + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_FW_C2T += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + else : + print "[For debug]: reads not in same lengths" + + + # ---- RC_C2T ---- undirectional + RC_regions = dict() + for header in RC_C2T_uniq_lst : + _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in RC_regions : + RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + all_mapped+=1 + FR = "-FW" + mapped_strand = "-" + origin_genome, 
next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : # and checking_genome_context: + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + try_pos , FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + #print "[For debug]: RC Read has no tags" + try_count = 0 + try_pos = mapped_location + g_len + min_cut5_len - 2 + while my_region_serial == 0 and try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + try_pos, FR) + try_pos += 1 + try_count += 1 + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_RC_C2T += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + + else : + print "[For debug]: reads not in same lengths" + + + # ---- FW_G2A ---- undirectional + FW_regions = dict() + gseq = dict() + chr_length = dict() + for header in 
FW_G2A_uniq_lst : + _, mapped_chr, mapped_location, cigar = FW_G2A_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in FW_regions : + FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + cigar = list(reversed(cigar)) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + all_mapped+=1 + FR = "-RC" + mapped_strand = "-" + origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + original_BS = reverse_compl_seq(original_BS) # for RC reads + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + #print "[For debug]: FW read has no tags" + try_count = 0 + try_pos = mapped_location - min_cut5_len + 1 + while my_region_serial == 0 and try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], + try_pos, FR) + try_pos += 1 + try_count += 1 + #if my_region_serial == 0 : + # print "[For debug]: chr=", mapped_chr + # print "[For debug]: FW_G2A read still can not find fragment serial" + # Tip: sometimes "my_region_serial" is still 0 ... 
+ + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_FW_G2A += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + else : + print "[For debug]: reads not in same lengths" + + + # ---- RC_G2A ---- undirectional + RC_regions = dict() + for header in RC_G2A_uniq_lst : + _, mapped_chr, mapped_location, cigar = RC_G2A_U[header] + original_BS = original_bs_reads[header] + if mapped_chr not in RC_regions : + RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr))) + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + cigar = list(reversed(cigar)) + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + all_mapped+=1 + FR = "+RC" + mapped_strand = "+" + origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], + mapped_location, + mapped_location + g_len, + mapped_strand) + original_BS = reverse_compl_seq(original_BS) # for RC reads + checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln) : # and checking_genome_context: + my_region_serial, my_region_start, my_region_end = [-1, 0, 0] + if True in checking_genome_context : + try_pos = [mapped_location + g_len - 1 + 
len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + mapped_location + g_len + min_cut5_len -1, FR) + if my_region_serial == 0 : # still be 0 + # for some cases, read has no tags; searching the upstream sequence for tags + #print "[For debug]: RC Read has no tags" + try_count = 0 + try_pos = mapped_location + g_len + min_cut5_len - 2 + while try_count < MAX_TRY : + my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], + try_pos, FR) + try_pos += 1 + try_count += 1 + + #if my_region_serial == 0 : + # print "[For debug]: chr=", mapped_chr + # print "[For debug]: RC_C2A read still cannot find fragment serial" + + + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no) : + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next2bp) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + num_mapped_RC_G2A += 1 + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, + mapped_location, cigar, original_BS, methy, XS, + output_genome = output_genome, + rrbs = True, + my_region_serial = my_region_serial, + my_region_start = my_region_start, + my_region_end = my_region_end) + else : + print "[For debug]: reads not in same lengths" + + + + # Finished both FW and RC + logm("Done: %s (%d) \n" % (read_file, no_my_files)) + print "--> %s (%d) "%(read_file, no_my_files) + del original_bs_reads + delete_files(WC2T, CC2T, WG2A, CG2A) + + + + # End of un-directional library + + delete_files(tmp_path) + + + logm("O Number of raw reads: %d "% all_raw_reads) + if all_raw_reads >0: + logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads)) + for kk in 
range(len(n_cut_tag_lst)): + logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads)) + logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed) + logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) + logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped) + + logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no)) + logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads)) + + if asktag=="Y": # undiretional + logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) + logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) ) + logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) + logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) ) + # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration + # according to literal meaning + elif asktag=="N": # directional + logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) + logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) + + n_CG=mC_lst[0]+uC_lst[0] + n_CHG=mC_lst[1]+uC_lst[1] + n_CHH=mC_lst[2]+uC_lst[2] + + logm("----------------------------------------------") + logm("M Methylated C in mapped reads ") + logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0)) + logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0)) + logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0)) + logm("----------------------------------------------") + logm("------------------- END ----------------------") + + elapsed(main_read_file) + + close_log() + +