diff BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc draft default tip
V2.0.5
| author   | weilong-guo                     |
|----------|---------------------------------|
| date     | Tue, 05 Nov 2013 01:55:39 -0500 |
| parents  | e6df770c0e58                    |
| children |                                 |
--- a/BSseeker2/bs_align/bs_rrbs.py	Fri Jul 12 18:47:28 2013 -0400
+++ b/BSseeker2/bs_align/bs_rrbs.py	Tue Nov 05 01:55:39 2013 -0500
@@ -42,24 +42,7 @@
     # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"
     #cut_context = re.sub("-", "", cut_format) # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
-    """
-    :param main_read_file:
-    :param asktag:
-    :param adapter_file:
-    :param cut_s:
-    :param cut_e:
-    :param no_small_lines:
-    :param max_mismatch_no:
-    :param aligner_command:
-    :param db_path:
-    :param tmp_path:
-    :param outfile:
-    :param XS_pct:
-    :param XS_count:
-    :param adapter_mismatch:
-    :param cut_format:
-    """
     cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
     cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
     cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
@@ -106,18 +89,31 @@
         print "[Error] Cannot find adapter file : %s !" % adapter_file
         exit(-1)
-    logm("I Read filename: %s" % main_read_file)
-    logm("I The last cycle (for mapping): %d" % cut_e )
-    logm("I Bowtie path: %s" % aligner_command )
-    logm("I Reference genome library path: %s" % db_path )
-    logm("I Number of mismatches allowed: %s" % max_mismatch_no)
-    logm("I Adapter seq: %s" % whole_adapter_seq)
-    logm("----------------------------------------------")
+    logm("Read filename: %s" % main_read_file)
+    logm("The first base (for mapping): %d"% cut_s )
+    logm("The last base (for mapping): %d"% cut_e )
+
+    logm("Path for short reads aligner: %s"% aligner_command + '\n')
+    logm("Reference genome library path: %s"% db_path )
+
+    if asktag == "Y" :
+        logm("Un-directional library" )
+    else :
+        logm("Directional library")
+
+    logm("Number of mismatches allowed: %s"% max_mismatch_no )
+
+    if adapter_file !="":
+        logm("Adapter seq: %s" % whole_adapter_seq)
+    logm("-------------------------------- " )
     #----------------------------------------------------------------
     all_raw_reads=0
     all_tagged=0
     all_tagged_trimmed=0
+    all_base_before_trim=0
+    all_base_after_trim=0
+    all_base_mapped=0
     all_mapped=0
     all_mapped_passed=0
     n_cut_tag_lst={}
@@ -135,6 +131,9 @@
     num_mapped_FW_G2A = 0
     num_mapped_RC_G2A = 0
+    # Count of nucleotides, which should be cut before the adapters
+    Extra_base_cut_5end_adapter = max([ abs(len(i)-len(j)) for i,j in zip(cut5_context, cut3_context)])
+
     #===============================================
     # directional sequencing
     #===============================================
@@ -156,72 +155,71 @@
         #--- Checking input format ------------------------------------------
         try :
-            read_inf=open(read_file,"r")
+            if read_file.endswith(".gz") : # support input file ending with ".gz"
+                read_inf = gzip.open(read_file, "rb")
+            else :
+                read_inf=open(read_file,"r")
         except IOError:
             print "[Error] Cannot open input file : %s" % read_file
             exit(-1)
-        oneline=read_inf.readline()
-        l=oneline.split()
-        n_fastq=0
-        n_fasta=0
-        input_format=""
-        if oneline[0]=="@": # FastQ
-            input_format="fastq"
-        elif len(l)==1 and oneline[0]!=">": # pure sequences
-            input_format="seq"
-        elif len(l)==11: # Illumina qseq
-            input_format="qseq"
-        elif oneline[0]==">": # fasta
-            input_format="fasta"
+        oneline = read_inf.readline()
+        l = oneline.split()
+        input_format = ""
+        if oneline[0]=="@":
+            input_format = "fastq"
+        elif len(l)==1 and oneline[0]!=">" :
+            input_format = "seq"
+        elif len(l)==11:
+            input_format = "qseq"
+        elif oneline[0]==">" :
+            input_format = "fasta"
         read_inf.close()
+
         #----------------------------------------------------------------
-        seq_id=""
-        seq=""
-        seq_ready=0
-        for line in fileinput.input(read_file):
-            l=line.split()
-
+        read_id = ""
+        seq = ""
+        seq_ready = 0
+        line_no = 0
+        for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
+            l = line.split()
+            line_no += 1
            if input_format=="seq":
-                all_raw_reads+=1
-                seq_id=str(all_raw_reads)
-                seq_id=seq_id.zfill(12)
-                seq=l[0]
-                seq_ready="Y"
+                all_raw_reads += 1
+                read_id = str(all_raw_reads)
+                read_id = read_id.zfill(12)
+                seq = l[0]
+                seq_ready = "Y"
            elif input_format=="fastq":
-                m_fastq=math.fmod(n_fastq,4)
-                n_fastq+=1
-                seq_ready="N"
-                if m_fastq==0:
-                    all_raw_reads+=1
-                    seq_id=str(all_raw_reads)
-                    seq_id=seq_id.zfill(12)
-                    seq=""
-                elif m_fastq==1:
-                    seq=l[0]
-                    seq_ready="Y"
-                else:
-                    seq=""
+                l_fastq = math.fmod(line_no, 4)
+                if l_fastq == 1 :
+                    all_raw_reads += 1
+                    read_id = l[0][1:]
+                    seq_ready = "N"
+                elif l_fastq == 2 :
+                    seq = l[0]
+                    seq_ready = "Y"
+                else :
+                    seq = ""
+                    seq_ready = "N"
            elif input_format=="qseq":
-                all_raw_reads+=1
-                seq_id=str(all_raw_reads)
-                seq_id=seq_id.zfill(12)
-                seq=l[8]
-                seq_ready="Y"
-            elif input_format=="fasta":
-                m_fasta=math.fmod(n_fasta,2)
-                n_fasta+=1
-                seq_ready="N"
-                if m_fasta==0:
-                    all_raw_reads+=1
-                    seq_id=l[0][1:]
-                    seq=""
-                elif m_fasta==1:
-                    seq=l[0]
-                    seq_ready="Y"
-                else:
-                    seq=""
+                all_raw_reads += 1
+                read_id = str(all_raw_reads)
+                read_id = read_id.zfill(12)
+                seq = l[8]
+                seq_ready = "Y"
+            elif input_format=="fasta" :
+                l_fasta = math.fmod(line_no,2)
+                if l_fasta==1:
+                    all_raw_reads += 1
+                    read_id = l[0][1:]
+                    seq = ""
+                    seq_ready = "N"
+                elif l_fasta==0 :
+                    seq = l[0]
+                    seq_ready = "Y"
+
            #---------------------------------------------------------------
            if seq_ready=="Y":
                # Normalize the characters
@@ -236,19 +234,22 @@
                    seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default

                #-- Trimming adapter sequence ---
+
+                all_base_before_trim += len(seq)
                if adapter_seq != "" :
-                    new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+                    new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
                    if len(new_read) < len(seq) :
                        all_tagged_trimmed += 1
                    seq = new_read
+                all_base_after_trim += len(seq)
                if len(seq) <= 4 :
                    seq = "N" * (cut_e - cut_s)

                # all reads will be considered, regardless of tags
                #---------  trimmed_raw_BS_read and qscore ------------------
-                original_bs_reads[seq_id] = seq
+                original_bs_reads[read_id] = seq
                #---------  FW_C2T  ------------------
-                outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+                outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
        fileinput.close()

        outf2.close()
@@ -384,7 +385,9 @@
                    N_mismatch = N_MIS(r_aln, g_aln)
-                    if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+                    mm_no=float(max_mismatch_no)
+                    if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -402,6 +405,7 @@
                                              my_region_serial = my_region_serial,
                                              my_region_start = my_region_start,
                                              my_region_end = my_region_end)
+                        all_base_mapped += len(original_BS)
                    else :
                        print "[For debug]: reads not in same lengths"
@@ -453,7 +457,9 @@
                    N_mismatch = N_MIS(r_aln, g_aln)
-                    if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+                    mm_no=float(max_mismatch_no)
+                    if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -471,6 +477,7 @@
                                              my_region_serial = my_region_serial,
                                              my_region_start = my_region_start,
                                              my_region_end = my_region_end)
+                        all_base_mapped += len(original_BS)
                    else :
                        print "[For debug]: reads not in same lengths"
@@ -507,76 +514,74 @@
        #--- Checking input format ------------------------------------------
        try :
-            read_inf=open(read_file,"r")
+            if read_file.endswith(".gz") : # support input file ending with ".gz"
+                read_inf = gzip.open(read_file, "rb")
+            else :
+                read_inf=open(read_file,"r")
        except IOError:
            print "[Error] Cannot open input file : %s" % read_file
            exit(-1)
-        oneline=read_inf.readline()
-        l=oneline.split()
-        n_fastq=0
-        n_fasta=0
-        input_format=""
-        if oneline[0]=="@": # FastQ
-            input_format="fastq"
-        elif len(l)==1 and oneline[0]!=">": # pure sequences
-            input_format="seq"
-        elif len(l)==11: # Illumina qseq
-            input_format="qseq"
-        elif oneline[0]==">": # fasta
-            input_format="fasta"
+        oneline = read_inf.readline()
+        l = oneline.split()
+        input_format = ""
+        if oneline[0]=="@":
+            input_format = "fastq"
+        elif len(l)==1 and oneline[0]!=">" :
+            input_format = "seq"
+        elif len(l)==11:
+            input_format = "qseq"
+        elif oneline[0]==">" :
+            input_format = "fasta"
        read_inf.close()
        #----------------------------------------------------------------
-        seq_id = ""
+        read_id = ""
        seq = ""
-        seq_ready=0
-        for line in fileinput.input(read_file):
-            l=line.split()
-
-            if input_format == "seq":
-                all_raw_reads+=1
-                seq_id=str(all_raw_reads)
-                seq_id=seq_id.zfill(12)
-                seq=l[0]
-                seq_ready="Y"
+        seq_ready = 0
+        line_no = 0
+        for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
+            l = line.split()
+            line_no += 1
+            if input_format=="seq":
+                all_raw_reads += 1
+                read_id = str(all_raw_reads)
+                read_id = read_id.zfill(12)
+                seq = l[0]
+                seq_ready = "Y"
            elif input_format=="fastq":
-                m_fastq=math.fmod(n_fastq,4)
-                n_fastq+=1
-                seq_ready="N"
-                if m_fastq==0:
-                    all_raw_reads+=1
-                    seq_id=str(all_raw_reads)
-                    seq_id=seq_id.zfill(12)
-                    seq=""
-                elif m_fastq==1:
-                    seq=l[0]
-                    seq_ready="Y"
-                else:
-                    seq=""
+                l_fastq = math.fmod(line_no, 4)
+                if l_fastq == 1 :
+                    all_raw_reads += 1
+                    read_id = l[0][1:]
+                    seq_ready = "N"
+                elif l_fastq == 2 :
+                    seq = l[0]
+                    seq_ready = "Y"
+                else :
+                    seq = ""
+                    seq_ready = "N"
            elif input_format=="qseq":
-                all_raw_reads+=1
-                seq_id=str(all_raw_reads)
-                seq_id=seq_id.zfill(12)
-                seq=l[8]
-                seq_ready="Y"
-            elif input_format=="fasta":
-                m_fasta=math.fmod(n_fasta,2)
-                n_fasta+=1
-                seq_ready="N"
-                if m_fasta==0:
-                    all_raw_reads+=1
-                    seq_id=l[0][1:]
-                    seq=""
-                elif m_fasta==1:
-                    seq=l[0]
-                    seq_ready="Y"
-                else:
-                    seq=""
-            #---------------------------------------------------------------
+                all_raw_reads += 1
+                read_id = str(all_raw_reads)
+                read_id = read_id.zfill(12)
+                seq = l[8]
+                seq_ready = "Y"
+            elif input_format=="fasta" :
+                l_fasta = math.fmod(line_no,2)
+                if l_fasta==1:
+                    all_raw_reads += 1
+                    read_id = l[0][1:]
+                    seq = ""
+                    seq_ready = "N"
+                elif l_fasta==0 :
+                    seq = l[0]
+                    seq_ready = "Y"
+
+            #---------------------------------------------------------------
            if seq_ready=="Y":
                # Normalize the characters
-                seq=seq.upper().replace(".","N")
+                seq = seq.upper().replace(".","N")

                read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
                if len(read_tag) > 0 :
@@ -587,21 +592,23 @@
                    seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default

                #-- Trimming adapter sequence ---
+                all_base_before_trim += len(seq)
                if adapter_seq != "" :
-                    new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+                    new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
                    if len(new_read) < len(seq) :
                        all_tagged_trimmed += 1
                    seq = new_read
+                all_base_after_trim += len(seq)
                if len(seq) <= 4 :
                    seq = "N" * (cut_e - cut_s)

                # all reads will be considered, regardless of tags
                #---------  trimmed_raw_BS_read and qscore ------------------
-                original_bs_reads[seq_id] = seq
+                original_bs_reads[read_id] = seq
                #---------  FW_C2T  ------------------
-                outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+                outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
                #---------  RC_G2A  ------------------
-                outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
+                outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
        fileinput.close()

        outf2.close()
@@ -612,10 +619,10 @@
        # mapping
        #--------------------------------------------------------------------------------
-        WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
-        CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
-        WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
-        CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
+        WC2T = tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+        CC2T = tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+        WG2A = tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
+        CG2A = tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
        run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                             'input_file' : outfile2,
@@ -638,37 +645,37 @@
        # Post processing
        #--------------------------------------------------------------------------------
-        FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
-        RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
+        FW_C2T_U,FW_C2T_R = extract_mapping(WC2T)
+        RC_G2A_U,RC_G2A_R = extract_mapping(CG2A)

-        FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
-        RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
+        FW_G2A_U,FW_G2A_R = extract_mapping(WG2A)
+        RC_C2T_U,RC_C2T_R = extract_mapping(CC2T)

        logm("Extracting alignments is done")

        #----------------------------------------------------------------
        # get unique-hit reads
        #----------------------------------------------------------------
-        Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+        Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

-        Unique_FW_C2T=set() # +
-        Unique_RC_G2A=set() # +
-        Unique_FW_G2A=set() # -
-        Unique_RC_C2T=set() # -
-        Multiple_hits=set()
+        Unique_FW_C2T = set() # +
+        Unique_RC_G2A = set() # +
+        Unique_FW_G2A = set() # -
+        Unique_RC_C2T = set() # -
+        Multiple_hits = set()

-        for x in Union_set:
-            _list=[]
+        for x in Union_set :
+            _list = []
            for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
-                mis_lst=dx.get(x,[99])
-                mis=int(mis_lst[0])
+                mis_lst = dx.get(x,[99])
+                mis = int(mis_lst[0])
                _list.append(mis)
            for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
-                mis=dx.get(x,99)
+                mis = dx.get(x,99)
                _list.append(mis)
-            mini=min(_list)
+            mini = min(_list)
            if _list.count(mini) == 1:
-                mini_index=_list.index(mini)
+                mini_index = _list.index(mini)
                if mini_index == 0:
                    Unique_FW_C2T.add(x)
                elif mini_index == 1:
@@ -683,7 +690,7 @@
                Multiple_hits.add(x)
        # write reads rejected by Multiple Hits to file
        if show_multiple_hit :
-            outf_MH=open("Multiple_hit.fa",'w')
+            outf_MH = open("Multiple_hit.fa",'w')
            for i in Multiple_hits :
                outf_MH.write(">%s\n" % i)
                outf_MH.write("%s\n" % original_bs_reads[i])
@@ -695,18 +702,18 @@
        del RC_C2T_R
        del RC_G2A_R

-        FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
-        FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
-        RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
-        RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
+        FW_C2T_uniq_lst = [[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+        FW_G2A_uniq_lst = [[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
+        RC_C2T_uniq_lst = [[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+        RC_G2A_uniq_lst = [[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
        FW_C2T_uniq_lst.sort()
        RC_C2T_uniq_lst.sort()
        FW_G2A_uniq_lst.sort()
        RC_G2A_uniq_lst.sort()
-        FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
-        RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
-        FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
-        RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
+        FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst]
+        RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst]
+        FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst]
+        RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst]

        del Unique_FW_C2T
        del Unique_FW_G2A
@@ -716,7 +723,7 @@
        #----------------------------------------------------------------
        # Post-filtering reads

-        # ---- FW_C2T ---- undirectional
+        # ---- FW_C2T ---- un-directional
        FW_regions = dict()
        gseq = dict()
        chr_length = dict()
@@ -758,7 +765,9 @@
                    try_count += 1

                N_mismatch = N_MIS(r_aln, g_aln)
-                if N_mismatch <= int(max_mismatch_no) :
+#                if N_mismatch <= int(max_mismatch_no) :
+                mm_no=float(max_mismatch_no)
+                if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -776,11 +785,12 @@
                                          my_region_serial = my_region_serial,
                                          my_region_start = my_region_start,
                                          my_region_end = my_region_end)
+                    all_base_mapped += len(original_BS)
                else :
                    print "[For debug]: reads not in same lengths"

-        # ---- RC_C2T ---- undirectional
+        # ---- RC_C2T ---- un-directional
        RC_regions = dict()
        for header in RC_C2T_uniq_lst :
            _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
@@ -821,7 +831,9 @@
                    try_count += 1

                N_mismatch = N_MIS(r_aln, g_aln)
-                if N_mismatch <= int(max_mismatch_no) :
+#                if N_mismatch <= int(max_mismatch_no) :
+                mm_no=float(max_mismatch_no)
+                if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -839,12 +851,13 @@
                                          my_region_serial = my_region_serial,
                                          my_region_start = my_region_start,
                                          my_region_end = my_region_end)
+                    all_base_mapped += len(original_BS)
                else :
                    print "[For debug]: reads not in same lengths"


-        # ---- FW_G2A ---- undirectional
+        # ---- FW_G2A ---- un-directional
        FW_regions = dict()
        gseq = dict()
        chr_length = dict()
@@ -891,9 +904,10 @@
#                    print "[For debug]: FW_G2A read still can not find fragment serial"
                # Tip: sometimes "my_region_serial" is still 0 ...

-                N_mismatch = N_MIS(r_aln, g_aln)
-                if N_mismatch <= int(max_mismatch_no) :
+#                if N_mismatch <= int(max_mismatch_no) :
+                max_mismatch_no=float(max_mismatch_no)
+                if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -911,11 +925,12 @@
                                          my_region_serial = my_region_serial,
                                          my_region_start = my_region_start,
                                          my_region_end = my_region_end)
+                    all_base_mapped += len(original_BS)
                else :
                    print "[For debug]: reads not in same lengths"


-        # ---- RC_G2A ---- undirectional
+        # ---- RC_G2A ---- un-directional
        RC_regions = dict()
        for header in RC_G2A_uniq_lst :
            _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
@@ -961,9 +976,10 @@
#                    print "[For debug]: chr=", mapped_chr
#                    print "[For debug]: RC_C2A read still cannot find fragment serial"

-                N_mismatch = N_MIS(r_aln, g_aln)
-                if N_mismatch <= int(max_mismatch_no) :
+#                if N_mismatch <= int(max_mismatch_no) :
+                mm_no=float(max_mismatch_no)
+                if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
@@ -981,37 +997,38 @@
                                          my_region_serial = my_region_serial,
                                          my_region_start = my_region_start,
                                          my_region_end = my_region_end)
+                    all_base_mapped += len(original_BS)
                else :
                    print "[For debug]: reads not in same lengths"

-
-
        # Finished both FW and RC
        logm("Done: %s (%d) \n" % (read_file, no_my_files))
        print "--> %s (%d) "%(read_file, no_my_files)
        del original_bs_reads
        delete_files(WC2T, CC2T, WG2A, CG2A)
-
-
-
    # End of un-directional library

    delete_files(tmp_path)

-    logm("O Number of raw reads: %d "% all_raw_reads)
-    if all_raw_reads >0:
-        logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
+    logm("Number of raw reads: %d "% all_raw_reads)
+    if all_raw_reads>0:
+        logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
        for kk in range(len(n_cut_tag_lst)):
-            logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
-        logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
-        logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
-        logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)
+            logm("Number of raw reads with tag %s: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
+        if adapter_seq!="" :
+            logm("Number of reads having adapter removed: %d "%all_tagged_trimmed)
+        logm("Number of bases in total: %d "%all_base_before_trim)
+        if adapter_seq!="" :
+            logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
+        logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("Number of unique-hits reads (before post-filtering): %d"%all_mapped)

-        logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
-        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
+        logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
+        logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
+        logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )

-        if asktag=="Y": # undiretional
+        if asktag=="Y": # un-diretional
            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
            logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
@@ -1021,16 +1038,17 @@
        elif asktag=="N": # directional
            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
+            logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )

-        n_CG=mC_lst[0]+uC_lst[0]
-        n_CHG=mC_lst[1]+uC_lst[1]
-        n_CHH=mC_lst[2]+uC_lst[2]
+        n_CG = mC_lst[0] + uC_lst[0]
+        n_CHG = mC_lst[1] + uC_lst[1]
+        n_CHH = mC_lst[2] + uC_lst[2]

        logm("----------------------------------------------")
-        logm("M Methylated C in mapped reads ")
-        logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
-        logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
-        logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
+        logm("Methylated C in mapped reads ")
+        logm(" mCG %1.3f%%"%( (100 * float(mC_lst[0]) / n_CG) if n_CG!=0 else 0))
+        logm(" mCHG %1.3f%%"%( (100 * float(mC_lst[1]) / n_CHG) if n_CHG!=0 else 0))
+        logm(" mCHH %1.3f%%"%( (100 * float(mC_lst[2]) / n_CHH) if n_CHH!=0 else 0))
        logm("----------------------------------------------")
        logm("------------------- END ----------------------")