bs_seeker2: BSseeker2/bs_align/bs

comparison BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc draft default tip

V2.0.5

author	weilong-guo
date	Tue, 05 Nov 2013 01:55:39 -0500
parents	e6df770c0e58
children

comparison

equal deleted inserted replaced

-:e6df770c0e58
+:8b26adf64adc
 show_multiple_hit=False):
 #----------------------------------------------------------------
 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"
 #cut_context = re.sub("-", "", cut_format)
 # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
-"""
-:param main_read_file:
-:param asktag:
-:param adapter_file:
-:param cut_s:
-:param cut_e:
-:param no_small_lines:
-:param max_mismatch_no:
-:param aligner_command:
-:param db_path:
-:param tmp_path:
-:param outfile:
-:param XS_pct:
-:param XS_count:
-:param adapter_mismatch:
-:param cut_format:
-"""
 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
 adapter_inf.close()
 except IOError:
 print "[Error] Cannot find adapter file : %s !" % adapter_file
 exit(-1)
-logm("I Read filename: %s" % main_read_file)
+logm("Read filename: %s" % main_read_file)
-logm("I The last cycle (for mapping): %d" % cut_e )
+logm("The first base (for mapping): %d"% cut_s  )
-logm("I Bowtie path: %s" % aligner_command )
+logm("The  last base (for mapping): %d"% cut_e  )
-logm("I Reference genome library path: %s" % db_path )
-logm("I Number of mismatches allowed: %s" % max_mismatch_no)
+logm("Path for short reads aligner: %s"% aligner_command + '\n')
-logm("I Adapter seq: %s" % whole_adapter_seq)
+logm("Reference genome library path: %s"% db_path  )
-logm("----------------------------------------------")
+if asktag == "Y" :
+logm("Un-directional library" )
+else :
+logm("Directional library")
+logm("Number of mismatches allowed: %s"% max_mismatch_no  )
+if adapter_file !="":
+logm("Adapter seq: %s" % whole_adapter_seq)
+logm("-------------------------------- " )
 #----------------------------------------------------------------
 all_raw_reads=0
 all_tagged=0
 all_tagged_trimmed=0
+all_base_before_trim=0
+all_base_after_trim=0
+all_base_mapped=0
 all_mapped=0
 all_mapped_passed=0
 n_cut_tag_lst={}
 #print cut3_tag_lst
 for x in cut3_tag_lst:
 num_mapped_FW_C2T = 0
 num_mapped_RC_C2T = 0
 num_mapped_FW_G2A = 0
 num_mapped_RC_G2A = 0
+# Count of nucleotides, which should be cut before the adapters
+Extra_base_cut_5end_adapter = max([ abs(len(i)-len(j)) for i,j in zip(cut5_context, cut3_context)])
 #===============================================
 # directional sequencing
 #===============================================
 if asktag=="N" :
 outf2=open(outfile2,'w')
 #--- Checking input format ------------------------------------------
 try :
-read_inf=open(read_file,"r")
+if read_file.endswith(".gz") : # support input file ending with ".gz"
+read_inf = gzip.open(read_file, "rb")
+else :
+read_inf=open(read_file,"r")
 except IOError:
 print "[Error] Cannot open input file : %s" % read_file
 exit(-1)
-oneline=read_inf.readline()
+oneline = read_inf.readline()
-l=oneline.split()
+l = oneline.split()
-n_fastq=0
+input_format = ""
-n_fasta=0
+if oneline[0]=="@":
-input_format=""
+input_format = "fastq"
-if oneline[0]=="@":	# FastQ
+elif len(l)==1 and oneline[0]!=">" :
-input_format="fastq"
+input_format = "seq"
-elif len(l)==1 and oneline[0]!=">": # pure sequences
+elif len(l)==11:
-input_format="seq"
+input_format = "qseq"
-elif len(l)==11: # Illumina qseq
+elif oneline[0]==">" :
-input_format="qseq"
+input_format = "fasta"
-elif oneline[0]==">": # fasta
-input_format="fasta"
 read_inf.close()
 #----------------------------------------------------------------
-seq_id=""
+read_id = ""
-seq=""
+seq = ""
-seq_ready=0
+seq_ready = 0
-for line in fileinput.input(read_file):
+line_no = 0
-l=line.split()
+for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
+l = line.split()
+line_no += 1
 if input_format=="seq":
-all_raw_reads+=1
+all_raw_reads += 1
-seq_id=str(all_raw_reads)
+read_id = str(all_raw_reads)
-seq_id=seq_id.zfill(12)
+read_id = read_id.zfill(12)
-seq=l[0]
+seq = l[0]
-seq_ready="Y"
+seq_ready = "Y"
 elif input_format=="fastq":
-m_fastq=math.fmod(n_fastq,4)
+l_fastq = math.fmod(line_no, 4)
-n_fastq+=1
+if l_fastq == 1 :
-seq_ready="N"
+all_raw_reads += 1
-if m_fastq==0:
+read_id = l[0][1:]
-all_raw_reads+=1
+seq_ready = "N"
-seq_id=str(all_raw_reads)
+elif l_fastq == 2 :
-seq_id=seq_id.zfill(12)
+seq = l[0]
-seq=""
+seq_ready = "Y"
-elif m_fastq==1:
+else :
-seq=l[0]
+seq = ""
-seq_ready="Y"
+seq_ready = "N"
-else:
-seq=""
 elif input_format=="qseq":
-all_raw_reads+=1
+all_raw_reads += 1
-seq_id=str(all_raw_reads)
+read_id = str(all_raw_reads)
-seq_id=seq_id.zfill(12)
+read_id = read_id.zfill(12)
-seq=l[8]
+seq = l[8]
-seq_ready="Y"
+seq_ready = "Y"
-elif input_format=="fasta":
+elif input_format=="fasta" :
-m_fasta=math.fmod(n_fasta,2)
+l_fasta = math.fmod(line_no,2)
-n_fasta+=1
+if l_fasta==1:
-seq_ready="N"
+all_raw_reads += 1
-if m_fasta==0:
+read_id = l[0][1:]
-all_raw_reads+=1
+seq = ""
-seq_id=l[0][1:]
+seq_ready = "N"
-seq=""
+elif l_fasta==0 :
-elif m_fasta==1:
+seq = l[0]
-seq=l[0]
+seq_ready = "Y"
-seq_ready="Y"
-else:
-seq=""
 #---------------------------------------------------------------
 if seq_ready=="Y":
 # Normalize the characters
 seq=seq.upper().replace(".","N")
 n_cut_tag_lst[i] += 1
 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
 #-- Trimming adapter sequence ---
+all_base_before_trim += len(seq)
 if adapter_seq != "" :
-new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
 if len(new_read) < len(seq) :
 all_tagged_trimmed += 1
 seq = new_read
+all_base_after_trim += len(seq)
 if len(seq) <= 4 :
 seq = "N" * (cut_e - cut_s)
 # all reads will be considered, regardless of tags
 #---------  trimmed_raw_BS_read and qscore ------------------
-original_bs_reads[seq_id] = seq
+original_bs_reads[read_id] = seq
 #---------  FW_C2T  ------------------
-outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
 fileinput.close()
 outf2.close()
 delete_files(read_file)
 #    print "[For debug]: +FW read still can not find fragment serial"
 # Tip: sometimes "my_region_serial" is still 0 ...
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+mm_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
 #print "start RC"
 # ---- RC ----
 #    print "[For debug]: chr=", mapped_chr
 #    print "[For debug]: -FW read still cannot find fragment serial"
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+mm_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
 # Finished both FW and RC
 outf2=open(outfile2,'w')
 outf3=open(outfile3,'w')
 #--- Checking input format ------------------------------------------
 try :
-read_inf=open(read_file,"r")
+if read_file.endswith(".gz") : # support input file ending with ".gz"
+read_inf = gzip.open(read_file, "rb")
+else :
+read_inf=open(read_file,"r")
 except IOError:
 print "[Error] Cannot open input file : %s" % read_file
 exit(-1)
-oneline=read_inf.readline()
+oneline = read_inf.readline()
-l=oneline.split()
+l = oneline.split()
-n_fastq=0
+input_format = ""
-n_fasta=0
+if oneline[0]=="@":
-input_format=""
+input_format = "fastq"
-if oneline[0]=="@":	# FastQ
+elif len(l)==1 and oneline[0]!=">" :
-input_format="fastq"
+input_format = "seq"
-elif len(l)==1 and oneline[0]!=">": # pure sequences
+elif len(l)==11:
-input_format="seq"
+input_format = "qseq"
-elif len(l)==11: # Illumina qseq
+elif oneline[0]==">" :
-input_format="qseq"
+input_format = "fasta"
-elif oneline[0]==">": # fasta
-input_format="fasta"
 read_inf.close()
 #----------------------------------------------------------------
-seq_id = ""
+read_id = ""
 seq = ""
-seq_ready=0
+seq_ready = 0
-for line in fileinput.input(read_file):
+line_no = 0
-l=line.split()
+for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
+l = line.split()
-if input_format == "seq":
+line_no += 1
-all_raw_reads+=1
+if input_format=="seq":
-seq_id=str(all_raw_reads)
+all_raw_reads += 1
-seq_id=seq_id.zfill(12)
+read_id = str(all_raw_reads)
-seq=l[0]
+read_id = read_id.zfill(12)
-seq_ready="Y"
+seq = l[0]
+seq_ready = "Y"
 elif input_format=="fastq":
-m_fastq=math.fmod(n_fastq,4)
+l_fastq = math.fmod(line_no, 4)
-n_fastq+=1
+if l_fastq == 1 :
-seq_ready="N"
+all_raw_reads += 1
-if m_fastq==0:
+read_id = l[0][1:]
-all_raw_reads+=1
+seq_ready = "N"
-seq_id=str(all_raw_reads)
+elif l_fastq == 2 :
-seq_id=seq_id.zfill(12)
+seq = l[0]
-seq=""
+seq_ready = "Y"
-elif m_fastq==1:
+else :
-seq=l[0]
+seq = ""
-seq_ready="Y"
+seq_ready = "N"
-else:
-seq=""
 elif input_format=="qseq":
-all_raw_reads+=1
+all_raw_reads += 1
-seq_id=str(all_raw_reads)
+read_id = str(all_raw_reads)
-seq_id=seq_id.zfill(12)
+read_id = read_id.zfill(12)
-seq=l[8]
+seq = l[8]
-seq_ready="Y"
+seq_ready = "Y"
-elif input_format=="fasta":
+elif input_format=="fasta" :
-m_fasta=math.fmod(n_fasta,2)
+l_fasta = math.fmod(line_no,2)
-n_fasta+=1
+if l_fasta==1:
-seq_ready="N"
+all_raw_reads += 1
-if m_fasta==0:
+read_id = l[0][1:]
-all_raw_reads+=1
+seq = ""
-seq_id=l[0][1:]
+seq_ready = "N"
-seq=""
+elif l_fasta==0 :
-elif m_fasta==1:
+seq = l[0]
-seq=l[0]
+seq_ready = "Y"
-seq_ready="Y"
-else:
+#---------------------------------------------------------------
-seq=""
-#---------------------------------------------------------------
 if seq_ready=="Y":
 # Normalize the characters
-seq=seq.upper().replace(".","N")
+seq = seq.upper().replace(".","N")
 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
 if len(read_tag) > 0 :
 all_tagged += 1
 for i in read_tag :
 n_cut_tag_lst[i] += 1
 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
 #-- Trimming adapter sequence ---
+all_base_before_trim += len(seq)
 if adapter_seq != "" :
-new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
 if len(new_read) < len(seq) :
 all_tagged_trimmed += 1
 seq = new_read
+all_base_after_trim += len(seq)
 if len(seq) <= 4 :
 seq = "N" * (cut_e - cut_s)
 # all reads will be considered, regardless of tags
 #---------  trimmed_raw_BS_read and qscore ------------------
-original_bs_reads[seq_id] = seq
+original_bs_reads[read_id] = seq
 #---------  FW_C2T  ------------------
-outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
 #---------  RC_G2A  ------------------
-outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
+outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
 fileinput.close()
 outf2.close()
 delete_files(read_file)
 logm("Processing input is done")
 #--------------------------------------------------------------------------------
 # mapping
 #--------------------------------------------------------------------------------
-WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+WC2T = tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
-CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+CC2T = tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
-WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
+WG2A = tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
-CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
+CG2A = tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
 'input_file' : outfile2,
 'output_file' : WC2T},
 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
 #--------------------------------------------------------------------------------
 # Post processing
 #--------------------------------------------------------------------------------
-FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
+FW_C2T_U,FW_C2T_R = extract_mapping(WC2T)
-RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
+RC_G2A_U,RC_G2A_R = extract_mapping(CG2A)
-FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
+FW_G2A_U,FW_G2A_R = extract_mapping(WG2A)
-RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
+RC_C2T_U,RC_C2T_R = extract_mapping(CC2T)
 logm("Extracting alignments is done")
 #----------------------------------------------------------------
 # get unique-hit reads
 #----------------------------------------------------------------
-Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
-Unique_FW_C2T=set() # +
+Unique_FW_C2T = set() # +
-Unique_RC_G2A=set() # +
+Unique_RC_G2A = set() # +
-Unique_FW_G2A=set() # -
+Unique_FW_G2A = set() # -
-Unique_RC_C2T=set() # -
+Unique_RC_C2T = set() # -
-Multiple_hits=set()
+Multiple_hits = set()
-for x in Union_set:
+for x in Union_set :
-_list=[]
+_list = []
 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
-mis_lst=dx.get(x,[99])
+mis_lst = dx.get(x,[99])
-mis=int(mis_lst[0])
+mis = int(mis_lst[0])
 _list.append(mis)
 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
-mis=dx.get(x,99)
+mis = dx.get(x,99)
 _list.append(mis)
-mini=min(_list)
+mini = min(_list)
 if _list.count(mini) == 1:
-mini_index=_list.index(mini)
+mini_index = _list.index(mini)
 if mini_index == 0:
 Unique_FW_C2T.add(x)
 elif mini_index == 1:
 Unique_RC_G2A.add(x)
 elif mini_index == 2:
 Multiple_hits.add(x)
 else :
 Multiple_hits.add(x)
 # write reads rejected by Multiple Hits to file
 if show_multiple_hit :
-outf_MH=open("Multiple_hit.fa",'w')
+outf_MH = open("Multiple_hit.fa",'w')
 for i in Multiple_hits :
 outf_MH.write(">%s\n" % i)
 outf_MH.write("%s\n" % original_bs_reads[i])
 outf_MH.close()
 del FW_C2T_R
 del FW_G2A_R
 del RC_C2T_R
 del RC_G2A_R
-FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+FW_C2T_uniq_lst = [[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
-FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
+FW_G2A_uniq_lst = [[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
-RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+RC_C2T_uniq_lst = [[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
-RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
+RC_G2A_uniq_lst = [[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
 FW_C2T_uniq_lst.sort()
 RC_C2T_uniq_lst.sort()
 FW_G2A_uniq_lst.sort()
 RC_G2A_uniq_lst.sort()
-FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
+FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst]
-RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
+RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst]
-FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
+FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst]
-RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
+RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst]
 del Unique_FW_C2T
 del Unique_FW_G2A
 del Unique_RC_C2T
 del Unique_RC_G2A
 #----------------------------------------------------------------
 # Post-filtering reads
-# ---- FW_C2T  ---- undirectional
+# ---- FW_C2T  ---- un-directional
 FW_regions = dict()
 gseq = dict()
 chr_length = dict()
 for header in FW_C2T_uniq_lst :
 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
 try_pos, FR)
 try_pos -= 1
 try_count += 1
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+mm_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
-# ---- RC_C2T ---- undirectional
+# ---- RC_C2T ---- un-directional
 RC_regions = dict()
 for header in RC_C2T_uniq_lst :
 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
 original_BS = original_bs_reads[header]
 if mapped_chr not in RC_regions :
 try_pos, FR)
 try_pos += 1
 try_count += 1
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+mm_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
-# ---- FW_G2A  ---- undirectional
+# ---- FW_G2A  ---- un-directional
 FW_regions = dict()
 gseq = dict()
 chr_length = dict()
 for header in FW_G2A_uniq_lst :
 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
 #if my_region_serial == 0 :
 #    print "[For debug]: chr=", mapped_chr
 #    print "[For debug]: FW_G2A read still can not find fragment serial"
 # Tip: sometimes "my_region_serial" is still 0 ...
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+max_mismatch_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
-# ---- RC_G2A ---- undirectional
+# ---- RC_G2A ---- un-directional
 RC_regions = dict()
 for header in RC_G2A_uniq_lst :
 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
 original_BS = original_bs_reads[header]
 if mapped_chr not in RC_regions :
 #if my_region_serial == 0 :
 #    print "[For debug]: chr=", mapped_chr
 #    print "[For debug]: RC_C2A read still cannot find fragment serial"
 N_mismatch = N_MIS(r_aln, g_aln)
-if N_mismatch <= int(max_mismatch_no) :
+#                    if N_mismatch <= int(max_mismatch_no) :
+mm_no=float(max_mismatch_no)
+if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
 all_mapped_passed += 1
 methy = methy_seq(r_aln, g_aln + next2bp)
 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
 #---XS FILTER----------------
 XS = 0
 output_genome = output_genome,
 rrbs = True,
 my_region_serial = my_region_serial,
 my_region_start = my_region_start,
 my_region_end = my_region_end)
+all_base_mapped += len(original_BS)
 else :
 print "[For debug]: reads not in same lengths"
 # Finished both FW and RC
 logm("Done: %s (%d) \n" % (read_file, no_my_files))
 print "--> %s (%d) "%(read_file, no_my_files)
 del original_bs_reads
 delete_files(WC2T, CC2T, WG2A, CG2A)
 # End of un-directional library
 delete_files(tmp_path)
-logm("O Number of raw reads: %d "% all_raw_reads)
+logm("Number of raw reads: %d "% all_raw_reads)
-if all_raw_reads >0:
+if all_raw_reads>0:
-logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
+logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
 for kk in range(len(n_cut_tag_lst)):
-logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
+logm("Number of raw reads with tag %s: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
-logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
+if adapter_seq!="" :
-logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+logm("Number of reads having adapter removed: %d "%all_tagged_trimmed)
-logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)
+logm("Number of bases in total: %d "%all_base_before_trim)
+if adapter_seq!="" :
-logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
+logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
-logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
+logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+logm("Number of unique-hits reads (before post-filtering): %d"%all_mapped)
-if asktag=="Y": # undiretional
+logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
+logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
+logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
+if asktag=="Y": # un-diretional
 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration
 # according to literal meaning
 elif asktag=="N": # directional
 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
+logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
-n_CG=mC_lst[0]+uC_lst[0]
-n_CHG=mC_lst[1]+uC_lst[1]
+n_CG = mC_lst[0] + uC_lst[0]
-n_CHH=mC_lst[2]+uC_lst[2]
+n_CHG = mC_lst[1] + uC_lst[1]
+n_CHH = mC_lst[2] + uC_lst[2]
 logm("----------------------------------------------")
-logm("M Methylated C in mapped reads ")
+logm("Methylated C in mapped reads ")
-logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
+logm(" mCG %1.3f%%"%( (100 * float(mC_lst[0]) / n_CG) if n_CG!=0 else 0))
-logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
+logm(" mCHG %1.3f%%"%( (100 * float(mC_lst[1]) / n_CHG) if n_CHG!=0 else 0))
-logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
+logm(" mCHH %1.3f%%"%( (100 * float(mC_lst[2]) / n_CHH) if n_CHH!=0 else 0))
 logm("----------------------------------------------")
 logm("------------------- END ----------------------")
 elapsed(main_read_file)

Mercurial > repos > weilong-guo > bs_seeker2

comparison BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc draft default tip