diff BSseeker2/bs_align/bs_single_end.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
line wrap: on
line diff
--- a/BSseeker2/bs_align/bs_single_end.py	Fri Jul 12 18:47:28 2013 -0400
+++ b/BSseeker2/bs_align/bs_single_end.py	Tue Nov 05 01:55:39 2013 -0500
@@ -1,6 +1,7 @@
 import fileinput, os, time, random, math
 from bs_utils.utils import *
 from bs_align_utils import *
+import gzip
 
 #----------------------------------------------------------------
 # Read from the mapped results, return lists of unique / multiple-hit reads
@@ -77,20 +78,23 @@
 
 
     #----------------------------------------------------------------
-    logm("Read filename: %s"% main_read_file )
-    logm("Un-directional library: %s" % asktag )
-    logm("The first base (for mapping): %d" % cut1)
-    logm("The last base (for mapping): %d" % cut2)
-    logm("Max. lines per mapping: %d"% no_small_lines)
-    logm("Aligner: %s" % aligner_command)
-    logm("Reference genome library path: %s" % db_path )
-    logm("Number of mismatches allowed: %s" % max_mismatch_no )
+    logm("Read filename: %s" % main_read_file)
+    logm("The first base (for mapping): %d"% cut1  )
+    logm("The  last base (for mapping): %d"% cut2  )
+
+    logm("Path for short reads aligner: %s"% aligner_command + '\n')
+    logm("Reference genome library path: %s"% db_path  )
+
+    if asktag == "Y" :
+        logm("Un-directional library" )
+    else :
+        logm("Directional library")
+
+    logm("Number of mismatches allowed: %s"% max_mismatch_no  )
+
     if adapter_file !="":
-        if asktag=="N":
-            logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n")))
-        elif asktag=="Y":
-            logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
-            logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
+        logm("Adapter seq: %s" % adapter_fw)
+    logm("-------------------------------- " )
     #----------------------------------------------------------------
 
     # helper method to join fname with tmp_path
@@ -109,9 +113,12 @@
 
     #---- Stats ------------------------------------------------------------
     all_raw_reads=0
-    all_trimed=0
+    all_trimmed=0
     all_mapped=0
     all_mapped_passed=0
+    all_base_before_trim=0
+    all_base_after_trim=0
+    all_base_mapped=0
 
     numbers_premapped_lst=[0,0,0,0]
     numbers_mapped_lst=[0,0,0,0]
@@ -119,7 +126,6 @@
     mC_lst=[0,0,0]
     uC_lst=[0,0,0]
 
-
     no_my_files=0
 
     #----------------------------------------------------------------
@@ -132,7 +138,7 @@
         random_id = ".tmp-"+str(random.randint(1000000,9999999))
 
         #-------------------------------------------------------------------
-        # undirectional sequencing
+        # un-directional sequencing
         #-------------------------------------------------------------------
         if asktag=="Y":  
 
@@ -146,74 +152,70 @@
             #----------------------------------------------------------------
             # detect format of input file
             try :
-                read_inf=open(read_file,"r")
+                if read_file.endswith(".gz") : # support input file ending with ".gz"
+                    read_inf = gzip.open(read_file, "rb")
+                else :
+                    read_inf=open(read_file,"r")
             except IOError :
                 print "[Error] Cannot open input file : %s" % read_file
                 exit(-1)
 
-            oneline=read_inf.readline()
-            l=oneline.split()
-            input_format=""
-            if oneline[0]=="@":	# fastq
-                input_format="fastq"
-                n_fastq=0
-            elif len(l)==1 and oneline[0]!=">": # pure sequences
-                input_format="seq"
-            elif len(l)==11: # qseq
-                input_format="qseq"
-            elif oneline[0]==">":	# fasta
-                input_format="fasta"
-                n_fasta=0
+            oneline = read_inf.readline()
+            l = oneline.split()
+            input_format = ""
+            if oneline[0]=="@":
+                input_format = "fastq"
+            elif len(l)==1 and oneline[0]!=">":
+                input_format = "seq"
+            elif len(l)==11:
+                input_format = "qseq"
+            elif oneline[0]==">":
+                input_format = "fasta"
             read_inf.close()
 
             #----------------------------------------------------------------
-            # read sequence, remove adapter and convert 
-            read_id=""
-            seq=""
-            seq_ready="N"
-            for line in fileinput.input(read_file):
-                l=line.split()
-
+            # read sequence, remove adapter and convert
+            read_id = ""
+            seq = ""
+            seq_ready = "N"
+            line_no = 0
+            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz
+                l = line.split()
+                line_no += 1
                 if input_format=="seq":
-                    all_raw_reads+=1
-                    read_id=str(all_raw_reads)
-                    read_id=read_id.zfill(12)
-                    seq=l[0]
-                    seq_ready="Y"
+                    all_raw_reads += 1
+                    read_id = str(all_raw_reads)
+                    read_id = read_id.zfill(12)
+                    seq = l[0]
+                    seq_ready = "Y"
                 elif input_format=="fastq":
-                    m_fastq=math.fmod(n_fastq,4)
-                    n_fastq+=1
-                    seq_ready="N"
-                    if m_fastq==0:
-                        all_raw_reads+=1
-                        read_id=str(all_raw_reads)
-                        read_id=read_id.zfill(12)
-                        seq=""
-                    elif m_fastq==1:
-                        seq=l[0]
-                        seq_ready="Y"
-                    else:
-                        seq=""
+                    l_fastq = math.fmod(line_no, 4)
+                    if l_fastq == 1 :
+                        all_raw_reads += 1
+                        read_id = l[0][1:]
+                        seq_ready = "N"
+                    elif l_fastq == 2 :
+                        seq = l[0]
+                        seq_ready = "Y"
+                    else :
+                        seq = ""
+                        seq_ready = "N"
                 elif input_format=="qseq":
-                    all_raw_reads+=1
-                    read_id=str(all_raw_reads)
-                    read_id=read_id.zfill(12)
-                    seq=l[8]
-                    seq_ready="Y"
-                elif input_format=="fasta":
-                    m_fasta=math.fmod(n_fasta,2)
-                    n_fasta+=1
-                    seq_ready="N"
-                    if m_fasta==0:
-                        all_raw_reads+=1
-                        #read_id=str(all_raw_reads)
-                        read_id=l[0][1:]
-                        seq=""
-                    elif m_fasta==1:
-                        seq=l[0]
-                        seq_ready="Y"
-                    else:
-                        seq=""
+                    all_raw_reads += 1
+                    read_id = str(all_raw_reads)
+                    read_id = read_id.zfill(12)
+                    seq = l[8]
+                    seq_ready = "Y"
+                elif input_format=="fasta" :
+                    l_fasta = math.fmod(line_no,2)
+                    if l_fasta==1:
+                        all_raw_reads += 1
+                        read_id = l[0][1:]
+                        seq = ""
+                        seq_ready = "N"
+                    elif l_fasta==0 :
+                        seq = l[0]
+                        seq_ready = "Y"
 
                 #----------------------------------------------------------------
                 if seq_ready=="Y":
@@ -222,13 +224,14 @@
                     seq=seq.replace(".","N")
 
                     # striping BS adapter from 3' read
+                    all_base_before_trim += len(seq)
                     if (adapter_fw !="") and (adapter_rc !="") :
                         new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
                         new_read = Remove_5end_Adapter(new_read, adapter_rc)
                         if len(new_read) < len(seq) :
-                            all_trimed += 1
+                            all_trimmed += 1
                         seq = new_read
-
+                    all_base_after_trim += len(seq)
                     if len(seq)<=4:
                         seq=''.join(["N" for x in xrange(cut2-cut1+1)])
 
@@ -353,18 +356,18 @@
             RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
             FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
             RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0] += len(Unique_FW_C2T)
+            numbers_premapped_lst[1] += len(Unique_RC_G2A)
+            numbers_premapped_lst[2] += len(Unique_FW_G2A)
+            numbers_premapped_lst[3] += len(Unique_RC_C2T)
 
             del Unique_FW_C2T
             del Unique_FW_G2A
             del Unique_RC_C2T
             del Unique_RC_G2A
 
-            #----------------------------------------------------------------
-            numbers_premapped_lst[0] += len(Unique_FW_C2T)
-            numbers_premapped_lst[1] += len(Unique_RC_G2A)
-            numbers_premapped_lst[2] += len(Unique_FW_G2A)
-            numbers_premapped_lst[3] += len(Unique_RC_C2T)
-
 
             #----------------------------------------------------------------
 
@@ -376,7 +379,6 @@
                                             (FW_G2A_uniq_lst,FW_G2A_U),
                                             (RC_C2T_uniq_lst,RC_C2T_U)]:
                 nn += 1
-                mapped_chr0 = ""
 
                 for header in ali_unique_lst:
 
@@ -422,7 +424,9 @@
 
                     if len(r_aln)==len(g_aln):
                         N_mismatch = N_MIS(r_aln, g_aln)
-                        if N_mismatch <= int(max_mismatch_no):
+#                        if N_mismatch <= int(max_mismatch_no):
+                        mm_no=float(max_mismatch_no)
+                        if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                             numbers_mapped_lst[nn-1] += 1
                             all_mapped_passed += 1
                             methy = methy_seq(r_aln, g_aln + next)
@@ -436,6 +440,7 @@
                                 XS = 1
 
                             outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
+                            all_base_mapped += len(original_BS)
 
             #----------------------------------------------------------------
             logm("--> %s (%d) "%(read_file, no_my_files))
@@ -449,78 +454,74 @@
 
         if asktag=="N":  
             #----------------------------------------------------------------
-            outfile2=tmp_d('Trimed_C2T.fa'+random_id)
+            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
             outf2=open(outfile2,'w')
-
-            n=0
             #----------------------------------------------------------------
             try :
-                read_inf=open(read_file,"r")
+                if read_file.endswith(".gz") : # support input file ending with ".gz"
+                    read_inf = gzip.open(read_file, "rb")
+                else :
+                    read_inf=open(read_file,"r")
             except IOError :
                 print "[Error] Cannot open input file : %s" % read_file
                 exit(-1)
 
-            oneline=read_inf.readline()
-            l=oneline.split()
-            input_format=""
-            if oneline[0]=="@":	# FastQ
-                input_format="fastq"
-                n_fastq=0
-            elif len(l)==1 and oneline[0]!=">": # pure sequences
-                input_format="seq"
-            elif len(l)==11: # Illumina GAII qseq file
-                input_format="qseq"
-            elif oneline[0]==">":	# fasta
-                input_format="fasta"
-                n_fasta=0
+            oneline = read_inf.readline()
+            l = oneline.split()
+            input_format = ""
+            if oneline[0]=="@":
+                input_format = "fastq"
+            elif len(l)==1 and oneline[0]!=">":
+                input_format = "seq"
+            elif len(l)==11:
+                input_format = "qseq"
+            elif oneline[0]==">":
+                input_format = "fasta"
             read_inf.close()
+
             #print "detected data format: %s"%(input_format)
             #----------------------------------------------------------------
             read_id=""
             seq=""
             seq_ready="N"
-            for line in fileinput.input(read_file):
-                l=line.split()
+            line_no = 0
+            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
+                l = line.split()
+                line_no += 1
                 if input_format=="seq":
-                    all_raw_reads+=1
-                    read_id=str(all_raw_reads)
-                    read_id=read_id.zfill(12)
-                    seq=l[0]
-                    seq_ready="Y"
+                    all_raw_reads += 1
+                    read_id = str(all_raw_reads)
+                    read_id = read_id.zfill(12)
+                    seq = l[0]
+                    seq_ready = "Y"
                 elif input_format=="fastq":
-                    m_fastq=math.fmod(n_fastq,4)
-                    n_fastq+=1
-                    seq_ready="N"
-                    if m_fastq==0:
-                        all_raw_reads+=1
-                        read_id=str(all_raw_reads)
-                        read_id=read_id.zfill(12)
-                        seq=""
-                    elif m_fastq==1:
-                        seq=l[0]
-                        seq_ready="Y"
-                    else:
-                        seq=""
+                    l_fastq = math.fmod(line_no, 4)
+                    if l_fastq == 1 :
+                        all_raw_reads += 1
+                        read_id = l[0][1:]
+                        seq_ready = "N"
+                    elif l_fastq == 2 :
+                        seq = l[0]
+                        seq_ready = "Y"
+                    else :
+                        seq = ""
+                        seq_ready = "N"
                 elif input_format=="qseq":
-                    all_raw_reads+=1
-                    read_id=str(all_raw_reads)
-                    read_id=read_id.zfill(12)
-                    seq=l[8]
-                    seq_ready="Y"
-                elif input_format=="fasta":
-                    m_fasta=math.fmod(n_fasta,2)
-                    n_fasta+=1
-                    seq_ready="N"
-                    if m_fasta==0:
-                        all_raw_reads+=1
-                        read_id=l[0][1:]
-                        seq=""
-                    elif m_fasta==1:
-                        seq=l[0]
-                        seq_ready="Y"
-                    else:
-                        seq=""
-
+                    all_raw_reads += 1
+                    read_id = str(all_raw_reads)
+                    read_id = read_id.zfill(12)
+                    seq = l[8]
+                    seq_ready = "Y"
+                elif input_format=="fasta" :
+                    l_fasta = math.fmod(line_no,2)
+                    if l_fasta==1:
+                        all_raw_reads += 1
+                        read_id = l[0][1:]
+                        seq = ""
+                        seq_ready = "N"
+                    elif l_fasta==0 :
+                        seq = l[0]
+                        seq_ready = "Y"
                 #--------------------------------
                 if seq_ready=="Y":
                     seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72  -e 52
@@ -528,12 +529,13 @@
                     seq=seq.replace(".","N")
 
                     #--striping adapter from 3' read -------
+                    all_base_before_trim += len(seq)
                     if adapter != "":
                         new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
                         if len(new_read) < len(seq) :
-                            all_trimed += 1
+                            all_trimmed += 1
                         seq = new_read
-
+                    all_base_after_trim += len(seq)
                     if len(seq)<=4:
                         seq = "N" * (cut2-cut1+1)
 
@@ -612,7 +614,6 @@
                 outf_MH.close()
 
 
-
             FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
             RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
             FW_C2T_uniq_lst.sort()
@@ -633,7 +634,6 @@
             chr_length = dict()
             for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
                 nn += 1
-                mapped_chr0 = ""
                 for header in ali_unique_lst:
                     _, mapped_chr, mapped_location, cigar = ali_dic[header]
                     original_BS = original_bs_reads[header]
@@ -641,11 +641,6 @@
                     if mapped_chr not in gseq :
                         gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                         chr_length[mapped_chr] = len(gseq[mapped_chr])
-                    #if mapped_chr != mapped_chr0:
-                    #    my_gseq = deserialize(db_d(mapped_chr))
-                    #    chr_length = len(my_gseq)
-                    #    mapped_chr0 = mapped_chr
-                    #-------------------------------------
 
                     r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
 
@@ -664,7 +659,8 @@
 
                     if len(r_aln) == len(g_aln):
                         N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
-                        if N_mismatch <= int(max_mismatch_no):
+                        mm_no=float(max_mismatch_no)
+                        if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                             numbers_mapped_lst[nn-1] += 1
                             all_mapped_passed += 1
                             methy = methy_seq(r_aln, g_aln+next)
@@ -678,6 +674,7 @@
                                 XS = 1
 
                             outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
+                            all_base_mapped += len(original_BS)
 
             #----------------------------------------------------------------
             logm("--> %s (%d) "%(read_file,no_my_files))
@@ -686,14 +683,16 @@
 
     #----------------------------------------------------------------
 
-#    outf.close()
     delete_files(tmp_path)
 
     logm("Number of raw reads: %d \n"% all_raw_reads)
     if all_raw_reads >0:
-        logm("Number of reads having adapter removed: %d \n" % all_trimed )
-        logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
-        logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped)
+        logm("Number of bases in total: %d "%all_base_before_trim)
+        if adapter != "" :
+            logm("Number of reads having adapter removed: %d \n" % all_trimmed )
+            logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
+        logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped)
         if asktag=="Y":
             logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
             logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
@@ -713,6 +712,8 @@
             logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
             logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
         logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
+        logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
+
 
         n_CG=mC_lst[0]+uC_lst[0]
         n_CHG=mC_lst[1]+uC_lst[1]