BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc (draft, default, tip)
BS-Seeker2 V2.0.5
author: weilong-guo
date:   Tue, 05 Nov 2013 01:55:39 -0500
parent: e6df770c0e58

import fileinput, random, math, os.path
from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS
from bs_utils.utils import *
from bs_align.bs_single_end import extract_mapping
from bs_align_utils import *


def my_mappable_region(chr_regions, mapped_location, FR):
    # start_position (first C), end_position (last G), serial, sequence
    #print len(chr_regions)
    out_serial = 0
    out_start = -1
    out_end = -1
    #print "mapped_location:", mapped_location
    if FR == "+FW" or FR == "-RC":
        my_location = str(mapped_location)
        if my_location in chr_regions:
            my_lst = chr_regions[my_location]
            out_start = int(my_location)
            out_end = my_lst[0]
            out_serial = my_lst[1]
        #else :
        #    print "[For debug]: +FW location %s cannot be found" % my_location
    elif FR == "-FW" or FR == "+RC":
        my_location = str(mapped_location)
        if my_location in chr_regions:
            my_lst = chr_regions[my_location]
            out_end = int(my_location)
            out_start = my_lst[0]
            out_serial = my_lst[1]
        #else :
        #    print "[For debug]: -FW location %s cannot be found" % my_location
    return out_serial, out_start, out_end


#----------------------------------------------------------------

def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,
            aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch,
            cut_format="C-CGG", show_multiple_hit=False):
    #----------------------------------------------------------------
    # For double enzyme: cut_format="C-CGG,A-CTG"; ApeKI: "G^CWGC"
    #cut_context = re.sub("-", "", cut_format)
    # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
    cut_format_lst = EnumerateIUPAC(cut_format.upper().split(","))   # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
    cut_context = [i.replace("-", "") for i in cut_format_lst]       # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
    cut5_context = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_format_lst]   # ['G', 'AT', 'C', 'G']
    cut3_context = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_format_lst]   # ['CAGC', 'CG', 'CGG', 'CTGC']
    cut_len = [len(i) for i in cut_context]                          # [5, 4, 4, 5]
    min_cut5_len = min([len(i) for i in cut5_context])
    #print cut_format_lst
    #print cut_format
    #print cut5_context
    cut_tag_lst = Enumerate_C_to_CT(cut_format_lst)
    # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG',
    #  'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT']
    cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst]
    cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst]
    check_pattern = [i[-2:] + "_" + j for i, j in zip(cut5_tag_lst, cut3_tag_lst)]
    #print "======="
    #print cut_tag_lst
    #print cut3_tag_lst
    #print cut5_tag_lst
    #print check_pattern

    # set region [gx, gy] for checking_genome_context
    gx = [0 if j > 2 else 2 - j for j in [len(i) for i in cut5_tag_lst]]   # [XC-CGG]
    gy = [3 + len(i) for i in cut3_tag_lst]

    #----------------------------------------------------------------
    # helper methods to join fname with tmp_path / db_path
    tmp_d = lambda fname: os.path.join(tmp_path, fname)
    db_d = lambda fname: os.path.join(db_path, fname)

    MAX_TRY = 500   # For finding the serial_no
    whole_adapter_seq = ""
    #----------------------------------------------------------------
    adapter_seq = ""
    if adapter_file:
        try:
            adapter_inf = open(adapter_file, "r")
            whole_adapter_seq = adapter_inf.readline().strip()
            adapter_seq = whole_adapter_seq[0:10]   # only use the first 10 bp of the adapter
            adapter_inf.close()
        except IOError:
            print "[Error] Cannot find adapter file : %s !" % adapter_file
            exit(-1)
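    # ----------------------------------------------------------------
    # Illustrative note (added, not part of the original source).
    # For the default cut_format "C-CGG" (MspI), the expansion above should,
    # by analogy with the example comments in this file, work out to roughly:
    #   cut_tag_lst   : ['C-CGG', 'T-CGG', 'C-TGG', 'T-TGG']  (every C enumerated as C or T)
    #   cut3_tag_lst  : ['CGG', 'CGG', 'TGG', 'TGG']  -> bisulfite-converted reads are
    #                   expected to begin with CGG or TGG
    #   check_pattern : ['C_CGG', 'T_CGG', 'C_TGG', 'T_TGG']
    #   gx, gy        : [1, 1, 1, 1], [6, 6, 6, 6]  -> output_genome[1:6] is compared
    #                   against each pattern when locating the RRBS fragment below.
    # Treat these values as an inferred illustration rather than a guaranteed trace.
    # ----------------------------------------------------------------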
    logm("Read filename: %s" % main_read_file)
    logm("The first base (for mapping): %d" % cut_s)
    logm("The last base (for mapping): %d" % cut_e)
    logm("Path for short reads aligner: %s" % aligner_command + '\n')
    logm("Reference genome library path: %s" % db_path)
    if asktag == "Y":
        logm("Un-directional library")
    else:
        logm("Directional library")
    logm("Number of mismatches allowed: %s" % max_mismatch_no)
    if adapter_file != "":
        logm("Adapter seq: %s" % whole_adapter_seq)
    logm("-------------------------------- ")

    #----------------------------------------------------------------
    all_raw_reads = 0
    all_tagged = 0
    all_tagged_trimmed = 0
    all_base_before_trim = 0
    all_base_after_trim = 0
    all_base_mapped = 0
    all_mapped = 0
    all_mapped_passed = 0

    n_cut_tag_lst = {}
    #print cut3_tag_lst
    for x in cut3_tag_lst:
        n_cut_tag_lst[x] = 0

    mC_lst = [0, 0, 0]
    uC_lst = [0, 0, 0]
    no_my_files = 0

    num_mapped_FW_C2T = 0
    num_mapped_RC_C2T = 0
    num_mapped_FW_G2A = 0
    num_mapped_RC_G2A = 0

    # Count of nucleotides, which should be cut before the adapters
    Extra_base_cut_5end_adapter = max([abs(len(i) - len(j)) for i, j in zip(cut5_context, cut3_context)])

    #===============================================
    # directional sequencing
    #===============================================

    if asktag == "N":
        #----------------------------------------------------------------
        logm("== Start mapping ==")
        input_fname = os.path.split(main_read_file)[1]
        for read_file in isplit_file(main_read_file, tmp_d(input_fname) + '-s-', no_small_lines):
            logm("Processing read file: %s" % read_file)
            original_bs_reads = {}
            no_my_files += 1
            random_id = ".tmp-" + str(random.randint(1000000, 9999999))

            outfile2 = tmp_d('Trimmed_C2T.fa' + random_id)
            outf2 = open(outfile2, 'w')

            #--- Checking input format ------------------------------------------
            try:
                if read_file.endswith(".gz"):   # support input file ending with ".gz"
                    read_inf = gzip.open(read_file, "rb")
                else:
                    read_inf = open(read_file, "r")
            except IOError:
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline = read_inf.readline()
            l = oneline.split()
            input_format = ""
            if oneline[0] == "@":
                input_format = "fastq"
            elif len(l) == 1 and oneline[0] != ">":
                input_format = "seq"
            elif len(l) == 11:
                input_format = "qseq"
            elif oneline[0] == ">":
                input_format = "fasta"
            read_inf.close()

            #----------------------------------------------------------------
            read_id = ""
            seq = ""
            seq_ready = 0
            line_no = 0
            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
                l = line.split()
                line_no += 1
                if input_format == "seq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[0]
                    seq_ready = "Y"
                elif input_format == "fastq":
                    l_fastq = math.fmod(line_no, 4)
                    if l_fastq == 1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq_ready = "N"
                    elif l_fastq == 2:
                        seq = l[0]
                        seq_ready = "Y"
                    else:
                        seq = ""
                        seq_ready = "N"
                elif input_format == "qseq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[8]
                    seq_ready = "Y"
                elif input_format == "fasta":
                    l_fasta = math.fmod(line_no, 2)
                    if l_fasta == 1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq = ""
                        seq_ready = "N"
                    elif l_fasta == 0:
                        seq = l[0]
                        seq_ready = "Y"
                #---------------------------------------------------------------
                if seq_ready == "Y":
                    # Normalize the characters
                    seq = seq.upper().replace(".", "N")

                    read_tag = [m for m, n in [(i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m]
                    if len(read_tag) > 0:
                        all_tagged += 1
                        for i in read_tag:
                            n_cut_tag_lst[i] += 1

                    seq = seq[(cut_s - 1):cut_e]   # cut_s start from 1 cycle by default
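                    # (Added note, not part of the original source.)  With the default
                    # MspI cut site "C-CGG", uniq(cut3_tag_lst) is effectively ['CGG', 'TGG'],
                    # so the tag counting above records reads that begin with CGG or TGG,
                    # which is the "reads with CGG/TGG at 5' end" figure in the final summary.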
                    #-- Trimming adapter sequence ---
                    all_base_before_trim += len(seq)
                    if adapter_seq != "":
                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
                        if len(new_read) < len(seq):
                            all_tagged_trimmed += 1
                        seq = new_read
                    all_base_after_trim += len(seq)

                    if len(seq) <= 4:
                        seq = "N" * (cut_e - cut_s)   # all reads will be considered, regardless of tags

                    #--------- trimmed_raw_BS_read and qscore ------------------
                    original_bs_reads[read_id] = seq
                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace('C', 'T')))

            fileinput.close()
            outf2.close()

            delete_files(read_file)
            logm("Processing input is done")

            #--------------------------------------------------------------------------------
            # mapping
            #--------------------------------------------------------------------------------
            WC2T = tmp_d("W_C2T_m" + max_mismatch_no + ".mapping" + random_id)
            CC2T = tmp_d("C_C2T_m" + max_mismatch_no + ".mapping" + random_id)

            run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path, 'W_C2T'),
                                                'input_file' : outfile2,
                                                'output_file' : WC2T},
                             aligner_command % {'reference_genome' : os.path.join(db_path, 'C_C2T'),
                                                'input_file' : outfile2,
                                                'output_file' : CC2T}])
            logm("Aligning reads is done")
            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------
            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
            logm("Extracting alignments is done")

            #----------------------------------------------------------------
            # get uniq-hit reads
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T = set()   # +
            Unique_RC_C2T = set()   # -
            Multiple_hits = set()

            for x in Union_set:
                _list = []
                for dx in [FW_C2T_U, RC_C2T_U]:
                    mis_lst = dx.get(x, [99])
                    mis = int(mis_lst[0])
                    _list.append(mis)
                for dx in [FW_C2T_R, RC_C2T_R]:
                    mis = dx.get(x, 99)
                    _list.append(mis)
                mini = min(_list)
                if _list.count(mini) == 1:
                    mini_index = _list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_C2T.add(x)
                    else:
                        Multiple_hits.add(x)
                else:
                    Multiple_hits.add(x)

            # write reads rejected by Multiple Hits to file
            if show_multiple_hit:
                outf_MH = open("Multiple_hit.fa", 'w')
                for i in Multiple_hits:
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del RC_C2T_R

            FW_uniq_lst = [[FW_C2T_U[u][1], u] for u in Unique_FW_C2T]
            RC_uniq_lst = [[RC_C2T_U[u][1], u] for u in Unique_RC_C2T]
            FW_uniq_lst.sort()
            RC_uniq_lst.sort()
            FW_uniq_lst = [x[1] for x in FW_uniq_lst]
            RC_uniq_lst = [x[1] for x in RC_uniq_lst]

            del Unique_FW_C2T
            del Unique_RC_C2T

            #----------------------------------------------------------------
            # Post-filtering reads
            # ---- FW ----
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_uniq_lst:
                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions:
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                all_mapped += 1
                FR = "+FW"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location - len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: FW read has no tags"
                            try_count = 0
                            try_pos = mapped_location - min_cut5_len + 1
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                                try_pos -= 1
                                try_count += 1
                            #if my_region_serial == 0 :
                            #    print "[For debug]: chr=", mapped_chr
                            #    print "[For debug]: +FW read still can not find fragment serial"
                            # Tip: sometimes "my_region_serial" is still 0 ...

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_FW_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            #print "start RC"
            # ---- RC ----
            RC_regions = dict()
            for header in RC_uniq_lst:
                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions:
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                all_mapped += 1
                FR = "-FW"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                #checking_genome_context = (output_genome[gx:gy] == check_pattern)
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):   # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location + g_len - 1 + len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], try_pos, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: RC Read has no tags"
                            try_count = 0
                            try_pos = mapped_location + g_len + min_cut5_len - 2
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], try_pos, FR)
                                try_pos += 1
                                try_count += 1
                            #if my_region_serial == 0 :
                            #    print "[For debug]: chr=", mapped_chr
                            #    print "[For debug]: -FW read still cannot find fragment serial"

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_RC_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            # Finished both FW and RC
            logm("Done: %s (%d) \n" % (read_file, no_my_files))
            print "--> %s (%d) " % (read_file, no_my_files)
            del original_bs_reads
            delete_files(WC2T, CC2T)

        # End of directional library

    # ====================================================
    # un-directional library
    # ====================================================

    elif asktag == "Y":
        #----------------------------------------------------------------
        logm("== Start mapping ==")
        input_fname = os.path.split(main_read_file)[1]
        for read_file in isplit_file(main_read_file, tmp_d(input_fname) + '-s-', no_small_lines):
            logm("Processing read file: %s" % read_file)
            original_bs_reads = {}
            no_my_files += 1
            random_id = ".tmp-" + str(random.randint(1000000, 9999999))

            outfile2 = tmp_d('Trimmed_C2T.fa' + random_id)
            outfile3 = tmp_d('Trimmed_G2A.fa' + random_id)
            outf2 = open(outfile2, 'w')
            outf3 = open(outfile3, 'w')

            #--- Checking input format ------------------------------------------
            try:
                if read_file.endswith(".gz"):   # support input file ending with ".gz"
                    read_inf = gzip.open(read_file, "rb")
                else:
                    read_inf = open(read_file, "r")
            except IOError:
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline = read_inf.readline()
            l = oneline.split()
            input_format = ""
            if oneline[0] == "@":
                input_format = "fastq"
            elif len(l) == 1 and oneline[0] != ">":
                input_format = "seq"
            elif len(l) == 11:
                input_format = "qseq"
            elif oneline[0] == ">":
                input_format = "fasta"
            read_inf.close()

            #----------------------------------------------------------------
            read_id = ""
            seq = ""
            seq_ready = 0
            line_no = 0
            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
                l = line.split()
                line_no += 1
                if input_format == "seq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[0]
                    seq_ready = "Y"
                elif input_format == "fastq":
                    l_fastq = math.fmod(line_no, 4)
                    if l_fastq == 1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq_ready = "N"
                    elif l_fastq == 2:
                        seq = l[0]
                        seq_ready = "Y"
                    else:
                        seq = ""
                        seq_ready = "N"
                elif input_format == "qseq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[8]
                    seq_ready = "Y"
                elif input_format == "fasta":
                    l_fasta = math.fmod(line_no, 2)
                    if l_fasta == 1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq = ""
                        seq_ready = "N"
                    elif l_fasta == 0:
                        seq = l[0]
                        seq_ready = "Y"
                #---------------------------------------------------------------
                if seq_ready == "Y":
                    # Normalize the characters
                    seq = seq.upper().replace(".", "N")
                    read_tag = [m for m, n in [(i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m]
                    if len(read_tag) > 0:
                        all_tagged += 1
                        for i in read_tag:
                            n_cut_tag_lst[i] += 1

                    seq = seq[(cut_s - 1):cut_e]   # cut_s start from 1 cycle by default

                    #-- Trimming adapter sequence ---
                    all_base_before_trim += len(seq)
                    if adapter_seq != "":
                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
                        if len(new_read) < len(seq):
                            all_tagged_trimmed += 1
                        seq = new_read
                    all_base_after_trim += len(seq)

                    if len(seq) <= 4:
                        seq = "N" * (cut_e - cut_s)   # all reads will be considered, regardless of tags

                    #--------- trimmed_raw_BS_read and qscore ------------------
                    original_bs_reads[read_id] = seq
                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace('C', 'T')))
                    #--------- RC_G2A ------------------
                    outf3.write('>%s\n%s\n' % (read_id, seq.replace("G", "A")))

            fileinput.close()
            outf2.close()
            outf3.close()   # make sure the G2A reads are flushed before alignment

            delete_files(read_file)
            logm("Processing input is done")

            #--------------------------------------------------------------------------------
            # mapping
            #--------------------------------------------------------------------------------
            WC2T = tmp_d("W_C2T_m" + max_mismatch_no + ".mapping" + random_id)
            CC2T = tmp_d("C_C2T_m" + max_mismatch_no + ".mapping" + random_id)
            WG2A = tmp_d("W_G2A_m" + max_mismatch_no + ".mapping" + random_id)
            CG2A = tmp_d("C_G2A_m" + max_mismatch_no + ".mapping" + random_id)

            run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path, 'W_C2T'),
                                                'input_file' : outfile2,
                                                'output_file' : WC2T},
                             aligner_command % {'reference_genome' : os.path.join(db_path, 'C_C2T'),
                                                'input_file' : outfile2,
                                                'output_file' : CC2T},
                             aligner_command % {'reference_genome' : os.path.join(db_path, 'W_G2A'),
                                                'input_file' : outfile3,
                                                'output_file' : WG2A},
                             aligner_command % {'reference_genome' : os.path.join(db_path, 'C_G2A'),
                                                'input_file' : outfile3,
                                                'output_file' : CG2A}])
            logm("Aligning reads is done")
            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------
            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
            RC_G2A_U, RC_G2A_R = extract_mapping(CG2A)
            FW_G2A_U, FW_G2A_R = extract_mapping(WG2A)
            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
            logm("Extracting alignments is done")

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T = set()   # +
            Unique_RC_G2A = set()   # +
            Unique_FW_G2A = set()   # -
            Unique_RC_C2T = set()   # -
            Multiple_hits = set()

            for x in Union_set:
                _list = []
                for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
                    mis_lst = dx.get(x, [99])
                    mis = int(mis_lst[0])
                    _list.append(mis)
                for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
                    mis = dx.get(x, 99)
                    _list.append(mis)
                mini = min(_list)
                if _list.count(mini) == 1:
                    mini_index = _list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_G2A.add(x)
                    elif mini_index == 2:
                        Unique_FW_G2A.add(x)
                    elif mini_index == 3:
                        Unique_RC_C2T.add(x)
                    else:
                        Multiple_hits.add(x)
                else:
                    Multiple_hits.add(x)

            # write reads rejected by Multiple Hits to file
            if show_multiple_hit:
                outf_MH = open("Multiple_hit.fa", 'w')
                for i in Multiple_hits:
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del FW_G2A_R
            del RC_C2T_R
            del RC_G2A_R
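            # (Added note, not part of the original source.)  The four uniquely-mapped
            # sets are post-filtered separately below: FW_C2T hits are reported as "+FW",
            # RC_C2T as "-FW", FW_G2A as "-RC" and RC_G2A as "+RC", matching the FR values
            # assigned in each loop.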
            FW_C2T_uniq_lst = [[FW_C2T_U[u][1], u] for u in Unique_FW_C2T]
            FW_G2A_uniq_lst = [[FW_G2A_U[u][1], u] for u in Unique_FW_G2A]
            RC_C2T_uniq_lst = [[RC_C2T_U[u][1], u] for u in Unique_RC_C2T]
            RC_G2A_uniq_lst = [[RC_G2A_U[u][1], u] for u in Unique_RC_G2A]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_G2A_uniq_lst.sort()
            RC_G2A_uniq_lst.sort()
            FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst]
            FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst]
            RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst]

            del Unique_FW_C2T
            del Unique_FW_G2A
            del Unique_RC_C2T
            del Unique_RC_G2A

            #----------------------------------------------------------------
            # Post-filtering reads
            # ---- FW_C2T ---- un-directional
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_C2T_uniq_lst:
                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions:
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                all_mapped += 1
                FR = "+FW"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location - len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: FW read has no tags"
                            try_count = 0
                            try_pos = mapped_location - min_cut5_len + 1
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                                try_pos -= 1
                                try_count += 1

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_FW_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            # ---- RC_C2T ---- un-directional
            RC_regions = dict()
            for header in RC_C2T_uniq_lst:
                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions:
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])
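                # (Added note, not part of the original source.)  Hits on the "C_C2T"
                # index are reported in the coordinates of the reverse-complemented (Crick)
                # reference, so the conversion just below maps them back to forward-strand
                # (Watson) coordinates before the fragment lookup.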
                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                all_mapped += 1
                FR = "-FW"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):   # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location + g_len - 1 + len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], try_pos, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: RC Read has no tags"
                            try_count = 0
                            try_pos = mapped_location + g_len + min_cut5_len - 2
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], try_pos, FR)
                                try_pos += 1
                                try_count += 1

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_RC_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            # ---- FW_G2A ---- un-directional
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_G2A_uniq_lst:
                _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions:
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                cigar = list(reversed(cigar))
                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                all_mapped += 1
                FR = "-RC"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                original_BS = reverse_compl_seq(original_BS)   # for RC reads
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location - len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: FW read has no tags"
                            try_count = 0
                            try_pos = mapped_location - min_cut5_len + 1
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], try_pos, FR)
                                try_pos += 1
                                try_count += 1
                            #if my_region_serial == 0 :
                            #    print "[For debug]: chr=", mapped_chr
                            #    print "[For debug]: FW_G2A read still can not find fragment serial"
                            # Tip: sometimes "my_region_serial" is still 0 ...

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_FW_G2A += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            # ---- RC_G2A ---- un-directional
            RC_regions = dict()
            for header in RC_G2A_uniq_lst:
                _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions:
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq:
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                cigar = list(reversed(cigar))
                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                all_mapped += 1
                FR = "+RC"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                original_BS = reverse_compl_seq(original_BS)   # for RC reads
                checking_genome_context = [output_genome[i:j] == k for i, j, k in zip(gx, gy, check_pattern)]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln):   # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context:
                        try_pos = [mapped_location + g_len - 1 + len(i) for i, j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                              mapped_location + g_len + min_cut5_len - 1, FR)
                        if my_region_serial == 0:   # still be 0
                            # for some cases, read has no tags; searching the upstream sequence for tags
                            #print "[For debug]: RC Read has no tags"
                            try_count = 0
                            try_pos = mapped_location + g_len + min_cut5_len - 2
                            while my_region_serial == 0 and try_count < MAX_TRY:
                                my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], try_pos, FR)
                                try_pos += 1
                                try_count += 1
                            #if my_region_serial == 0 :
                            #    print "[For debug]: chr=", mapped_chr
                            #    print "[For debug]: RC_G2A read still cannot find fragment serial"

                    N_mismatch = N_MIS(r_aln, g_aln)
                    # if N_mismatch <= int(max_mismatch_no) :
                    mm_no = float(max_mismatch_no)
                    if (mm_no >= 1 and N_mismatch <= mm_no) or (mm_no < 1 and N_mismatch <= (mm_no * len(r_aln))):
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if (nmCH > XS_count) and nmCH / float(nCH + nmCH) > XS_pct:
                            XS = 1

                        num_mapped_RC_G2A += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location,
                                      cigar, original_BS, methy, XS, output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                        all_base_mapped += len(original_BS)
                else:
                    print "[For debug]: reads not in same lengths"

            # Finished both FW and RC
            logm("Done: %s (%d) \n" % (read_file, no_my_files))
            print "--> %s (%d) " % (read_file, no_my_files)
            del original_bs_reads
            delete_files(WC2T, CC2T, WG2A, CG2A)

        # End of un-directional library

    delete_files(tmp_path)

    logm("Number of raw reads: %d " % all_raw_reads)
    if all_raw_reads > 0:
        logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)" % (all_tagged, float(all_tagged) / all_raw_reads))
        for kk in range(len(n_cut_tag_lst)):
            logm("Number of raw reads with tag %s: %d (%1.3f)" % (cut3_tag_lst[kk],
                                                                  n_cut_tag_lst[cut3_tag_lst[kk]],
                                                                  float(n_cut_tag_lst[cut3_tag_lst[kk]]) / all_raw_reads))
        if adapter_seq != "":
            logm("Number of reads having adapter removed: %d " % all_tagged_trimmed)
        logm("Number of bases in total: %d " % all_base_before_trim)
        if adapter_seq != "":
            logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim,
                                                                              float(all_base_after_trim) / all_base_before_trim))
        logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits))
        logm("Number of unique-hit reads (before post-filtering): %d" % all_mapped)
        logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s" % (all_mapped_passed, max_mismatch_no))
        logm("Mappability = %1.4f%%" % (100 * float(all_mapped_passed) / all_raw_reads))
        logm("Total bases of uniquely mapped reads %7d" % all_base_mapped)

        if asktag == "Y":   # un-directional
            logm(" ---- %7d FW reads mapped to Watson strand" % (num_mapped_FW_C2T))
            logm(" ---- %7d RC reads mapped to Watson strand" % (num_mapped_FW_G2A))
            logm(" ---- %7d FW reads mapped to Crick strand" % (num_mapped_RC_C2T))
            logm(" ---- %7d RC reads mapped to Crick strand" % (num_mapped_RC_G2A))
            # Note: taken literally, the num_mapped_* variable names do not line up with
            # the strand descriptions above; the counters are named after the index each
            # read was aligned against.
        elif asktag == "N":   # directional
            logm(" ---- %7d FW reads mapped to Watson strand" % (num_mapped_FW_C2T))
            logm(" ---- %7d FW reads mapped to Crick strand" % (num_mapped_RC_C2T))
        logm("Total bases of uniquely mapped reads %7d" % all_base_mapped)

        n_CG = mC_lst[0] + uC_lst[0]
        n_CHG = mC_lst[1] + uC_lst[1]
        n_CHH = mC_lst[2] + uC_lst[2]

        logm("----------------------------------------------")
        logm("Methylated C in mapped reads ")
        logm(" mCG  %1.3f%%" % ((100 * float(mC_lst[0]) / n_CG) if n_CG != 0 else 0))
        logm(" mCHG %1.3f%%" % ((100 * float(mC_lst[1]) / n_CHG) if n_CHG != 0 else 0))
        logm(" mCHH %1.3f%%" % ((100 * float(mC_lst[2]) / n_CHH) if n_CHH != 0 else 0))
        logm("----------------------------------------------")
        logm("------------------- END ----------------------")

    elapsed(main_read_file)
    close_log()
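
#----------------------------------------------------------------
# Illustrative note (added, not part of the original source).
# bs_rrbs() is normally driven by the bs_seeker2-align.py front end, which prepares
# "aligner_command" as a template containing the placeholders %(reference_genome)s,
# %(input_file)s and %(output_file)s (see the run_in_parallel() calls above) and passes
# an opened output writer exposing .store(...) as "outfile".  A minimal, hypothetical
# call could look like:
#
#   bs_rrbs("reads.fq", "N", "", 1, 36, 4000000, "4",
#           aligner_command, db_path, tmp_path, outfile,
#           XS_pct=0.5, XS_count=5, adapter_mismatch=1,
#           cut_format="C-CGG")
#
# where aligner_command, db_path, tmp_path and outfile are assumed to be set up by the
# caller; the values shown are placeholders, not recommended defaults.
#----------------------------------------------------------------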