view tools/regVariation/microsats_mutability.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/env python
#Guruprasad Ananda
"""
This tool computes microsatellite mutability for the orthologous microsatellites fetched from  'Extract Orthologous Microsatellites from pair-wise alignments' tool.
"""
from galaxy import eggs
import sys, string, re, commands, tempfile, os, fileinput
from galaxy.tools.util.galaxyops import *
from bx.intervals.io import *
from bx.intervals.operations import quicksect

fout = open(sys.argv[2],'w')
p_group = int(sys.argv[3])        #primary "group-by" feature
p_bin_size = int(sys.argv[4])
s_group = int(sys.argv[5])        #sub-group by feature
s_bin_size = int(sys.argv[6])
mono_threshold = 9
non_mono_threshold = 4
p_group_cols = [p_group, p_group+7]
s_group_cols = [s_group, s_group+7]
num_generations = int(sys.argv[7])
region = sys.argv[8] 
int_file = sys.argv[9]
if int_file != "None": #User has specified an interval file
    try:
        fint = open(int_file, 'r')
        dbkey_i = sys.argv[10]
        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
    except:
        stop_err("Unable to open input Interval file")
    
def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()

def reverse_complement(text):
    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
    comp = [ch for ch in text.translate(DNA_COMP)]
    comp.reverse()
    return "".join(comp)

def get_unique_elems(elems):
    seen=set()
    return[x for x in elems if x not in seen and not seen.add(x)]

def get_binned_lists(uniqlist, binsize):
    binnedlist=[]
    uniqlist.sort()
    start = int(uniqlist[0])
    bin_ind=0
    l_ind=0
    binnedlist.append([])
    while l_ind < len(uniqlist):
        elem = int(uniqlist[l_ind])
        if elem in range(start,start+binsize):
            binnedlist[bin_ind].append(elem)
        else:
            start += binsize
            bin_ind += 1
            binnedlist.append([])
            binnedlist[bin_ind].append(elem)
        l_ind += 1
    return binnedlist

def fetch_weight(H,C,t):
    if (H-(C-H)) < t:
        return 2.0
    else:
        return 1.0

def mutabilityEstimator(repeats1,repeats2,thresholds):
    mut_num = 0.0    #Mutability Numerator
    mut_den = 0.0    #Mutability denominator
    for ind,H in enumerate(repeats1):
        C = repeats2[ind]
        t = thresholds[ind]
        w = fetch_weight(H,C,t)
        mut_num += ((H-C)*(H-C)*w)
        mut_den += w
    return [mut_num, mut_den]

def output_writer(blk, blk_lines):
    global winspecies, speciesind
    all_elems_1=[]
    all_elems_2=[]
    all_s_elems_1=[]
    all_s_elems_2=[]
    for bline in blk_lines:
        if not(bline):
            continue
        items = bline.split('\t')
        seq1 = items[1]
        start1 = items[2]
        end1 = items[3]
        seq2 = items[8]
        start2 = items[9]
        end2 = items[10] 
        if p_group_cols[0] == 6:
            items[p_group_cols[0]] = int(items[p_group_cols[0]])
            items[p_group_cols[1]] = int(items[p_group_cols[1]])
        if s_group_cols[0] == 6:
            items[s_group_cols[0]] = int(items[s_group_cols[0]])
            items[s_group_cols[1]] = int(items[s_group_cols[1]])
        all_elems_1.append(items[p_group_cols[0]])    #primary col elements for species 1
        all_elems_2.append(items[p_group_cols[1]])    #primary col elements for species 2
        if s_group_cols[0] != -1:    #sub-group is not None
            all_s_elems_1.append(items[s_group_cols[0]])    #secondary col elements for species 1
            all_s_elems_2.append(items[s_group_cols[1]])    #secondary col elements for species 2
    uniq_elems_1 = get_unique_elems(all_elems_1)
    uniq_elems_2 = get_unique_elems(all_elems_2)
    if s_group_cols[0] != -1:
        uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
        uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
    mut1={}
    mut2={}
    count1 = {}
    count2 = {}
    """
    if p_group_cols[0] == 7:    #i.e. the option chosen is group-by unit(AG, GTC, etc)
        uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
    """
    if p_group_cols[0] == 6:    #i.e. the option chosen is group-by repeat number.
        uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
        uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
        
    if s_group_cols[0] == 6:    #i.e. the option chosen is subgroup-by repeat number.
        uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
        uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)

    for pitem1 in uniq_elems_1:
        #repeats1 = []
        #repeats2 = []
        thresholds = []
        if s_group_cols[0] != -1:    #Sub-group by feature is not None
            for sitem1 in uniq_s_elems_1:
                repeats1 = []
                repeats2 = []
                if type(sitem1) == type(''):
                    sitem1 = sitem1.strip()
                for bline in blk_lines:
                    belems = bline.split('\t')
                    if type(pitem1) == list:
                        if p_group_cols[0] == 6:
                            belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
                        if belems[p_group_cols[0]] in pitem1:
                            if belems[s_group_cols[0]]==sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:    
                                    if winspecies == 1:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
                    else:
                        if type(sitem1) == list:
                            if s_group_cols[0] == 6:
                                belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:    
                                    if winspecies == 1:
                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
                        else:
                            if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
                                repeats1.append(int(belems[6]))
                                repeats2.append(int(belems[13]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
                                if region == 'align':
                                    count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
                                else:    
                                    if winspecies == 1:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
                                    elif winspecies == 2:
                                        count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
        else:   #Sub-group by feature is None
            for bline in blk_lines:
                belems = bline.split('\t')
                if type(pitem1) == list:
                    #print >>sys.stderr, "item: " + str(item1)
                    if p_group_cols[0] == 6:
                        belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
                    if belems[p_group_cols[0]] in pitem1:
                        repeats1.append(int(belems[6]))
                        repeats2.append(int(belems[13]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
                else:
                    if belems[p_group_cols[0]]==pitem1:
                        repeats1.append(int(belems[6]))
                        repeats2.append(int(belems[13]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
            mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
            if region == 'align':
                count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
            else:            
                if winspecies == 1:
                    count1[str(pitem1)]=sum(repeats1)
                elif winspecies == 2:
                    count1[str(pitem1)]=sum(repeats2)
                
    for pitem2 in uniq_elems_2:
        #repeats1 = []
        #repeats2 = []
        thresholds = []
        if s_group_cols[0] != -1:    #Sub-group by feature is not None
            for sitem2 in uniq_s_elems_2:
                repeats1 = []
                repeats2 = []
                if type(sitem2)==type(''):
                    sitem2 = sitem2.strip()
                for bline in blk_lines:
                    belems = bline.split('\t')
                    if type(pitem2) == list:
                        if p_group_cols[0] == 6:
                            belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
                        if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
                            repeats2.append(int(belems[13]))
                            repeats1.append(int(belems[6]))
                            if belems[4] == 'mononucleotide':
                                thresholds.append(mono_threshold)
                            else:
                                thresholds.append(non_mono_threshold)
                            mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                            #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
                            if region == 'align':
                                count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                            else: 
                                if winspecies == 1:
                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                elif winspecies == 2:
                                    count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
                    else:
                        if type(sitem2) == list:
                            if s_group_cols[0] == 6:
                                belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
                                repeats2.append(int(belems[13]))
                                repeats1.append(int(belems[6]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                                if region == 'align':
                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                                else: 
                                    if winspecies == 1:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                    elif winspecies == 2:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
                        else:
                            if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
                                repeats1.append(int(belems[13]))
                                repeats2.append(int(belems[6]))
                                if belems[4] == 'mononucleotide':
                                    thresholds.append(mono_threshold)
                                else:
                                    thresholds.append(non_mono_threshold)
                                mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
                                if region == 'align':
                                    count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
                                else: 
                                    if winspecies == 1:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
                                    elif winspecies == 2:
                                        count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
        else:   #Sub-group by feature is None
            for bline in blk_lines:
                belems = bline.split('\t')
                if type(pitem2) == list:
                    if p_group_cols[0] == 6:
                        belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
                    if belems[p_group_cols[1]] in pitem2:
                        repeats2.append(int(belems[13]))
                        repeats1.append(int(belems[6]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
                else:
                    if belems[p_group_cols[1]]==pitem2:
                        repeats2.append(int(belems[13]))
                        repeats1.append(int(belems[6]))
                        if belems[4] == 'mononucleotide':
                            thresholds.append(mono_threshold)
                        else:
                            thresholds.append(non_mono_threshold)
            mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
            if region == 'align':
                count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
            else:
                if winspecies == 1:
                    count2["%s" %(pitem2)]=sum(repeats2)
                elif winspecies == 2:
                    count2["%s" %(pitem2)]=sum(repeats1)
    for key in mut1.keys():
        if key in mut2.keys():
            mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
            count = count1[key]
            del mut2[key]
        else:
            unit_found = False
            if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
                if p_group_cols[0] == 7:
                    this,other = 0,1
                else:
                    this,other = 1,0
                groups1 = key.split('\t')
                mutn = mut1[key][0]
                mutd = mut1[key][1]
                count = 0
                for key2 in mut2.keys():
                    groups2 = key2.split('\t')
                    if groups1[other] == groups2[other]:
                        if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2:
                            #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
                            mutn += mut2[key2][0]
                            mutd += mut2[key2][1]
                            count += int(count2[key2])
                            unit_found = True
                            del mut2[key2]
                            #break
            if unit_found:
                mut = mutn/mutd
            else:
                mut = mut1[key][0]/mut1[key][1]
                count = count1[key]
        mut = "%.2e" %(mut/num_generations)
        if region == 'align':
            print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
        elif region == 'win':
            fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
            fout.flush()
            
    #catch any remaining repeats, for instance if the orthologous position contained different repeat units
    for remaining_key in mut2.keys():
        mut = mut2[remaining_key][0]/mut2[remaining_key][1]
        mut = "%.2e" %(mut/num_generations)
        count = count2[remaining_key]
        if region == 'align':
            print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
        elif region == 'win':
            fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
            fout.flush()
            #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)

def counter(node, start, end, report_func):
    if start <= node.start < end and start < node.end <= end:
        report_func(node) 
        if node.right:
            counter(node.right, start, end, report_func)
        if node.left:
            counter(node.left, start, end, report_func)
    elif node.start < start and node.right:
        counter(node.right, start, end, report_func)
    elif node.start >= end and node.left and node.left.maxend > start:
        counter(node.left, start, end, report_func)
            
        
def main():
    infile = sys.argv[1]
    
    for i, line in enumerate( file ( infile )):
        line = line.rstrip('\r\n')
        if len( line )>0 and not line.startswith( '#' ):
            elems = line.split( '\t' )
            break
        if i == 30:
            break # Hopefully we'll never get here...
    
    if len( elems ) != 15:
        stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
    global winspecies, speciesind
    if region == 'win':
        if dbkey_i in elems[1]:
            winspecies = 1
            speciesind = 1 
        elif dbkey_i in elems[8]:
            winspecies = 2
            speciesind = 8
        else:
            stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") 
        
    fin = open(infile, 'r')
    skipped = 0
    blk=0
    win=0
    linestr=""
    
    if region == 'win':
        
        msats = NiceReaderWrapper( fileinput.FileInput( infile ),
                                chrom_col = speciesind,
                                start_col = speciesind+1,
                                end_col = speciesind+2,
                                strand_col = -1,
                                fix_strand = True)
        msatTree = quicksect.IntervalTree()
        for item in msats:
            if type( item ) is GenomicInterval:
                msatTree.insert( item, msats.linenum, item.fields )
        
        for iline in fint:
            try:
                iline = iline.rstrip('\r\n')
                if not(iline) or iline == "":
                    continue
                ielems = iline.strip("\r\n").split('\t')
                ichr = ielems[chr_col_i]
                istart = int(ielems[start_col_i])
                iend = int(ielems[end_col_i])
                isrc = "%s.%s" %(dbkey_i,ichr)
                if isrc not in msatTree.chroms:
                    continue
                result = []
                root = msatTree.chroms[isrc]    #root node for the chrom
                counter(root, istart, iend, lambda node: result.append( node ))
                if not(result):
                    continue
                tmpfile1 = tempfile.NamedTemporaryFile('wb+')
                for node in result:
                    tmpfile1.write("%s\n" % "\t".join( node.other ))
                
                tmpfile1.seek(0)
                output_writer(iline, tmpfile1.readlines())
            except:
                skipped+=1
        if skipped:
            print "Skipped %d intervals as invalid." %(skipped)
    elif region == 'align':
        if s_group_cols[0] != -1:
            print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
        else:
            print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
        prev_bnum = -1
        try:
            for line in fin:
                line = line.strip("\r\n")
                if not(line) or line == "":
                    continue
                elems = line.split('\t')
                try:
                    assert int(elems[0])
                    assert len(elems) == 15
                except:
                    continue
                new_bnum = int(elems[0])
                if new_bnum != prev_bnum:
                    if prev_bnum != -1:
                        output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
                    linestr = line + "\n"
                else:
                    linestr += line
                    linestr += "\n"
                prev_bnum = new_bnum
            output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
        except Exception, ea:
            print >>sys.stderr, ea
            skipped += 1
        if skipped:
            print "Skipped %d lines as invalid." %(skipped)
if __name__ == "__main__":
    main()