view frequency_sliding/frequency_sliding.py @ 0:9b835eab8a1d draft

Uploaded
author nanettec
date Fri, 18 Mar 2016 05:22:30 -0400
parents
children
line wrap: on
line source

"""
@summary: Calculate the frequency of eQTLs and genes per sliding window bin
@author: nanette.coetzer@gmail.com
@version 5

"""
import optparse, sys
import subprocess
import tempfile
import os, re

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()
 
def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option("-m", "--rscript", default=None, dest="rscript", 
                      help="R script")
    parser.add_option("-i", "--input1", default=None, dest="input1", 
                      help="Frequency per bin file")
    parser.add_option("-j", "--input2", default=None, dest="input2", 
                      help="Lookup summary file")
    parser.add_option("-k", "--input3", default=None, dest="input3", 
                      help="Frequency summary file")
    parser.add_option("-o", "--output1", default=None, dest="output1", 
                      help="Sliding frequency file")
    parser.add_option("-p", "--output2", default=None, dest="output2", 
                      help="Sliding lookup file")
    parser.add_option("-q", "--output3", default=None, dest="output3", 
                      help="eQTL and genes polygon plot")
    (options, args) = parser.parse_args()
    
    try:
        open(options.input1, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Lookup frequency file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Lookup frequency file:\n" + str(e))
        
    try:
        open(options.input2, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Lookup summary file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Lookup summary file:\n" + str(e))
        
    try:
        open(options.input3, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Frequency summary file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Frequency summary file:\n" + str(e))
        
    ##############################################
    ##############################################

    infile3 = open(options.input3,'r')  # Frequency summary file
    for line in infile3:
        if line.startswith("Number"):
            num_intervals = int(line.strip().split("\t")[1])
            #print num_intervals
    infile3.close()
    
    chr_second_last = []
    chr_end = []
    tot = 0
    infile2 = open(options.input2,'r')  # Lookup summary file
    for line in infile2:
        l = line.strip().split("\t")
        if not l[4].startswith("i"):
            tot += int(l[4])
            chr_second_last.append(tot-2)
            chr_end.append(tot)
    
    freq_dict = {}
    sliding_id = 0
    prev_size = 0
    idlist = []
    slidingdict = {}
    cur_size = 0
    add_last = 0
    i = 0
    sizelist = []
    
    infile1 = open(options.input1,'r')  # Frequency.txt
    
    # 1b is where an interval always starts with a 2cM bin (plus a small bin if available)  -->  never split 2 bins on either sides of a marker
    if num_intervals == 1:
        for line in infile1:
            l2 = line.strip().split("\t")
            if not l2[0].startswith("int"):
                i += 1
                freq_dict[l2[0]] = l2
                if prev_size == 0:
                    # end of chromosome
                    if int(l2[0]) in chr_end:
                        pass
                    else:
                        prev_size = float(l2[6])
                        idlist.append(int(l2[0]))
                        sizelist.append(float(l2[6]))
                        cur_size = float(l2[6])
                        add_last = 0
                else:
                    prev_size = float(l2[6])
                    
                    if prev_size < 2 and add_last == 0:
                        cur_size = float(cur_size) + float(l2[6])
                        idlist.append(int(l2[0]))
                        sizelist.append(float(l2[6]))
                    elif prev_size == 2:
                        sliding_id = sliding_id + 1
                        slidingdict[int(sliding_id)] = idlist
                        idlist = [int(l2[0])]
                        sizelist = [float(l2[6])]
                        prev_size = float(l2[6])
                        cur_size = float(l2[6])
                        
                    if add_last == 1:
                        cur_size = float(cur_size) + float(l2[6])
                        idlist.append(int(l2[0]))
                        sizelist.append(float(l2[6]))
                        sliding_id = sliding_id + 1
                        slidingdict[sliding_id] = idlist
                        prev_size = 0
                        idlist = []
                        sizelist = []
                        cur_size = 0
                        
                    if int(l2[0]) in chr_second_last:
                        add_last = 1
                        
    # 1 is where an interval starts with a small bin if available (plus a 2cM bin) 
    if num_intervals == 9:
        for line in infile1:
            l2 = line.strip().split("\t")
            if not l2[0].startswith("int"):
                i += 1
                freq_dict[l2[0]] = l2
                print slidingdict
                if prev_size == 0:
                    # end of chromosome
                    if int(l2[0]) in chr_end:
                        pass
                    else:
                        prev_size = float(l2[6])
                        idlist.append(int(l2[0]))
                        cur_size = float(l2[6])
                        add_last = 0
                        if float(cur_size) >= 2 and add_last == 0:
                            sliding_id = sliding_id + 1
                            slidingdict[int(sliding_id)] = idlist
        
                            prev_size = float(l2[6])
                            idlist = []
                            cur_size = float(0)                
                else:
                    cur_size = float(cur_size) + float(l2[6])
                    idlist.append(int(l2[0]))
                    prev_size = float(l2[6])
                    if add_last == 1:
                        sliding_id = sliding_id + 1
                        slidingdict[sliding_id] = idlist
                        prev_size = 0
                        idlist = []
                        cur_size = 0
                        
                    if int(l2[0]) in chr_second_last:
                        add_last = 1
                    if float(cur_size) >= 2 and add_last == 0:
                        sliding_id = sliding_id + 1
                        slidingdict[int(sliding_id)] = idlist
                        prev_size = float(l2[6])
                        idlist = []
                        cur_size = float(0)

                    
    if num_intervals == 2:
        for line in infile1:
            l2 = line.strip().split("\t")
            if not l2[0].startswith("int"):
                i += 1
                freq_dict[l2[0]] = l2
                if prev_size == 0:
                    # end of chromosome
                    if int(l2[0]) in chr_end:
                        pass
                    else:
                        prev_size = float(l2[6])
                        idlist.append(int(l2[0]))
                        cur_size = float(l2[6])
                        add_last = 0  
                else:
                    cur_size = float(cur_size) + float(l2[6])
                    idlist.append(int(l2[0]))
                    prev_size = float(l2[6])
                    if add_last == 1:
                        sliding_id = sliding_id + 1
                        slidingdict[sliding_id] = idlist
                        prev_size = 0
                        idlist = []
                        cur_size = 0
                        
                    if int(l2[0]) in chr_second_last:
                        add_last = 1
                    if float(cur_size) >= 4 and add_last == 0:
                        sliding_id = sliding_id + 1
                        slidingdict[int(sliding_id)] = idlist
    
                        prev_size = float(l2[6])
                        idlist = [int(l2[0])]
                        cur_size = float(prev_size)
    
    if num_intervals == 3:
        for line in infile1:
            l2 = line.strip().split("\t")
            if not l2[0].startswith("int"):
                i += 1
                freq_dict[l2[0]] = l2
                if prev_size == 0:
                    # end of chromosome
                    if int(l2[0]) in chr_end:
                        pass
                    else:
                        prev_size = float(l2[6])
                        idlist.append(int(l2[0]))
                        sizelist.append(float(l2[6]))
                        cur_size = float(l2[6])
                        add_last = 0
                else:
                    cur_size = float(cur_size) + float(l2[6])
                    idlist.append(int(l2[0]))
                    sizelist.append(float(l2[6]))
                    prev_size = float(l2[6])
                    if add_last == 1:
                        sliding_id = sliding_id + 1
                        slidingdict[sliding_id] = idlist
                        prev_size = 0
                        idlist = []
                        sizelist = []
                        cur_size = 0
                        
                    if int(l2[0]) in chr_second_last:
                        add_last = 1
                    if float(cur_size) >= 6 and add_last == 0:
                        sliding_id = sliding_id + 1
                        slidingdict[int(sliding_id)] = idlist
                        prev_size = float(l2[6])
                        newsizelist = []
                        newidlist = []
                        newstart = 0
                        c = 0
                        cur_size = 0
                        for s in range(len(sizelist)):
                            if newstart == 1:
                                newsizelist.append(sizelist[s])
                                newidlist.append(idlist[s])
                                cur_size += float(sizelist[s])
                            if sizelist[s] == 2.0 and newstart == 0:
                                c += 1
                                if c == 2:
                                    newsizelist.append(sizelist[s])
                                    newidlist.append(idlist[s])
                                    cur_size += float(sizelist[s])
                                    newstart = 1
                        idlist = newidlist
                        sizelist = newsizelist
                    
    out1 = open(options.output1,'w')  # Frequency.txt
    out2 = open(options.output2,'w') 
    #out1.write("sliding.id\tchr\tsliding.cM\tsliding.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.eQTL/cM\tsliding.eQTL/cM/10genes\n")
    out1.write("sliding.id\tchr\tsliding.cM\tsliding.all.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.all.eQTL/cM\tsliding.cis.eQTL/cM\tsliding.trans.eQTL/cM\tsliding.genes/cM\n")
    tcM = 0
    tEQTL = 0
    tGENES = 0
    
    for j in range(len(slidingdict.keys())):
        totcM = 0
        toteqtl = 0
        totcis = 0
        tottrans = 0
        totgenes = 0
        totEcM_all = 0
        totEcM_cis = 0
        totEcM_trans = 0
        totGcM = 0
        #totEcM10G = 0
        out2.write(str(j+1)+"\t"+str(slidingdict[j+1])+"\n")
        for k in slidingdict[j+1]:
            totcM += float(freq_dict[str(k)][6])
            toteqtl += float(freq_dict[str(k)][7])
            totcis += float(freq_dict[str(k)][8])
            tottrans += float(freq_dict[str(k)][9])
            totgenes += float(freq_dict[str(k)][10])
            
        chr = freq_dict[str(k)][1]
        if totgenes == 0:
            totgenes = 1

        totEcM_all += round((toteqtl/totcM),2)
        totEcM_cis += round((totcis/totcM),2)
        totEcM_trans += round((tottrans/totcM),2)
        totGcM += round((totgenes/totcM),2)
        out1.write(str(j+1)+"\t"+str(chr)+"\t"+str(totcM)+"\t"+str(toteqtl)+"\t"+str(totcis)+"\t"+str(tottrans)+"\t"+str(totgenes)+"\t"+str(totEcM_all)+"\t"+str(totEcM_cis)+"\t"+str(totEcM_trans)+"\t"+str(totGcM)+"\n")
    
    out1.close()
    infile1.close()
    infile2.close()
    out2.close()
    

    # Create temp direcotry
    tempdir = tempfile.mkdtemp()

    # copy INPUT file to the temp directory
    s = "cp %s %s/sliding_frequency.txt" %(options.output1, tempdir)
    subprocess.call(s, shell=True)

    # create R script => save in temp directory
    # generate new header 
    new_script = open(tempdir+"/new_script.r","w")
    header = "setwd(\"%s\")" %tempdir
    new_script.write(header+"\n")
    # add script body
    script = open(options.rscript,"r")
    for line in script:
        new_script.write(line.strip()+"\n")
    new_script.close()
    
    # run R script from temp directory
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    
    # run R script from temp directory
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    
    os.system("mv %s/Rplot_eQTL_genes_polygon.pdf %s" %(tempdir,options.output3))
    
    ##############################################
    
if __name__=="__main__": 
    __main__()