Mercurial > repos > nanettec > frequency_sliding
view frequency_sliding/frequency_sliding.py @ 0:9b835eab8a1d draft
Uploaded
author | nanettec |
---|---|
date | Fri, 18 Mar 2016 05:22:30 -0400 |
parents | |
children |
line wrap: on
line source
""" @summary: Calculate the frequency of eQTLs and genes per sliding window bin @author: nanette.coetzer@gmail.com @version 5 """ import optparse, sys import subprocess import tempfile import os, re def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option("-m", "--rscript", default=None, dest="rscript", help="R script") parser.add_option("-i", "--input1", default=None, dest="input1", help="Frequency per bin file") parser.add_option("-j", "--input2", default=None, dest="input2", help="Lookup summary file") parser.add_option("-k", "--input3", default=None, dest="input3", help="Frequency summary file") parser.add_option("-o", "--output1", default=None, dest="output1", help="Sliding frequency file") parser.add_option("-p", "--output2", default=None, dest="output2", help="Sliding lookup file") parser.add_option("-q", "--output3", default=None, dest="output3", help="eQTL and genes polygon plot") (options, args) = parser.parse_args() try: open(options.input1, "r").close() except TypeError, e: stop_err("You need to supply the Lookup frequency file:\n" + str(e)) except IOError, e: stop_err("Can not open the Lookup frequency file:\n" + str(e)) try: open(options.input2, "r").close() except TypeError, e: stop_err("You need to supply the Lookup summary file:\n" + str(e)) except IOError, e: stop_err("Can not open the Lookup summary file:\n" + str(e)) try: open(options.input3, "r").close() except TypeError, e: stop_err("You need to supply the Frequency summary file:\n" + str(e)) except IOError, e: stop_err("Can not open the Frequency summary file:\n" + str(e)) ############################################## ############################################## infile3 = open(options.input3,'r') # Frequency summary file for line in infile3: if line.startswith("Number"): num_intervals = int(line.strip().split("\t")[1]) #print num_intervals infile3.close() chr_second_last = [] chr_end = [] tot = 0 infile2 = open(options.input2,'r') # Lookup summary file for line in infile2: l = line.strip().split("\t") if not l[4].startswith("i"): tot += int(l[4]) chr_second_last.append(tot-2) chr_end.append(tot) freq_dict = {} sliding_id = 0 prev_size = 0 idlist = [] slidingdict = {} cur_size = 0 add_last = 0 i = 0 sizelist = [] infile1 = open(options.input1,'r') # Frequency.txt # 1b is where an interval always starts with a 2cM bin (plus a small bin if available) --> never split 2 bins on either sides of a marker if num_intervals == 1: for line in infile1: l2 = line.strip().split("\t") if not l2[0].startswith("int"): i += 1 freq_dict[l2[0]] = l2 if prev_size == 0: # end of chromosome if int(l2[0]) in chr_end: pass else: prev_size = float(l2[6]) idlist.append(int(l2[0])) sizelist.append(float(l2[6])) cur_size = float(l2[6]) add_last = 0 else: prev_size = float(l2[6]) if prev_size < 2 and add_last == 0: cur_size = float(cur_size) + float(l2[6]) idlist.append(int(l2[0])) sizelist.append(float(l2[6])) elif prev_size == 2: sliding_id = sliding_id + 1 slidingdict[int(sliding_id)] = idlist idlist = [int(l2[0])] sizelist = [float(l2[6])] prev_size = float(l2[6]) cur_size = float(l2[6]) if add_last == 1: cur_size = float(cur_size) + float(l2[6]) idlist.append(int(l2[0])) sizelist.append(float(l2[6])) sliding_id = sliding_id + 1 slidingdict[sliding_id] = idlist prev_size = 0 idlist = [] sizelist = [] cur_size = 0 if int(l2[0]) in chr_second_last: add_last = 1 # 1 is where an interval starts with a small bin if available (plus a 2cM bin) if num_intervals == 9: for line in infile1: l2 = line.strip().split("\t") if not l2[0].startswith("int"): i += 1 freq_dict[l2[0]] = l2 print slidingdict if prev_size == 0: # end of chromosome if int(l2[0]) in chr_end: pass else: prev_size = float(l2[6]) idlist.append(int(l2[0])) cur_size = float(l2[6]) add_last = 0 if float(cur_size) >= 2 and add_last == 0: sliding_id = sliding_id + 1 slidingdict[int(sliding_id)] = idlist prev_size = float(l2[6]) idlist = [] cur_size = float(0) else: cur_size = float(cur_size) + float(l2[6]) idlist.append(int(l2[0])) prev_size = float(l2[6]) if add_last == 1: sliding_id = sliding_id + 1 slidingdict[sliding_id] = idlist prev_size = 0 idlist = [] cur_size = 0 if int(l2[0]) in chr_second_last: add_last = 1 if float(cur_size) >= 2 and add_last == 0: sliding_id = sliding_id + 1 slidingdict[int(sliding_id)] = idlist prev_size = float(l2[6]) idlist = [] cur_size = float(0) if num_intervals == 2: for line in infile1: l2 = line.strip().split("\t") if not l2[0].startswith("int"): i += 1 freq_dict[l2[0]] = l2 if prev_size == 0: # end of chromosome if int(l2[0]) in chr_end: pass else: prev_size = float(l2[6]) idlist.append(int(l2[0])) cur_size = float(l2[6]) add_last = 0 else: cur_size = float(cur_size) + float(l2[6]) idlist.append(int(l2[0])) prev_size = float(l2[6]) if add_last == 1: sliding_id = sliding_id + 1 slidingdict[sliding_id] = idlist prev_size = 0 idlist = [] cur_size = 0 if int(l2[0]) in chr_second_last: add_last = 1 if float(cur_size) >= 4 and add_last == 0: sliding_id = sliding_id + 1 slidingdict[int(sliding_id)] = idlist prev_size = float(l2[6]) idlist = [int(l2[0])] cur_size = float(prev_size) if num_intervals == 3: for line in infile1: l2 = line.strip().split("\t") if not l2[0].startswith("int"): i += 1 freq_dict[l2[0]] = l2 if prev_size == 0: # end of chromosome if int(l2[0]) in chr_end: pass else: prev_size = float(l2[6]) idlist.append(int(l2[0])) sizelist.append(float(l2[6])) cur_size = float(l2[6]) add_last = 0 else: cur_size = float(cur_size) + float(l2[6]) idlist.append(int(l2[0])) sizelist.append(float(l2[6])) prev_size = float(l2[6]) if add_last == 1: sliding_id = sliding_id + 1 slidingdict[sliding_id] = idlist prev_size = 0 idlist = [] sizelist = [] cur_size = 0 if int(l2[0]) in chr_second_last: add_last = 1 if float(cur_size) >= 6 and add_last == 0: sliding_id = sliding_id + 1 slidingdict[int(sliding_id)] = idlist prev_size = float(l2[6]) newsizelist = [] newidlist = [] newstart = 0 c = 0 cur_size = 0 for s in range(len(sizelist)): if newstart == 1: newsizelist.append(sizelist[s]) newidlist.append(idlist[s]) cur_size += float(sizelist[s]) if sizelist[s] == 2.0 and newstart == 0: c += 1 if c == 2: newsizelist.append(sizelist[s]) newidlist.append(idlist[s]) cur_size += float(sizelist[s]) newstart = 1 idlist = newidlist sizelist = newsizelist out1 = open(options.output1,'w') # Frequency.txt out2 = open(options.output2,'w') #out1.write("sliding.id\tchr\tsliding.cM\tsliding.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.eQTL/cM\tsliding.eQTL/cM/10genes\n") out1.write("sliding.id\tchr\tsliding.cM\tsliding.all.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.all.eQTL/cM\tsliding.cis.eQTL/cM\tsliding.trans.eQTL/cM\tsliding.genes/cM\n") tcM = 0 tEQTL = 0 tGENES = 0 for j in range(len(slidingdict.keys())): totcM = 0 toteqtl = 0 totcis = 0 tottrans = 0 totgenes = 0 totEcM_all = 0 totEcM_cis = 0 totEcM_trans = 0 totGcM = 0 #totEcM10G = 0 out2.write(str(j+1)+"\t"+str(slidingdict[j+1])+"\n") for k in slidingdict[j+1]: totcM += float(freq_dict[str(k)][6]) toteqtl += float(freq_dict[str(k)][7]) totcis += float(freq_dict[str(k)][8]) tottrans += float(freq_dict[str(k)][9]) totgenes += float(freq_dict[str(k)][10]) chr = freq_dict[str(k)][1] if totgenes == 0: totgenes = 1 totEcM_all += round((toteqtl/totcM),2) totEcM_cis += round((totcis/totcM),2) totEcM_trans += round((tottrans/totcM),2) totGcM += round((totgenes/totcM),2) out1.write(str(j+1)+"\t"+str(chr)+"\t"+str(totcM)+"\t"+str(toteqtl)+"\t"+str(totcis)+"\t"+str(tottrans)+"\t"+str(totgenes)+"\t"+str(totEcM_all)+"\t"+str(totEcM_cis)+"\t"+str(totEcM_trans)+"\t"+str(totGcM)+"\n") out1.close() infile1.close() infile2.close() out2.close() # Create temp direcotry tempdir = tempfile.mkdtemp() # copy INPUT file to the temp directory s = "cp %s %s/sliding_frequency.txt" %(options.output1, tempdir) subprocess.call(s, shell=True) # create R script => save in temp directory # generate new header new_script = open(tempdir+"/new_script.r","w") header = "setwd(\"%s\")" %tempdir new_script.write(header+"\n") # add script body script = open(options.rscript,"r") for line in script: new_script.write(line.strip()+"\n") new_script.close() # run R script from temp directory s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # run R script from temp directory s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) os.system("mv %s/Rplot_eQTL_genes_polygon.pdf %s" %(tempdir,options.output3)) ############################################## if __name__=="__main__": __main__()