comparison Contra/baseline.py @ 0:7564f3b1e675

Uploaded
author fcaramia
date Thu, 13 Sep 2012 02:31:43 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7564f3b1e675
1 #!/usr/bin/python
2
3 import os
4 import sys
5 import fnmatch
6 import shlex
7 import subprocess
8 import shutil
9 import math
10 from optparse import OptionParser
11 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome
12 from scripts.get_chr_length import *
13 from scripts.convert_targeted_regions import *
14 from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate
15 from multiprocessing import Pool
16
17 class Params:
18 """
19 Class for top-level system parameters:
20 """
21
22 def __init__(self):
23
24 def mult_files(option, opt_str, value, parser):
25 args=[]
26 for arg in parser.rargs:
27 if arg[0] != "-":
28 args.append(arg)
29 else:
30 del parser.rargs[:len(args)]
31 break
32 if getattr(parser.values, option.dest):
33 args.extend(getattr(parser.values, option.dest))
34 setattr(parser.values, option.dest, args)
35
36
37 #command-line option definition
38 self.parser = OptionParser()
39 self.parser.add_option("-t", "--target",
40 help="Target region definition file [REQUIRED] [BED Format]",
41 action="store", type="string", dest="target")
42 self.parser.add_option("-f", "--files",
43 help="Files to be converted to baselines [REQUIRED] [BAM]",
44 action="callback", callback=mult_files, dest="files")
45 self.parser.add_option("-o", "--output",
46 help="Output folder [REQUIRED]",
47 action="store", type="string", dest="output")
48 self.parser.add_option("-c","--trim",
49 help="Portion of outliers to be removed before calculating average [Default: 0.2]",
50 action="store", dest="trim", default="0.2")
51 self.parser.add_option("-n","--name",
52 help="Output baseline name [Default: baseline]",
53 action="store", dest="name", default="baseline")
54
55 # required parameters list:
56 self.ERRORLIST = []
57
58 #change system parameters based on any command line arguments
59 (options, args) = self.parser.parse_args()
60 if options.target:
61 self.TARGET=options.target
62 else:
63 self.ERRORLIST.append("target")
64
65 if options.files:
66 self.FILES=options.files
67 else:
68 self.ERRORLIST.append("files")
69
70 if options.output:
71 self.OUTPUT=options.output
72 else:
73 self.ERRORLIST.append("output")
74
75 if options.trim:
76 self.TRIM=float(options.trim)
77
78 if options.name:
79 self.NAME=options.name
80
81 if len(self.ERRORLIST) != 0:
82 self.parser.print_help()
83 self.parser.error("Missing required parameters")
84
85
86 def repeat(self):
87 #params test
88 print "target :", self.TARGET
89 print "files :", self.FILES
90 print "output :", self.OUTPUT
91 print "trim :", self.TRIM
92 print "name :", self.NAME
93
94
95 # option handling
96 params = Params()
97 #params.repeat()
98 targetFile = params.TARGET
99 infiles = params.FILES
100 output_dir = params.OUTPUT
101
102 #Debug
103 print " ------ baseline.py ------- "
104 print "Target:", targetFile
105 for files in infiles:
106 print "File:", files
107 print "Output Directory: ", output_dir
108
109 def make_new_directory(outdir):
110 print outdir
111 if not os.path.exists(outdir):
112 os.mkdir(outdir)
113
114 print " ----- creating output directory -----"
115 make_new_directory(output_dir)
116 outdir = os.path.join(output_dir, "buf")
117 make_new_directory(outdir)
118
119 targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted")
120 os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2))
121
122 def processInFile(infile):
123 infilename = os.path.basename(infile)
124 s_outdir = os.path.join(outdir, infilename)
125 make_new_directory(s_outdir)
126 genomeFile = os.path.join(s_outdir,infilename +'.chrsummary')
127 get_genome(infile, genomeFile)
128
129 bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH")
130 args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile))
131 iOutFile = open(bedgraph, "w")
132 output = subprocess.Popen(args, stdout = iOutFile).wait()
133 iOutFile.close()
134
135 targetList = convertTarget(targetFile2)
136
137 splitByChromosome(bedgraph)
138
139 convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/")
140 bedgraph_tgtonly = bedgraph+".TARGETONLY"
141 bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE"
142 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly)
143 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg)
144 shutil.copy(bedgraph_tgtonly,outdir)
145 shutil.copy(bedgraph_tgtonly_avg,outdir)
146
147 print "----- Processing Files -----"
148 pool = Pool(5)
149 pool.map(processInFile,infiles)
150
151
152 # STEP 2 -> Create Union BedGraph
153 allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")]
154 allfiles_path = [os.path.join(outdir,x) for x in allfiles_names]
155
156 args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names
157
158 print (str(args))
159 fo = os.path.join(outdir,"TARGETONLY.union.txt")
160 foh = open(fo,"w")
161
162 subprocess.Popen(args,stdout=foh).wait()
163 foh.close()
164
165 # STEP 3 -> POOL METHOD
166 TRIM = params.TRIM #TRIM = 0.2
167 f = fo
168 fh = open(f)
169
170 fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt")
171 fo_libsize=fo+".LIBSIZE"
172
173
174 foh=open(fo,'w')
175 fo_libsize_h=open(fo_libsize,'w')
176
177
178 fh.readline()
179
180 Ns = None
181 nSamples = None
182
183 lineCount=0
184 for line in fh:
185 lineCount+=1
186 line_elems=line.rstrip().split("\t")
187 rangeLen=int(line_elems[2])-int(line_elems[1])
188 if not Ns:
189 nSamples=len(line_elems)-3
190 Ns = [0 for x in range(nSamples)]
191 for i in range(nSamples):
192 ind=i+3
193 Ns[i] += int(line_elems[ind])*rangeLen
194
195
196 fh.close()
197 fh=open(f)
198 fh.readline()
199
200
201 lineCount=0
202 nSamples_exclude = int(math.floor(TRIM*nSamples))
203
204 def gm_mean(xs):
205 tmpprod = 1
206 p = 1.0/len(xs)
207 for x in xs:
208 tmpprod = tmpprod * math.pow(x,p)
209 return tmpprod
210
211 Ns_gmean = gm_mean(Ns)
212
213 def meanstdv(x):
214 from math import sqrt
215 n, mean, std = len(x), 0, 0
216 for a in x:
217 mean = mean+a
218 mean = mean / float(n)
219 for a in x:
220 std = std+(a-mean)**2
221 std=sqrt(std/float(n-1))
222 return mean, std
223
224
225 libsize = 0
226 for line in fh:
227 lineCount+=1
228 line_elems=line.rstrip().split("\t")
229 rangeLen=int(line_elems[2])-int(line_elems[1])
230 xs_tmp=[int(x) for x in line_elems[3:]]
231 xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))]
232 xs.sort()
233 xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)]
234 #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed))
235 trimmed_mean,std=meanstdv(xs_trimmed)
236 libsize+=rangeLen*trimmed_mean
237 foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n")
238
239
240 fo_libsize_h.write(str(int(round(libsize))))
241
242 fh.close()
243 foh.close()
244 fo_libsize_h.close()
245
246 def removeTempFolder(tempFolderPath):
247 import shutil
248
249 shutil.rmtree(tempFolderPath)
250 print "Temp Folder Removed"
251
252 # Removed Temp Folder
253 removeTempFolder(outdir)