Mercurial > repos > fcaramia > contra
comparison Contra/baseline.py @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7564f3b1e675 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import os | |
4 import sys | |
5 import fnmatch | |
6 import shlex | |
7 import subprocess | |
8 import shutil | |
9 import math | |
10 from optparse import OptionParser | |
11 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome | |
12 from scripts.get_chr_length import * | |
13 from scripts.convert_targeted_regions import * | |
14 from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate | |
15 from multiprocessing import Pool | |
16 | |
17 class Params: | |
18 """ | |
19 Class for top-level system parameters: | |
20 """ | |
21 | |
22 def __init__(self): | |
23 | |
24 def mult_files(option, opt_str, value, parser): | |
25 args=[] | |
26 for arg in parser.rargs: | |
27 if arg[0] != "-": | |
28 args.append(arg) | |
29 else: | |
30 del parser.rargs[:len(args)] | |
31 break | |
32 if getattr(parser.values, option.dest): | |
33 args.extend(getattr(parser.values, option.dest)) | |
34 setattr(parser.values, option.dest, args) | |
35 | |
36 | |
37 #command-line option definition | |
38 self.parser = OptionParser() | |
39 self.parser.add_option("-t", "--target", | |
40 help="Target region definition file [REQUIRED] [BED Format]", | |
41 action="store", type="string", dest="target") | |
42 self.parser.add_option("-f", "--files", | |
43 help="Files to be converted to baselines [REQUIRED] [BAM]", | |
44 action="callback", callback=mult_files, dest="files") | |
45 self.parser.add_option("-o", "--output", | |
46 help="Output folder [REQUIRED]", | |
47 action="store", type="string", dest="output") | |
48 self.parser.add_option("-c","--trim", | |
49 help="Portion of outliers to be removed before calculating average [Default: 0.2]", | |
50 action="store", dest="trim", default="0.2") | |
51 self.parser.add_option("-n","--name", | |
52 help="Output baseline name [Default: baseline]", | |
53 action="store", dest="name", default="baseline") | |
54 | |
55 # required parameters list: | |
56 self.ERRORLIST = [] | |
57 | |
58 #change system parameters based on any command line arguments | |
59 (options, args) = self.parser.parse_args() | |
60 if options.target: | |
61 self.TARGET=options.target | |
62 else: | |
63 self.ERRORLIST.append("target") | |
64 | |
65 if options.files: | |
66 self.FILES=options.files | |
67 else: | |
68 self.ERRORLIST.append("files") | |
69 | |
70 if options.output: | |
71 self.OUTPUT=options.output | |
72 else: | |
73 self.ERRORLIST.append("output") | |
74 | |
75 if options.trim: | |
76 self.TRIM=float(options.trim) | |
77 | |
78 if options.name: | |
79 self.NAME=options.name | |
80 | |
81 if len(self.ERRORLIST) != 0: | |
82 self.parser.print_help() | |
83 self.parser.error("Missing required parameters") | |
84 | |
85 | |
86 def repeat(self): | |
87 #params test | |
88 print "target :", self.TARGET | |
89 print "files :", self.FILES | |
90 print "output :", self.OUTPUT | |
91 print "trim :", self.TRIM | |
92 print "name :", self.NAME | |
93 | |
94 | |
95 # option handling | |
96 params = Params() | |
97 #params.repeat() | |
98 targetFile = params.TARGET | |
99 infiles = params.FILES | |
100 output_dir = params.OUTPUT | |
101 | |
102 #Debug | |
103 print " ------ baseline.py ------- " | |
104 print "Target:", targetFile | |
105 for files in infiles: | |
106 print "File:", files | |
107 print "Output Directory: ", output_dir | |
108 | |
109 def make_new_directory(outdir): | |
110 print outdir | |
111 if not os.path.exists(outdir): | |
112 os.mkdir(outdir) | |
113 | |
114 print " ----- creating output directory -----" | |
115 make_new_directory(output_dir) | |
116 outdir = os.path.join(output_dir, "buf") | |
117 make_new_directory(outdir) | |
118 | |
119 targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted") | |
120 os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2)) | |
121 | |
122 def processInFile(infile): | |
123 infilename = os.path.basename(infile) | |
124 s_outdir = os.path.join(outdir, infilename) | |
125 make_new_directory(s_outdir) | |
126 genomeFile = os.path.join(s_outdir,infilename +'.chrsummary') | |
127 get_genome(infile, genomeFile) | |
128 | |
129 bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") | |
130 args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile)) | |
131 iOutFile = open(bedgraph, "w") | |
132 output = subprocess.Popen(args, stdout = iOutFile).wait() | |
133 iOutFile.close() | |
134 | |
135 targetList = convertTarget(targetFile2) | |
136 | |
137 splitByChromosome(bedgraph) | |
138 | |
139 convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/") | |
140 bedgraph_tgtonly = bedgraph+".TARGETONLY" | |
141 bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE" | |
142 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly) | |
143 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg) | |
144 shutil.copy(bedgraph_tgtonly,outdir) | |
145 shutil.copy(bedgraph_tgtonly_avg,outdir) | |
146 | |
147 print "----- Processing Files -----" | |
148 pool = Pool(5) | |
149 pool.map(processInFile,infiles) | |
150 | |
151 | |
152 # STEP 2 -> Create Union BedGraph | |
153 allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] | |
154 allfiles_path = [os.path.join(outdir,x) for x in allfiles_names] | |
155 | |
156 args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names | |
157 | |
158 print (str(args)) | |
159 fo = os.path.join(outdir,"TARGETONLY.union.txt") | |
160 foh = open(fo,"w") | |
161 | |
162 subprocess.Popen(args,stdout=foh).wait() | |
163 foh.close() | |
164 | |
165 # STEP 3 -> POOL METHOD | |
166 TRIM = params.TRIM #TRIM = 0.2 | |
167 f = fo | |
168 fh = open(f) | |
169 | |
170 fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt") | |
171 fo_libsize=fo+".LIBSIZE" | |
172 | |
173 | |
174 foh=open(fo,'w') | |
175 fo_libsize_h=open(fo_libsize,'w') | |
176 | |
177 | |
178 fh.readline() | |
179 | |
180 Ns = None | |
181 nSamples = None | |
182 | |
183 lineCount=0 | |
184 for line in fh: | |
185 lineCount+=1 | |
186 line_elems=line.rstrip().split("\t") | |
187 rangeLen=int(line_elems[2])-int(line_elems[1]) | |
188 if not Ns: | |
189 nSamples=len(line_elems)-3 | |
190 Ns = [0 for x in range(nSamples)] | |
191 for i in range(nSamples): | |
192 ind=i+3 | |
193 Ns[i] += int(line_elems[ind])*rangeLen | |
194 | |
195 | |
196 fh.close() | |
197 fh=open(f) | |
198 fh.readline() | |
199 | |
200 | |
201 lineCount=0 | |
202 nSamples_exclude = int(math.floor(TRIM*nSamples)) | |
203 | |
204 def gm_mean(xs): | |
205 tmpprod = 1 | |
206 p = 1.0/len(xs) | |
207 for x in xs: | |
208 tmpprod = tmpprod * math.pow(x,p) | |
209 return tmpprod | |
210 | |
211 Ns_gmean = gm_mean(Ns) | |
212 | |
213 def meanstdv(x): | |
214 from math import sqrt | |
215 n, mean, std = len(x), 0, 0 | |
216 for a in x: | |
217 mean = mean+a | |
218 mean = mean / float(n) | |
219 for a in x: | |
220 std = std+(a-mean)**2 | |
221 std=sqrt(std/float(n-1)) | |
222 return mean, std | |
223 | |
224 | |
225 libsize = 0 | |
226 for line in fh: | |
227 lineCount+=1 | |
228 line_elems=line.rstrip().split("\t") | |
229 rangeLen=int(line_elems[2])-int(line_elems[1]) | |
230 xs_tmp=[int(x) for x in line_elems[3:]] | |
231 xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))] | |
232 xs.sort() | |
233 xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)] | |
234 #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) | |
235 trimmed_mean,std=meanstdv(xs_trimmed) | |
236 libsize+=rangeLen*trimmed_mean | |
237 foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n") | |
238 | |
239 | |
240 fo_libsize_h.write(str(int(round(libsize)))) | |
241 | |
242 fh.close() | |
243 foh.close() | |
244 fo_libsize_h.close() | |
245 | |
246 def removeTempFolder(tempFolderPath): | |
247 import shutil | |
248 | |
249 shutil.rmtree(tempFolderPath) | |
250 print "Temp Folder Removed" | |
251 | |
252 # Removed Temp Folder | |
253 removeTempFolder(outdir) |