0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import fnmatch
|
|
6 import shlex
|
|
7 import subprocess
|
|
8 import shutil
|
|
9 import math
|
|
10 from optparse import OptionParser
|
|
11 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome
|
|
12 from scripts.get_chr_length import *
|
|
13 from scripts.convert_targeted_regions import *
|
|
14 from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate
|
|
15 from multiprocessing import Pool
|
|
16
|
|
17 class Params:
|
|
18 """
|
|
19 Class for top-level system parameters:
|
|
20 """
|
|
21
|
|
22 def __init__(self):
|
|
23
|
|
24 def mult_files(option, opt_str, value, parser):
|
|
25 args=[]
|
|
26 for arg in parser.rargs:
|
|
27 if arg[0] != "-":
|
|
28 args.append(arg)
|
|
29 else:
|
|
30 del parser.rargs[:len(args)]
|
|
31 break
|
|
32 if getattr(parser.values, option.dest):
|
|
33 args.extend(getattr(parser.values, option.dest))
|
|
34 setattr(parser.values, option.dest, args)
|
|
35
|
|
36
|
|
37 #command-line option definition
|
|
38 self.parser = OptionParser()
|
|
39 self.parser.add_option("-t", "--target",
|
|
40 help="Target region definition file [REQUIRED] [BED Format]",
|
|
41 action="store", type="string", dest="target")
|
|
42 self.parser.add_option("-f", "--files",
|
|
43 help="Files to be converted to baselines [REQUIRED] [BAM]",
|
|
44 action="callback", callback=mult_files, dest="files")
|
|
45 self.parser.add_option("-o", "--output",
|
|
46 help="Output folder [REQUIRED]",
|
|
47 action="store", type="string", dest="output")
|
|
48 self.parser.add_option("-c","--trim",
|
|
49 help="Portion of outliers to be removed before calculating average [Default: 0.2]",
|
|
50 action="store", dest="trim", default="0.2")
|
|
51 self.parser.add_option("-n","--name",
|
|
52 help="Output baseline name [Default: baseline]",
|
|
53 action="store", dest="name", default="baseline")
|
|
54
|
|
55 # required parameters list:
|
|
56 self.ERRORLIST = []
|
|
57
|
|
58 #change system parameters based on any command line arguments
|
|
59 (options, args) = self.parser.parse_args()
|
|
60 if options.target:
|
|
61 self.TARGET=options.target
|
|
62 else:
|
|
63 self.ERRORLIST.append("target")
|
|
64
|
|
65 if options.files:
|
|
66 self.FILES=options.files
|
|
67 else:
|
|
68 self.ERRORLIST.append("files")
|
|
69
|
|
70 if options.output:
|
|
71 self.OUTPUT=options.output
|
|
72 else:
|
|
73 self.ERRORLIST.append("output")
|
|
74
|
|
75 if options.trim:
|
|
76 self.TRIM=float(options.trim)
|
|
77
|
|
78 if options.name:
|
|
79 self.NAME=options.name
|
|
80
|
|
81 if len(self.ERRORLIST) != 0:
|
|
82 self.parser.print_help()
|
|
83 self.parser.error("Missing required parameters")
|
|
84
|
|
85
|
|
86 def repeat(self):
|
|
87 #params test
|
|
88 print "target :", self.TARGET
|
|
89 print "files :", self.FILES
|
|
90 print "output :", self.OUTPUT
|
|
91 print "trim :", self.TRIM
|
|
92 print "name :", self.NAME
|
|
93
|
|
94
|
|
95 # option handling
|
|
96 params = Params()
|
|
97 #params.repeat()
|
|
98 targetFile = params.TARGET
|
|
99 infiles = params.FILES
|
|
100 output_dir = params.OUTPUT
|
|
101
|
|
102 #Debug
|
|
103 print " ------ baseline.py ------- "
|
|
104 print "Target:", targetFile
|
|
105 for files in infiles:
|
|
106 print "File:", files
|
|
107 print "Output Directory: ", output_dir
|
|
108
|
|
109 def make_new_directory(outdir):
|
|
110 print outdir
|
|
111 if not os.path.exists(outdir):
|
|
112 os.mkdir(outdir)
|
|
113
|
|
114 print " ----- creating output directory -----"
|
|
115 make_new_directory(output_dir)
|
|
116 outdir = os.path.join(output_dir, "buf")
|
|
117 make_new_directory(outdir)
|
|
118
|
|
119 targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted")
|
|
120 os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2))
|
|
121
|
|
122 def processInFile(infile):
|
|
123 infilename = os.path.basename(infile)
|
|
124 s_outdir = os.path.join(outdir, infilename)
|
|
125 make_new_directory(s_outdir)
|
|
126 genomeFile = os.path.join(s_outdir,infilename +'.chrsummary')
|
|
127 get_genome(infile, genomeFile)
|
|
128
|
|
129 bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH")
|
|
130 args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile))
|
|
131 iOutFile = open(bedgraph, "w")
|
|
132 output = subprocess.Popen(args, stdout = iOutFile).wait()
|
|
133 iOutFile.close()
|
|
134
|
|
135 targetList = convertTarget(targetFile2)
|
|
136
|
|
137 splitByChromosome(bedgraph)
|
|
138
|
|
139 convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/")
|
|
140 bedgraph_tgtonly = bedgraph+".TARGETONLY"
|
|
141 bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE"
|
|
142 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly)
|
|
143 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg)
|
|
144 shutil.copy(bedgraph_tgtonly,outdir)
|
|
145 shutil.copy(bedgraph_tgtonly_avg,outdir)
|
|
146
|
|
147 print "----- Processing Files -----"
|
|
148 pool = Pool(5)
|
|
149 pool.map(processInFile,infiles)
|
|
150
|
|
151
|
|
152 # STEP 2 -> Create Union BedGraph
|
|
153 allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")]
|
|
154 allfiles_path = [os.path.join(outdir,x) for x in allfiles_names]
|
|
155
|
|
156 args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names
|
|
157
|
|
158 print (str(args))
|
|
159 fo = os.path.join(outdir,"TARGETONLY.union.txt")
|
|
160 foh = open(fo,"w")
|
|
161
|
|
162 subprocess.Popen(args,stdout=foh).wait()
|
|
163 foh.close()
|
|
164
|
|
165 # STEP 3 -> POOL METHOD
|
|
166 TRIM = params.TRIM #TRIM = 0.2
|
|
167 f = fo
|
|
168 fh = open(f)
|
|
169
|
|
170 fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt")
|
|
171 fo_libsize=fo+".LIBSIZE"
|
|
172
|
|
173
|
|
174 foh=open(fo,'w')
|
|
175 fo_libsize_h=open(fo_libsize,'w')
|
|
176
|
|
177
|
|
178 fh.readline()
|
|
179
|
|
180 Ns = None
|
|
181 nSamples = None
|
|
182
|
|
183 lineCount=0
|
|
184 for line in fh:
|
|
185 lineCount+=1
|
|
186 line_elems=line.rstrip().split("\t")
|
|
187 rangeLen=int(line_elems[2])-int(line_elems[1])
|
|
188 if not Ns:
|
|
189 nSamples=len(line_elems)-3
|
|
190 Ns = [0 for x in range(nSamples)]
|
|
191 for i in range(nSamples):
|
|
192 ind=i+3
|
|
193 Ns[i] += int(line_elems[ind])*rangeLen
|
|
194
|
|
195
|
|
196 fh.close()
|
|
197 fh=open(f)
|
|
198 fh.readline()
|
|
199
|
|
200
|
|
201 lineCount=0
|
|
202 nSamples_exclude = int(math.floor(TRIM*nSamples))
|
|
203
|
|
204 def gm_mean(xs):
|
|
205 tmpprod = 1
|
|
206 p = 1.0/len(xs)
|
|
207 for x in xs:
|
|
208 tmpprod = tmpprod * math.pow(x,p)
|
|
209 return tmpprod
|
|
210
|
|
211 Ns_gmean = gm_mean(Ns)
|
|
212
|
|
213 def meanstdv(x):
|
|
214 from math import sqrt
|
|
215 n, mean, std = len(x), 0, 0
|
|
216 for a in x:
|
|
217 mean = mean+a
|
|
218 mean = mean / float(n)
|
|
219 for a in x:
|
|
220 std = std+(a-mean)**2
|
|
221 std=sqrt(std/float(n-1))
|
|
222 return mean, std
|
|
223
|
|
224
|
|
225 libsize = 0
|
|
226 for line in fh:
|
|
227 lineCount+=1
|
|
228 line_elems=line.rstrip().split("\t")
|
|
229 rangeLen=int(line_elems[2])-int(line_elems[1])
|
|
230 xs_tmp=[int(x) for x in line_elems[3:]]
|
|
231 xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))]
|
|
232 xs.sort()
|
|
233 xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)]
|
|
234 #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed))
|
|
235 trimmed_mean,std=meanstdv(xs_trimmed)
|
|
236 libsize+=rangeLen*trimmed_mean
|
|
237 foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n")
|
|
238
|
|
239
|
|
240 fo_libsize_h.write(str(int(round(libsize))))
|
|
241
|
|
242 fh.close()
|
|
243 foh.close()
|
|
244 fo_libsize_h.close()
|
|
245
|
|
246 def removeTempFolder(tempFolderPath):
|
|
247 import shutil
|
|
248
|
|
249 shutil.rmtree(tempFolderPath)
|
|
250 print "Temp Folder Removed"
|
|
251
|
|
252 # Removed Temp Folder
|
|
253 removeTempFolder(outdir)
|