Mercurial > repos > cmmt > chromatra
comparison chromatral.py @ 3:519e45d9539f
Uploaded
author | cmmt |
---|---|
date | Mon, 23 Jan 2012 05:24:20 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:116d726d2ed4 | 3:519e45d9539f |
---|---|
1 import os; | |
2 import re; | |
3 import csv; | |
4 import sys; | |
5 import math; | |
6 import numpy as np; | |
7 import matplotlib; | |
8 matplotlib.use('Agg'); | |
9 from pylab import *; | |
10 | |
11 def extractGffFeatures(gffString): | |
12 # GFF v3 | |
13 # 1. seqname - Must be a chromosome or scaffold. | |
14 # 2. source - The program that generated this feature. | |
15 # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". | |
16 # 4. start - The starting position of the feature in the sequence. The first base is numbered 1. | |
17 # 5. end - The ending position of the feature (inclusive). | |
18 # 6. score - A score between 0 and 1000. If there is no score value, enter ".". | |
19 # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). | |
20 # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. | |
21 # 9. group - All lines with the same group are linked together into a single item. | |
22 | |
23 res = []; | |
24 | |
25 if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n) | |
26 try: | |
27 gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t'); | |
28 except: | |
29 return []; # exit, since we don't have 9 features | |
30 | |
31 # handle the gffSeqName | |
32 try: | |
33 chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]); | |
34 except: | |
35 return []; | |
36 | |
37 # handle gffStart | |
38 try: | |
39 probeStart = int(gffStart); | |
40 except ValueError: | |
41 return []; # exit, invalid gffStart | |
42 | |
43 # handle gffEnd | |
44 try: | |
45 probeEnd = int(gffEnd); | |
46 except ValueError: | |
47 return []; # exit, invalid gffEnd | |
48 | |
49 # handle gffScr | |
50 try: | |
51 probeScr = float(gffScr); | |
52 except ValueError: | |
53 return []; # exit, invalid gffScr | |
54 | |
55 # if the input string was well-formed, return the extracted features | |
56 res = [chrNo, probeStart, probeEnd, probeScr]; | |
57 return res; # return feature set (if any was found in this record) | |
58 | |
59 | |
60 # Define Constants | |
61 tickPlotAdj = 0.5; # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center) | |
62 | |
63 # Initialize Variables | |
64 feaNames = []; | |
65 feaChrNoList = []; | |
66 feaStrandList = []; | |
67 feaStartList = []; | |
68 feaEndList = []; | |
69 | |
70 gffChrNoList = []; | |
71 gffProbeCenterList = []; | |
72 gffMatScrList = []; | |
73 | |
74 # get command line arguments | |
75 gffFileName = sys.argv[1]; | |
76 feaFileName = sys.argv[2]; | |
77 outFileFormat = sys.argv[3]; | |
78 stepWidth = int(sys.argv[4]); | |
79 upStreamStretch = int(sys.argv[5]); # in bp | |
80 maxX = int(sys.argv[6]); # in bp, if -1 maxX <= max(feature length), see below | |
81 tickSpacing = int(sys.argv[7]); # in bp | |
82 plotTitle = sys.argv[8]; | |
83 outFileName = sys.argv[9]; | |
84 | |
85 # Read transcript information | |
86 feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t'); | |
87 for row in feaFileReader: | |
88 feaName, feaChrNo, feaStrand, feaStart, feaEnd = row; | |
89 feaNames.append(feaName); | |
90 feaChrNoList.append(int(feaChrNo)); | |
91 feaStrandList.append(int(feaStrand)); | |
92 feaStartList.append(int(feaStart)); | |
93 feaEndList.append(int(feaEnd)); | |
94 | |
95 # Read enrichment scores | |
96 gffFileHandle = open(gffFileName, "rU"); | |
97 for row in gffFileHandle: | |
98 cache = extractGffFeatures(row); | |
99 if len(cache) == 4: # if curr row wasn't well-formed, skip row | |
100 chrNo, probeStart, probeEnd, probeScr = cache; | |
101 gffChrNoList.append(chrNo); | |
102 gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe | |
103 gffMatScrList.append(probeScr); | |
104 | |
105 # convert numerical lists into vectors | |
106 feaChrNos = np.array(feaChrNoList); | |
107 feaStrands = np.array(feaStrandList); | |
108 feaStarts = np.array(feaStartList); | |
109 feaEnds = np.array(feaEndList); | |
110 | |
111 gfChrNos = np.array(gffChrNoList); | |
112 gfProbeLocs = np.array(gffProbeCenterList); | |
113 gfMatScrs = np.array(gffMatScrList); | |
114 | |
115 # preallocate the results array, bins and 1 extra cols for feature length | |
116 avgFeatureMatScrs = np.empty((len(feaStarts),\ | |
117 (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 1))),\ | |
118 ); | |
119 avgFeatureMatScrs[:] = np.NAN; | |
120 | |
121 featureCounter = 0; | |
122 | |
123 for chr in range(min(feaChrNos),max(feaChrNos) + 1): | |
124 #get all rows from the feature file that contain information about the i'th chromosome | |
125 currChrIdxs = (feaChrNos == chr); | |
126 currStrands = feaStrands[currChrIdxs]; | |
127 currStarts = feaStarts[currChrIdxs]; | |
128 currEnds = feaEnds[currChrIdxs]; | |
129 | |
130 #get all rows from ChIP-* data sets | |
131 currGfChrIdxs = (gfChrNos == chr); | |
132 currGfProbeLocs = gfProbeLocs[currGfChrIdxs]; | |
133 currGfScrs = gfMatScrs[currGfChrIdxs]; | |
134 | |
135 # iterate over all features of the current chromosome | |
136 for j in range(0,len(currStarts)): | |
137 feaStrand = currStrands[j]; | |
138 feaStart = currStarts[j]; | |
139 feaEnd = currEnds[j]; | |
140 | |
141 #calc the corresponding probe positions for the feature start | |
142 if feaStrand == 1: # +strand | |
143 refFeaSt = 0; | |
144 while feaStart >= currGfProbeLocs[refFeaSt]: | |
145 refFeaSt += 1; | |
146 else: # -strand | |
147 refFeaSt = 0; | |
148 while feaEnd >= currGfProbeLocs[refFeaSt]: | |
149 refFeaSt += 1; | |
150 #now refFeaSt points to the correct probe according to strand orientation | |
151 | |
152 #calc the avg probe scores per bin | |
153 if feaStrand == 1: | |
154 feaChunkStart = feaStart; # bp level pointer | |
155 refChunkStart = refFeaSt; # probe level pointer | |
156 refChunkEnd = refFeaSt; # probe level pointer | |
157 | |
158 # determine values for the upstream bins | |
159 for k in range(1,upStreamStretch/stepWidth + 1): | |
160 feaChunkEnd = feaStart - k * stepWidth; # bp level pointer | |
161 | |
162 # find the correct ref for the current feaChunkEnd; catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | |
163 try: | |
164 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | |
165 refChunkEnd -= 1; | |
166 except: | |
167 pass; # don't handle the exception further | |
168 | |
169 #calc probe mean | |
170 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]); | |
171 | |
172 #adjust pointers | |
173 refChunkStart = refChunkEnd; | |
174 | |
175 # determine values for bins in feature body | |
176 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | |
177 feaChunkEnd = feaStart + k*stepWidth; # bp level pointer | |
178 | |
179 try: | |
180 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | |
181 refChunkEnd += 1; | |
182 except: | |
183 pass; # don't handle the exception further | |
184 | |
185 # calc probe mean | |
186 avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]); | |
187 | |
188 # adjust chunk pointers | |
189 refChunkStart = refChunkEnd; | |
190 | |
191 # write feature length in last col of the array | |
192 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | |
193 featureCounter += 1; | |
194 else: # -1 strand | |
195 feaChunkStart = feaEnd; # bp level pointer | |
196 refChunkStart = refFeaSt; # probe level pointer | |
197 refChunkEnd = refFeaSt; # probe level pointer | |
198 | |
199 # determine values for the upStream region bins | |
200 for k in range(1,upStreamStretch/stepWidth + 1): | |
201 feaChunkEnd = feaEnd + k*stepWidth; # bp level pointer | |
202 | |
203 # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | |
204 try: | |
205 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | |
206 refChunkEnd += 1; | |
207 except: | |
208 pass; | |
209 | |
210 # calc probe mean | |
211 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]); | |
212 | |
213 # adjust chunk pointers | |
214 refChunkStart = refChunkEnd; | |
215 | |
216 # determine values for bins in feature body | |
217 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | |
218 feaChunkEnd = feaEnd - k*stepWidth; # bp level pointer | |
219 | |
220 try: | |
221 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | |
222 refChunkEnd -= 1; | |
223 except: | |
224 pass; | |
225 | |
226 # calc probe mean | |
227 avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]); | |
228 | |
229 # adjust chunk pointers | |
230 refChunkStart = refChunkEnd; | |
231 | |
232 # write feature length in last col of the array | |
233 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | |
234 featureCounter += 1; | |
235 | |
236 | |
237 # assign names and data format to all cols of the results array and sort array by feature length (last column) | |
238 sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1); | |
239 matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]}; | |
240 avgFeatureMatScrs.dtype = matDtype; | |
241 avgFeatureMatScrs.sort(axis = 0, order = sortByLengthCol); | |
242 | |
243 # generate colormap | |
244 BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))}; | |
245 BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9); | |
246 | |
247 lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0); | |
248 upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0); | |
249 cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge; # upper inner fence | |
250 | |
251 # remove custom array dtype used for sorting and plot avgScrs (without the last col which stores the length info) | |
252 imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-1], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim); | |
253 axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1); | |
254 | |
255 # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly | |
256 if maxX == -1: | |
257 maxX = max(feaEnds - feaStarts); | |
258 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj); | |
259 else: | |
260 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj); | |
261 | |
262 xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | |
263 xTicks.insert(-1,-tickPlotAdj); | |
264 xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | |
265 xTicksLabel.insert(-1, str(-upStreamStretch)); | |
266 imgplot.axes.set_xticks(xTicks); | |
267 imgplot.axes.set_xticklabels(xTicksLabel); | |
268 | |
269 imgplot.axes.set_title(plotTitle); | |
270 imgplot.axes.set_xlabel('Distance from feature start (bp)'); | |
271 imgplot.axes.set_ylabel(''); | |
272 imgplot.axes.set_yticks([]); | |
273 | |
274 imgplot.set_cmap(BYcmap); | |
275 colorbar(); | |
276 show(); | |
277 | |
278 savefig('chromatra_l_tmp_plot', format=outFileFormat); | |
279 data = file('chromatra_l_tmp_plot', 'rb').read(); | |
280 fp = open(outFileName, 'wb'); | |
281 fp.write(data); | |
282 fp.close(); | |
283 os.remove('chromatra_l_tmp_plot'); |