comparison chromatral.py @ 3:519e45d9539f

Uploaded
author cmmt
date Mon, 23 Jan 2012 05:24:20 -0500
parents
children
comparison
equal deleted inserted replaced
2:116d726d2ed4 3:519e45d9539f
1 import os;
2 import re;
3 import csv;
4 import sys;
5 import math;
6 import numpy as np;
7 import matplotlib;
8 matplotlib.use('Agg');
9 from pylab import *;
10
11 def extractGffFeatures(gffString):
12 # GFF v3
13 # 1. seqname - Must be a chromosome or scaffold.
14 # 2. source - The program that generated this feature.
15 # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
16 # 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
17 # 5. end - The ending position of the feature (inclusive).
18 # 6. score - A score between 0 and 1000. If there is no score value, enter ".".
19 # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
20 # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
21 # 9. group - All lines with the same group are linked together into a single item.
22
23 res = [];
24
25 if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n)
26 try:
27 gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t');
28 except:
29 return []; # exit, since we don't have 9 features
30
31 # handle the gffSeqName
32 try:
33 chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]);
34 except:
35 return [];
36
37 # handle gffStart
38 try:
39 probeStart = int(gffStart);
40 except ValueError:
41 return []; # exit, invalid gffStart
42
43 # handle gffEnd
44 try:
45 probeEnd = int(gffEnd);
46 except ValueError:
47 return []; # exit, invalid gffEnd
48
49 # handle gffScr
50 try:
51 probeScr = float(gffScr);
52 except ValueError:
53 return []; # exit, invalid gffScr
54
55 # if the input string was well-formed, return the extracted features
56 res = [chrNo, probeStart, probeEnd, probeScr];
57 return res; # return feature set (if any was found in this record)
58
59
60 # Define Constants
61 tickPlotAdj = 0.5; # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center)
62
63 # Initialize Variables
64 feaNames = [];
65 feaChrNoList = [];
66 feaStrandList = [];
67 feaStartList = [];
68 feaEndList = [];
69
70 gffChrNoList = [];
71 gffProbeCenterList = [];
72 gffMatScrList = [];
73
74 # get command line arguments
75 gffFileName = sys.argv[1];
76 feaFileName = sys.argv[2];
77 outFileFormat = sys.argv[3];
78 stepWidth = int(sys.argv[4]);
79 upStreamStretch = int(sys.argv[5]); # in bp
80 maxX = int(sys.argv[6]); # in bp, if -1 maxX <= max(feature length), see below
81 tickSpacing = int(sys.argv[7]); # in bp
82 plotTitle = sys.argv[8];
83 outFileName = sys.argv[9];
84
85 # Read transcript information
86 feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t');
87 for row in feaFileReader:
88 feaName, feaChrNo, feaStrand, feaStart, feaEnd = row;
89 feaNames.append(feaName);
90 feaChrNoList.append(int(feaChrNo));
91 feaStrandList.append(int(feaStrand));
92 feaStartList.append(int(feaStart));
93 feaEndList.append(int(feaEnd));
94
95 # Read enrichment scores
96 gffFileHandle = open(gffFileName, "rU");
97 for row in gffFileHandle:
98 cache = extractGffFeatures(row);
99 if len(cache) == 4: # if curr row wasn't well-formed, skip row
100 chrNo, probeStart, probeEnd, probeScr = cache;
101 gffChrNoList.append(chrNo);
102 gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe
103 gffMatScrList.append(probeScr);
104
105 # convert numerical lists into vectors
106 feaChrNos = np.array(feaChrNoList);
107 feaStrands = np.array(feaStrandList);
108 feaStarts = np.array(feaStartList);
109 feaEnds = np.array(feaEndList);
110
111 gfChrNos = np.array(gffChrNoList);
112 gfProbeLocs = np.array(gffProbeCenterList);
113 gfMatScrs = np.array(gffMatScrList);
114
115 # preallocate the results array, bins and 1 extra cols for feature length
116 avgFeatureMatScrs = np.empty((len(feaStarts),\
117 (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 1))),\
118 );
119 avgFeatureMatScrs[:] = np.NAN;
120
121 featureCounter = 0;
122
123 for chr in range(min(feaChrNos),max(feaChrNos) + 1):
124 #get all rows from the feature file that contain information about the i'th chromosome
125 currChrIdxs = (feaChrNos == chr);
126 currStrands = feaStrands[currChrIdxs];
127 currStarts = feaStarts[currChrIdxs];
128 currEnds = feaEnds[currChrIdxs];
129
130 #get all rows from ChIP-* data sets
131 currGfChrIdxs = (gfChrNos == chr);
132 currGfProbeLocs = gfProbeLocs[currGfChrIdxs];
133 currGfScrs = gfMatScrs[currGfChrIdxs];
134
135 # iterate over all features of the current chromosome
136 for j in range(0,len(currStarts)):
137 feaStrand = currStrands[j];
138 feaStart = currStarts[j];
139 feaEnd = currEnds[j];
140
141 #calc the corresponding probe positions for the feature start
142 if feaStrand == 1: # +strand
143 refFeaSt = 0;
144 while feaStart >= currGfProbeLocs[refFeaSt]:
145 refFeaSt += 1;
146 else: # -strand
147 refFeaSt = 0;
148 while feaEnd >= currGfProbeLocs[refFeaSt]:
149 refFeaSt += 1;
150 #now refFeaSt points to the correct probe according to strand orientation
151
152 #calc the avg probe scores per bin
153 if feaStrand == 1:
154 feaChunkStart = feaStart; # bp level pointer
155 refChunkStart = refFeaSt; # probe level pointer
156 refChunkEnd = refFeaSt; # probe level pointer
157
158 # determine values for the upstream bins
159 for k in range(1,upStreamStretch/stepWidth + 1):
160 feaChunkEnd = feaStart - k * stepWidth; # bp level pointer
161
162 # find the correct ref for the current feaChunkEnd; catch potential IndexOutOfBoundEx when upstream region too close to chr boundary
163 try:
164 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]:
165 refChunkEnd -= 1;
166 except:
167 pass; # don't handle the exception further
168
169 #calc probe mean
170 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]);
171
172 #adjust pointers
173 refChunkStart = refChunkEnd;
174
175 # determine values for bins in feature body
176 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)):
177 feaChunkEnd = feaStart + k*stepWidth; # bp level pointer
178
179 try:
180 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]:
181 refChunkEnd += 1;
182 except:
183 pass; # don't handle the exception further
184
185 # calc probe mean
186 avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]);
187
188 # adjust chunk pointers
189 refChunkStart = refChunkEnd;
190
191 # write feature length in last col of the array
192 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart;
193 featureCounter += 1;
194 else: # -1 strand
195 feaChunkStart = feaEnd; # bp level pointer
196 refChunkStart = refFeaSt; # probe level pointer
197 refChunkEnd = refFeaSt; # probe level pointer
198
199 # determine values for the upStream region bins
200 for k in range(1,upStreamStretch/stepWidth + 1):
201 feaChunkEnd = feaEnd + k*stepWidth; # bp level pointer
202
203 # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary
204 try:
205 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]:
206 refChunkEnd += 1;
207 except:
208 pass;
209
210 # calc probe mean
211 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]);
212
213 # adjust chunk pointers
214 refChunkStart = refChunkEnd;
215
216 # determine values for bins in feature body
217 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)):
218 feaChunkEnd = feaEnd - k*stepWidth; # bp level pointer
219
220 try:
221 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]:
222 refChunkEnd -= 1;
223 except:
224 pass;
225
226 # calc probe mean
227 avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]);
228
229 # adjust chunk pointers
230 refChunkStart = refChunkEnd;
231
232 # write feature length in last col of the array
233 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart;
234 featureCounter += 1;
235
236
237 # assign names and data format to all cols of the results array and sort array by feature length (last column)
238 sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1);
239 matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]};
240 avgFeatureMatScrs.dtype = matDtype;
241 avgFeatureMatScrs.sort(axis = 0, order = sortByLengthCol);
242
243 # generate colormap
244 BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))};
245 BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9);
246
247 lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0);
248 upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0);
249 cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge; # upper inner fence
250
251 # remove custom array dtype used for sorting and plot avgScrs (without the last col which stores the length info)
252 imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-1], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim);
253 axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1);
254
255 # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly
256 if maxX == -1:
257 maxX = max(feaEnds - feaStarts);
258 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj);
259 else:
260 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj);
261
262 xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)];
263 xTicks.insert(-1,-tickPlotAdj);
264 xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)];
265 xTicksLabel.insert(-1, str(-upStreamStretch));
266 imgplot.axes.set_xticks(xTicks);
267 imgplot.axes.set_xticklabels(xTicksLabel);
268
269 imgplot.axes.set_title(plotTitle);
270 imgplot.axes.set_xlabel('Distance from feature start (bp)');
271 imgplot.axes.set_ylabel('');
272 imgplot.axes.set_yticks([]);
273
274 imgplot.set_cmap(BYcmap);
275 colorbar();
276 show();
277
278 savefig('chromatra_l_tmp_plot', format=outFileFormat);
279 data = file('chromatra_l_tmp_plot', 'rb').read();
280 fp = open(outFileName, 'wb');
281 fp.write(data);
282 fp.close();
283 os.remove('chromatra_l_tmp_plot');