annotate chromatrat.py @ 9:743e69c1e122 default tip

Uploaded
author cmmt
date Mon, 23 Jan 2012 05:28:00 -0500
parents 9cb525f30060
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
1 import os;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
2 import re;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
3 import csv;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
4 import string;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
5 import sys;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
6 import math;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
7 import numpy as np;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
8 import matplotlib;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
9 matplotlib.use('Agg');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
10 from pylab import *;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
11
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
12 def extractGffFeatures(gffString):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
13 # GFF v3
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
14 # 1. seqname - Must be a chromosome or scaffold.
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
15 # 2. source - The program that generated this feature.
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
16 # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
17 # 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
18 # 5. end - The ending position of the feature (inclusive).
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
19 # 6. score - A score between 0 and 1000. If there is no score value, enter ".".
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
20 # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
21 # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
22 # 9. group - All lines with the same group are linked together into a single item.
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
23
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
24 res = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
25
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
26 if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n)
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
27 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
28 gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
29 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
30 return []; # exit, since we don't have 9 features
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
31
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
32 # handle the gffSeqName
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
33 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
34 chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
35 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
36 return [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
37
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
38 # handle gffStart
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
39 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
40 probeStart = int(gffStart);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
41 except ValueError:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
42 return []; # exit, invalid gffStart
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
43
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
44 # handle gffEnd
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
45 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
46 probeEnd = int(gffEnd);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
47 except ValueError:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
48 return []; # exit, invalid gffEnd
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
49
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
50 # handle gffScr
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
51 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
52 probeScr = float(gffScr);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
53 except ValueError:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
54 return []; # exit, invalid gffScr
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
55
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
56 # if the input string was well-formed, return the extracted features
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
57 res = [chrNo, probeStart, probeEnd, probeScr];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
58 return res; # return feature set (if any was found in this record)
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
59
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
60 # Define Constants
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
61 tickPlotAdj = 0.5; # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center)
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
62
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
63 # Initialize Variables
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
64 feaNames = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
65 feaChrNoList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
66 feaStrandList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
67 feaStartList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
68 feaEndList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
69 feaAttribList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
70 classBoundaryIdxs = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
71
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
72 gffChrNoList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
73 gffProbeCenterList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
74 gffMatScrList = [];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
75
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
76 # get command line arguments
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
77 gffFileName = sys.argv[1];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
78 feaFileName = sys.argv[2];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
79 outFileFormat = sys.argv[3];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
80 stepWidth = int(sys.argv[4]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
81 classBoundaryStr = sys.argv[5];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
82 upStreamStretch = int(sys.argv[6]); # in bp
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
83 maxX = int(sys.argv[7]); # in bp, if -1 maxX <= max(feature length), see below
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
84 tickSpacing = int(sys.argv[8]); # in bp
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
85 plotTitle = sys.argv[9];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
86 outFileName = sys.argv[10];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
87
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
88 # Extract class boundaries
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
89 classBoundaries = map(string.atoi, re.findall(r'\b\d+\b',classBoundaryStr));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
90
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
91 # Read transcript information
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
92 feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
93 for row in feaFileReader:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
94 feaName, feaChrNo, feaStrand, feaStart, feaEnd, feaAttrib = row;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
95 feaNames.append(feaName);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
96 feaChrNoList.append(int(feaChrNo));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
97 feaStrandList.append(int(feaStrand));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
98 feaStartList.append(int(feaStart));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
99 feaEndList.append(int(feaEnd));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
100 feaAttribList.append(float(feaAttrib));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
101
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
102 # Read enrichment scores
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
103 gffFileHandle = open(gffFileName, "rU");
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
104 for row in gffFileHandle:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
105 cache = extractGffFeatures(row);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
106 if len(cache) == 4: # if curr row wasn't well-formed, skip row
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
107 chrNo, probeStart, probeEnd, probeScr = cache;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
108 gffChrNoList.append(chrNo);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
109 gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
110 gffMatScrList.append(probeScr);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
111
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
112 # convert numerical lists into vectors
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
113 feaChrNos = np.array(feaChrNoList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
114 feaStrands = np.array(feaStrandList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
115 feaStarts = np.array(feaStartList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
116 feaEnds = np.array(feaEndList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
117 feaAttribs = np.array(feaAttribList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
118
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
119 gfChrNos = np.array(gffChrNoList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
120 gfProbeLocs = np.array(gffProbeCenterList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
121 gfMatScrs = np.array(gffMatScrList);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
122
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
123 # preallocate the results array, bins plus 2 extra cols for feature length and transfreq (or other freature characteristic) plus 1 col for sorting
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
124 avgFeatureMatScrs = np.empty((len(feaStarts),\
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
125 (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 3))),\
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
126 );
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
127 avgFeatureMatScrs[:] = np.NAN;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
128
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
129 featureCounter = 0;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
130
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
131 for chr in range(min(feaChrNos),max(feaChrNos) + 1):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
132 #get all rows from the feature file that contain information about the i'th chromosome
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
133 currChrIdxs = (feaChrNos == chr);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
134 currStrands = feaStrands[currChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
135 currStarts = feaStarts[currChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
136 currEnds = feaEnds[currChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
137 currAttribs = feaAttribs[currChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
138
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
139 #get all rows from ChIP-* data sets
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
140 currGfChrIdxs = (gfChrNos == chr);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
141 currGfProbeLocs = gfProbeLocs[currGfChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
142 currGfScrs = gfMatScrs[currGfChrIdxs];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
143
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
144 # iterate over all features of the current chromosome
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
145 for j in range(0,len(currStarts)):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
146 feaStrand = currStrands[j];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
147 feaStart = currStarts[j];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
148 feaEnd = currEnds[j];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
149 feaAttrib = currAttribs[j];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
150
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
151 #calc the corresponding probe positions for the feature start
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
152 if feaStrand == 1: # +strand
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
153 refFeaSt = 0;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
154 while feaStart >= currGfProbeLocs[refFeaSt]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
155 refFeaSt += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
156 else: # -strand
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
157 refFeaSt = 0;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
158 while feaEnd >= currGfProbeLocs[refFeaSt]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
159 refFeaSt += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
160 #now refFeaSt points to the correct probe according to strand orientation
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
161
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
162 #calc the avg probe scores per bin
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
163 if feaStrand == 1:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
164 feaChunkStart = feaStart; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
165 refChunkStart = refFeaSt; # probe level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
166 refChunkEnd = refFeaSt; # probe level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
167
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
168 # determine values for the upstream bins
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
169 for k in range(1,upStreamStretch/stepWidth + 1):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
170 feaChunkEnd = feaStart - k * stepWidth; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
171
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
172 # find the correct ref for the current feaChunkEnd; catch potential IndexOutOfBoundEx when upstream region too close to chr boundary
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
173 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
174 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
175 refChunkEnd -= 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
176 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
177 pass; # don't handle the exception further
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
178
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
179 #calc probe mean
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
180 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
181
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
182 #adjust pointers
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
183 refChunkStart = refChunkEnd;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
184
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
185 # determine values for bins in feature body
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
186 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
187 feaChunkEnd = feaStart + k*stepWidth; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
188
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
189 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
190 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
191 refChunkEnd += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
192 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
193 pass; # don't handle the exception further
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
194
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
195 # calc probe mean
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
196 avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
197
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
198 # adjust chunk pointers
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
199 refChunkStart = refChunkEnd;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
200
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
201 # write feature length and attrib in last cols of the array
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
202 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
203 avgFeatureMatScrs[featureCounter, -2] = feaAttrib;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
204 featureCounter += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
205 else: # -1 strand
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
206 feaChunkStart = feaEnd; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
207 refChunkStart = refFeaSt; # probe level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
208 refChunkEnd = refFeaSt; # probe level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
209
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
210 # determine values for the upStream region bins
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
211 for k in range(1,upStreamStretch/stepWidth + 1):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
212 feaChunkEnd = feaEnd + k*stepWidth; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
213
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
214 # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
215 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
216 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
217 refChunkEnd += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
218 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
219 pass;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
220
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
221 # calc probe mean
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
222 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
223
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
224 # adjust chunk pointers
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
225 refChunkStart = refChunkEnd;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
226
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
227 # determine values for bins in feature body
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
228 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)):
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
229 feaChunkEnd = feaEnd - k*stepWidth; # bp level pointer
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
230
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
231 try:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
232 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
233 refChunkEnd -= 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
234 except:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
235 pass;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
236
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
237 # calc probe mean
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
238 avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
239
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
240 # adjust chunk pointers
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
241 refChunkStart = refChunkEnd;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
242
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
243 # write feature length and attrib in last cols of the array
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
244 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
245 avgFeatureMatScrs[featureCounter, -2] = feaAttrib;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
246 featureCounter += 1;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
247
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
248 # determine idxs of class boundaries and write class value in respective cells
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
249 for c in range(len(classBoundaries)-1,-1,-1): # reverse iterate through the boundaries list
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
250 idxs = np.nonzero(avgFeatureMatScrs[:,-2] <= classBoundaries[c])[0];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
251 avgFeatureMatScrs[idxs, -3] = c;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
252
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
253 # assign names and data format to all cols of the results array and sort by feature attrib
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
254 sortByClassCol = 'col' + str(np.size(avgFeatureMatScrs,1)-3);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
255 sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
256
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
257 matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]};
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
258 avgFeatureMatScrs.dtype = matDtype;
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
259 avgFeatureMatScrs.sort(axis = 0, order = [sortByClassCol, sortByLengthCol]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
260
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
261 # generate colormap
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
262 BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))};
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
263 BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
264
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
265 lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
266 upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
267 cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge; # upper inner fence
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
268
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
269 # remove custom array dtype used for sorting and plot avg scrs (without the last 3 cols)
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
270 imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-3], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
271 axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
272
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
273 # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
274 if maxX == -1:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
275 maxX = max(feaEnds - feaStarts);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
276 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
277 else:
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
278 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
279
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
280 xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
281 xTicks.insert(-1,-tickPlotAdj);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
282 xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)];
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
283 xTicksLabel.insert(-1, str(-upStreamStretch));
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
284 imgplot.axes.set_xticks(xTicks);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
285 imgplot.axes.set_xticklabels(xTicksLabel);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
286
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
287 imgplot.axes.set_title(plotTitle);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
288 imgplot.axes.set_xlabel('Distance from feature start (bp)');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
289 imgplot.axes.set_ylabel('Feature class (low to high)');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
290 imgplot.axes.set_yticks([]);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
291
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
292 imgplot.set_cmap(BYcmap);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
293 colorbar();
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
294 show();
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
295
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
296 savefig('chromatra_t_tmp_plot', format=outFileFormat);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
297 data = file('chromatra_t_tmp_plot', 'rb').read();
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
298 fp = open(outFileName, 'wb');
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
299 fp.write(data);
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
300 fp.close();
9cb525f30060 Uploaded
cmmt
parents:
diff changeset
301 os.remove('chromatra_t_tmp_plot');