Mercurial > repos > cmmt > chromatra
comparison chromatrat.py @ 5:9cb525f30060
Uploaded
| author | cmmt | 
|---|---|
| date | Mon, 23 Jan 2012 05:25:26 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 4:4c9201237641 | 5:9cb525f30060 | 
|---|---|
| 1 import os; | |
| 2 import re; | |
| 3 import csv; | |
| 4 import string; | |
| 5 import sys; | |
| 6 import math; | |
| 7 import numpy as np; | |
| 8 import matplotlib; | |
| 9 matplotlib.use('Agg'); | |
| 10 from pylab import *; | |
| 11 | |
| 12 def extractGffFeatures(gffString): | |
| 13 # GFF v3 | |
| 14 # 1. seqname - Must be a chromosome or scaffold. | |
| 15 # 2. source - The program that generated this feature. | |
| 16 # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". | |
| 17 # 4. start - The starting position of the feature in the sequence. The first base is numbered 1. | |
| 18 # 5. end - The ending position of the feature (inclusive). | |
| 19 # 6. score - A score between 0 and 1000. If there is no score value, enter ".". | |
| 20 # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). | |
| 21 # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. | |
| 22 # 9. group - All lines with the same group are linked together into a single item. | |
| 23 | |
| 24 res = []; | |
| 25 | |
| 26 if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n) | |
| 27 try: | |
| 28 gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t'); | |
| 29 except: | |
| 30 return []; # exit, since we don't have 9 features | |
| 31 | |
| 32 # handle the gffSeqName | |
| 33 try: | |
| 34 chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]); | |
| 35 except: | |
| 36 return []; | |
| 37 | |
| 38 # handle gffStart | |
| 39 try: | |
| 40 probeStart = int(gffStart); | |
| 41 except ValueError: | |
| 42 return []; # exit, invalid gffStart | |
| 43 | |
| 44 # handle gffEnd | |
| 45 try: | |
| 46 probeEnd = int(gffEnd); | |
| 47 except ValueError: | |
| 48 return []; # exit, invalid gffEnd | |
| 49 | |
| 50 # handle gffScr | |
| 51 try: | |
| 52 probeScr = float(gffScr); | |
| 53 except ValueError: | |
| 54 return []; # exit, invalid gffScr | |
| 55 | |
| 56 # if the input string was well-formed, return the extracted features | |
| 57 res = [chrNo, probeStart, probeEnd, probeScr]; | |
| 58 return res; # return feature set (if any was found in this record) | |
| 59 | |
| 60 # Define Constants | |
| 61 tickPlotAdj = 0.5; # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center) | |
| 62 | |
| 63 # Initialize Variables | |
| 64 feaNames = []; | |
| 65 feaChrNoList = []; | |
| 66 feaStrandList = []; | |
| 67 feaStartList = []; | |
| 68 feaEndList = []; | |
| 69 feaAttribList = []; | |
| 70 classBoundaryIdxs = []; | |
| 71 | |
| 72 gffChrNoList = []; | |
| 73 gffProbeCenterList = []; | |
| 74 gffMatScrList = []; | |
| 75 | |
| 76 # get command line arguments | |
| 77 gffFileName = sys.argv[1]; | |
| 78 feaFileName = sys.argv[2]; | |
| 79 outFileFormat = sys.argv[3]; | |
| 80 stepWidth = int(sys.argv[4]); | |
| 81 classBoundaryStr = sys.argv[5]; | |
| 82 upStreamStretch = int(sys.argv[6]); # in bp | |
| 83 maxX = int(sys.argv[7]); # in bp, if -1 maxX <= max(feature length), see below | |
| 84 tickSpacing = int(sys.argv[8]); # in bp | |
| 85 plotTitle = sys.argv[9]; | |
| 86 outFileName = sys.argv[10]; | |
| 87 | |
| 88 # Extract class boundaries | |
| 89 classBoundaries = map(string.atoi, re.findall(r'\b\d+\b',classBoundaryStr)); | |
| 90 | |
| 91 # Read transcript information | |
| 92 feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t'); | |
| 93 for row in feaFileReader: | |
| 94 feaName, feaChrNo, feaStrand, feaStart, feaEnd, feaAttrib = row; | |
| 95 feaNames.append(feaName); | |
| 96 feaChrNoList.append(int(feaChrNo)); | |
| 97 feaStrandList.append(int(feaStrand)); | |
| 98 feaStartList.append(int(feaStart)); | |
| 99 feaEndList.append(int(feaEnd)); | |
| 100 feaAttribList.append(float(feaAttrib)); | |
| 101 | |
| 102 # Read enrichment scores | |
| 103 gffFileHandle = open(gffFileName, "rU"); | |
| 104 for row in gffFileHandle: | |
| 105 cache = extractGffFeatures(row); | |
| 106 if len(cache) == 4: # if curr row wasn't well-formed, skip row | |
| 107 chrNo, probeStart, probeEnd, probeScr = cache; | |
| 108 gffChrNoList.append(chrNo); | |
| 109 gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe | |
| 110 gffMatScrList.append(probeScr); | |
| 111 | |
| 112 # convert numerical lists into vectors | |
| 113 feaChrNos = np.array(feaChrNoList); | |
| 114 feaStrands = np.array(feaStrandList); | |
| 115 feaStarts = np.array(feaStartList); | |
| 116 feaEnds = np.array(feaEndList); | |
| 117 feaAttribs = np.array(feaAttribList); | |
| 118 | |
| 119 gfChrNos = np.array(gffChrNoList); | |
| 120 gfProbeLocs = np.array(gffProbeCenterList); | |
| 121 gfMatScrs = np.array(gffMatScrList); | |
| 122 | |
| 123 # preallocate the results array, bins plus 2 extra cols for feature length and transfreq (or other freature characteristic) plus 1 col for sorting | |
| 124 avgFeatureMatScrs = np.empty((len(feaStarts),\ | |
| 125 (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 3))),\ | |
| 126 ); | |
| 127 avgFeatureMatScrs[:] = np.NAN; | |
| 128 | |
| 129 featureCounter = 0; | |
| 130 | |
| 131 for chr in range(min(feaChrNos),max(feaChrNos) + 1): | |
| 132 #get all rows from the feature file that contain information about the i'th chromosome | |
| 133 currChrIdxs = (feaChrNos == chr); | |
| 134 currStrands = feaStrands[currChrIdxs]; | |
| 135 currStarts = feaStarts[currChrIdxs]; | |
| 136 currEnds = feaEnds[currChrIdxs]; | |
| 137 currAttribs = feaAttribs[currChrIdxs]; | |
| 138 | |
| 139 #get all rows from ChIP-* data sets | |
| 140 currGfChrIdxs = (gfChrNos == chr); | |
| 141 currGfProbeLocs = gfProbeLocs[currGfChrIdxs]; | |
| 142 currGfScrs = gfMatScrs[currGfChrIdxs]; | |
| 143 | |
| 144 # iterate over all features of the current chromosome | |
| 145 for j in range(0,len(currStarts)): | |
| 146 feaStrand = currStrands[j]; | |
| 147 feaStart = currStarts[j]; | |
| 148 feaEnd = currEnds[j]; | |
| 149 feaAttrib = currAttribs[j]; | |
| 150 | |
| 151 #calc the corresponding probe positions for the feature start | |
| 152 if feaStrand == 1: # +strand | |
| 153 refFeaSt = 0; | |
| 154 while feaStart >= currGfProbeLocs[refFeaSt]: | |
| 155 refFeaSt += 1; | |
| 156 else: # -strand | |
| 157 refFeaSt = 0; | |
| 158 while feaEnd >= currGfProbeLocs[refFeaSt]: | |
| 159 refFeaSt += 1; | |
| 160 #now refFeaSt points to the correct probe according to strand orientation | |
| 161 | |
| 162 #calc the avg probe scores per bin | |
| 163 if feaStrand == 1: | |
| 164 feaChunkStart = feaStart; # bp level pointer | |
| 165 refChunkStart = refFeaSt; # probe level pointer | |
| 166 refChunkEnd = refFeaSt; # probe level pointer | |
| 167 | |
| 168 # determine values for the upstream bins | |
| 169 for k in range(1,upStreamStretch/stepWidth + 1): | |
| 170 feaChunkEnd = feaStart - k * stepWidth; # bp level pointer | |
| 171 | |
| 172 # find the correct ref for the current feaChunkEnd; catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | |
| 173 try: | |
| 174 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | |
| 175 refChunkEnd -= 1; | |
| 176 except: | |
| 177 pass; # don't handle the exception further | |
| 178 | |
| 179 #calc probe mean | |
| 180 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]); | |
| 181 | |
| 182 #adjust pointers | |
| 183 refChunkStart = refChunkEnd; | |
| 184 | |
| 185 # determine values for bins in feature body | |
| 186 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | |
| 187 feaChunkEnd = feaStart + k*stepWidth; # bp level pointer | |
| 188 | |
| 189 try: | |
| 190 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | |
| 191 refChunkEnd += 1; | |
| 192 except: | |
| 193 pass; # don't handle the exception further | |
| 194 | |
| 195 # calc probe mean | |
| 196 avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]); | |
| 197 | |
| 198 # adjust chunk pointers | |
| 199 refChunkStart = refChunkEnd; | |
| 200 | |
| 201 # write feature length and attrib in last cols of the array | |
| 202 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | |
| 203 avgFeatureMatScrs[featureCounter, -2] = feaAttrib; | |
| 204 featureCounter += 1; | |
| 205 else: # -1 strand | |
| 206 feaChunkStart = feaEnd; # bp level pointer | |
| 207 refChunkStart = refFeaSt; # probe level pointer | |
| 208 refChunkEnd = refFeaSt; # probe level pointer | |
| 209 | |
| 210 # determine values for the upStream region bins | |
| 211 for k in range(1,upStreamStretch/stepWidth + 1): | |
| 212 feaChunkEnd = feaEnd + k*stepWidth; # bp level pointer | |
| 213 | |
| 214 # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | |
| 215 try: | |
| 216 while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | |
| 217 refChunkEnd += 1; | |
| 218 except: | |
| 219 pass; | |
| 220 | |
| 221 # calc probe mean | |
| 222 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]); | |
| 223 | |
| 224 # adjust chunk pointers | |
| 225 refChunkStart = refChunkEnd; | |
| 226 | |
| 227 # determine values for bins in feature body | |
| 228 for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | |
| 229 feaChunkEnd = feaEnd - k*stepWidth; # bp level pointer | |
| 230 | |
| 231 try: | |
| 232 while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | |
| 233 refChunkEnd -= 1; | |
| 234 except: | |
| 235 pass; | |
| 236 | |
| 237 # calc probe mean | |
| 238 avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]); | |
| 239 | |
| 240 # adjust chunk pointers | |
| 241 refChunkStart = refChunkEnd; | |
| 242 | |
| 243 # write feature length and attrib in last cols of the array | |
| 244 avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | |
| 245 avgFeatureMatScrs[featureCounter, -2] = feaAttrib; | |
| 246 featureCounter += 1; | |
| 247 | |
| 248 # determine idxs of class boundaries and write class value in respective cells | |
| 249 for c in range(len(classBoundaries)-1,-1,-1): # reverse iterate through the boundaries list | |
| 250 idxs = np.nonzero(avgFeatureMatScrs[:,-2] <= classBoundaries[c])[0]; | |
| 251 avgFeatureMatScrs[idxs, -3] = c; | |
| 252 | |
| 253 # assign names and data format to all cols of the results array and sort by feature attrib | |
| 254 sortByClassCol = 'col' + str(np.size(avgFeatureMatScrs,1)-3); | |
| 255 sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1); | |
| 256 | |
| 257 matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]}; | |
| 258 avgFeatureMatScrs.dtype = matDtype; | |
| 259 avgFeatureMatScrs.sort(axis = 0, order = [sortByClassCol, sortByLengthCol]); | |
| 260 | |
| 261 # generate colormap | |
| 262 BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))}; | |
| 263 BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9); | |
| 264 | |
| 265 lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0); | |
| 266 upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0); | |
| 267 cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge; # upper inner fence | |
| 268 | |
| 269 # remove custom array dtype used for sorting and plot avg scrs (without the last 3 cols) | |
| 270 imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-3], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim); | |
| 271 axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1); | |
| 272 | |
| 273 # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly | |
| 274 if maxX == -1: | |
| 275 maxX = max(feaEnds - feaStarts); | |
| 276 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj); | |
| 277 else: | |
| 278 imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj); | |
| 279 | |
| 280 xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | |
| 281 xTicks.insert(-1,-tickPlotAdj); | |
| 282 xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | |
| 283 xTicksLabel.insert(-1, str(-upStreamStretch)); | |
| 284 imgplot.axes.set_xticks(xTicks); | |
| 285 imgplot.axes.set_xticklabels(xTicksLabel); | |
| 286 | |
| 287 imgplot.axes.set_title(plotTitle); | |
| 288 imgplot.axes.set_xlabel('Distance from feature start (bp)'); | |
| 289 imgplot.axes.set_ylabel('Feature class (low to high)'); | |
| 290 imgplot.axes.set_yticks([]); | |
| 291 | |
| 292 imgplot.set_cmap(BYcmap); | |
| 293 colorbar(); | |
| 294 show(); | |
| 295 | |
| 296 savefig('chromatra_t_tmp_plot', format=outFileFormat); | |
| 297 data = file('chromatra_t_tmp_plot', 'rb').read(); | |
| 298 fp = open(outFileName, 'wb'); | |
| 299 fp.write(data); | |
| 300 fp.close(); | |
| 301 os.remove('chromatra_t_tmp_plot'); | 
