| 5 | 1 import os; | 
|  | 2 import re; | 
|  | 3 import csv; | 
|  | 4 import string; | 
|  | 5 import sys; | 
|  | 6 import math; | 
|  | 7 import numpy as np; | 
|  | 8 import matplotlib; | 
|  | 9 matplotlib.use('Agg'); | 
|  | 10 from pylab import *; | 
|  | 11 | 
|  | 12 def extractGffFeatures(gffString): | 
|  | 13     # GFF v3 | 
|  | 14     # 1. seqname - Must be a chromosome or scaffold. | 
|  | 15     # 2. source - The program that generated this feature. | 
|  | 16     # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". | 
|  | 17     # 4. start - The starting position of the feature in the sequence. The first base is numbered 1. | 
|  | 18     # 5. end - The ending position of the feature (inclusive). | 
|  | 19     # 6. score - A score between 0 and 1000. If there is no score value, enter ".". | 
|  | 20     # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). | 
|  | 21     # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. | 
|  | 22     # 9. group - All lines with the same group are linked together into a single item. | 
|  | 23 | 
|  | 24     res = []; | 
|  | 25 | 
|  | 26     if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n) | 
|  | 27         try: | 
|  | 28             gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t'); | 
|  | 29         except: | 
|  | 30             return []; # exit, since we don't have 9 features | 
|  | 31 | 
|  | 32         # handle the gffSeqName | 
|  | 33         try: | 
|  | 34             chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]); | 
|  | 35         except: | 
|  | 36             return []; | 
|  | 37 | 
|  | 38         # handle gffStart | 
|  | 39         try: | 
|  | 40             probeStart = int(gffStart); | 
|  | 41         except ValueError: | 
|  | 42             return []; # exit, invalid gffStart | 
|  | 43 | 
|  | 44         # handle gffEnd | 
|  | 45         try: | 
|  | 46             probeEnd = int(gffEnd); | 
|  | 47         except ValueError: | 
|  | 48             return []; # exit, invalid gffEnd | 
|  | 49 | 
|  | 50         # handle gffScr | 
|  | 51         try: | 
|  | 52             probeScr = float(gffScr); | 
|  | 53         except ValueError: | 
|  | 54             return []; # exit, invalid gffScr | 
|  | 55 | 
|  | 56         # if the input string was well-formed, return the extracted features | 
|  | 57         res = [chrNo, probeStart, probeEnd, probeScr]; | 
|  | 58     return res; # return feature set (if any was found in this record) | 
|  | 59 | 
|  | 60 # Define Constants | 
|  | 61 tickPlotAdj = 0.5;    # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center) | 
|  | 62 | 
|  | 63 # Initialize Variables | 
|  | 64 feaNames = []; | 
|  | 65 feaChrNoList = []; | 
|  | 66 feaStrandList = []; | 
|  | 67 feaStartList = []; | 
|  | 68 feaEndList = []; | 
|  | 69 feaAttribList = []; | 
|  | 70 classBoundaryIdxs = []; | 
|  | 71 | 
|  | 72 gffChrNoList = []; | 
|  | 73 gffProbeCenterList = []; | 
|  | 74 gffMatScrList = []; | 
|  | 75 | 
|  | 76 # get command line arguments | 
|  | 77 gffFileName = sys.argv[1]; | 
|  | 78 feaFileName = sys.argv[2]; | 
|  | 79 outFileFormat = sys.argv[3]; | 
|  | 80 stepWidth = int(sys.argv[4]); | 
|  | 81 classBoundaryStr = sys.argv[5]; | 
|  | 82 upStreamStretch = int(sys.argv[6]); # in bp | 
|  | 83 maxX = int(sys.argv[7]);            # in bp, if -1 maxX <= max(feature length), see below | 
|  | 84 tickSpacing = int(sys.argv[8]);     # in bp | 
|  | 85 plotTitle = sys.argv[9]; | 
|  | 86 outFileName = sys.argv[10]; | 
|  | 87 | 
|  | 88 # Extract class boundaries | 
|  | 89 classBoundaries = map(string.atoi, re.findall(r'\b\d+\b',classBoundaryStr)); | 
|  | 90 | 
|  | 91 # Read transcript information | 
|  | 92 feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t'); | 
|  | 93 for row in feaFileReader: | 
|  | 94     feaName, feaChrNo, feaStrand, feaStart, feaEnd, feaAttrib = row; | 
|  | 95     feaNames.append(feaName); | 
|  | 96     feaChrNoList.append(int(feaChrNo)); | 
|  | 97     feaStrandList.append(int(feaStrand)); | 
|  | 98     feaStartList.append(int(feaStart)); | 
|  | 99     feaEndList.append(int(feaEnd)); | 
|  | 100     feaAttribList.append(float(feaAttrib)); | 
|  | 101 | 
|  | 102 # Read enrichment scores | 
|  | 103 gffFileHandle = open(gffFileName, "rU"); | 
|  | 104 for row in gffFileHandle: | 
|  | 105     cache = extractGffFeatures(row); | 
|  | 106     if len(cache) == 4: # if curr row wasn't well-formed, skip row | 
|  | 107         chrNo, probeStart, probeEnd, probeScr = cache; | 
|  | 108         gffChrNoList.append(chrNo); | 
|  | 109         gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe | 
|  | 110         gffMatScrList.append(probeScr); | 
|  | 111 | 
|  | 112 # convert numerical lists into vectors | 
|  | 113 feaChrNos = np.array(feaChrNoList); | 
|  | 114 feaStrands = np.array(feaStrandList); | 
|  | 115 feaStarts = np.array(feaStartList); | 
|  | 116 feaEnds = np.array(feaEndList); | 
|  | 117 feaAttribs = np.array(feaAttribList); | 
|  | 118 | 
|  | 119 gfChrNos = np.array(gffChrNoList); | 
|  | 120 gfProbeLocs = np.array(gffProbeCenterList); | 
|  | 121 gfMatScrs = np.array(gffMatScrList); | 
|  | 122 | 
|  | 123 # preallocate the results array, bins plus 2 extra cols for feature length and transfreq (or other freature characteristic) plus 1 col for sorting | 
|  | 124 avgFeatureMatScrs = np.empty((len(feaStarts),\ | 
|  | 125                           (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 3))),\ | 
|  | 126                          ); | 
|  | 127 avgFeatureMatScrs[:] = np.NAN; | 
|  | 128 | 
|  | 129 featureCounter = 0; | 
|  | 130 | 
|  | 131 for chr in range(min(feaChrNos),max(feaChrNos) + 1): | 
|  | 132     #get all rows from the feature file that contain information about the i'th chromosome | 
|  | 133     currChrIdxs = (feaChrNos == chr); | 
|  | 134     currStrands = feaStrands[currChrIdxs]; | 
|  | 135     currStarts = feaStarts[currChrIdxs]; | 
|  | 136     currEnds = feaEnds[currChrIdxs]; | 
|  | 137     currAttribs = feaAttribs[currChrIdxs]; | 
|  | 138 | 
|  | 139     #get all rows from ChIP-* data sets | 
|  | 140     currGfChrIdxs = (gfChrNos == chr); | 
|  | 141     currGfProbeLocs = gfProbeLocs[currGfChrIdxs]; | 
|  | 142     currGfScrs = gfMatScrs[currGfChrIdxs]; | 
|  | 143 | 
|  | 144     # iterate over all features of the current chromosome | 
|  | 145     for j in range(0,len(currStarts)): | 
|  | 146         feaStrand = currStrands[j]; | 
|  | 147         feaStart = currStarts[j]; | 
|  | 148         feaEnd = currEnds[j]; | 
|  | 149         feaAttrib = currAttribs[j]; | 
|  | 150 | 
|  | 151         #calc the corresponding probe positions for the feature start | 
|  | 152         if feaStrand == 1:   # +strand | 
|  | 153             refFeaSt = 0; | 
|  | 154             while feaStart >= currGfProbeLocs[refFeaSt]: | 
|  | 155                 refFeaSt += 1; | 
|  | 156         else:               # -strand | 
|  | 157             refFeaSt = 0; | 
|  | 158             while feaEnd >= currGfProbeLocs[refFeaSt]: | 
|  | 159                 refFeaSt += 1; | 
|  | 160         #now refFeaSt points to the correct probe according to strand orientation | 
|  | 161 | 
|  | 162         #calc the avg probe scores per bin | 
|  | 163         if feaStrand == 1: | 
|  | 164             feaChunkStart = feaStart;  # bp level pointer | 
|  | 165             refChunkStart = refFeaSt;  # probe level pointer | 
|  | 166             refChunkEnd = refFeaSt;    # probe level pointer | 
|  | 167 | 
|  | 168             # determine values for the upstream bins | 
|  | 169             for k in range(1,upStreamStretch/stepWidth + 1): | 
|  | 170                 feaChunkEnd = feaStart - k * stepWidth;  # bp level pointer | 
|  | 171 | 
|  | 172                 # find the correct ref for the current feaChunkEnd;  catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | 
|  | 173                 try: | 
|  | 174                     while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | 
|  | 175                         refChunkEnd -= 1; | 
|  | 176                 except: | 
|  | 177                     pass; # don't handle the exception further | 
|  | 178 | 
|  | 179                 #calc probe mean | 
|  | 180                 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]); | 
|  | 181 | 
|  | 182                 #adjust pointers | 
|  | 183                 refChunkStart = refChunkEnd; | 
|  | 184 | 
|  | 185             # determine values for bins in feature body | 
|  | 186             for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | 
|  | 187                 feaChunkEnd = feaStart + k*stepWidth;   # bp level pointer | 
|  | 188 | 
|  | 189                 try: | 
|  | 190                     while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | 
|  | 191                         refChunkEnd += 1; | 
|  | 192                 except: | 
|  | 193                     pass; # don't handle the exception further | 
|  | 194 | 
|  | 195                 # calc probe mean | 
|  | 196                 avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]); | 
|  | 197 | 
|  | 198                 # adjust chunk pointers | 
|  | 199                 refChunkStart = refChunkEnd; | 
|  | 200 | 
|  | 201             # write feature length and attrib in last cols of the array | 
|  | 202             avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | 
|  | 203             avgFeatureMatScrs[featureCounter, -2] = feaAttrib; | 
|  | 204             featureCounter += 1; | 
|  | 205         else: # -1 strand | 
|  | 206             feaChunkStart = feaEnd;    # bp level pointer | 
|  | 207             refChunkStart = refFeaSt;  # probe level pointer | 
|  | 208             refChunkEnd = refFeaSt;    # probe level pointer | 
|  | 209 | 
|  | 210             # determine values for the upStream region bins | 
|  | 211             for k in range(1,upStreamStretch/stepWidth + 1): | 
|  | 212                 feaChunkEnd = feaEnd + k*stepWidth;  # bp level pointer | 
|  | 213 | 
|  | 214                 # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary | 
|  | 215                 try: | 
|  | 216                     while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: | 
|  | 217                         refChunkEnd += 1; | 
|  | 218                 except: | 
|  | 219                     pass; | 
|  | 220 | 
|  | 221                 # calc probe mean | 
|  | 222                 avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]); | 
|  | 223 | 
|  | 224                 # adjust chunk pointers | 
|  | 225                 refChunkStart = refChunkEnd; | 
|  | 226 | 
|  | 227             # determine values for bins in feature body | 
|  | 228             for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): | 
|  | 229                 feaChunkEnd = feaEnd - k*stepWidth;   # bp level pointer | 
|  | 230 | 
|  | 231                 try: | 
|  | 232                     while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: | 
|  | 233                         refChunkEnd -= 1; | 
|  | 234                 except: | 
|  | 235                     pass; | 
|  | 236 | 
|  | 237                 # calc probe mean | 
|  | 238                 avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]); | 
|  | 239 | 
|  | 240                 # adjust chunk pointers | 
|  | 241                 refChunkStart = refChunkEnd; | 
|  | 242 | 
|  | 243             # write feature length and attrib in last cols of the array | 
|  | 244             avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; | 
|  | 245             avgFeatureMatScrs[featureCounter, -2] = feaAttrib; | 
|  | 246             featureCounter += 1; | 
|  | 247 | 
|  | 248 # determine idxs of class boundaries and write class value in respective cells | 
|  | 249 for c in range(len(classBoundaries)-1,-1,-1):   # reverse iterate through the boundaries list | 
|  | 250     idxs = np.nonzero(avgFeatureMatScrs[:,-2] <= classBoundaries[c])[0]; | 
|  | 251     avgFeatureMatScrs[idxs, -3] = c; | 
|  | 252 | 
|  | 253 # assign names and data format to all cols of the results array and sort by feature attrib | 
|  | 254 sortByClassCol = 'col' + str(np.size(avgFeatureMatScrs,1)-3); | 
|  | 255 sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1); | 
|  | 256 | 
|  | 257 matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]}; | 
|  | 258 avgFeatureMatScrs.dtype = matDtype; | 
|  | 259 avgFeatureMatScrs.sort(axis = 0, order = [sortByClassCol, sortByLengthCol]); | 
|  | 260 | 
|  | 261 # generate colormap | 
|  | 262 BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))}; | 
|  | 263 BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9); | 
|  | 264 | 
|  | 265 lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0); | 
|  | 266 upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0); | 
|  | 267 cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge;   # upper inner fence | 
|  | 268 | 
|  | 269 # remove custom array dtype used for sorting and plot avg scrs (without the last 3 cols) | 
|  | 270 imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-3], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim); | 
|  | 271 axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1); | 
|  | 272 | 
|  | 273 # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly | 
|  | 274 if maxX == -1: | 
|  | 275     maxX = max(feaEnds - feaStarts); | 
|  | 276     imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj); | 
|  | 277 else: | 
|  | 278     imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj); | 
|  | 279 | 
|  | 280 xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | 
|  | 281 xTicks.insert(-1,-tickPlotAdj); | 
|  | 282 xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; | 
|  | 283 xTicksLabel.insert(-1, str(-upStreamStretch)); | 
|  | 284 imgplot.axes.set_xticks(xTicks); | 
|  | 285 imgplot.axes.set_xticklabels(xTicksLabel); | 
|  | 286 | 
|  | 287 imgplot.axes.set_title(plotTitle); | 
|  | 288 imgplot.axes.set_xlabel('Distance from feature start (bp)'); | 
|  | 289 imgplot.axes.set_ylabel('Feature class (low to high)'); | 
|  | 290 imgplot.axes.set_yticks([]); | 
|  | 291 | 
|  | 292 imgplot.set_cmap(BYcmap); | 
|  | 293 colorbar(); | 
|  | 294 show(); | 
|  | 295 | 
|  | 296 savefig('chromatra_t_tmp_plot', format=outFileFormat); | 
|  | 297 data = file('chromatra_t_tmp_plot', 'rb').read(); | 
|  | 298 fp = open(outFileName, 'wb'); | 
|  | 299 fp.write(data); | 
|  | 300 fp.close(); | 
|  | 301 os.remove('chromatra_t_tmp_plot'); |