Mercurial > repos > cmmt > chromatra
view chromatral.py @ 9:743e69c1e122 default tip
Uploaded
author | cmmt |
---|---|
date | Mon, 23 Jan 2012 05:28:00 -0500 |
parents | 519e45d9539f |
children |
line wrap: on
line source
import os; import re; import csv; import sys; import math; import numpy as np; import matplotlib; matplotlib.use('Agg'); from pylab import *; def extractGffFeatures(gffString): # GFF v3 # 1. seqname - Must be a chromosome or scaffold. # 2. source - The program that generated this feature. # 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". # 4. start - The starting position of the feature in the sequence. The first base is numbered 1. # 5. end - The ending position of the feature (inclusive). # 6. score - A score between 0 and 1000. If there is no score value, enter ".". # 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). # 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. # 9. group - All lines with the same group are linked together into a single item. res = []; if len(gffString) > 1: # check for empty line (i.e. row in GFF file only contains \n) try: gffSeqName, gffSource, gffFeature, gffStart, gffEnd, gffScr, gffStrand, gffFrame, gffGroup = gffString.split('\t'); except: return []; # exit, since we don't have 9 features # handle the gffSeqName try: chrNo = int(re.findall(r'[0-9]+', gffSeqName)[0]); except: return []; # handle gffStart try: probeStart = int(gffStart); except ValueError: return []; # exit, invalid gffStart # handle gffEnd try: probeEnd = int(gffEnd); except ValueError: return []; # exit, invalid gffEnd # handle gffScr try: probeScr = float(gffScr); except ValueError: return []; # exit, invalid gffScr # if the input string was well-formed, return the extracted features res = [chrNo, probeStart, probeEnd, probeScr]; return res; # return feature set (if any was found in this record) # Define Constants tickPlotAdj = 0.5; # adjust tick marks on x-axis such that they are exactly at the bin boundary (and not in the bin center) # Initialize Variables feaNames = []; feaChrNoList = []; feaStrandList = []; feaStartList = []; feaEndList = []; gffChrNoList = []; gffProbeCenterList = []; gffMatScrList = []; # get command line arguments gffFileName = sys.argv[1]; feaFileName = sys.argv[2]; outFileFormat = sys.argv[3]; stepWidth = int(sys.argv[4]); upStreamStretch = int(sys.argv[5]); # in bp maxX = int(sys.argv[6]); # in bp, if -1 maxX <= max(feature length), see below tickSpacing = int(sys.argv[7]); # in bp plotTitle = sys.argv[8]; outFileName = sys.argv[9]; # Read transcript information feaFileReader = csv.reader(open(feaFileName, 'rb'), delimiter = '\t'); for row in feaFileReader: feaName, feaChrNo, feaStrand, feaStart, feaEnd = row; feaNames.append(feaName); feaChrNoList.append(int(feaChrNo)); feaStrandList.append(int(feaStrand)); feaStartList.append(int(feaStart)); feaEndList.append(int(feaEnd)); # Read enrichment scores gffFileHandle = open(gffFileName, "rU"); for row in gffFileHandle: cache = extractGffFeatures(row); if len(cache) == 4: # if curr row wasn't well-formed, skip row chrNo, probeStart, probeEnd, probeScr = cache; gffChrNoList.append(chrNo); gffProbeCenterList.append(int(float(probeEnd - probeStart)/2.0 + probeStart)); # calc the center pos of each probe gffMatScrList.append(probeScr); # convert numerical lists into vectors feaChrNos = np.array(feaChrNoList); feaStrands = np.array(feaStrandList); feaStarts = np.array(feaStartList); feaEnds = np.array(feaEndList); gfChrNos = np.array(gffChrNoList); gfProbeLocs = np.array(gffProbeCenterList); gfMatScrs = np.array(gffMatScrList); # preallocate the results array, bins and 1 extra cols for feature length avgFeatureMatScrs = np.empty((len(feaStarts),\ (int(math.ceil(float(max(feaEnds - feaStarts))/float(stepWidth)) + upStreamStretch/stepWidth + 1))),\ ); avgFeatureMatScrs[:] = np.NAN; featureCounter = 0; for chr in range(min(feaChrNos),max(feaChrNos) + 1): #get all rows from the feature file that contain information about the i'th chromosome currChrIdxs = (feaChrNos == chr); currStrands = feaStrands[currChrIdxs]; currStarts = feaStarts[currChrIdxs]; currEnds = feaEnds[currChrIdxs]; #get all rows from ChIP-* data sets currGfChrIdxs = (gfChrNos == chr); currGfProbeLocs = gfProbeLocs[currGfChrIdxs]; currGfScrs = gfMatScrs[currGfChrIdxs]; # iterate over all features of the current chromosome for j in range(0,len(currStarts)): feaStrand = currStrands[j]; feaStart = currStarts[j]; feaEnd = currEnds[j]; #calc the corresponding probe positions for the feature start if feaStrand == 1: # +strand refFeaSt = 0; while feaStart >= currGfProbeLocs[refFeaSt]: refFeaSt += 1; else: # -strand refFeaSt = 0; while feaEnd >= currGfProbeLocs[refFeaSt]: refFeaSt += 1; #now refFeaSt points to the correct probe according to strand orientation #calc the avg probe scores per bin if feaStrand == 1: feaChunkStart = feaStart; # bp level pointer refChunkStart = refFeaSt; # probe level pointer refChunkEnd = refFeaSt; # probe level pointer # determine values for the upstream bins for k in range(1,upStreamStretch/stepWidth + 1): feaChunkEnd = feaStart - k * stepWidth; # bp level pointer # find the correct ref for the current feaChunkEnd; catch potential IndexOutOfBoundEx when upstream region too close to chr boundary try: while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: refChunkEnd -= 1; except: pass; # don't handle the exception further #calc probe mean avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkEnd:refChunkStart + 1]); #adjust pointers refChunkStart = refChunkEnd; # determine values for bins in feature body for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): feaChunkEnd = feaStart + k*stepWidth; # bp level pointer try: while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: refChunkEnd += 1; except: pass; # don't handle the exception further # calc probe mean avgFeatureMatScrs[featureCounter, k - 1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkStart:refChunkEnd + 1]); # adjust chunk pointers refChunkStart = refChunkEnd; # write feature length in last col of the array avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; featureCounter += 1; else: # -1 strand feaChunkStart = feaEnd; # bp level pointer refChunkStart = refFeaSt; # probe level pointer refChunkEnd = refFeaSt; # probe level pointer # determine values for the upStream region bins for k in range(1,upStreamStretch/stepWidth + 1): feaChunkEnd = feaEnd + k*stepWidth; # bp level pointer # find the ref for the current feaChunkEnd, catch potential IndexOutOfBoundEx when upstream region too close to chr boundary try: while feaChunkEnd >= currGfProbeLocs[refChunkEnd]: refChunkEnd += 1; except: pass; # calc probe mean avgFeatureMatScrs[featureCounter, upStreamStretch/stepWidth - k] = np.mean(currGfScrs[refChunkStart:refChunkEnd+1]); # adjust chunk pointers refChunkStart = refChunkEnd; # determine values for bins in feature body for k in range(1, int(math.ceil(float(feaEnd - feaStart)/float(stepWidth)) + 1)): feaChunkEnd = feaEnd - k*stepWidth; # bp level pointer try: while feaChunkEnd <= currGfProbeLocs[refChunkEnd]: refChunkEnd -= 1; except: pass; # calc probe mean avgFeatureMatScrs[featureCounter, k-1 + upStreamStretch/stepWidth] = np.mean(currGfScrs[refChunkEnd:refChunkStart+1]); # adjust chunk pointers refChunkStart = refChunkEnd; # write feature length in last col of the array avgFeatureMatScrs[featureCounter, -1] = feaEnd - feaStart; featureCounter += 1; # assign names and data format to all cols of the results array and sort array by feature length (last column) sortByLengthCol = 'col' + str(np.size(avgFeatureMatScrs,1)-1); matDtype = {'names':['col%i'%i for i in range(np.size(avgFeatureMatScrs,1))],'formats':(np.size(avgFeatureMatScrs,1))*[np.float]}; avgFeatureMatScrs.dtype = matDtype; avgFeatureMatScrs.sort(axis = 0, order = sortByLengthCol); # generate colormap BYcmapData = {'red': ((0.0, 0.0, 0.094), (0.5, 0.0, 0.0), (1.0, 0.992, 1.0)), 'green': ((0.0, 0.0,0.658), (0.5, 0.0, 0.0), (1.0, 0.996, 1.0)), 'blue': ((0.0, 1.0, 0.828), (0.5, 0.0, 0.0), (1.0, 0.0, 0.0))}; BYcmap = matplotlib.colors.LinearSegmentedColormap('BYcmap', BYcmapData,9); lowerHinge = matplotlib.mlab.prctile(gfMatScrs, 25.0); upperHinge = matplotlib.mlab.prctile(gfMatScrs, 75.0); cmLim = (upperHinge - lowerHinge) * 1.5 + upperHinge; # upper inner fence # remove custom array dtype used for sorting and plot avgScrs (without the last col which stores the length info) imgplot = imshow(avgFeatureMatScrs.view(np.float)[:,0:-1], aspect='auto', interpolation='nearest', origin='lower', vmin = -cmLim, vmax = cmLim); axvline(x = int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj, color = 'w', linewidth = 1); # if maxX wasn't set by hand, set it to max feature length and set x-coord accordingly if maxX == -1: maxX = max(feaEnds - feaStarts); imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj); else: imgplot.axes.set_xbound(-tickPlotAdj, int(float(maxX)/float(stepWidth)) - tickPlotAdj); xTicks = [k * int(float(tickSpacing)/float(stepWidth)) + int(float(upStreamStretch)/float(stepWidth)) - tickPlotAdj for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; xTicks.insert(-1,-tickPlotAdj); xTicksLabel = [k * tickSpacing for k in range(0, int(math.ceil(float(maxX)/float(tickSpacing)))+1)]; xTicksLabel.insert(-1, str(-upStreamStretch)); imgplot.axes.set_xticks(xTicks); imgplot.axes.set_xticklabels(xTicksLabel); imgplot.axes.set_title(plotTitle); imgplot.axes.set_xlabel('Distance from feature start (bp)'); imgplot.axes.set_ylabel(''); imgplot.axes.set_yticks([]); imgplot.set_cmap(BYcmap); colorbar(); show(); savefig('chromatra_l_tmp_plot', format=outFileFormat); data = file('chromatra_l_tmp_plot', 'rb').read(); fp = open(outFileName, 'wb'); fp.write(data); fp.close(); os.remove('chromatra_l_tmp_plot');