Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/getSizes.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | 769e306b7933 |
| children | |
comparison
Legend: equal, deleted, inserted, replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 42 from commons.core.utils.RepetOptionParser import RepetOptionParser | 42 from commons.core.utils.RepetOptionParser import RepetOptionParser |
| 43 | 43 |
| 44 LOG_DEPTH = "smart" | 44 LOG_DEPTH = "smart" |
| 45 | 45 |
| 46 class GetSizes(object): | 46 class GetSizes(object): |
| 47 | 47 |
| 48 def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, csv=False, verbosity = 0): | 48 def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0): |
| 49 self.inFileName = inFileName | 49 self.inFileName = inFileName |
| 50 self.inFormat= inFormat | 50 self.inFormat= inFormat |
| 51 self.outFileName = outFileName | 51 self.outFileName = outFileName |
| 52 self.query = query | 52 self.query = query |
| 53 self.xMax = xMax | 53 self.xMax = xMax |
| 54 self.xMin = xMin | 54 self.xMin = xMin |
| 55 self.xLab = "Size" | 55 self.xLab = "Size" |
| 56 self.yLab = "# reads" | 56 self.yLab = "# reads" |
| 57 self.barplot = False | 57 self.barplot = False |
| 58 self.csv = csv | 58 self._verbosity = verbosity |
| 59 self._verbosity = verbosity | 59 self.parser = None |
| 60 self.parser = None | 60 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) |
| 61 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | 61 |
| 62 | 62 def setAttributesFromCmdLine(self): |
| 63 def setAttributesFromCmdLine(self): | 63 description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" |
| 64 description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" | 64 epilog = "" |
| 65 epilog = "" | 65 parser = RepetOptionParser(description = description, epilog = epilog) |
| 66 parser = RepetOptionParser(description = description, epilog = epilog) | 66 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") |
| 67 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") | 67 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") |
| 68 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") | 68 parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") |
| 69 parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") | 69 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") |
| 70 parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") | 70 parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") |
| 71 parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") | 71 parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") |
| 72 parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") | 72 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") |
| 73 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") | 73 parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") |
| 74 parser.add_option("-c", "--csv", dest="csv", action="store", type="string", help="write a .csv file [format: bool] [default: false]") | 74 parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") |
| 75 parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") | 75 parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") |
| 76 parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") | 76 options = parser.parse_args()[0] |
| 77 parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") | 77 self._setAttributesFromOptions(options) |
| 78 options = parser.parse_args()[0] | 78 |
| 79 self._setAttributesFromOptions(options) | 79 def _setAttributesFromOptions(self, options): |
| 80 | 80 self.setInFileName(options.inputFileName) |
| 81 def _setAttributesFromOptions(self, options): | 81 self.setInFormat(options.format) |
| 82 self.setInFileName(options.inputFileName) | 82 self.setQuery(options.query) |
| 83 self.setInFormat(options.format) | 83 self.setOutFileName(options.outputFileName) |
| 84 self.setQuery(options.query) | 84 self.setXMax(options.xMax) |
| 85 self.setOutFileName(options.outputFileName) | 85 self.setXMin(options.xMin) |
| 86 self.setXMax(options.xMax) | 86 self.setxLab(options.xLab) |
| 87 self.setXMin(options.xMin) | 87 self.setyLab(options.yLab) |
| 88 self.setxLab(options.xLab) | 88 self.setBarplot(options.barplot) |
| 89 self.setyLab(options.yLab) | 89 self.setVerbosity(options.verbosity) |
| 90 self.setBarplot(options.barplot) | 90 |
| 91 self.setVerbosity(options.verbosity) | 91 def setInFileName(self, inputFileName): |
| 92 | 92 self.inFileName = inputFileName |
| 93 def setInFileName(self, inputFileName): | 93 |
| 94 self.inFileName = inputFileName | 94 def setInFormat(self, inFormat): |
| 95 | 95 self.inFormat = inFormat |
| 96 def setInFormat(self, inFormat): | 96 |
| 97 self.inFormat = inFormat | 97 def setQuery(self, query): |
| 98 | 98 self.query = query |
| 99 def setQuery(self, query): | 99 |
| 100 self.query = query | 100 def setOutFileName(self, outFileName): |
| 101 | 101 self.outFileName = outFileName |
| 102 def setOutFileName(self, outFileName): | 102 |
| 103 self.outFileName = outFileName | 103 def setXMax(self, xMax): |
| 104 | 104 self.xMax = xMax |
| 105 def setXMax(self, xMax): | 105 |
| 106 self.xMax = xMax | 106 def setXMin(self, xMin): |
| 107 | 107 self.xMin = xMin |
| 108 def setXMin(self, xMin): | 108 |
| 109 self.xMin = xMin | 109 def setxLab(self, xLab): |
| 110 | 110 self.xLab = xLab |
| 111 def setxLab(self, xLab): | 111 |
| 112 self.xLab = xLab | 112 def setyLab(self, yLab): |
| 113 | 113 self.yLab = yLab |
| 114 def setyLab(self, yLab): | 114 |
| 115 self.yLab = yLab | 115 def setBarplot(self, barplot): |
| 116 | 116 self.barplot = barplot |
| 117 def setBarplot(self, barplot): | 117 |
| 118 self.barplot = barplot | 118 def setVerbosity(self, verbosity): |
| 119 | 119 self._verbosity = verbosity |
| 120 def setCsv(self, csv): | 120 |
| 121 self.csv = csv | 121 def _checkOptions(self): |
| 122 | 122 if self.inFileName == None: |
| 123 def setVerbosity(self, verbosity): | 123 self._logAndRaise("ERROR: Missing input file name") |
| 124 self._verbosity = verbosity | 124 if self.inFormat == "fasta": |
| 125 | 125 self.parser = FastaParser(self.inFileName, self._verbosity) |
| 126 def _checkOptions(self): | 126 elif self.inFormat == "fastq": |
| 127 if self.inFileName == None: | 127 self.parser = FastqParser(self.inFileName, self._verbosity) |
| 128 self._logAndRaise("ERROR: Missing input file name") | 128 else: |
| 129 if self.inFormat == "fasta": | 129 self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) |
| 130 self.parser = FastaParser(self.inFileName, self._verbosity) | 130 |
| 131 elif self.inFormat == "fastq": | 131 def _logAndRaise(self, errorMsg): |
| 132 self.parser = FastqParser(self.inFileName, self._verbosity) | 132 self._log.error(errorMsg) |
| 133 else: | 133 raise Exception(errorMsg) |
| 134 self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) | 134 |
| 135 | 135 def run(self): |
| 136 def _logAndRaise(self, errorMsg): | 136 LoggerFactory.setLevel(self._log, self._verbosity) |
| 137 self._log.error(errorMsg) | 137 self._checkOptions() |
| 138 raise Exception(errorMsg) | 138 self._log.info("START getsizes") |
| 139 | 139 self._log.debug("Input file name: %s" % self.inFileName) |
| 140 def run(self): | 140 |
| 141 LoggerFactory.setLevel(self._log, self._verbosity) | 141 nbItems = self.parser.getNbItems() |
| 142 self._checkOptions() | 142 self._log.info( "%i items found" % (nbItems)) |
| 143 self._log.info("START getsizes") | 143 |
| 144 self._log.debug("Input file name: %s" % self.inFileName) | 144 # treat items |
| 145 | 145 progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) |
| 146 nbItems = self.parser.getNbItems() | 146 sizes = {} |
| 147 self._log.info( "%i items found" % (nbItems)) | 147 minimum = 1000000000000 |
| 148 | 148 maximum = 0 |
| 149 # treat items | 149 sum = 0 |
| 150 progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) | 150 number = 0 |
| 151 sizes = {} | 151 nbSubItems = 0 |
| 152 names = {} | 152 for item in self.parser.getIterator(): |
| 153 minimum = 1000000000000 | 153 items = [] |
| 154 maximum = 0 | 154 if self.query == "exon": |
| 155 sum = 0 | 155 items = item.getExons() |
| 156 number = 0 | 156 elif self.query == "exon1": |
| 157 nbSubItems = 0 | 157 if len(item.getExons()) > 1: |
| 158 for item in self.parser.getIterator(): | 158 item.sortExons() |
| 159 items = [] | 159 items = [item.getExons()[0]] |
| 160 if self.query == "exon": | 160 elif self.query == "intron": |
| 161 items = item.getExons() | 161 items = item.getIntrons() |
| 162 elif self.query == "exon1": | 162 else: |
| 163 if len(item.getExons()) > 1: | 163 items = [item, ] |
| 164 item.sortExons() | 164 |
| 165 items = [item.getExons()[0]] | 165 for thisItem in items: |
| 166 elif self.query == "intron": | 166 try: |
| 167 items = item.getIntrons() | 167 nbElements = int(float(thisItem.getTagValue("nbElements"))) |
| 168 else: | 168 if nbElements == None: |
| 169 items = [item, ] | 169 nbElements = 1 |
| 170 | 170 except: |
| 171 for thisItem in items: | 171 nbElements = 1 |
| 172 try: | 172 size = thisItem.getSize() |
| 173 nbElements = int(float(thisItem.getTagValue("nbElements"))) | 173 minimum = min(minimum, size) |
| 174 if nbElements == None: | 174 maximum = max(maximum, size) |
| 175 nbElements = 1 | 175 |
| 176 except: | 176 if size not in sizes: |
| 177 nbElements = 1 | 177 sizes[size] = nbElements |
| 178 size = thisItem.getSize() | 178 else: |
| 179 minimum = min(minimum, size) | 179 sizes[size] += nbElements |
| 180 maximum = max(maximum, size) | 180 sum += size |
| 181 name = thisItem.name.split()[0] | 181 nbSubItems += nbElements |
| 182 | 182 number += 1 |
| 183 if size not in sizes: | 183 progress.inc() |
| 184 sizes[size] = nbElements | 184 progress.done() |
| 185 if self.csv: | 185 |
| 186 names[size] = [name, ] | 186 if self.outFileName != None: |
| 187 else: | 187 plotter = RPlotter(self.outFileName, self._verbosity) |
| 188 sizes[size] += nbElements | 188 plotter.setFill(0) |
| 189 if self.csv: | 189 plotter.setMinimumX(self.xMin) |
| 190 names[size].append(name) | 190 plotter.setMaximumX(self.xMax) |
| 191 sum += size | 191 plotter.setXLabel(self.xLab) |
| 192 nbSubItems += nbElements | 192 plotter.setYLabel(self.yLab) |
| 193 number += 1 | 193 plotter.setBarplot(self.barplot) |
| 194 progress.inc() | 194 plotter.addLine(sizes) |
| 195 progress.done() | 195 plotter.plot() |
| 196 | 196 |
| 197 if self.outFileName != None: | 197 if nbSubItems == 0: |
| 198 plotter = RPlotter(self.outFileName, self._verbosity) | 198 self._logAndRaise("No item found") |
| 199 plotter.setFill(0) | 199 |
| 200 plotter.setMinimumX(self.xMin) | 200 self.items = number |
| 201 plotter.setMaximumX(self.xMax) | 201 self.subItems = nbSubItems |
| 202 plotter.setXLabel(self.xLab) | 202 self.nucleotides = sum |
| 203 plotter.setYLabel(self.yLab) | 203 self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) |
| 204 plotter.setBarplot(self.barplot) | 204 |
| 205 plotter.addLine(sizes) | 205 print "%d items" % (number) |
| 206 plotter.plot() | 206 print "%d sub-items" % (nbSubItems) |
| 207 | 207 print "%d nucleotides" % (sum) |
| 208 if nbSubItems == 0: | 208 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) |
| 209 self._logAndRaise("No item found") | 209 |
| 210 | 210 self._log.info("END getsizes") |
| 211 if self.csv: | |
| 212 csvHandle = open(self.csv, "w") | |
| 213 for size in range(min(sizes.keys()), max(sizes.keys())+1): | |
| 214 if size not in sizes: | |
| 215 csvHandle.write("%d,0,\n" % (size)) | |
| 216 else: | |
| 217 csvHandle.write("%d,%d,%s\n" % (size, sizes[size], ";".join(names[size]))) | |
| 218 csvHandle.close() | |
| 219 | |
| 220 self.items = number | |
| 221 self.subItems = nbSubItems | |
| 222 self.nucleotides = sum | |
| 223 self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) | |
| 224 | |
| 225 print "%d items" % (number) | |
| 226 print "%d sub-items" % (nbSubItems) | |
| 227 print "%d nucleotides" % (sum) | |
| 228 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) | |
| 229 | |
| 230 self._log.info("END getsizes") | |
| 231 | 211 |
| 232 | 212 |
| 233 if __name__ == "__main__": | 213 if __name__ == "__main__": |
| 234 iGetSizes = GetSizes() | 214 iGetSizes = GetSizes() |
| 235 iGetSizes.setAttributesFromCmdLine() | 215 iGetSizes.setAttributesFromCmdLine() |
| 236 iGetSizes.run() | 216 iGetSizes.run() |
| 237 | 217 |
| 238 #TODO: add two more options!!!!!! | 218 #TODO: add two more options!!!!!! |
