comparison smart_toolShed/SMART/Java/Python/mergeSlidingWindowsClusters.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox that manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 """
32 Merge sliding windows of two different clusterings
33 """
34
35 import sys
36 import re
37 import os
38 from optparse import OptionParser
39 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
40 from commons.core.writer.Gff3Writer import Gff3Writer
41 from SMART.Java.Python.misc.Progress import Progress
42 from SMART.Java.Python.structure.Transcript import Transcript
43
class MergeSlidingWindowsClusters(object):
    """
    Merge the output of several sets of sliding windows.

    Regions read from the inputs are grouped by chromosome, direction, start
    and end.  Regions that share all four coordinates are collapsed into a
    single output region whose tags are the union of the tags of the merged
    regions (a later value overwrites an earlier one for the same tag name).
    Data is read and written one chromosome at a time, and the intermediate
    storage is reset after each chromosome has been written.
    """

    def __init__(self, verbosity = 0):
        """
        Create an empty merger.

        @param verbosity: trace level forwarded to the readers, the writer
                          and the progress bars
        """
        self.verbosity = verbosity
        # One TranscriptContainer per call to addInput().
        self.inputs = []
        # outputData[chromosome][direction][start][end] -> merged tags
        self.outputData = {}
        # Number of distinct regions stored so far / already written.
        self.nbData = 0
        self.nbWrittenData = 0
        # Chromosomes of all the inputs, in first-seen order.
        self.chromosomes = []
        self.writer = None

    def __del__(self):
        self._closeWriter()

    def _closeWriter(self):
        """
        Close the output writer at most once.

        The reference is dropped before closing so that a later call (for
        instance from __del__ after merge() has run) is a no-op.
        """
        # getattr: __del__ may run on a partially initialised instance.
        writer = getattr(self, "writer", None)
        if writer is not None:
            self.writer = None
            writer.close()

    def addInput(self, fileName, fileFormat):
        """
        Register an input file and record the chromosomes it declares.

        @param fileName:   name of the input file
        @param fileFormat: format of the input file
        """
        container = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.inputs.append(container)
        # Keep first-seen order instead of the arbitrary order of a set
        # union, so that chromosomes are processed in a reproducible order.
        known = set(self.chromosomes)
        for chromosome in container.getChromosomes():
            if chromosome not in known:
                known.add(chromosome)
                self.chromosomes.append(chromosome)

    def setOutput(self, fileName):
        """
        Open the GFF3 output file.

        @param fileName: name of the output file
        """
        self.writer = Gff3Writer(fileName, self.verbosity)

    def readInput(self, i, chromosome):
        """
        Store the regions of input #i that lie on the given chromosome.

        The whole input is iterated; transcripts located on another
        chromosome are skipped.

        @param i:          index of the input in self.inputs
        @param chromosome: name of the chromosome to collect
        """
        container = self.inputs[i]
        progress = Progress(container.getNbTranscripts(), "Reading file #%d -- chromosome %s" % (i+1, chromosome), self.verbosity)
        for transcript in container.getIterator():
            progress.inc()
            if chromosome != transcript.getChromosome():
                continue
            start = transcript.getStart()
            end = transcript.getEnd()
            direction = transcript.getDirection()
            tags = transcript.tags
            regions = self.outputData.setdefault(chromosome, {}).setdefault(direction, {}).setdefault(start, {})
            if end in regions:
                # NOTE(review): this check only runs when 'end' is already
                # recorded, so a first region with the same start but a
                # different end is stored silently by the branch below --
                # confirm whether that case should be rejected as well.
                # next(iter(...)) replaces keys()[0], which fails on Python 3.
                firstEnd = next(iter(regions))
                if firstEnd != end:
                    sys.exit("Error! Two regions starting at %d end are not consistent (%d and %d) in %s on strand %d" % (start, end, firstEnd, chromosome, direction))
                regions[end].update(tags)
            else:
                # Store a copy: the dictionary is updated in place when
                # another region with the same coordinates is merged in, and
                # that must not modify the tags of the source transcript.
                regions[end] = dict(tags)
                self.nbData += 1
        progress.done()

    def writeOutput(self, chromosome):
        """
        Write every region stored for the chromosome, then reset the storage.

        Regions are emitted sorted by direction, start and end, so that the
        order of the output and the "region_N" names are reproducible.

        @param chromosome: name of the chromosome to write
        """
        progress = Progress(self.nbData - self.nbWrittenData, "Writing output for chromosome %s" % (chromosome), self.verbosity)
        # .get(): a chromosome for which no region was read has no entry.
        chromosomeData = self.outputData.get(chromosome, {})
        for direction in sorted(chromosomeData):
            starts = chromosomeData[direction]
            for start in sorted(starts):
                ends = starts[start]
                for end in sorted(ends):
                    transcript = Transcript()
                    transcript.setChromosome(chromosome)
                    transcript.setStart(start)
                    transcript.setEnd(end)
                    transcript.setDirection(direction)
                    transcript.tags = ends[end]
                    transcript.setName("region_%d" % (self.nbWrittenData + 1))
                    # Iterate over a snapshot of the names: tags are deleted
                    # inside the loop.
                    for tag in list(transcript.getTagNames()):
                        if tag.startswith(("Name_", "ID_")):
                            del transcript.tags[tag]
                    self.nbWrittenData += 1
                    self.writer.addTranscript(transcript)
                    progress.inc()
        self.writer.write()
        progress.done()
        self.outputData = {}

    def merge(self):
        """
        Merge all the registered inputs, chromosome by chromosome, and close
        the output.
        """
        for chromosome in self.chromosomes:
            for i in range(len(self.inputs)):
                self.readInput(i, chromosome)
            self.writeOutput(chromosome)
        self._closeWriter()
124
125
if __name__ == "__main__":

    # parse command line
    description = "Merge Sliding Windows Clusters v1.0.2: Merge two files containing the results of a sliding windows clustering. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of the input file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of the input file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # The options documented as compulsory have no default value: report a
    # missing one as a usage error instead of passing None to the readers
    # and to the writer.
    compulsoryOptions = (
        ("inputFileName1", "-i/--input1"),
        ("inputFormat1", "-f/--inputFormat1"),
        ("inputFileName2", "-j/--input2"),
        ("inputFormat2", "-g/--inputFormat2"),
        ("outputFileName", "-o/--output"),
    )
    missingOptions = [flag for (dest, flag) in compulsoryOptions if getattr(options, dest) is None]
    if missingOptions:
        parser.error("missing compulsory option(s): %s" % ", ".join(missingOptions))

    # merge the two inputs into the output file
    merger = MergeSlidingWindowsClusters(options.verbosity)
    merger.addInput(options.inputFileName1, options.inputFormat1)
    merger.addInput(options.inputFileName2, options.inputFormat2)
    merger.setOutput(options.outputFileName)
    merger.merge()