comparison SMART/Java/Python/cleanGff.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents cd852f3e04ab
children
comparison
equal deleted inserted replaced
45:e454402ba9d9 46:169d364ddd91
41 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress 41 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
42 42
43 count = {} 43 count = {}
44 44
45 class ParsedLine(object): 45 class ParsedLine(object):
46 def __init__(self, line, cpt): 46 def __init__(self, line, cpt):
47 self.line = line 47 self.line = line
48 self.cpt = cpt 48 self.cpt = cpt
49 self.parse() 49 self.parse()
50 50
51 def parse(self): 51 def parse(self):
52 self.line = self.line.strip() 52 self.line = self.line.strip()
53 self.splittedLine = self.line.split(None, 8) 53 self.splittedLine = self.line.split(None, 8)
54 if len(self.splittedLine) < 9: 54 if len(self.splittedLine) < 9:
55 raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line)) 55 raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
56 self.type = self.splittedLine[2] 56 self.type = self.splittedLine[2]
57 self.parseOptions() 57 self.parseOptions()
58 self.getId() 58 self.getId()
59 self.getParents() 59 self.getParents()
60 60
61 def parseOptions(self): 61 def parseOptions(self):
62 self.parsedOptions = {} 62 self.parsedOptions = {}
63 for option in self.splittedLine[8].split(";"): 63 for option in self.splittedLine[8].split(";"):
64 option = option.strip() 64 option = option.strip()
65 if option == "": continue 65 if option == "": continue
66 posSpace = option.find(" ") 66 posSpace = option.find(" ")
67 posEqual = option.find("=") 67 posEqual = option.find("=")
68 if posEqual != -1 and (posEqual < posSpace or posSpace == -1): 68 if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
69 key, value = option.split("=", 1) 69 key, value = option.split("=", 1)
70 elif posSpace != -1: 70 elif posSpace != -1:
71 key, value = option.split(None, 1) 71 key, value = option.split(None, 1)
72 else: 72 else:
73 key = "ID" 73 key = "ID"
74 value = option 74 value = option
75 self.parsedOptions[key.strip()] = value.strip(" \"") 75 self.parsedOptions[key.strip()] = value.strip(" \"")
76 76
77 def getId(self): 77 def getId(self):
78 for key in self.parsedOptions: 78 for key in self.parsedOptions:
79 if key.lower() == "id": 79 if key.lower() == "id":
80 self.id = self.parsedOptions[key] 80 self.id = self.parsedOptions[key]
81 return 81 return
82 if "Parent" in self.parsedOptions: 82 if "Parent" in self.parsedOptions:
83 parent = self.parsedOptions["Parent"].split(",")[0] 83 parent = self.parsedOptions["Parent"].split(",")[0]
84 if parent not in count: 84 if parent not in count:
85 count[parent] = {} 85 count[parent] = {}
86 if self.type not in count[parent]: 86 if self.type not in count[parent]:
87 count[parent][self.type] = 0 87 count[parent][self.type] = 0
88 count[parent][self.type] += 1 88 count[parent][self.type] += 1
89 self.id = "%s-%s-%d" % (parent, self.type, count[parent][self.type]) 89 self.id = "%s-%s-%d" % (parent, self.type, count[parent][self.type])
90 else: 90 else:
91 self.id = "smart%d" % (self.cpt) 91 self.id = "smart%d" % (self.cpt)
92 self.parsedOptions["ID"] = self.id 92 self.parsedOptions["ID"] = self.id
93 93
94 def getParents(self): 94 def getParents(self):
95 for key in self.parsedOptions: 95 for key in self.parsedOptions:
96 if key.lower() in ("parent", "derives_from"): 96 if key.lower() in ("parent", "derives_from"):
97 self.parents = self.parsedOptions[key].split(",") 97 self.parents = self.parsedOptions[key].split(",")
98 return 98 return
99 self.parents = None 99 self.parents = None
100 100
101 def removeParent(self): 101 def removeParent(self):
102 for key in self.parsedOptions.keys(): 102 for key in self.parsedOptions.keys():
103 if key.lower() in ("parent", "derives_from"): 103 if key.lower() in ("parent", "derives_from"):
104 del self.parsedOptions[key] 104 del self.parsedOptions[key]
105 105
106 def export(self): 106 def export(self):
107 self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.iteritems()]) 107 self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.iteritems()])
108 return "%s\n" % ("\t".join(self.splittedLine)) 108 return "%s\n" % ("\t".join(self.splittedLine))
109 109
110 110
111 class CleanGff(object): 111 class CleanGff(object):
112 112
113 def __init__(self, verbosity = 1): 113 def __init__(self, verbosity = 1):
114 self.verbosity = verbosity 114 self.verbosity = verbosity
115 self.lines = {} 115 self.lines = {}
116 self.acceptedTypes = [] 116 self.acceptedTypes = []
117 self.parents = [] 117 self.parents = []
118 self.children = {} 118 self.children = {}
119 119
120 def setInputFileName(self, name): 120 def setInputFileName(self, name):
121 self.inputFile = open(name) 121 self.inputFile = open(name)
122 122
123 def setOutputFileName(self, name): 123 def setOutputFileName(self, name):
124 self.outputFile = open(name, "w") 124 self.outputFile = open(name, "w")
125 125
126 def setAcceptedTypes(self, types): 126 def setAcceptedTypes(self, types):
127 self.acceptedTypes = types 127 self.acceptedTypes = types
128 128
129 def parse(self): 129 def parse(self):
130 progress = UnlimitedProgress(100000, "Reading input file", self.verbosity) 130 progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
131 for cpt, line in enumerate(self.inputFile): 131 for cpt, line in enumerate(self.inputFile):
132 if not line or line[0] == "#": continue 132 if not line or line[0] == "#": continue
133 if line[0] == ">": break 133 if line[0] == ">": break
134 parsedLine = ParsedLine(line, cpt) 134 parsedLine = ParsedLine(line, cpt)
135 if parsedLine.type in self.acceptedTypes: 135 if parsedLine.type in self.acceptedTypes:
136 if parsedLine.id in self.lines: 136 self.lines[parsedLine.id] = parsedLine
137 cpt = 1 137 progress.inc()
138 while "%s-%d" % (parsedLine.id, cpt) in self.lines: 138 progress.done()
139 cpt += 1
140 parsedLine.id = "%s-%d" % (parsedLine.id, cpt)
141 self.lines[parsedLine.id] = parsedLine
142 progress.inc()
143 progress.done()
144 139
145 def sort(self): 140 def sort(self):
146 progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity) 141 progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity)
147 for line in self.lines.values(): 142 for line in self.lines.values():
148 parentFound = False 143 parentFound = False
149 if line.parents: 144 if line.parents:
150 for parent in line.parents: 145 for parent in line.parents:
151 if parent in self.lines: 146 if parent in self.lines:
152 parentFound = True 147 parentFound = True
153 if parent in self.children: 148 if parent in self.children:
154 self.children[parent].append(line) 149 self.children[parent].append(line)
155 else: 150 else:
156 self.children[parent] = [line] 151 self.children[parent] = [line]
157 if not parentFound: 152 if not parentFound:
158 line.removeParent() 153 line.removeParent()
159 self.parents.append(line) 154 self.parents.append(line)
160 progress.inc() 155 progress.inc()
161 progress.done() 156 progress.done()
162 157
163 def write(self): 158 def write(self):
164 progress = Progress(len(self.parents), "Writing output file", self.verbosity) 159 progress = Progress(len(self.parents), "Writing output file", self.verbosity)
165 for line in self.parents: 160 for line in self.parents:
166 self.writeLine(line) 161 self.writeLine(line)
167 progress.inc() 162 progress.inc()
168 self.outputFile.close() 163 self.outputFile.close()
169 progress.done() 164 progress.done()
170 165
171 def writeLine(self, line): 166 def writeLine(self, line):
172 self.outputFile.write(line.export()) 167 self.outputFile.write(line.export())
173 if line.id in self.children: 168 if line.id in self.children:
174 for child in self.children[line.id]: 169 for child in self.children[line.id]:
175 self.writeLine(child) 170 self.writeLine(child)
176 171
177 def run(self): 172 def run(self):
178 self.parse() 173 self.parse()
179 self.sort() 174 self.sort()
180 self.write() 175 self.write()
181 176
182 177
183 if __name__ == "__main__": 178 if __name__ == "__main__":
184 179
185 # parse command line 180 # parse command line
186 description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]" 181 description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]"
187 182
188 parser = OptionParser(description = description) 183 parser = OptionParser(description = description)
189 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]") 184 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]")
190 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") 185 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
191 parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]") 186 parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]")
192 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 187 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
193 (options, args) = parser.parse_args() 188 (options, args) = parser.parse_args()
194 189
195 cleanGff = CleanGff(options.verbosity) 190 cleanGff = CleanGff(options.verbosity)
196 cleanGff.setInputFileName(options.inputFileName) 191 cleanGff.setInputFileName(options.inputFileName)
197 cleanGff.setOutputFileName(options.outputFileName) 192 cleanGff.setOutputFileName(options.outputFileName)
198 cleanGff.setAcceptedTypes(options.types.split(",")) 193 cleanGff.setAcceptedTypes(options.types.split(","))
199 cleanGff.run() 194 cleanGff.run()
200 195