comparison planemo/lib/python3.7/site-packages/galaxy/util/rules_dsl.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 import abc
2 import itertools
3 import re
4
5 import six
6 from six.moves import map
7
8 from galaxy.util import strip_control_characters_nested
9
10
11 def _ensure_rule_contains_keys(rule, keys):
12 for key, instance_class in keys.items():
13 if key not in rule:
14 raise ValueError("Rule of type [%s] does not contain key [%s]." % (rule["type"], key))
15 value = rule[key]
16 if not isinstance(value, instance_class):
17 raise ValueError("Rule of type [%s] does not contain correct value type for key [%s]." % (rule["type"], key))
18
19
20 def _ensure_key_value_in(rule, key, values):
21 value = rule[key]
22 if value not in values:
23 raise ValueError("Invalid value [%s] for [%s] encountered." % (value, key))
24
25
26 def _ensure_valid_pattern(expression):
27 re.compile(expression)
28
29
def apply_regex(regex, target, data, replacement=None, group_count=None):
    """Append regex-derived column(s) to every row of ``data``.

    The pattern is searched (not anchored) in ``row[target]`` for each row.

    :param regex: regular expression source string.
    :param target: index of the column the expression is applied to.
    :param data: iterable of rows (lists); rows are not modified in place.
    :param replacement: optional template passed to ``match.expand``; when given,
                        the expanded template is appended as a single new column.
    :param group_count: when truthy (and no replacement is given), the number of
                        capture groups the pattern must have; each group is then
                        appended as its own column. Otherwise the whole match is appended.
    :returns: list of new rows.
    :raises Exception: if the expression does not match a row's target value, or
                       if the number of groups differs from ``group_count``.
    """
    pattern = re.compile(regex)

    def new_row(row):
        source = row[target]
        match = pattern.search(source)
        # A failed match is reported the same way in every mode; previously the
        # replacement mode dereferenced the missing match and failed with an
        # unhelpful AttributeError.
        if not match:
            raise Exception("Problem applying regular expression [%s] to [%s]." % (regex, source))

        if replacement is not None:
            return row + [match.expand(replacement)]

        if group_count:
            if len(match.groups()) != group_count:
                raise Exception("Problem applying regular expression, wrong number of groups found.")
            return row + list(match.groups())

        return row + [match.group(0)]

    return [new_row(row) for row in data]
54
55
@six.add_metaclass(abc.ABCMeta)
class BaseRuleDefinition(object):
    """Abstract interface implemented by every rule type in this module.

    Concrete subclasses supply ``rule_type`` plus the ``validate_rule`` and
    ``apply`` methods.
    """

    @abc.abstractproperty
    def rule_type(self):
        """Short string describing type of rule (plugin class) to use."""

    @abc.abstractmethod
    def validate_rule(self, rule):
        """Validate dictified rule definition of this type."""

    @abc.abstractmethod
    def apply(self, rule, data, sources):
        """Apply validated, dictified rule definition to supplied data."""
70
71
class AddColumnMetadataRuleDefinition(BaseRuleDefinition):
    """Append a column taken from the source entry paired with each row.

    ``rule["value"]`` selects what is appended:

    * ``"identifier<N>"`` - ``sources[i]["identifiers"][N]``
    * ``"tags"`` - the source's tags, sorted and joined with commas
    """
    rule_type = "add_column_metadata"

    def validate_rule(self, rule):
        """Require a string ``value`` entry."""
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with one metadata column appended to each row.

        :raises ValueError: if ``rule["value"]`` is not a recognized metadata
                            selector (previously this surfaced as an
                            ``UnboundLocalError``), or if the text after
                            ``"identifier"`` is not an integer.
        """
        rule_value = rule["value"]
        if rule_value.startswith("identifier"):
            identifier_index = int(rule_value[len("identifier"):])
            new_rows = [
                row + [sources[index]["identifiers"][identifier_index]]
                for index, row in enumerate(data)
            ]
        elif rule_value == "tags":
            new_rows = [
                row + [",".join(sorted(sources[index]["tags"]))]
                for index, row in enumerate(data)
            ]
        else:
            raise ValueError("Invalid value [%s] for [%s] encountered." % (rule_value, "value"))

        return new_rows, sources
98
99
class AddColumnGroupTagValueRuleDefinition(BaseRuleDefinition):
    """Append the value of a ``group:<name>:<value>`` tag found on each row's source."""
    rule_type = "add_column_group_tag_value"

    def validate_rule(self, rule):
        """Require a string ``value`` entry (the group name)."""
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the group tag value appended to each row.

        The first tag (in sorted order) starting with ``group:<value>:`` wins;
        when none matches, ``rule["default_value"]`` (or ``""``) is used.
        """
        prefix = "group:%s:" % rule["value"]
        prefix_length = len(prefix)

        extended = []
        for position, row in enumerate(data):
            candidates = sorted(sources[position]["tags"])
            found = next(
                (tag[prefix_length:] for tag in candidates if tag.startswith(prefix)),
                None,
            )
            if found is None:
                found = rule.get("default_value", "")
            extended.append(row + [found])

        return extended, sources
126
127
class AddColumnConcatenateRuleDefinition(BaseRuleDefinition):
    """Append a column holding two existing columns joined with ``+``."""
    rule_type = "add_column_concatenate"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` entries."""
        _ensure_rule_contains_keys(rule, {"target_column_0": int, "target_column_1": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with ``row[c0] + row[c1]`` appended to each row."""
        first = rule["target_column_0"]
        second = rule["target_column_1"]
        return [row + [row[first] + row[second]] for row in data], sources
143
144
class AddColumnBasenameRuleDefinition(BaseRuleDefinition):
    """Append the portion of a column's value that follows its last ``/``."""
    rule_type = "add_column_basename"

    def validate_rule(self, rule):
        """Require an integer ``target_column`` entry."""
        _ensure_rule_contains_keys(rule, {"target_column": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the trailing non-slash run appended."""
        # Matches the (possibly empty) run of non-"/" characters at the end of the value.
        basename_expression = r"[^/]*$"
        return apply_regex(basename_expression, rule["target_column"], data), sources
155
156
class AddColumnRegexRuleDefinition(BaseRuleDefinition):
    """Append column(s) produced by applying a regular expression to a column."""
    rule_type = "add_column_regex"

    def validate_rule(self, rule):
        """Require an integer ``target_column`` and a compilable string ``expression``."""
        _ensure_rule_contains_keys(rule, {"target_column": int, "expression": six.string_types})
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; delegates to ``apply_regex``.

        The optional ``replacement`` and ``group_count`` rule entries are
        forwarded as-is (``None`` when absent).
        """
        new_rows = apply_regex(
            rule["expression"],
            rule["target_column"],
            data,
            rule.get("replacement"),
            rule.get("group_count"),
        )
        return new_rows, sources
171
172
class AddColumnRownumRuleDefinition(BaseRuleDefinition):
    """Append a running row number, formatted as a decimal string."""
    rule_type = "add_column_rownum"

    def validate_rule(self, rule):
        """Require an integer ``start`` entry."""
        _ensure_rule_contains_keys(rule, {"start": int})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; row ``i`` (0-based) gets ``"%d" % (i + start)``."""
        first_number = rule["start"]
        numbered = [
            row + ["%d" % (position + first_number)]
            for position, row in enumerate(data)
        ]
        return numbered, sources
187
188
class AddColumnValueRuleDefinition(BaseRuleDefinition):
    """Append the same fixed value to every row."""
    rule_type = "add_column_value"

    def validate_rule(self, rule):
        """Require a string ``value`` entry."""
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with ``str(rule["value"])`` appended to each row."""
        constant = rule["value"]
        return [row + [str(constant)] for row in data], sources
203
204
class AddColumnSubstrRuleDefinition(BaseRuleDefinition):
    """Append a slice of a column's value, keeping or dropping a prefix or suffix."""
    rule_type = "add_column_substr"

    def validate_rule(self, rule):
        """Require ``target_column``/``length`` ints and a known ``substr_type``."""
        required = {
            "target_column": int,
            "length": int,
            "substr_type": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "substr_type", ["keep_prefix", "drop_prefix", "keep_suffix", "drop_suffix"])

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the selected substring appended to each row."""
        target = rule["target_column"]
        length = rule["length"]
        substr_type = rule["substr_type"]

        def slice_bounds(total):
            # Translate the substr_type into (start, end) slice indices for a
            # value of ``total`` characters; computed offsets are clamped at 0.
            if substr_type == "keep_prefix":
                return 0, length
            if substr_type == "drop_prefix":
                return length, total
            if substr_type == "keep_suffix":
                return max(total - length, 0), total
            # Remaining case is "drop_suffix".
            return 0, max(total - length, 0)

        def new_row(row):
            text = row[target]
            begin, stop = slice_bounds(len(text))
            return row + [text[begin:stop]]

        return [new_row(row) for row in data], sources
242
243
class RemoveColumnsRuleDefinition(BaseRuleDefinition):
    """Drop the listed column indices from every row."""
    rule_type = "remove_columns"

    def validate_rule(self, rule):
        """Require a list-valued ``target_columns`` entry."""
        _ensure_rule_contains_keys(rule, {"target_columns": list})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` where each row keeps only unlisted columns."""
        doomed = rule["target_columns"]
        trimmed = [
            [cell for position, cell in enumerate(row) if position not in doomed]
            for row in data
        ]
        return trimmed, sources
263
264
265 def _filter_index(func, iterable):
266 result = []
267 for index, x in enumerate(iterable):
268 if func(index):
269 result.append(x)
270
271 return result
272
273
class AddFilterRegexRuleDefinition(BaseRuleDefinition):
    """Filter rows by whether a regular expression is found in a column.

    With ``invert`` false, rows where the expression is found are kept; with
    ``invert`` true, rows where it is not found are kept.
    """
    rule_type = "add_filter_regex"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and a compilable ``expression``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "expression": six.string_types,
        })
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` filtered in lockstep by row index."""
        target_column = rule["target_column"]
        invert = rule["invert"]
        # Compile once up front: the predicate below runs for every row, and is
        # evaluated a second time when filtering ``sources``.
        pattern = re.compile(rule["expression"])

        def _filter(index):
            val = data[index][target_column]
            return not invert if pattern.search(val) else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
297
298
class AddFilterCountRuleDefinition(BaseRuleDefinition):
    """Filter rows by position relative to the first or last ``count`` rows.

    With ``invert`` false, the first (or last) ``count`` rows are dropped;
    with ``invert`` true, only those rows are kept.
    """
    rule_type = "add_filter_count"

    def validate_rule(self, rule):
        """Require ``count`` (int), ``invert`` (bool) and ``which`` in first/last."""
        required = {
            "count": int,
            "invert": bool,
            "which": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "which", ["first", "last"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` filtered in lockstep by row index."""
        total = len(data)
        invert = rule["invert"]
        count = rule["count"]
        from_start = rule["which"] == "first"

        def _keep(position):
            if from_start:
                outside = position >= count
            else:
                outside = position < (total - count)
            if outside:
                return not invert
            return invert

        return _filter_index(_keep, data), _filter_index(_keep, sources)
324
325
class AddFilterEmptyRuleDefinition(BaseRuleDefinition):
    """Filter rows by whether a column's value has zero length.

    With ``invert`` false, rows with a non-empty value are kept; with
    ``invert`` true, rows with an empty value are kept.
    """
    rule_type = "add_filter_empty"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``invert`` (bool)."""
        required = {
            "target_column": int,
            "invert": bool,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` filtered in lockstep by row index."""
        invert = rule["invert"]
        column = rule["target_column"]

        def _keep(position):
            has_content = len(data[position][column]) != 0
            if has_content:
                return not invert
            return invert

        return _filter_index(_keep, data), _filter_index(_keep, sources)
344
345
class AddFilterMatchesRuleDefinition(BaseRuleDefinition):
    """Filter rows by exact equality of a column with a fixed value.

    With ``invert`` false, rows equal to ``value`` are kept; with ``invert``
    true, rows that differ are kept.
    """
    rule_type = "add_filter_matches"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and string ``value``."""
        required = {
            "target_column": int,
            "invert": bool,
            "value": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` filtered in lockstep by row index."""
        invert = rule["invert"]
        column = rule["target_column"]
        wanted = rule["value"]

        def _keep(position):
            if data[position][column] == wanted:
                return not invert
            return invert

        return _filter_index(_keep, data), _filter_index(_keep, sources)
367
368
class AddFilterCompareRuleDefinition(BaseRuleDefinition):
    """Keep rows whose column, read as a float, satisfies a comparison with ``value``."""
    rule_type = "add_filter_compare"

    def validate_rule(self, rule):
        """Require ``target_column``/``value`` ints and a known ``compare_type``."""
        required = {
            "target_column": int,
            "value": int,
            "compare_type": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "compare_type", ["less_than", "less_than_equal", "greater_than", "greater_than_equal"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` filtered in lockstep by row index.

        Each target cell is converted with ``float`` before comparing, so a
        non-numeric cell raises whatever ``float`` raises.
        """
        column = rule["target_column"]
        threshold = rule["value"]
        compare_type = rule["compare_type"]

        def _keep(position):
            number = float(data[position][column])
            # compare_type is restricted to these four values by validate_rule.
            if compare_type == "less_than":
                matches = number < threshold
            elif compare_type == "less_than_equal":
                matches = number <= threshold
            elif compare_type == "greater_than":
                matches = number > threshold
            elif compare_type == "greater_than_equal":
                matches = number >= threshold

            return matches

        return _filter_index(_keep, data), _filter_index(_keep, sources)
400
401
class SortRuleDefinition(BaseRuleDefinition):
    """Sort rows (and their paired sources) by one column."""
    rule_type = "sort"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``numeric`` (bool)."""
        required = {
            "target_column": int,
            "numeric": bool,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_data, new_sources)`` ordered by the target column.

        When ``numeric`` is true the column is compared as ``float``; otherwise
        the raw cell values are compared.
        """
        target = rule["target_column"]
        numeric = rule["numeric"]

        def sort_key(pair):
            # pair is (row, source); only the row's target cell drives ordering.
            cell = pair[0][target]
            return float(cell) if numeric else cell

        ordered = sorted(zip(data, sources), key=sort_key)
        new_data = [row for row, _ in ordered]
        new_sources = [source for _, source in ordered]
        return new_data, new_sources
433
434
class SwapColumnsRuleDefinition(BaseRuleDefinition):
    """Exchange the values of two columns in every row."""
    rule_type = "swap_columns"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` entries."""
        required = {
            "target_column_0": int,
            "target_column_1": int,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; input rows are copied, not mutated."""
        left = rule["target_column_0"]
        right = rule["target_column_1"]

        def swapped(row):
            duplicate = row[:]
            duplicate[left], duplicate[right] = row[right], row[left]
            return duplicate

        return [swapped(row) for row in data], sources
455
456
class SplitColumnsRuleDefinition(BaseRuleDefinition):
    """Turn each row into two rows, distributing the listed columns between them.

    Columns in ``target_columns_0`` go only to the first new row, columns in
    ``target_columns_1`` (and not in ``target_columns_0``) only to the second,
    and all remaining columns are copied to both.
    """
    rule_type = "split_columns"

    def validate_rule(self, rule):
        """Require list-valued ``target_columns_0`` and ``target_columns_1`` entries."""
        required = {
            "target_columns_0": list,
            "target_columns_1": list,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` with every row and every source doubled."""
        only_first = rule["target_columns_0"]
        only_second = rule["target_columns_1"]

        def split_row(row):
            # First row: everything except columns reserved for the second row.
            first = [
                el for index, el in enumerate(row)
                if index in only_first or index not in only_second
            ]
            # Second row: everything except columns reserved for the first row.
            second = [el for index, el in enumerate(row) if index not in only_first]
            return [first, second]

        def duplicate(source):
            # Each source is repeated so it stays aligned with both halves of its row.
            return [source, source]

        return flat_map(split_row, data), flat_map(duplicate, sources)
488
489
def flat_map(f, items):
    """Apply ``f`` to each item and concatenate the resulting iterables into one list."""
    return [element for item in items for element in f(item)]
492
493
class RuleSet(object):
    """An ordered list of rules plus the column mapping that accompanies them."""

    def __init__(self, rule_set_as_dict):
        """Read ``"rules"`` (required) and ``"mapping"`` (optional) from the dict.

        The rules are passed through ``strip_control_characters_nested``; the
        mapping is stored as given and defaults to an empty list.
        """
        self.raw_rules = strip_control_characters_nested(rule_set_as_dict["rules"])
        self.raw_mapping = rule_set_as_dict.get("mapping", [])

    @property
    def rules(self):
        """The stored rule dictionaries."""
        return self.raw_rules

    def _rules_with_definitions(self):
        """Yield ``(rule, definition)`` pairs, looking each rule's type up in ``RULES_DEFINITIONS``."""
        for raw_rule in self.raw_rules:
            definition = RULES_DEFINITIONS[raw_rule["type"]]
            yield (raw_rule, definition)

    def apply(self, data, sources):
        """Validate and apply every rule in order, threading ``data`` and ``sources`` through."""
        for raw_rule, definition in self._rules_with_definitions():
            definition.validate_rule(raw_rule)
            data, sources = definition.apply(raw_rule, data, sources)

        return data, sources

    @property
    def has_errors(self):
        """True if looking up or validating any rule raises an exception."""
        try:
            for raw_rule, definition in self._rules_with_definitions():
                definition.validate_rule(raw_rule)
        except Exception:
            return True
        return False

    @property
    def mapping_as_dict(self):
        """The mapping entries keyed by their ``"type"``; later duplicates win."""
        return {entry["type"]: entry for entry in self.raw_mapping}

    # Everything above is generic; the properties below are specific to Galaxy
    # collections - think about a subclass of RuleSet for collection creation.
    @property
    def identifier_columns(self):
        """Columns of ``list_identifiers`` followed by the first ``paired_identifier`` column."""
        by_type = self.mapping_as_dict
        columns = []
        if "list_identifiers" in by_type:
            columns.extend(by_type["list_identifiers"]["columns"])
        if "paired_identifier" in by_type:
            columns.append(by_type["paired_identifier"]["columns"][0])

        return columns

    @property
    def collection_type(self):
        """Collection type string: one ``list`` per list identifier column, then ``paired`` if mapped."""
        by_type = self.mapping_as_dict
        list_columns = by_type.get("list_identifiers", {"columns": []})["columns"]
        collection_type = ":".join("list" for _ in list_columns)
        if "paired_identifier" in by_type:
            collection_type = collection_type + ":paired" if collection_type else "paired"
        return collection_type

    @property
    def display(self):
        """Multi-line text listing the rules and then the column definitions."""
        parts = ["Rules:\n"]
        parts.extend("- %s\n" % r for r in self.raw_rules)
        parts.append("Column Definitions:\n")
        parts.extend("- %s\n" % m for m in self.raw_mapping)
        return "".join(parts)
565
566
# Every rule definition class known to this module.
RULES_DEFINITION_CLASSES = [
    AddColumnMetadataRuleDefinition,
    AddColumnGroupTagValueRuleDefinition,
    AddColumnConcatenateRuleDefinition,
    AddColumnBasenameRuleDefinition,
    AddColumnRegexRuleDefinition,
    AddColumnRownumRuleDefinition,
    AddColumnValueRuleDefinition,
    AddColumnSubstrRuleDefinition,
    RemoveColumnsRuleDefinition,
    AddFilterRegexRuleDefinition,
    AddFilterCountRuleDefinition,
    AddFilterEmptyRuleDefinition,
    AddFilterMatchesRuleDefinition,
    AddFilterCompareRuleDefinition,
    SortRuleDefinition,
    SwapColumnsRuleDefinition,
    SplitColumnsRuleDefinition,
]

# Maps each rule_type string to a single shared instance of its definition class.
RULES_DEFINITIONS = {}
for rule_class in RULES_DEFINITION_CLASSES:
    RULES_DEFINITIONS.update({rule_class.rule_type: rule_class()})