Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/galaxy/util/rules_dsl.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 import abc | |
2 import itertools | |
3 import re | |
4 | |
5 import six | |
6 from six.moves import map | |
7 | |
8 from galaxy.util import strip_control_characters_nested | |
9 | |
10 | |
def _ensure_rule_contains_keys(rule, keys):
    """Check that ``rule`` holds every required key with a value of the expected type.

    :param rule: dictified rule; ``rule["type"]`` is interpolated into error messages.
    :param keys: mapping of required key name to the class (or tuple of
        classes) that the corresponding value must be an instance of.
    :raises ValueError: if a key is absent or its value has the wrong type.
    """
    for required_key in keys:
        expected_class = keys[required_key]
        if required_key not in rule:
            raise ValueError("Rule of type [%s] does not contain key [%s]." % (rule["type"], required_key))
        if isinstance(rule[required_key], expected_class):
            continue
        raise ValueError("Rule of type [%s] does not contain correct value type for key [%s]." % (rule["type"], required_key))
18 | |
19 | |
def _ensure_key_value_in(rule, key, values):
    """Check that ``rule[key]`` is one of the allowed ``values``.

    :raises ValueError: if the value stored under ``key`` is not in ``values``.
    """
    candidate = rule[key]
    if candidate in values:
        return
    raise ValueError("Invalid value [%s] for [%s] encountered." % (candidate, key))
24 | |
25 | |
def _ensure_valid_pattern(expression):
    """Fail with ``re.error`` when ``expression`` cannot be compiled as a regex."""
    # Only the act of compiling matters here; the compiled pattern is discarded.
    re.compile(expression)
28 | |
29 | |
def apply_regex(regex, target, data, replacement=None, group_count=None):
    """Append regex-derived column(s) to every row of ``data``.

    For each row, ``regex`` is searched in ``row[target]`` and the row is
    extended (as a new list; the input row is not modified) with:

    * the expansion of ``replacement`` against the match, when ``replacement``
      is not ``None``;
    * otherwise every capture group, when ``group_count`` is truthy;
    * otherwise the whole matched text.

    :param regex: regular expression source string.
    :param target: index of the column the expression is applied to.
    :param data: list of rows (each a list).
    :param replacement: optional match template (``Match.expand`` syntax).
    :param group_count: optional number of capture groups the match must have.
    :returns: new list of extended rows.
    :raises Exception: if the expression does not match a row's target value,
        or the number of groups differs from ``group_count``.
    """
    pattern = re.compile(regex)

    def new_row(row):
        source = row[target]
        match = pattern.search(source)
        # A failed search is reported the same way on every path; previously the
        # replacement path crashed with AttributeError on ``None.expand``.
        if not match:
            raise Exception("Problem applying regular expression [%s] to [%s]." % (regex, source))

        if replacement is not None:
            return row + [match.expand(replacement)]

        if group_count:
            groups = match.groups()
            if len(groups) != group_count:
                raise Exception("Problem applying regular expression, wrong number of groups found.")
            return row + list(groups)

        return row + [match.group(0)]

    return [new_row(row) for row in data]
54 | |
55 | |
@six.add_metaclass(abc.ABCMeta)
class BaseRuleDefinition(object):
    """Abstract interface that each rule plugin class in this module implements."""

    @abc.abstractproperty
    def rule_type(self):
        """Return the short string naming the kind of rule this plugin handles."""

    @abc.abstractmethod
    def validate_rule(self, rule):
        """Check a dictified rule definition of this type for validity."""

    @abc.abstractmethod
    def apply(self, rule, data, sources):
        """Run a validated, dictified rule of this type against the supplied data."""
70 | |
71 | |
class AddColumnMetadataRuleDefinition(BaseRuleDefinition):
    """Rule appending a column taken from each row's source metadata.

    ``rule["value"]`` selects the metadata: ``"identifier<N>"`` appends the
    N-th entry of the source's ``"identifiers"``; ``"tags"`` appends the
    source's ``"tags"`` sorted and joined with commas.
    """
    rule_type = "add_column_metadata"

    def validate_rule(self, rule):
        """Require a string ``value`` key."""
        _ensure_rule_contains_keys(rule, {"value": six.string_types})

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with one metadata column appended to each row.

        :raises ValueError: if ``rule["value"]`` names an unsupported kind of
            metadata (or the identifier index is not an integer).
        """
        rule_value = rule["value"]
        if rule_value.startswith("identifier"):
            identifier_index = int(rule_value[len("identifier"):])
            new_rows = [
                row + [sources[index]["identifiers"][identifier_index]]
                for index, row in enumerate(data)
            ]
        elif rule_value == "tags":
            new_rows = [
                row + [",".join(sorted(sources[index]["tags"]))]
                for index, row in enumerate(data)
            ]
        else:
            # Previously this fell through and failed with UnboundLocalError
            # on ``new_rows``; report the offending value instead.
            raise ValueError("Invalid value [%s] for [%s] encountered." % (rule_value, "value"))

        return new_rows, sources
98 | |
99 | |
class AddColumnGroupTagValueRuleDefinition(BaseRuleDefinition):
    """Rule appending the value part of each source's ``group:<value>:`` tag."""
    rule_type = "add_column_group_tag_value"

    def validate_rule(self, rule):
        """Require a string ``value`` key."""
        required = {"value": six.string_types}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the group tag value appended to each row.

        The first tag (in sorted order) starting with ``group:<value>:``
        supplies the column; when no tag matches, ``rule["default_value"]``
        (or the empty string) is used.
        """
        prefix = "group:%s:" % rule["value"]
        prefix_length = len(prefix)

        def value_for(position):
            ordered_tags = sorted(sources[position]["tags"])
            found = next((tag[prefix_length:] for tag in ordered_tags if tag.startswith(prefix)), None)
            if found is None:
                return rule.get("default_value", "")
            return found

        extended = [row + [value_for(position)] for position, row in enumerate(data)]
        return extended, sources
126 | |
127 | |
class AddColumnConcatenateRuleDefinition(BaseRuleDefinition):
    """Rule appending the concatenation of two existing columns."""
    rule_type = "add_column_concatenate"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` keys."""
        required = {"target_column_0": int, "target_column_1": int}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; each new row ends with column 0 + column 1."""
        first = rule["target_column_0"]
        second = rule["target_column_1"]
        combined = [row + [row[first] + row[second]] for row in data]
        return combined, sources
143 | |
144 | |
class AddColumnBasenameRuleDefinition(BaseRuleDefinition):
    """Rule appending the part of a column after its last ``/``."""
    rule_type = "add_column_basename"

    def validate_rule(self, rule):
        """Require an integer ``target_column`` key."""
        required = {"target_column": int}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the trailing slash-free segment appended."""
        target_column = rule["target_column"]
        # Matches the longest run of non-slash characters anchored at the end.
        basename_pattern = r"[^/]*$"
        with_basename = apply_regex(basename_pattern, target_column, data)
        return with_basename, sources
155 | |
156 | |
class AddColumnRegexRuleDefinition(BaseRuleDefinition):
    """Rule appending column(s) produced by applying a regular expression to a column."""
    rule_type = "add_column_regex"

    def validate_rule(self, rule):
        """Require an integer ``target_column`` and a compilable string ``expression``."""
        required = {"target_column": int, "expression": six.string_types}
        _ensure_rule_contains_keys(rule, required)
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` as computed by ``apply_regex``.

        The optional ``replacement`` and ``group_count`` rule keys are passed
        through (``None`` when absent).
        """
        column_index = rule["target_column"]
        pattern_source = rule["expression"]
        new_data = apply_regex(
            pattern_source,
            column_index,
            data,
            rule.get("replacement"),
            rule.get("group_count"),
        )
        return new_data, sources
171 | |
172 | |
class AddColumnRownumRuleDefinition(BaseRuleDefinition):
    """Rule appending a running row number (as a string) to each row."""
    rule_type = "add_column_rownum"

    def validate_rule(self, rule):
        """Require an integer ``start`` key."""
        required = {"start": int}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; row ``i`` gets the string of ``i + start``."""
        offset = rule["start"]
        numbered = [row + ["%d" % (position + offset)] for position, row in enumerate(data)]
        return numbered, sources
187 | |
188 | |
class AddColumnValueRuleDefinition(BaseRuleDefinition):
    """Rule appending the same fixed value to every row."""
    rule_type = "add_column_value"

    def validate_rule(self, rule):
        """Require a string ``value`` key."""
        required = {"value": six.string_types}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with ``str(rule["value"])`` appended to each row."""
        constant = rule["value"]
        with_constant = [row + [str(constant)] for row in data]
        return with_constant, sources
203 | |
204 | |
class AddColumnSubstrRuleDefinition(BaseRuleDefinition):
    """Rule appending a prefix- or suffix-based substring of a column."""
    rule_type = "add_column_substr"

    def validate_rule(self, rule):
        """Require ``target_column``/``length`` integers and a known ``substr_type``."""
        required = {
            "target_column": int,
            "length": int,
            "substr_type": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)
        allowed_types = ["keep_prefix", "drop_prefix", "keep_suffix", "drop_suffix"]
        _ensure_key_value_in(rule, "substr_type", allowed_types)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` with the selected slice of the target column appended."""
        target = rule["target_column"]
        length = rule["length"]
        substr_type = rule["substr_type"]

        def bounds(size):
            # Slice limits for a value of ``size`` characters.
            if substr_type == "keep_prefix":
                return 0, length
            if substr_type == "drop_prefix":
                return length, size
            if substr_type == "keep_suffix":
                # Clamp so an over-long ``length`` keeps the whole value.
                return max(size - length, 0), size
            # Any other type is handled as "drop_suffix"; clamp to empty.
            return 0, max(size - length, 0)

        rows = []
        for row in data:
            text = row[target]
            begin, stop = bounds(len(text))
            rows.append(row + [text[begin:stop]])
        return rows, sources
242 | |
243 | |
class RemoveColumnsRuleDefinition(BaseRuleDefinition):
    """Rule dropping the listed column indices from every row."""
    rule_type = "remove_columns"

    def validate_rule(self, rule):
        """Require a list-valued ``target_columns`` key."""
        required = {"target_columns": list}
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)`` keeping only cells whose index is not targeted."""
        dropped = rule["target_columns"]
        kept_rows = [
            [cell for position, cell in enumerate(row) if position not in dropped]
            for row in data
        ]
        return kept_rows, sources
263 | |
264 | |
def _filter_index(func, iterable):
    """Return a list of the items of ``iterable`` whose position satisfies ``func``.

    ``func`` is called with each item's zero-based index (not the item itself).
    """
    return [item for position, item in enumerate(iterable) if func(position)]
272 | |
273 | |
class AddFilterRegexRuleDefinition(BaseRuleDefinition):
    """Rule keeping rows (and their sources) by a regex search on one column."""
    rule_type = "add_filter_regex"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and a compilable ``expression``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "invert": bool,
            "expression": six.string_types,
        })
        _ensure_valid_pattern(rule["expression"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` restricted to the rows passing the filter.

        A row passes when the expression is found in its target column;
        ``invert`` flips that decision.
        """
        target_column = rule["target_column"]
        invert = rule["invert"]
        # Compile once: the predicate below is evaluated for every row, twice
        # (once for data, once for sources).
        pattern = re.compile(rule["expression"])

        def _filter(index):
            found = pattern.search(data[index][target_column])
            return not invert if found else invert

        return _filter_index(_filter, data), _filter_index(_filter, sources)
297 | |
298 | |
class AddFilterCountRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows (and their sources) by position from the start or end."""
    rule_type = "add_filter_count"

    def validate_rule(self, rule):
        """Require ``count`` (int), ``invert`` (bool) and ``which`` in first/last."""
        required = {
            "count": int,
            "invert": bool,
            "which": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)
        _ensure_key_value_in(rule, "which", ["first", "last"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` restricted to the rows passing the filter.

        With ``which == "first"`` rows at index ``>= count`` pass; otherwise
        rows at index ``< len(data) - count`` pass. ``invert`` flips the result.
        """
        total = len(data)
        invert = rule["invert"]
        limit = rule["count"]
        from_start = rule["which"] == "first"

        def _filter(index):
            matches = index >= limit if from_start else index < (total - limit)
            if matches:
                return not invert
            return invert

        kept_data = _filter_index(_filter, data)
        kept_sources = _filter_index(_filter, sources)
        return kept_data, kept_sources
324 | |
325 | |
class AddFilterEmptyRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows (and their sources) on whether a column is empty."""
    rule_type = "add_filter_empty"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``invert`` (bool)."""
        required = {
            "target_column": int,
            "invert": bool,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` restricted to the rows passing the filter.

        A row passes when its target column has non-zero length; ``invert``
        flips that decision.
        """
        invert = rule["invert"]
        target_column = rule["target_column"]

        def _filter(index):
            cell = data[index][target_column]
            if len(cell) != 0:
                return not invert
            return invert

        kept_data = _filter_index(_filter, data)
        kept_sources = _filter_index(_filter, sources)
        return kept_data, kept_sources
344 | |
345 | |
class AddFilterMatchesRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows (and their sources) on exact equality of one column."""
    rule_type = "add_filter_matches"

    def validate_rule(self, rule):
        """Require ``target_column`` (int), ``invert`` (bool) and a string ``value``."""
        required = {
            "target_column": int,
            "invert": bool,
            "value": six.string_types,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` restricted to the rows passing the filter.

        A row passes when its target column equals ``rule["value"]``;
        ``invert`` flips that decision.
        """
        invert = rule["invert"]
        target_column = rule["target_column"]
        expected = rule["value"]

        def _filter(index):
            if data[index][target_column] == expected:
                return not invert
            return invert

        kept_data = _filter_index(_filter, data)
        kept_sources = _filter_index(_filter, sources)
        return kept_data, kept_sources
367 | |
368 | |
class AddFilterCompareRuleDefinition(BaseRuleDefinition):
    """Rule filtering rows (and their sources) by a numeric comparison on one column."""
    rule_type = "add_filter_compare"

    def validate_rule(self, rule):
        """Require ``target_column``/``value`` integers and a known ``compare_type``."""
        _ensure_rule_contains_keys(rule, {
            "target_column": int,
            "value": int,
            "compare_type": six.string_types,
        })
        _ensure_key_value_in(rule, "compare_type", ["less_than", "less_than_equal", "greater_than", "greater_than_equal"])

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` restricted to rows satisfying the comparison.

        Each row's target column is converted with ``float`` and compared
        against ``rule["value"]`` according to ``rule["compare_type"]``.

        :raises ValueError: if a row is evaluated with an unsupported
            ``compare_type`` (possible only when the rule was not validated).
        """
        target_column = rule["target_column"]
        value = rule["value"]
        compare_type = rule["compare_type"]

        def _filter(index):
            target_value = float(data[index][target_column])
            if compare_type == "less_than":
                return target_value < value
            if compare_type == "less_than_equal":
                return target_value <= value
            if compare_type == "greater_than":
                return target_value > value
            if compare_type == "greater_than_equal":
                return target_value >= value
            # Previously this fell through to an UnboundLocalError on ``matches``.
            raise ValueError("Invalid value [%s] for [%s] encountered." % (compare_type, "compare_type"))

        return _filter_index(_filter, data), _filter_index(_filter, sources)
400 | |
401 | |
class SortRuleDefinition(BaseRuleDefinition):
    """Rule sorting rows, together with their sources, by one column."""
    rule_type = "sort"

    def validate_rule(self, rule):
        """Require ``target_column`` (int) and ``numeric`` (bool)."""
        required = {
            "target_column": int,
            "numeric": bool,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_data, new_sources)`` ordered by the target column.

        When ``numeric`` is true the column is compared as ``float``;
        otherwise the raw cell values are compared.
        """
        target = rule["target_column"]
        numeric = rule["numeric"]

        def sort_key(pair):
            # ``pair`` is a (row, source) tuple; only the row drives ordering.
            cell = pair[0][target]
            return float(cell) if numeric else cell

        ordered = sorted(zip(data, sources), key=sort_key)
        new_data = [row for row, _ in ordered]
        new_sources = [source for _, source in ordered]
        return new_data, new_sources
433 | |
434 | |
class SwapColumnsRuleDefinition(BaseRuleDefinition):
    """Rule exchanging the contents of two columns in every row."""
    rule_type = "swap_columns"

    def validate_rule(self, rule):
        """Require integer ``target_column_0`` and ``target_column_1`` keys."""
        required = {
            "target_column_0": int,
            "target_column_1": int,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(new_rows, sources)``; rows are copied, the inputs are left untouched."""
        first = rule["target_column_0"]
        second = rule["target_column_1"]

        swapped_rows = []
        for row in data:
            swapped = row[:]
            swapped[first], swapped[second] = row[second], row[first]
            swapped_rows.append(swapped)
        return swapped_rows, sources
455 | |
456 | |
class SplitColumnsRuleDefinition(BaseRuleDefinition):
    """Rule turning every row into two rows that differ in the targeted columns."""
    rule_type = "split_columns"

    def validate_rule(self, rule):
        """Require list-valued ``target_columns_0`` and ``target_columns_1`` keys."""
        required = {
            "target_columns_0": list,
            "target_columns_1": list,
        }
        _ensure_rule_contains_keys(rule, required)

    def apply(self, rule, data, sources):
        """Return ``(data, sources)`` with each row and each source doubled.

        Columns in ``target_columns_0`` go only to the first output row,
        columns in ``target_columns_1`` only to the second, and all other
        columns to both.
        """
        only_first = rule["target_columns_0"]
        only_second = rule["target_columns_1"]

        def split_row(row):
            first_half, second_half = [], []
            for position, cell in enumerate(row):
                # A column listed in both groups is treated as first-only.
                if position in only_first:
                    first_half.append(cell)
                    continue
                if position in only_second:
                    second_half.append(cell)
                    continue
                first_half.append(cell)
                second_half.append(cell)
            return [first_half, second_half]

        doubled_data = flat_map(split_row, data)
        doubled_sources = flat_map(lambda source: [source, source], sources)
        return doubled_data, doubled_sources
488 | |
489 | |
def flat_map(f, items):
    """Apply ``f`` to each item and return one flat list of all the results' elements."""
    return [element for item in items for element in f(item)]
492 | |
493 | |
class RuleSet(object):
    """Ordered list of rules plus column mappings, built from a dictified rule set."""

    def __init__(self, rule_set_as_dict):
        """Read ``"rules"`` (required) and ``"mapping"`` (optional) from the dict."""
        raw_rules = rule_set_as_dict["rules"]
        self.raw_rules = strip_control_characters_nested(raw_rules)
        self.raw_mapping = rule_set_as_dict.get("mapping", [])

    @property
    def rules(self):
        """The rule dictionaries as stored on this rule set."""
        return self.raw_rules

    def _rules_with_definitions(self):
        """Yield ``(rule, definition)`` pairs, looking each rule up by its ``"type"``."""
        for raw_rule in self.raw_rules:
            definition = RULES_DEFINITIONS[raw_rule["type"]]
            yield (raw_rule, definition)

    def apply(self, data, sources):
        """Validate and apply every rule in order; return the final ``(data, sources)``."""
        for raw_rule, definition in self._rules_with_definitions():
            definition.validate_rule(raw_rule)
            data, sources = definition.apply(raw_rule, data, sources)
        return data, sources

    @property
    def has_errors(self):
        """``True`` if looking up or validating any rule raises an exception."""
        try:
            for raw_rule, definition in self._rules_with_definitions():
                definition.validate_rule(raw_rule)
        except Exception:
            return True
        return False

    @property
    def mapping_as_dict(self):
        """Mappings keyed by their ``"type"``; a later duplicate replaces an earlier one."""
        return {mapping["type"]: mapping for mapping in self.raw_mapping}

    # Everything above is generic; what follows is specific to Galaxy collections,
    # consider moving it to a RuleSet subclass for collection creation.
    @property
    def identifier_columns(self):
        """Columns of ``list_identifiers`` followed by the first ``paired_identifier`` column."""
        by_type = self.mapping_as_dict
        columns = []
        if "list_identifiers" in by_type:
            columns.extend(by_type["list_identifiers"]["columns"])
        if "paired_identifier" in by_type:
            columns.append(by_type["paired_identifier"]["columns"][0])
        return columns

    @property
    def collection_type(self):
        """Colon-joined type string: one ``list`` per list column, then ``paired`` if mapped."""
        by_type = self.mapping_as_dict
        list_columns = by_type.get("list_identifiers", {"columns": []})["columns"]
        type_string = ":".join("list" for _ in list_columns)
        if "paired_identifier" not in by_type:
            return type_string
        if type_string:
            return type_string + ":paired"
        return "paired"

    @property
    def display(self):
        """Multi-line text listing the rules and then the column definitions."""
        parts = ["Rules:\n"]
        parts.extend("- %s\n" % raw_rule for raw_rule in self.raw_rules)
        parts.append("Column Definitions:\n")
        parts.extend("- %s\n" % mapping for mapping in self.raw_mapping)
        return "".join(parts)
565 | |
566 | |
# Every rule plugin class this module supports.
RULES_DEFINITION_CLASSES = [
    AddColumnMetadataRuleDefinition,
    AddColumnGroupTagValueRuleDefinition,
    AddColumnConcatenateRuleDefinition,
    AddColumnBasenameRuleDefinition,
    AddColumnRegexRuleDefinition,
    AddColumnRownumRuleDefinition,
    AddColumnValueRuleDefinition,
    AddColumnSubstrRuleDefinition,
    RemoveColumnsRuleDefinition,
    AddFilterRegexRuleDefinition,
    AddFilterCountRuleDefinition,
    AddFilterEmptyRuleDefinition,
    AddFilterMatchesRuleDefinition,
    AddFilterCompareRuleDefinition,
    SortRuleDefinition,
    SwapColumnsRuleDefinition,
    SplitColumnsRuleDefinition,
]
# Registry holding one instance of each plugin, keyed by its ``rule_type`` string.
RULES_DEFINITIONS = {}
for rule_class in RULES_DEFINITION_CLASSES:
    RULES_DEFINITIONS.update({rule_class.rule_type: rule_class()})