Mercurial > repos > jjohnson > query_tabular
comparison filters.py @ 20:ab27c4bd14b9 draft
Uploaded
author | jjohnson |
---|---|
date | Fri, 14 Jul 2017 11:39:27 -0400 |
parents | |
children | bed5018e7ae3 |
comparison
equal
deleted
inserted
replaced
19:9d9ab2c69014 | 20:ab27c4bd14b9 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from __future__ import print_function | |
4 | |
5 import re | |
6 import sys | |
7 | |
8 | |
class LineFilter(object):
    """Iterator applying a single filter to the lines of another iterable.

    ``source`` is any iterable of text lines (a file object or another
    ``LineFilter``), so filters can be chained.  ``filter_dict`` selects the
    transformation through its ``'filter'`` key:

    * ``regex`` -- keep or drop lines depending on ``pattern`` and ``action``
      (``exclude_match``, ``include_match``, ``exclude_find``,
      ``include_find``).
    * ``select_columns`` -- keep only the 1-based ``columns``, in that order.
    * ``replace`` -- apply ``re.sub(pattern, replace, ...)`` to the 1-based
      ``column``.
    * ``prepend_line_num`` / ``append_line_num`` -- add the line count as a
      new first / last column.
    * ``prepend_text`` / ``append_text`` -- add ``column_text`` as a new
      first / last column.
    * ``skip`` -- drop the first ``count`` lines.
    * ``normalize`` -- split the 1-based ``columns`` on ``separator`` and
      emit one output line per split value.

    With an empty/``None`` ``filter_dict`` (or an unrecognised filter name)
    the line terminator is stripped and the line is passed through.  Any
    line for which the filter yields a falsy value (``None`` or an empty
    string) is dropped.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        # Each filter is a callable (line_number, line) -> str, list or None.
        # Default: strip the line terminator.
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        # Buffer of lines produced but not yet handed out; a single input
        # line may expand into several output lines (see normalize()).
        self.src_lines = []
        # Number of lines read from source so far (1-based for filters).
        self.src_line_cnt = 0
        if not filter_dict:
            return
        name = filter_dict['filter']
        if name == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            action = filter_dict['action']
            # *_match anchors at the start of the line, *_find searches
            # anywhere in the line.
            if action == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif action == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif action == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif action == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif name == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, l: self.select_columns(l, cols)
        elif name == 'replace':
            # Compile once instead of looking the pattern up per field.
            rgx = re.compile(filter_dict['pattern'])
            repl = filter_dict['replace']
            col = int(filter_dict['column']) - 1
            self.func = lambda i, l: '\t'.join(
                [rgx.sub(repl, x) if j == col else x
                 for j, x in enumerate(l.split('\t'))])
        elif name == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif name == 'append_line_num':
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif name == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (s, l)
        elif name == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
        elif name == 'skip':
            # Coerce to int: comparing the line number with a string count
            # raises TypeError on Python 3 and never matches on Python 2.
            cnt = int(filter_dict['count'])
            self.func = lambda i, l: l if i > cnt else None
        elif name == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 entry point.  Must call self.next() directly: the builtin
        # next(self) would call __next__ again and recurse without end.
        return self.next()

    def next(self):
        """Return the next filtered line; raise StopIteration when done."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    def select_columns(self, line, cols):
        """Return ``line`` reduced to the 0-based columns in ``cols``.

        Raises IndexError when a requested column is not present.
        """
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """Expand ``line`` into one line per value of its split columns.

        Each 0-based column in ``split_cols`` is split on ``sep``.  The
        result has as many lines as the longest split; shorter splits are
        padded with empty strings and all other columns are repeated.
        Returns a list of tab-joined lines.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            # Columns beyond the end of this line are ignored.
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = [x if c not in split_cols else split_fields[c][n]
                        if n < len(split_fields[c])
                        else '' for (c, x) in enumerate(fields)]
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read from source until one input line yields buffered output.

        Lines rejected by the filter (falsy result) are consumed and
        skipped.  Returns with an empty buffer only when source is
        exhausted.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return
101 | |
102 | |
class TabularReader(object):
    """Tabular file iterator.  Each iteration returns a list of fields.

    Parameters:
        input_file: a path to open, or an already open file-like object
            (anything providing ``readline``).
        skip: number of leading lines to discard before any filtering.
        comment_char: if set, lines matching it at the start of the line
            are dropped.  It is used as a regular expression pattern.
        col_idx: optional list of 0-based field indexes to return, in that
            order; all fields are returned when it is empty or None.
        filters: optional list of filter dicts, applied in order, each
            wrapped in a ``LineFilter``.

    A file opened by the reader itself is closed when iteration is
    exhausted or when ``close()`` is called; a caller-supplied file object
    is left open.
    """

    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # The Python 2 ``file`` builtin does not exist on Python 3, so
        # detect file-like objects by duck typing instead of isinstance.
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = open(input_file) if self._owns_file else input_file
        if skip and skip > 0:
            for _ in range(skip):
                if not self.tsv_file.readline():
                    break
        # The first stage only strips line terminators.
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 entry point.  Must call self.next() directly: the builtin
        # next(self) would call __next__ again and recurse without end.
        return self.next()

    def close(self):
        """Close the underlying file if this reader opened it."""
        if self._owns_file:
            self.tsv_file.close()

    def next(self):
        """Return the next row as a list of fields.

        Raises StopIteration when the source is exhausted, and IndexError
        when ``col_idx`` names a field the row does not have.
        """
        try:
            line = next(self.source)
        except StopIteration:
            # Release the file handle we opened before signalling the end.
            self.close()
            raise
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[c] for c in self.col_idx]
        return fields
143 | |
144 | |
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """Filter a tabular file and write the resulting rows to ``output``.

    Parameters:
        input_file: passed to ``TabularReader`` as the input.
        output: object with a ``write`` method receiving each row as
            tab-joined fields followed by a newline.
        skip: number of leading input lines to discard.
        comment_char: pattern for lines to drop (default ``'#'``).
        filters: optional list of filter dicts applied in order.

    A row that cannot be written is reported on stderr and processing
    continues with the next row.  Any other failure is reported on stderr
    and the process exits with status 1.
    """
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        print('Failed: %s' % (e), file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is absent when site is not loaded.
        sys.exit(1)