comparison query_tabular.py @ 11:fd16243931d6 draft

Uploaded
author jjohnson
date Fri, 12 May 2017 10:18:42 -0400
parents e84d1c3bf4fe
children 5b4f6cf857cf
comparison
equal deleted inserted replaced
10:e84d1c3bf4fe 11:fd16243931d6
class LineFilter(object):
    """Iterator that applies one configured filter to each line of ``source``.

    ``filter_dict`` selects the behaviour through its ``'filter'`` key:

    * ``'regex'`` -- keep or drop lines by ``'pattern'``; ``'action'`` is one
      of ``exclude_match``, ``include_match``, ``exclude_find`` or
      ``include_find``.
    * ``'replace'`` -- apply ``re.sub(pattern, replace, ...)`` to the 1-based
      tab-separated ``'column'``.
    * ``'prepend_line_num'`` / ``'append_line_num'`` -- add the 1-based source
      line number as a new first / last tab-separated field.
    * ``'skip'`` -- drop the first ``'count'`` source lines.
    * ``'normalize'`` -- split the 1-based ``'columns'`` on ``'separator'`` and
      emit one output line per split value (see :meth:`normalize`).

    With an empty/``None`` ``filter_dict``, or an unrecognised filter or
    action, the default behaviour applies: the line terminator is stripped.

    Any filter result that is falsy (``None`` or an empty string) is dropped,
    so blank lines never reach the consumer.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        # Every filter function takes (line_number, line) and returns a
        # string, a list of strings, or a falsy value to drop the line.
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        # Output lines produced but not yet handed out by next().
        self.src_lines = []
        # Number of lines consumed from source so far (1-based line number).
        self.src_line_cnt = 0
        if not filter_dict:
            return
        if filter_dict['filter'] == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            if filter_dict['action'] == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif filter_dict['action'] == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif filter_dict['action'] == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif filter_dict['action'] == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif filter_dict['filter'] == 'replace':
            # Compile once here instead of handing the raw pattern to re.sub
            # for every field of every line.
            sub_rgx = re.compile(filter_dict['pattern'])
            r = filter_dict['replace']
            c = int(filter_dict['column']) - 1
            # 'n' is the field index; it is deliberately not called 'i' so it
            # cannot be confused with the line-number parameter.
            self.func = lambda i, l: '\t'.join(
                [x if n != c else sub_rgx.sub(r, x)
                 for n, x in enumerate(l.split('\t'))])
        elif filter_dict['filter'] == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif filter_dict['filter'] == 'append_line_num':
            # The terminator must be removed before appending a new field.
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif filter_dict['filter'] == 'skip':
            # Coerce to int like 'column'/'columns': a string count would
            # compare incorrectly against the integer line number.
            cnt = int(filter_dict['count'])
            self.func = lambda i, l: l if i > cnt else None
        elif filter_dict['filter'] == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def normalize(self, line, split_cols, sep):
        """Expand one tab-separated line into one line per split value.

        Each field whose 0-based index is in ``split_cols`` is split on
        ``sep``.  The result has as many lines as the longest split; shorter
        splits are padded with empty strings and all other fields are
        repeated unchanged.  The line terminator is stripped.

        Returns a list of strings with at least one element.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            # None of the requested columns exists on this line.
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = []
                for c, x in enumerate(fields):
                    if c not in split_cols:
                        flds.append(x)
                    elif n < len(split_fields[c]):
                        flds.append(split_fields[c][n])
                    else:
                        flds.append('')
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read from source until one line survives the filter.

        The surviving output (one string, or several when the filter returns
        a list) is added to ``self.src_lines``.  Returns without adding
        anything once the source is exhausted.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return

    def __next__(self):
        """Return the next filtered line; raise StopIteration at the end."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    # Python 2 iterator protocol name, kept for existing callers.
    next = __next__
93 133
94 134
95 class TabularReader: 135 class TabularReader:
96 """ 136 """