Mercurial > repos > proteore > proteore_filter_keywords_values
comparison filter_kw_val.py @ 2:52a7afd01c6d draft
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author | proteore |
---|---|
date | Tue, 18 Dec 2018 09:25:11 -0500 |
parents | a55e8b137c6b |
children | 2080e2a4f209 |
comparison
equal
deleted
inserted
replaced
1:cb9555653b09 | 2:52a7afd01c6d |
---|---|
53 | 53 |
54 #Filter the document | 54 #Filter the document |
55 def filters(args): | 55 def filters(args): |
56 filename = args.input.split(",")[0] | 56 filename = args.input.split(",")[0] |
57 header = str_to_bool(args.input.split(",")[1]) | 57 header = str_to_bool(args.input.split(",")[1]) |
58 csv_file = read_file(filename) | 58 csv_file = blank_to_NA(read_file(filename)) |
59 results_dict = {} | 59 results_dict = {} |
60 | 60 |
61 if args.kw: | 61 if args.kw: |
62 keywords = args.kw | 62 keywords = args.kw |
63 for k in keywords: | 63 for k in keywords: |
64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) | 64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) |
65 | 65 |
66 if args.kw_file: | 66 if args.kw_file: |
67 key_files = args.kw_file | 67 key_files = args.kw_file |
68 for kf in key_files: | 68 for kf in key_files: |
69 keywords = read_option(kf[0]) | 69 header = str_to_bool(kf[1]) |
70 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2]) | 70 ncol = column_from_txt(kf[2]) |
71 keywords = read_keywords_file(kf[0],header,ncol) | |
72 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) | |
71 | 73 |
72 if args.value: | 74 if args.value: |
73 for v in args.value: | 75 for v in args.value: |
76 v[0] = v[0].replace(",",".") | |
74 if is_number("float", v[0]): | 77 if is_number("float", v[0]): |
78 csv_file = comma_number_to_float(csv_file,v[1],header) | |
75 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) | 79 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) |
76 else: | 80 else: |
77 raise ValueError("Please enter a number in filter by value") | 81 raise ValueError("Please enter a number in filter by value") |
78 | 82 |
79 if args.values_range: | 83 if args.values_range: |
80 for vr in args.values_range: | 84 for vr in args.values_range: |
85 vr[:2] = [value.replace(",",".") for value in vr[:2]] | |
86 csv_file = comma_number_to_float(csv_file,vr[2],header) | |
81 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): | 87 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): |
82 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) | 88 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) |
83 | 89 |
84 remaining_lines=[] | 90 remaining_lines=[] |
85 filtered_lines=[] | 91 filtered_lines=[] |
86 | 92 |
87 if header is True : | 93 if header is True : |
88 remaining_lines.append(csv_file[0]) | 94 remaining_lines.append(csv_file[0]) |
89 filtered_lines.append(csv_file[0]) | 95 filtered_lines.append(csv_file[0]) |
90 | 96 |
91 for id_line,line in enumerate(csv_file) : | 97 if results_dict == {} : #no filter used |
92 if id_line in results_dict : #skip header and empty lines | 98 remaining_lines.extend(csv_file[1:]) |
93 if args.operator == 'OR' : | 99 else : |
94 if any(results_dict[id_line]) : | 100 for id_line,line in enumerate(csv_file) : |
95 filtered_lines.append(line) | 101 if id_line in results_dict : #skip header and empty lines |
96 else : | 102 if args.operator == 'OR' : |
97 remaining_lines.append(line) | 103 if any(results_dict[id_line]) : |
98 | 104 filtered_lines.append(line) |
99 elif args.operator == "AND" : | 105 else : |
100 if all(results_dict[id_line]) : | 106 remaining_lines.append(line) |
101 filtered_lines.append(line) | 107 |
102 else : | 108 elif args.operator == "AND" : |
103 remaining_lines.append(line) | 109 if all(results_dict[id_line]) : |
104 | 110 filtered_lines.append(line) |
111 else : | |
112 remaining_lines.append(line) | |
113 | |
105 #sort of results by column | 114 #sort of results by column |
106 if args.sort_col : | 115 if args.sort_col : |
107 sort_col=args.sort_col.split(",")[0] | 116 sort_col=args.sort_col.split(",")[0] |
108 sort_col=column_from_txt(sort_col) | 117 sort_col=column_from_txt(sort_col) |
109 reverse=str_to_bool(args.sort_col.split(",")[1]) | 118 reverse=str_to_bool(args.sort_col.split(",")[1]) |
122 | 131 |
123 #function to sort the csv_file by value in a specific column | 132 #function to sort the csv_file by value in a specific column |
124 def sort_by_column(tab,sort_col,reverse,header): | 133 def sort_by_column(tab,sort_col,reverse,header): |
125 | 134 |
126 if len(tab) > 1 : #if there's more than just a header or 1 row | 135 if len(tab) > 1 : #if there's more than just a header or 1 row |
127 if header is True : | 136 if header : |
128 head=tab[0] | 137 head=tab[0] |
129 tab=tab[1:] | 138 tab=tab[1:] |
130 | 139 |
131 if is_number("int",tab[0][sort_col]) : | 140 #list of empty cells in the column to sort |
132 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) | 141 unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] |
133 elif is_number("float",tab[0][sort_col]) : | 142 unsorted_tab=[ tab[i] for i in unsortable_lines] |
143 tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] | |
144 | |
145 if only_number(tab,sort_col) and any_float(tab,sort_col) : | |
134 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) | 146 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) |
147 elif only_number(tab,sort_col): | |
148 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) | |
135 else : | 149 else : |
136 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) | 150 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) |
137 | 151 |
152 tab.extend(unsorted_tab) | |
138 if header is True : tab = [head]+tab | 153 if header is True : tab = [head]+tab |
139 | 154 |
140 return tab | 155 return tab |
141 | 156 |
157 | |
158 #replace all blank cells to NA | |
159 def blank_to_NA(csv_file) : | |
160 | |
161 tmp=[] | |
162 for line in csv_file : | |
163 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] | |
164 tmp.append(line) | |
165 | |
166 return tmp | |
167 | |
168 #turn into float a column | |
169 def comma_number_to_float(csv_file,ncol,header) : | |
170 ncol = int(ncol.replace("c","")) - 1 | |
171 if header : | |
172 tmp=[csv_file[0]] | |
173 csv_file=csv_file[1:] | |
174 else : | |
175 tmp=[] | |
176 | |
177 for line in csv_file : | |
178 line[ncol]=line[ncol].replace(",",".") | |
179 tmp.append(line) | |
180 | |
181 return (tmp) | |
182 | |
183 #return True is there is at least one float in the column | |
184 def any_float(tab,col) : | |
185 | |
186 for line in tab : | |
187 if is_number("float",line[col].replace(",",".")) : | |
188 return True | |
189 | |
190 return False | |
191 | |
192 def only_number(tab,col) : | |
193 | |
194 for line in tab : | |
195 if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : | |
196 return False | |
197 return True | |
198 | |
142 #Read the keywords file to extract the list of keywords | 199 #Read the keywords file to extract the list of keywords |
143 def read_option(filename): | 200 def read_keywords_file(filename,header,ncol): |
144 with open(filename, "r") as f: | 201 with open(filename, "r") as csv_file : |
145 filter_list=f.read().splitlines() | 202 lines= csv.reader(csv_file, delimiter='\t') |
146 filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0] | 203 lines = blank_to_NA(lines) |
147 filters=";".join(filter_list) | 204 if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines] |
148 | 205 else : |
149 return filters | 206 keywords= ["".join(key) for key in lines] |
207 if header : keywords = keywords[1:] | |
208 keywords = list(set(keywords)) | |
209 | |
210 return keywords | |
150 | 211 |
151 # Read input file | 212 # Read input file |
152 def read_file(filename): | 213 def read_file(filename): |
153 with open(filename,"r") as f : | 214 with open(filename,"r") as f : |
154 reader=csv.reader(f,delimiter="\t") | 215 reader=csv.reader(f,delimiter="\t") |
162 | 223 |
163 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) | 224 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) |
164 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): | 225 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): |
165 match=str_to_bool(match) | 226 match=str_to_bool(match) |
166 ncol=column_from_txt(ncol) | 227 ncol=column_from_txt(ncol) |
167 | 228 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword |
168 keywords = keywords.upper().split(";") # Split list of filter keyword | |
169 [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords | |
170 keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords | |
171 | 229 |
172 for id_line,line in enumerate(csv_file): | 230 for id_line,line in enumerate(csv_file): |
173 if header is True and id_line == 0 : continue | 231 if header is True and id_line == 0 : continue |
174 #line = line.replace("\n", "") | |
175 keyword_inline = line[ncol].replace('"', "").split(";") | 232 keyword_inline = line[ncol].replace('"', "").split(";") |
176 #line = line + "\n" | |
177 | 233 |
178 #Perfect match or not | 234 #Perfect match or not |
179 if match is True : | 235 if match is True : |
180 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) | 236 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) |
181 else: | 237 else: |
190 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 246 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
191 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): | 247 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): |
192 | 248 |
193 filter_value = float(filter_value) | 249 filter_value = float(filter_value) |
194 ncol=column_from_txt(ncol) | 250 ncol=column_from_txt(ncol) |
251 nb_string=0 | |
195 | 252 |
196 for id_line,line in enumerate(csv_file): | 253 for id_line,line in enumerate(csv_file): |
197 if header is True and id_line == 0 : continue | 254 if header is True and id_line == 0 : continue |
198 value = line[ncol].replace('"', "").strip() | 255 value = line[ncol].replace('"', "").replace(",",".").strip() |
199 if value.replace(".", "", 1).isdigit(): | 256 if value.replace(".", "", 1).isdigit(): |
200 to_filter=value_compare(value,filter_value,opt) | 257 to_filter=value_compare(value,filter_value,opt) |
201 | 258 |
202 #adding the result to the dictionary | 259 #adding the result to the dictionary |
203 if id_line in results_dict : results_dict[id_line].append(to_filter) | 260 if id_line in results_dict : results_dict[id_line].append(to_filter) |
204 else : results_dict[id_line]=[to_filter] | 261 else : results_dict[id_line]=[to_filter] |
262 | |
263 #impossible to treat (ex : "" instead of a number), we keep the line by default | |
264 else : | |
265 nb_string+=1 | |
266 if id_line in results_dict : results_dict[id_line].append(False) | |
267 else : results_dict[id_line]=[False] | |
268 | |
269 #number of lines in the csv file | |
270 if header : nb_lines = len(csv_file) -1 | |
271 else : nb_lines = len(csv_file) | |
272 | |
273 #if there's no numeric value in the column | |
274 if nb_string == nb_lines : | |
275 print ('No numeric values found in the column '+str(ncol+1)) | |
276 print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) | |
205 | 277 |
206 return results_dict | 278 return results_dict |
207 | 279 |
208 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 280 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) |
209 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): | 281 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): |
210 inclusive=str_to_bool(inclusive) | 282 inclusive=str_to_bool(inclusive) |
211 bottom_value = float(bottom_value) | 283 bottom_value = float(bottom_value) |
212 top_value=float(top_value) | 284 top_value=float(top_value) |
213 ncol=column_from_txt(ncol) | 285 ncol=column_from_txt(ncol) |
286 nb_string=0 | |
214 | 287 |
215 for id_line, line in enumerate(csv_file): | 288 for id_line, line in enumerate(csv_file): |
216 if header is True and id_line == 0 : continue | 289 if header is True and id_line == 0 : continue |
217 value = line[ncol].replace('"', "").strip() | 290 value = line[ncol].replace('"', "").replace(",",".").strip() |
218 if value.replace(".", "", 1).isdigit(): | 291 if value.replace(".", "", 1).isdigit(): |
219 value=float(value) | 292 value=float(value) |
220 if inclusive is True: | 293 if inclusive is True: |
221 in_range = not (bottom_value <= value <= top_value) | 294 in_range = not (bottom_value <= value <= top_value) |
222 else : | 295 else : |
223 in_range = not (bottom_value < value < top_value) | 296 in_range = not (bottom_value < value < top_value) |
224 | 297 |
225 #adding the result to the dictionary | 298 #adding the result to the dictionary |
226 if id_line in results_dict : results_dict[id_line].append(in_range) | 299 if id_line in results_dict : results_dict[id_line].append(in_range) |
227 else : results_dict[id_line]=[in_range] | 300 else : results_dict[id_line]=[in_range] |
301 | |
302 #impossible to treat (ex : "" instead of a number), we keep the line by default | |
303 else : | |
304 nb_string+=1 | |
305 if id_line in results_dict : results_dict[id_line].append(False) | |
306 else : results_dict[id_line]=[False] | |
307 | |
308 #number of lines in the csv file | |
309 if header : nb_lines = len(csv_file) -1 | |
310 else : nb_lines = len(csv_file) | |
311 | |
312 #if there's no numeric value in the column | |
313 if nb_string == nb_lines : | |
314 print ('No numeric values found in the column '+str(ncol+1)) | |
315 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | |
316 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | |
228 | 317 |
229 return results_dict | 318 return results_dict |
230 | 319 |
231 def column_from_txt(ncol): | 320 def column_from_txt(ncol): |
232 if is_number("int", ncol.replace("c", "")): | 321 if is_number("int", ncol.replace("c", "")): |