comparison filter_kw_val.py @ 2:52a7afd01c6d draft

planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author proteore
date Tue, 18 Dec 2018 09:25:11 -0500
parents a55e8b137c6b
children 2080e2a4f209
comparison
equal deleted inserted replaced
1:cb9555653b09 2:52a7afd01c6d
53 53
54 #Filter the document 54 #Filter the document
55 def filters(args): 55 def filters(args):
56 filename = args.input.split(",")[0] 56 filename = args.input.split(",")[0]
57 header = str_to_bool(args.input.split(",")[1]) 57 header = str_to_bool(args.input.split(",")[1])
58 csv_file = read_file(filename) 58 csv_file = blank_to_NA(read_file(filename))
59 results_dict = {} 59 results_dict = {}
60 60
61 if args.kw: 61 if args.kw:
62 keywords = args.kw 62 keywords = args.kw
63 for k in keywords: 63 for k in keywords:
64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) 64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])
65 65
66 if args.kw_file: 66 if args.kw_file:
67 key_files = args.kw_file 67 key_files = args.kw_file
68 for kf in key_files: 68 for kf in key_files:
69 keywords = read_option(kf[0]) 69 header = str_to_bool(kf[1])
70 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2]) 70 ncol = column_from_txt(kf[2])
71 keywords = read_keywords_file(kf[0],header,ncol)
72 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])
71 73
72 if args.value: 74 if args.value:
73 for v in args.value: 75 for v in args.value:
76 v[0] = v[0].replace(",",".")
74 if is_number("float", v[0]): 77 if is_number("float", v[0]):
78 csv_file = comma_number_to_float(csv_file,v[1],header)
75 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) 79 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
76 else: 80 else:
77 raise ValueError("Please enter a number in filter by value") 81 raise ValueError("Please enter a number in filter by value")
78 82
79 if args.values_range: 83 if args.values_range:
80 for vr in args.values_range: 84 for vr in args.values_range:
85 vr[:2] = [value.replace(",",".") for value in vr[:2]]
86 csv_file = comma_number_to_float(csv_file,vr[2],header)
81 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): 87 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
82 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) 88 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
83 89
84 remaining_lines=[] 90 remaining_lines=[]
85 filtered_lines=[] 91 filtered_lines=[]
86 92
87 if header is True : 93 if header is True :
88 remaining_lines.append(csv_file[0]) 94 remaining_lines.append(csv_file[0])
89 filtered_lines.append(csv_file[0]) 95 filtered_lines.append(csv_file[0])
90 96
91 for id_line,line in enumerate(csv_file) : 97 if results_dict == {} : #no filter used
92 if id_line in results_dict : #skip header and empty lines 98 remaining_lines.extend(csv_file[1:])
93 if args.operator == 'OR' : 99 else :
94 if any(results_dict[id_line]) : 100 for id_line,line in enumerate(csv_file) :
95 filtered_lines.append(line) 101 if id_line in results_dict : #skip header and empty lines
96 else : 102 if args.operator == 'OR' :
97 remaining_lines.append(line) 103 if any(results_dict[id_line]) :
98 104 filtered_lines.append(line)
99 elif args.operator == "AND" : 105 else :
100 if all(results_dict[id_line]) : 106 remaining_lines.append(line)
101 filtered_lines.append(line) 107
102 else : 108 elif args.operator == "AND" :
103 remaining_lines.append(line) 109 if all(results_dict[id_line]) :
104 110 filtered_lines.append(line)
111 else :
112 remaining_lines.append(line)
113
105 #sort of results by column 114 #sort of results by column
106 if args.sort_col : 115 if args.sort_col :
107 sort_col=args.sort_col.split(",")[0] 116 sort_col=args.sort_col.split(",")[0]
108 sort_col=column_from_txt(sort_col) 117 sort_col=column_from_txt(sort_col)
109 reverse=str_to_bool(args.sort_col.split(",")[1]) 118 reverse=str_to_bool(args.sort_col.split(",")[1])
122 131
123 #function to sort the csv_file by value in a specific column 132 #function to sort the csv_file by value in a specific column
124 def sort_by_column(tab,sort_col,reverse,header): 133 def sort_by_column(tab,sort_col,reverse,header):
125 134
126 if len(tab) > 1 : #if there's more than just a header or 1 row 135 if len(tab) > 1 : #if there's more than just a header or 1 row
127 if header is True : 136 if header :
128 head=tab[0] 137 head=tab[0]
129 tab=tab[1:] 138 tab=tab[1:]
130 139
131 if is_number("int",tab[0][sort_col]) : 140 #list of empty cells in the column to sort
132 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) 141 unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')]
133 elif is_number("float",tab[0][sort_col]) : 142 unsorted_tab=[ tab[i] for i in unsortable_lines]
143 tab= [line for i,line in enumerate(tab) if i not in unsortable_lines]
144
145 if only_number(tab,sort_col) and any_float(tab,sort_col) :
134 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) 146 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
147 elif only_number(tab,sort_col):
148 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
135 else : 149 else :
136 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) 150 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
137 151
152 tab.extend(unsorted_tab)
138 if header is True : tab = [head]+tab 153 if header is True : tab = [head]+tab
139 154
140 return tab 155 return tab
141 156
157
158 #replace all blank cells to NA
159 def blank_to_NA(csv_file) :
160
161 tmp=[]
162 for line in csv_file :
163 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ]
164 tmp.append(line)
165
166 return tmp
167
168 #turn into float a column
169 def comma_number_to_float(csv_file,ncol,header) :
170 ncol = int(ncol.replace("c","")) - 1
171 if header :
172 tmp=[csv_file[0]]
173 csv_file=csv_file[1:]
174 else :
175 tmp=[]
176
177 for line in csv_file :
178 line[ncol]=line[ncol].replace(",",".")
179 tmp.append(line)
180
181 return (tmp)
182
183 #return True is there is at least one float in the column
184 def any_float(tab,col) :
185
186 for line in tab :
187 if is_number("float",line[col].replace(",",".")) :
188 return True
189
190 return False
191
192 def only_number(tab,col) :
193
194 for line in tab :
195 if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) :
196 return False
197 return True
198
142 #Read the keywords file to extract the list of keywords 199 #Read the keywords file to extract the list of keywords
143 def read_option(filename): 200 def read_keywords_file(filename,header,ncol):
144 with open(filename, "r") as f: 201 with open(filename, "r") as csv_file :
145 filter_list=f.read().splitlines() 202 lines= csv.reader(csv_file, delimiter='\t')
146 filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0] 203 lines = blank_to_NA(lines)
147 filters=";".join(filter_list) 204 if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines]
148 205 else :
149 return filters 206 keywords= ["".join(key) for key in lines]
207 if header : keywords = keywords[1:]
208 keywords = list(set(keywords))
209
210 return keywords
150 211
151 # Read input file 212 # Read input file
152 def read_file(filename): 213 def read_file(filename):
153 with open(filename,"r") as f : 214 with open(filename,"r") as f :
154 reader=csv.reader(f,delimiter="\t") 215 reader=csv.reader(f,delimiter="\t")
162 223
163 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) 224 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise)
164 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): 225 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
165 match=str_to_bool(match) 226 match=str_to_bool(match)
166 ncol=column_from_txt(ncol) 227 ncol=column_from_txt(ncol)
167 228 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword
168 keywords = keywords.upper().split(";") # Split list of filter keyword
169 [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords
170 keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords
171 229
172 for id_line,line in enumerate(csv_file): 230 for id_line,line in enumerate(csv_file):
173 if header is True and id_line == 0 : continue 231 if header is True and id_line == 0 : continue
174 #line = line.replace("\n", "")
175 keyword_inline = line[ncol].replace('"', "").split(";") 232 keyword_inline = line[ncol].replace('"', "").split(";")
176 #line = line + "\n"
177 233
178 #Perfect match or not 234 #Perfect match or not
179 if match is True : 235 if match is True :
180 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) 236 found_in_line = any(pid.upper() in keywords for pid in keyword_inline)
181 else: 237 else:
190 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 246 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
191 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): 247 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
192 248
193 filter_value = float(filter_value) 249 filter_value = float(filter_value)
194 ncol=column_from_txt(ncol) 250 ncol=column_from_txt(ncol)
251 nb_string=0
195 252
196 for id_line,line in enumerate(csv_file): 253 for id_line,line in enumerate(csv_file):
197 if header is True and id_line == 0 : continue 254 if header is True and id_line == 0 : continue
198 value = line[ncol].replace('"', "").strip() 255 value = line[ncol].replace('"', "").replace(",",".").strip()
199 if value.replace(".", "", 1).isdigit(): 256 if value.replace(".", "", 1).isdigit():
200 to_filter=value_compare(value,filter_value,opt) 257 to_filter=value_compare(value,filter_value,opt)
201 258
202 #adding the result to the dictionary 259 #adding the result to the dictionary
203 if id_line in results_dict : results_dict[id_line].append(to_filter) 260 if id_line in results_dict : results_dict[id_line].append(to_filter)
204 else : results_dict[id_line]=[to_filter] 261 else : results_dict[id_line]=[to_filter]
262
263 #impossible to treat (ex : "" instead of a number), we keep the line by default
264 else :
265 nb_string+=1
266 if id_line in results_dict : results_dict[id_line].append(False)
267 else : results_dict[id_line]=[False]
268
269 #number of lines in the csv file
270 if header : nb_lines = len(csv_file) -1
271 else : nb_lines = len(csv_file)
272
273 #if there's no numeric value in the column
274 if nb_string == nb_lines :
275 print ('No numeric values found in the column '+str(ncol+1))
276 print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))
205 277
206 return results_dict 278 return results_dict
207 279
208 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) 280 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
209 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): 281 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
210 inclusive=str_to_bool(inclusive) 282 inclusive=str_to_bool(inclusive)
211 bottom_value = float(bottom_value) 283 bottom_value = float(bottom_value)
212 top_value=float(top_value) 284 top_value=float(top_value)
213 ncol=column_from_txt(ncol) 285 ncol=column_from_txt(ncol)
286 nb_string=0
214 287
215 for id_line, line in enumerate(csv_file): 288 for id_line, line in enumerate(csv_file):
216 if header is True and id_line == 0 : continue 289 if header is True and id_line == 0 : continue
217 value = line[ncol].replace('"', "").strip() 290 value = line[ncol].replace('"', "").replace(",",".").strip()
218 if value.replace(".", "", 1).isdigit(): 291 if value.replace(".", "", 1).isdigit():
219 value=float(value) 292 value=float(value)
220 if inclusive is True: 293 if inclusive is True:
221 in_range = not (bottom_value <= value <= top_value) 294 in_range = not (bottom_value <= value <= top_value)
222 else : 295 else :
223 in_range = not (bottom_value < value < top_value) 296 in_range = not (bottom_value < value < top_value)
224 297
225 #adding the result to the dictionary 298 #adding the result to the dictionary
226 if id_line in results_dict : results_dict[id_line].append(in_range) 299 if id_line in results_dict : results_dict[id_line].append(in_range)
227 else : results_dict[id_line]=[in_range] 300 else : results_dict[id_line]=[in_range]
301
302 #impossible to treat (ex : "" instead of a number), we keep the line by default
303 else :
304 nb_string+=1
305 if id_line in results_dict : results_dict[id_line].append(False)
306 else : results_dict[id_line]=[False]
307
308 #number of lines in the csv file
309 if header : nb_lines = len(csv_file) -1
310 else : nb_lines = len(csv_file)
311
312 #if there's no numeric value in the column
313 if nb_string == nb_lines :
314 print ('No numeric values found in the column '+str(ncol+1))
315 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
316 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
228 317
229 return results_dict 318 return results_dict
230 319
231 def column_from_txt(ncol): 320 def column_from_txt(ncol):
232 if is_number("int", ncol.replace("c", "")): 321 if is_number("int", ncol.replace("c", "")):