Mercurial > repos > proteore > proteore_filter_keywords_values
comparison filter_kw_val.py @ 8:98cb671a92eb draft default tip
"planemo upload commit 80e3e50ca52b5b232f91e6dd6850da606d9c4c5f-dirty"
author | proteore |
---|---|
date | Mon, 10 May 2021 12:27:04 +0000 |
parents | b4641c0f8a82 |
children |
comparison
equal
deleted
inserted
replaced
7:5621406a4d2f | 8:98cb671a92eb |
---|---|
1 import argparse, re, csv, sys | 1 import argparse |
2 import csv | |
3 import re | |
4 import sys | |
5 | |
2 | 6 |
3 def options(): | 7 def options(): |
4 """ | 8 """ |
5 Parse options: | 9 Parse options: |
6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] | 10 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] # noqa 501 |
7 --kw Keyword to be filtered, the column number where this filter applies, | 11 --kw Keyword to be filtered, the column number where this filter applies, |
8 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. | 12 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. |
9 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" | 13 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" |
10 --kwfile A file that contains keywords to be filter, the column where this filter applies and | 14 --kwfile A file that contains keywords to be filter, the column where this filter applies and |
11 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] | 15 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] |
12 --value The value to be filtered, the column number where this filter applies and the | 16 --value The value to be filtered, the column number where this filter applies and the |
13 operation symbol ["value,ncol,=/>/>=/</<=/!="] | 17 operation symbol ["value,ncol,=/>/>=/</<=/!="] |
14 --values_range range of values to be keep, example : --values_range 5 20 c1 true | 18 --values_range range of values to be keep, example : --values_range 5 20 c1 true |
15 --operation 'keep' or 'discard' lines concerned by filter(s) | 19 --operation 'keep' or 'discard' lines concerned by filter(s) |
16 --operator The operator used to filter with several keywords/values : AND or OR | 20 --operator The operator used to filter with several keywords/values : AND or OR |
17 --o --output The output filename | 21 --o --output The output filename |
22 parser.add_argument("-i", "--input", help="Input file", required=True) | 26 parser.add_argument("-i", "--input", help="Input file", required=True) |
23 parser.add_argument("--kw", nargs="+", action="append", help="") | 27 parser.add_argument("--kw", nargs="+", action="append", help="") |
24 parser.add_argument("--kw_file", nargs="+", action="append", help="") | 28 parser.add_argument("--kw_file", nargs="+", action="append", help="") |
25 parser.add_argument("--value", nargs="+", action="append", help="") | 29 parser.add_argument("--value", nargs="+", action="append", help="") |
26 parser.add_argument("--values_range", nargs="+", action="append", help="") | 30 parser.add_argument("--values_range", nargs="+", action="append", help="") |
27 parser.add_argument("--operation", default="keep", type=str, choices=['keep','discard'],help='') | 31 parser.add_argument("--operation", default="keep", type=str, choices=['keep', 'discard'], help='') # noqa 501 |
28 parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='') | 32 parser.add_argument("--operator", default="OR", type=str, choices=['AND', 'OR'], help='') # noqa 501 |
29 parser.add_argument("-o", "--output", default="output.txt") | 33 parser.add_argument("-o", "--output", default="output.txt") |
30 parser.add_argument("--discarded_lines", default="filtered_output.txt") | 34 parser.add_argument("--discarded_lines", default="filtered_output.txt") |
31 parser.add_argument("-s","--sort_col", help="") | 35 parser.add_argument("-s", "--sort_col", help="") |
32 | 36 |
33 args = parser.parse_args() | 37 args = parser.parse_args() |
34 | 38 |
35 filters(args) | 39 filters(args) |
40 | |
36 | 41 |
37 def str_to_bool(v): | 42 def str_to_bool(v): |
38 if v.lower() in ('yes', 'true', 't', 'y', '1'): | 43 if v.lower() in ('yes', 'true', 't', 'y', '1'): |
39 return True | 44 return True |
40 elif v.lower() in ('no', 'false', 'f', 'n', '0'): | 45 elif v.lower() in ('no', 'false', 'f', 'n', '0'): |
41 return False | 46 return False |
42 else: | 47 else: |
43 raise argparse.ArgumentTypeError('Boolean value expected.') | 48 raise argparse.ArgumentTypeError('Boolean value expected.') |
44 | 49 |
45 def proper_ncol (ncol,file): | 50 |
51 def proper_ncol(ncol, file): | |
46 if ncol not in range(len(file[0])): | 52 if ncol not in range(len(file[0])): |
47 print("Column "+str(ncol+1)+" not found in input file") | 53 print("Column "+str(ncol+1)+" not found in input file") |
48 #traceback.print_exc(file=sys.stdout) | 54 # traceback.print_exc(file=sys.stdout) |
49 sys.exit(1) | 55 sys.exit(1) |
50 | 56 |
51 #Check if a variable is a float or an integer | 57 # Check if a variable is a float or an integer |
58 | |
59 | |
52 def is_number(number_format, n): | 60 def is_number(number_format, n): |
53 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") | 61 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") |
54 int_format = re.compile(r"^[-]?[0-9][0-9]*$") | 62 int_format = re.compile(r"^[-]?[0-9][0-9]*$") |
55 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") | 63 scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$") |
56 test = "" | 64 test = "" |
57 if number_format == "int": | 65 if number_format == "int": |
58 test = re.match(int_format, n) | 66 test = re.match(int_format, n) |
59 elif number_format == "float": | 67 elif number_format == "float": |
60 test = re.match(float_format, n) | 68 test = re.match(float_format, n) |
61 if test is None : test = re.match(scientific_number,n) | 69 if test is None: |
70 test = re.match(scientific_number, n) | |
62 | 71 |
63 if test: | 72 if test: |
64 return True | 73 return True |
65 else : | 74 else: |
66 return False | 75 return False |
67 | 76 |
68 #Filter the document | 77 # Filter the document |
78 | |
79 | |
69 def filters(args): | 80 def filters(args): |
70 filename = args.input.split(",")[0] | 81 filename = args.input.split(",")[0] |
71 header = str_to_bool(args.input.split(",")[1]) | 82 header = str_to_bool(args.input.split(",")[1]) |
72 csv_file = blank_to_NA(read_file(filename)) | 83 csv_file = blank_to_NA(read_file(filename)) |
73 results_dict = {} | 84 results_dict = {} |
74 operator_dict = { "Equal" : "=" , "Higher" : ">" , "Equal-or-higher" : ">=" , "Lower" : "<" , "Equal-or-lower" : "<=" , "Different" : "!=" } | 85 operator_dict = {"Equal": "=", "Higher": ">", "Equal-or-higher": ">=", "Lower": "<", "Equal-or-lower": "<=", "Different": "!="} # noqa 501 |
75 | 86 |
76 if args.kw: | 87 if args.kw: |
77 keywords = args.kw | 88 keywords = args.kw |
78 for k in keywords: | 89 for k in keywords: |
79 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) | 90 results_dict = filter_keyword(csv_file, |
91 header, | |
92 results_dict, | |
93 k[0], | |
94 k[1], | |
95 k[2]) | |
80 | 96 |
81 if args.kw_file: | 97 if args.kw_file: |
82 key_files = args.kw_file | 98 key_files = args.kw_file |
83 for kf in key_files: | 99 for kf in key_files: |
84 header = str_to_bool(kf[1]) | 100 header = str_to_bool(kf[1]) |
85 ncol = column_from_txt(kf[2],csv_file) | 101 ncol = column_from_txt(kf[2], csv_file) |
86 keywords = read_keywords_file(kf[0],header,ncol) | 102 keywords = read_keywords_file(kf[0], header, ncol) |
87 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) | 103 results_dict = filter_keyword(csv_file, header, results_dict, |
104 keywords, kf[3], kf[4]) | |
88 | 105 |
89 if args.value: | 106 if args.value: |
90 for v in args.value: | 107 for v in args.value: |
91 v[0] = v[0].replace(",",".") | 108 v[0] = v[0].replace(",", ".") |
92 v[2] = operator_dict[v[2]] | 109 v[2] = operator_dict[v[2]] |
93 if is_number("float", v[0]): | 110 if is_number("float", v[0]): |
94 csv_file = comma_number_to_float(csv_file,column_from_txt(v[1],csv_file),header) | 111 csv_file = comma_number_to_float(csv_file, |
95 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) | 112 column_from_txt( |
113 v[1], csv_file), header) | |
114 results_dict = filter_value(csv_file, header, | |
115 results_dict, v[0], v[1], v[2]) | |
96 else: | 116 else: |
97 raise ValueError("Please enter a number in filter by value") | 117 raise ValueError("Please enter a number in filter by value") |
98 | 118 |
99 if args.values_range: | 119 if args.values_range: |
100 for vr in args.values_range: | 120 for vr in args.values_range: |
101 vr[:2] = [value.replace(",",".") for value in vr[:2]] | 121 vr[:2] = [value.replace(",", ".") for value in vr[:2]] |
102 csv_file = comma_number_to_float(csv_file,column_from_txt(vr[2],csv_file),header) | 122 csv_file = comma_number_to_float(csv_file, |
103 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): | 123 column_from_txt( |
104 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) | 124 vr[2], csv_file), header) |
105 | 125 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float", vr[1]) or is_number("int", vr[1])): # noqa 501 |
106 remaining_lines=[] | 126 results_dict = filter_values_range(csv_file, |
107 filtered_lines=[] | 127 header, results_dict, |
108 | 128 vr[0], vr[1], vr[2], vr[3]) |
109 if header is True : | 129 |
130 remaining_lines = [] | |
131 filtered_lines = [] | |
132 | |
133 if header is True: | |
110 remaining_lines.append(csv_file[0]) | 134 remaining_lines.append(csv_file[0]) |
111 filtered_lines.append(csv_file[0]) | 135 filtered_lines.append(csv_file[0]) |
112 | 136 |
113 if results_dict == {} : #no filter used | 137 if results_dict == {}: # no filter used |
114 remaining_lines.extend(csv_file[1:]) | 138 remaining_lines.extend(csv_file[1:]) |
115 else : | 139 else: |
116 for id_line,line in enumerate(csv_file) : | 140 for id_line, line in enumerate(csv_file): |
117 if id_line in results_dict : #skip header and empty lines | 141 if id_line in results_dict: # skip header and empty lines |
118 if args.operator == 'OR' : | 142 if args.operator == 'OR': |
119 if any(results_dict[id_line]) : | 143 if any(results_dict[id_line]): |
120 filtered_lines.append(line) | 144 filtered_lines.append(line) |
121 else : | 145 else: |
122 remaining_lines.append(line) | 146 remaining_lines.append(line) |
123 | 147 |
124 elif args.operator == "AND" : | 148 elif args.operator == "AND": |
125 if all(results_dict[id_line]) : | 149 if all(results_dict[id_line]): |
126 filtered_lines.append(line) | 150 filtered_lines.append(line) |
127 else : | 151 else: |
128 remaining_lines.append(line) | 152 remaining_lines.append(line) |
129 | 153 |
130 #sort of results by column | 154 # sort of results by column |
131 if args.sort_col : | 155 if args.sort_col: |
132 sort_col=args.sort_col.split(",")[0] | 156 sort_col = args.sort_col.split(",")[0] |
133 sort_col=column_from_txt(sort_col,csv_file) | 157 sort_col = column_from_txt(sort_col, csv_file) |
134 reverse=str_to_bool(args.sort_col.split(",")[1]) | 158 reverse = str_to_bool(args.sort_col.split(",")[1]) |
135 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) | 159 remaining_lines = sort_by_column(remaining_lines, sort_col, |
136 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) | 160 reverse, header) |
137 | 161 filtered_lines = sort_by_column(filtered_lines, sort_col, |
138 #swap lists of lines (files) if 'keep' option selected | 162 reverse, header) |
139 if args.operation == "keep" : | 163 |
164 # swap lists of lines (files) if 'keep' option selected | |
165 if args.operation == "keep": | |
140 swap = remaining_lines, filtered_lines | 166 swap = remaining_lines, filtered_lines |
141 remaining_lines = swap[1] | 167 remaining_lines = swap[1] |
142 filtered_lines = swap[0] | 168 filtered_lines = swap[0] |
143 | 169 |
144 # Write results to output | 170 # Write results to output |
145 with open(args.output,"w") as output : | 171 with open(args.output, "w") as output: |
146 writer = csv.writer(output,delimiter="\t") | 172 writer = csv.writer(output, delimiter="\t") |
147 writer.writerows(remaining_lines) | 173 writer.writerows(remaining_lines) |
148 | 174 |
149 # Write filtered lines to filtered_output | 175 # Write filtered lines to filtered_output |
150 with open(args.discarded_lines,"w") as filtered_output : | 176 with open(args.discarded_lines, "w") as filtered_output: |
151 writer = csv.writer(filtered_output,delimiter="\t") | 177 writer = csv.writer(filtered_output, delimiter="\t") |
152 writer.writerows(filtered_lines) | 178 writer.writerows(filtered_lines) |
153 | 179 |
154 #function to sort the csv_file by value in a specific column | 180 # function to sort the csv_file by value in a specific column |
155 def sort_by_column(tab,sort_col,reverse,header): | 181 |
156 | 182 |
157 if len(tab) > 1 : #if there's more than just a header or 1 row | 183 def sort_by_column(tab, sort_col, reverse, header): |
158 if header : | 184 |
159 head=tab[0] | 185 if len(tab) > 1: # if there's more than just a header or 1 row |
160 tab=tab[1:] | 186 if header: |
161 | 187 head = tab[0] |
162 #list of empty cells in the column to sort | 188 tab = tab[1:] |
163 unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] | 189 |
164 unsorted_tab=[ tab[i] for i in unsortable_lines] | 190 # list of empty cells in the column to sort |
165 tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] | 191 unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] # noqa 501 |
166 | 192 unsorted_tab = [tab[i] for i in unsortable_lines] |
167 if only_number(tab,sort_col) and any_float(tab,sort_col) : | 193 tab = [line for i, line in enumerate(tab) if i not in unsortable_lines] |
168 tab = comma_number_to_float(tab,sort_col,False) | 194 |
169 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) | 195 if only_number(tab, sort_col) and any_float(tab, sort_col): |
170 elif only_number(tab,sort_col): | 196 tab = comma_number_to_float(tab, sort_col, False) |
171 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) | 197 tab = sorted(tab, key=lambda row: float(row[sort_col]), |
172 else : | 198 reverse=reverse) |
199 elif only_number(tab, sort_col): | |
200 tab = sorted(tab, key=lambda row: int(row[sort_col]), | |
201 reverse=reverse) | |
202 else: | |
173 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) | 203 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) |
174 | 204 |
175 tab.extend(unsorted_tab) | 205 tab.extend(unsorted_tab) |
176 if header is True : tab = [head]+tab | 206 if header is True: |
207 tab = [head]+tab | |
177 | 208 |
178 return tab | 209 return tab |
179 | 210 |
180 | 211 |
181 #replace all blank cells to NA | 212 # replace all blank cells to NA |
182 def blank_to_NA(csv_file) : | 213 |
183 | 214 |
184 tmp=[] | 215 def blank_to_NA(csv_file): |
185 for line in csv_file : | 216 |
186 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] | 217 tmp = [] |
218 for line in csv_file: | |
219 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] # noqa 501 | |
187 tmp.append(line) | 220 tmp.append(line) |
188 | 221 |
189 return tmp | 222 return tmp |
190 | 223 |
191 #turn into float a column | 224 # turn into float a column |
192 def comma_number_to_float(csv_file,ncol,header) : | 225 |
193 if header : | 226 |
194 tmp=[csv_file[0]] | 227 def comma_number_to_float(csv_file, ncol, header): |
195 csv_file=csv_file[1:] | 228 if header: |
196 else : | 229 tmp = [csv_file[0]] |
197 tmp=[] | 230 csv_file = csv_file[1:] |
198 | 231 else: |
199 for line in csv_file : | 232 tmp = [] |
200 line[ncol]=line[ncol].replace(",",".") | 233 |
234 for line in csv_file: | |
235 line[ncol] = line[ncol].replace(",", ".") | |
201 tmp.append(line) | 236 tmp.append(line) |
202 | 237 |
203 return (tmp) | 238 return (tmp) |
204 | 239 |
205 #return True is there is at least one float in the column | 240 # return True is there is at least one float in the column |
206 def any_float(tab,col) : | 241 |
207 | 242 |
208 for line in tab : | 243 def any_float(tab, col): |
209 if is_number("float",line[col].replace(",",".")) : | 244 |
245 for line in tab: | |
246 if is_number("float", line[col].replace(",", ".")): | |
210 return True | 247 return True |
211 | 248 |
212 return False | 249 return False |
213 | 250 |
214 def only_number(tab,col) : | 251 |
215 for line in tab : | 252 def only_number(tab, col): |
216 if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : | 253 for line in tab: |
254 if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))): # noqa 501 | |
217 return False | 255 return False |
218 return True | 256 return True |
219 | 257 |
220 #Read the keywords file to extract the list of keywords | 258 # Read the keywords file to extract the list of keywords |
221 def read_keywords_file(filename,header,ncol): | 259 |
222 with open(filename, "r") as csv_file : | 260 |
223 lines= csv.reader(csv_file, delimiter='\t') | 261 def read_keywords_file(filename, header, ncol): |
262 with open(filename, "r") as csv_file: | |
263 lines = csv.reader(csv_file, delimiter='\t') | |
224 lines = blank_to_NA(lines) | 264 lines = blank_to_NA(lines) |
225 if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines] | 265 if (len(lines[0])) > 1: |
226 else : | 266 keywords = [line[ncol] for line in lines] |
227 keywords= ["".join(key) for key in lines] | 267 else: |
228 if header : keywords = keywords[1:] | 268 keywords = ["".join(key) for key in lines] |
269 if header: | |
270 keywords = keywords[1:] | |
229 keywords = list(set(keywords)) | 271 keywords = list(set(keywords)) |
230 | 272 |
231 return keywords | 273 return keywords |
232 | 274 |
233 # Read input file | 275 # Read input file |
276 | |
277 | |
234 def read_file(filename): | 278 def read_file(filename): |
235 with open(filename,"r") as f : | 279 with open(filename, "r") as f: |
236 reader=csv.reader(f,delimiter="\t") | 280 reader = csv.reader(f, delimiter="\t") |
237 tab=list(reader) | 281 tab = list(reader) |
238 | 282 |
239 # Remove empty lines (contain only space or new line or "") | 283 # Remove empty lines (contain only space or new line or "") |
240 #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] | 284 # [tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] |
241 tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ] | 285 tab = [line for line in tab if len("".join(line).replace(" ", "")) != 0] # noqa 501 |
242 | 286 |
243 return tab | 287 return tab |
244 | 288 |
245 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise) | 289 # seek for keywords in rows of csvfile, return a dictionary of boolean |
290 # (true if keyword found, false otherwise) | |
291 | |
292 | |
246 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): | 293 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): |
247 match=str_to_bool(match) | 294 match = str_to_bool(match) |
248 ncol=column_from_txt(ncol,csv_file) | 295 ncol = column_from_txt(ncol, csv_file) |
249 if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword | 296 if type(keywords) != list: |
250 | 297 keywords = keywords.upper().split() # Split list of filter keyword |
251 for id_line,line in enumerate(csv_file): | 298 |
252 if header is True and id_line == 0 : continue | 299 for id_line, line in enumerate(csv_file): |
300 if header is True and id_line == 0: | |
301 continue | |
253 keyword_inline = line[ncol].replace('"', "").split(";") | 302 keyword_inline = line[ncol].replace('"', "").split(";") |
254 | 303 |
255 #Perfect match or not | 304 # Perfect match or not |
256 if match is True : | 305 if match is True: |
257 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) | 306 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) # noqa 501 |
258 else: | 307 else: |
259 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) | 308 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) # noqa 501 |
260 | 309 |
261 #if the keyword is found in line | 310 # if the keyword is found in line |
262 if id_line in results_dict : results_dict[id_line].append(found_in_line) | 311 if id_line in results_dict: |
263 else : results_dict[id_line]=[found_in_line] | 312 results_dict[id_line].append(found_in_line) |
313 else: | |
314 results_dict[id_line] = [found_in_line] | |
264 | 315 |
265 return results_dict | 316 return results_dict |
266 | 317 |
267 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 318 # filter ba determined value in rows of csvfile, return a dictionary |
319 # of boolean (true if value filtered, false otherwise) | |
320 | |
321 | |
268 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): | 322 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): |
269 | 323 |
270 filter_value = float(filter_value) | 324 filter_value = float(filter_value) |
271 ncol=column_from_txt(ncol,csv_file) | 325 ncol = column_from_txt(ncol, csv_file) |
272 nb_string=0 | 326 nb_string = 0 |
273 | 327 |
274 for id_line,line in enumerate(csv_file): | 328 for id_line, line in enumerate(csv_file): |
275 if header is True and id_line == 0 : continue | 329 if header is True and id_line == 0: |
276 value = line[ncol].replace('"', "").replace(",",".").strip() | 330 continue |
331 value = line[ncol].replace('"', "").replace(",", ".").strip() | |
277 if value.replace(".", "", 1).isdigit(): | 332 if value.replace(".", "", 1).isdigit(): |
278 to_filter=value_compare(value,filter_value,opt) | 333 to_filter = value_compare(value, filter_value, opt) |
279 | 334 |
280 #adding the result to the dictionary | 335 # adding the result to the dictionary |
281 if id_line in results_dict : results_dict[id_line].append(to_filter) | 336 if id_line in results_dict: |
282 else : results_dict[id_line]=[to_filter] | 337 results_dict[id_line].append(to_filter) |
283 | 338 else: |
284 #impossible to treat (ex : "" instead of a number), we keep the line by default | 339 results_dict[id_line] = [to_filter] |
285 else : | 340 |
286 nb_string+=1 | 341 # impossible to treat (ex : "" instead of a number), |
287 if id_line in results_dict : results_dict[id_line].append(False) | 342 # we keep the line by default |
288 else : results_dict[id_line]=[False] | 343 else: |
289 | 344 nb_string += 1 |
290 #number of lines in the csv file | 345 if id_line in results_dict: |
291 if header : nb_lines = len(csv_file) -1 | 346 results_dict[id_line].append(False) |
292 else : nb_lines = len(csv_file) | 347 else: |
293 | 348 results_dict[id_line] = [False] |
294 #if there's no numeric value in the column | 349 |
295 if nb_string == nb_lines : | 350 # number of lines in the csv file |
296 print ('No numeric values found in the column '+str(ncol+1)) | 351 if header: |
297 print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) | 352 nb_lines = len(csv_file) - 1 |
298 | 353 else: |
354 nb_lines = len(csv_file) | |
355 | |
356 # if there's no numeric value in the column | |
357 if nb_string == nb_lines: | |
358 print('No numeric values found in the column '+str(ncol+1)) | |
359 print('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 | |
360 | |
299 return results_dict | 361 return results_dict |
300 | 362 |
301 #filter ba determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise) | 363 # filter ba determined value in rows of csvfile, return a dictionary |
302 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): | 364 # of boolean (true if value filtered, false otherwise) |
303 inclusive=str_to_bool(inclusive) | 365 |
366 | |
367 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): # noqa 501 | |
368 inclusive = str_to_bool(inclusive) | |
304 bottom_value = float(bottom_value) | 369 bottom_value = float(bottom_value) |
305 top_value=float(top_value) | 370 top_value = float(top_value) |
306 ncol=column_from_txt(ncol,csv_file) | 371 ncol = column_from_txt(ncol, csv_file) |
307 nb_string=0 | 372 nb_string = 0 |
308 | 373 |
309 for id_line, line in enumerate(csv_file): | 374 for id_line, line in enumerate(csv_file): |
310 if header is True and id_line == 0 : continue | 375 if header is True and id_line == 0: |
311 value = line[ncol].replace('"', "").replace(",",".").strip() | 376 continue |
377 value = line[ncol].replace('"', "").replace(",", ".").strip() | |
312 if value.replace(".", "", 1).isdigit(): | 378 if value.replace(".", "", 1).isdigit(): |
313 value=float(value) | 379 value = float(value) |
314 if inclusive is True: | 380 if inclusive is True: |
315 in_range = not (bottom_value <= value <= top_value) | 381 in_range = not (bottom_value <= value <= top_value) |
316 else : | 382 else: |
317 in_range = not (bottom_value < value < top_value) | 383 in_range = not (bottom_value < value < top_value) |
318 | 384 |
319 #adding the result to the dictionary | 385 # adding the result to the dictionary |
320 if id_line in results_dict : results_dict[id_line].append(in_range) | 386 if id_line in results_dict: |
321 else : results_dict[id_line]=[in_range] | 387 results_dict[id_line].append(in_range) |
322 | 388 else: |
323 #impossible to treat (ex : "" instead of a number), we keep the line by default | 389 results_dict[id_line] = [in_range] |
324 else : | 390 |
325 nb_string+=1 | 391 # impossible to treat (ex : "" instead of a number), |
326 if id_line in results_dict : results_dict[id_line].append(False) | 392 # we keep the line by default |
327 else : results_dict[id_line]=[False] | 393 else: |
328 | 394 nb_string += 1 |
329 #number of lines in the csv file | 395 if id_line in results_dict: |
330 if header : nb_lines = len(csv_file) -1 | 396 results_dict[id_line].append(False) |
331 else : nb_lines = len(csv_file) | 397 else: |
332 | 398 results_dict[id_line] = [False] |
333 #if there's no numeric value in the column | 399 |
334 if nb_string == nb_lines : | 400 # number of lines in the csv file |
335 print ('No numeric values found in the column '+str(ncol+1)) | 401 if header: |
336 if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | 402 nb_lines = len(csv_file) - 1 |
337 else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) | 403 else: |
338 | 404 nb_lines = len(csv_file) |
339 return results_dict | 405 |
340 | 406 # if there's no numeric value in the column |
341 def column_from_txt(ncol,file): | 407 if nb_string == nb_lines: |
342 if is_number("int", ncol.replace("c", "")): | 408 print('No numeric values found in the column '+str(ncol+1)) |
343 ncol = int(ncol.replace("c", "")) - 1 | 409 if inclusive: |
410 print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 | |
411 else: | |
412 print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 | |
413 | |
414 return results_dict | |
415 | |
416 | |
417 def column_from_txt(ncol, file): | |
418 if is_number("int", ncol.replace("c", "")): | |
419 ncol = int(ncol.replace("c", "")) - 1 | |
344 else: | 420 else: |
345 raise ValueError("Please specify the column where " | 421 raise ValueError("Please specify the column where " |
346 "you would like to apply the filter " | 422 "you would like to apply the filter " |
347 "with valid format") | 423 "with valid format") |
348 | 424 |
349 proper_ncol (ncol,file) | 425 proper_ncol(ncol, file) |
350 | 426 |
351 return ncol | 427 return ncol |
352 | 428 |
353 #return True if value is in the determined values, false otherwise | 429 # return True if value is in the determined values, false otherwise |
354 def value_compare(value,filter_value,opt): | 430 |
355 test_value=False | 431 |
432 def value_compare(value, filter_value, opt): | |
433 test_value = False | |
356 | 434 |
357 if opt == "<": | 435 if opt == "<": |
358 if float(value) < filter_value: | 436 if float(value) < filter_value: |
359 test_value = True | 437 test_value = True |
360 elif opt == "<=": | 438 elif opt == "<=": |
367 if float(value) >= filter_value: | 445 if float(value) >= filter_value: |
368 test_value = True | 446 test_value = True |
369 elif opt == "=": | 447 elif opt == "=": |
370 if float(value) == filter_value: | 448 if float(value) == filter_value: |
371 test_value = True | 449 test_value = True |
372 elif opt == "!=": | 450 elif opt == "!=": |
373 if float(value) != filter_value: | 451 if float(value) != filter_value: |
374 test_value = True | 452 test_value = True |
375 | 453 |
376 return test_value | 454 return test_value |
455 | |
377 | 456 |
378 if __name__ == "__main__": | 457 if __name__ == "__main__": |
379 options() | 458 options() |