Mercurial > repos > proteore > filter_keywords_values
comparison filter_kw_val.py @ 7:6f32c1e12572 draft default tip
planemo upload commit 72b345a7df2c87f07a9df71ecee1f252c9355337
author | proteore |
---|---|
date | Fri, 01 Jun 2018 11:10:47 -0400 |
parents | c6ba1e6f6869 |
children |
comparison
equal
deleted
inserted
replaced
6:c6ba1e6f6869 | 7:6f32c1e12572 |
---|---|
1 import argparse | 1 import argparse, re, csv |
2 import re | |
3 | |
4 | 2 |
5 def options(): | 3 def options(): |
6 """ | 4 """ |
7 Parse options: | 5 Parse options: |
8 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] | 6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] |
9 -m, --match if the keywords should be filtered in exact | |
10 --kw Keyword to be filtered, the column number where this filter applies, | 7 --kw Keyword to be filtered, the column number where this filter applies, |
11 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. | 8 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. |
12 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" | 9 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" |
13 --kw_file A file that contains keywords to be filtered, the column where this filter applies and | 10 --kw_file A file that contains keywords to be filtered, the column where this filter applies and |
14 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] | 11 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] |
15 --value The value to be filtered, the column number where this filter applies and the | 12 --value The value to be filtered, the column number where this filter applies and the |
16 operation symbol ["value,ncol,=/>/>=/</<="] | 13 operation symbol ["value,ncol,=/>/>=/</<=/!="] |
14 --values_range Range of values to be kept, example: --values_range 5 20 c1 true | |
15 --operator The operator used to filter with several keywords/values : AND or OR | |
17 -o, --output The output filename | 16 -o, --output The output filename |
18 --trash_file The file that contains the removed lines | 17 --filtered_file The file that contains the removed lines |
18 -s, --sort_col Column used to sort the file, followed by ",true" for reverse sorting or ",false" otherwise, example: c1,false | |
19 """ | 19 """ |
20 parser = argparse.ArgumentParser() | 20 parser = argparse.ArgumentParser() |
21 parser.add_argument("-i", "--input", help="Input file", required=True) | 21 parser.add_argument("-i", "--input", help="Input file", required=True) |
22 parser.add_argument("--kw", nargs="+", action="append", help="") | 22 parser.add_argument("--kw", nargs="+", action="append", help="") |
23 parser.add_argument("--kw_file", nargs="+", action="append", help="") | 23 parser.add_argument("--kw_file", nargs="+", action="append", help="") |
24 parser.add_argument("--value", nargs="+", action="append", help="") | 24 parser.add_argument("--value", nargs="+", action="append", help="") |
25 parser.add_argument("--values_range", nargs="+", action="append", help="") | |
26 parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='') | |
25 parser.add_argument("-o", "--output", default="output.txt") | 27 parser.add_argument("-o", "--output", default="output.txt") |
26 parser.add_argument("--trash_file", default="trash_MQfilter.txt") | 28 parser.add_argument("--filtered_file", default="filtered_output.txt") |
29 parser.add_argument("-s","--sort_col", help="") | |
27 | 30 |
28 args = parser.parse_args() | 31 args = parser.parse_args() |
29 | |
30 filters(args) | 32 filters(args) |
31 | 33 |
32 def isnumber(number_format, n): | 34 def str_to_bool(v): |
33 """ | 35 if v.lower() in ('yes', 'true', 't', 'y', '1'): |
34 Check if a variable is a float or an integer | 36 return True |
35 """ | 37 elif v.lower() in ('no', 'false', 'f', 'n', '0'): |
38 return False | |
39 else: | |
40 raise argparse.ArgumentTypeError('Boolean value expected.') | |
41 | |
42 #Check if a variable is a float or an integer | |
43 def is_number(number_format, n): | |
36 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") | 44 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") |
37 int_format = re.compile(r"^[-]?[0-9][0-9]*$") | 45 int_format = re.compile(r"^[-]?[0-9][0-9]*$") |
38 test = "" | 46 test = "" |
39 if number_format == "int": | 47 if number_format == "int": |
40 test = re.match(int_format, n) | 48 test = re.match(int_format, n) |
41 elif number_format == "float": | 49 elif number_format == "float": |
42 test = re.match(float_format, n) | 50 test = re.match(float_format, n) |
43 if test: | 51 if test: |
44 return True | 52 return True |
45 | 53 |
54 #Filter the document | |
46 def filters(args): | 55 def filters(args): |
47 """ | 56 filename = args.input.split(",")[0] |
48 Filter the document | 57 header = str_to_bool(args.input.split(",")[1]) |
49 """ | 58 csv_file = read_file(filename) |
50 MQfilename = args.input.split(",")[0] | 59 results_dict = {} |
51 header = args.input.split(",")[1] | |
52 MQfile = readMQ(MQfilename) | |
53 results = [MQfile, None] | |
54 | 60 |
55 if args.kw: | 61 if args.kw: |
56 keywords = args.kw | 62 keywords = args.kw |
57 for k in keywords: | 63 for k in keywords: |
58 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2]) | 64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2]) |
65 | |
59 if args.kw_file: | 66 if args.kw_file: |
60 key_files = args.kw_file | 67 key_files = args.kw_file |
61 for kf in key_files: | 68 for kf in key_files: |
62 ids = readOption(kf[0]) | 69 keywords = read_option(kf[0]) |
63 results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2]) | 70 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2]) |
71 | |
64 if args.value: | 72 if args.value: |
65 for v in args.value: | 73 for v in args.value: |
66 if isnumber("float", v[0]): | 74 if is_number("float", v[0]): |
67 results = filter_value(results[0], header, results[1], v[0], v[1], v[2]) | 75 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) |
68 else: | 76 else: |
69 raise ValueError("Please enter a number in filter by value") | 77 raise ValueError("Please enter a number in filter by value") |
70 | 78 |
79 if args.values_range: | |
80 for vr in args.values_range: | |
81 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): | |
82 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) | |
83 | |
84 remaining_lines=[] | |
85 filtered_lines=[] | |
86 | |
87 if header is True : | |
88 remaining_lines.append(csv_file[0]) | |
89 filtered_lines.append(csv_file[0]) | |
90 | |
91 for id_line,line in enumerate(csv_file) : | |
92 if id_line in results_dict : #skip header and empty lines | |
93 if args.operator == 'OR' : | |
94 if any(results_dict[id_line]) : | |
95 filtered_lines.append(line) | |
96 else : | |
97 remaining_lines.append(line) | |
98 | |
99 elif args.operator == "AND" : | |
100 if all(results_dict[id_line]) : | |
101 filtered_lines.append(line) | |
102 else : | |
103 remaining_lines.append(line) | |
104 | |
105 #sort of results by column | |
106 if args.sort_col : | |
107 sort_col=args.sort_col.split(",")[0] | |
108 sort_col=column_from_txt(sort_col) | |
109 reverse=str_to_bool(args.sort_col.split(",")[1]) | |
110 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header) | |
111 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header) | |
112 | |
71 # Write results to output | 113 # Write results to output |
72 output = open(args.output, "w") | 114 with open(args.output,"w") as output : |
73 output.write("".join(results[0])) | 115 writer = csv.writer(output,delimiter="\t") |
74 output.close() | 116 writer.writerows(remaining_lines) |
75 | 117 |
76 # Write deleted lines to trash_file | 118 # Write filtered lines to filtered_output |
77 trash = open(args.trash_file, "w") | 119 with open(args.filtered_file,"w") as filtered_output : |
78 trash.write("".join(results[1])) | 120 writer = csv.writer(filtered_output,delimiter="\t") |
79 trash.close() | 121 writer.writerows(filtered_lines) |
80 | 122 |
81 def readOption(filename): | 123 #function to sort the csv_file by value in a specific column |
82 # Read the keywords file to extract the list of keywords | 124 def sort_by_column(tab,sort_col,reverse,header): |
83 f = open(filename, "r") | 125 |
84 file_content = f.read() | 126 if len(tab) > 1 : #if there's more than just a header or 1 row |
85 filter_list = file_content.split("\n") | 127 if header is True : |
86 filters = "" | 128 head=tab[0] |
87 for i in filter_list: | 129 tab=tab[1:] |
88 filters += i + ";" | 130 |
89 filters = filters[:-1] | 131 if is_number("int",tab[0][sort_col]) : |
132 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) | |
133 elif is_number("float",tab[0][sort_col]) : | |
134 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) | |
135 else : | |
136 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) | |
137 | |
138 if header is True : tab = [head]+tab | |
139 | |
140 return tab | |
141 | |
142 #Read the keywords file to extract the list of keywords | |
143 def read_option(filename): | |
144 with open(filename, "r") as f: | |
145 filter_list=f.read().splitlines() | |
146 filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0] | |
147 filters=";".join(filter_list) | |
148 | |
90 return filters | 149 return filters |
91 | 150 |
92 def readMQ(MQfilename): | 151 # Read input file |
93 # Read input file | 152 def read_file(filename): |
94 mqfile = open(MQfilename, "r") | 153 with open(filename,"r") as f : |
95 mq = mqfile.readlines() | 154 reader=csv.reader(f,delimiter="\t") |
155 tab=list(reader) | |
156 | |
96 # Remove empty lines (contain only space or new line or "") | 157 # Remove empty lines (contain only space or new line or "") |
97 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] | 158 #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] |
98 return mq | 159 tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ] |
99 | 160 |
100 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): | 161 return tab |
101 mq = MQfile | 162 |
102 if isnumber("int", ncol.replace("c", "")): | 163 #search for keywords in rows of csvfile, return a dictionary of lists of booleans keyed by line index (true if keyword found, false otherwise) |
103 id_index = int(ncol.replace("c", "")) - 1 | 164 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): |
165 match=str_to_bool(match) | |
166 ncol=column_from_txt(ncol) | |
167 | |
168 keywords = keywords.upper().split(";") # Split list of filter keyword | |
169 [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords | |
170 keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords | |
171 | |
172 for id_line,line in enumerate(csv_file): | |
173 if header is True and id_line == 0 : continue | |
174 #line = line.replace("\n", "") | |
175 keyword_inline = line[ncol].replace('"', "").split(";") | |
176 #line = line + "\n" | |
177 | |
178 #Perfect match or not | |
179 if match is True : | |
180 found_in_line = any(pid.upper() in keywords for pid in keyword_inline) | |
181 else: | |
182 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) | |
183 | |
184 #if the keyword is found in line | |
185 if id_line in results_dict : results_dict[id_line].append(found_in_line) | |
186 else : results_dict[id_line]=[found_in_line] | |
187 | |
188 return results_dict | |
189 | |
190 #filter by a determined value in rows of csvfile, return a dictionary of booleans (true if value filtered, false otherwise) | |
191 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): | |
192 | |
193 filter_value = float(filter_value) | |
194 ncol=column_from_txt(ncol) | |
195 | |
196 for id_line,line in enumerate(csv_file): | |
197 if header is True and id_line == 0 : continue | |
198 value = line[ncol].replace('"', "").strip() | |
199 if value.replace(".", "", 1).isdigit(): | |
200 to_filter=value_compare(value,filter_value,opt) | |
201 | |
202 #adding the result to the dictionary | |
203 if id_line in results_dict : results_dict[id_line].append(to_filter) | |
204 else : results_dict[id_line]=[to_filter] | |
205 | |
206 return results_dict | |
207 | |
208 #filter by a range of values in rows of csvfile, return a dictionary of booleans (true if value is outside the range and therefore filtered, false otherwise) | |
209 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): | |
210 inclusive=str_to_bool(inclusive) | |
211 bottom_value = float(bottom_value) | |
212 top_value=float(top_value) | |
213 ncol=column_from_txt(ncol) | |
214 | |
215 for id_line, line in enumerate(csv_file): | |
216 if header is True and id_line == 0 : continue | |
217 value = line[ncol].replace('"', "").strip() | |
218 if value.replace(".", "", 1).isdigit(): | |
219 value=float(value) | |
220 if inclusive is True: | |
221 in_range = not (bottom_value <= value <= top_value) | |
222 else : | |
223 in_range = not (bottom_value < value < top_value) | |
224 | |
225 #adding the result to the dictionary | |
226 if id_line in results_dict : results_dict[id_line].append(in_range) | |
227 else : results_dict[id_line]=[in_range] | |
228 | |
229 return results_dict | |
230 | |
231 def column_from_txt(ncol): | |
232 if is_number("int", ncol.replace("c", "")): | |
233 ncol = int(ncol.replace("c", "")) - 1 | |
104 else: | 234 else: |
105 raise ValueError("Please specify the column where " | 235 raise ValueError("Please specify the column where " |
106 "you would like to apply the filter " | 236 "you would like to apply the filter " |
107 "with valid format") | 237 "with valid format") |
108 | 238 return ncol |
109 # Split list of filter IDs | 239 |
110 ids = ids.upper().split(";") | 240 #return True if the comparison "value opt filter_value" holds, False otherwise |
111 # Remove blank IDs | 241 def value_compare(value,filter_value,opt): |
112 [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""] | 242 test_value=False |
113 # Remove space from 2 heads of IDs | 243 |
114 ids = [id.strip() for id in ids] | 244 if opt == "<": |
115 | 245 if float(value) < filter_value: |
116 | 246 test_value = True |
117 if header == "true": | 247 elif opt == "<=": |
118 header = mq[0] | 248 if float(value) <= filter_value: |
119 content = mq[1:] | 249 test_value = True |
120 else: | 250 elif opt == ">": |
121 header = "" | 251 if float(value) > filter_value: |
122 content = mq[:] | 252 test_value = True |
123 | 253 elif opt == ">=": |
124 if not filtered_lines: # In case there is already some filtered lines from other filters | 254 if float(value) >= filter_value: |
125 filtered_lines = [] | 255 test_value = True |
126 if header != "": | 256 elif opt == "=": |
127 filtered_lines.append(header) | 257 if float(value) == filter_value: |
128 | 258 test_value = True |
129 for line in content: | 259 elif opt == "!=": |
130 line = line.replace("\n", "") | 260 if float(value) != filter_value: |
131 id_inline = line.split("\t")[id_index].replace('"', "").split(";") | 261 test_value = True |
132 # Take only first IDs | 262 |
133 #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) | 263 return test_value |
134 line = line + "\n" | |
135 | |
136 if match != "false": | |
137 # Filter protein IDs | |
138 if any(pid.upper() in ids for pid in id_inline): | |
139 filtered_lines.append(line) | |
140 mq.remove(line) | |
141 #else: | |
142 # mq[mq.index(line)] = one_id_line | |
143 else: | |
144 if any(ft in pid.upper() for pid in id_inline for ft in ids): | |
145 filtered_lines.append(line) | |
146 mq.remove(line) | |
147 #else: | |
148 # mq[mq.index(line)] = one_id_line | |
149 return mq, filtered_lines | |
150 | |
151 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): | |
152 mq = MQfile | |
153 if ncol and isnumber("int", ncol.replace("c", "")): | |
154 index = int(ncol.replace("c", "")) - 1 | |
155 else: | |
156 raise ValueError("Please specify the column where " | |
157 "you would like to apply the filter " | |
158 "with valid format") | |
159 if header == "true": | |
160 header = mq[0] | |
161 content = mq[1:] | |
162 else: | |
163 header = "" | |
164 content = mq[:] | |
165 if not filtered_prots: # In case there is already some filtered lines from other filters | |
166 filtered_prots = [] | |
167 if header != "": | |
168 filtered_prots.append(header) | |
169 | |
170 for line in content: | |
171 prot = line.replace("\n","") | |
172 filter_value = float(filter_value) | |
173 pep = prot.split("\t")[index].replace('"', "") | |
174 if pep.replace(".", "", 1).isdigit(): | |
175 if opt == "<": | |
176 if float(pep) >= filter_value: | |
177 filtered_prots.append(line) | |
178 mq.remove(line) | |
179 elif opt == "<=": | |
180 if float(pep) > filter_value: | |
181 filtered_prots.append(line) | |
182 mq.remove(line) | |
183 elif opt == ">": | |
184 #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value) | |
185 if float(pep) <= filter_value: | |
186 filtered_prots.append(line) | |
187 mq.remove(line) | |
188 elif opt == ">=": | |
189 if float(pep) < filter_value: | |
190 filtered_prots.append(line) | |
191 mq.remove(line) | |
192 else: | |
193 if float(pep) != filter_value: | |
194 filtered_prots.append(line) | |
195 mq.remove(line) | |
196 return mq, filtered_prots | |
197 | 264 |
198 if __name__ == "__main__": | 265 if __name__ == "__main__": |
199 options() | 266 options() |