comparison filter_kw_val.py @ 7:6f32c1e12572 draft default tip

planemo upload commit 72b345a7df2c87f07a9df71ecee1f252c9355337
author proteore
date Fri, 01 Jun 2018 11:10:47 -0400
parents c6ba1e6f6869
children
comparison
equal deleted inserted replaced
6:c6ba1e6f6869 7:6f32c1e12572
1 import argparse 1 import argparse, re, csv
2 import re
3
4 2
5 def options(): 3 def options():
6 """ 4 """
7 Parse options: 5 Parse options:
8 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] 6 -i, --input Input filename and boolean value if the file contains header ["filename,true/false"]
9 -m, --match if the keywords should be filtered in exact
10 --kw Keyword to be filtered, the column number where this filter applies, 7 --kw Keyword to be filtered, the column number where this filter applies,
11 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. 8 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
12 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" 9 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
13 --kwfile A file that contains keywords to be filter, the column where this filter applies and 10 --kwfile A file that contains keywords to be filter, the column where this filter applies and
14 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] 11 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
15 --value The value to be filtered, the column number where this filter applies and the 12 --value The value to be filtered, the column number where this filter applies and the
16 operation symbol ["value,ncol,=/>/>=/</<="] 13 operation symbol ["value,ncol,=/>/>=/</<=/!="]
14 --values_range range of values to be keep, example : --values_range 5 20 c1 true
15 --operator The operator used to filter with several keywords/values : AND or OR
17 --o --output The output filename 16 --o --output The output filename
18 --trash_file The file contains removed lines 17 --filtered_file The file contains removed lines
18 -s --sort_col Used column to sort the file, ",true" for reverse sorting, ",false" otherwise example : c1,false
19 """ 19 """
20 parser = argparse.ArgumentParser() 20 parser = argparse.ArgumentParser()
21 parser.add_argument("-i", "--input", help="Input file", required=True) 21 parser.add_argument("-i", "--input", help="Input file", required=True)
22 parser.add_argument("--kw", nargs="+", action="append", help="") 22 parser.add_argument("--kw", nargs="+", action="append", help="")
23 parser.add_argument("--kw_file", nargs="+", action="append", help="") 23 parser.add_argument("--kw_file", nargs="+", action="append", help="")
24 parser.add_argument("--value", nargs="+", action="append", help="") 24 parser.add_argument("--value", nargs="+", action="append", help="")
25 parser.add_argument("--values_range", nargs="+", action="append", help="")
26 parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='')
25 parser.add_argument("-o", "--output", default="output.txt") 27 parser.add_argument("-o", "--output", default="output.txt")
26 parser.add_argument("--trash_file", default="trash_MQfilter.txt") 28 parser.add_argument("--filtered_file", default="filtered_output.txt")
29 parser.add_argument("-s","--sort_col", help="")
27 30
28 args = parser.parse_args() 31 args = parser.parse_args()
29
30 filters(args) 32 filters(args)
31 33
32 def isnumber(number_format, n): 34 def str_to_bool(v):
33 """ 35 if v.lower() in ('yes', 'true', 't', 'y', '1'):
34 Check if a variable is a float or an integer 36 return True
35 """ 37 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
38 return False
39 else:
40 raise argparse.ArgumentTypeError('Boolean value expected.')
41
# Check whether a string is written as an integer or as a float
def is_number(number_format, n):
    """Return True when the text ``n`` is a number of the requested kind.

    number_format -- "int" or "float"; any other value gives False.
    n             -- the string to test.

    "int" accepts an optional leading minus sign followed by digits.
    "float" accepts the same, optionally followed by a fractional part
    and/or an exponent, so a plain integer such as "5" is also a valid float.
    """
    int_format = re.compile(r"-?[0-9]+\Z")
    # The decimal point must be escaped: a bare "." matches any character,
    # which used to accept strings like "1a2" and reject single digits.
    float_format = re.compile(r"-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?\Z")

    if number_format == "int":
        pattern = int_format
    elif number_format == "float":
        pattern = float_format
    else:
        return False

    # \Z (rather than $) refuses a trailing newline after the number
    return pattern.match(n) is not None
45 53
54 #Filter the document
46 def filters(args): 55 def filters(args):
47 """ 56 filename = args.input.split(",")[0]
48 Filter the document 57 header = str_to_bool(args.input.split(",")[1])
49 """ 58 csv_file = read_file(filename)
50 MQfilename = args.input.split(",")[0] 59 results_dict = {}
51 header = args.input.split(",")[1]
52 MQfile = readMQ(MQfilename)
53 results = [MQfile, None]
54 60
55 if args.kw: 61 if args.kw:
56 keywords = args.kw 62 keywords = args.kw
57 for k in keywords: 63 for k in keywords:
58 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2]) 64 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])
65
59 if args.kw_file: 66 if args.kw_file:
60 key_files = args.kw_file 67 key_files = args.kw_file
61 for kf in key_files: 68 for kf in key_files:
62 ids = readOption(kf[0]) 69 keywords = read_option(kf[0])
63 results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2]) 70 results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2])
71
64 if args.value: 72 if args.value:
65 for v in args.value: 73 for v in args.value:
66 if isnumber("float", v[0]): 74 if is_number("float", v[0]):
67 results = filter_value(results[0], header, results[1], v[0], v[1], v[2]) 75 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
68 else: 76 else:
69 raise ValueError("Please enter a number in filter by value") 77 raise ValueError("Please enter a number in filter by value")
70 78
79 if args.values_range:
80 for vr in args.values_range:
81 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
82 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
83
84 remaining_lines=[]
85 filtered_lines=[]
86
87 if header is True :
88 remaining_lines.append(csv_file[0])
89 filtered_lines.append(csv_file[0])
90
91 for id_line,line in enumerate(csv_file) :
92 if id_line in results_dict : #skip header and empty lines
93 if args.operator == 'OR' :
94 if any(results_dict[id_line]) :
95 filtered_lines.append(line)
96 else :
97 remaining_lines.append(line)
98
99 elif args.operator == "AND" :
100 if all(results_dict[id_line]) :
101 filtered_lines.append(line)
102 else :
103 remaining_lines.append(line)
104
105 #sort of results by column
106 if args.sort_col :
107 sort_col=args.sort_col.split(",")[0]
108 sort_col=column_from_txt(sort_col)
109 reverse=str_to_bool(args.sort_col.split(",")[1])
110 remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header)
111 filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header)
112
71 # Write results to output 113 # Write results to output
72 output = open(args.output, "w") 114 with open(args.output,"w") as output :
73 output.write("".join(results[0])) 115 writer = csv.writer(output,delimiter="\t")
74 output.close() 116 writer.writerows(remaining_lines)
75 117
76 # Write deleted lines to trash_file 118 # Write filtered lines to filtered_output
77 trash = open(args.trash_file, "w") 119 with open(args.filtered_file,"w") as filtered_output :
78 trash.write("".join(results[1])) 120 writer = csv.writer(filtered_output,delimiter="\t")
79 trash.close() 121 writer.writerows(filtered_lines)
80 122
81 def readOption(filename): 123 #function to sort the csv_file by value in a specific column
82 # Read the keywords file to extract the list of keywords 124 def sort_by_column(tab,sort_col,reverse,header):
83 f = open(filename, "r") 125
84 file_content = f.read() 126 if len(tab) > 1 : #if there's more than just a header or 1 row
85 filter_list = file_content.split("\n") 127 if header is True :
86 filters = "" 128 head=tab[0]
87 for i in filter_list: 129 tab=tab[1:]
88 filters += i + ";" 130
89 filters = filters[:-1] 131 if is_number("int",tab[0][sort_col]) :
132 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
133 elif is_number("float",tab[0][sort_col]) :
134 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
135 else :
136 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
137
138 if header is True : tab = [head]+tab
139
140 return tab
141
# Read the keywords file to extract the list of keywords
def read_option(filename):
    """Return the keywords stored in ``filename`` as one ";"-joined string.

    The file is expected to hold one keyword per line. Lines that are empty
    or contain only whitespace (spaces, tabs, ...) are dropped; the remaining
    lines are kept exactly as written, in file order.
    """
    with open(filename, "r") as f:
        lines = f.read().splitlines()

    # strip() is only used to detect blank lines: testing for spaces alone
    # would let a tab-only line through as a keyword.
    keywords = [key for key in lines if key.strip()]

    return ";".join(keywords)
91 150
92 def readMQ(MQfilename): 151 # Read input file
93 # Read input file 152 def read_file(filename):
94 mqfile = open(MQfilename, "r") 153 with open(filename,"r") as f :
95 mq = mqfile.readlines() 154 reader=csv.reader(f,delimiter="\t")
155 tab=list(reader)
156
96 # Remove empty lines (contain only space or new line or "") 157 # Remove empty lines (contain only space or new line or "")
97 [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] 158 #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""]
98 return mq 159 tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ]
99 160
100 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): 161 return tab
101 mq = MQfile 162
102 if isnumber("int", ncol.replace("c", "")): 163 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise)
103 id_index = int(ncol.replace("c", "")) - 1 164 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
165 match=str_to_bool(match)
166 ncol=column_from_txt(ncol)
167
168 keywords = keywords.upper().split(";") # Split list of filter keyword
169 [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords
170 keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords
171
172 for id_line,line in enumerate(csv_file):
173 if header is True and id_line == 0 : continue
174 #line = line.replace("\n", "")
175 keyword_inline = line[ncol].replace('"', "").split(";")
176 #line = line + "\n"
177
178 #Perfect match or not
179 if match is True :
180 found_in_line = any(pid.upper() in keywords for pid in keyword_inline)
181 else:
182 found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords)
183
184 #if the keyword is found in line
185 if id_line in results_dict : results_dict[id_line].append(found_in_line)
186 else : results_dict[id_line]=[found_in_line]
187
188 return results_dict
189
# Filter by a given value in the rows of the csv file; returns a dictionary
# of booleans (True if the row is filtered out, False otherwise)
def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
    """Compare one column of every data row against a threshold.

    csv_file     -- list of rows, each row being a list of cell strings.
    header       -- True when the first row is a header and must be skipped.
    results_dict -- {row index: [bool, ...]}; one verdict is appended per row
                    and the same dictionary is returned.
    filter_value -- threshold, as text or number accepted by float().
    ncol         -- column specification, converted by column_from_txt().
    opt          -- comparison operator passed through to value_compare().

    Every data row receives exactly one verdict. A cell that is not a number
    (empty, text, missing column) gets False, so the row is kept rather than
    being left without a verdict.
    """
    threshold = float(filter_value)
    col = column_from_txt(ncol)

    # Signed decimals with an optional exponent; ASCII digits only, so that
    # everything accepted here can be converted by float().
    numeric_cell = re.compile(
        r"[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?\Z")

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue

        # A row shorter than the requested column is treated as an empty cell
        value = line[col].replace('"', "").strip() if col < len(line) else ""

        if numeric_cell.match(value):
            to_filter = value_compare(value, threshold, opt)
        else:
            # impossible to compare: keep the row by default
            to_filter = False

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(to_filter)

    return results_dict
207
# Filter by a range of values in the rows of the csv file; returns a dictionary
# of booleans (True if the row is filtered out, False otherwise)
def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
    """Flag the rows whose value in one column falls outside a range.

    csv_file     -- list of rows, each row being a list of cell strings.
    header       -- True when the first row is a header and must be skipped.
    results_dict -- {row index: [bool, ...]}; one verdict is appended per row
                    and the same dictionary is returned.
    bottom_value -- lower bound, as text or number accepted by float().
    top_value    -- upper bound, as text or number accepted by float().
    ncol         -- column specification, converted by column_from_txt().
    inclusive    -- boolean text converted by str_to_bool(); when true the
                    bounds themselves belong to the range.

    The verdict is True when the value lies OUTSIDE the range. A cell that is
    not a number (empty, text, missing column) gets False, so the row is kept
    rather than being left without a verdict.
    """
    inclusive = str_to_bool(inclusive)
    low = float(bottom_value)
    high = float(top_value)
    col = column_from_txt(ncol)

    # Signed decimals with an optional exponent; ASCII digits only, so that
    # everything accepted here can be converted by float().
    numeric_cell = re.compile(
        r"[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?\Z")

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue

        # A row shorter than the requested column is treated as an empty cell
        value = line[col].replace('"', "").strip() if col < len(line) else ""

        if not numeric_cell.match(value):
            # impossible to compare: keep the row by default
            out_of_range = False
        elif inclusive is True:
            out_of_range = not (low <= float(value) <= high)
        else:
            out_of_range = not (low < float(value) < high)

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(out_of_range)

    return results_dict
230
def column_from_txt(ncol):
    """Convert a column specification such as "c3" or "3" to a 0-based index.

    Every "c" is removed from ``ncol`` and the remainder must be a positive
    integer; column 1 maps to index 0.

    Raises ValueError when the remainder is not made of digits, or when the
    column number is lower than 1 (the resulting negative index would
    silently address columns from the end of the row).
    """
    digits = ncol.replace("c", "")

    if not re.match(r"[0-9]+\Z", digits) or int(digits) < 1:
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    return int(digits) - 1
109 # Split list of filter IDs 239
110 ids = ids.upper().split(";") 240 #return True if value is in the determined values, false otherwise
111 # Remove blank IDs 241 def value_compare(value,filter_value,opt):
112 [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""] 242 test_value=False
113 # Remove space from 2 heads of IDs 243
114 ids = [id.strip() for id in ids] 244 if opt == "<":
115 245 if float(value) < filter_value:
116 246 test_value = True
117 if header == "true": 247 elif opt == "<=":
118 header = mq[0] 248 if float(value) <= filter_value:
119 content = mq[1:] 249 test_value = True
120 else: 250 elif opt == ">":
121 header = "" 251 if float(value) > filter_value:
122 content = mq[:] 252 test_value = True
123 253 elif opt == ">=":
124 if not filtered_lines: # In case there is already some filtered lines from other filters 254 if float(value) >= filter_value:
125 filtered_lines = [] 255 test_value = True
126 if header != "": 256 elif opt == "=":
127 filtered_lines.append(header) 257 if float(value) == filter_value:
128 258 test_value = True
129 for line in content: 259 elif opt == "!=":
130 line = line.replace("\n", "") 260 if float(value) != filter_value:
131 id_inline = line.split("\t")[id_index].replace('"', "").split(";") 261 test_value = True
132 # Take only first IDs 262
133 #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) 263 return test_value
134 line = line + "\n"
135
136 if match != "false":
137 # Filter protein IDs
138 if any(pid.upper() in ids for pid in id_inline):
139 filtered_lines.append(line)
140 mq.remove(line)
141 #else:
142 # mq[mq.index(line)] = one_id_line
143 else:
144 if any(ft in pid.upper() for pid in id_inline for ft in ids):
145 filtered_lines.append(line)
146 mq.remove(line)
147 #else:
148 # mq[mq.index(line)] = one_id_line
149 return mq, filtered_lines
150
151 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
152 mq = MQfile
153 if ncol and isnumber("int", ncol.replace("c", "")):
154 index = int(ncol.replace("c", "")) - 1
155 else:
156 raise ValueError("Please specify the column where "
157 "you would like to apply the filter "
158 "with valid format")
159 if header == "true":
160 header = mq[0]
161 content = mq[1:]
162 else:
163 header = ""
164 content = mq[:]
165 if not filtered_prots: # In case there is already some filtered lines from other filters
166 filtered_prots = []
167 if header != "":
168 filtered_prots.append(header)
169
170 for line in content:
171 prot = line.replace("\n","")
172 filter_value = float(filter_value)
173 pep = prot.split("\t")[index].replace('"', "")
174 if pep.replace(".", "", 1).isdigit():
175 if opt == "<":
176 if float(pep) >= filter_value:
177 filtered_prots.append(line)
178 mq.remove(line)
179 elif opt == "<=":
180 if float(pep) > filter_value:
181 filtered_prots.append(line)
182 mq.remove(line)
183 elif opt == ">":
184 #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value)
185 if float(pep) <= filter_value:
186 filtered_prots.append(line)
187 mq.remove(line)
188 elif opt == ">=":
189 if float(pep) < filter_value:
190 filtered_prots.append(line)
191 mq.remove(line)
192 else:
193 if float(pep) != filter_value:
194 filtered_prots.append(line)
195 mq.remove(line)
196 return mq, filtered_prots
197 264
198 if __name__ == "__main__": 265 if __name__ == "__main__":
199 options() 266 options()