Mercurial > repos > proteore > filter_keywords_values
comparison filter_kw_val.py @ 0:6a45ccfc0e4c draft
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author | proteore |
---|---|
date | Sun, 26 Nov 2017 18:36:43 -0500 |
parents | |
children | d29e469b6b20 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6a45ccfc0e4c |
---|---|
1 import argparse | |
2 import re | |
3 | |
4 | |
def options():
    """Parse the command-line arguments and run the filters.

    The parsed namespace is passed to ``filters``, which reads the input
    table, applies the requested filters and writes the kept and the
    discarded lines.

    Example::

        python filter_kw_val.py -i "proteinGroups.txt,true" --kw "A2A288:A8K2U0" c1 true --value 2 c5 ">=" -o output.txt
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input", required=True,
        help="Input file and header flag joined by a comma, e.g. "
             "'PATH,true' when the first line is a header",
    )
    # NOTE: --match is accepted but filters() does not read it; the match
    # mode is given as the third value of each --kw / --kw_file option.
    parser.add_argument("-m", "--match", help="Exact match")
    parser.add_argument(
        "--kw", nargs="+", action="append",
        help="Keyword filter, three values: KEYWORDS separated by ':', "
             "COLUMN (e.g. c1) and MATCH ('false' for substring matching, "
             "anything else for exact matching); may be repeated",
    )
    parser.add_argument(
        "--kw_file", nargs="+", action="append",
        help="Same as --kw, but the first value is the path of a file "
             "holding one keyword per line; may be repeated",
    )
    parser.add_argument(
        "--value", nargs="+", action="append",
        help="Value filter, three values: NUMBER, COLUMN (e.g. c5) and "
             "OPERATOR (<, <=, >, >=; anything else means equal); "
             "may be repeated",
    )
    parser.add_argument("-o", "--output", default="output.txt",
                        help="File receiving the lines that are kept")
    parser.add_argument("--trash_file", default="trash_MQfilter.txt",
                        help="File receiving the lines that are discarded")

    args = parser.parse_args()

    filters(args)
20 | |
21 | |
def isnumber(format, n):
    """Tell whether the string *n* is written as a plain decimal number.

    Args:
        format: "int" or "float"; any other value makes the function
            return False.
        n: the text to check.

    Returns:
        True when *n* matches the requested notation, False otherwise.

        * "int": optional minus sign, then digits with no leading zero.
          "0" is rejected on purpose: the callers in this file use this
          format to validate 1-based column numbers.
        * "float": optional minus sign, an integer part without leading
          zeros (a lone "0" is allowed) and an optional fractional part,
          e.g. "2", "0.5", "-12.75".
    """
    # Raw strings avoid invalid-escape warnings.  The previous float pattern
    # required at least two digits, so single-digit numbers such as "2" (and
    # anything starting with "0", such as "0.5") were wrongly rejected.
    patterns = {
        "int": r"^-?[1-9][0-9]*$",
        "float": r"^-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?$",
    }
    pattern = patterns.get(format)
    if pattern is None:
        return False
    return re.match(pattern, n) is not None
34 | |
def filters(args):
    """Apply every requested filter to the input table and write the results.

    Args:
        args: parsed command-line namespace.  ``args.input`` is
            "PATH,HEADER" where HEADER is "true" when the first line of the
            file is a header.  ``args.kw``, ``args.kw_file`` and
            ``args.value`` are lists of three-item lists (or None), and
            ``args.output`` / ``args.trash_file`` are the output paths.

    Raises:
        ValueError: if the number given to a value filter is not numeric.
        IndexError: if ``args.input`` has no ",HEADER" part or a filter
            option has fewer than three values.
    """
    input_parts = args.input.split(",")
    MQfilename = input_parts[0]
    header = input_parts[1]
    kept = readMQ(MQfilename)
    discarded = None

    # Filters are chained: each one receives the lines kept so far and the
    # lines discarded so far.
    for keyword_args in args.kw or []:
        kept, discarded = filter_keyword(
            kept, header, discarded,
            keyword_args[0], keyword_args[1], keyword_args[2])

    for file_args in args.kw_file or []:
        ids = readOption(file_args[0])
        kept, discarded = filter_keyword(
            kept, header, discarded, ids, file_args[1], file_args[2])

    for value_args in args.value or []:
        if not isnumber("float", value_args[0]):
            raise ValueError("Please enter a number in filter by value")
        kept, discarded = filter_value(
            kept, header, discarded,
            value_args[0], value_args[1], value_args[2])

    # Write results to output
    with open(args.output, "w") as output:
        output.write("".join(kept))

    # Write deleted lines to trash_file.  When no filter was requested
    # nothing was discarded and ``discarded`` is still None, so an empty
    # file is written instead of failing on "".join(None).
    with open(args.trash_file, "w") as trash:
        trash.write("".join(discarded or []))
67 | |
def readOption(filename):
    """Read a keyword file and return its keywords joined by ":".

    Args:
        filename: path of a text file holding one keyword per line.

    Returns:
        The non-blank lines of the file joined by ":" (the same format as
        keywords typed on the command line).  Empty and whitespace-only
        lines are skipped so that they do not become empty keywords, which
        would match every row in substring mode.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, "r") as handle:
        content = handle.read()
    return ":".join(line for line in content.split("\n") if line.strip())
80 | |
def readMQ(MQfilename):
    """Read the input table and return its non-blank lines.

    Args:
        MQfilename: path of the text file to read.

    Returns:
        A list with every line of the file (line endings included) except
        the lines that are empty or contain only whitespace.
    """
    with open(MQfilename, "r") as mqfile:
        # Build a new list instead of removing items from the list being
        # iterated: removing in place skips the element that follows each
        # removal, so consecutive blank lines used to survive.
        return [line for line in mqfile if line.strip()]
88 | |
def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
    """Discard the lines whose ID column matches one of the keywords.

    Every line (kept or discarded) is returned with the ID column reduced to
    its first ";"-separated ID, with double quotes removed from that cell.

    Args:
        MQfile: list of tab-separated lines, line endings included.  The
            list is not modified.
        header: "true" when the first line of *MQfile* is a header; any
            other value means there is no header.
        filtered_lines: lines already discarded by earlier filters, or
            None / empty when there are none yet.  Not modified.
        ids: keywords separated by ":"; compared case-insensitively.
        ncol: 1-based column holding the IDs, e.g. "c3" or "3".
        match: "false" discards a line when a keyword is a substring of one
            of its IDs; any other value requires an ID equal to a keyword.

    Returns:
        A ``(kept, discarded)`` tuple of new lists.  *kept* starts with the
        header when there is one.  *discarded* is *filtered_lines* followed
        by the newly discarded lines; the header is put first when
        *filtered_lines* was empty.

    Raises:
        ValueError: if *ncol* is not a column number of at least 1.
        IndexError: if a line has fewer columns than *ncol*.
    """
    try:
        id_index = int((ncol or "").replace("c", "")) - 1
    except ValueError:
        id_index = -1
    if id_index < 0:
        raise ValueError("Please specify the column where you would like to apply the filter with valid format")

    # Drop blank keywords: an empty keyword is a substring of every ID and
    # would discard the whole table in substring mode.
    keywords = [kw for kw in ids.upper().split(":") if kw.strip()]
    keyword_set = set(keywords)
    exact = match != "false"

    if header == "true" and MQfile:
        header_line = MQfile[0]
        content = MQfile[1:]
    else:
        header_line = ""
        content = MQfile[:]

    kept = [header_line] if header_line else []
    # Continue the list produced by earlier filters; start a fresh one
    # (headed by the header line, if any) otherwise.
    discarded = list(filtered_lines) if filtered_lines else []
    if not discarded and header_line:
        discarded.append(header_line)

    for line in content:
        # Split off the line ending first so that an ID in the last column
        # does not carry it, and so that it survives the rewrite below.
        body = line.rstrip("\r\n")
        line_ending = line[len(body):]
        cells = body.split("\t")
        line_ids = cells[id_index].replace('"', "").split(";")

        # Take only first IDs, touching the ID column only.
        cells[id_index] = line_ids[0]
        one_id_line = "\t".join(cells) + line_ending

        upper_ids = [pid.upper() for pid in line_ids]
        if exact:
            hit = any(pid in keyword_set for pid in upper_ids)
        else:
            hit = any(kw in pid for pid in upper_ids for kw in keywords)

        if hit:
            discarded.append(one_id_line)
        else:
            kept.append(one_id_line)
    return kept, discarded
132 | |
def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """Keep the lines whose numeric column satisfies a comparison.

    A line is kept when ``cell OPERATOR filter_value`` holds and discarded
    otherwise.  Lines whose cell is not a number (text, empty, NaN) cannot
    be compared and are always kept.

    Args:
        MQfile: list of tab-separated lines, line endings included.  The
            list is not modified.
        header: "true" when the first line of *MQfile* is a header; any
            other value means there is no header.
        filtered_prots: lines already discarded by earlier filters, or
            None / empty when there are none yet.  Not modified.
        filter_value: the number to compare against (number or text).
        ncol: 1-based column holding the values, e.g. "c5" or "5".
        opt: "<", "<=", ">" or ">="; any other value means equality.

    Returns:
        A ``(kept, discarded)`` tuple of new lists.  *kept* starts with the
        header when there is one.  *discarded* is *filtered_prots* followed
        by the newly discarded lines; the header is put first when
        *filtered_prots* was empty.

    Raises:
        ValueError: if *ncol* is not a column number of at least 1, or if
            *filter_value* cannot be converted to a float.
        IndexError: if a line has fewer columns than *ncol*.
    """
    try:
        index = int((ncol or "").replace("c", "")) - 1
    except ValueError:
        index = -1
    if index < 0:
        raise ValueError("Please specify the column where you would like to apply the filter with valid format")

    # Convert the threshold once instead of on every line.
    threshold = float(filter_value)
    comparisons = {
        "<": lambda value: value < threshold,
        "<=": lambda value: value <= threshold,
        ">": lambda value: value > threshold,
        ">=": lambda value: value >= threshold,
    }
    # Any unrecognised operator falls back to equality.
    passes = comparisons.get(opt, lambda value: value == threshold)

    if header == "true" and MQfile:
        header_line = MQfile[0]
        content = MQfile[1:]
    else:
        header_line = ""
        content = MQfile[:]

    kept = [header_line] if header_line else []
    # Continue the list produced by earlier filters; start a fresh one
    # (headed by the header line, if any) otherwise.
    discarded = list(filtered_prots) if filtered_prots else []
    if not discarded and header_line:
        discarded.append(header_line)

    for prot in content:
        # Strip the line ending so that a value in the last column parses.
        cell = prot.rstrip("\r\n").split("\t")[index].replace('"', "")
        try:
            value = float(cell)  # also accepts signs and exponents
        except ValueError:
            value = None

        # ``value != value`` is true only for NaN, which is not comparable.
        if value is None or value != value or passes(value):
            kept.append(prot)
        else:
            discarded.append(prot)
    return kept, discarded
178 | |
# Run the command-line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    options()