Mercurial > repos > proteore > filter_keywords_values
comparison filter_kw_val.py @ 1:d29e469b6b20 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author | proteore |
---|---|
date | Fri, 16 Feb 2018 03:27:43 -0500 |
parents | 6a45ccfc0e4c |
children | 1e9911190142 |
comparison
equal
deleted
inserted
replaced
0:6a45ccfc0e4c | 1:d29e469b6b20 |
---|---|
1 import argparse | 1 import argparse |
2 import re | 2 import re |
3 | 3 |
4 | 4 |
5 def options(): | 5 def options(): |
6 """ | |
7 Parse options | |
8 """ | |
6 parser = argparse.ArgumentParser() | 9 parser = argparse.ArgumentParser() |
7 parser.add_argument("-i", "--input", help="Input file", required=True) | 10 parser.add_argument("-i", "--input", help="Input file", required=True) |
8 parser.add_argument("-m", "--match", help="Exact macth") | 11 parser.add_argument("-m", "--match", help="Exact macth") |
9 parser.add_argument("--kw", nargs="+", action="append", help="") # | 12 parser.add_argument("--kw", nargs="+", action="append", help="") # |
10 parser.add_argument("--kw_file", nargs="+", action="append", help="") | 13 parser.add_argument("--kw_file", nargs="+", action="append", help="") |
14 | 17 |
15 args = parser.parse_args() | 18 args = parser.parse_args() |
16 | 19 |
17 filters(args) | 20 filters(args) |
18 | 21 |
19 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" | 22 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" |
20 | 23 # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" |
21 | 24 |
# Integers: optional minus sign and no leading zero, so "0" is rejected
# (callers use this to validate 1-based column numbers).
_INT_FORMAT = re.compile(r"^-?[1-9][0-9]*$")
# Decimals: optional minus sign, integer part without leading zeros,
# optional fractional part ("5", "0", "0.5", "-12.25").
_FLOAT_FORMAT = re.compile(r"^-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?$")


def isnumber(number_format, n):
    """
    Check if a string is written as an integer or as a decimal number

    number_format: "int" or "float", the notation to check against
    n: the string to check

    Return True when n matches the requested notation, False otherwise.
    Any other value of number_format always gives False.
    """
    if number_format == "int":
        pattern = _INT_FORMAT
    elif number_format == "float":
        pattern = _FLOAT_FORMAT
    else:
        return False
    # Always answer with a real boolean rather than a match object or None
    return pattern.match(n) is not None
34 | 41 |
35 def filters(args): | 42 def filters(args): |
43 """ | |
44 Filter the document | |
45 """ | |
36 MQfilename = args.input.split(",")[0] | 46 MQfilename = args.input.split(",")[0] |
37 header = args.input.split(",")[1] | 47 header = args.input.split(",")[1] |
38 MQfile = readMQ(MQfilename) | 48 MQfile = readMQ(MQfilename) |
39 results = [MQfile, None] | 49 results = [MQfile, None] |
40 | 50 |
41 if args.kw: | 51 if args.kw: |
42 keywords = args.kw | 52 keywords = args.kw |
43 for k in keywords: | 53 for k in keywords: |
44 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2]) | 54 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2]) |
45 if args.kw_file: | 55 if args.kw_file: |
54 else: | 64 else: |
55 raise ValueError("Please enter a number in filter by value") | 65 raise ValueError("Please enter a number in filter by value") |
56 | 66 |
57 # Write results to output | 67 # Write results to output |
58 output = open(args.output, "w") | 68 output = open(args.output, "w") |
59 output.write("".join(results[0])) | 69 output.write("\n".join(results[0])) |
60 output.close() | 70 output.close() |
61 | 71 |
62 # Write deleted lines to trash_file | 72 # Write deleted lines to trash_file |
63 trash = open(args.trash_file, "w") | 73 trash = open(args.trash_file, "w") |
64 #print("".join(results[1])) | 74 trash.write("\n".join(results[1])) |
65 trash.write("".join(results[1])) | |
66 trash.close() | 75 trash.close() |
67 | 76 |
def readOption(filename):
    """
    Read a filter file and return its lines as one ";"-separated string

    filename: path of a text file holding one filter per line

    The content is split on "\\n" and joined with ";", so a trailing
    newline in the file produces a trailing ";" in the result.
    """
    # The context manager closes the file even if reading fails
    with open(filename, "r") as f:
        file_content = f.read()
    return ";".join(file_content.split("\n"))
80 | 86 |
def readMQ(MQfilename):
    """
    Read the input file and return its non-blank lines

    MQfilename: path of the text file to read

    Return the list of lines, each with its line ending kept. Lines that
    are empty or contain only whitespace are left out.
    """
    # Build a new list instead of removing items from the list being
    # iterated, which skipped the second of two consecutive blank lines.
    with open(MQfilename, "r") as mqfile:
        return [line for line in mqfile if line.strip()]
88 | 94 |
def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
    """
    Move the lines whose column holds one of the given keywords to the
    list of filtered lines

    MQfile: list of tab-separated lines (line endings are ignored)
    header: "true" when the first line is a header line
    filtered_lines: lines already removed by previous filters, or None
    ids: filter keywords separated by ";" (case-insensitive)
    ncol: 1-based column to look at, written as "c3" or "3"
    match: anything but "false" asks for an exact match of a whole ID;
           "false" asks for a keyword contained in an ID

    Return (kept lines, filtered lines). Both lists hold lines without
    line ending, ready to be joined with "\\n"; the header, when there is
    one, stays first in the kept lines and opens the filtered lines. In
    every data line the inspected column is reduced to its first ID with
    double quotes removed. The input lists are not modified.

    Raise ValueError when ncol is not a positive column number.
    """
    invalid_column = ("Please specify the column where "
                      "you would like to apply the filter "
                      "with valid format")
    try:
        id_index = int(ncol.replace("c", "")) - 1
    except (AttributeError, ValueError):
        raise ValueError(invalid_column)
    if id_index < 0:
        # "c0" or a negative number would silently read from the end
        raise ValueError(invalid_column)

    # Split the list of filter IDs, trim them, then drop the blank ones.
    # A blank keyword must never survive: in "contains" mode the empty
    # string is found in every ID and would filter out every line.
    keywords = [keyword.strip() for keyword in ids.upper().split(";")]
    keywords = [keyword for keyword in keywords if keyword]
    exact_keywords = set(keywords)

    # Work on lines without line ending so the result of one filter can
    # be given to the next one
    lines = [line.rstrip("\r\n") for line in MQfile]
    has_header = header == "true" and len(lines) > 0
    if has_header:
        kept = [lines[0]]
        content = lines[1:]
    else:
        kept = []
        content = lines

    # Carry on with the lines filtered by other filters, if any
    filtered = list(filtered_lines) if filtered_lines else []
    if not filtered and has_header:
        filtered.append(lines[0])

    exact = match != "false"
    for line in content:
        fields = line.split("\t")
        id_inline = fields[id_index].replace('"', "").split(";")
        # Take only the first ID, and only in the inspected column
        fields[id_index] = id_inline[0]
        one_id_line = "\t".join(fields)

        candidates = [pid.upper() for pid in id_inline]
        if exact:
            found = any(pid in exact_keywords for pid in candidates)
        else:
            found = any(keyword in pid
                        for pid in candidates for keyword in keywords)

        if found:
            filtered.append(one_id_line)
        else:
            kept.append(one_id_line)
    return kept, filtered
132 | 144 |
def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """
    Keep the lines whose column value satisfies "value <opt> filter_value"
    and move the other ones to the list of filtered lines

    MQfile: list of tab-separated lines (line endings are ignored)
    header: "true" when the first line is a header line
    filtered_prots: lines already removed by previous filters, or None
    filter_value: number (or its text form) the column is compared to
    ncol: 1-based column to look at, written as "c3" or "3"
    opt: "<", "<=", ">" or ">="; anything else means equality

    Return (kept lines, filtered lines). Both lists hold lines without
    line ending, ready to be joined with "\\n"; the header, when there is
    one, stays first in the kept lines and opens the filtered lines.
    Lines whose cell is not a number (text, empty, NaN) are kept. The
    input lists are not modified.

    Raise ValueError when ncol is not a positive column number or when
    filter_value cannot be read as a number.
    """
    invalid_column = ("Please specify the column where "
                      "you would like to apply the filter "
                      "with valid format")
    try:
        index = int(ncol.replace("c", "")) - 1
    except (AttributeError, ValueError):
        raise ValueError(invalid_column)
    if index < 0:
        # "c0" or a negative number would silently read from the end
        raise ValueError(invalid_column)

    # Convert the threshold once, not once per line
    threshold = float(filter_value)
    conditions = {
        "<": lambda number: number < threshold,
        "<=": lambda number: number <= threshold,
        ">": lambda number: number > threshold,
        ">=": lambda number: number >= threshold,
    }
    satisfies = conditions.get(opt, lambda number: number == threshold)

    # Work on lines without line ending so the result of one filter can
    # be given to the next one
    lines = [line.rstrip("\r\n") for line in MQfile]
    has_header = header == "true" and len(lines) > 0
    if has_header:
        kept = [lines[0]]
        content = lines[1:]
    else:
        kept = []
        content = lines

    # Carry on with the lines filtered by other filters, if any
    filtered = list(filtered_prots) if filtered_prots else []
    if not filtered and has_header:
        filtered.append(lines[0])

    for line in content:
        pep = line.split("\t")[index].replace('"', "")
        # float() also reads negative values and scientific notation
        # ("-2", "1.5E+07"), which a digits-only check would skip
        try:
            number = float(pep)
        except ValueError:
            number = None
        # number != number is only true for NaN, treated as "no value"
        if number is None or number != number or satisfies(number):
            kept.append(line)
        else:
            filtered.append(line)
    return kept, filtered  # output, trash_file
178 | 191 |
179 if __name__ == "__main__": | 192 if __name__ == "__main__": |
180 options() | 193 options() |