comparison filter_kw_val.py @ 1:d29e469b6b20 draft

planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author proteore
date Fri, 16 Feb 2018 03:27:43 -0500
parents 6a45ccfc0e4c
children 1e9911190142
comparison
equal deleted inserted replaced
0:6a45ccfc0e4c 1:d29e469b6b20
1 import argparse 1 import argparse
2 import re 2 import re
3 3
4 4
5 def options(): 5 def options():
6 """
7 Parse options
8 """
6 parser = argparse.ArgumentParser() 9 parser = argparse.ArgumentParser()
7 parser.add_argument("-i", "--input", help="Input file", required=True) 10 parser.add_argument("-i", "--input", help="Input file", required=True)
8 parser.add_argument("-m", "--match", help="Exact macth") 11 parser.add_argument("-m", "--match", help="Exact macth")
9 parser.add_argument("--kw", nargs="+", action="append", help="") # 12 parser.add_argument("--kw", nargs="+", action="append", help="") #
10 parser.add_argument("--kw_file", nargs="+", action="append", help="") 13 parser.add_argument("--kw_file", nargs="+", action="append", help="")
14 17
15 args = parser.parse_args() 18 args = parser.parse_args()
16 19
17 filters(args) 20 filters(args)
18 21
19 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" 22 # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
20 23 # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
21 24
def isnumber(number_format, n):
    """
    Check whether the string *n* is written as an integer or as a float.

    Parameters:
        number_format: "int" or "float"; any other value yields False.
        n: the text to test.

    Returns:
        True when *n* matches the requested format, False otherwise.

    The "int" format is an optional minus sign followed by digits without a
    leading zero, so "0" is rejected; the callers in this file use it to
    validate 1-based column numbers.  The "float" format is an optional
    minus sign, an integer part ("0" or digits without a leading zero) and
    an optional fractional part, e.g. "5", "0.5" or "-12.75".
    """
    # Raw strings keep the backslash in the pattern instead of relying on
    # Python passing unknown string escapes through.
    patterns = {
        "int": r"^-?[1-9][0-9]*$",
        "float": r"^-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?$",
    }
    pattern = patterns.get(number_format)
    if pattern is None:
        return False
    # Always hand back a real boolean rather than a match object or None.
    return re.match(pattern, n) is not None
34 41
35 def filters(args): 42 def filters(args):
43 """
44 Filter the document
45 """
36 MQfilename = args.input.split(",")[0] 46 MQfilename = args.input.split(",")[0]
37 header = args.input.split(",")[1] 47 header = args.input.split(",")[1]
38 MQfile = readMQ(MQfilename) 48 MQfile = readMQ(MQfilename)
39 results = [MQfile, None] 49 results = [MQfile, None]
40 50
41 if args.kw: 51 if args.kw:
42 keywords = args.kw 52 keywords = args.kw
43 for k in keywords: 53 for k in keywords:
44 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2]) 54 results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2])
45 if args.kw_file: 55 if args.kw_file:
54 else: 64 else:
55 raise ValueError("Please enter a number in filter by value") 65 raise ValueError("Please enter a number in filter by value")
56 66
57 # Write results to output 67 # Write results to output
58 output = open(args.output, "w") 68 output = open(args.output, "w")
59 output.write("".join(results[0])) 69 output.write("\n".join(results[0]))
60 output.close() 70 output.close()
61 71
62 # Write deleted lines to trash_file 72 # Write deleted lines to trash_file
63 trash = open(args.trash_file, "w") 73 trash = open(args.trash_file, "w")
64 #print("".join(results[1])) 74 trash.write("\n".join(results[1]))
65 trash.write("".join(results[1]))
66 trash.close() 75 trash.close()
67 76
def readOption(filename):
    """
    Read a keyword file and return its lines as one ";"-separated string.

    Parameters:
        filename: path of a text file holding one keyword per line.

    Returns:
        The file content with every newline replaced by ";".  A trailing
        newline in the file therefore produces a trailing ";".
    """
    # The context manager closes the file even if reading fails.
    with open(filename, "r") as handle:
        file_content = handle.read()
    return ";".join(file_content.split("\n"))
80 86
def readMQ(MQfilename):
    """
    Read the input file and return its non-blank lines.

    Parameters:
        MQfilename: path of the text file to read.

    Returns:
        A list of the lines of the file, each still carrying its line
        ending, with every line that is empty or contains only whitespace
        left out.
    """
    with open(MQfilename, "r") as mqfile:
        # Build a new list instead of removing items from the list being
        # iterated, which skipped the line following each removed one and
        # let consecutive blank lines through.
        return [line for line in mqfile if line.strip()]
88 94
def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
    """
    Move the lines whose ID column matches one of the keywords out of MQfile.

    Parameters:
        MQfile: list of tab-separated lines; it is updated in place and
            also returned.
        header: the string "true" when the first line of MQfile is a
            header line that must never be filtered.
        filtered_lines: lines removed by earlier filters, or None/empty
            when this is the first filter.
        ids: keywords separated by ";" (compared case-insensitively).
        ncol: column to test, written as a 1-based number optionally
            prefixed by "c" (e.g. "c1").
        match: the string "false" for substring matching; any other value
            requires a keyword to equal one of the IDs of the cell.

    Returns:
        (MQfile, filtered_lines): the kept lines and the removed lines.
        In both lists the tested cell is reduced to its first ";"-separated
        ID with double quotes removed, and lines carry no trailing newline
        because the caller writes them joined with "\\n".

    Raises:
        ValueError: when ncol is not a valid column specification.
    """
    column = ncol.replace("c", "")
    if not isnumber("int", column):
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")
    id_index = int(column) - 1  # 1-based column number -> list index

    # Split the keywords, trim them and drop the blank ones in one pass; an
    # empty keyword would match every line in substring mode.
    keywords = [kw.strip() for kw in ids.upper().split(";") if kw.strip()]

    has_header = header == "true" and len(MQfile) > 0
    header_line = MQfile[0].rstrip("\n") if has_header else ""
    content = MQfile[1:] if has_header else MQfile[:]

    if not filtered_lines:
        # First filter of the chain: start the removed-lines list, led by
        # the header when there is one.
        filtered_lines = [header_line] if header_line != "" else []

    exact = match != "false"
    kept = [header_line] if has_header else []
    for raw_line in content:
        # Lines may or may not still end with a newline (fresh from the
        # file versus already processed by a previous filter).
        fields = raw_line.rstrip("\n").split("\t")
        id_inline = fields[id_index].replace('"', "").split(";")
        # Keep only the first ID, and only in the tested column.
        fields[id_index] = id_inline[0]
        one_id_line = "\t".join(fields)

        candidates = [pid.upper() for pid in id_inline]
        if exact:
            hit = any(pid in keywords for pid in candidates)
        else:
            hit = any(kw in pid for pid in candidates for kw in keywords)

        if hit:
            filtered_lines.append(one_id_line)
        else:
            kept.append(one_id_line)

    # Preserve the in-place update of the caller's list.
    MQfile[:] = kept
    return MQfile, filtered_lines
132 144
def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
    """
    Move the lines whose numeric column fails a comparison out of MQfile.

    A line is kept when "<cell> <opt> <filter_value>" holds, or when the
    cell is not a plain decimal number; otherwise it is moved to the
    removed lines.

    Parameters:
        MQfile: list of tab-separated lines; it is updated in place and
            also returned.
        header: the string "true" when the first line of MQfile is a
            header line that must never be filtered.
        filtered_prots: lines removed by earlier filters, or None/empty
            when this is the first filter.
        filter_value: the threshold, as a number or numeric string.
        ncol: column to test, written as a 1-based number optionally
            prefixed by "c" (e.g. "c3").
        opt: one of "<", "<=", ">", ">="; any other value means equality.

    Returns:
        (MQfile, filtered_prots): the kept lines and the removed lines,
        without trailing newlines because the caller writes them joined
        with "\\n".

    Raises:
        ValueError: when ncol is not a valid column specification or
            filter_value cannot be converted to a float.
    """
    import operator

    if not ncol or not isnumber("int", ncol.replace("c", "")):
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")
    index = int(ncol.replace("c", "")) - 1  # 1-based number -> list index

    # Convert the threshold once instead of on every line.
    threshold = float(filter_value)
    comparisons = {
        "<": operator.lt,
        "<=": operator.le,
        ">": operator.gt,
        ">=": operator.ge,
    }
    keep_if = comparisons.get(opt, operator.eq)

    has_header = header == "true" and len(MQfile) > 0
    header_line = MQfile[0].rstrip("\n") if has_header else ""
    content = MQfile[1:] if has_header else MQfile[:]

    if not filtered_prots:
        # First filter of the chain: start the removed-lines list, led by
        # the header when there is one.
        filtered_prots = [header_line] if header_line != "" else []

    kept = [header_line] if has_header else []
    for raw_line in content:
        line = raw_line.rstrip("\n")
        cell = line.split("\t")[index].replace('"', "")
        number = _parse_number(cell)
        if number is None or keep_if(number, threshold):
            kept.append(line)
        else:
            filtered_prots.append(line)

    # Preserve the in-place update of the caller's list.
    MQfile[:] = kept
    return MQfile, filtered_prots  # output, trash_file


def _parse_number(text):
    """Return *text* as a float, or None when it is not a plain decimal."""
    # Allow one leading sign so that negative values are filtered too.
    unsigned = text[1:] if text.startswith(("+", "-")) else text
    if not unsigned.replace(".", "", 1).isdigit():
        return None
    try:
        return float(text)
    except ValueError:
        # Digit-like characters that float() does not understand.
        return None
178 191
179 if __name__ == "__main__": 192 if __name__ == "__main__":
180 options() 193 options()