annotate filter_kw_val.py @ 6:b4641c0f8a82 draft

planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
author proteore
date Mon, 11 Mar 2019 09:14:42 -0400
parents 33ca9ba2495a
children 98cb671a92eb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
b4641c0f8a82 planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
proteore
parents: 5
diff changeset
1 import argparse, re, csv, sys
0
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
2
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def options():
    """
    Parse the command-line options and run filters() on them.

    Repeatable options take space-separated tokens (nargs="+"); each
    occurrence is stored as one list, which filters() reads by position.

    -i, --input        Input filename and header flag, comma-joined
                       ["filename,true/false"]
    --kw               Three tokens: keyword, then two values that are passed
                       unchanged to filter_keyword() (presumably the column
                       and an exact-match flag - confirm in filter_keyword).
                       Can be repeated.
    --kw_file          Five tokens: keywords filename, header flag of that
                       file, column holding the keywords, then the same two
                       trailing values as --kw. Can be repeated.
    --value            Three tokens: numeric value, column, comparison name
                       (Equal, Higher, Equal-or-higher, Lower,
                       Equal-or-lower, Different). Can be repeated.
    --values_range     Four tokens, example: --values_range 5 20 c1 true
                       Can be repeated.
    --operation        'keep' or 'discard' lines concerned by filter(s)
    --operator         How several filters are combined: AND or OR
    -o, --output       The output filename
    --discarded_lines  The file receiving the removed lines
    -s, --sort_col     Column used to sort the output, ",true" for reverse
                       sorting, ",false" otherwise, example: c1,false
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file", required=True)
    parser.add_argument("--kw", nargs="+", action="append",
                        help="Keyword filter: keyword, column, match flag (repeatable)")
    parser.add_argument("--kw_file", nargs="+", action="append",
                        help="Keywords file filter: file, header flag, keyword column, "
                             "column, match flag (repeatable)")
    parser.add_argument("--value", nargs="+", action="append",
                        help="Value filter: value, column, comparison name (repeatable)")
    parser.add_argument("--values_range", nargs="+", action="append",
                        help="Range filter, e.g. 5 20 c1 true (repeatable)")
    parser.add_argument("--operation", default="keep", type=str, choices=['keep','discard'],
                        help="Keep or discard the lines concerned by the filter(s)")
    parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],
                        help="How several filters are combined")
    parser.add_argument("-o", "--output", default="output.txt")
    parser.add_argument("--discarded_lines", default="filtered_output.txt")
    parser.add_argument("-s","--sort_col",
                        help='Sort column and reverse flag, e.g. "c1,false"')

    args = parser.parse_args()

    filters(args)
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
36
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def str_to_bool(v):
    """
    Convert a textual boolean to a bool.

    Accepts (case-insensitively, ignoring surrounding whitespace)
    yes/true/t/y/1 as True and no/false/f/n/0 as False. A value that is
    already a bool is returned unchanged.

    Raises argparse.ArgumentTypeError for any other value.
    """
    if isinstance(v, bool):
        return v

    normalized = v.strip().lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
44
6
b4641c0f8a82 planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
proteore
parents: 5
diff changeset
def proper_ncol (ncol,file):
    """
    Exit with status 1 if column index `ncol` does not exist in `file`.

    ncol : zero-based column index.
    file : table as a list of rows; the width of the first row is used as
           the number of columns. An empty table has no columns.

    Returns None when the column exists. Otherwise prints a message
    (reporting the column as one-based) to stdout and calls sys.exit(1).
    """
    # An empty table used to raise IndexError on file[0]; treat it as zero columns.
    nb_columns = len(file[0]) if file else 0
    if ncol not in range(nb_columns):
        print("Column "+str(ncol+1)+" not found in input file")
        sys.exit(1)
b4641c0f8a82 planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
proteore
parents: 5
diff changeset
50
0
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
51 #Check if a variable is a float or an integer
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
# Patterns are compiled once at import time instead of on every call.
_INT_FORMAT = re.compile(r"^[-]?[0-9][0-9]*$")
# Digits with an optional fractional part. The dot is escaped so it no longer
# matches any character, and single-digit values such as "5" are accepted.
_FLOAT_FORMAT = re.compile(r"^[-]?[0-9]+(?:\.[0-9]+)?$")
# Exponent digits are mandatory: "1e" is not something float() can parse.
_SCIENTIFIC_FORMAT = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee][-+]?[\d]+$")


def is_number(number_format, n):
    """
    Tell whether the string `n` is written as a number of the given format.

    number_format : "int" or "float".
        "int"   - optional minus sign followed by digits only.
        "float" - decimal number with or without a fractional part
                  ("5", "12", "1.5"), or scientific notation ("1e5",
                  "2.5E-3").
    n : the string to test.

    Returns True on a match, False otherwise. Any other `number_format`
    yields False.
    """
    if number_format == "int":
        return _INT_FORMAT.match(n) is not None

    if number_format == "float":
        if _FLOAT_FORMAT.match(n) is not None:
            return True
        # Scientific notation is only tried when the plain decimal form fails.
        return _SCIENTIFIC_FORMAT.match(n) is not None

    return False
0
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
67
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
68 #Filter the document
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def filters(args):
    """
    Apply the requested filters to the input file and write both outputs.

    args : parsed options; the attributes read here are input, kw, kw_file,
           value, values_range, operator, sort_col, operation, output and
           discarded_lines.

    Each filter adds its per-line results to `results_dict` (line index ->
    list of booleans, as filled by the filter_* helpers). Lines whose
    results satisfy `args.operator` (any() for OR, all() for AND) are the
    "filtered" lines, the others the "remaining" lines. With operation
    "discard", remaining lines go to args.output and filtered lines to
    args.discarded_lines; with "keep" the two are swapped. Lines absent from
    `results_dict` (header, empty lines) are written to neither list.

    Raises ValueError when a --value filter is not given a number.
    """
    input_parts = args.input.split(",")
    filename = input_parts[0]
    header = str_to_bool(input_parts[1])
    csv_file = blank_to_NA(read_file(filename))
    results_dict = {}
    operator_dict = { "Equal" : "=" , "Higher" : ">" , "Equal-or-higher" : ">=" , "Lower" : "<" , "Equal-or-lower" : "<=" , "Different" : "!=" }

    if args.kw:
        for k in args.kw:
            results_dict = filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])

    if args.kw_file:
        for kf in args.kw_file:
            # The keywords file has its own header flag. It must not overwrite
            # `header`, which describes the input file and is used below for
            # filtering, sorting and writing.
            kw_header = str_to_bool(kf[1])
            # NOTE(review): the keyword column is resolved against the input
            # table, not the keywords file - confirm this is intended.
            ncol = column_from_txt(kf[2], csv_file)
            keywords = read_keywords_file(kf[0], kw_header, ncol)
            results_dict = filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])

    if args.value:
        for v in args.value:
            v[0] = v[0].replace(",",".")
            v[2] = operator_dict[v[2]]
            if is_number("float", v[0]):
                csv_file = comma_number_to_float(csv_file, column_from_txt(v[1], csv_file), header)
                results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
            else:
                raise ValueError("Please enter a number in filter by value")

    if args.values_range:
        for vr in args.values_range:
            vr[:2] = [value.replace(",",".") for value in vr[:2]]
            csv_file = comma_number_to_float(csv_file, column_from_txt(vr[2], csv_file), header)
            bottom_ok = is_number("float", vr[0]) or is_number("int", vr[0])
            top_ok = is_number("float", vr[1]) or is_number("int", vr[1])
            if bottom_ok and top_ok:
                results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])

    remaining_lines = []
    filtered_lines = []

    # The header row, when present, is written to both outputs.
    if header is True and csv_file:
        remaining_lines.append(csv_file[0])
        filtered_lines.append(csv_file[0])

    if not results_dict: #no filter used
        # Skip row 0 only when it is the header appended above; a file
        # without header must not lose its first data row.
        remaining_lines.extend(csv_file[1:] if header else csv_file)
    else:
        for id_line, line in enumerate(csv_file):
            if id_line not in results_dict: #skip header and empty lines
                continue

            if args.operator == 'OR':
                matched = any(results_dict[id_line])
            elif args.operator == "AND":
                matched = all(results_dict[id_line])
            else:
                continue

            if matched:
                filtered_lines.append(line)
            else:
                remaining_lines.append(line)

    #sort of results by column
    if args.sort_col:
        sort_parts = args.sort_col.split(",")
        sort_col = column_from_txt(sort_parts[0], csv_file)
        reverse = str_to_bool(sort_parts[1])
        remaining_lines = sort_by_column(remaining_lines, sort_col, reverse, header)
        filtered_lines = sort_by_column(filtered_lines, sort_col, reverse, header)

    #swap lists of lines (files) if 'keep' option selected
    if args.operation == "keep":
        remaining_lines, filtered_lines = filtered_lines, remaining_lines

    # Write results to output
    with open(args.output,"w") as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(remaining_lines)

    # Write filtered lines to filtered_output
    with open(args.discarded_lines,"w") as filtered_output:
        writer = csv.writer(filtered_output, delimiter="\t")
        writer.writerows(filtered_lines)
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
153
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
154 #function to sort the csv_file by value in a specific column
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def sort_by_column(tab,sort_col,reverse,header):
    """
    Sort the rows of `tab` by the value found in column `sort_col`.

    tab      : table as a list of rows.
    sort_col : zero-based index of the column to sort on.
    reverse  : True for descending order.
    header   : truthy when the first row is a header; it then stays first.

    Rows whose sort cell is '' or 'NA' are not sorted; they are appended
    after the sorted rows in their original order. The other rows are
    compared as floats when the column is all-numeric with at least one
    float, as ints when all-numeric otherwise, and as strings otherwise.

    A table of at most one row is returned unchanged.
    """
    if len(tab) <= 1: #nothing more than a header or a single row
        return tab

    head = None
    if header:
        head = tab[0]
        tab = tab[1:]

    # Split the rows in a single pass instead of testing membership of each
    # index in a list.
    sortable = []
    unsortable = []
    for line in tab:
        if line[sort_col] == '' or line[sort_col] == 'NA':
            unsortable.append(line)
        else:
            sortable.append(line)

    if only_number(sortable, sort_col):
        if any_float(sortable, sort_col):
            sortable = comma_number_to_float(sortable, sort_col, False)
            sortable = sorted(sortable, key=lambda row: float(row[sort_col]), reverse=reverse)
        else:
            sortable = sorted(sortable, key=lambda row: int(row[sort_col]), reverse=reverse)
    else:
        sortable = sorted(sortable, key=lambda row: row[sort_col], reverse=reverse)

    sortable.extend(unsortable)
    # Same test as the one that removed the header, so it is always restored.
    if header:
        sortable = [head] + sortable

    return sortable
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
179
2
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
180
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
181 #replace all blank cells to NA
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
def blank_to_NA(csv_file) :
    """
    Return a copy of the table where blank cells are replaced by "NA".

    csv_file : table as a list of rows (each row a list of cells).

    A cell is blank when it equals "", a single space, or "NaN". The input
    rows are left untouched; new row lists are returned.
    """
    blank_values = ("", " ", "NaN")
    return [["NA" if cell in blank_values else cell for cell in line]
            for line in csv_file]
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
190
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
191 #replace decimal commas with dots in a column (cells stay strings)
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
def comma_number_to_float(csv_file,ncol,header) :
    """
    Return the table with every "," of column `ncol` replaced by ".".

    csv_file : table as a list of rows.
    ncol     : zero-based index of the column to rewrite.
    header   : truthy when the first row is a header; that row is returned
               unchanged.

    The cells stay strings; only the decimal separator changes. Data rows
    are copied, so the rows passed in by the caller are not modified.
    """
    if header :
        # Slicing instead of indexing keeps an empty table from raising.
        converted = list(csv_file[:1])
        data_rows = csv_file[1:]
    else :
        converted = []
        data_rows = csv_file

    for line in data_rows :
        new_line = list(line)
        new_line[ncol] = new_line[ncol].replace(",",".")
        converted.append(new_line)

    return converted
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
204
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
205 #return True if there is at least one float in the column
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
def any_float(tab,col) :
    """
    Tell whether at least one cell of column `col` is written as a float.

    tab : table as a list of rows.
    col : zero-based column index.

    A decimal comma is read as a dot before testing with
    is_number("float", ...). Returns False for an empty table.
    """
    return any(is_number("float", line[col].replace(",",".")) for line in tab)
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
213
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
def only_number(tab, col):
    """Tell whether every cell of column `col` in `tab` is numeric.

    A cell counts as numeric when, after commas are replaced by dots,
    is_number accepts it as a "float" or, failing that, as an "int".

    tab -- iterable of rows, each row an indexable sequence of strings
    col -- index of the column to inspect

    Returns False at the first cell that is neither, True otherwise
    (including when `tab` is empty).
    """
    for row in tab:
        cell = row[col].replace(",", ".")
        if is_number("float", cell):
            continue
        if is_number("int", cell):
            continue
        return False
    return True
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
219
0
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
220 #Read the keywords file to extract the list of keywords
2
52a7afd01c6d planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
proteore
parents: 0
diff changeset
def read_keywords_file(filename, header, ncol):
    """Return the distinct keywords stored in a tab-separated file.

    filename -- path of the keywords file
    header   -- truthy when the first row is a header to be dropped
    ncol     -- zero-based index of the keyword column; only used when
                the first row has more than one cell

    The rows are passed through blank_to_NA() before the keywords are
    extracted. Duplicates are removed, so the order of the returned
    list is not defined. An empty file yields an empty list.
    """
    with open(filename, "r") as csv_file:
        lines = csv.reader(csv_file, delimiter='\t')
        lines = blank_to_NA(lines)

    # Nothing to extract: without this guard lines[0] raises IndexError.
    if len(lines) == 0:
        return []

    if len(lines[0]) > 1:
        keywords = [line[ncol] for line in lines]
    else:
        # Single-column file: each row is the keyword itself.
        keywords = ["".join(key) for key in lines]

    if header:
        keywords = keywords[1:]

    return list(set(keywords))
0
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
232
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
233 # Read input file
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def read_file(filename):
    """Load a tab-separated file as a list of rows, skipping blank rows.

    filename -- path of the file to read

    Returns a list of rows, each row being the list of cell strings
    produced by csv.reader. A row is skipped when its cells, glued
    together, contain nothing but space characters.
    """
    rows = []
    with open(filename, "r") as handle:
        for fields in csv.reader(handle, delimiter="\t"):
            # Blank row: only spaces (or nothing) remain once the cells are joined.
            if "".join(fields).replace(" ", ""):
                rows.append(fields)
    return rows
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
244
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
245 #search for keywords in rows of csvfile, return a dictionary of booleans (true if keyword found, false otherwise)
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
    """Flag the rows of `csv_file` whose column `ncol` contains a keyword.

    csv_file     -- list of rows (lists of strings)
    header       -- True when the first row is a header and must be skipped
    results_dict -- dict mapping a row index to the list of flags gathered
                    so far; updated in place and returned
    keywords     -- either a list of keywords or a single
                    whitespace-separated string of keywords
    ncol         -- column designation, resolved by column_from_txt()
    match        -- value converted by str_to_bool(); True requests an exact
                    match, anything else a substring match

    The cell is stripped of double quotes and split on ";". The comparison
    is case-insensitive: both the keywords and the cell items are
    upper-cased. One boolean per data row is appended to
    results_dict[row index].
    """
    match = str_to_bool(match)
    ncol = column_from_txt(ncol, csv_file)

    # Normalise the keywords to an upper-cased list. A list argument used
    # to be left as-is, which made matching case-sensitive for it only.
    if isinstance(keywords, list):
        keywords = [keyword.upper() for keyword in keywords]
    else:
        keywords = keywords.upper().split()

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        keyword_inline = [item.upper() for item in line[ncol].replace('"', "").split(";")]

        # Exact match or substring match
        if match is True:
            found_in_line = any(pid in keywords for pid in keyword_inline)
        else:
            found_in_line = any(ft in pid for pid in keyword_inline for ft in keywords)

        results_dict.setdefault(id_line, []).append(found_in_line)

    return results_dict
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
266
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
267 #filter by a determined value in rows of csvfile, return a dictionary of booleans (true if value filtered, false otherwise)
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
    """Flag the rows of `csv_file` whose column `ncol` satisfies a comparison.

    csv_file     -- list of rows (lists of strings)
    header       -- True when the first row is a header and must be skipped
    results_dict -- dict mapping a row index to the list of flags gathered
                    so far; updated in place and returned
    filter_value -- threshold, converted with float()
    ncol         -- column designation, resolved by column_from_txt()
    opt          -- comparison operator string handed to value_compare()

    Each cell is stripped of double quotes and surrounding whitespace, and
    its commas are replaced by dots. A cell that parses as a finite number
    is flagged with the result of value_compare(); any other cell is
    flagged False (the row is kept by default). A notice is printed when
    the column holds no numeric value at all.

    Raises ValueError when `filter_value` cannot be converted to float.
    """
    filter_value = float(filter_value)
    ncol = column_from_txt(ncol, csv_file)
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        value = line[ncol].replace('"', "").replace(",", ".").strip()

        # Parse with float() so that negative and scientific-notation values
        # ("-1.5", "1e-5") are recognised; a digits-only test rejects them.
        try:
            number = float(value)
        except ValueError:
            number = None
        else:
            # float() also accepts "nan" and "inf"; keep treating those as
            # non-numeric text.
            if number != number or abs(number) == float("inf"):
                number = None

        if number is None:
            # impossible to treat (ex : "" instead of a number), we keep the line by default
            nb_string += 1
            to_filter = False
        else:
            to_filter = value_compare(number, filter_value, opt)

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(to_filter)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print ('No numeric values found in the column '+str(ncol+1))
        print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))

    return results_dict
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
300
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
301 #filter by a determined range of values in rows of csvfile, return a dictionary of booleans (true if value filtered, false otherwise)
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
    """Flag the rows of `csv_file` whose column `ncol` lies outside a range.

    csv_file     -- list of rows (lists of strings)
    header       -- True when the first row is a header and must be skipped
    results_dict -- dict mapping a row index to the list of flags gathered
                    so far; updated in place and returned
    bottom_value -- lower bound, converted with float()
    top_value    -- upper bound, converted with float()
    ncol         -- column designation, resolved by column_from_txt()
    inclusive    -- value converted by str_to_bool(); True makes both
                    bounds part of the range

    Each cell is stripped of double quotes and surrounding whitespace, and
    its commas are replaced by dots. A cell that parses as a finite number
    is flagged True when it falls outside the range; any other cell is
    flagged False (the row is kept by default). A notice is printed when
    the column holds no numeric value at all.

    Raises ValueError when a bound cannot be converted to float.
    """
    inclusive = str_to_bool(inclusive)
    bottom_value = float(bottom_value)
    top_value = float(top_value)
    ncol = column_from_txt(ncol, csv_file)
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        value = line[ncol].replace('"', "").replace(",", ".").strip()

        # Parse with float() so that negative and scientific-notation values
        # ("-1.5", "1e-5") are recognised; a digits-only test rejects them.
        try:
            number = float(value)
        except ValueError:
            number = None
        else:
            # float() also accepts "nan" and "inf"; keep treating those as
            # non-numeric text.
            if number != number or abs(number) == float("inf"):
                number = None

        if number is None:
            # impossible to treat (ex : "" instead of a number), we keep the line by default
            nb_string += 1
            out_of_range = False
        elif inclusive is True:
            out_of_range = not (bottom_value <= number <= top_value)
        else:
            out_of_range = not (bottom_value < number < top_value)

        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(out_of_range)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print ('No numeric values found in the column '+str(ncol+1))
        if inclusive:
            print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
        else:
            print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))

    return results_dict
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
340
6
b4641c0f8a82 planemo upload commit 77279e994f5751c6cd9aa165aa0604db3d241271-dirty
proteore
parents: 5
diff changeset
def column_from_txt(ncol, file):
    """Turn a column designation such as "c3" into a zero-based index.

    ncol -- column designation string; every "c" character is removed and
            the remainder must be accepted by is_number("int", ...)
    file -- table forwarded, together with the index, to proper_ncol()

    Returns the designated column number minus one.
    Raises ValueError when the designation is not a valid integer.
    """
    digits = ncol.replace("c", "")
    if not is_number("int", digits):
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    index = int(digits) - 1
    proper_ncol(index, file)
    return index
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
352
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
353 #return True if value satisfies the comparison opt against filter_value, False otherwise
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
def value_compare(value, filter_value, opt):
    """Compare `value` with `filter_value` using the operator named by `opt`.

    value        -- number, or string convertible with float()
    filter_value -- threshold the converted value is compared with
    opt          -- one of "<", "<=", ">", ">=", "=", "!="

    Returns True when the comparison holds, False when it does not or
    when `opt` is not one of the operators above. `value` is converted
    only for a known operator.
    """
    comparisons = {
        "<": lambda number, bound: number < bound,
        "<=": lambda number, bound: number <= bound,
        ">": lambda number, bound: number > bound,
        ">=": lambda number, bound: number >= bound,
        "=": lambda number, bound: number == bound,
        "!=": lambda number, bound: number != bound,
    }

    check = comparisons.get(opt)
    if check is None:
        return False
    return bool(check(float(value), filter_value))
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
377
a55e8b137c6b planemo upload commit 688c456ca57914a63c20eba942ec5fe81e896099-dirty
proteore
parents:
diff changeset
# Script entry point: call options() (defined earlier in this file and
# documented there as parsing the command-line options) only when the file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    options()