proteore_filter_keywords_values: filter_kw

comparison filter_kw_val.py @ 8:98cb671a92eb draft default tip

"planemo upload commit 80e3e50ca52b5b232f91e6dd6850da606d9c4c5f-dirty"

author	proteore
date	Mon, 10 May 2021 12:27:04 +0000
parents	b4641c0f8a82
children

comparison

equal deleted inserted replaced

-:5621406a4d2f
+:98cb671a92eb
-import argparse, re, csv, sys
+import argparse
+import csv
+import re
+import sys
 def options():
 """
 Parse options:
--i, --input     Input filename and boolean value if the file contains header ["filename,true/false"]
+-i, --input     Input filename and boolean value if the file contains header ["filename,true/false"]  # noqa 501
 --kw            Keyword to be filtered, the column number where this filter applies,
 boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
 This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
 --kwfile        A file that contains keywords to be filter, the column where this filter applies and
 boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
 --value         The value to be filtered, the column number where this filter applies and the
 operation symbol ["value,ncol,=/>/>=/</<=/!="]
 --values_range  range of values to be keep, example : --values_range 5 20 c1 true
 --operation     'keep' or 'discard' lines concerned by filter(s)
 --operator      The operator used to filter with several keywords/values : AND or OR
 --o --output    The output filename
 parser.add_argument("-i", "--input", help="Input file", required=True)
 parser.add_argument("--kw", nargs="+", action="append", help="")
 parser.add_argument("--kw_file", nargs="+", action="append", help="")
 parser.add_argument("--value", nargs="+", action="append", help="")
 parser.add_argument("--values_range", nargs="+", action="append", help="")
-parser.add_argument("--operation", default="keep", type=str, choices=['keep','discard'],help='')
+parser.add_argument("--operation", default="keep", type=str, choices=['keep', 'discard'], help='')  # noqa 501
-parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='')
+parser.add_argument("--operator", default="OR", type=str, choices=['AND', 'OR'], help='')  # noqa 501
 parser.add_argument("-o", "--output", default="output.txt")
 parser.add_argument("--discarded_lines", default="filtered_output.txt")
-parser.add_argument("-s","--sort_col", help="")
+parser.add_argument("-s", "--sort_col", help="")
 args = parser.parse_args()
 filters(args)
 def str_to_bool(v):
 if v.lower() in ('yes', 'true', 't', 'y', '1'):
 return True
 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
 return False
 else:
 raise argparse.ArgumentTypeError('Boolean value expected.')
def proper_ncol(ncol, file):
    """Exit with status 1 unless ``ncol`` is a valid column index.

    ``ncol`` is a 0-based index; ``file`` is the table as a list of rows.
    The number of columns is taken from the first row, and an empty table
    is treated as having no column instead of raising IndexError.
    """
    n_columns = len(file[0]) if file else 0
    if ncol not in range(n_columns):
        # The message shows the 1-based column number the user typed.
        print("Column "+str(ncol+1)+" not found in input file")
        sys.exit(1)
# Check if a variable is a float or an integer
def is_number(number_format, n):
    """Tell whether the string ``n`` is written as a number.

    ``number_format`` is "int" or "float"; any other value yields False.
    For "float", plain decimals (with or without a fractional part) and
    scientific notation are accepted.
    """
    # The decimal point is escaped: an unescaped "." matches any character
    # and would accept strings such as "1a5".
    float_format = re.compile(r"^[-]?[0-9]+(?:\.[0-9]+)?$")
    int_format = re.compile(r"^[-]?[0-9][0-9]*$")
    scientific_number = re.compile(r"^[-+]?[\d]+\.?[\d]*[Ee](?:[-+]?[\d]+)?$")

    test = None
    if number_format == "int":
        test = re.match(int_format, n)
    elif number_format == "float":
        test = re.match(float_format, n)
        # NOTE(review): the scientific-notation fallback is applied to the
        # "float" format only - confirm against the original indentation.
        if test is None:
            test = re.match(scientific_number, n)

    return test is not None
# Filter the document
def filters(args):
    """Apply every requested filter to the input table and write the results.

    ``args`` is the namespace produced by ``options()``. Each filter appends
    one boolean per data line to ``results_dict`` (line index -> list of
    booleans). The booleans of a line are combined with ``args.operator``
    (OR -> any, AND -> all) to decide whether the line is matched.

    With ``--operation discard`` the unmatched lines go to ``args.output``
    and the matched ones to ``args.discarded_lines``; with ``keep`` the two
    sets are swapped.
    """
    input_fields = args.input.split(",")
    filename = input_fields[0]
    header = str_to_bool(input_fields[1])
    csv_file = blank_to_NA(read_file(filename))
    results_dict = {}
    operator_dict = {"Equal": "=", "Higher": ">", "Equal-or-higher": ">=",
                     "Lower": "<", "Equal-or-lower": "<=", "Different": "!="}

    if args.kw:
        for k in args.kw:
            results_dict = filter_keyword(csv_file, header, results_dict,
                                          k[0], k[1], k[2])

    if args.kw_file:
        for kf in args.kw_file:
            # The keyword file has its own header flag; it must not
            # overwrite the header flag of the input table.
            kw_header = str_to_bool(kf[1])
            # NOTE(review): the keyword-file column is validated against
            # the input table, not against the keyword file - confirm.
            ncol = column_from_txt(kf[2], csv_file)
            keywords = read_keywords_file(kf[0], kw_header, ncol)
            results_dict = filter_keyword(csv_file, header, results_dict,
                                          keywords, kf[3], kf[4])

    if args.value:
        for v in args.value:
            threshold = v[0].replace(",", ".")
            symbol = operator_dict[v[2]]
            # Accept integers as well as floats, like the range filter does.
            if is_number("float", threshold) or is_number("int", threshold):
                csv_file = comma_number_to_float(
                    csv_file, column_from_txt(v[1], csv_file), header)
                results_dict = filter_value(csv_file, header, results_dict,
                                            threshold, v[1], symbol)
            else:
                raise ValueError("Please enter a number in filter by value")

    if args.values_range:
        for vr in args.values_range:
            bottom, top = [value.replace(",", ".") for value in vr[:2]]
            csv_file = comma_number_to_float(
                csv_file, column_from_txt(vr[2], csv_file), header)
            # A range whose bounds are not numbers is ignored.
            if ((is_number("float", bottom) or is_number("int", bottom))
                    and (is_number("float", top) or is_number("int", top))):
                results_dict = filter_values_range(csv_file, header,
                                                   results_dict, bottom, top,
                                                   vr[2], vr[3])

    remaining_lines = []
    filtered_lines = []

    if header is True and csv_file:
        remaining_lines.append(csv_file[0])
        filtered_lines.append(csv_file[0])

    if results_dict == {}:   # no filter used
        # Skip the first row only when it is the header already added above.
        remaining_lines.extend(csv_file[1:] if header is True else csv_file)
    else:
        for id_line, line in enumerate(csv_file):
            if id_line not in results_dict:   # skip header and empty lines
                continue
            if args.operator == 'OR':
                matched = any(results_dict[id_line])
            elif args.operator == "AND":
                matched = all(results_dict[id_line])
            else:
                continue
            if matched:
                filtered_lines.append(line)
            else:
                remaining_lines.append(line)

    # sort of results by column
    if args.sort_col:
        sort_fields = args.sort_col.split(",")
        sort_col = column_from_txt(sort_fields[0], csv_file)
        reverse = str_to_bool(sort_fields[1])
        remaining_lines = sort_by_column(remaining_lines, sort_col,
                                         reverse, header)
        filtered_lines = sort_by_column(filtered_lines, sort_col,
                                        reverse, header)

    # swap lists of lines (files) if 'keep' option selected
    if args.operation == "keep":
        remaining_lines, filtered_lines = filtered_lines, remaining_lines

    # Write results to output (newline="" as required by the csv module)
    with open(args.output, "w", newline="") as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(remaining_lines)

    # Write filtered lines to filtered_output
    with open(args.discarded_lines, "w", newline="") as filtered_output:
        writer = csv.writer(filtered_output, delimiter="\t")
        writer.writerows(filtered_lines)
# function to sort the csv_file by value in a specific column
def sort_by_column(tab, sort_col, reverse, header):
    """Return ``tab`` sorted on column ``sort_col``.

    ``tab`` is a list of rows, ``sort_col`` a 0-based column index,
    ``reverse`` the sort direction and ``header`` tells whether the first
    row is a header to keep in first position. Rows whose sort cell is ''
    or 'NA' are left unsorted and appended after the sorted rows.
    A table with at most one row is returned unchanged.
    """
    if len(tab) <= 1:  # nothing more than a header or a single row
        return tab

    head = None
    if header:
        head = tab[0]
        tab = tab[1:]

    # Split the rows in one pass instead of testing membership in a list
    # of indexes for every row.
    sortable = []
    unsorted_tab = []
    for line in tab:
        if line[sort_col] == '' or line[sort_col] == 'NA':
            unsorted_tab.append(line)
        else:
            sortable.append(line)

    if only_number(sortable, sort_col) and any_float(sortable, sort_col):
        sortable = comma_number_to_float(sortable, sort_col, False)
        sortable = sorted(sortable, key=lambda row: float(row[sort_col]),
                          reverse=reverse)
    elif only_number(sortable, sort_col):
        sortable = sorted(sortable, key=lambda row: int(row[sort_col]),
                          reverse=reverse)
    else:
        sortable = sorted(sortable, key=lambda row: row[sort_col],
                          reverse=reverse)

    sortable.extend(unsorted_tab)
    # Same truthiness test as the one that removed the header above, so the
    # header row can never be dropped.
    if header:
        sortable = [head] + sortable

    return sortable
# replace all blank cells to NA
def blank_to_NA(csv_file):
    """Return a copy of the rows where '', ' ' and 'NaN' cells become 'NA'.

    ``csv_file`` is any iterable of rows; the result is a list of lists.
    """
    blank_cells = ("", " ", "NaN")
    return [
        ["NA" if cell in blank_cells else cell for cell in row]
        for row in csv_file
    ]
# turn into float a column
def comma_number_to_float(csv_file, ncol, header):
    """Replace decimal commas by dots in column ``ncol``.

    The cells stay strings (no numeric conversion is done) and the data
    rows are modified in place. When ``header`` is truthy the first row is
    copied through untouched. Returns the list of rows.
    """
    first_data_row = 1 if header else 0
    converted = [csv_file[0]] if header else []

    for row in csv_file[first_data_row:]:
        row[ncol] = row[ncol].replace(",", ".")
        converted.append(row)

    return converted
# return True if there is at least one float in the column
def any_float(tab, col):
    """Tell whether at least one cell of column ``col`` is a float.

    Decimal commas are read as dots before the cell is tested with
    ``is_number("float", ...)``.
    """
    return any(
        is_number("float", line[col].replace(",", "."))
        for line in tab
    )
def only_number(tab, col):
    """Tell whether every cell of column ``col`` is a float or an integer.

    Decimal commas are read as dots before the test; an empty table
    yields True.
    """
    cells = (line[col].replace(",", ".") for line in tab)
    return all(
        is_number("float", cell) or is_number("int", cell)
        for cell in cells
    )
# Read the keywords file to extract the list of keywords
def read_keywords_file(filename, header, ncol):
    """Return the distinct keywords found in a tab-separated keyword file.

    When the first row has more than one cell, the keywords are read from
    column ``ncol`` (0-based); otherwise each row is taken as one keyword.
    The first keyword is dropped when ``header`` is truthy. Blank cells
    are read as 'NA'. An empty file yields an empty list.
    """
    # newline="" lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as csv_file:
        lines = blank_to_NA(csv.reader(csv_file, delimiter='\t'))

    if not lines:
        return []

    if len(lines[0]) > 1:
        keywords = [line[ncol] for line in lines]
    else:
        keywords = ["".join(key) for key in lines]
    if header:
        keywords = keywords[1:]

    # Remove duplicates while keeping the order of the file, so the result
    # is deterministic.
    return list(dict.fromkeys(keywords))
 # Read input file
 def read_file(filename):
-with open(filename,"r") as f :
+with open(filename, "r") as f:
-reader=csv.reader(f,delimiter="\t")
+reader = csv.reader(f, delimiter="\t")
-tab=list(reader)
+tab = list(reader)
 # Remove empty lines (contain only space or new line or "")
-#[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""]
+# [tab.remove(blank) for blank in tab if blank.isspace() or blank == ""]
-tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ]
+tab = [line for line in tab if len("".join(line).replace(" ", "")) != 0]  # noqa 501
 return tab
# seek for keywords in rows of csvfile, return a dictionary of boolean
# (true if keyword found, false otherwise)
def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
    """Flag the lines whose cell in column ``ncol`` contains a keyword.

    ``keywords`` is either a whitespace-separated string or a list of
    keywords; ``ncol`` is the column as text (e.g. "c1") and ``match`` a
    textual boolean (exact match or substring match). The comparison is
    case-insensitive. A cell may hold several ';'-separated values; double
    quotes are ignored. One boolean per data line is appended to
    ``results_dict[line index]``; the updated dictionary is returned.
    """
    match = str_to_bool(match)
    ncol = column_from_txt(ncol, csv_file)

    if isinstance(keywords, list):
        # Cell values are upper-cased below, so keywords coming from a
        # file must be upper-cased too or lower-case keywords never match.
        keywords = [keyword.upper() for keyword in keywords]
    else:
        keywords = keywords.upper().split()  # Split list of filter keyword
    exact_keywords = set(keywords)

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        cell_values = [pid.upper()
                       for pid in line[ncol].replace('"', "").split(";")]

        # Perfect match or not
        if match is True:
            found_in_line = any(pid in exact_keywords for pid in cell_values)
        else:
            found_in_line = any(ft in pid
                                for pid in cell_values for ft in keywords)

        results_dict.setdefault(id_line, []).append(found_in_line)

    return results_dict
# filter by a determined value in rows of csvfile, return a dictionary
# of boolean (true if value filtered, false otherwise)
def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
    """Flag the lines whose cell in column ``ncol`` satisfies a comparison.

    ``filter_value`` is the reference number (as text), ``ncol`` the column
    as text (e.g. "c1") and ``opt`` the comparison symbol handed to
    ``value_compare``. Cells that are not numbers get False. One boolean
    per data line is appended to ``results_dict[line index]``; the updated
    dictionary is returned. A message is printed when the column holds no
    number at all.
    """
    filter_value = float(filter_value)
    ncol = column_from_txt(ncol, csv_file)
    # Signed decimals and scientific notation are numbers too; a plain
    # isdigit() test would treat e.g. "-1.5" as text.
    numeric = re.compile(r"^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$")
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        value = line[ncol].replace('"', "").replace(",", ".").strip()
        if numeric.match(value):
            to_filter = value_compare(value, filter_value, opt)
        else:
            # impossible to treat (ex : "" instead of a number),
            # the line is flagged False
            nb_string += 1
            to_filter = False
        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(to_filter)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print('No numeric values found in the column '+str(ncol+1))
        print('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))  # noqa 501

    return results_dict
# filter by a range of values in rows of csvfile, return a dictionary
# of boolean (true if the value is outside the range, false otherwise)
def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):  # noqa 501
    """Flag the lines whose cell in column ``ncol`` lies outside a range.

    ``bottom_value`` and ``top_value`` are the bounds (as text), ``ncol``
    the column as text (e.g. "c1") and ``inclusive`` a textual boolean
    telling whether the bounds belong to the range. The flag is True when
    the value is NOT within the range; cells that are not numbers get
    False. One boolean per data line is appended to
    ``results_dict[line index]``; the updated dictionary is returned.
    A message is printed when the column holds no number at all.
    """
    inclusive = str_to_bool(inclusive)
    bottom_value = float(bottom_value)
    top_value = float(top_value)
    ncol = column_from_txt(ncol, csv_file)
    # Signed decimals and scientific notation are numbers too; a plain
    # isdigit() test would treat e.g. "-1.5" as text.
    numeric = re.compile(r"^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$")
    nb_string = 0

    for id_line, line in enumerate(csv_file):
        if header is True and id_line == 0:
            continue
        value = line[ncol].replace('"', "").replace(",", ".").strip()
        if numeric.match(value):
            value = float(value)
            if inclusive is True:
                out_of_range = not (bottom_value <= value <= top_value)
            else:
                out_of_range = not (bottom_value < value < top_value)
        else:
            # impossible to treat (ex : "" instead of a number),
            # the line is flagged False
            nb_string += 1
            out_of_range = False
        # adding the result to the dictionary
        results_dict.setdefault(id_line, []).append(out_of_range)

    # number of lines in the csv file
    if header:
        nb_lines = len(csv_file) - 1
    else:
        nb_lines = len(csv_file)

    # if there's no numeric value in the column
    if nb_string == nb_lines:
        print('No numeric values found in the column '+str(ncol+1))
        if inclusive:
            print('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))  # noqa 501
        else:
            print('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))  # noqa 501

    return results_dict
def column_from_txt(ncol, file):
    """Turn a column label such as "c3" into a 0-based column index.

    Every "c" is removed from ``ncol``; what is left must be an integer,
    otherwise ValueError is raised. The resulting index is then checked
    against ``file`` with ``proper_ncol`` before being returned.
    """
    digits = ncol.replace("c", "")
    if not is_number("int", digits):
        raise ValueError("Please specify the column where "
                         "you would like to apply the filter "
                         "with valid format")

    index = int(digits) - 1
    proper_ncol(index, file)
    return index
# return True if value satisfies the comparison, false otherwise
def value_compare(value, filter_value, opt):
    """Compare ``value`` with ``filter_value`` using the symbol ``opt``.

    ``value`` is converted with float(); ``filter_value`` is a number and
    ``opt`` one of "<", "<=", ">", ">=", "=", "!=". Returns True when
    ``value <opt> filter_value`` holds, False otherwise. An unknown symbol
    yields False without converting ``value``.
    """
    # All six symbols produced by the operator table of filters() are
    # handled, each with the comparison its symbol denotes.
    comparisons = {
        "<": lambda number, reference: number < reference,
        "<=": lambda number, reference: number <= reference,
        ">": lambda number, reference: number > reference,
        ">=": lambda number, reference: number >= reference,
        "=": lambda number, reference: number == reference,
        "!=": lambda number, reference: number != reference,
    }
    compare = comparisons.get(opt)
    if compare is None:
        return False
    return bool(compare(float(value), filter_value))
# Script entry point: parse the command line and run the requested filters.
if __name__ == "__main__":
    options()

Mercurial > repos > proteore > proteore_filter_keywords_values

comparison filter_kw_val.py @ 8:98cb671a92eb draft default tip