proteore_filter_keywords_values: filter_kw

comparison filter_kw_val.py @ 2:52a7afd01c6d draft

planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba

author	proteore
date	Tue, 18 Dec 2018 09:25:11 -0500
parents	a55e8b137c6b
children	2080e2a4f209

comparison

equal deleted inserted replaced

-:cb9555653b09
+:52a7afd01c6d
 #Filter the document
 def filters(args):
 filename = args.input.split(",")[0]
 header = str_to_bool(args.input.split(",")[1])
-csv_file = read_file(filename)
+csv_file = blank_to_NA(read_file(filename))
 results_dict = {}
 if args.kw:
 keywords = args.kw
 for k in keywords:
 results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])
 if args.kw_file:
 key_files = args.kw_file
 for kf in key_files:
-keywords = read_option(kf[0])
+header = str_to_bool(kf[1])
-results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2])
+ncol = column_from_txt(kf[2])
+keywords = read_keywords_file(kf[0],header,ncol)
+results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])
 if args.value:
 for v in args.value:
+v[0] = v[0].replace(",",".")
 if is_number("float", v[0]):
+csv_file = comma_number_to_float(csv_file,v[1],header)
 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
 else:
 raise ValueError("Please enter a number in filter by value")
 if args.values_range:
 for vr in args.values_range:
+vr[:2] = [value.replace(",",".") for value in vr[:2]]
+csv_file = comma_number_to_float(csv_file,vr[2],header)
 if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
 remaining_lines=[]
 filtered_lines=[]
 if header is True :
 remaining_lines.append(csv_file[0])
 filtered_lines.append(csv_file[0])
-for id_line,line in enumerate(csv_file) :
+if results_dict == {} :   #no filter used
-if id_line in results_dict :   #skip header and empty lines
+remaining_lines.extend(csv_file[1:])
-if args.operator == 'OR' :
+else :
-if any(results_dict[id_line]) :
+for id_line,line in enumerate(csv_file) :
-filtered_lines.append(line)
+if id_line in results_dict :   #skip header and empty lines
-else :
+if args.operator == 'OR' :
-remaining_lines.append(line)
+if any(results_dict[id_line]) :
+filtered_lines.append(line)
-elif args.operator == "AND" :
+else :
-if all(results_dict[id_line]) :
+remaining_lines.append(line)
-filtered_lines.append(line)
-else :
+elif args.operator == "AND" :
-remaining_lines.append(line)
+if all(results_dict[id_line]) :
+filtered_lines.append(line)
+else :
+remaining_lines.append(line)
 #sort of results by column
 if args.sort_col :
 sort_col=args.sort_col.split(",")[0]
 sort_col=column_from_txt(sort_col)
 reverse=str_to_bool(args.sort_col.split(",")[1])
 #function to sort the csv_file by value in a specific column
 def sort_by_column(tab,sort_col,reverse,header):
 if len(tab) > 1 : #if there's more than just a header or 1 row
-if header is True :
+if header :
 head=tab[0]
 tab=tab[1:]
-if is_number("int",tab[0][sort_col]) :
+#indexes of the lines whose cell in the column to sort is empty or 'NA' (kept aside, not sorted)
-tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
+unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')]
-elif is_number("float",tab[0][sort_col]) :
+unsorted_tab=[ tab[i] for i in unsortable_lines]
+tab= [line for i,line in enumerate(tab) if i not in unsortable_lines]
+if only_number(tab,sort_col) and any_float(tab,sort_col)  :
 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
+elif only_number(tab,sort_col):
+tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
 else :
 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
+tab.extend(unsorted_tab)
 if header is True : tab = [head]+tab
 return tab
+#replace all blank cells ("" or " ") and "NaN" cells with "NA"
+def blank_to_NA(csv_file) :
+tmp=[]
+for line in csv_file :
+line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ]
+tmp.append(line)
+return tmp
+#replace decimal commas with dots in a column (cells remain strings, header line untouched)
+def comma_number_to_float(csv_file,ncol,header) :
+ncol = int(ncol.replace("c","")) - 1
+if header :
+tmp=[csv_file[0]]
+csv_file=csv_file[1:]
+else :
+tmp=[]
+for line in csv_file :
+line[ncol]=line[ncol].replace(",",".")
+tmp.append(line)
+return (tmp)
+#return True if there is at least one float in the column
+def any_float(tab,col) :
+for line in tab :
+if is_number("float",line[col].replace(",",".")) :
+return True
+return False
+def only_number(tab,col) :
+for line in tab :
+if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) :
+return False
+return True
 #Read the keywords file to extract the list of keywords
-def read_option(filename):
+def read_keywords_file(filename,header,ncol):
-with open(filename, "r") as f:
+with open(filename, "r") as csv_file :
-filter_list=f.read().splitlines()
+lines= csv.reader(csv_file, delimiter='\t')
-filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0]
+lines = blank_to_NA(lines)
-filters=";".join(filter_list)
+if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines]
+else :
-return filters
+keywords= ["".join(key) for key in lines]
+if header : keywords = keywords[1:]
+keywords = list(set(keywords))
+return keywords
 # Read input file
 def read_file(filename):
 with open(filename,"r") as f :
 reader=csv.reader(f,delimiter="\t")
 #seek for keywords in rows of csvfile, return a dictionary of boolean (true if keyword found, false otherwise)
 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
 match=str_to_bool(match)
 ncol=column_from_txt(ncol)
+if type(keywords) != list : keywords = keywords.upper().split()            # Split list of filter keyword
-keywords = keywords.upper().split(";")                                            # Split list of filter keyword
-[keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""]  # Remove blank keywords
-keywords = [k.strip() for k in keywords]        # Remove space from 2 heads of keywords
 for id_line,line in enumerate(csv_file):
 if header is True and id_line == 0 : continue
-#line = line.replace("\n", "")
 keyword_inline = line[ncol].replace('"', "").split(";")
-#line = line + "\n"
 #Perfect match or not
 if match is True :
 found_in_line = any(pid.upper() in keywords for pid in keyword_inline)
 else:
 #filter by a determined value in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
 def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
 filter_value = float(filter_value)
 ncol=column_from_txt(ncol)
+nb_string=0
 for id_line,line in enumerate(csv_file):
 if header is True and id_line == 0 : continue
-value = line[ncol].replace('"', "").strip()
+value = line[ncol].replace('"', "").replace(",",".").strip()
 if value.replace(".", "", 1).isdigit():
 to_filter=value_compare(value,filter_value,opt)
 #adding the result to the dictionary
 if id_line in results_dict : results_dict[id_line].append(to_filter)
 else : results_dict[id_line]=[to_filter]
+#impossible to treat (ex : "" instead of a number), we keep the line by default
+else :
+nb_string+=1
+if id_line in results_dict : results_dict[id_line].append(False)
+else : results_dict[id_line]=[False]
+#number of lines in the csv file
+if header : nb_lines = len(csv_file) -1
+else : nb_lines = len(csv_file)
+#if there's no numeric value in the column
+if nb_string == nb_lines :
+print ('No numeric values found in the column '+str(ncol+1))
+print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))
 return results_dict
 #filter by a range of values (bottom/top) in rows of csvfile, return a dictionary of boolean (true if value filtered, false otherwise)
 def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
 inclusive=str_to_bool(inclusive)
 bottom_value = float(bottom_value)
 top_value=float(top_value)
 ncol=column_from_txt(ncol)
+nb_string=0
 for id_line, line in enumerate(csv_file):
 if header is True and id_line == 0 : continue
-value = line[ncol].replace('"', "").strip()
+value = line[ncol].replace('"', "").replace(",",".").strip()
 if value.replace(".", "", 1).isdigit():
 value=float(value)
 if inclusive is True:
 in_range = not (bottom_value <= value <= top_value)
 else :
 in_range = not (bottom_value < value < top_value)
 #adding the result to the dictionary
 if id_line in results_dict : results_dict[id_line].append(in_range)
 else : results_dict[id_line]=[in_range]
+#impossible to treat (ex : "" instead of a number), we keep the line by default
+else :
+nb_string+=1
+if id_line in results_dict : results_dict[id_line].append(False)
+else : results_dict[id_line]=[False]
+#number of lines in the csv file
+if header : nb_lines = len(csv_file) -1
+else : nb_lines = len(csv_file)
+#if there's no numeric value in the column
+if nb_string == nb_lines :
+print ('No numeric values found in the column '+str(ncol+1))
+if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
+else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
 return results_dict
 def column_from_txt(ncol):
 if is_number("int", ncol.replace("c", "")):

Mercurial > repos > proteore > proteore_filter_keywords_values

comparison filter_kw_val.py @ 2:52a7afd01c6d draft