Mercurial > repos > proteore > proteore_filter_keywords_values
changeset 2:52a7afd01c6d draft
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author | proteore |
---|---|
date | Tue, 18 Dec 2018 09:25:11 -0500 |
parents | cb9555653b09 |
children | 68cee865018e |
files | filter_kw_val.py filter_kw_val.xml |
diffstat | 2 files changed, 205 insertions(+), 112 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_kw_val.py Fri Sep 21 06:03:25 2018 -0400 +++ b/filter_kw_val.py Tue Dec 18 09:25:11 2018 -0500 @@ -55,7 +55,7 @@ def filters(args): filename = args.input.split(",")[0] header = str_to_bool(args.input.split(",")[1]) - csv_file = read_file(filename) + csv_file = blank_to_NA(read_file(filename)) results_dict = {} if args.kw: @@ -66,18 +66,24 @@ if args.kw_file: key_files = args.kw_file for kf in key_files: - keywords = read_option(kf[0]) - results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2]) + header = str_to_bool(kf[1]) + ncol = column_from_txt(kf[2]) + keywords = read_keywords_file(kf[0],header,ncol) + results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4]) if args.value: for v in args.value: + v[0] = v[0].replace(",",".") if is_number("float", v[0]): + csv_file = comma_number_to_float(csv_file,v[1],header) results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2]) else: raise ValueError("Please enter a number in filter by value") if args.values_range: for vr in args.values_range: + vr[:2] = [value.replace(",",".") for value in vr[:2]] + csv_file = comma_number_to_float(csv_file,vr[2],header) if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])): results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3]) @@ -88,20 +94,23 @@ remaining_lines.append(csv_file[0]) filtered_lines.append(csv_file[0]) - for id_line,line in enumerate(csv_file) : - if id_line in results_dict : #skip header and empty lines - if args.operator == 'OR' : - if any(results_dict[id_line]) : - filtered_lines.append(line) - else : - remaining_lines.append(line) + if results_dict == {} : #no filter used + remaining_lines.extend(csv_file[1:]) + else : + for id_line,line in enumerate(csv_file) : + if id_line in results_dict : #skip header and empty lines + if args.operator == 'OR' : + if any(results_dict[id_line]) : + 
filtered_lines.append(line) + else : + remaining_lines.append(line) - elif args.operator == "AND" : - if all(results_dict[id_line]) : - filtered_lines.append(line) - else : - remaining_lines.append(line) - + elif args.operator == "AND" : + if all(results_dict[id_line]) : + filtered_lines.append(line) + else : + remaining_lines.append(line) + #sort of results by column if args.sort_col : sort_col=args.sort_col.split(",")[0] @@ -124,29 +133,81 @@ def sort_by_column(tab,sort_col,reverse,header): if len(tab) > 1 : #if there's more than just a header or 1 row - if header is True : + if header : head=tab[0] tab=tab[1:] - if is_number("int",tab[0][sort_col]) : - tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) - elif is_number("float",tab[0][sort_col]) : + #list of empty cells in the column to sort + unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] + unsorted_tab=[ tab[i] for i in unsortable_lines] + tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] + + if only_number(tab,sort_col) and any_float(tab,sort_col) : tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) + elif only_number(tab,sort_col): + tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) else : tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) + tab.extend(unsorted_tab) if header is True : tab = [head]+tab return tab + +#replace all blank cells to NA +def blank_to_NA(csv_file) : + + tmp=[] + for line in csv_file : + line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] + tmp.append(line) + + return tmp + +#turn into float a column +def comma_number_to_float(csv_file,ncol,header) : + ncol = int(ncol.replace("c","")) - 1 + if header : + tmp=[csv_file[0]] + csv_file=csv_file[1:] + else : + tmp=[] + + for line in csv_file : + line[ncol]=line[ncol].replace(",",".") + tmp.append(line) + + return (tmp) + +#return True is there is at least one 
float in the column +def any_float(tab,col) : + + for line in tab : + if is_number("float",line[col].replace(",",".")) : + return True + + return False + +def only_number(tab,col) : + + for line in tab : + if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : + return False + return True + #Read the keywords file to extract the list of keywords -def read_option(filename): - with open(filename, "r") as f: - filter_list=f.read().splitlines() - filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0] - filters=";".join(filter_list) +def read_keywords_file(filename,header,ncol): + with open(filename, "r") as csv_file : + lines= csv.reader(csv_file, delimiter='\t') + lines = blank_to_NA(lines) + if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines] + else : + keywords= ["".join(key) for key in lines] + if header : keywords = keywords[1:] + keywords = list(set(keywords)) - return filters + return keywords # Read input file def read_file(filename): @@ -164,16 +225,11 @@ def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): match=str_to_bool(match) ncol=column_from_txt(ncol) - - keywords = keywords.upper().split(";") # Split list of filter keyword - [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords - keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords + if type(keywords) != list : keywords = keywords.upper().split() # Split list of filter keyword for id_line,line in enumerate(csv_file): if header is True and id_line == 0 : continue - #line = line.replace("\n", "") keyword_inline = line[ncol].replace('"', "").split(";") - #line = line + "\n" #Perfect match or not if match is True : @@ -192,16 +248,32 @@ filter_value = float(filter_value) ncol=column_from_txt(ncol) + nb_string=0 for id_line,line in enumerate(csv_file): if header is True and id_line == 0 : continue - value = 
line[ncol].replace('"', "").strip() + value = line[ncol].replace('"', "").replace(",",".").strip() if value.replace(".", "", 1).isdigit(): to_filter=value_compare(value,filter_value,opt) #adding the result to the dictionary if id_line in results_dict : results_dict[id_line].append(to_filter) else : results_dict[id_line]=[to_filter] + + #impossible to treat (ex : "" instead of a number), we keep the line by default + else : + nb_string+=1 + if id_line in results_dict : results_dict[id_line].append(False) + else : results_dict[id_line]=[False] + + #number of lines in the csv file + if header : nb_lines = len(csv_file) -1 + else : nb_lines = len(csv_file) + + #if there's no numeric value in the column + if nb_string == nb_lines : + print ('No numeric values found in the column '+str(ncol+1)) + print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) return results_dict @@ -211,10 +283,11 @@ bottom_value = float(bottom_value) top_value=float(top_value) ncol=column_from_txt(ncol) + nb_string=0 for id_line, line in enumerate(csv_file): if header is True and id_line == 0 : continue - value = line[ncol].replace('"', "").strip() + value = line[ncol].replace('"', "").replace(",",".").strip() if value.replace(".", "", 1).isdigit(): value=float(value) if inclusive is True: @@ -225,6 +298,22 @@ #adding the result to the dictionary if id_line in results_dict : results_dict[id_line].append(in_range) else : results_dict[id_line]=[in_range] + + #impossible to treat (ex : "" instead of a number), we keep the line by default + else : + nb_string+=1 + if id_line in results_dict : results_dict[id_line].append(False) + else : results_dict[id_line]=[False] + + #number of lines in the csv file + if header : nb_lines = len(csv_file) -1 + else : nb_lines = len(csv_file) + + #if there's no numeric value in the column + if nb_string == nb_lines : + print ('No numeric values found in the column '+str(ncol+1)) + if inclusive : print ('The filter 
"'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) + else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) return results_dict
--- a/filter_kw_val.xml Fri Sep 21 06:03:25 2018 -0400 +++ b/filter_kw_val.xml Tue Dec 18 09:25:11 2018 -0500 @@ -1,4 +1,4 @@ -<tool id="MQoutputfilter" name="Filter by keywords or numerical value" version="2018.09.21"> +<tool id="MQoutputfilter" name="Filter by keywords and/or numerical value" version="2018.12.18"> <description></description> <requirements> </requirements> @@ -18,7 +18,7 @@ #if $key.k.kw == "text" --kw "$key.k.txt" "$key.ncol" "$key.match" #else if $key.k.kw == "file" - --kw_file "$key.k.file" "$key.ncol" "$key.match" + --kw_file "$key.k.file" "$key.k.header" "$key.k.ncol" "$key.ncol" "$key.match" #end if #end if #end for @@ -56,25 +56,23 @@ ]]></command> <inputs> - <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics identification and/or quantitative results" /> - <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" /> - <param name="operator" type="select" label="Please select an operator to combine your filters (if more than one)" help="OR : only one filter must be satisfied to filter a row, AND : all your filters must be satisfied to filter a row" > + <param type="data" name="input1" format="txt,tabular" label="Input file" /> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" 
/> + <param name="operator" type="select" label="Select an operator to combine your filters (if more than one)" help="OR : only one filter must be satisfied to filter a row, AND : all your filters must be satisfied to filter a row" > <option value="OR" selected="True">OR</option> <option value="AND">AND</option> </param> - <param name="sort_column" type="text" value="" label="If you want to sort the result files by values from a column, please enter a column number" help="For example : fill in 'c1' if you want to sort your result file by the column 1 values." /> - <param name="reversed_sort" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Sort in descending order ?"/> + <repeat name="keyword" title="Filter by keywords" > - <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> - <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more details' /> + <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> + <param type="boolean" name="match" truevalue="True" label="Search for exact match?" help='Choosing "Yes" will only filter out exact match (e.g. 
case sensitive), see help section' /> <conditional name="k" > - <param argument="--kw" type="select" label="Filter by keyword" > - <option value="text" selected="true">Enter keywords (copy/paste)</option> - <option value="file">Choose a file containing keywords</option> + <param name="kw" type="select" label="Enter keywords" > + <option value="text" selected="true">copy/paste</option> + <option value="file">File containing keywords</option> </param> - <when value="None" /> <when value="text" > - <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords must be separated by ";", for example: A8K2U0;Q5TA79;O43175' > + <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords must be separated by tab, space or carriage return into the form field, for example: A8K2U0 Q5TA79 O43175' > <sanitizer> <valid initial="string.printable"> <remove value="'"/> @@ -86,15 +84,16 @@ </param> </when> <when value="file" > - <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" /> + <param name="file" type="data" format="txt,tabular" label="File containing keywords" /> + <param name="ncol" type="text" value="c1" label="Specify the column containing keywords" help='For example, fill in "c1" if keywords are in the first column' /> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" 
/> </when> </conditional> </repeat> - <repeat name="value" title="Filter by value" > - <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> + <repeat name="value" title="Filter by numerical value" > + <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> <conditional name="v" > - <param argument="--val" type="select" label="Filter by value" > - <option value="None">---</option> + <param argument="Numerical Value" type="select" label="Select operator" > <option value="Equal">=</option> <option value="Higher">></option> <option value="Equal or higher">>=</option> @@ -102,8 +101,6 @@ <option value="Equal or lower"><=</option> <option value="Different">!=</option> </param> - <when value="None" > - </when> <when value="Equal" > <param name="equal" type="float" value="" label="Value" /> </when> @@ -124,16 +121,18 @@ </when> </conditional> </repeat> - <repeat name="values_range" title="Filter by range of values"> - <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> - <param name="bottom_value" type="float" value="" label="Please enter the bottom value" /> - <param name="top_value" type="float" value="" label="Please enter the top value" /> + <repeat name="values_range" title="Filter by range of numerical values"> + <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> + <param name="bottom_value" type="float" 
value="" label="Enter the bottom value" /> + <param name="top_value" type="float" value="" label="Enter the top value" /> <param name="inclusive" type="boolean" label="inclusive range ?" checked="false" truevalue="true" falsevalue="false" /> </repeat> - </inputs> + <param name="sort_column" type="text" value="" label="Sort result files by:" help="Fill in 'c1' if you want to sort your result file by the column 1 values" /> + <param name="reversed_sort" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Sort in descending order ?"/> + </inputs> <outputs> - <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" /> - <data name="filtered_file" format="tabular" label="${tool.name} on ${input1.name} - Filtered lines" /> + <data name="output1" format="tsv" label="${tool.name} on ${input1.name}" /> + <data name="filtered_file" format="tsv" label="${tool.name} on ${input1.name} - Filtered lines" /> </outputs> <tests> <test> @@ -147,7 +146,7 @@ <param name="match" value="True" /> <conditional name="k"> <param name="kw" value="text" /> - <param name="txt" value="P04264;P35908;P13645;Q5D862;Q5T749;Q8IW75;P81605;P22531;P59666;P78386" /> + <param name="txt" value="P04264 P35908 P13645 Q5D862 Q5T749 Q8IW75 P81605 P22531 P59666 P78386" /> </conditional> </repeat> <repeat name="value"> @@ -162,93 +161,98 @@ </test> </tests> <help><![CDATA[ -This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). +**Description** + +This tool allows to filter out data according to different criteria such as keywords (e.g. a list of contaminants) or numerical values (e.g. intensity measurements below a given threshold). +A boolean operator "OR/AND" allows to combine different type of filters making this tool very powerful. + +----- + +**Input** + +A table (file in txt, tab, tsv, csv format) of your identification and/or quantification results for example. 
+ +----- + +**Parameters** + +**AND/OR operator** + +As many filters as needed can be combined, you can choose how filters apply on your data by using the following boolean operators: + +- OR: only one filter must be satisfied to remove one row +- AND: all filters must be satisfied to remove one row + +----- **Filter by keyword(s)** -Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords. +Click on the "Filter by keywords" box to use it. You can either fill in the field (copy/paste) or upload a file which contains the keywords. -- If you choose to fill in the field, the keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175 +"Column number on which to apply the filter": You must then specify the column number of your input file on which to apply the filter by keywords. + +- If you choose to fill in the field, the keywords should be separated by tab, space or carriage return into the form field, for example: A8K2U0 Q5TA79 O43175 - If you choose to upload a file in a text format in which each line is a keyword, for example: -REV - TRYP_PIG ALDOA_RABBIT -**The line that contains these keywords will be eliminated from input file.** +LYSO_ECOLI -**Keywords search can be applied by performing either exact match or partial one by using the following option** +Lines that contain these keywords will be removed from input file. + +"Search for exact match?": Keywords search can be applied by performing either exact match or partial one by using the following option: - If you choose **Yes**, only the fields that contain exactly the same content will be removed. - If you choose **No**, all the fields containing the keyword will be removed. -For example: - -**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed. 
- -**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. - ------ - -**Filter by values** +Example: -You can filter your data by a column of numerical values. -Enter the column to be use and select one operator in the list : +**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly the word "kinase" will be removed. -- = -- != -- < -- <= -- > -- >= - -Then enter the value to filter and specify the column to apply that option. -If a row contains a value that correspond to your settings, it will be filtered. +**No** option (partial match) for "kinase": lines which contain "kinase" and lines with "alpha-kinase" (and so on) will be removed. ----- -**Filter by a range of values** +**Filter by numerical values**: You can filter your data by a column of numerical values. + +"Column number on which to apply the filter": you must specify the column number of your input file on which to apply the filter by numerical value. + +Then select one of the operators in the list : -You can also set a range of values to filter your file. -In opposition to value filter, rows with values inside of the defined range are kept. +- = (equal) +- != (not equal) +- < (lower than) +- <= (lower than or equal to) +- > (greater than) +- >= (greater than or equal to) -Rows with values outside of the defined range will be filtered. +Then enter the numerical threshold to apply by filling the "Value" box. +If you choose > 10, each row containing a numerical value (in the chosen column of your input file) that corresponds to your settings will be filtered out. ----- -**AND/OR operator** - -Since you can add as many filters as you want, you can choose how filters apply on your data. 
- -AND or OR operator option works on all filters : - -- OR : only one filter to be satisfied to remove one row -- AND : all filters must be satisfied to remove one row +**Filter by a range of values**: You can also set a range of values to filter your file. +Conversely to the numeric filter, rows with numerical values within the defined range will be kept while rows with values out of this range will be filtered out. ----- -**Sort the results files** - -You can sort the result file if you wish, it can help you to check results. +**Sort results files** -In order to do so : enter the column to be used, all columns will be sorted according to the one filled in. - -Rows stay intact, just in different order like excel. -You can also choose ascending or descending order, by default ascending order is set. +You can sort your results by column in ascending (default value) or descending order by entering the column number on which to sort the data. ----- **Output** -The tool will produce 2 output files. +The tool returns two output files. -* A text file containing the resulting filtered input file. +* A text file containing the results that pass your filters -* A text file containing the rows removed from the input file. +* A text file containing the rows removed from the input file (i.e. containing data that do not pass your filter(s)). -----