Mercurial > repos > proteore > proteore_filter_keywords_values

--- a/filter_kw_val.py	Fri Sep 21 06:03:25 2018 -0400
+++ b/filter_kw_val.py	Tue Dec 18 09:25:11 2018 -0500
@@ -55,7 +55,7 @@
 def filters(args):
     filename = args.input.split(",")[0]
     header = str_to_bool(args.input.split(",")[1])
-    csv_file = read_file(filename)
+    csv_file = blank_to_NA(read_file(filename))
     results_dict = {}

     if args.kw:
@@ -66,18 +66,24 @@
     if args.kw_file:
         key_files = args.kw_file
         for kf in key_files:
-            keywords = read_option(kf[0])
-            results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2])
+            header = str_to_bool(kf[1])
+            ncol = column_from_txt(kf[2])
+            keywords = read_keywords_file(kf[0],header,ncol)
+            results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[3], kf[4])

     if args.value:
         for v in args.value:
+            v[0] = v[0].replace(",",".")
             if is_number("float", v[0]):
+                csv_file = comma_number_to_float(csv_file,v[1],header)
                 results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
             else:
                 raise ValueError("Please enter a number in filter by value")

     if args.values_range:
         for vr in args.values_range:
+            vr[:2] = [value.replace(",",".") for value in vr[:2]]
+            csv_file = comma_number_to_float(csv_file,vr[2],header)
             if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
                 results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])

@@ -88,20 +94,23 @@
         remaining_lines.append(csv_file[0])
         filtered_lines.append(csv_file[0])

-    for id_line,line in enumerate(csv_file) :
-        if id_line in results_dict :   #skip header and empty lines
-            if args.operator == 'OR' :
-                if any(results_dict[id_line]) :
-                    filtered_lines.append(line)
-                else :
-                    remaining_lines.append(line)
+    if results_dict == {} :   #no filter used
+        remaining_lines.extend(csv_file[1:])
+    else :
+        for id_line,line in enumerate(csv_file) :
+            if id_line in results_dict :   #skip header and empty lines
+                if args.operator == 'OR' :
+                    if any(results_dict[id_line]) :
+                        filtered_lines.append(line)
+                    else :
+                        remaining_lines.append(line)

-            elif args.operator == "AND" :
-                if all(results_dict[id_line]) :
-                    filtered_lines.append(line)
-                else :
-                    remaining_lines.append(line)
-
+                elif args.operator == "AND" :
+                    if all(results_dict[id_line]) :
+                        filtered_lines.append(line)
+                    else :
+                        remaining_lines.append(line)
+
     #sort of results by column
     if args.sort_col :
         sort_col=args.sort_col.split(",")[0]
@@ -124,29 +133,81 @@
 def sort_by_column(tab,sort_col,reverse,header):

     if len(tab) > 1 : #if there's more than just a header or 1 row
-        if header is True :
+        if header :
             head=tab[0]
             tab=tab[1:]

-        if is_number("int",tab[0][sort_col]) :
-            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
-        elif is_number("float",tab[0][sort_col]) :
+        #list of empty cells in the column to sort
+        unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')]
+        unsorted_tab=[ tab[i] for i in unsortable_lines]
+        tab= [line for i,line in enumerate(tab) if i not in unsortable_lines]
+
+        if only_number(tab,sort_col) and any_float(tab,sort_col)  :
             tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
+        elif only_number(tab,sort_col):
+            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
         else :
             tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)

+        tab.extend(unsorted_tab)
         if header is True : tab = [head]+tab

     return tab

+
+#replace all blank cells (empty string, single space or "NaN") with NA
+def blank_to_NA(csv_file) :
+
+    tmp=[]
+    for line in csv_file :
+        line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ]
+        tmp.append(line)
+
+    return tmp
+
+#replace decimal commas with dots in a column so that its values can be parsed as floats
+def comma_number_to_float(csv_file,ncol,header) :
+    ncol = int(ncol.replace("c","")) - 1
+    if header :
+        tmp=[csv_file[0]]
+        csv_file=csv_file[1:]
+    else :
+        tmp=[]
+
+    for line in csv_file :
+        line[ncol]=line[ncol].replace(",",".")
+        tmp.append(line)
+
+    return (tmp)
+
+#return True if there is at least one float in the column
+def any_float(tab,col) :
+
+    for line in tab :
+        if is_number("float",line[col].replace(",",".")) :
+            return True
+
+    return False
+
+def only_number(tab,col) :
+
+    for line in tab :
+        if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) :
+            return False
+    return True
+
 #Read the keywords file to extract the list of keywords
-def read_option(filename):
-    with open(filename, "r") as f:
-        filter_list=f.read().splitlines()
-    filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0]
-    filters=";".join(filter_list)
+def read_keywords_file(filename,header,ncol):
+    with open(filename, "r") as csv_file :
+        lines= csv.reader(csv_file, delimiter='\t')
+        lines = blank_to_NA(lines)
+        if (len(lines[0])) > 1 : keywords = [line[ncol] for line in lines]
+        else :
+            keywords= ["".join(key) for key in lines]
+    if header : keywords = keywords[1:]
+    keywords = list(set(keywords))

-    return filters
+    return keywords

 # Read input file
 def read_file(filename):
@@ -164,16 +225,11 @@
 def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
     match=str_to_bool(match)
     ncol=column_from_txt(ncol)
-
-    keywords = keywords.upper().split(";")                                            # Split list of filter keyword
-    [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""]  # Remove blank keywords
-    keywords = [k.strip() for k in keywords]        # Remove space from 2 heads of keywords
+    if type(keywords) != list : keywords = keywords.upper().split()            # Split list of filter keyword

     for id_line,line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        #line = line.replace("\n", "")
         keyword_inline = line[ncol].replace('"', "").split(";")
-        #line = line + "\n"

         #Perfect match or not
         if match is True :
@@ -192,16 +248,32 @@

     filter_value = float(filter_value)
     ncol=column_from_txt(ncol)
+    nb_string=0

     for id_line,line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        value = line[ncol].replace('"', "").strip()
+        value = line[ncol].replace('"', "").replace(",",".").strip()
         if value.replace(".", "", 1).isdigit():
             to_filter=value_compare(value,filter_value,opt)

             #adding the result to the dictionary
             if id_line in results_dict : results_dict[id_line].append(to_filter)
             else : results_dict[id_line]=[to_filter]
+
+        #impossible to treat (ex : "" instead of a number), we keep the line by default
+        else :
+            nb_string+=1
+            if id_line in results_dict : results_dict[id_line].append(False)
+            else : results_dict[id_line]=[False]
+
+    #number of lines in the csv file
+    if header : nb_lines = len(csv_file) -1
+    else : nb_lines = len(csv_file)
+
+    #if there's no numeric value in the column
+    if nb_string == nb_lines :
+        print ('No numeric values found in the column '+str(ncol+1))
+        print ('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1))

     return results_dict

@@ -211,10 +283,11 @@
     bottom_value = float(bottom_value)
     top_value=float(top_value)
     ncol=column_from_txt(ncol)
+    nb_string=0

     for id_line, line in enumerate(csv_file):
         if header is True and id_line == 0 : continue
-        value = line[ncol].replace('"', "").strip()
+        value = line[ncol].replace('"', "").replace(",",".").strip()
         if value.replace(".", "", 1).isdigit():
             value=float(value)
             if inclusive is True:
@@ -225,6 +298,22 @@
             #adding the result to the dictionary
             if id_line in results_dict : results_dict[id_line].append(in_range)
             else : results_dict[id_line]=[in_range]
+
+        #impossible to treat (ex : "" instead of a number), we keep the line by default
+        else :
+            nb_string+=1
+            if id_line in results_dict : results_dict[id_line].append(False)
+            else : results_dict[id_line]=[False]
+
+    #number of lines in the csv file
+    if header : nb_lines = len(csv_file) -1
+    else : nb_lines = len(csv_file)
+
+    #if there's no numeric value in the column
+    if nb_string == nb_lines :
+        print ('No numeric values found in the column '+str(ncol+1))
+        if inclusive : print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1))
+        else : print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1))

     return results_dict
--- a/filter_kw_val.xml	Fri Sep 21 06:03:25 2018 -0400
+++ b/filter_kw_val.xml	Tue Dec 18 09:25:11 2018 -0500
@@ -1,4 +1,4 @@
-<tool id="MQoutputfilter" name="Filter by keywords or numerical value" version="2018.09.21">
+<tool id="MQoutputfilter" name="Filter by keywords and/or numerical value" version="2018.12.18">
     <description></description>
     <requirements>
     </requirements>
@@ -18,7 +18,7 @@
                 #if $key.k.kw == "text"
                     --kw "$key.k.txt" "$key.ncol" "$key.match"
                 #else if $key.k.kw == "file"
-                    --kw_file "$key.k.file" "$key.ncol" "$key.match"
+                    --kw_file "$key.k.file" "$key.k.header" "$key.k.ncol" "$key.ncol" "$key.match"
                 #end if
             #end if
         #end for
@@ -56,25 +56,23 @@

     ]]></command>
     <inputs>
-        <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics identification and/or quantitative results" />
-        <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
-        <param name="operator" type="select" label="Please select an operator to combine your filters (if more than one)" help="OR : only one filter must be satisfied to filter a row, AND : all your filters must be satisfied to filter a row" >
+        <param type="data" name="input1" format="txt,tabular" label="Input file" />
+        <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" />
+        <param name="operator" type="select" label="Select an operator to combine your filters (if more than one)" help="OR : only one filter must be satisfied to filter a row, AND : all your filters must be satisfied to filter a row" >
             <option value="OR" selected="True">OR</option>
             <option value="AND">AND</option>
         </param>
-        <param name="sort_column" type="text" value="" label="If you want to sort the result files by values from a column, please enter a column number" help="For example : fill in 'c1' if you want to sort your result file by the column 1 values." />
-        <param name="reversed_sort" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Sort in descending order ?"/>
+
         <repeat name="keyword" title="Filter by keywords" >
-            <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
-            <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more details' />
+            <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
+            <param type="boolean" name="match" truevalue="True" label="Search for exact match?" help='Choosing "Yes" will only filter out exact match (e.g. case sensitive), see help section' />
             <conditional name="k" >
-                <param argument="--kw" type="select" label="Filter by keyword" >
-                    <option value="text" selected="true">Enter keywords (copy/paste)</option>
-                    <option value="file">Choose a file containing keywords</option>
+                <param name="kw" type="select" label="Enter keywords" >
+                    <option value="text" selected="true">copy/paste</option>
+                    <option value="file">File containing keywords</option>
                 </param>
-                <when value="None" />
                 <when value="text" >
-                    <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords must be separated by ";", for example: A8K2U0;Q5TA79;O43175' >
+                    <param name="txt" type="text" label="Copy/paste keywords to be filtered out" help='Keywords must be separated by tab, space or carriage return into the form field, for example: A8K2U0 Q5TA79 O43175' >
                         <sanitizer>
                         <valid initial="string.printable">
                             <remove value="&apos;"/>
@@ -86,15 +84,16 @@
                     </param>
                 </when>
                 <when value="file" >
-                    <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" />
+                    <param name="file" type="data" format="txt,tabular" label="File containing keywords" />
+                    <param name="ncol" type="text" value="c1" label="Specify the column containing keywords" help='For example, fill in "c1" if keywords are in the first column' />
+                    <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" />
                 </when>
             </conditional>
         </repeat>
-        <repeat name="value" title="Filter by value" >
-            <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
+        <repeat name="value" title="Filter by numerical value" >
+            <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
             <conditional name="v" >
-                <param argument="--val" type="select" label="Filter by value" >
-                    <option value="None">---</option>
+                <param argument="Numerical Value" type="select" label="Select operator" >
                     <option value="Equal">=</option>
                     <option value="Higher">&gt;</option>
                     <option value="Equal or higher">&gt;=</option>
@@ -102,8 +101,6 @@
                     <option value="Equal or lower">&lt;=</option>
                     <option value="Different">!=</option>
                 </param>
-                <when value="None" >
-                </when>
                 <when value="Equal" >
                     <param name="equal" type="float" value="" label="Value" />
                 </when>
@@ -124,16 +121,18 @@
                 </when>
             </conditional>
         </repeat>
-        <repeat name="values_range" title="Filter by range of values">
-            <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
-            <param name="bottom_value" type="float" value="" label="Please enter the bottom value" />
-            <param name="top_value" type="float" value="" label="Please enter the top value" />
+        <repeat name="values_range" title="Filter by range of numerical values">
+            <param name="ncol" type="text" value="c1" label="Column number on which to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
+            <param name="bottom_value" type="float" value="" label="Enter the bottom value" />
+            <param name="top_value" type="float" value="" label="Enter the top value" />
             <param name="inclusive" type="boolean" label="inclusive range ?" checked="false" truevalue="true" falsevalue="false" />
         </repeat>
-    </inputs>
+        <param name="sort_column" type="text" value="" label="Sort result files by:" help="Fill in 'c1' if you want to sort your result file by the column 1 values" />
+        <param name="reversed_sort" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Sort in descending order ?"/>
+    </inputs>
     <outputs>
-        <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" />
-        <data name="filtered_file" format="tabular" label="${tool.name} on ${input1.name} - Filtered lines" />
+        <data name="output1" format="tsv" label="${tool.name} on ${input1.name}" />
+        <data name="filtered_file" format="tsv" label="${tool.name} on ${input1.name} - Filtered lines" />
     </outputs>
     <tests>
         <test>
@@ -147,7 +146,7 @@
                 <param name="match" value="True" />
                 <conditional name="k">
                     <param name="kw" value="text" />
-                    <param name="txt" value="P04264;P35908;P13645;Q5D862;Q5T749;Q8IW75;P81605;P22531;P59666;P78386" />
+                    <param name="txt" value="P04264 P35908 P13645 Q5D862 Q5T749 Q8IW75 P81605 P22531 P59666 P78386" />
                 </conditional>
             </repeat>
             <repeat name="value">
@@ -162,93 +161,98 @@
         </test>
     </tests>
     <help><![CDATA[
-This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
+**Description**
+
+This tool filters out data according to different criteria such as keywords (e.g. a list of contaminants) or numerical values (e.g. intensity measurements below a given threshold).
+A boolean operator "OR/AND" lets you combine different types of filters, making this tool very powerful.
+
+-----
+
+**Input**
+
+A table (file in txt, tab, tsv, csv format) of your identification and/or quantification results for example.
+
+-----
+
+**Parameters**
+
+**AND/OR operator**
+
+As many filters as needed can be combined, you can choose how filters apply on your data by using the following boolean operators:
+
+- OR: only one filter must be satisfied to remove one row
+- AND: all filters must be satisfied to remove one row
+
+-----

 **Filter by keyword(s)**

-Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords.
+Click on the "Filter by keywords" box to use it. You can either fill in the field (copy/paste) or upload a file which contains the keywords.

-- If you choose to fill in the field, the keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175
+"Column number on which to apply the filter": You must then specify the column number of your input file on which to apply the filter by keywords.
+
+- If you choose to fill in the field, the keywords should be separated by a tab, space or carriage return in the form field, for example: A8K2U0 Q5TA79 O43175

 - If you choose to upload a file in a text format in which each line is a keyword, for example:

-REV
-
 TRYP_PIG

 ALDOA_RABBIT

-**The line that contains these keywords will be eliminated from input file.**
+LYSO_ECOLI

-**Keywords search can be applied by performing either exact match or partial one by using the following option**
+Lines that contain these keywords will be removed from the input file.
+
+"Search for exact match?": Keyword search can be performed as either an exact match or a partial match, using the following option:

 - If you choose **Yes**, only the fields that contains exactly the same content will be removed.

 - If you choose **No**, all the fields containing the keyword will be removed.

-For example:
-
-**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed.
-
-**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so  on) are removed.
-
------
-
-**Filter by values**
+Example:

-You can filter your data by a column of numerical values.
-Enter the column to be use and select one operator in the list :
+**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly the word "kinase" will be removed.

-- =
-- !=
-- <
-- <=
-- >
-- >=
-
-Then enter the value to filter and specify the column to apply that option.
-If a row contains a value that correspond to your settings, it will be filtered.
+**No** option (partial match) for "kinase": lines which contain "kinase" and lines with "alpha-kinase" (and so on) will be removed.

 -----

-**Filter by a range of values**
+**Filter by numerical values**: You can filter your data by a column of numerical values.
+
+"Column number on which to apply the filter": you must specify the column number of your input file on which to apply the filter by numerical value.
+
+Then select one of the operators in the list :

-You can also set a range of values to filter your file.
-In opposition to value filter, rows with values inside of the defined range are kept.
+- = (equal)
+- != (not equal)
+- < (lower than)
+- <= (lower than or equal to)
+- > (greater than)
+- >= (greater than or equal to)

-Rows with values outside of the defined range will be filtered.
+Then enter the numerical threshold to apply by filling the "Value" box.
+For example, if you choose > 10, each row containing a numerical value (in the chosen column of your input file) that corresponds to your settings will be filtered out.

 -----

-**AND/OR operator**
-
-Since you can add as many filters as you want, you can choose how filters apply on your data.
-
-AND or OR operator option works on all filters :
-
-- OR : only one filter to be satisfied to remove one row
-- AND : all filters must be satisfied to remove one row
+**Filter by a range of values**: You can also set a range of values to filter your file.
+Unlike the numerical value filter, rows with numerical values within the defined range will be kept while rows with values outside this range will be filtered out.

 -----

-**Sort the results files**
-
-You can sort the result file if you wish, it can help you to check results.
+**Sort results files**

-In order to do so : enter the column to be used, all columns will be sorted according to the one filled in.
-
-Rows stay intact, just in different order like excel.
-You can also choose ascending or descending order, by default ascending order is set.
+You can sort your results by column in ascending (default value) or descending order by entering the column number on which to sort the data.

 -----

 **Output**

-The tool will produce 2 output files.
+The tool returns two output files.

-* A text file containing the resulting filtered input file.
+* A text file containing the results that pass your filters

-* A text file containing the rows removed from the input file.
+* A text file containing the rows removed from the input file (i.e. containing data that do not pass your filter(s)).

 -----