diff filter_kw_val.py @ 5:1e9911190142 draft

planemo upload commit 08f1831e097df5d74bf60ff5955e7e9c8e524cc8-dirty
author proteore
date Wed, 14 Mar 2018 10:24:54 -0400
parents d29e469b6b20
children c6ba1e6f6869
line wrap: on
line diff
--- a/filter_kw_val.py	Thu Mar 08 10:41:08 2018 -0500
+++ b/filter_kw_val.py	Wed Mar 14 10:24:54 2018 -0400
@@ -4,12 +4,22 @@
 
 def options():
     """
-    Parse options
+    Parse options:
+        -i, --input     Input filename and a boolean value indicating whether the file has a header ["filename,true/false"]
+        -m, --match     if the keywords should be filtered in exact
+        --kw            Keyword to be filtered, the column number where this filter applies, and a
+                        boolean value indicating whether the keyword must match exactly ["keyword,ncol,true/false"].
+                        This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
+        --kw_file       A file that contains the keywords to be filtered, the column where this filter applies, and a
+                        boolean value indicating whether the keywords must match exactly ["filename,ncol,true/false"]
+        --value         The value to be filtered, the column number where this filter applies and the 
+                        operation symbol ["value,ncol,=/>/>=/</<="]
+        -o, --output    The output filename
+        --trash_file    The file to which the removed lines are written
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", help="Input file", required=True)
-    parser.add_argument("-m", "--match", help="Exact macth")
-    parser.add_argument("--kw", nargs="+", action="append", help="") #
+    parser.add_argument("--kw", nargs="+", action="append", help="")
     parser.add_argument("--kw_file", nargs="+", action="append", help="")
     parser.add_argument("--value", nargs="+", action="append", help="")
     parser.add_argument("-o", "--output", default="output.txt")
@@ -19,16 +29,12 @@
 
     filters(args)
 
-    # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
-    # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
-
-
 def isnumber(number_format, n):
     """
     Check if a variable is a float or an integer
     """
-    float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
-    int_format = re.compile("^[\-]?[1-9][0-9]*$")
+    float_format = re.compile(r"^[-]?[1-9][0-9]*.?[0-9]+$")
+    int_format = re.compile(r"^[-]?[1-9][0-9]*$")
     test = ""
     if number_format == "int":
         test = re.match(int_format, n)
@@ -36,8 +42,6 @@
         test = re.match(float_format, n)
     if test:
         return True
-#    else:
-#        return False
 
 def filters(args):
     """
@@ -66,15 +70,16 @@
 
     # Write results to output
     output = open(args.output, "w")
-    output.write("\n".join(results[0]))
+    output.write("".join(results[0]))
     output.close()
 
     # Write deleted lines to trash_file
     trash = open(args.trash_file, "w")
-    trash.write("\n".join(results[1]))
+    trash.write("".join(results[1]))
     trash.close()
 
 def readOption(filename):
+    # Read the keywords file to extract the list of keywords
     f = open(filename, "r")
     file_content = f.read()
     filter_list = file_content.split("\n")
@@ -85,7 +90,7 @@
     return filters
 
 def readMQ(MQfilename):
-    # Read MQ file
+    # Read input file
     mqfile = open(MQfilename, "r")
     mq = mqfile.readlines()
     # Remove empty lines (contain only space or new line or "")
@@ -95,7 +100,7 @@
 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
     mq = MQfile
     if isnumber("int", ncol.replace("c", "")):
-        id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs")
+        id_index = int(ncol.replace("c", "")) - 1 
     else:
         raise ValueError("Please specify the column where "
                          "you would like to apply the filter "
@@ -124,28 +129,29 @@
     for line in content:
         line = line.replace("\n", "")
         id_inline = line.split("\t")[id_index].replace('"', "").split(";")
-        one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs
+        # Take only first IDs
+        #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) 
         line = line + "\n"
 
         if match != "false":
             # Filter protein IDs
             if any(pid.upper() in ids for pid in id_inline):
-                filtered_lines.append(one_id_line)
+                filtered_lines.append(line)
                 mq.remove(line)
-            else:
-                mq[mq.index(line)] = one_id_line
+            #else:
+            #    mq[mq.index(line)] = one_id_line
         else:
             if any(ft in pid.upper() for pid in id_inline for ft in ids):
-                filtered_lines.append(one_id_line)
+                filtered_lines.append(line)
                 mq.remove(line)
-            else:
-                mq[mq.index(line)] = one_id_line
+            #else:
+            #    mq[mq.index(line)] = one_id_line
     return mq, filtered_lines
 
 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
     mq = MQfile
-    if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns:
-        index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names")
+    if ncol and isnumber("int", ncol.replace("c", "")): 
+        index = int(ncol.replace("c", "")) - 1 
     else:
         raise ValueError("Please specify the column where "
                          "you would like to apply the filter "
@@ -187,7 +193,7 @@
                 if float(pep) != filter_value:
                     filtered_prots.append(line)
                     mq.remove(line)
-    return mq, filtered_prots #output, trash_file
+    return mq, filtered_prots
 
 if __name__ == "__main__":
     options()