# HG changeset patch # User proteore # Date 1518769663 18000 # Node ID d29e469b6b20bfaa83f652ed9d89a4ec98f9dcb7 # Parent 6a45ccfc0e4cf1f2beed92d774df06fbaf74afab planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty diff -r 6a45ccfc0e4c -r d29e469b6b20 filter_kw_val.py --- a/filter_kw_val.py Sun Nov 26 18:36:43 2017 -0500 +++ b/filter_kw_val.py Fri Feb 16 03:27:43 2018 -0500 @@ -3,6 +3,9 @@ def options(): + """ + Parse options + """ parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="Input file", required=True) parser.add_argument("-m", "--match", help="Exact macth") @@ -16,28 +19,35 @@ filters(args) - # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" - + # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" + # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" + -def isnumber(format, n): +def isnumber(number_format, n): + """ + Check if a variable is a float or an integer + """ float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") int_format = re.compile("^[\-]?[1-9][0-9]*$") test = "" - if format == "int": + if number_format == "int": test = re.match(int_format, n) - elif format == "float": + elif number_format == "float": test = re.match(float_format, n) if test: return True - else: - return False +# else: +# return False def filters(args): + """ + Filter the document + """ MQfilename = args.input.split(",")[0] header = args.input.split(",")[1] MQfile = readMQ(MQfilename) results = [MQfile, None] - + if args.kw: keywords = args.kw for k in keywords: @@ -56,26 +66,22 @@ # Write results to output output = open(args.output, "w") - output.write("".join(results[0])) + output.write("\n".join(results[0])) output.close() # Write deleted lines to trash_file trash = open(args.trash_file, "w") - #print("".join(results[1])) - trash.write("".join(results[1])) + trash.write("\n".join(results[1])) trash.close() def readOption(filename): f = open(filename, "r") - file = f.read() - #print(file) - filter_list = file.split("\n") - #print(filter_list) + file_content = f.read() + filter_list = file_content.split("\n") filters = "" for i in filter_list: - filters += i + ":" + filters += i + ";" filters = filters[:-1] - #print(filters) return filters def readMQ(MQfilename): @@ -83,97 +89,104 @@ mqfile = open(MQfilename, "r") mq = mqfile.readlines() # Remove empty lines (contain only space or new line or "") - [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] + [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] return mq - + def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): mq = MQfile if isnumber("int", ncol.replace("c", "")): id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs") else: - raise ValueError("Please specify the column where you would like to apply the filter with valid format") - - ids = ids.upper().split(":") + raise ValueError("Please specify the column where " + "you would like to apply the filter " + "with valid format") + + # Split list of filter IDs + ids = ids.upper().split(";") + # Remove blank IDs [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""] - + # Remove space from 2 heads of IDs + ids = [id.strip() for id in ids] + + if header == "true": header = mq[0] content = mq[1:] else: header = "" content = mq[:] - + if not filtered_lines: # In case there is already some filtered lines from other filters filtered_lines = [] if header != "": filtered_lines.append(header) - for line in content: + for line in content: + line = line.replace("\n", "") id_inline = line.split("\t")[id_index].replace('"', "").split(";") one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs - + line = line + "\n" + if match != "false": # Filter protein IDs - if any (pid.upper() in ids for pid in id_inline): - #ids = prot_ids.split(":") - #print(prot_ids.split(":")) - #if prot_id in ids: + if any(pid.upper() in ids for pid in id_inline): filtered_lines.append(one_id_line) mq.remove(line) else: mq[mq.index(line)] = one_id_line else: - if any (ft in pid.upper() for pid in id_inline for ft in ids): + if any(ft in pid.upper() for pid in id_inline for ft in ids): filtered_lines.append(one_id_line) mq.remove(line) else: mq[mq.index(line)] = one_id_line return mq, filtered_lines - + def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): mq = MQfile if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns: index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names") else: - raise ValueError("Please specify the column where you would like to apply the filter with valid format") - + raise ValueError("Please specify the column where " + "you would like to apply the filter " + "with valid format") if header == "true": header = mq[0] content = mq[1:] else: header = "" content = mq[:] - if not filtered_prots: # In case there is already some filtered lines from other filters filtered_prots = [] if header != "": filtered_prots.append(header) - - for prot in content: + + for line in content: + prot = line.replace("\n","") filter_value = float(filter_value) pep = prot.split("\t")[index].replace('"', "") if pep.replace(".", "", 1).isdigit(): if opt == "<": - if not float(pep) < filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) >= filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == "<=": - if not float(pep) <= filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) > filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == ">": #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value) - if not float(pep) > filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) <= filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == ">=": - if not float(pep) >= filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) < filter_value: + filtered_prots.append(line) + mq.remove(line) else: - if not float(pep) == filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) != filter_value: + filtered_prots.append(line) + mq.remove(line) return mq, filtered_prots #output, trash_file if __name__ == "__main__": diff -r 6a45ccfc0e4c -r d29e469b6b20 filter_kw_val.xml --- a/filter_kw_val.xml Sun Nov 26 18:36:43 2017 -0500 +++ b/filter_kw_val.xml Fri Feb 16 03:27:43 2018 -0500 @@ -1,13 +1,13 @@ - - Filter a file by keywords or values + + Filter a file by keywords or numerical values " + $val.v.higher "$val.ncol" ">" #else if $val.v.val == "Equal or higher" - $val.v.equal_higher "$val.v.ncol" ">=" + $val.v.equal_higher "$val.ncol" ">=" #else if $val.v.val == "Lower" - $val.v.lower "$val.v.ncol" "<" + $val.v.lower "$val.ncol" "<" #else - $val.v.equal_lower "$val.v.ncol" "<=" + $val.v.equal_lower "$val.ncol" "<=" #end if #end if #end for - + ]]> - + - + + - + - + @@ -64,16 +65,15 @@ - - - + + @@ -87,54 +87,42 @@ - - - - - - + - + - - + + - + + - - + - - - - - - - - - - + + +