# HG changeset patch
# User proteore
# Date 1527865847 14400
# Node ID 6f32c1e12572fa1c1aeff77bb3863e0d67992c85
# Parent c6ba1e6f686955126f67e085adeab232604f1357
planemo upload commit 72b345a7df2c87f07a9df71ecee1f252c9355337
diff -r c6ba1e6f6869 -r 6f32c1e12572 README.rst
--- a/README.rst Fri Apr 20 09:07:23 2018 -0400
+++ b/README.rst Fri Jun 01 11:10:47 2018 -0400
@@ -3,7 +3,7 @@
**Authors**
-T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+T.P. Lien Nguyen, David Christiany, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
@@ -15,9 +15,7 @@
This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
-**For each row, if there are more than one protein IDs/protein names/gene names, only the first one will be considered in the output**
-
-**Filter the file by keywords**
+**Filter by keyword(s)**
Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords.
@@ -45,11 +43,55 @@
**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed.
-**Filter the file by values**
+-------------------------------------------------------
+
+**Filter by values**
+
+You can filter your data by a column of numerical values.
+Enter the column to be used and select one operator from the list:
+
+- "="
+- "!="
+- "<"
+- "<="
+- ">"
+- ">="
+
+Then enter the value to filter and specify the column to apply that option.
+If a row contains a value that corresponds to your settings, it will be filtered out (removed from the output).
+
+-------------------------------------------------------
+
+**Filter by a range of values**
+
+You can also set a range of values to filter your file.
+Unlike the value filter, rows with values inside the defined range are kept.
-You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.).
+Rows with values outside the defined range will be removed.
+
+-------------------------------------------------------
+
+**AND/OR operator**
+
+Since you can add as many filters as you want, you can choose how filters apply on your data.
+
+AND or OR operator option works on all filters :
+
+- OR : a row is removed as soon as at least one filter is satisfied
+- AND : a row is removed only if all filters are satisfied
-* For each option, you can choose between "=", ">", ">=", "<" and "<=", then enter the value to filter and specify the column to apply that option.
+-------------------------------------------------------
+
+**Sort the result files**
+
+You can sort the result files if you wish; this can help you check the results.
+
+To do so, enter the column to sort by: all rows will be reordered according to the values in that column.
+
+Rows stay intact; only their order changes, as when sorting in Excel.
+You can also choose ascending or descending order; descending order is the default.
+
+-------------------------------------------------------
**Output**
diff -r c6ba1e6f6869 -r 6f32c1e12572 filter_kw_val.py
--- a/filter_kw_val.py Fri Apr 20 09:07:23 2018 -0400
+++ b/filter_kw_val.py Fri Jun 01 11:10:47 2018 -0400
@@ -1,38 +1,46 @@
-import argparse
-import re
-
+import argparse, re, csv
def options():
"""
Parse options:
-i, --input Input filename and boolean value if the file contains header ["filename,true/false"]
- -m, --match if the keywords should be filtered in exact
--kw Keyword to be filtered, the column number where this filter applies,
boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"].
This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true"
--kwfile A file that contains keywords to be filter, the column where this filter applies and
boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"]
--value The value to be filtered, the column number where this filter applies and the
- operation symbol ["value,ncol,=/>/>=/<="]
+ operation symbol ["value,ncol,=/>/>=/<=/!="]
+ --values_range Range of values to be kept, example : --values_range 5 20 c1 true
+ --operator The operator used to filter with several keywords/values : AND or OR
--o --output The output filename
- --trash_file The file contains removed lines
+ --filtered_file The file contains removed lines
+ -s --sort_col Column used to sort the file, followed by ",true" for reverse sorting or ",false" otherwise, example : c1,false
"""
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="Input file", required=True)
parser.add_argument("--kw", nargs="+", action="append", help="")
parser.add_argument("--kw_file", nargs="+", action="append", help="")
parser.add_argument("--value", nargs="+", action="append", help="")
+ parser.add_argument("--values_range", nargs="+", action="append", help="")
+ parser.add_argument("--operator", default="OR", type=str, choices=['AND','OR'],help='')
parser.add_argument("-o", "--output", default="output.txt")
- parser.add_argument("--trash_file", default="trash_MQfilter.txt")
+ parser.add_argument("--filtered_file", default="filtered_output.txt")
+ parser.add_argument("-s","--sort_col", help="")
args = parser.parse_args()
-
filters(args)
-def isnumber(number_format, n):
- """
- Check if a variable is a float or an integer
- """
+def str_to_bool(v):
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
+ return True
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+ return False
+ else:
+ raise argparse.ArgumentTypeError('Boolean value expected.')
+
+#Check if a variable is a float or an integer
+def is_number(number_format, n):
float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$")
int_format = re.compile(r"^[-]?[0-9][0-9]*$")
test = ""
@@ -43,157 +51,216 @@
if test:
return True
+#Filter the document
def filters(args):
- """
- Filter the document
- """
- MQfilename = args.input.split(",")[0]
- header = args.input.split(",")[1]
- MQfile = readMQ(MQfilename)
- results = [MQfile, None]
+ filename = args.input.split(",")[0]
+ header = str_to_bool(args.input.split(",")[1])
+ csv_file = read_file(filename)
+ results_dict = {}
if args.kw:
keywords = args.kw
for k in keywords:
- results = filter_keyword(results[0], header, results[1], k[0], k[1], k[2])
+ results_dict=filter_keyword(csv_file, header, results_dict, k[0], k[1], k[2])
+
if args.kw_file:
key_files = args.kw_file
for kf in key_files:
- ids = readOption(kf[0])
- results = filter_keyword(results[0], header, results[1], ids, kf[1], kf[2])
+ keywords = read_option(kf[0])
+ results_dict=filter_keyword(csv_file, header, results_dict, keywords, kf[1], kf[2])
+
if args.value:
for v in args.value:
- if isnumber("float", v[0]):
- results = filter_value(results[0], header, results[1], v[0], v[1], v[2])
+ if is_number("float", v[0]):
+ results_dict = filter_value(csv_file, header, results_dict, v[0], v[1], v[2])
else:
raise ValueError("Please enter a number in filter by value")
- # Write results to output
- output = open(args.output, "w")
- output.write("".join(results[0]))
- output.close()
+ if args.values_range:
+ for vr in args.values_range:
+ if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float",vr[1]) or is_number("int",vr[1])):
+ results_dict = filter_values_range(csv_file, header, results_dict, vr[0], vr[1], vr[2], vr[3])
+
+ remaining_lines=[]
+ filtered_lines=[]
- # Write deleted lines to trash_file
- trash = open(args.trash_file, "w")
- trash.write("".join(results[1]))
- trash.close()
+ if header is True :
+ remaining_lines.append(csv_file[0])
+ filtered_lines.append(csv_file[0])
+
+ for id_line,line in enumerate(csv_file) :
+ if id_line in results_dict : #skip header and empty lines
+ if args.operator == 'OR' :
+ if any(results_dict[id_line]) :
+ filtered_lines.append(line)
+ else :
+ remaining_lines.append(line)
-def readOption(filename):
- # Read the keywords file to extract the list of keywords
- f = open(filename, "r")
- file_content = f.read()
- filter_list = file_content.split("\n")
- filters = ""
- for i in filter_list:
- filters += i + ";"
- filters = filters[:-1]
+ elif args.operator == "AND" :
+ if all(results_dict[id_line]) :
+ filtered_lines.append(line)
+ else :
+ remaining_lines.append(line)
+
+ #sort of results by column
+ if args.sort_col :
+ sort_col=args.sort_col.split(",")[0]
+ sort_col=column_from_txt(sort_col)
+ reverse=str_to_bool(args.sort_col.split(",")[1])
+ remaining_lines= sort_by_column(remaining_lines,sort_col,reverse,header)
+ filtered_lines = sort_by_column(filtered_lines,sort_col,reverse,header)
+
+ # Write results to output
+ with open(args.output,"w") as output :
+ writer = csv.writer(output,delimiter="\t")
+ writer.writerows(remaining_lines)
+
+ # Write filtered lines to filtered_output
+ with open(args.filtered_file,"w") as filtered_output :
+ writer = csv.writer(filtered_output,delimiter="\t")
+ writer.writerows(filtered_lines)
+
+#function to sort the csv_file by value in a specific column
+def sort_by_column(tab,sort_col,reverse,header):
+
+ if len(tab) > 1 : #if there's more than just a header or 1 row
+ if header is True :
+ head=tab[0]
+ tab=tab[1:]
+
+ if is_number("int",tab[0][sort_col]) :
+ tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
+ elif is_number("float",tab[0][sort_col]) :
+ tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
+ else :
+ tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)
+
+ if header is True : tab = [head]+tab
+
+ return tab
+
+#Read the keywords file to extract the list of keywords
+def read_option(filename):
+ with open(filename, "r") as f:
+ filter_list=f.read().splitlines()
+ filter_list=[key for key in filter_list if len(key.replace(' ',''))!=0]
+ filters=";".join(filter_list)
+
return filters
-def readMQ(MQfilename):
- # Read input file
- mqfile = open(MQfilename, "r")
- mq = mqfile.readlines()
+# Read input file
+def read_file(filename):
+ with open(filename,"r") as f :
+ reader=csv.reader(f,delimiter="\t")
+ tab=list(reader)
+
# Remove empty lines (contain only space or new line or "")
- [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""]
- return mq
+ #[tab.remove(blank) for blank in tab if blank.isspace() or blank == ""]
+ tab=[line for line in tab if len("".join(line).replace(" ","")) !=0 ]
+
+ return tab
+
+#Search for keywords in the rows of csv_file; return a dictionary mapping each row index to a list of booleans (True if a keyword was found, False otherwise)
+def filter_keyword(csv_file, header, results_dict, keywords, ncol, match):
+ match=str_to_bool(match)
+ ncol=column_from_txt(ncol)
+
+ keywords = keywords.upper().split(";") # Split list of filter keyword
+ [keywords.remove(blank) for blank in keywords if blank.isspace() or blank == ""] # Remove blank keywords
+ keywords = [k.strip() for k in keywords] # Remove space from 2 heads of keywords
+
+ for id_line,line in enumerate(csv_file):
+ if header is True and id_line == 0 : continue
+ #line = line.replace("\n", "")
+ keyword_inline = line[ncol].replace('"', "").split(";")
+ #line = line + "\n"
+
+ #Perfect match or not
+ if match is True :
+ found_in_line = any(pid.upper() in keywords for pid in keyword_inline)
+ else:
+ found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords)
+
+ #if the keyword is found in line
+ if id_line in results_dict : results_dict[id_line].append(found_in_line)
+ else : results_dict[id_line]=[found_in_line]
+
+ return results_dict
+
+#Filter the rows of csv_file by a given value; return a dictionary mapping each row index to a list of booleans (True if the row's value matches the filter, False otherwise)
+def filter_value(csv_file, header, results_dict, filter_value, ncol, opt):
+
+ filter_value = float(filter_value)
+ ncol=column_from_txt(ncol)
-def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
- mq = MQfile
- if isnumber("int", ncol.replace("c", "")):
- id_index = int(ncol.replace("c", "")) - 1
+ for id_line,line in enumerate(csv_file):
+ if header is True and id_line == 0 : continue
+ value = line[ncol].replace('"', "").strip()
+ if value.replace(".", "", 1).isdigit():
+ to_filter=value_compare(value,filter_value,opt)
+
+ #adding the result to the dictionary
+ if id_line in results_dict : results_dict[id_line].append(to_filter)
+ else : results_dict[id_line]=[to_filter]
+
+ return results_dict
+
+#Filter the rows of csv_file by a range of values; return a dictionary mapping each row index to a list of booleans (True if the row's value is outside the range, False otherwise)
+def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive):
+ inclusive=str_to_bool(inclusive)
+ bottom_value = float(bottom_value)
+ top_value=float(top_value)
+ ncol=column_from_txt(ncol)
+
+ for id_line, line in enumerate(csv_file):
+ if header is True and id_line == 0 : continue
+ value = line[ncol].replace('"', "").strip()
+ if value.replace(".", "", 1).isdigit():
+ value=float(value)
+ if inclusive is True:
+ in_range = not (bottom_value <= value <= top_value)
+ else :
+ in_range = not (bottom_value < value < top_value)
+
+ #adding the result to the dictionary
+ if id_line in results_dict : results_dict[id_line].append(in_range)
+ else : results_dict[id_line]=[in_range]
+
+ return results_dict
+
+def column_from_txt(ncol):
+ if is_number("int", ncol.replace("c", "")):
+ ncol = int(ncol.replace("c", "")) - 1
else:
raise ValueError("Please specify the column where "
"you would like to apply the filter "
"with valid format")
-
- # Split list of filter IDs
- ids = ids.upper().split(";")
- # Remove blank IDs
- [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""]
- # Remove space from 2 heads of IDs
- ids = [id.strip() for id in ids]
-
-
- if header == "true":
- header = mq[0]
- content = mq[1:]
- else:
- header = ""
- content = mq[:]
-
- if not filtered_lines: # In case there is already some filtered lines from other filters
- filtered_lines = []
- if header != "":
- filtered_lines.append(header)
+ return ncol
- for line in content:
- line = line.replace("\n", "")
- id_inline = line.split("\t")[id_index].replace('"', "").split(";")
- # Take only first IDs
- #one_id_line = line.replace(line.split("\t")[id_index], id_inline[0])
- line = line + "\n"
-
- if match != "false":
- # Filter protein IDs
- if any(pid.upper() in ids for pid in id_inline):
- filtered_lines.append(line)
- mq.remove(line)
- #else:
- # mq[mq.index(line)] = one_id_line
- else:
- if any(ft in pid.upper() for pid in id_inline for ft in ids):
- filtered_lines.append(line)
- mq.remove(line)
- #else:
- # mq[mq.index(line)] = one_id_line
- return mq, filtered_lines
+#Return True if the comparison "value opt filter_value" holds (e.g. value < filter_value when opt is "<"), False otherwise
+def value_compare(value,filter_value,opt):
+ test_value=False
-def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
- mq = MQfile
- if ncol and isnumber("int", ncol.replace("c", "")):
- index = int(ncol.replace("c", "")) - 1
- else:
- raise ValueError("Please specify the column where "
- "you would like to apply the filter "
- "with valid format")
- if header == "true":
- header = mq[0]
- content = mq[1:]
- else:
- header = ""
- content = mq[:]
- if not filtered_prots: # In case there is already some filtered lines from other filters
- filtered_prots = []
- if header != "":
- filtered_prots.append(header)
+ if opt == "<":
+ if float(value) < filter_value:
+ test_value = True
+ elif opt == "<=":
+ if float(value) <= filter_value:
+ test_value = True
+ elif opt == ">":
+ if float(value) > filter_value:
+ test_value = True
+ elif opt == ">=":
+ if float(value) >= filter_value:
+ test_value = True
+ elif opt == "=":
+ if float(value) == filter_value:
+ test_value = True
+ elif opt == "!=":
+ if float(value) != filter_value:
+ test_value = True
- for line in content:
- prot = line.replace("\n","")
- filter_value = float(filter_value)
- pep = prot.split("\t")[index].replace('"', "")
- if pep.replace(".", "", 1).isdigit():
- if opt == "<":
- if float(pep) >= filter_value:
- filtered_prots.append(line)
- mq.remove(line)
- elif opt == "<=":
- if float(pep) > filter_value:
- filtered_prots.append(line)
- mq.remove(line)
- elif opt == ">":
- #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value)
- if float(pep) <= filter_value:
- filtered_prots.append(line)
- mq.remove(line)
- elif opt == ">=":
- if float(pep) < filter_value:
- filtered_prots.append(line)
- mq.remove(line)
- else:
- if float(pep) != filter_value:
- filtered_prots.append(line)
- mq.remove(line)
- return mq, filtered_prots
+ return test_value
if __name__ == "__main__":
options()
diff -r c6ba1e6f6869 -r 6f32c1e12572 filter_kw_val.xml
--- a/filter_kw_val.xml Fri Apr 20 09:07:23 2018 -0400
+++ b/filter_kw_val.xml Fri Jun 01 11:10:47 2018 -0400
@@ -9,10 +9,11 @@
python $__tool_directory__/filter_kw_val.py
-i "$input1,$header"
-o "$output1"
- --trash_file "$trash_file"
+ --filtered_file "$filtered_file"
+ --operator "$operator"
## Keywords
- #for $i, $key in enumerate($keyword)
+ #for $key in $keyword
#if $key.k.kw != "None"
#if $key.k.kw == "text"
--kw "$key.k.txt" "$key.ncol" "$key.match"
@@ -22,8 +23,8 @@
#end if
#end for
- ## Number of proteins
- #for $i, $val in enumerate($value)
+ ## value to filter
+ #for $val in $value
#if $val.v.val != "None"
--value
#if $val.v.val == "Equal"
@@ -34,16 +35,35 @@
$val.v.equal_higher "$val.ncol" ">="
#else if $val.v.val == "Lower"
$val.v.lower "$val.ncol" "<"
- #else
+ #else if $val.v.val == "Equal or lower"
$val.v.equal_lower "$val.ncol" "<="
+ #else
+ $val.v.different "$val.ncol" "!="
#end if
#end if
#end for
+ ##range of values to keep
+ #for $vr in $values_range
+ #if vr
+ --values_range $vr.bottom_value $vr.top_value $vr.ncol $vr.inclusive
+ #end if
+ #end for
+
+ #if $sort_column != ""
+ --sort_col "$sort_column,$reversed_sort"
+ #end if
+
]]>
+
+
+
+
+
+
@@ -71,7 +91,6 @@
-
@@ -82,6 +101,7 @@
+
@@ -100,18 +120,29 @@
+
+
+
-
+
+
+
+
+
+
-
+
+
+
+
@@ -120,16 +151,21 @@
-
-
+
+
+
+
+
+
+
+
+
"
+- ">="
+
+Then enter the value to filter and specify the column to apply that option.
+If a row contains a value that corresponds to your settings, it will be filtered out (removed from the output).
+
+-----
+
+**Filter by a range of values**
+
+You can also set a range of values to filter your file.
+Unlike the value filter, rows with values inside the defined range are kept.
-You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.).
+Rows with values outside the defined range will be removed.
+
+-----
+
+**AND/OR operator**
+
+Since you can add as many filters as you want, you can choose how filters apply on your data.
+
+AND or OR operator option works on all filters :
+
+- OR : a row is removed as soon as at least one filter is satisfied
+- AND : a row is removed only if all filters are satisfied
-* For each option, you can choose between "=", ">", ">=", "<" and "<=", then enter the value to filter and specify the column to apply that option.
+-----
+
+**Sort the result files**
+
+You can sort the result files if you wish; this can help you check the results.
+
+To do so, enter the column to sort by: all rows will be reordered according to the values in that column.
+
+Rows stay intact; only their order changes, as when sorting in Excel.
+You can also choose ascending or descending order; descending order is the default.
+
+-----
**Output**
@@ -169,7 +249,7 @@
* A text file containing the resulting filtered input file.
-* A text file containing the rows that have been filtered from the input file.
+* A text file containing the rows removed from the input file.
-----
@@ -177,7 +257,7 @@
**Authors**
-T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+T.P. Lien Nguyen, David Christiany, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR
diff -r c6ba1e6f6869 -r 6f32c1e12572 test-data/FKW_Lacombe_et_al_2017_OK.txt
--- a/test-data/FKW_Lacombe_et_al_2017_OK.txt Fri Apr 20 09:07:23 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-Protein accession number (UniProt) Protein name Number of peptides (razor + unique)
-
-P15924 Desmoplakin 69
-P02538 Keratin, type II cytoskeletal 6A 53
-P02768 Serum albumin 44
-P08779 Keratin, type I cytoskeletal 16 29
-Q02413 Desmoglein-1 24
-P07355 "Annexin A2;Putative annexin A2-like protein" 22
-P14923 Junction plakoglobin 22
-P02788 Lactotransferrin 21
-Q9HC84 Mucin-5B 21
-P29508 Serpin B3 20
-P63261 Actin, cytoplasmic 2 19
-Q8N1N4 Keratin, type II cytoskeletal 78 18
-Q04695 Keratin, type I cytoskeletal 17 18
-P01876 Ig alpha-1 chain C region 16
-Q01469 Fatty acid-binding protein 5, epidermal 15
-P31944 Caspase-14 15
-P01833 Polymeric immunoglobulin receptor 15
-P06733 Alpha-enolase 15
-P25311 Zinc-alpha-2-glycoprotein 15
-Q15149 Plectin 15
-P19013 Keratin, type II cytoskeletal 4 13
-Q6KB66 Keratin, type II cytoskeletal 80 13
-Q08188 Protein-glutamine gamma-glutamyltransferase E 12
-P13646 Keratin, type I cytoskeletal 13 11
-Q86YZ3 Hornerin 11
-P04259 Keratin, type II cytoskeletal 6B 10
-P02545 "Prelamin-A/C;Lamin-A/C" 10
-P04083 Annexin A1 10
-P11021 78 kDa glucose-regulated protein 10
-P02787 Serotransferrin 9
-P04040 Catalase 9
-P31151 Protein S100-A7 9
-P31947 14-3-3 protein sigma 9
-Q96P63 Serpin B12 9
-P14618 Pyruvate kinase PKM 9
-P60174 Triosephosphate isomerase 9
-Q06830 Peroxiredoxin-1 9
-P01040 Cystatin-A 8
-P05089 Arginase-1 8
-P01834 Ig kappa chain C region 8
-P04406 Glyceraldehyde-3-phosphate dehydrogenase 8
-P0DMV9 Heat shock 70 kDa protein 1B 8
-P13639 Elongation factor 2 8
-P35579 Myosin-9 8
-P68371 Tubulin beta-4B chain 8
-Q8WVV4 Protein POF1B 8
-O75635 Serpin B7 7
-P01857 Ig gamma-1 chain C region 7
-P61626 Lysozyme C 7
-P68363 Tubulin alpha-1B chain 7
-P01009 "Alpha-1-antitrypsin;Short peptide from AAT" 6
-P07900 Heat shock protein HSP 90-alpha 6
-Q9NZH8 Interleukin-36 gamma 6
-O43707 "Alpha-actinin-4;Alpha-actinin-1" 6
-O75223 Gamma-glutamylcyclotransferase 6
-P00338 L-lactate dehydrogenase A chain 6
-P07339 Cathepsin D 6
-P62987 Ubiquitin-60S ribosomal protein L40 6
-P10599 Thioredoxin 6
-Q9UGM3 Deleted in malignant brain tumors 1 protein 6
-Q9UI42 Carboxypeptidase A4 6
-P47929 Galectin-7 5
-Q13867 Bleomycin hydrolase 5
-Q6P4A8 Phospholipase B-like 1 5
-O75369 Filamin-B 5
-P00441 Superoxide dismutase [Cu-Zn] 5
-P04792 Heat shock protein beta-1 5
-P11142 Heat shock cognate 71 kDa protein 5
-P58107 Epiplakin 5
-P60842 Eukaryotic initiation factor 4A-I 5
-P62937 Peptidyl-prolyl cis-trans isomerase A 5
-P63104 14-3-3 protein zeta/delta 5
-Q92820 Gamma-glutamyl hydrolase 5
-O75342 Arachidonate 12-lipoxygenase, 12R-type 4
-P09211 Glutathione S-transferase P 4
-P31025 Lipocalin-1 4
-P48594 Serpin B4 4
-Q14574 Desmocollin-3 4
-Q5T750 Skin-specific protein 32 4
-Q6UWP8 Suprabasin 4
-O60911 Cathepsin L2 4
-P00558 Phosphoglycerate kinase 1 4
-P04075 Fructose-bisphosphate aldolase A 4
-P07384 Calpain-1 catalytic subunit 4
-P0CG05 Ig lambda-2 chain C regions 4
-P18206 Vinculin 4
-P62258 14-3-3 protein epsilon 4
-P68871 Hemoglobin subunit beta 4
-Q9C075 Keratin, type I cytoskeletal 23 4
-A8K2U0 Alpha-2-macroglobulin-like protein 1 3
-P00738 Haptoglobin 3
-P01011 Alpha-1-antichymotrypsin 3
-P02763 Alpha-1-acid glycoprotein 1 3
-P18510 Interleukin-1 receptor antagonist protein 3
-P22528 Cornifin-B 3
-P30740 Leukocyte elastase inhibitor 3
-P80188 Neutrophil gelatinase-associated lipocalin 3
-Q15828 Cystatin-M 3
-Q9HCY8 Protein S100-A14 3
-P01623 Ig kappa chain V-III region 3
-P01877 Ig alpha-2 chain C region 3
-P06396 Gelsolin 3
-P14735 Insulin-degrading enzyme 3
-P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3
-P25788 Proteasome subunit alpha type-3 3
-P26641 Elongation factor 1-gamma 3
-P36952 Serpin B5 3
-P40926 Malate dehydrogenase, mitochondrial 3
-Q9Y6R7 IgGFc-binding protein 3
-O95274 Ly6/PLAUR domain-containing protein 3 2
-P00491 Purine nucleoside phosphorylase 2
-P04080 Cystatin-B 2
-P09972 Fructose-bisphosphate aldolase C 2
-P19012 Keratin, type I cytoskeletal 15 2
-P20930 Filaggrin 2
-Q96FX8 p53 apoptosis effector related to PMP-22 2
-Q9UIV8 Serpin B13 2
-P01625 Ig kappa chain V-IV region Len 2
-P01765 Ig heavy chain V-III region TIL 2
-P01766 Ig heavy chain V-III region BRO 2
-P01860 Ig gamma-3 chain C region 2
-P01871 Ig mu chain C region 2
-P05090 Apolipoprotein D 2
-P06870 Kallikrein-1 2
-P07858 Cathepsin B 2
-P08865 40S ribosomal protein SA 2
-P11279 Lysosome-associated membrane glycoprotein 1 2
-P13473 Lysosome-associated membrane glycoprotein 2 2
-P19971 Thymidine phosphorylase 2
-P23284 Peptidyl-prolyl cis-trans isomerase B 2
-P23396 40S ribosomal protein S3 2
-P25705 ATP synthase subunit alpha, mitochondrial 2
-P27482 Calmodulin-like protein 3 2
-P31949 Protein S100-A11 2
-P40121 Macrophage-capping protein 2
-P42357 Histidine ammonia-lyase 2
-P47756 F-actin-capping protein subunit beta 2
-P48637 Glutathione synthetase 2
-P49720 Proteasome subunit beta type-3 2
-P50395 Rab GDP dissociation inhibitor beta 2
-P59998 Actin-related protein 2/3 complex subunit 4 2
-P61160 Actin-related protein 2 2
-P61916 Epididymal secretory protein E1 2
-P04745 Alpha-amylase 1 23
-Q9NZT1 Calmodulin-like protein 5 8
-P12273 Prolactin-inducible protein 6
-Q96DA0 Zymogen granule protein 16 homolog B 5
-P01036 Cystatin-S 5
-Q8TAX7 Mucin-7 2
-P01037 Cystatin-SN 2
-P09228 Cystatin-SA 2
-
\ No newline at end of file
diff -r c6ba1e6f6869 -r 6f32c1e12572 test-data/Lacombe_et_al_2017_OK.txt
diff -r c6ba1e6f6869 -r 6f32c1e12572 test-data/Trash_FKW_Lacombe_et_al_2017_OK.txt
--- a/test-data/Trash_FKW_Lacombe_et_al_2017_OK.txt Fri Apr 20 09:07:23 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-Protein accession number (UniProt) Protein name Number of peptides (razor + unique)
-
-P04264 Keratin, type II cytoskeletal 1 61
-P35908 Keratin, type II cytoskeletal 2 epidermal 40
-P13645 Keratin, type I cytoskeletal 10 40
-Q5D862 Filaggrin-2 14
-Q5T749 Keratinocyte proline-rich protein 13
-Q8IW75 Serpin A12 3
-P81605 Dermcidin 3
-P22531 Small proline-rich protein 2E 3
-P59666 Neutrophil defensin 3 2
-P78386 Keratin, type II cuticular Hb5 2
\ No newline at end of file
diff -r c6ba1e6f6869 -r 6f32c1e12572 test-data/filtered_output.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_output.csv Fri Jun 01 11:10:47 2018 -0400
@@ -0,0 +1,21 @@
+Protein accession number (UniProt) Protein name Number of peptides (razor + unique)
+P02538 Keratin, type II cytoskeletal 6A 53
+P02768 Serum albumin 44
+P02788 Lactotransferrin 21
+P04264 Keratin, type II cytoskeletal 1 61
+P04745 Alpha-amylase 1 23
+P07355 Annexin A2;Putative annexin A2-like protein 22
+P08779 Keratin, type I cytoskeletal 16 29
+P13645 Keratin, type I cytoskeletal 10 40
+P14923 Junction plakoglobin 22
+P15924 Desmoplakin 69
+P22531 Small proline-rich protein 2E 3
+P35908 Keratin, type II cytoskeletal 2 epidermal 40
+P59666 Neutrophil defensin 3 2
+P78386 Keratin, type II cuticular Hb5 2
+P81605 Dermcidin 3
+Q02413 Desmoglein-1 24
+Q5D862 Filaggrin-2 14
+Q5T749 Keratinocyte proline-rich protein 13
+Q8IW75 Serpin A12 3
+Q9HC84 Mucin-5B 21
diff -r c6ba1e6f6869 -r 6f32c1e12572 test-data/output.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.csv Fri Jun 01 11:10:47 2018 -0400
@@ -0,0 +1,142 @@
+Protein accession number (UniProt) Protein name Number of peptides (razor + unique)
+A8K2U0 Alpha-2-macroglobulin-like protein 1 3
+O43707 Alpha-actinin-4;Alpha-actinin-1 6
+O60911 Cathepsin L2 4
+O75223 Gamma-glutamylcyclotransferase 6
+O75342 Arachidonate 12-lipoxygenase, 12R-type 4
+O75369 Filamin-B 5
+O75635 Serpin B7 7
+O95274 Ly6/PLAUR domain-containing protein 3 2
+P00338 L-lactate dehydrogenase A chain 6
+P00441 Superoxide dismutase [Cu-Zn] 5
+P00491 Purine nucleoside phosphorylase 2
+P00558 Phosphoglycerate kinase 1 4
+P00738 Haptoglobin 3
+P01009 Alpha-1-antitrypsin;Short peptide from AAT 6
+P01011 Alpha-1-antichymotrypsin 3
+P01036 Cystatin-S 5
+P01037 Cystatin-SN 2
+P01040 Cystatin-A 8
+P01623 Ig kappa chain V-III region 3
+P01625 Ig kappa chain V-IV region Len 2
+P01765 Ig heavy chain V-III region TIL 2
+P01766 Ig heavy chain V-III region BRO 2
+P01833 Polymeric immunoglobulin receptor 15
+P01834 Ig kappa chain C region 8
+P01857 Ig gamma-1 chain C region 7
+P01860 Ig gamma-3 chain C region 2
+P01871 Ig mu chain C region 2
+P01876 Ig alpha-1 chain C region 16
+P01877 Ig alpha-2 chain C region 3
+P02545 Prelamin-A/C;Lamin-A/C 10
+P02763 Alpha-1-acid glycoprotein 1 3
+P02787 Serotransferrin 9
+P04040 Catalase 9
+P04075 Fructose-bisphosphate aldolase A 4
+P04080 Cystatin-B 2
+P04083 Annexin A1 10
+P04259 Keratin, type II cytoskeletal 6B 10
+P04406 Glyceraldehyde-3-phosphate dehydrogenase 8
+P04792 Heat shock protein beta-1 5
+P05089 Arginase-1 8
+P05090 Apolipoprotein D 2
+P06396 Gelsolin 3
+P06733 Alpha-enolase 15
+P06870 Kallikrein-1 2
+P07339 Cathepsin D 6
+P07384 Calpain-1 catalytic subunit 4
+P07858 Cathepsin B 2
+P07900 Heat shock protein HSP 90-alpha 6
+P08865 40S ribosomal protein SA 2
+P09211 Glutathione S-transferase P 4
+P09228 Cystatin-SA 2
+P09972 Fructose-bisphosphate aldolase C 2
+P0CG05 Ig lambda-2 chain C regions 4
+P0DMV9 Heat shock 70 kDa protein 1B 8
+P10599 Thioredoxin 6
+P11021 78 kDa glucose-regulated protein 10
+P11142 Heat shock cognate 71 kDa protein 5
+P11279 Lysosome-associated membrane glycoprotein 1 2
+P12273 Prolactin-inducible protein 6
+P13473 Lysosome-associated membrane glycoprotein 2 2
+P13639 Elongation factor 2 8
+P13646 Keratin, type I cytoskeletal 13 11
+P14618 Pyruvate kinase PKM 9
+P14735 Insulin-degrading enzyme 3
+P18206 Vinculin 4
+P18510 Interleukin-1 receptor antagonist protein 3
+P19012 Keratin, type I cytoskeletal 15 2
+P19013 Keratin, type II cytoskeletal 4 13
+P19971 Thymidine phosphorylase 2
+P20930 Filaggrin 2
+P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3
+P22528 Cornifin-B 3
+P23284 Peptidyl-prolyl cis-trans isomerase B 2
+P23396 40S ribosomal protein S3 2
+P25311 Zinc-alpha-2-glycoprotein 15
+P25705 ATP synthase subunit alpha, mitochondrial 2
+P25788 Proteasome subunit alpha type-3 3
+P26641 Elongation factor 1-gamma 3
+P27482 Calmodulin-like protein 3 2
+P29508 Serpin B3 20
+P30740 Leukocyte elastase inhibitor 3
+P31025 Lipocalin-1 4
+P31151 Protein S100-A7 9
+P31944 Caspase-14 15
+P31947 14-3-3 protein sigma 9
+P31949 Protein S100-A11 2
+P35579 Myosin-9 8
+P36952 Serpin B5 3
+P40121 Macrophage-capping protein 2
+P40926 Malate dehydrogenase, mitochondrial 3
+P42357 Histidine ammonia-lyase 2
+P47756 F-actin-capping protein subunit beta 2
+P47929 Galectin-7 5
+P48594 Serpin B4 4
+P48637 Glutathione synthetase 2
+P49720 Proteasome subunit beta type-3 2
+P50395 Rab GDP dissociation inhibitor beta 2
+P58107 Epiplakin 5
+P59998 Actin-related protein 2/3 complex subunit 4 2
+P60174 Triosephosphate isomerase 9
+P60842 Eukaryotic initiation factor 4A-I 5
+P61160 Actin-related protein 2 2
+P61626 Lysozyme C 7
+P61916 Epididymal secretory protein E1 2
+P62258 14-3-3 protein epsilon 4
+P62937 Peptidyl-prolyl cis-trans isomerase A 5
+P62987 Ubiquitin-60S ribosomal protein L40 6
+P63104 14-3-3 protein zeta/delta 5
+P63261 Actin, cytoplasmic 2 19
+P68363 Tubulin alpha-1B chain 7
+P68371 Tubulin beta-4B chain 8
+P68871 Hemoglobin subunit beta 4
+P80188 Neutrophil gelatinase-associated lipocalin 3
+Q01469 Fatty acid-binding protein 5, epidermal 15
+Q04695 Keratin, type I cytoskeletal 17 18
+Q06830 Peroxiredoxin-1 9
+Q08188 Protein-glutamine gamma-glutamyltransferase E 12
+Q13867 Bleomycin hydrolase 5
+Q14574 Desmocollin-3 4
+Q15149 Plectin 15
+Q15828 Cystatin-M 3
+Q5T750 Skin-specific protein 32 4
+Q6KB66 Keratin, type II cytoskeletal 80 13
+Q6P4A8 Phospholipase B-like 1 5
+Q6UWP8 Suprabasin 4
+Q86YZ3 Hornerin 11
+Q8N1N4 Keratin, type II cytoskeletal 78 18
+Q8TAX7 Mucin-7 2
+Q8WVV4 Protein POF1B 8
+Q92820 Gamma-glutamyl hydrolase 5
+Q96DA0 Zymogen granule protein 16 homolog B 5
+Q96FX8 p53 apoptosis effector related to PMP-22 2
+Q96P63 Serpin B12 9
+Q9C075 Keratin, type I cytoskeletal 23 4
+Q9HCY8 Protein S100-A14 3
+Q9NZH8 Interleukin-36 gamma 6
+Q9NZT1 Calmodulin-like protein 5 8
+Q9UGM3 Deleted in malignant brain tumors 1 protein 6
+Q9UI42 Carboxypeptidase A4 6
+Q9UIV8 Serpin B13 2
+Q9Y6R7 IgGFc-binding protein 3