Mercurial > repos > proteore > filter_keywords_values

--- a/filter_kw_val.py	Sun Nov 26 18:36:43 2017 -0500
+++ b/filter_kw_val.py	Fri Feb 16 03:27:43 2018 -0500
@@ -3,6 +3,9 @@


 def options():
+    """
+    Parse options
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", help="Input file", required=True)
     parser.add_argument("-m", "--match", help="Exact macth")
@@ -16,28 +19,35 @@

     filters(args)

-    # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
-
+    # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt"
+    # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt"
+

-def isnumber(format, n):
+def isnumber(number_format, n):
+    """
+    Check if a variable is a float or an integer
+    """
     float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$")
     int_format = re.compile("^[\-]?[1-9][0-9]*$")
     test = ""
-    if format == "int":
+    if number_format == "int":
         test = re.match(int_format, n)
-    elif format == "float":
+    elif number_format == "float":
         test = re.match(float_format, n)
     if test:
         return True
-    else:
-        return False
+#    else:
+#        return False

 def filters(args):
+    """
+    Filter the document
+    """
     MQfilename = args.input.split(",")[0]
     header = args.input.split(",")[1]
     MQfile = readMQ(MQfilename)
     results = [MQfile, None]
-
+
     if args.kw:
         keywords = args.kw
         for k in keywords:
@@ -56,26 +66,22 @@

     # Write results to output
     output = open(args.output, "w")
-    output.write("".join(results[0]))
+    output.write("\n".join(results[0]))
     output.close()

     # Write deleted lines to trash_file
     trash = open(args.trash_file, "w")
-    #print("".join(results[1]))
-    trash.write("".join(results[1]))
+    trash.write("\n".join(results[1]))
     trash.close()

 def readOption(filename):
     f = open(filename, "r")
-    file = f.read()
-    #print(file)
-    filter_list = file.split("\n")
-    #print(filter_list)
+    file_content = f.read()
+    filter_list = file_content.split("\n")
     filters = ""
     for i in filter_list:
-        filters += i + ":"
+        filters += i + ";"
     filters = filters[:-1]
-    #print(filters)
     return filters

 def readMQ(MQfilename):
@@ -83,97 +89,104 @@
     mqfile = open(MQfilename, "r")
     mq = mqfile.readlines()
     # Remove empty lines (contain only space or new line or "")
-    [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""]
+    [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""]
     return mq
-
+
 def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match):
     mq = MQfile
     if isnumber("int", ncol.replace("c", "")):
         id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs")
     else:
-        raise ValueError("Please specify the column where you would like to apply the filter with valid format")
-
-    ids = ids.upper().split(":")
+        raise ValueError("Please specify the column where "
+                         "you would like to apply the filter "
+                         "with valid format")
+
+    # Split list of filter IDs
+    ids = ids.upper().split(";")
+    # Remove blank IDs
     [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""]
-
+    # Strip leading and trailing whitespace from each ID
+    ids = [id.strip() for id in ids]
+
+
     if header == "true":
         header = mq[0]
         content = mq[1:]
     else:
         header = ""
         content = mq[:]
-
+
     if not filtered_lines: # In case there is already some filtered lines from other filters
         filtered_lines = []
         if header != "":
             filtered_lines.append(header)

-    for line in content:
+    for line in content:
+        line = line.replace("\n", "")
         id_inline = line.split("\t")[id_index].replace('"', "").split(";")
         one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs
-
+        line = line + "\n"
+
         if match != "false":
             # Filter protein IDs
-            if any (pid.upper() in ids for pid in id_inline):
-                #ids = prot_ids.split(":")
-                #print(prot_ids.split(":"))
-                #if prot_id in ids:
+            if any(pid.upper() in ids for pid in id_inline):
                 filtered_lines.append(one_id_line)
                 mq.remove(line)
             else:
                 mq[mq.index(line)] = one_id_line
         else:
-            if any (ft in pid.upper() for pid in id_inline for ft in ids):
+            if any(ft in pid.upper() for pid in id_inline for ft in ids):
                 filtered_lines.append(one_id_line)
                 mq.remove(line)
             else:
                 mq[mq.index(line)] = one_id_line
     return mq, filtered_lines
-
+
 def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt):
     mq = MQfile
     if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns:
         index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names")
     else:
-        raise ValueError("Please specify the column where you would like to apply the filter with valid format")
-
+        raise ValueError("Please specify the column where "
+                         "you would like to apply the filter "
+                         "with valid format")
     if header == "true":
         header = mq[0]
         content = mq[1:]
     else:
         header = ""
         content = mq[:]
-
     if not filtered_prots: # In case there is already some filtered lines from other filters
         filtered_prots = []
         if header != "":
             filtered_prots.append(header)
-
-    for prot in content:
+
+    for line in content:
+        prot = line.replace("\n","")
         filter_value = float(filter_value)
         pep = prot.split("\t")[index].replace('"', "")
         if pep.replace(".", "", 1).isdigit():
             if opt == "<":
-                if not float(pep) < filter_value:
-                    filtered_prots.append(prot)
-                    mq.remove(prot)
+                if float(pep) >= filter_value:
+                    filtered_prots.append(line)
+                    mq.remove(line)
             elif opt == "<=":
-                if not float(pep) <= filter_value:
-                    filtered_prots.append(prot)
-                    mq.remove(prot)
+                if float(pep) > filter_value:
+                    filtered_prots.append(line)
+                    mq.remove(line)
             elif opt == ">":
             #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value)
-                if not float(pep) > filter_value:
-                    filtered_prots.append(prot)
-                    mq.remove(prot)
+                if float(pep) <= filter_value:
+                    filtered_prots.append(line)
+                    mq.remove(line)
             elif opt == ">=":
-                if not float(pep) >= filter_value:
-                    filtered_prots.append(prot)
-                    mq.remove(prot)
+                if float(pep) < filter_value:
+                    filtered_prots.append(line)
+                    mq.remove(line)
             else:
-                if not float(pep) == filter_value:
-                    filtered_prots.append(prot)
-                    mq.remove(prot)
+                if float(pep) != filter_value:
+                    filtered_prots.append(line)
+                    mq.remove(line)
     return mq, filtered_prots #output, trash_file

 if __name__ == "__main__":
--- a/filter_kw_val.xml	Sun Nov 26 18:36:43 2017 -0500
+++ b/filter_kw_val.xml	Fri Feb 16 03:27:43 2018 -0500
@@ -1,13 +1,13 @@
-<tool id="MQoutputfilter" name="Filter out keywords and/or numerical values" version="0.1.0">
-    <description>Filter a file by keywords or values</description>
+<tool id="MQoutputfilter" name="Filter lines by keywords or numerical value" version="0.1.0">
+    <description>Filter a file by keywords or numerical values</description>
     <requirements>
     </requirements>
     <stdio>
         <exit_code range="1:" />
     </stdio>
     <command><![CDATA[
-        python $__tool_directory__/filter_kw_val.py
-        -i "$input1,$header"
+        python $__tool_directory__/filter_kw_val.py
+        -i "$input1,$header"
         -o "$output1"
         --trash_file "$trash_file"

@@ -15,46 +15,47 @@
         #for $i, $key in enumerate($keyword)
             #if $key.k.kw != "None"
                 #if $key.k.kw == "text"
-                    --kw "$key.k.txt" "$key.k.ncol" "$key.match"
+                    --kw "$key.k.txt" "$key.ncol" "$key.match"
                 #else if $key.k.kw == "file"
-                    --kw_file "$key.k.file" "$key.k.ncol" "$key.match"
+                    --kw_file "$key.k.file" "$key.ncol" "$key.match"
                 #end if
             #end if
         #end for
-
+
         ## Number of proteins
         #for $i, $val in enumerate($value)
             #if $val.v.val != "None"
                 --value
                 #if $val.v.val == "Equal"
-                    $val.v.equal "$value.ncol" "="
+                    $val.v.equal "$val.ncol" "="
                 #else if $val.v.val == "Higher"
-                    $val.v.higher "$val.v.ncol" ">"
+                    $val.v.higher "$val.ncol" ">"
                 #else if $val.v.val == "Equal or higher"
-                    $val.v.equal_higher "$val.v.ncol" ">="
+                    $val.v.equal_higher "$val.ncol" ">="
                 #else if $val.v.val == "Lower"
-                    $val.v.lower "$val.v.ncol" "<"
+                    $val.v.lower "$val.ncol" "<"
                 #else
-                    $val.v.equal_lower "$val.v.ncol" "<="
+                    $val.v.equal_lower "$val.ncol" "<="
                 #end if
             #end if
         #end for
-
+
     ]]></command>
     <inputs>
-        <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics results (e.g. output file from MaxQuant or Proline softwares" />
+        <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics identification and/or quantitative results" />
         <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
         <repeat name="keyword" title="Filter by keywords" >
-            <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more detail' />
+            <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
+            <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more details' />
             <conditional name="k" >
                 <param argument="--kw" type="select" label="Filter by keyword" >
                     <option value="None" selected="True">---</option>
-                    <option value="text">Enter keywords</option>
+                    <option value="text">Enter keywords (copy/paste)</option>
                     <option value="file">Choose a file containing keywords</option>
                 </param>
                 <when value="None" />
                 <when value="text" >
-                    <param name="txt" type="text" label="Enter keywords or a file containing keywords to be removed" >
+                    <param name="txt" type="text" label="Copy/paste keywords to be removed" >
                         <sanitizer>
                         <valid initial="string.printable">
                             <remove value="&apos;"/>
@@ -64,16 +65,15 @@
                         </mapping>
                         </sanitizer>
                     </param>
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
                 <when value="file" >
                     <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column on which to apply this filter" help='For example, fill in "c1" if the keyword you want to filter out is expected in the first column' />
                 </when>
             </conditional>
         </repeat>
-
+
         <repeat name="value" title="Filter by value" >
+            <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' />
             <conditional name="v" >
                 <param argument="--val" type="select" label="Filter by value" >
                     <option value="None">---</option>
@@ -87,54 +87,42 @@
                 </when>
                 <when value="Equal" >
                     <param name="equal" type="float" value="" label="Value" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
                 <when value="Higher" >
                     <param type="float" name="higher" value="" label="Value" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
                 <when value="Equal or higher" >
                     <param type="float" name="equal_higher" value="" label="Value" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
                 <when value="Lower" >
                     <param type="float" name="lower" value="" label="Value" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
                 <when value="Equal or lower" >
                     <param type="float" name="equal_lower" value="" label="Value" />
-                    <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' />
                 </when>
             </conditional>
         </repeat>
-
+
     </inputs>
     <outputs>
         <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" />
-        <data name="trash_file" format="tabular" label="Removed proteins from input file" />
+        <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Removed lines" />
     </outputs>
     <tests>
         <test>
-            <param name="input1" value="UnipIDs.txt" />
-            <param name="header" value="false" />
+            <param name="input1" value="Lacombe_et_al_2017_OK.txt" />
+            <param name="header" value="true" />
             <repeat name="keyword">
-                <param name="match" value="false" />
+                <param name="ncol" value="c1" />
+                <param name="match" value="True" />
                 <conditional name="k">
                     <param name="kw" value="text" />
-                    <param name="txt" value="A" />
-                    <param name="ncol" value="c3" />
+                    <param name="txt" value="P04264;P35908;P13645;Q5D862;Q5T749;Q8IW75;P81605;P22531;P59666;P78386" />
                 </conditional>
             </repeat>
-            <repeat name="value">
-                <conditional name="v">
-                    <param name="val" value="Equal or higher"/>
-                    <param name="equal_higher" value="1.0" />
-                    <param name="ncol" value="c2" />
-                </conditional>
-            </repeat>
-            <output name="output1" file="filter_keywords_values_output.txt" />
-            <output name="trash_file" file="filter_keywords_values_removed.txt" />
-        </test>
+            <output name="output1" file="FKW_Lacombe_et_al_2017_OK.txt" />
+            <output name="trash_file" file="Trash_FKW_Lacombe_et_al_2017_OK.txt" />
+        </test>
     </tests>
     <help><![CDATA[
 This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output).
@@ -145,30 +133,30 @@

 Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords.

-- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175
-
-- If you choose to upload a file in a text format in which each line is a keyword, for example:
+- If you choose to fill in the field, the keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175
+
+- If you choose to upload a file in a text format in which each line is a keyword, for example:

- REV
-
- TRYP_PIG
-
- ALDOA_RABBIT
-
+REV
+
+TRYP_PIG
+
+ALDOA_RABBIT
+
 **The line that contains these keywords will be eliminated from input file.**
-
-**Keywords search can be applied by performing either exact match or partial one by using the following option**
-
-- If you choose **Yes**, only the fields that contains exactly the same content will be removed.
-
+
+**Keywords search can be applied by performing either exact match or partial one by using the following option**
+
+- If you choose **Yes**, only the fields that contain exactly the same content will be removed.
+
 - If you choose **No**, all the fields containing the keyword will be removed.
-
+
 For example:
-
+
 **Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" is removed.
-
+
 **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so  on) are removed.
-
+
 **Filter the file by values**

 You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.).
@@ -181,7 +169,7 @@

 * A text file containing the resulting filtered input file.

-* A text file containing the rows removed from the input file.
+* A text file containing the rows removed from the input file.

 -----

@@ -189,8 +177,9 @@

 **Authors**

-T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
-Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
+T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR

 This work has been partially funded through the French National Agency for Research (ANR) IFB project.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/FKW_Lacombe_et_al_2017_OK.txt	Fri Feb 16 03:27:43 2018 -0500
@@ -0,0 +1,154 @@
+Protein accession number (UniProt)	Protein name	Number of peptides (razor + unique)
+
+P15924	Desmoplakin	69
+P02538	Keratin, type II cytoskeletal 6A	53
+P02768	Serum albumin	44
+P08779	Keratin, type I cytoskeletal 16	29
+Q02413	Desmoglein-1	24
+P07355	"Annexin A2;Putative annexin A2-like protein"	22
+P14923	Junction plakoglobin	22
+P02788	Lactotransferrin	21
+Q9HC84	Mucin-5B	21
+P29508	Serpin B3	20
+P63261	Actin, cytoplasmic 2	19
+Q8N1N4	Keratin, type II cytoskeletal 78	18
+Q04695	Keratin, type I cytoskeletal 17	18
+P01876	Ig alpha-1 chain C region	16
+Q01469	Fatty acid-binding protein 5, epidermal	15
+P31944	Caspase-14	15
+P01833	Polymeric immunoglobulin receptor	15
+P06733	Alpha-enolase	15
+P25311	Zinc-alpha-2-glycoprotein	15
+Q15149	Plectin	15
+P19013	Keratin, type II cytoskeletal 4	13
+Q6KB66	Keratin, type II cytoskeletal 80	13
+Q08188	Protein-glutamine gamma-glutamyltransferase E	12
+P13646	Keratin, type I cytoskeletal 13	11
+Q86YZ3	Hornerin	11
+P04259	Keratin, type II cytoskeletal 6B	10
+P02545	"Prelamin-A/C;Lamin-A/C"	10
+P04083	Annexin A1	10
+P11021	78 kDa glucose-regulated protein	10
+P02787	Serotransferrin	9
+P04040	Catalase	9
+P31151	Protein S100-A7	9
+P31947	14-3-3 protein sigma	9
+Q96P63	Serpin B12	9
+P14618	Pyruvate kinase PKM	9
+P60174	Triosephosphate isomerase	9
+Q06830	Peroxiredoxin-1	9
+P01040	Cystatin-A	8
+P05089	Arginase-1	8
+P01834	Ig kappa chain C region	8
+P04406	Glyceraldehyde-3-phosphate dehydrogenase	8
+P0DMV9	Heat shock 70 kDa protein 1B	8
+P13639	Elongation factor 2	8
+P35579	Myosin-9	8
+P68371	Tubulin beta-4B chain	8
+Q8WVV4	Protein POF1B	8
+O75635	Serpin B7	7
+P01857	Ig gamma-1 chain C region	7
+P61626	Lysozyme C	7
+P68363	Tubulin alpha-1B chain	7
+P01009	"Alpha-1-antitrypsin;Short peptide from AAT"	6
+P07900	Heat shock protein HSP 90-alpha	6
+Q9NZH8	Interleukin-36 gamma	6
+O43707	"Alpha-actinin-4;Alpha-actinin-1"	6
+O75223	Gamma-glutamylcyclotransferase	6
+P00338	L-lactate dehydrogenase A chain	6
+P07339	Cathepsin D	6
+P62987	Ubiquitin-60S ribosomal protein L40	6
+P10599	Thioredoxin	6
+Q9UGM3	Deleted in malignant brain tumors 1 protein	6
+Q9UI42	Carboxypeptidase A4	6
+P47929	Galectin-7	5
+Q13867	Bleomycin hydrolase	5
+Q6P4A8	Phospholipase B-like 1	5
+O75369	Filamin-B	5
+P00441	Superoxide dismutase [Cu-Zn]	5
+P04792	Heat shock protein beta-1	5
+P11142	Heat shock cognate 71 kDa protein	5
+P58107	Epiplakin	5
+P60842	Eukaryotic initiation factor 4A-I	5
+P62937	Peptidyl-prolyl cis-trans isomerase A	5
+P63104	14-3-3 protein zeta/delta	5
+Q92820	Gamma-glutamyl hydrolase	5
+O75342	Arachidonate 12-lipoxygenase, 12R-type	4
+P09211	Glutathione S-transferase P	4
+P31025	Lipocalin-1	4
+P48594	Serpin B4	4
+Q14574	Desmocollin-3	4
+Q5T750	Skin-specific protein 32	4
+Q6UWP8	Suprabasin	4
+O60911	Cathepsin L2	4
+P00558	Phosphoglycerate kinase 1	4
+P04075	Fructose-bisphosphate aldolase A	4
+P07384	Calpain-1 catalytic subunit	4
+P0CG05	Ig lambda-2 chain C regions	4
+P18206	Vinculin	4
+P62258	14-3-3 protein epsilon	4
+P68871	Hemoglobin subunit beta	4
+Q9C075	Keratin, type I cytoskeletal 23	4
+A8K2U0	Alpha-2-macroglobulin-like protein 1	3
+P00738	Haptoglobin	3
+P01011	Alpha-1-antichymotrypsin	3
+P02763	Alpha-1-acid glycoprotein 1	3
+P18510	Interleukin-1 receptor antagonist protein	3
+P22528	Cornifin-B	3
+P30740	Leukocyte elastase inhibitor	3
+P80188	Neutrophil gelatinase-associated lipocalin	3
+Q15828	Cystatin-M	3
+Q9HCY8	Protein S100-A14	3
+P01623	Ig kappa chain V-III region	3
+P01877	Ig alpha-2 chain C region	3
+P06396	Gelsolin	3
+P14735	Insulin-degrading enzyme	3
+P20933	N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase	3
+P25788	Proteasome subunit alpha type-3	3
+P26641	Elongation factor 1-gamma	3
+P36952	Serpin B5	3
+P40926	Malate dehydrogenase, mitochondrial	3
+Q9Y6R7	IgGFc-binding protein	3
+O95274	Ly6/PLAUR domain-containing protein 3	2
+P00491	Purine nucleoside phosphorylase	2
+P04080	Cystatin-B	2
+P09972	Fructose-bisphosphate aldolase C	2
+P19012	Keratin, type I cytoskeletal 15	2
+P20930	Filaggrin	2
+Q96FX8	p53 apoptosis effector related to PMP-22	2
+Q9UIV8	Serpin B13	2
+P01625	Ig kappa chain V-IV region Len	2
+P01765	Ig heavy chain V-III region TIL	2
+P01766	Ig heavy chain V-III region BRO	2
+P01860	Ig gamma-3 chain C region	2
+P01871	Ig mu chain C region	2
+P05090	Apolipoprotein D	2
+P06870	Kallikrein-1	2
+P07858	Cathepsin B	2
+P08865	40S ribosomal protein SA	2
+P11279	Lysosome-associated membrane glycoprotein 1	2
+P13473	Lysosome-associated membrane glycoprotein 2	2
+P19971	Thymidine phosphorylase	2
+P23284	Peptidyl-prolyl cis-trans isomerase B	2
+P23396	40S ribosomal protein S3	2
+P25705	ATP synthase subunit alpha, mitochondrial	2
+P27482	Calmodulin-like protein 3	2
+P31949	Protein S100-A11	2
+P40121	Macrophage-capping protein	2
+P42357	Histidine ammonia-lyase	2
+P47756	F-actin-capping protein subunit beta	2
+P48637	Glutathione synthetase	2
+P49720	Proteasome subunit beta type-3	2
+P50395	Rab GDP dissociation inhibitor beta	2
+P59998	Actin-related protein 2/3 complex subunit 4	2
+P61160	Actin-related protein 2	2
+P61916	Epididymal secretory protein E1	2
+P04745	Alpha-amylase 1	23
+Q9NZT1	Calmodulin-like protein 5	8
+P12273	Prolactin-inducible protein	6
+Q96DA0	Zymogen granule protein 16 homolog B	5
+P01036	Cystatin-S	5
+Q8TAX7	Mucin-7	2
+P01037	Cystatin-SN	2
+P09228	Cystatin-SA	2
+
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Lacombe_et_al_2017_OK.txt	Fri Feb 16 03:27:43 2018 -0500
@@ -0,0 +1,165 @@
+Protein accession number (UniProt)	Protein name	Number of peptides (razor + unique)
+P15924	Desmoplakin	69
+P02538	Keratin, type II cytoskeletal 6A	53
+P02768	Serum albumin	44
+P08779	Keratin, type I cytoskeletal 16	29
+Q02413	Desmoglein-1	24
+P07355	"Annexin A2;Putative annexin A2-like protein"	22
+P14923	Junction plakoglobin	22
+P02788	Lactotransferrin	21
+Q9HC84	Mucin-5B	21
+P29508	Serpin B3	20
+P63261	Actin, cytoplasmic 2	19
+Q8N1N4	Keratin, type II cytoskeletal 78	18
+Q04695	Keratin, type I cytoskeletal 17	18
+P01876	Ig alpha-1 chain C region	16
+Q01469	Fatty acid-binding protein 5, epidermal	15
+P31944	Caspase-14	15
+P01833	Polymeric immunoglobulin receptor	15
+P06733	Alpha-enolase	15
+P25311	Zinc-alpha-2-glycoprotein	15
+Q15149	Plectin	15
+P19013	Keratin, type II cytoskeletal 4	13
+Q6KB66	Keratin, type II cytoskeletal 80	13
+Q08188	Protein-glutamine gamma-glutamyltransferase E	12
+P13646	Keratin, type I cytoskeletal 13	11
+Q86YZ3	Hornerin	11
+P04259	Keratin, type II cytoskeletal 6B	10
+P02545	"Prelamin-A/C;Lamin-A/C"	10
+P04083	Annexin A1	10
+P11021	78 kDa glucose-regulated protein	10
+P02787	Serotransferrin	9
+P04040	Catalase	9
+P31151	Protein S100-A7	9
+P31947	14-3-3 protein sigma	9
+Q96P63	Serpin B12	9
+P14618	Pyruvate kinase PKM	9
+P60174	Triosephosphate isomerase	9
+Q06830	Peroxiredoxin-1	9
+P01040	Cystatin-A	8
+P05089	Arginase-1	8
+P01834	Ig kappa chain C region	8
+P04406	Glyceraldehyde-3-phosphate dehydrogenase	8
+P0DMV9	Heat shock 70 kDa protein 1B	8
+P13639	Elongation factor 2	8
+P35579	Myosin-9	8
+P68371	Tubulin beta-4B chain	8
+Q8WVV4	Protein POF1B	8
+O75635	Serpin B7	7
+P01857	Ig gamma-1 chain C region	7
+P61626	Lysozyme C	7
+P68363	Tubulin alpha-1B chain	7
+P01009	"Alpha-1-antitrypsin;Short peptide from AAT"	6
+P07900	Heat shock protein HSP 90-alpha	6
+Q9NZH8	Interleukin-36 gamma	6
+O43707	"Alpha-actinin-4;Alpha-actinin-1"	6
+O75223	Gamma-glutamylcyclotransferase	6
+P00338	L-lactate dehydrogenase A chain	6
+P07339	Cathepsin D	6
+P62987	Ubiquitin-60S ribosomal protein L40	6
+P10599	Thioredoxin	6
+Q9UGM3	Deleted in malignant brain tumors 1 protein	6
+Q9UI42	Carboxypeptidase A4	6
+P47929	Galectin-7	5
+Q13867	Bleomycin hydrolase	5
+Q6P4A8	Phospholipase B-like 1	5
+O75369	Filamin-B	5
+P00441	Superoxide dismutase [Cu-Zn]	5
+P04792	Heat shock protein beta-1	5
+P11142	Heat shock cognate 71 kDa protein	5
+P58107	Epiplakin	5
+P60842	Eukaryotic initiation factor 4A-I	5
+P62937	Peptidyl-prolyl cis-trans isomerase A	5
+P63104	14-3-3 protein zeta/delta	5
+Q92820	Gamma-glutamyl hydrolase	5
+O75342	Arachidonate 12-lipoxygenase, 12R-type	4
+P09211	Glutathione S-transferase P	4
+P31025	Lipocalin-1	4
+P48594	Serpin B4	4
+Q14574	Desmocollin-3	4
+Q5T750	Skin-specific protein 32	4
+Q6UWP8	Suprabasin	4
+O60911	Cathepsin L2	4
+P00558	Phosphoglycerate kinase 1	4
+P04075	Fructose-bisphosphate aldolase A	4
+P07384	Calpain-1 catalytic subunit	4
+P0CG05	Ig lambda-2 chain C regions	4
+P18206	Vinculin	4
+P62258	14-3-3 protein epsilon	4
+P68871	Hemoglobin subunit beta	4
+Q9C075	Keratin, type I cytoskeletal 23	4
+A8K2U0	Alpha-2-macroglobulin-like protein 1	3
+P00738	Haptoglobin	3
+P01011	Alpha-1-antichymotrypsin	3
+P02763	Alpha-1-acid glycoprotein 1	3
+P18510	Interleukin-1 receptor antagonist protein	3
+P22528	Cornifin-B	3
+P30740	Leukocyte elastase inhibitor	3
+P80188	Neutrophil gelatinase-associated lipocalin	3
+Q15828	Cystatin-M	3
+Q9HCY8	Protein S100-A14	3
+P01623	Ig kappa chain V-III region	3
+P01877	Ig alpha-2 chain C region	3
+P06396	Gelsolin	3
+P14735	Insulin-degrading enzyme	3
+P20933	N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase	3
+P25788	Proteasome subunit alpha type-3	3
+P26641	Elongation factor 1-gamma	3
+P36952	Serpin B5	3
+P40926	Malate dehydrogenase, mitochondrial	3
+Q9Y6R7	IgGFc-binding protein	3
+O95274	Ly6/PLAUR domain-containing protein 3	2
+P00491	Purine nucleoside phosphorylase	2
+P04080	Cystatin-B	2
+P09972	Fructose-bisphosphate aldolase C	2
+P19012	Keratin, type I cytoskeletal 15	2
+P20930	Filaggrin	2
+Q96FX8	p53 apoptosis effector related to PMP-22	2
+Q9UIV8	Serpin B13	2
+P01625	Ig kappa chain V-IV region Len	2
+P01765	Ig heavy chain V-III region TIL	2
+P01766	Ig heavy chain V-III region BRO	2
+P01860	Ig gamma-3 chain C region	2
+P01871	Ig mu chain C region	2
+P05090	Apolipoprotein D	2
+P06870	Kallikrein-1	2
+P07858	Cathepsin B	2
+P08865	40S ribosomal protein SA	2
+P11279	Lysosome-associated membrane glycoprotein 1	2
+P13473	Lysosome-associated membrane glycoprotein 2	2
+P19971	Thymidine phosphorylase	2
+P23284	Peptidyl-prolyl cis-trans isomerase B	2
+P23396	40S ribosomal protein S3	2
+P25705	ATP synthase subunit alpha, mitochondrial	2
+P27482	Calmodulin-like protein 3	2
+P31949	Protein S100-A11	2
+P40121	Macrophage-capping protein	2
+P42357	Histidine ammonia-lyase	2
+P47756	F-actin-capping protein subunit beta	2
+P48637	Glutathione synthetase	2
+P49720	Proteasome subunit beta type-3	2
+P50395	Rab GDP dissociation inhibitor beta	2
+P59998	Actin-related protein 2/3 complex subunit 4	2
+P61160	Actin-related protein 2	2
+P61916	Epididymal secretory protein E1	2
+P04745	Alpha-amylase 1	23
+Q9NZT1	Calmodulin-like protein 5	8
+P12273	Prolactin-inducible protein	6
+Q96DA0	Zymogen granule protein 16 homolog B	5
+P01036	Cystatin-S	5
+Q8TAX7	Mucin-7	2
+P01037	Cystatin-SN	2
+P09228	Cystatin-SA	2
+P04264	Keratin, type II cytoskeletal 1	61
+P35908	Keratin, type II cytoskeletal 2 epidermal	40
+P13645	Keratin, type I cytoskeletal 10	40
+Q5D862	Filaggrin-2	14
+Q5T749	Keratinocyte proline-rich protein	13
+Q8IW75	Serpin A12	3
+P81605	Dermcidin	3
+P22531	Small proline-rich protein 2E	3
+P59666	Neutrophil defensin 3	2
+P78386	Keratin, type II cuticular Hb5	2
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Trash_FKW_Lacombe_et_al_2017_OK.txt	Fri Feb 16 03:27:43 2018 -0500
@@ -0,0 +1,12 @@
+Protein accession number (UniProt)	Protein name	Number of peptides (razor + unique)
+
+P04264	Keratin, type II cytoskeletal 1	61
+P35908	Keratin, type II cytoskeletal 2 epidermal	40
+P13645	Keratin, type I cytoskeletal 10	40
+Q5D862	Filaggrin-2	14
+Q5T749	Keratinocyte proline-rich protein	13
+Q8IW75	Serpin A12	3
+P81605	Dermcidin	3
+P22531	Small proline-rich protein 2E	3
+P59666	Neutrophil defensin 3	2
+P78386	Keratin, type II cuticular Hb5	2
\ No newline at end of file
--- a/test-data/UnipIDs.txt	Sun Nov 26 18:36:43 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-P04637
-P08246
-P63244
-P10275
-P00533
-Q14524
-P05067
-P35555
-P35222
-O95273
-P00451
-P38398
-Q05086
-Q12802
-P68871
-P04585
-Q96EB6
-Q9NYL2
-P31749
-P01137
-Q5S007
-Q08379
-P02649
-P35498
-P12931
--- a/test-data/filter_keywords_values_output.txt	Sun Nov 26 18:36:43 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-P08246	2	B0
-P63244	1.5	C1
-Q14524	3.5	D1
-P05067	1	B3
-P00451	2	B2
-P38398	5	B4
-Q12802	3	D5
-P68871	1.5	B4
-P04585	2.5	D3
-Q9NYL2	1	B1
-P01137	5	B6
-Q5S007	8	D4
-Q08379	2	C4
-P35498	1	C5
--- a/test-data/filter_keywords_values_removed.txt	Sun Nov 26 18:36:43 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-P04637	1	A0
-P10275	3	A2
-P00533	2	A3
-O95273	1.1	A4
-P31749	3	A1
-P12931	3	A5
-P35555	0	C0
-P35222	0.9	D2
-Q05086	0	C2
-Q96EB6	0	C3
-P02649	0	B5