Mercurial > repos > proteore > filter_keywords_values
changeset 1:d29e469b6b20 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author | proteore |
---|---|
date | Fri, 16 Feb 2018 03:27:43 -0500 |
parents | 6a45ccfc0e4c |
children | 330d6c7b1916 |
files | filter_kw_val.py filter_kw_val.xml test-data/FKW_Lacombe_et_al_2017_OK.txt test-data/Lacombe_et_al_2017_OK.txt test-data/Trash_FKW_Lacombe_et_al_2017_OK.txt test-data/UnipIDs.txt test-data/filter_keywords_values_output.txt test-data/filter_keywords_values_removed.txt |
diffstat | 8 files changed, 448 insertions(+), 165 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_kw_val.py Sun Nov 26 18:36:43 2017 -0500 +++ b/filter_kw_val.py Fri Feb 16 03:27:43 2018 -0500 @@ -3,6 +3,9 @@ def options(): + """ + Parse options + """ parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="Input file", required=True) parser.add_argument("-m", "--match", help="Exact macth") @@ -16,28 +19,35 @@ filters(args) - # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" - + # python filter2.py -i "/projet/galaxydev/galaxy/tools/proteore_uc1/proteinGroups_Maud.txt" + # --protein_IDs "A2A288:A8K2U0" --peptides 2 "=" -o "test-data/output_MQfilter.txt" + -def isnumber(format, n): +def isnumber(number_format, n): + """ + Check if a variable is a float or an integer + """ float_format = re.compile("^[\-]?[1-9][0-9]*\.?[0-9]+$") int_format = re.compile("^[\-]?[1-9][0-9]*$") test = "" - if format == "int": + if number_format == "int": test = re.match(int_format, n) - elif format == "float": + elif number_format == "float": test = re.match(float_format, n) if test: return True - else: - return False +# else: +# return False def filters(args): + """ + Filter the document + """ MQfilename = args.input.split(",")[0] header = args.input.split(",")[1] MQfile = readMQ(MQfilename) results = [MQfile, None] - + if args.kw: keywords = args.kw for k in keywords: @@ -56,26 +66,22 @@ # Write results to output output = open(args.output, "w") - output.write("".join(results[0])) + output.write("\n".join(results[0])) output.close() # Write deleted lines to trash_file trash = open(args.trash_file, "w") - #print("".join(results[1])) - trash.write("".join(results[1])) + trash.write("\n".join(results[1])) trash.close() def readOption(filename): f = open(filename, "r") - file = f.read() - #print(file) - filter_list = file.split("\n") - #print(filter_list) + file_content = f.read() + filter_list = file_content.split("\n") filters = 
"" for i in filter_list: - filters += i + ":" + filters += i + ";" filters = filters[:-1] - #print(filters) return filters def readMQ(MQfilename): @@ -83,97 +89,104 @@ mqfile = open(MQfilename, "r") mq = mqfile.readlines() # Remove empty lines (contain only space or new line or "") - [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] + [mq.remove(blank) for blank in mq if blank.isspace() or blank == ""] return mq - + def filter_keyword(MQfile, header, filtered_lines, ids, ncol, match): mq = MQfile if isnumber("int", ncol.replace("c", "")): id_index = int(ncol.replace("c", "")) - 1 #columns.index("Majority protein IDs") else: - raise ValueError("Please specify the column where you would like to apply the filter with valid format") - - ids = ids.upper().split(":") + raise ValueError("Please specify the column where " + "you would like to apply the filter " + "with valid format") + + # Split list of filter IDs + ids = ids.upper().split(";") + # Remove blank IDs [ids.remove(blank) for blank in ids if blank.isspace() or blank == ""] - + # Remove space from 2 heads of IDs + ids = [id.strip() for id in ids] + + if header == "true": header = mq[0] content = mq[1:] else: header = "" content = mq[:] - + if not filtered_lines: # In case there is already some filtered lines from other filters filtered_lines = [] if header != "": filtered_lines.append(header) - for line in content: + for line in content: + line = line.replace("\n", "") id_inline = line.split("\t")[id_index].replace('"', "").split(";") one_id_line = line.replace(line.split("\t")[id_index], id_inline[0]) # Take only first IDs - + line = line + "\n" + if match != "false": # Filter protein IDs - if any (pid.upper() in ids for pid in id_inline): - #ids = prot_ids.split(":") - #print(prot_ids.split(":")) - #if prot_id in ids: + if any(pid.upper() in ids for pid in id_inline): filtered_lines.append(one_id_line) mq.remove(line) else: mq[mq.index(line)] = one_id_line else: - if any (ft in pid.upper() 
for pid in id_inline for ft in ids): + if any(ft in pid.upper() for pid in id_inline for ft in ids): filtered_lines.append(one_id_line) mq.remove(line) else: mq[mq.index(line)] = one_id_line return mq, filtered_lines - + def filter_value(MQfile, header, filtered_prots, filter_value, ncol, opt): mq = MQfile if ncol and isnumber("int", ncol.replace("c", "")): #"Gene names" in columns: index = int(ncol.replace("c", "")) - 1 #columns.index("Gene names") else: - raise ValueError("Please specify the column where you would like to apply the filter with valid format") - + raise ValueError("Please specify the column where " + "you would like to apply the filter " + "with valid format") if header == "true": header = mq[0] content = mq[1:] else: header = "" content = mq[:] - if not filtered_prots: # In case there is already some filtered lines from other filters filtered_prots = [] if header != "": filtered_prots.append(header) - - for prot in content: + + for line in content: + prot = line.replace("\n","") filter_value = float(filter_value) pep = prot.split("\t")[index].replace('"', "") if pep.replace(".", "", 1).isdigit(): if opt == "<": - if not float(pep) < filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) >= filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == "<=": - if not float(pep) <= filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) > filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == ">": #print(prot.number_of_prots, filter_value, int(prot.number_of_prots) > filter_value) - if not float(pep) > filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) <= filter_value: + filtered_prots.append(line) + mq.remove(line) elif opt == ">=": - if not float(pep) >= filter_value: - filtered_prots.append(prot) - mq.remove(prot) + if float(pep) < filter_value: + filtered_prots.append(line) + mq.remove(line) else: - if not float(pep) == filter_value: - 
filtered_prots.append(prot) - mq.remove(prot) + if float(pep) != filter_value: + filtered_prots.append(line) + mq.remove(line) return mq, filtered_prots #output, trash_file if __name__ == "__main__":
--- a/filter_kw_val.xml Sun Nov 26 18:36:43 2017 -0500 +++ b/filter_kw_val.xml Fri Feb 16 03:27:43 2018 -0500 @@ -1,13 +1,13 @@ -<tool id="MQoutputfilter" name="Filter out keywords and/or numerical values" version="0.1.0"> - <description>Filter a file by keywords or values</description> +<tool id="MQoutputfilter" name="Filter lines by keywords or numerical value" version="0.1.0"> + <description>Filter a file by keywords or numerical values</description> <requirements> </requirements> <stdio> <exit_code range="1:" /> </stdio> <command><![CDATA[ - python $__tool_directory__/filter_kw_val.py - -i "$input1,$header" + python $__tool_directory__/filter_kw_val.py + -i "$input1,$header" -o "$output1" --trash_file "$trash_file" @@ -15,46 +15,47 @@ #for $i, $key in enumerate($keyword) #if $key.k.kw != "None" #if $key.k.kw == "text" - --kw "$key.k.txt" "$key.k.ncol" "$key.match" + --kw "$key.k.txt" "$key.ncol" "$key.match" #else if $key.k.kw == "file" - --kw_file "$key.k.file" "$key.k.ncol" "$key.match" + --kw_file "$key.k.file" "$key.ncol" "$key.match" #end if #end if #end for - + ## Number of proteins #for $i, $val in enumerate($value) #if $val.v.val != "None" --value #if $val.v.val == "Equal" - $val.v.equal "$value.ncol" "=" + $val.v.equal "$val.ncol" "=" #else if $val.v.val == "Higher" - $val.v.higher "$val.v.ncol" ">" + $val.v.higher "$val.ncol" ">" #else if $val.v.val == "Equal or higher" - $val.v.equal_higher "$val.v.ncol" ">=" + $val.v.equal_higher "$val.ncol" ">=" #else if $val.v.val == "Lower" - $val.v.lower "$val.v.ncol" "<" + $val.v.lower "$val.ncol" "<" #else - $val.v.equal_lower "$val.v.ncol" "<=" + $val.v.equal_lower "$val.ncol" "<=" #end if #end if #end for - + ]]></command> <inputs> - <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics results (e.g. 
output file from MaxQuant or Proline softwares" /> + <param type="data" name="input1" format="txt,tabular" label="Input file" help="Input file is a tab-delimited file containing proteomics identification and/or quantitative results" /> <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" /> <repeat name="keyword" title="Filter by keywords" > - <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. case sensitive), see below for more detail' /> + <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> + <param type="boolean" name="match" truevalue="True" label="Would you like to search for exact match?" help='Choosing "Yes" will only filter out exact match (i.e. 
case sensitive), see below for more details' /> <conditional name="k" > <param argument="--kw" type="select" label="Filter by keyword" > <option value="None" selected="True">---</option> - <option value="text">Enter keywords</option> + <option value="text">Enter keywords (copy/paste)</option> <option value="file">Choose a file containing keywords</option> </param> <when value="None" /> <when value="text" > - <param name="txt" type="text" label="Enter keywords or a file containing keywords to be removed" > + <param name="txt" type="text" label="Copy/paste keywords to be removed" > <sanitizer> <valid initial="string.printable"> <remove value="'"/> @@ -64,16 +65,15 @@ </mapping> </sanitizer> </param> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> </when> <when value="file" > <param name="file" type="data" format="txt,tabular" label="Choose a file containing keywords" /> - <param name="ncol" type="text" value="c1" label="Please specify the column on which to apply this filter" help='For example, fill in "c1" if the keyword you want to filter out is expected in the first column' /> </when> </conditional> </repeat> - + <repeat name="value" title="Filter by value" > + <param name="ncol" type="text" value="c1" label="Please specify the column number of the input file on which you want to apply the filter" help='For example, fill in "c1" if the keywords you want to filter out are listed in the first column' /> <conditional name="v" > <param argument="--val" type="select" label="Filter by value" > <option value="None">---</option> @@ -87,54 +87,42 @@ </when> <when value="Equal" > <param name="equal" type="float" value="" label="Value" /> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> 
</when> <when value="Higher" > <param type="float" name="higher" value="" label="Value" /> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> </when> <when value="Equal or higher" > <param type="float" name="equal_higher" value="" label="Value" /> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> </when> <when value="Lower" > <param type="float" name="lower" value="" label="Value" /> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> </when> <when value="Equal or lower" > <param type="float" name="equal_lower" value="" label="Value" /> - <param name="ncol" type="text" value="c1" label="Please specify the column where you would like to apply this filter" help='For example, fill in "c1" if you want to filter the first column' /> </when> </conditional> </repeat> - + </inputs> <outputs> <data name="output1" format="tabular" label="${tool.name} on ${input1.name}" /> - <data name="trash_file" format="tabular" label="Removed proteins from input file" /> + <data name="trash_file" format="tabular" label="${tool.name} on ${input1.name} - Removed lines" /> </outputs> <tests> <test> - <param name="input1" value="UnipIDs.txt" /> - <param name="header" value="false" /> + <param name="input1" value="Lacombe_et_al_2017_OK.txt" /> + <param name="header" value="true" /> <repeat name="keyword"> - <param name="match" value="false" /> + <param name="ncol" value="c1" /> + <param name="match" value="True" /> <conditional name="k"> <param name="kw" value="text" /> - <param name="txt" value="A" /> - <param name="ncol" value="c3" /> + <param name="txt" 
value="P04264;P35908;P13645;Q5D862;Q5T749;Q8IW75;P81605;P22531;P59666;P78386" /> </conditional> </repeat> - <repeat name="value"> - <conditional name="v"> - <param name="val" value="Equal or higher"/> - <param name="equal_higher" value="1.0" /> - <param name="ncol" value="c2" /> - </conditional> - </repeat> - <output name="output1" file="filter_keywords_values_output.txt" /> - <output name="trash_file" file="filter_keywords_values_removed.txt" /> - </test> + <output name="output1" file="FKW_Lacombe_et_al_2017_OK.txt" /> + <output name="trash_file" file="Trash_FKW_Lacombe_et_al_2017_OK.txt" /> + </test> </tests> <help><![CDATA[ This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). @@ -145,30 +133,30 @@ Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords. -- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175 - -- If you choose to upload a file in a text format in which each line is a keyword, for example: +- If you choose to fill in the field, the keywords should be separated by ":", for example: A8K2U0:Q5TA79:O43175 + +- If you choose to upload a file in a text format in which each line is a keyword, for example: - REV - - TRYP_PIG - - ALDOA_RABBIT - +REV + +TRYP_PIG + +ALDOA_RABBIT + **The line that contains these keywords will be eliminated from input file.** - -**Keywords search can be applied by performing either exact match or partial one by using the following option** - -- If you choose **Yes**, only the fields that contains exactly the same content will be removed. - + +**Keywords search can be applied by performing either exact match or partial one by using the following option** + +- If you choose **Yes**, only the fields that contains exactly the same content will be removed. 
+ - If you choose **No**, all the fields containing the keyword will be removed. - + For example: - + **Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are removed. - + **No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. - + **Filter the file by values** You can choose to use one or more options (e.g. to filter out peptides of low intensity value, by q-value, etc.). @@ -181,7 +169,7 @@ * A text file containing the resulting filtered input file. -* A text file containing the rows removed from the input file. +* A text file containing the rows removed from the input file. ----- @@ -189,8 +177,9 @@ **Authors** -T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR -Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform +T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR + +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR This work has been partially funded through the French National Agency for Research (ANR) IFB project.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/FKW_Lacombe_et_al_2017_OK.txt Fri Feb 16 03:27:43 2018 -0500 @@ -0,0 +1,154 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) + +P15924 Desmoplakin 69 +P02538 Keratin, type II cytoskeletal 6A 53 +P02768 Serum albumin 44 +P08779 Keratin, type I cytoskeletal 16 29 +Q02413 Desmoglein-1 24 +P07355 "Annexin A2;Putative annexin A2-like protein" 22 +P14923 Junction plakoglobin 22 +P02788 Lactotransferrin 21 +Q9HC84 Mucin-5B 21 +P29508 Serpin B3 20 +P63261 Actin, cytoplasmic 2 19 +Q8N1N4 Keratin, type II cytoskeletal 78 18 +Q04695 Keratin, type I cytoskeletal 17 18 +P01876 Ig alpha-1 chain C region 16 +Q01469 Fatty acid-binding protein 5, epidermal 15 +P31944 Caspase-14 15 +P01833 Polymeric immunoglobulin receptor 15 +P06733 Alpha-enolase 15 +P25311 Zinc-alpha-2-glycoprotein 15 +Q15149 Plectin 15 +P19013 Keratin, type II cytoskeletal 4 13 +Q6KB66 Keratin, type II cytoskeletal 80 13 +Q08188 Protein-glutamine gamma-glutamyltransferase E 12 +P13646 Keratin, type I cytoskeletal 13 11 +Q86YZ3 Hornerin 11 +P04259 Keratin, type II cytoskeletal 6B 10 +P02545 "Prelamin-A/C;Lamin-A/C" 10 +P04083 Annexin A1 10 +P11021 78 kDa glucose-regulated protein 10 +P02787 Serotransferrin 9 +P04040 Catalase 9 +P31151 Protein S100-A7 9 +P31947 14-3-3 protein sigma 9 +Q96P63 Serpin B12 9 +P14618 Pyruvate kinase PKM 9 +P60174 Triosephosphate isomerase 9 +Q06830 Peroxiredoxin-1 9 +P01040 Cystatin-A 8 +P05089 Arginase-1 8 +P01834 Ig kappa chain C region 8 +P04406 Glyceraldehyde-3-phosphate dehydrogenase 8 +P0DMV9 Heat shock 70 kDa protein 1B 8 +P13639 Elongation factor 2 8 +P35579 Myosin-9 8 +P68371 Tubulin beta-4B chain 8 +Q8WVV4 Protein POF1B 8 +O75635 Serpin B7 7 +P01857 Ig gamma-1 chain C region 7 +P61626 Lysozyme C 7 +P68363 Tubulin alpha-1B chain 7 +P01009 "Alpha-1-antitrypsin;Short peptide from AAT" 6 +P07900 Heat shock protein HSP 90-alpha 6 +Q9NZH8 Interleukin-36 gamma 6 +O43707 
"Alpha-actinin-4;Alpha-actinin-1" 6 +O75223 Gamma-glutamylcyclotransferase 6 +P00338 L-lactate dehydrogenase A chain 6 +P07339 Cathepsin D 6 +P62987 Ubiquitin-60S ribosomal protein L40 6 +P10599 Thioredoxin 6 +Q9UGM3 Deleted in malignant brain tumors 1 protein 6 +Q9UI42 Carboxypeptidase A4 6 +P47929 Galectin-7 5 +Q13867 Bleomycin hydrolase 5 +Q6P4A8 Phospholipase B-like 1 5 +O75369 Filamin-B 5 +P00441 Superoxide dismutase [Cu-Zn] 5 +P04792 Heat shock protein beta-1 5 +P11142 Heat shock cognate 71 kDa protein 5 +P58107 Epiplakin 5 +P60842 Eukaryotic initiation factor 4A-I 5 +P62937 Peptidyl-prolyl cis-trans isomerase A 5 +P63104 14-3-3 protein zeta/delta 5 +Q92820 Gamma-glutamyl hydrolase 5 +O75342 Arachidonate 12-lipoxygenase, 12R-type 4 +P09211 Glutathione S-transferase P 4 +P31025 Lipocalin-1 4 +P48594 Serpin B4 4 +Q14574 Desmocollin-3 4 +Q5T750 Skin-specific protein 32 4 +Q6UWP8 Suprabasin 4 +O60911 Cathepsin L2 4 +P00558 Phosphoglycerate kinase 1 4 +P04075 Fructose-bisphosphate aldolase A 4 +P07384 Calpain-1 catalytic subunit 4 +P0CG05 Ig lambda-2 chain C regions 4 +P18206 Vinculin 4 +P62258 14-3-3 protein epsilon 4 +P68871 Hemoglobin subunit beta 4 +Q9C075 Keratin, type I cytoskeletal 23 4 +A8K2U0 Alpha-2-macroglobulin-like protein 1 3 +P00738 Haptoglobin 3 +P01011 Alpha-1-antichymotrypsin 3 +P02763 Alpha-1-acid glycoprotein 1 3 +P18510 Interleukin-1 receptor antagonist protein 3 +P22528 Cornifin-B 3 +P30740 Leukocyte elastase inhibitor 3 +P80188 Neutrophil gelatinase-associated lipocalin 3 +Q15828 Cystatin-M 3 +Q9HCY8 Protein S100-A14 3 +P01623 Ig kappa chain V-III region 3 +P01877 Ig alpha-2 chain C region 3 +P06396 Gelsolin 3 +P14735 Insulin-degrading enzyme 3 +P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3 +P25788 Proteasome subunit alpha type-3 3 +P26641 Elongation factor 1-gamma 3 +P36952 Serpin B5 3 +P40926 Malate dehydrogenase, mitochondrial 3 +Q9Y6R7 IgGFc-binding protein 3 +O95274 Ly6/PLAUR domain-containing protein 3 2 +P00491 Purine 
nucleoside phosphorylase 2 +P04080 Cystatin-B 2 +P09972 Fructose-bisphosphate aldolase C 2 +P19012 Keratin, type I cytoskeletal 15 2 +P20930 Filaggrin 2 +Q96FX8 p53 apoptosis effector related to PMP-22 2 +Q9UIV8 Serpin B13 2 +P01625 Ig kappa chain V-IV region Len 2 +P01765 Ig heavy chain V-III region TIL 2 +P01766 Ig heavy chain V-III region BRO 2 +P01860 Ig gamma-3 chain C region 2 +P01871 Ig mu chain C region 2 +P05090 Apolipoprotein D 2 +P06870 Kallikrein-1 2 +P07858 Cathepsin B 2 +P08865 40S ribosomal protein SA 2 +P11279 Lysosome-associated membrane glycoprotein 1 2 +P13473 Lysosome-associated membrane glycoprotein 2 2 +P19971 Thymidine phosphorylase 2 +P23284 Peptidyl-prolyl cis-trans isomerase B 2 +P23396 40S ribosomal protein S3 2 +P25705 ATP synthase subunit alpha, mitochondrial 2 +P27482 Calmodulin-like protein 3 2 +P31949 Protein S100-A11 2 +P40121 Macrophage-capping protein 2 +P42357 Histidine ammonia-lyase 2 +P47756 F-actin-capping protein subunit beta 2 +P48637 Glutathione synthetase 2 +P49720 Proteasome subunit beta type-3 2 +P50395 Rab GDP dissociation inhibitor beta 2 +P59998 Actin-related protein 2/3 complex subunit 4 2 +P61160 Actin-related protein 2 2 +P61916 Epididymal secretory protein E1 2 +P04745 Alpha-amylase 1 23 +Q9NZT1 Calmodulin-like protein 5 8 +P12273 Prolactin-inducible protein 6 +Q96DA0 Zymogen granule protein 16 homolog B 5 +P01036 Cystatin-S 5 +Q8TAX7 Mucin-7 2 +P01037 Cystatin-SN 2 +P09228 Cystatin-SA 2 + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Lacombe_et_al_2017_OK.txt Fri Feb 16 03:27:43 2018 -0500 @@ -0,0 +1,165 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) +P15924 Desmoplakin 69 +P02538 Keratin, type II cytoskeletal 6A 53 +P02768 Serum albumin 44 +P08779 Keratin, type I cytoskeletal 16 29 +Q02413 Desmoglein-1 24 +P07355 "Annexin A2;Putative annexin A2-like protein" 22 +P14923 Junction plakoglobin 22 +P02788 Lactotransferrin 21 +Q9HC84 Mucin-5B 21 +P29508 Serpin B3 20 +P63261 Actin, cytoplasmic 2 19 +Q8N1N4 Keratin, type II cytoskeletal 78 18 +Q04695 Keratin, type I cytoskeletal 17 18 +P01876 Ig alpha-1 chain C region 16 +Q01469 Fatty acid-binding protein 5, epidermal 15 +P31944 Caspase-14 15 +P01833 Polymeric immunoglobulin receptor 15 +P06733 Alpha-enolase 15 +P25311 Zinc-alpha-2-glycoprotein 15 +Q15149 Plectin 15 +P19013 Keratin, type II cytoskeletal 4 13 +Q6KB66 Keratin, type II cytoskeletal 80 13 +Q08188 Protein-glutamine gamma-glutamyltransferase E 12 +P13646 Keratin, type I cytoskeletal 13 11 +Q86YZ3 Hornerin 11 +P04259 Keratin, type II cytoskeletal 6B 10 +P02545 "Prelamin-A/C;Lamin-A/C" 10 +P04083 Annexin A1 10 +P11021 78 kDa glucose-regulated protein 10 +P02787 Serotransferrin 9 +P04040 Catalase 9 +P31151 Protein S100-A7 9 +P31947 14-3-3 protein sigma 9 +Q96P63 Serpin B12 9 +P14618 Pyruvate kinase PKM 9 +P60174 Triosephosphate isomerase 9 +Q06830 Peroxiredoxin-1 9 +P01040 Cystatin-A 8 +P05089 Arginase-1 8 +P01834 Ig kappa chain C region 8 +P04406 Glyceraldehyde-3-phosphate dehydrogenase 8 +P0DMV9 Heat shock 70 kDa protein 1B 8 +P13639 Elongation factor 2 8 +P35579 Myosin-9 8 +P68371 Tubulin beta-4B chain 8 +Q8WVV4 Protein POF1B 8 +O75635 Serpin B7 7 +P01857 Ig gamma-1 chain C region 7 +P61626 Lysozyme C 7 +P68363 Tubulin alpha-1B chain 7 +P01009 "Alpha-1-antitrypsin;Short peptide from AAT" 6 +P07900 Heat shock protein HSP 90-alpha 6 +Q9NZH8 Interleukin-36 gamma 6 +O43707 
"Alpha-actinin-4;Alpha-actinin-1" 6 +O75223 Gamma-glutamylcyclotransferase 6 +P00338 L-lactate dehydrogenase A chain 6 +P07339 Cathepsin D 6 +P62987 Ubiquitin-60S ribosomal protein L40 6 +P10599 Thioredoxin 6 +Q9UGM3 Deleted in malignant brain tumors 1 protein 6 +Q9UI42 Carboxypeptidase A4 6 +P47929 Galectin-7 5 +Q13867 Bleomycin hydrolase 5 +Q6P4A8 Phospholipase B-like 1 5 +O75369 Filamin-B 5 +P00441 Superoxide dismutase [Cu-Zn] 5 +P04792 Heat shock protein beta-1 5 +P11142 Heat shock cognate 71 kDa protein 5 +P58107 Epiplakin 5 +P60842 Eukaryotic initiation factor 4A-I 5 +P62937 Peptidyl-prolyl cis-trans isomerase A 5 +P63104 14-3-3 protein zeta/delta 5 +Q92820 Gamma-glutamyl hydrolase 5 +O75342 Arachidonate 12-lipoxygenase, 12R-type 4 +P09211 Glutathione S-transferase P 4 +P31025 Lipocalin-1 4 +P48594 Serpin B4 4 +Q14574 Desmocollin-3 4 +Q5T750 Skin-specific protein 32 4 +Q6UWP8 Suprabasin 4 +O60911 Cathepsin L2 4 +P00558 Phosphoglycerate kinase 1 4 +P04075 Fructose-bisphosphate aldolase A 4 +P07384 Calpain-1 catalytic subunit 4 +P0CG05 Ig lambda-2 chain C regions 4 +P18206 Vinculin 4 +P62258 14-3-3 protein epsilon 4 +P68871 Hemoglobin subunit beta 4 +Q9C075 Keratin, type I cytoskeletal 23 4 +A8K2U0 Alpha-2-macroglobulin-like protein 1 3 +P00738 Haptoglobin 3 +P01011 Alpha-1-antichymotrypsin 3 +P02763 Alpha-1-acid glycoprotein 1 3 +P18510 Interleukin-1 receptor antagonist protein 3 +P22528 Cornifin-B 3 +P30740 Leukocyte elastase inhibitor 3 +P80188 Neutrophil gelatinase-associated lipocalin 3 +Q15828 Cystatin-M 3 +Q9HCY8 Protein S100-A14 3 +P01623 Ig kappa chain V-III region 3 +P01877 Ig alpha-2 chain C region 3 +P06396 Gelsolin 3 +P14735 Insulin-degrading enzyme 3 +P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3 +P25788 Proteasome subunit alpha type-3 3 +P26641 Elongation factor 1-gamma 3 +P36952 Serpin B5 3 +P40926 Malate dehydrogenase, mitochondrial 3 +Q9Y6R7 IgGFc-binding protein 3 +O95274 Ly6/PLAUR domain-containing protein 3 2 +P00491 Purine 
nucleoside phosphorylase 2 +P04080 Cystatin-B 2 +P09972 Fructose-bisphosphate aldolase C 2 +P19012 Keratin, type I cytoskeletal 15 2 +P20930 Filaggrin 2 +Q96FX8 p53 apoptosis effector related to PMP-22 2 +Q9UIV8 Serpin B13 2 +P01625 Ig kappa chain V-IV region Len 2 +P01765 Ig heavy chain V-III region TIL 2 +P01766 Ig heavy chain V-III region BRO 2 +P01860 Ig gamma-3 chain C region 2 +P01871 Ig mu chain C region 2 +P05090 Apolipoprotein D 2 +P06870 Kallikrein-1 2 +P07858 Cathepsin B 2 +P08865 40S ribosomal protein SA 2 +P11279 Lysosome-associated membrane glycoprotein 1 2 +P13473 Lysosome-associated membrane glycoprotein 2 2 +P19971 Thymidine phosphorylase 2 +P23284 Peptidyl-prolyl cis-trans isomerase B 2 +P23396 40S ribosomal protein S3 2 +P25705 ATP synthase subunit alpha, mitochondrial 2 +P27482 Calmodulin-like protein 3 2 +P31949 Protein S100-A11 2 +P40121 Macrophage-capping protein 2 +P42357 Histidine ammonia-lyase 2 +P47756 F-actin-capping protein subunit beta 2 +P48637 Glutathione synthetase 2 +P49720 Proteasome subunit beta type-3 2 +P50395 Rab GDP dissociation inhibitor beta 2 +P59998 Actin-related protein 2/3 complex subunit 4 2 +P61160 Actin-related protein 2 2 +P61916 Epididymal secretory protein E1 2 +P04745 Alpha-amylase 1 23 +Q9NZT1 Calmodulin-like protein 5 8 +P12273 Prolactin-inducible protein 6 +Q96DA0 Zymogen granule protein 16 homolog B 5 +P01036 Cystatin-S 5 +Q8TAX7 Mucin-7 2 +P01037 Cystatin-SN 2 +P09228 Cystatin-SA 2 +P04264 Keratin, type II cytoskeletal 1 61 +P35908 Keratin, type II cytoskeletal 2 epidermal 40 +P13645 Keratin, type I cytoskeletal 10 40 +Q5D862 Filaggrin-2 14 +Q5T749 Keratinocyte proline-rich protein 13 +Q8IW75 Serpin A12 3 +P81605 Dermcidin 3 +P22531 Small proline-rich protein 2E 3 +P59666 Neutrophil defensin 3 2 +P78386 Keratin, type II cuticular Hb5 2 + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Trash_FKW_Lacombe_et_al_2017_OK.txt Fri Feb 16 03:27:43 2018 -0500 @@ -0,0 +1,12 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) + +P04264 Keratin, type II cytoskeletal 1 61 +P35908 Keratin, type II cytoskeletal 2 epidermal 40 +P13645 Keratin, type I cytoskeletal 10 40 +Q5D862 Filaggrin-2 14 +Q5T749 Keratinocyte proline-rich protein 13 +Q8IW75 Serpin A12 3 +P81605 Dermcidin 3 +P22531 Small proline-rich protein 2E 3 +P59666 Neutrophil defensin 3 2 +P78386 Keratin, type II cuticular Hb5 2 \ No newline at end of file
--- a/test-data/UnipIDs.txt Sun Nov 26 18:36:43 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -P04637 -P08246 -P63244 -P10275 -P00533 -Q14524 -P05067 -P35555 -P35222 -O95273 -P00451 -P38398 -Q05086 -Q12802 -P68871 -P04585 -Q96EB6 -Q9NYL2 -P31749 -P01137 -Q5S007 -Q08379 -P02649 -P35498 -P12931
--- a/test-data/filter_keywords_values_output.txt Sun Nov 26 18:36:43 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -P08246 2 B0 -P63244 1.5 C1 -Q14524 3.5 D1 -P05067 1 B3 -P00451 2 B2 -P38398 5 B4 -Q12802 3 D5 -P68871 1.5 B4 -P04585 2.5 D3 -Q9NYL2 1 B1 -P01137 5 B6 -Q5S007 8 D4 -Q08379 2 C4 -P35498 1 C5