Mercurial > repos > recetox > matchms
comparison matchms_similarity_wrapper.py @ 0:30e680e555d4 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 4d2ac914c951166e386a94d8ebb8cb1becfac122"
author | recetox |
---|---|
date | Tue, 22 Mar 2022 16:07:32 +0000 |
parents | |
children | f680068b7863 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:30e680e555d4 |
---|---|
1 import argparse | |
2 import sys | |
3 | |
4 import numpy as np | |
5 from matchms import calculate_scores | |
6 from matchms.importing import load_from_mgf, load_from_msp | |
7 from matchms.similarity import CosineGreedy, CosineHungarian, MetadataMatch, ModifiedCosine | |
8 from pandas import DataFrame | |
9 | |
10 | |
def convert_precursor_mz(spectrum):
    """
    Make sure a spectrum carries a numeric precursor m/z, which the ModifiedCosine similarity metric requires.

    The value stored under "precursor_mz" is cast to float and the metadata is assigned back onto the spectrum.

    :param spectrum: object exposing a readable and assignable ``metadata`` mapping.
    :return: the same spectrum object, with ``precursor_mz`` stored as a float.
    :raises ValueError: if the metadata has no "precursor_mz" entry.
    """
    if "precursor_mz" not in spectrum.metadata:
        raise ValueError("Precursor_mz missing. Apply 'add_precursor_mz' filter first.")

    # The metadata is written back explicitly instead of relying on in-place mutation.
    # NOTE(review): presumably the metadata getter may hand out a copy - confirm against matchms.
    updated = spectrum.metadata
    updated["precursor_mz"] = float(updated["precursor_mz"])
    spectrum.metadata = updated
    return spectrum
24 | |
25 | |
def _build_parser():
    """Create the command line parser (argument names, order and help texts as before)."""
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-r", dest="ri_tolerance", type=float, help="Use RI filtering with given tolerance.")
    parser.add_argument("-s", dest="symmetric", action='store_true', help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str, help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str, help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str, help="Path to query spectra.")
    parser.add_argument("queries_format", type=str, help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str, help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float, help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float, help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float, help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str, help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str, help="Path where to store the output .tsv matches.")
    return parser


def _load_spectra(filename, file_format, role):
    """Read every spectrum from ``filename``; ``role`` only names the input in the error message."""
    if file_format == 'msp':
        return list(load_from_msp(filename))
    if file_format == 'mgf':
        return list(load_from_mgf(filename))
    raise ValueError(f'File format {file_format} not supported for {role}.')


def _create_similarity_metric(name, tolerance, mz_power, intensity_power):
    """Instantiate the similarity metric called ``name``; return None if the name is not supported."""
    supported = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = supported.get(name)
    if metric_class is None:
        return None
    return metric_class(tolerance, mz_power, intensity_power)


def main(argv):
    """
    Compute similarity scores between query and reference spectra and store them as .tsv tables.

    :param argv: list of command line arguments without the program name. They are handed to
        ``ArgumentParser.parse_args``, which falls back to ``sys.argv`` when given ``None``.
    :return: 0 on success, -1 if the requested similarity metric is not supported.
    :raises ValueError: if a spectra file format is not 'msp' or 'mgf', or if ModifiedCosine is requested
        and a spectrum has no precursor m/z.
    """
    # Parse the arguments that were actually passed in instead of silently ignoring ``argv``.
    args = _build_parser().parse_args(argv)

    queries_spectra = _load_spectra(args.queries_filename, args.queries_format, 'query spectra')
    if args.symmetric:
        # In symmetric mode the queries are compared against themselves; no library is read.
        reference_spectra = []
    else:
        reference_spectra = _load_spectra(args.references_filename, args.references_format, 'reference spectra library')

    similarity_metric = _create_similarity_metric(
        args.similarity_metric, args.tolerance, args.mz_power, args.intensity_power
    )
    if similarity_metric is None:
        # Keep the historical -1 return value, but tell the user why nothing was computed.
        print(f"Similarity metric {args.similarity_metric} not supported.", file=sys.stderr)
        return -1

    if args.similarity_metric == 'ModifiedCosine':
        # ModifiedCosine needs a numeric precursor m/z on every spectrum.
        reference_spectra = [convert_precursor_mz(spectrum) for spectrum in reference_spectra]
        queries_spectra = [convert_precursor_mz(spectrum) for spectrum in queries_spectra]

    # Resolve the effective references once so scoring and RI filtering work on the same matrix.
    references = queries_spectra if args.symmetric else reference_spectra

    print("Calculating scores...")
    scores = calculate_scores(
        references=references,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric
    )

    if args.ri_tolerance is not None:
        print("RI filtering with tolerance ", args.ri_tolerance)
        # Use the same references as above: in symmetric mode ``reference_spectra`` is empty, so
        # passing it here would produce an RI mask that does not line up with the score matrix.
        ri_matches = calculate_scores(
            references=references,
            queries=queries_spectra,
            similarity_function=MetadataMatch("retention_index", "difference", args.ri_tolerance)
        ).scores
        # Zero out the scores of pairs whose retention indices are not within tolerance.
        scores.scores["score"] = np.where(ri_matches, scores.scores["score"], 0.0)

    write_outputs(args, scores)
    return 0
85 | |
86 | |
def write_outputs(args, scores):
    """
    Store the score table and the match-count table as tab-separated files.

    Rows are labelled with the compound names of the reference spectra and columns with the compound
    names of the query spectra.

    :param args: object providing ``output_filename_scores`` and ``output_filename_matches``.
    :param scores: object providing ``queries``, ``references`` and an iterable ``scores`` whose
        entries can be indexed by "score" and "matches".
    """
    print("Storing outputs...")
    column_labels = [query.metadata['compound_name'] for query in scores.queries]
    row_labels = [reference.metadata['compound_name'] for reference in scores.references]

    # One table per field, written in the same order as before: scores first, then matches.
    targets = (
        ("score", args.output_filename_scores),
        ("matches", args.output_filename_matches),
    )
    for field, destination in targets:
        rows = [entry[field] for entry in scores.scores]
        table = DataFrame(data=rows, index=row_labels, columns=column_labels)
        table.to_csv(destination, sep='\t')
99 | |
100 | |
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status so that a failed run
    # (main returns -1 for an unsupported similarity metric) is not reported as success.
    sys.exit(main(argv=sys.argv[1:]))