comparison formatter.py @ 10:1b09315a3f87 draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
author recetox
date Tue, 27 Jun 2023 14:25:59 +0000
parents 966b4134ad12
children ae45992f969e
comparison
equal deleted inserted replaced
9:715fe77be601 10:1b09315a3f87
1 import click 1 import click
2 from matchms.importing import scores_from_json 2 from matchms.importing import scores_from_json
3 from pandas import DataFrame 3 from pandas import DataFrame
4 4
5 5
6 def create_long_table(data: DataFrame, value_id: str) -> DataFrame: 6 def scores_to_dataframe(scores):
7 """Convert the table from compact into long format.
8 See DataFrame.melt(...).
9
10 Args:
11 data (DataFrame): The data table to convert.
12 value_id (str): The name to assign to the added column through conversion to long format.
13
14 Returns:
15 DataFrame: Table in long format.
16 """
17 return data.transpose().melt(ignore_index=False, var_name='compound', value_name=value_id)
18
19
20 def join_df(x: DataFrame, y: DataFrame, on=[], how="inner") -> DataFrame:
21 """Shortcut function to join two dataframes on columns and index
22
23 Args:
24 x (DataFrame): Table X
25 y (DataFrame): Table Y
26 on (list, optional): Columns on which to join. Defaults to [].
27 how (str, optional): Join method, see DataFrame.join(...). Defaults to "inner".
28
29 Returns:
30 DataFrame: Joined dataframe.
31 """
32 df_x = x.set_index([x.index] + on)
33 df_y = y.set_index([y.index] + on)
34 combined = df_x.join(df_y, how=how)
35 return combined
36
37
38 def get_top_k_matches(data: DataFrame, k: int) -> DataFrame:
39 """Function to get top k matches from dataframe with scores.
40
41 Args:
42 data (DataFrame): A table with score column.
43 k (int): Number of top scores to retrieve.
44
45 Returns:
46 DataFrame: Table containing only the top k best matches for each compound.
47 """
48 return data.groupby(level=0, group_keys=False).apply(DataFrame.nlargest, n=k, columns=['score'])
49
50
51 def filter_thresholds(data: DataFrame, t_score: float, t_matches: float) -> DataFrame:
52 """Filter a dataframe with scores and matches to only contain values above specified thresholds.
53
54 Args:
55 data (DataFrame): Table to filter.
56 t_score (float): Score threshold.
57 t_matches (float): Matches threshold.
58
59 Returns:
60 DataFrame: Filtered dataframe.
61 """
62 filtered = data[data['score'] > t_score]
63 filtered = filtered[filtered['matches'] > t_matches]
64 return filtered
65
66
67 def scores_to_dataframes(scores):
68 """Unpack scores from matchms.scores into two dataframes of scores and matches. 7 """Unpack scores from matchms.scores into two dataframes of scores and matches.
69 8
70 Args: 9 Args:
71 scores (matchms.scores): matchms.scores object. 10 scores (matchms.scores): matchms.scores object.
72 11
73 Returns: 12 Returns:
74 DataFrame: Scores 13 DataFrame: Scores
75 DataFrame: Matches 14 DataFrame: Matches
76 """ 15 """
77 query_names = [spectra.metadata['compound_name'] for spectra in scores.queries] 16 dataframe = DataFrame(columns=['query', 'reference', *scores.scores.score_names])
78 reference_names = [spectra.metadata['compound_name'] for spectra in scores.references]
79 17
80 dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names) 18 for i, (row, col) in enumerate(zip(scores.scores.row, scores.scores.col)):
81 dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names) 19 dataframe.loc[i] = [scores.queries[col].metadata['compound_name'], scores.references[row].metadata['compound_name'], *scores.scores.data[i]]
82 20
83 return dataframe_scores, dataframe_matches 21 return dataframe
84 22
85 23
86 def load_data(scores_filename: str) -> DataFrame: 24 def load_data(scores_filename: str) -> DataFrame:
87 """Load data from filenames and join on compound id. 25 """Load data from filenames and join on compound id.
88 26
91 29
92 Returns: 30 Returns:
93 DataFrame: Joined dataframe on compounds containing scores and matches in long format. 31 DataFrame: Joined dataframe on compounds containing scores and matches in long format.
94 """ 32 """
95 scores = scores_from_json(scores_filename) 33 scores = scores_from_json(scores_filename)
96 scores, matches = scores_to_dataframes(scores) 34 scores = scores_to_dataframe(scores)
97 35
98 scores_long = create_long_table(scores, 'score') 36 return scores
99 matches_long = create_long_table(matches, 'matches')
100
101 combined = join_df(matches_long, scores_long, on=['compound'], how='inner')
102 return combined
103 37
104 38
105 @click.group() 39 @click.group(invoke_without_command=True)
106 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) 40 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
107 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) 41 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
108 @click.pass_context 42 def cli(scores_filename, output_filename):
109 def cli(ctx, scores_filename, output_filename): 43 result = load_data(scores_filename)
110 ctx.ensure_object(dict) 44 result.to_csv(output_filename, sep="\t", index=False)
111 ctx.obj['data'] = load_data(scores_filename)
112 pass 45 pass
113 46
114 47
115 @cli.command()
116 @click.option('--st', 'scores_threshold', type=float, required=True)
117 @click.option('--mt', 'matches_threshold', type=float, required=True)
118 @click.pass_context
119 def get_thresholded_data(ctx, scores_threshold, matches_threshold):
120 result = filter_thresholds(ctx.obj['data'], scores_threshold, matches_threshold)
121 return result
122
123
124 @cli.command()
125 @click.option('--k', 'k', type=int, required=True)
126 @click.pass_context
127 def get_top_k_data(ctx, k):
128 result = get_top_k_matches(ctx.obj['data'], k)
129 return result
130
131
132 @cli.result_callback()
133 def write_output(result: DataFrame, scores_filename, output_filename):
134 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
135 result.to_csv(output_filename, sep="\t", index=False)
136
137
138 if __name__ == '__main__': 48 if __name__ == '__main__':
139 cli(obj={}) 49 cli()