diff formatter.py @ 4:966b4134ad12 draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 5661cf2406e0616d7b2f4bee1b57ec43716088de
author recetox
date Tue, 18 Oct 2022 11:02:18 +0000
parents 574c6331e9db
children 1b09315a3f87
line wrap: on
line diff
--- a/formatter.py	Wed Sep 21 15:29:51 2022 +0000
+++ b/formatter.py	Tue Oct 18 11:02:18 2022 +0000
@@ -1,5 +1,6 @@
 import click
-from pandas import DataFrame, read_csv, to_numeric
+from matchms.importing import scores_from_json
+from pandas import DataFrame
 
 
 def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
@@ -63,18 +64,36 @@
     return filtered
 
 
-def load_data(scores_filename: str, matches_filename: str) -> DataFrame:
+def scores_to_dataframes(scores):
+    """Unpack a matchms scores object into two dataframes: scores and matches.
+
+    Args:
+        scores (matchms.scores): Object providing `queries`, `references` and `scores`.
+
+    Returns:
+        DataFrame: 'score' values; index = reference compound names, columns = query compound names.
+        DataFrame: 'matches' values, labelled the same way as the scores dataframe.
+    """
+    query_names = [spectra.metadata['compound_name'] for spectra in scores.queries]
+    reference_names = [spectra.metadata['compound_name'] for spectra in scores.references]
+
+    dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names)
+    dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names)
+
+    return dataframe_scores, dataframe_matches
+
+
+def load_data(scores_filename: str) -> DataFrame:
     """Load data from filenames and join on compound id.
 
     Args:
-        scores_filename (str): Path to scores table.
-        matches_filename (str): Path to matches table.
+        scores_filename (str): Path to json file with serialized scores.
 
     Returns:
-        DataFrame: Joined dataframe on compounds containing scores an matches in long format.
+        DataFrame: Joined dataframe on compounds containing scores and matches in long format.
     """
-    matches = read_csv(matches_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
-    scores = read_csv(scores_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
+    scores = scores_from_json(scores_filename)
+    scores, matches = scores_to_dataframes(scores)
 
     scores_long = create_long_table(scores, 'score')
     matches_long = create_long_table(matches, 'matches')
@@ -85,12 +104,11 @@
 
 @click.group()
 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
-@click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
 @click.pass_context
-def cli(ctx, scores_filename, matches_filename, output_filename):
+def cli(ctx, scores_filename, output_filename):
     ctx.ensure_object(dict)
-    ctx.obj['data'] = load_data(scores_filename, matches_filename)
+    ctx.obj['data'] = load_data(scores_filename)  # scores and matches now come from one JSON file
     pass
 
 
@@ -111,8 +129,8 @@
     return result
 
 
-@cli.resultcallback()
-def write_output(result: DataFrame, scores_filename, matches_filename, output_filename):
+@cli.result_callback()  # click >= 8.0 spelling; replaces the deprecated resultcallback()
+def write_output(result: DataFrame, scores_filename, output_filename):  # click also passes the group's options
     result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
     result.to_csv(output_filename, sep="\t", index=False)