Mercurial > repos > jay > gaiac_pm_data_pulling
changeset 4:ab9445f02bef draft default tip
planemo upload for repository https://github.com/jaidevjoshi83/gaiac commit d93e649fdfa6940af167c34c2d6de23f231e2f1b-dirty
| author | jay |
|---|---|
| date | Wed, 28 Jan 2026 14:30:01 +0000 |
| parents | d1fe9a064c46 |
| children | |
| files | gaiac_time_sync/gaiac_time_sync.py gaiac_time_sync/test_output.csv gaiac_time_sync/test_output_3files.csv |
| diffstat | 3 files changed, 130 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/gaiac_time_sync/gaiac_time_sync.py Tue Jan 27 03:03:49 2026 +0000 +++ b/gaiac_time_sync/gaiac_time_sync.py Wed Jan 28 14:30:01 2026 +0000 @@ -4,43 +4,146 @@ def align_sensor_data(file_list, date_time, sep=',', output_mode='multiple', output='aligned.tsv'): - file_list = file_list.split(',') + if isinstance(file_list, str): + file_list = file_list.split(',') + if len(file_list) < 2: print("Please provide at least two files.") return - # Read all files into a list of dataframes - dfs = [pd.read_csv(file, sep=sep, parse_dates=[date_time]) for file in file_list] + # Check if date_time is numeric (column index) or string (column name) + use_index = False + try: + # User input '1' likely means 1st column (index 0) + col_idx = int(date_time) - 1 + if col_idx < 0: + raise ValueError("Column index must be >= 1") + use_index = True + print(f"Using column index {col_idx} (from input '{date_time}')") + except ValueError: + # Not an integer, treat as column name + merge_col = date_time + print(f"Using column name '{merge_col}'") + + dfs = [] + for file in file_list: + file = file.strip() # clean whitespace + if not file: continue + + if use_index: + # Parse dates using index + df = pd.read_csv(file, sep=sep, parse_dates=[col_idx]) + # Normalize the column name to specific identifier for merging + # This handles cases where different files describe the date column with different headers + original_col_name = df.columns[col_idx] + merge_col = "__common_timestamp__" + df.rename(columns={original_col_name: merge_col}, inplace=True) + else: + # Parse dates using name + df = pd.read_csv(file, sep=sep, parse_dates=[date_time]) + + dfs.append(df) + + if not dfs: + print("No valid dataframes loaded.") + return # Get common timestamps by successive inner merges - common_times = dfs[0][[date_time]] + common_times = dfs[0][[merge_col]] for df in dfs[1:]: - common_times = common_times.merge(df[[date_time]], on=date_time, how='inner') + common_times = common_times.merge(df[[merge_col]], on=merge_col, how='inner') # Now filter each dataframe to contain only common timestamps aligned_dfs = [ - df[df[date_time].isin(common_times[date_time])].reset_index(drop=True) + df[df[merge_col].isin(common_times[merge_col])].reset_index(drop=True) for df in dfs ] + # After filtering, if we used a placeholder name, we might want to restore original names? + # Or keep it universal. + # The requirement says "returns files with time synchronized data". + # If output_mode='multiple', we dump them back. + # If we renamed the timestamp column to '__common_timestamp__', it will appear as such in output. + # User might prefer the original name. + # But if input files had DIFFERENT names for that column, which one should we use? + # Let's simple restore it to "date_time" (user input) or something generic if it was index. + + # Actually, simplest is to rename it back to "Date_Time" or similar if we changed it. + if use_index and output_mode == 'multiple': + for df in aligned_dfs: + df.rename(columns={merge_col: "Date_Time"}, inplace=True) + # Update merge_col to new name so single mode merging works if triggered + merge_col = "Date_Time" + # Output files if output_mode == 'multiple': for i, (file, df) in enumerate(zip(file_list, aligned_dfs)): - filename = os.path.splitext(os.path.basename(file))[0] + # Clean filename logic (handle paths) + filename = os.path.splitext(os.path.basename(file.strip()))[0] + # output is just a prefix-ish or single file arg? + # In XML, -o $out. But $out is a single file path in Galaxy typically unless discover_datasets used. + # Wait, XML output is: <data name='out' ... /> + # If output_mode is 'multiple', the script generates multiple files? + # XML says one output 'out'. + # The script default is 'single' in argparse, but XML doesn't set mode! + # XML command: python ... -o $out + # XML inputs don't allow selecting mode. + # Python script default mode is 'single'. + + # So output_mode is likely 'single'. + pass + + # Re-eval python default arguments: + # parser.add_argument('-m', '--mode', default='single', ...) + # XML doesn't pass -m. So it uses 'single'. + # So we merge into one file. + + if output_mode == 'single': + # Merge all aligned dataframes + merged_df = aligned_dfs[0] + # Rename back if needed? + if use_index: + merged_df.rename(columns={merge_col: "Date_Time"}, inplace=True) + merge_col = "Date_Time" + + for i, df in enumerate(aligned_dfs[1:]): + # When merging 'single', we end up with wide format? + # Or just inner join? + # Original code: + # merged_df = merged_df.merge(df, on=date_time, how='inner') + + # If we merge, we need suffixes if other columns have same names! + # Original code didn't specify suffixes, so pandas defaults _x, _y. + # With >2 files, it gets messy (_x, _y, _x, _y...) + # But let's keep original logic for suffixes. + + # If we used index, the column is named 'merge_col' in 'df' too. + if use_index: + df.rename(columns={'__common_timestamp__': merge_col}, inplace=True) + + merged_df = merged_df.merge(df, on=merge_col, how='inner') + + merged_df.to_csv(output, index=False, sep=sep) + print("Single merged file saved.") + + elif output_mode == 'multiple': # Original logic for multiple + # ... + # The original code's "multiple" block was slightly broken or unused by Galaxy XML + # because Galaxy XML expects specific output file or discovery. + # But I digress, I just need to fix the Date parsing error. + + for i, (file, df) in enumerate(zip(file_list, aligned_dfs)): + filename = os.path.splitext(os.path.basename(file.strip()))[0] + # output is passed as full path 'out.tsv' probably. + # If default output was 'aligned', it tries 'aligned_filename.csv' + # Here 'output' arg is likely a file path from Galaxy. + output_prefix = os.path.splitext(output)[0] output_file = f"{output_prefix}_{filename}.csv" df.to_csv(output_file, index=False, sep=sep) - print("Aligned files saved individually.") - elif output_mode == 'single': - # Merge all aligned dataframes on date_time - merged_df = aligned_dfs[0] - for df in aligned_dfs[1:]: - merged_df = merged_df.merge(df, on=date_time, how='inner') + print("Aligned files saved individually.") + else: + print("Invalid output mode.") - - merged_df.to_csv(output, index=False, sep=sep) - print("Single merged file saved.") - else: - print("Invalid output mode. Use 'multiple' or 'single'.") def main():
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gaiac_time_sync/test_output.csv Wed Jan 28 14:30:01 2026 +0000 @@ -0,0 +1,5 @@ +Date_Time Temparature1 Humidity1 Temparature2 Humidity2 +2019-06-07 13:28:00 39.0 50.471 39.0 50.471 +2019-06-07 13:29:00 39.0 51.0 39.0 50.471 +2019-06-07 13:30:00 39.588 49.647 39.588 49.647 +2019-06-07 13:31:00 38.9 50.0 40.0 49.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gaiac_time_sync/test_output_3files.csv Wed Jan 28 14:30:01 2026 +0000 @@ -0,0 +1,4 @@ +Date_Time Temparature1 Humidity1 Temparature2 Humidity2 Temparature3 humidity3 +2019-06-07 13:28:00 39.0 50.471 39.0 50.471 38 53.0 +2019-06-07 13:29:00 39.0 51.0 39.0 50.471 38 52.824 +2019-06-07 13:31:00 38.9 50.0 40.0 49.0 38 51.944
