Mercurial > repos > jay > gaiac_pm_data_pulling

--- a/gaiac_time_sync/gaiac_time_sync.py	Tue Jan 27 03:03:49 2026 +0000
+++ b/gaiac_time_sync/gaiac_time_sync.py	Wed Jan 28 14:30:01 2026 +0000
@@ -4,43 +4,146 @@

 def align_sensor_data(file_list, date_time, sep=',', output_mode='multiple', output='aligned.tsv'):

-    file_list =  file_list.split(',')
+    if isinstance(file_list, str):
+        file_list = file_list.split(',')
+
     if len(file_list) < 2:
         print("Please provide at least two files.")
         return

-    # Read all files into a list of dataframes
-    dfs = [pd.read_csv(file, sep=sep, parse_dates=[date_time]) for file in file_list]
+    # Check if date_time is numeric (column index) or string (column name)
+    use_index = False
+    try:
+        # User input '1' likely means 1st column (index 0)
+        col_idx = int(date_time) - 1
+        if col_idx < 0:
+            raise ValueError("Column index must be >= 1")
+        use_index = True
+        print(f"Using column index {col_idx} (from input '{date_time}')")
+    except ValueError:
+        # Not an integer, treat as column name
+        merge_col = date_time
+        print(f"Using column name '{merge_col}'")
+
+    dfs = []
+    for file in file_list:
+        file = file.strip() # clean whitespace
+        if not file: continue
+
+        if use_index:
+            # Parse dates using index
+            df = pd.read_csv(file, sep=sep, parse_dates=[col_idx])
+            # Normalize the column name to a specific identifier for merging.
+            # This handles cases where different files label the date column with different headers.
+            original_col_name = df.columns[col_idx]
+            merge_col = "__common_timestamp__"
+            df.rename(columns={original_col_name: merge_col}, inplace=True)
+        else:
+            # Parse dates using name
+            df = pd.read_csv(file, sep=sep, parse_dates=[date_time])
+
+        dfs.append(df)
+
+    if not dfs:
+        print("No valid dataframes loaded.")
+        return

     # Get common timestamps by successive inner merges
-    common_times = dfs[0][[date_time]]
+    common_times = dfs[0][[merge_col]]
     for df in dfs[1:]:
-        common_times = common_times.merge(df[[date_time]], on=date_time, how='inner')
+        common_times = common_times.merge(df[[merge_col]], on=merge_col, how='inner')

     # Now filter each dataframe to contain only common timestamps
     aligned_dfs = [
-        df[df[date_time].isin(common_times[date_time])].reset_index(drop=True)
+        df[df[merge_col].isin(common_times[merge_col])].reset_index(drop=True)
         for df in dfs
     ]

+    # After filtering, if we used a placeholder name, we might want to restore original names?
+    # Or keep it universal.
+    # The requirement says "returns files with time synchronized data".
+    # If output_mode='multiple', we dump them back.
+    # If we renamed the timestamp column to '__common_timestamp__', it will appear as such in output.
+    # User might prefer the original name.
+    # But if input files had DIFFERENT names for that column, which one should we use?
+    # Let's simply restore it to "date_time" (user input), or to something generic if an index was given.
+
+    # Actually, the simplest option is to rename it back to "Date_Time" (or similar) if we changed it.
+    if use_index and output_mode == 'multiple':
+        for df in aligned_dfs:
+            df.rename(columns={merge_col: "Date_Time"}, inplace=True)
+        # Update merge_col to new name so single mode merging works if triggered
+        merge_col = "Date_Time"
+
     # Output files
     if output_mode == 'multiple':
         for i, (file, df) in enumerate(zip(file_list, aligned_dfs)):
-            filename = os.path.splitext(os.path.basename(file))[0]
+            # Clean filename logic (handle paths)
+            filename = os.path.splitext(os.path.basename(file.strip()))[0]
+            # Is output just a prefix, or a single file argument?
+            # The XML passes -o $out, but $out is typically a single file path in Galaxy unless discover_datasets is used.
+            # Wait, XML output is: <data name='out' ... />
+            # If output_mode is 'multiple', the script generates multiple files?
+            # XML says one output 'out'.
+            # The script default is 'single' in argparse, but XML doesn't set mode!
+            # XML command: python ... -o $out
+            # XML inputs don't allow selecting mode.
+            # Python script default mode is 'single'.
+
+            # So output_mode is likely 'single'.
+            pass
+
+    # Re-evaluating the Python default arguments:
+    # parser.add_argument('-m', '--mode', default='single', ...)
+    # XML doesn't pass -m. So it uses 'single'.
+    # So we merge into one file.
+
+    if output_mode == 'single':
+        # Merge all aligned dataframes
+        merged_df = aligned_dfs[0]
+        # Rename back if needed?
+        if use_index:
+             merged_df.rename(columns={merge_col: "Date_Time"}, inplace=True)
+             merge_col = "Date_Time"
+
+        for i, df in enumerate(aligned_dfs[1:]):
+             # When merging in 'single' mode, do we end up with a wide format?
+             # Or just an inner join?
+             # Original code:
+             # merged_df = merged_df.merge(df, on=date_time, how='inner')
+
+             # If we merge, we need suffixes if other columns have same names!
+             # Original code didn't specify suffixes, so pandas defaults _x, _y.
+             # With >2 files, it gets messy (_x, _y, _x, _y...)
+             # But let's keep original logic for suffixes.
+
+             # If we used an index, the column in 'df' is still named '__common_timestamp__'; rename it to merge_col.
+             if use_index:
+                 df.rename(columns={'__common_timestamp__': merge_col}, inplace=True)
+
+             merged_df = merged_df.merge(df, on=merge_col, how='inner')
+
+        merged_df.to_csv(output, index=False, sep=sep)
+        print("Single merged file saved.")
+
+    elif output_mode == 'multiple': # Original logic for multiple
+         # ...
+         # The original code's "multiple" block was slightly broken or unused by Galaxy XML
+         # because Galaxy XML expects specific output file or discovery.
+         # But I digress, I just need to fix the Date parsing error.
+
+         for i, (file, df) in enumerate(zip(file_list, aligned_dfs)):
+            filename = os.path.splitext(os.path.basename(file.strip()))[0]
+            # output is probably passed as a full path such as 'out.tsv'.
+            # If the default output were 'aligned', this would write 'aligned_<filename>.csv'.
+            # Here the 'output' argument is likely a file path from Galaxy.
+            output_prefix = os.path.splitext(output)[0]
             output_file = f"{output_prefix}_{filename}.csv"
             df.to_csv(output_file, index=False, sep=sep)
-        print("Aligned files saved individually.")
-    elif output_mode == 'single':
-        # Merge all aligned dataframes on date_time
-        merged_df = aligned_dfs[0]
-        for df in aligned_dfs[1:]:
-            merged_df = merged_df.merge(df, on=date_time, how='inner')
+         print("Aligned files saved individually.")
+    else:
+        print("Invalid output mode.")

-
-        merged_df.to_csv(output, index=False, sep=sep)
-        print("Single merged file saved.")
-    else:
-        print("Invalid output mode. Use 'multiple' or 'single'.")


 def main():
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gaiac_time_sync/test_output.csv	Wed Jan 28 14:30:01 2026 +0000
@@ -0,0 +1,5 @@
+Date_Time	Temparature1	Humidity1	Temparature2	Humidity2
+2019-06-07 13:28:00	39.0	50.471	39.0	50.471
+2019-06-07 13:29:00	39.0	51.0	39.0	50.471
+2019-06-07 13:30:00	39.588	49.647	39.588	49.647
+2019-06-07 13:31:00	38.9	50.0	40.0	49.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gaiac_time_sync/test_output_3files.csv	Wed Jan 28 14:30:01 2026 +0000
@@ -0,0 +1,4 @@
+Date_Time	Temparature1	Humidity1	Temparature2	Humidity2	Temparature3	humidity3
+2019-06-07 13:28:00	39.0	50.471	39.0	50.471	38	53.0
+2019-06-07 13:29:00	39.0	51.0	39.0	50.471	38	52.824
+2019-06-07 13:31:00	38.9	50.0	40.0	49.0	38	51.944