Mercurial > repos > immport-devteam > merge_ds_flowtext
diff FCStxtMergeDownsample.py @ 1:3c0e4179be7a draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/merge_ds_flowtext commit 7858e5b085fc3c60c88fe87b2f343969d50d9b1e"
author | azomics |
---|---|
date | Mon, 22 Jun 2020 17:42:26 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python

######################################################################
#                  Copyright (c) 2016 Northrop Grumman.
#                          All rights reserved.
######################################################################

"""Merge tab-separated, text-converted FCS files and downsample them.

Only the columns whose headers are common to every input file (compared
case-insensitively) are written. Each input file contributes the same
number of randomly chosen rows, so that the output holds roughly the
requested fraction of the total number of events.

Exit codes used by this script:
    2 - non-numeric / missing data found in an input file
    3 - the run completed but column warnings were emitted
    4 - too many column errors, run aborted
    6 - a non-integer value in a multi-valued ``-c`` argument
    7 - a non-integer value in a single-valued ``-c`` argument
    8 - invalid downsampling factor
    9 - the input files have no header in common
"""

from __future__ import print_function
from __future__ import division
import sys
import os
import pandas as pd
from argparse import ArgumentParser


def is_number(s):
    """Return True if *s* can be converted to a float, False otherwise."""
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True


def is_integer(s):
    """Return True if *s* can be converted to an int, False otherwise."""
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True


def _read_header(path):
    """Return the tab-separated headings found on the first line of *path*."""
    with open(path, "r") as handle:
        return handle.readline().strip().split("\t")


def compare_headers(files):
    """Return the lower-cased headings present in every file.

    The result follows the column order of the first file. Exits with
    status 9 when the files share no heading (or no file is given).
    """
    lowered = [[h.lower() for h in _read_header(f)] for f in files]
    if not lowered:
        sys.exit(9)

    others = [set(hdgs) for hdgs in lowered[1:]]
    hdgs_in_common = [
        hdg for hdg in lowered[0] if all(hdg in other for other in others)
    ]

    if not hdgs_in_common:
        sys.exit(9)
    return hdgs_in_common


def get_nb_lines(files):
    """Return the total number of data rows (events) over all *files*.

    ``read_table`` already consumes the header line, so the length of the
    resulting index is exactly the number of events in a file.
    """
    tot_event = 0
    for f in files:
        tot_event += len(pd.read_table(f).index)
    return tot_event


def get_headers_index(list_headings, headings):
    """Return the position in *headings* of each entry of *list_headings*.

    *list_headings* is expected to be lower-cased; *headings* is matched
    case-insensitively. Raises ValueError if an entry is not found.
    """
    lhdgs = [x.lower() for x in headings]
    return [lhdgs.index(element) for element in list_headings]


def _report_non_numeric(infile):
    """Write to stderr the line number of every non-numeric line of *infile*."""
    sys.stderr.write(infile + " contains non-numeric data\n")
    with open(infile, "r") as checkfile:
        checkfile.readline()  # skip the header line
        # The header is line 1, so the first data line is line 2.
        for count_lines, checkline in enumerate(checkfile, start=2):
            # Only strip the line ending: stripping tabs would hide empty
            # leading / trailing fields, which are precisely the culprits.
            to_check = checkline.rstrip("\r\n").split("\t")
            if not all(is_number(item) for item in to_check):
                sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
                                           "in", infile,
                                           "contains non-numeric results\n"]))


def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
    """Concatenate tab-separated files and downsample them.

    The output has only the columns in common to all the input files, as
    determined (case-insensitively) by the headers. The header line and
    the column order of the output are those of the first file.
    All lines after the header line must contain only numbers.

    Parameters:
        in_files: list of paths to the input files.
        out_file: path of the file to write.
        col_names: list of 0-based column indices, relative to the first
            file, restricting the columns to keep; an empty list keeps
            every common column.
        factor_ds: fraction (0 < factor <= 1) of the total number of
            events to keep; every file contributes the same number of
            randomly chosen rows (or all of its rows if it has fewer).

    Requested columns that are out of range or not common to all files are
    reported on stderr and skipped; the program then exits with status 3
    once the output is written. If the number of such errors reaches 10,
    the output file is removed and the program exits with status 4.
    Missing or non-numeric values in the sampled rows make the program
    exit with status 2.
    """
    nb_errors = 0
    max_error = 10

    # get list of headers in common to all files
    list_hdgs = compare_headers(in_files)
    total_events = get_nb_lines(in_files)
    nb_per_file = int(total_events * factor_ds / len(in_files))

    # HEADERS: find which columns of the first file go to the output.
    headings = _read_header(in_files[0])
    if col_names:
        common = set(list_hdgs)
        hdrs_idx = []
        for ix in sorted(set(col_names)):
            if 0 <= ix < len(headings) and headings[ix].lower() in common:
                hdrs_idx.append(ix)
                continue
            nb_errors += 1
            # Indices are 0-based internally; report them 1-based, the way
            # they are given on the command line.
            sys.stderr.write(" ".join(["WARNING: column", str(ix + 1), "in",
                                       in_files[0],
                                       "does not exist in all files or has a different header.\n"]))
            if nb_errors >= max_error:
                sys.stderr.write("Run aborted - too many errors.")
                if os.path.exists(out_file):
                    os.remove(out_file)
                sys.exit(4)
        if not hdrs_idx:
            sys.stderr.write("None of the requested columns can be merged.\n")
            sys.exit(3)
    else:
        hdrs_idx = sorted(set(get_headers_index(list_hdgs, headings)))

    ff_order = [headings[idx] for idx in hdrs_idx]
    wanted = [hdg.lower() for hdg in ff_order]

    with open(out_file, "w") as outf:
        outf.write("\t".join(ff_order) + "\n")

        # DATA
        for infile in in_files:
            # Columns are located by name in each file: a shared column is
            # not guaranteed to sit at the same position everywhere.
            hdgs = [hdg.lower() for hdg in _read_header(infile)]
            positions = [hdgs.index(hdg) for hdg in wanted]
            file_order = sorted(set(positions))

            # read_table returns the columns in file order whatever the
            # order of usecols, so put them back in first-file order.
            df = pd.read_table(infile, usecols=file_order)
            df = df.iloc[:, [file_order.index(pos) for pos in positions]]

            # A file cannot give more rows than it holds.
            nb_rows = max(0, min(nb_per_file, len(df.index)))
            df_ds = df.sample(nb_rows, replace=False)

            # NOTE(review): as before, only the sampled rows are checked,
            # so bad rows that were not drawn go unnoticed.
            if df_ds.isnull().values.any():
                _report_non_numeric(infile)
                sys.exit(2)

            df_ds.to_csv(outf, sep="\t", header=False, index=False)

    if nb_errors > 0:
        sys.exit(3)
    return


def _parse_columns(raw):
    """Convert the ``-c`` argument into a list of 0-based column indices.

    Placeholder values and empty input give an empty list (keep every
    common column). Exits with status 7 (single value) or 6 (several
    values) when a value is not an integer.
    """
    default_value_col = ["i.e.:1,2,5", "default", "Default"]
    if not raw or raw in default_value_col:
        return []

    tmp_col = [token.strip() for token in raw.split(",")]
    if len(tmp_col) == 1:
        if not tmp_col[0]:
            return []
        if not is_integer(tmp_col[0]):
            sys.exit(7)
        return [int(tmp_col[0]) - 1]

    columns = []
    for token in tmp_col:
        if not is_integer(token):
            sys.exit(6)
        columns.append(int(token) - 1)
    return columns


def _parse_ds_factor(raw):
    """Convert the ``-d`` argument into a fraction between 0 and 1.

    Placeholder values and empty input give the default of 0.1. Values in
    ]1, 100] are read as percentages. Exits with status 8 on anything else.
    """
    # Note: Galaxy passes 'X' in place of '%'; accept both.
    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
    if not raw or raw in default_value_ds:
        return 0.1

    downsampling_factor = raw.strip().rstrip("X%")
    if not is_number(downsampling_factor):
        sys.exit(8)

    ds_factor = float(downsampling_factor)
    if 1 < ds_factor <= 100:
        return ds_factor / 100
    # Written as a positive range test so that NaN is rejected as well.
    if not 0 < ds_factor <= 1:
        sys.stderr.write(str(ds_factor))
        sys.exit(8)
    return ds_factor


def main():
    """Parse the command line and run the merge."""
    parser = ArgumentParser(
        prog="FCStxtmerge",
        description="Merge based on headers text-converted FCS files into one text file.")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File location for the text files.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="Name of the output file.")

    parser.add_argument(
        '-c',
        dest="columns",
        help="Specify which column to keep in output file")

    parser.add_argument(
        '-d',
        dest="downsampling_factor",
        help="How much of each file to keep")

    args = parser.parse_args()

    columns = _parse_columns(args.columns)
    ds_factor = _parse_ds_factor(args.downsampling_factor)
    merge_and_DS_txt(list(args.input_files), args.output_file, columns, ds_factor)


if __name__ == "__main__":
    main()