changeset 0:426650130311 draft
Uploaded
author    immport-devteam
date      Mon, 27 Feb 2017 13:03:02 -0500
parents
children  3c0e4179be7a
files     merge_ds_flowtext/FCStxtMergeDownsample.py
          merge_ds_flowtext/FCStxtMergeDownsample.xml
          merge_ds_flowtext/test-data/merge1.flowtext
          merge_ds_flowtext/test-data/merge2.flowtext
          merge_ds_flowtext/test-data/test1/input1.txt
          merge_ds_flowtext/test-data/test1/input2.txt
          merge_ds_flowtext/test-data/test1/input3.txt
          merge_ds_flowtext/test-data/test2/input1.txt
          merge_ds_flowtext/test-data/test2/input2.txt
          merge_ds_flowtext/test-data/test2/input3.txt
diffstat  10 files changed, 504 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/FCStxtMergeDownsample.py	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+
+######################################################################
+# Copyright (c) 2016 Northrop Grumman.
+# All rights reserved.
+######################################################################
+
+from __future__ import print_function
+from __future__ import division
+import sys
+import os
+import pandas as pd
+from argparse import ArgumentParser
+
+
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+def is_integer(s):
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+
+def compare_headers(files):
+    headers = {}
+    for eachfile in files:
+        with open(eachfile, "r") as ef:
+            headers[eachfile] = ef.readline().strip().lower().split("\t")
+
+    hdgs_in_common = []
+    flag = {}
+
+    # Count, for each heading of the first file, in how many files it appears:
+    for ref_hdgs in headers[files[0]]:
+        flag[ref_hdgs] = 1
+
+        for ij in range(1, len(files)):
+            if ref_hdgs in headers[files[ij]]:
+                flag[ref_hdgs] += 1
+        if flag[ref_hdgs] == len(files):
+            hdgs_in_common.append(ref_hdgs)
+
+    if not hdgs_in_common:
+        sys.exit(9)
+    return hdgs_in_common
+
+
+def get_headers_index(list_headings, headings):
+    idxs = []
+    lhdgs = [x.lower() for x in headings]
+    for element in list_headings:
+        idxs.append(int(lhdgs.index(element)))
+    return idxs
+
+
+def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
+    """Concatenate tab-separated files.
+
+    The output contains only the columns common to all the files provided
+    as input, as determined by the headers. All lines after the header line
+    must contain only numbers. Potential errors are logged to stderr; if
+    the number of errors reaches 10, the program stops. If a downsampling
+    factor is given, only the indicated fraction of randomly selected lines
+    is written out.
+    """
+    nb_errors = 0
+    max_error = 10
+
+    # Get the list of headers common to all files:
+    list_hdgs = compare_headers(in_files)
+
+    with open(out_file, "w") as outf:
+        ff_order = []
+        # HEADERS:
+        with open(in_files[0], "r") as first_file:
+            headings_ff = first_file.readline().strip()
+            headings = headings_ff.split("\t")
+            # Get the index of headers in common:
+            hdrs_idx = get_headers_index(list_hdgs, headings)
+
+            # If columns to merge on were provided:
+            if col_names:
+                for ix in col_names:
+                    if ix not in hdrs_idx:
+                        nb_errors += 1
+                        sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
+                                                   "does not exist in all files or has a different header.\n"]))
+                hdrs_idx = col_names
+
+            # Write the header line to the output file:
+            headings_to_write = []
+            for cti in range(0, len(headings)):
+                if cti in hdrs_idx:
+                    headings_to_write.append(headings[cti])
+                    ff_order.append(headings[cti])
+            outf.write("\t".join(headings_to_write) + "\n")
+
+        # DATA
+        for infile in in_files:
+            with open(infile, "r") as inf:
+                headings_inf = inf.readline().strip()
+                hdgs = headings_inf.split("\t")
+                # Get the index of the columns to keep in this file:
+                hdgs_idx = []
+                for ctc in ff_order:
+                    hdgs_idx.append(int(hdgs.index(ctc)))
+                if col_names:
+                    for iy in col_names:
+                        if iy not in hdgs_idx:
+                            nb_errors += 1
+                            sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
+                                                       "does not exist in all files or has a different header.\n"]))
+                    hdgs_idx = col_names
+
+            # Read each file with its own column indices (hdgs_idx, not the
+            # first file's hdrs_idx), then downsample without replacement:
+            df = pd.read_table(infile, usecols=hdgs_idx)
+            wc_file = len(df.index) - 1
+            df_ds = df.sample(int(wc_file * factor_ds), replace=False)
+
+            for cols in df_ds.columns.values:
+                if df_ds[cols].count() != len(df_ds[cols]):
+                    sys.stderr.write(infile + " contains non-numeric data\n")
+
+                    # Re-read the raw file to report the offending lines:
+                    with open(infile, "r") as checkfile:
+                        fl = checkfile.readline()
+                        count_lines = 1
+                        for checklines in checkfile:
+                            to_check = checklines.strip().split("\t")
+                            count_lines += 1
+                            for item in to_check:
+                                if not is_number(item):
+                                    sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
+                                                               "in", infile, "contains non-numeric results\n"]))
+                    sys.exit(2)
+
+            # Reorder the columns to match the output header, then append:
+            df_ds = df_ds.loc[:, ff_order]
+            df_ds.to_csv(outf, sep="\t", header=False, index=False)
+
+    if nb_errors > 0:
+        exit_code = 3
+        if nb_errors == max_error:
+            exit_code = 4
+            sys.stderr.write("Run aborted - too many errors.")
+        os.remove(out_file)
+        sys.exit(exit_code)
+    return
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        prog="FCStxtmerge",
+        description="Merge text-converted FCS files into one text file, based on headers.")
+
+    parser.add_argument(
+        '-i',
+        dest="input_files",
+        required=True,
+        action='append',
+        help="File location for the text files.")
+
+    parser.add_argument(
+        '-o',
+        dest="output_file",
+        required=True,
+        help="Name of the output file.")
+
+    parser.add_argument(
+        '-c',
+        dest="columns",
+        help="Specify which columns to keep in the output file.")
+
+    parser.add_argument(
+        '-d',
+        dest="downsampling_factor",
+        help="How much of each file to keep.")
+
+    args = parser.parse_args()
+
+    # Get the columns to merge on, if any:
+    default_value_col = ["i.e.:1,2,5", "default", "Default"]
+    columns = []
+    if args.columns:
+        if args.columns not in default_value_col:
+            tmp_col = args.columns.split(",")
+            if len(tmp_col) == 1:
+                if not tmp_col[0].strip():
+                    columns = []
+                elif not is_integer(tmp_col[0].strip()):
+                    sys.exit(7)
+                else:
+                    columns.append(int(tmp_col[0].strip()) - 1)
+            else:
+                for c in range(0, len(tmp_col)):
+                    if not is_integer(tmp_col[c].strip()):
+                        sys.exit(6)
+                    else:
+                        columns.append(int(tmp_col[c].strip()) - 1)
+
+    # Get the downsampling factor, if any.
+    # Note: Galaxy sanitizes '%' to 'X' in text parameters, so '10%' arrives as '10X'.
+    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
+    ds_factor = 1
+    if args.downsampling_factor:
+        if args.downsampling_factor not in default_value_ds:
+            args.downsampling_factor = args.downsampling_factor.strip()
+            downsampling_factor = args.downsampling_factor.rstrip("X")
+            if is_number(downsampling_factor):
+                ds_factor = float(downsampling_factor)
+                if ds_factor > 100:
+                    sys.exit(8)
+                if ds_factor > 1:
+                    ds_factor = ds_factor / 100
+            else:
+                sys.exit(8)
+
+    input_files = [f for f in args.input_files]
+    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)
+    sys.exit(0)
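Outside Galaxy, the merge entry point above can also be exercised directly. A minimal sketch (filenames hypothetical, assuming FCStxtMergeDownsample.py is importable from the working directory):

    from FCStxtMergeDownsample import merge_and_DS_txt

    # Merge two txt-converted FCS files on their common columns, keeping a
    # random ~50% of each file's events (sampled without replacement).
    merge_and_DS_txt(
        in_files=["sample1.flowtext", "sample2.flowtext"],  # hypothetical inputs
        out_file="merged.flowtext",
        col_names=[],   # empty -> merge on the headers common to all files
        factor_ds=0.5)  # each file contributes int((rows - 1) * 0.5) lines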
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/FCStxtMergeDownsample.xml	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,175 @@
+<tool id="fcstxt_merge_downsample" name="Downsample and merge" version="1.1">
+  <description>txt-converted FCS files into one text file based on headers.</description>
+  <requirements>
+    <requirement type="package" version="1.10.2">numpy</requirement>
+    <requirement type="package" version="0.17.1">pandas</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="2" level="fatal" description="Non-numeric data. See stderr for more details." />
+    <exit_code range="3" level="warning" description="Selected columns do not exist in all files." />
+    <exit_code range="4" level="fatal" description="Run aborted - too many errors." />
+    <exit_code range="6" level="fatal" description="Please provide integers for the columns you want to merge on." />
+    <exit_code range="7" level="fatal" description="Please provide a comma-separated list of integers for the columns you want to merge on." />
+    <exit_code range="8" level="fatal" description="Please provide a numeric value between 0 and 100 for the downsampling factor." />
+    <exit_code range="9" level="fatal" description="There are no columns in common to all files." />
+  </stdio>
+  <command><![CDATA[
+    python $__tool_directory__/FCStxtMergeDownsample.py -o "${output_file}" -d "${factorDS}"
+    #if $columns
+      -c "${columns}"
+    #end if
+    #for $f in $input#
+      -i "${f}"
+    #end for#
+  ]]>
+  </command>
+  <inputs>
+    <param format="flowtext" name="input" type="data_collection" collection_type="list" label="Text file collection"/>
+    <param name="factorDS" type="text" label="Downsample by:" value="i.e.:0.1 or 10%" optional="true" help="1 by default (no downsampling)."/>
+    <param name="columns" type="text" label="Merge columns:" value="i.e.:1,2,5" optional="true" help="By default, merges on the columns common to all files."/>
+  </inputs>
+  <outputs>
+    <data format="flowtext" name="output_file" label="Merge flowtext on ${input.name}"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input">
+        <collection type="list">
+          <element name="input1.txt" value="test1/input1.txt"/>
+          <element name="input2.txt" value="test1/input2.txt"/>
+          <element name="input3.txt" value="test1/input3.txt"/>
+        </collection>
+      </param>
+      <param name="factorDS" value=".8"/>
+      <param name="columns" value="i.e.:1,2,5"/>
+      <output name="output_file" file="merge1.flowtext" compare="sim_size"/>
+    </test>
+    <test>
+      <param name="input">
+        <collection type="list">
+          <element name="input1.txt" value="test2/input1.txt"/>
+          <element name="input2.txt" value="test2/input2.txt"/>
+          <element name="input3.txt" value="test2/input3.txt"/>
+        </collection>
+      </param>
+      <param name="factorDS" value="i.e.:0.1 or 10%"/>
+      <param name="columns" value="1,2,3"/>
+      <output name="output_file" file="merge2.flowtext" compare="sim_size"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+This tool downsamples and merges multiple txt-converted FCS files into one text file.
+
+-----
+
+**Input files**
+
+This tool requires a collection of txt, flowtext or tabular files as input.
+
+**Downsampling**
+
+By default, files are not downsampled. If a downsampling factor n is provided, each file in the input dataset collection is downsampled randomly without replacement as follows:
+
+- If n is between 0 and 1, the size of the output will be n times that of the input files.
+- If n is between 1 and 100, the size of the output will be n% that of the input files.
+
+.. class:: warningmark
+
+At this time, up-sampling is not supported. If the number provided is greater than 100, the tool will exit.
+
+**Output file**
+
+The output flowtext file is a concatenation of the input files, provided all data after the headers contain only numbers. By default, only columns existing in all input files (as assessed by the headers) are concatenated. The user can specify the columns to merge on, bypassing the header check. If a downsampling factor is provided, ONLY the corresponding proportion of each input file will be read in (and checked for errors).
+
+.. class:: warningmark
+
+Potential errors are logged to stderr. If the number of errors reaches 10, or if a file contains non-numeric data, the run is aborted.
+
+.. class:: infomark
+
+Tip: three tools in the Flow File Tools section can help prepare files for merging and/or downsampling:
+
+- The Check headers tool lists the headers of all files in a collection of text, flowtext or tabular files.
+- The Remove, rearrange and/or rename columns tool allows manipulation of the columns of a file or a set of files.
+- The Check data tool identifies the lines in a file containing non-numeric data.
+
+-----
+
+**Example**
+
+*File1*::
+
+   Marker1 Marker2 Marker3
+   34      45      12
+   33      65      10
+   87      26      76
+   24      56      32
+   95      83      53
+   74      15      87
+
+*File2*::
+
+   Marker4 Marker5 Marker3
+   19      62      98
+   12      36      58
+   41      42      68
+   76      74      53
+   62      34      45
+   93      21      76
+
+*Output*
+
+.. class:: infomark
+
+If run without specifying the columns::
+
+   Marker3
+   12
+   10
+   76
+   32
+   53
+   87
+   98
+   58
+   68
+   53
+   45
+   76
+
+.. class:: infomark
+
+If run specifying columns 1,2,3::
+
+   Marker1 Marker2 Marker3
+   34      45      12
+   33      65      10
+   87      26      76
+   24      56      32
+   95      83      53
+   74      15      87
+   19      62      98
+   12      36      58
+   41      42      68
+   76      74      53
+   62      34      45
+   93      21      76
+
+.. class:: infomark
+
+If run specifying columns 1,2,3 and with a downsampling factor of 0.5::
+
+   Marker1 Marker2 Marker3
+   34      45      12
+   24      56      32
+   95      83      53
+   19      62      98
+   12      36      58
+   62      34      45
+  ]]>
+  </help>
+  <citations>
+    <citation type="doi">10.1038/srep02327</citation>
+  </citations>
+</tool>
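The downsampling rules spelled out in the help above boil down to the factor normalization in the script's __main__ block. A distilled restatement for illustration (the function name normalize_ds_factor is ours, not part of the tool):

    def normalize_ds_factor(raw):
        # Galaxy sanitizes '%' to 'X' in text parameters, so '10%' arrives as '10X'.
        factor = float(raw.strip().rstrip("X"))  # non-numeric input -> ValueError (tool exits 8)
        if factor > 100:
            raise ValueError("up-sampling is not supported")  # tool exits 8
        if factor > 1:
            factor = factor / 100  # percentage form, e.g. '10X' -> 0.10
        return factor

    assert normalize_ds_factor("0.1") == 0.1
    assert normalize_ds_factor("10X") == 0.1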
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/merge1.flowtext	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,19 @@
+CD4	CCR3	CD8	CCR7
+437	69	0	146
+551	129	169	292
+199	277	320	227
+83	138	335	194
+534	111	83	177
+499	0	0	224
+175	361	225	237
+216	310	270	294
+519	44	51	148
+550	200	0	127
+552	479	0	62
+525	121	0	138
+438	0	626	480
+139	227	293	259
+0	292	641	327
+30	147	483	386
+537	338	568	201
+156	228	734	408
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/merge2.flowtext	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,25 @@
+Forward Scatter	Side Scatter	FITC CD4
+340	115	509
+262	73	437
+894	1023	199
+316	76	50
+449	157	551
+388	97	534
+383	139	499
+394	144	83
+372	126	519
+788	1023	216
+1023	1023	289
+363	76	550
+668	1019	73
+420	211	552
+770	1023	175
+602	578	385
+418	105	561
+352	153	30
+383	190	156
+733	970	139
+451	120	537
+373	104	3
+358	185	0
+289	56	438
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input1.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+551	129	169	292
+199	277	320	227
+437	69	0	146
+509	268	0	74
+50	0	60	129
+83	138	335	194
+499	0	0	224
+239	284	288	280
+534	111	83	177
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input2.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+550	200	0	127
+519	44	51	148
+289	401	362	254
+175	361	225	237
+525	121	0	138
+385	286	222	131
+216	310	270	294
+552	479	0	62
+73	193	227	132
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input3.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+438	0	626	480
+30	147	483	386
+156	228	734	408
+432	121	598	555
+537	338	568	201
+3	110	621	584
+561	0	610	562
+0	292	641	327
+139	227	293	259
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input1.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CCR3	PP CD8	APC CCR4
+449	157	551	129	169	292
+894	1023	199	277	320	227
+262	73	437	69	0	146
+340	115	509	268	0	74
+316	76	50	0	60	129
+394	144	83	138	335	194
+383	139	499	0	0	224
+800	1023	239	284	288	280
+388	97	534	111	83	177
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input2.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CXCR3	PP CD8	APC CCR5
+363	76	550	200	0	127
+372	126	519	44	51	148
+1023	1023	289	401	362	254
+770	1023	175	361	225	237
+384	111	525	121	0	138
+602	578	385	286	222	131
+788	1023	216	310	270	294
+420	211	552	479	0	62
+668	1019	73	193	227	132
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input3.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CD25	PP CD3	APC CD45RA
+289	56	438	0	626	480
+352	153	30	147	483	386
+383	190	156	228	734	408
+261	62	432	121	598	555
+451	120	537	338	568	201
+373	104	3	110	621	584
+418	105	561	0	610	562
+358	185	0	292	641	327
+733	970	139	227	293	259
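A note tying the fixtures back to wc_file = len(df.index) - 1 in the merge script: each input above has 9 data rows. Test 1 downsamples by 0.8, so each file contributes int(8 * 0.8) = 6 rows, matching merge1.flowtext's 18 data rows; test 2 passes the sentinel default (factor 1), so each file contributes int(8 * 1) = 8 rows, matching merge2.flowtext's 24. A quick arithmetic check:

    # Expected output sizes for the two tests (3 input files of 9 data rows each;
    # the script samples int((rows - 1) * factor) lines per file).
    rows = 9
    assert 3 * int((rows - 1) * 0.8) == 18  # test 1 -> merge1.flowtext
    assert 3 * int((rows - 1) * 1.0) == 24  # test 2 -> merge2.flowtext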