comparison FCStxtMergeDownsample.py @ 1:3c0e4179be7a draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/merge_ds_flowtext commit 7858e5b085fc3c60c88fe87b2f343969d50d9b1e"
author azomics
date Mon, 22 Jun 2020 17:42:26 -0400
parents
children
comparison
equal deleted inserted replaced
0:426650130311 1:3c0e4179be7a
1 #!/usr/bin/env python
2
3 ######################################################################
4 # Copyright (c) 2016 Northrop Grumman.
5 # All rights reserved.
6 ######################################################################
7
8 from __future__ import print_function
9 from __future__ import division
10 import sys
11 import os
12 import pandas as pd
13 from argparse import ArgumentParser
14
15
def is_number(s):
    """Return True if *s* parses as a float, False otherwise."""
    try:
        float(s)
    except ValueError:
        return False
    return True
22
23
def is_integer(s):
    """Return True if *s* parses as an int, False otherwise."""
    try:
        int(s)
    except ValueError:
        return False
    return True
30
31
def compare_headers(files):
    """Return the headings (lower-cased) common to all input files.

    Only the first line of each file is read; it is split on tabs and
    compared case-insensitively. The result preserves the column order
    of the first file. Exits with code 9 when no heading is shared by
    every file.

    BUG FIX: the original counted a heading duplicated in the first
    file once per occurrence, so it could appear several times in the
    returned list; duplicates are now skipped.
    """
    headers = {}
    for eachfile in files:
        with open(eachfile, "r") as ef:
            headers[eachfile] = ef.readline().strip().lower().split("\t")

    hdgs_in_common = []
    for ref_hdg in headers[files[0]]:
        # Skip headings already accepted (dedupe).
        if ref_hdg in hdgs_in_common:
            continue
        if all(ref_hdg in headers[f] for f in files[1:]):
            hdgs_in_common.append(ref_hdg)

    if not hdgs_in_common:
        sys.exit(9)
    return hdgs_in_common
53
54
def get_nb_lines(files):
    """Return the total event count across all *files*.

    Each file is parsed as a tab-separated table (the header line is
    consumed by the parser) and one further line per file is subtracted.
    NOTE(review): that extra ``- 1`` under-counts each file by one data
    row unless every file ends with a non-event line — confirm intent.

    BUG FIX: ``pd.read_table`` was removed in pandas 2.0; use
    ``pd.read_csv(..., sep="\\t")``, which is byte-for-byte equivalent.
    """
    tot_event = 0
    for f in files:
        df = pd.read_csv(f, sep="\t")
        tot_event += (len(df.index) - 1)
    return tot_event
61
62
def get_headers_index(list_headings, headings):
    """Return the index in *headings* of each entry of *list_headings*.

    Matching is case-insensitive on *headings* (entries of
    *list_headings* are expected lower-case already); raises ValueError
    if a requested heading is absent.
    """
    lowered = [h.lower() for h in headings]
    return [lowered.index(wanted) for wanted in list_headings]
69
70
def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
    """Concatenate tab-separated files into *out_file*.

    The output keeps only the columns common to all input files (as
    determined by their header lines), in first-file order, or the
    0-based indexes given in *col_names* when provided. All lines after
    the header must contain only numbers. Warnings go to stderr; when
    the warning count reaches 10 the run aborts and the partial output
    is removed. From each file, ``int(total_events * factor_ds /
    len(in_files))`` random lines are kept (sampled without
    replacement).

    Exit codes: 2 non-numeric data, 3 warnings occurred, 4 too many
    errors, 9 (via compare_headers) no common heading.
    """

    nb_errors = 0
    max_error = 10

    # get list of headers in common to all files
    list_hdgs = compare_headers(in_files)
    total_events = get_nb_lines(in_files)
    # BUG FIX: the original read the module-level global ``ds_factor``
    # instead of the ``factor_ds`` parameter, so the function raised
    # NameError whenever imported rather than run as the script.
    total_final = total_events * factor_ds
    nb_per_file = int(total_final / len(in_files))

    with open(out_file, "w") as outf:
        ff_order = []
        # HEADERS:
        with open(in_files[0], "r") as first_file:
            headings_ff = first_file.readline().strip()
            headings = headings_ff.split("\t")
            # Get index of headers in common:
            hdrs_idx = get_headers_index(list_hdgs, headings)

            # If columns to merge on were provided:
            if col_names:
                for ix in col_names:
                    if ix not in hdrs_idx:
                        nb_errors += 1
                        sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
                                                   "does not exist in all files or has a different header.\n"]))
                        if nb_errors == max_error:
                            # BUG FIX: the original set exit_code and
                            # removed the output here but kept running;
                            # abort immediately, as the message says.
                            sys.stderr.write("Run aborted - too many errors.")
                            os.remove(out_file)
                            sys.exit(4)
                hdrs_idx = col_names

        # Print out to output file, remembering the first-file order of
        # the kept column labels:
        headings_to_write = []
        for cti in range(0, len(headings)):
            if cti in hdrs_idx:
                headings_to_write.append(headings[cti])
                ff_order.append(headings[cti])
        outf.write("\t".join(headings_to_write) + "\n")

        # DATA
        for infile in in_files:
            with open(infile, "r") as inf:
                headings_inf = inf.readline().strip()
                hdgs = headings_inf.split("\t")
                # Index of each kept column in THIS file (the common
                # columns may sit at different positions than in the
                # first file). Assumes header case matches the first
                # file's — TODO confirm against real inputs.
                hdgs_idx = []
                for ctc in ff_order:
                    hdgs_idx.append(int(hdgs.index(ctc)))
                if col_names:
                    for iy in col_names:
                        if iy not in hdgs_idx:
                            nb_errors += 1
                            sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
                                                       "does not exist in all files or has a different header.\n"]))
                            if nb_errors == max_error:
                                # BUG FIX: abort immediately (see above).
                                sys.stderr.write("Run aborted - too many errors.")
                                os.remove(out_file)
                                sys.exit(4)
                    hdgs_idx = col_names

            # BUG FIX: the original passed the FIRST file's ``hdrs_idx``
            # to usecols for every file, although the per-file
            # ``hdgs_idx`` was just computed; use the latter.
            # ``read_csv(sep="\t")`` replaces ``pd.read_table`` (removed
            # in pandas 2.0).
            df = pd.read_csv(infile, sep="\t", usecols=hdgs_idx)
            # Downsample: nb_per_file random rows, without replacement.
            df_ds = df.sample(nb_per_file, replace=False)

            for cols in df_ds.columns.values:
                # count() skips NaN, so a mismatch means non-numeric
                # cells were coerced to NaN when parsing.
                if df_ds[cols].count() != len(df_ds[cols]):
                    sys.stderr.write(infile + "contains non-numeric data\n")

                    # Re-scan the raw file to report the offending lines.
                    with open(infile, "r") as checkfile:
                        fl = checkfile.readline()  # skip header line
                        count_lines = 1
                        for checklines in checkfile:
                            to_check = checklines.strip().split("\t")
                            count_lines += 1
                            for item in to_check:
                                if not is_number(item):
                                    sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
                                                               "in", infile, "contains non-numeric results\n"]))
                    sys.exit(2)

            # Reorder columns to the first file's order before appending.
            # ``.loc`` replaces the deprecated ``.ix`` (removed in
            # pandas 1.0); for string labels they behave identically.
            df_ds = df_ds.loc[:, ff_order]
            df_ds.to_csv(outf, sep="\t", header=False, index=False)

    if nb_errors > 0:
        # Warnings occurred but stayed below max_error (otherwise we
        # already exited with 4 above).
        sys.exit(3)
    return
172
173
if __name__ == "__main__":
    # Command-line entry point: parse the Galaxy tool arguments,
    # normalize the column list and downsampling factor, then delegate
    # to merge_and_DS_txt.
    parser = ArgumentParser(
        prog="FCStxtmerge",
        description="Merge based on headers text-converted FCS files into one text file.")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File location for the text files.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="Name of the output file.")

    parser.add_argument(
        '-c',
        dest="columns",
        help="Specify which column to keep in output file")

    parser.add_argument(
        '-d',
        dest="downsampling_factor",
        help="How much of each file to keep")

    args = parser.parse_args()

    # Get columns to merge on if any:
    # The Galaxy form may pass its placeholder text instead of a value;
    # those sentinels are treated the same as "no columns given".
    default_value_col = ["i.e.:1,2,5", "default", "Default"]
    columns = []
    if args.columns:
        if args.columns not in default_value_col:
            tmp_col = args.columns.split(",")
            if len(tmp_col) == 1:
                if not tmp_col[0].strip():
                    columns = []
                elif not is_integer(tmp_col[0].strip()):
                    # exit 7: a single column was given but is not an integer
                    sys.exit(7)
                else:
                    # user columns are 1-based; convert to 0-based indexes
                    columns.append(int(tmp_col[0].strip()) - 1)
            else:
                for c in range(0, len(tmp_col)):
                    if not is_integer(tmp_col[c].strip()):
                        # exit 6: one of several comma-separated columns
                        # is not an integer
                        sys.exit(6)
                    else:
                        columns.append(int(tmp_col[c].strip()) - 1)

    # Get down sampling factor if any:
    # Note: change '%' to 'X' because somehow that's what Galaxy passes?
    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
    ds_factor = 0.1  # default: keep 10% of the events
    if args.downsampling_factor:
        if args.downsampling_factor not in default_value_ds:
            args.downsampling_factor = args.downsampling_factor.strip()
            # strip a trailing 'X' so "10X" parses as 10
            downsampling_factor = args.downsampling_factor.rstrip("X")
            if is_number(downsampling_factor):
                ds_factor = float(downsampling_factor)
                # values in (1, 100] are interpreted as percentages
                if ds_factor > 1 and ds_factor <= 100:
                    ds_factor = float(downsampling_factor) / 100
                elif ds_factor > 100 or ds_factor <= 0:
                    # exit 8: factor outside the accepted (0, 100] range
                    sys.stderr.write(str(ds_factor))
                    sys.exit(8)
            else:
                # exit 8: factor is not numeric
                sys.exit(8)

    input_files = [f for f in args.input_files]
    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)