Mercurial > repos > immport-devteam > merge_ds_flowtext
comparison FCStxtMergeDownsample.py @ 1:3c0e4179be7a draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/merge_ds_flowtext commit 7858e5b085fc3c60c88fe87b2f343969d50d9b1e"
author | azomics |
---|---|
date | Mon, 22 Jun 2020 17:42:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:426650130311 | 1:3c0e4179be7a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ###################################################################### | |
4 # Copyright (c) 2016 Northrop Grumman. | |
5 # All rights reserved. | |
6 ###################################################################### | |
7 | |
8 from __future__ import print_function | |
9 from __future__ import division | |
10 import sys | |
11 import os | |
12 import pandas as pd | |
13 from argparse import ArgumentParser | |
14 | |
15 | |
def is_number(s):
    """Return True if ``s`` parses as a float, False otherwise.

    Note: a non-string, non-numeric argument (e.g. None) still raises
    TypeError; only parse failures are treated as "not a number".
    """
    try:
        float(s)
    except ValueError:
        return False
    return True
22 | |
23 | |
def is_integer(s):
    """Return True if ``s`` parses as an int, False otherwise.

    A float-looking string such as "3.5" is NOT an integer here, since
    int("3.5") raises ValueError.
    """
    try:
        int(s)
    except ValueError:
        return False
    return True
30 | |
31 | |
def compare_headers(files):
    """Return the column headings shared by every file, in first-file order.

    Each file's first line is read, lower-cased and split on tabs.  A
    heading is kept if it appears in every file's header.  Exits with
    code 9 when no heading is common to all files.

    Parameters
    ----------
    files : list of str
        Paths of the tab-separated files to inspect.

    Returns
    -------
    list of str
        Lower-cased headings present in all files, ordered as in the
        first file.
    """
    headers = {}
    for eachfile in files:
        with open(eachfile, "r") as ef:
            headers[eachfile] = ef.readline().strip().lower().split("\t")

    # BUG FIX: the original tested the membership count inside the inner
    # file loop, so with a single input file no heading was ever reported
    # as common (the tool always exited 9).  Check each heading against
    # every other file instead; sets make the membership test O(1).
    other_headers = [set(headers[f]) for f in files[1:]]
    hdgs_in_common = [hdg for hdg in headers[files[0]]
                      if all(hdg in other for other in other_headers)]

    if not hdgs_in_common:
        sys.exit(9)
    return hdgs_in_common
53 | |
54 | |
def get_nb_lines(files):
    """Return the total number of events (data rows) across all files.

    Parameters
    ----------
    files : list of str
        Paths of tab-separated files, each with a single header line.

    Returns
    -------
    int
        Sum over files of (number of data rows - 1).
    """
    tot_event = 0
    for f in files:
        # BUG FIX: pd.read_table was removed in pandas 2.0; read_csv with
        # sep="\t" is the supported equivalent.
        df = pd.read_csv(f, sep="\t")
        # NOTE(review): the header is already excluded from df.index, so
        # the extra -1 undercounts each file by one row; kept as-is to
        # preserve the original downsampling totals — TODO confirm intent.
        tot_event += (len(df.index) - 1)
    return tot_event
61 | |
62 | |
def get_headers_index(list_headings, headings):
    """Map each heading in ``list_headings`` to its position in ``headings``.

    The lookup is case-insensitive on the ``headings`` side (they are
    lower-cased before searching); ``list_headings`` is expected to be
    lower-case already, as produced by compare_headers.  Raises
    ValueError if a heading is absent.
    """
    lowered = [hdg.lower() for hdg in headings]
    return [lowered.index(name) for name in list_headings]
69 | |
70 | |
def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
    """Concatenate tab-separated files, downsampling each one.

    The output keeps only the columns in common to all the files
    provided as input (as determined by the headers), unless explicit
    column indices are given via ``col_names``.  All lines after the
    header line must contain only numbers.  Potential errors are logged
    to stderr; if the number of errors reaches 10, the run aborts and
    the partial output file is removed.

    Parameters
    ----------
    in_files : list of str
        Paths of the tab-separated input files.
    out_file : str
        Path of the merged output file.
    col_names : list of int
        Zero-based column indices to keep; empty list means "use all
        columns in common".
    factor_ds : float
        Fraction (0..1] of the total events to keep, split evenly
        across input files.

    Exits
    -----
    2 : an input file contains non-numeric data.
    3 : warnings were emitted (missing columns) but the merge finished.
    4 : too many (10) errors; output file removed.
    """
    nb_errors = 0
    max_error = 10

    # Columns shared by every input file, and the overall event count.
    list_hdgs = compare_headers(in_files)
    total_events = get_nb_lines(in_files)
    # BUG FIX: the original multiplied by the global ds_factor defined in
    # the __main__ block instead of the factor_ds parameter it was given.
    total_final = total_events * factor_ds
    nb_per_file = int(total_final / len(in_files))

    with open(out_file, "w") as outf:
        ff_order = []
        # HEADERS: taken from the first file, original case preserved.
        with open(in_files[0], "r") as first_file:
            headings_ff = first_file.readline().strip()
        headings = headings_ff.split("\t")
        # Index of the common headers within the first file.
        hdrs_idx = get_headers_index(list_hdgs, headings)

        # If columns to merge on were provided, validate them:
        if col_names:
            for ix in col_names:
                if ix not in hdrs_idx:
                    nb_errors += 1
                    sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
                                               "does not exist in all files or has a different header.\n"]))
                    if nb_errors == max_error:
                        # BUG FIX: the original removed the output file
                        # here but fell through and kept writing to it;
                        # abort immediately instead.
                        sys.stderr.write("Run aborted - too many errors.")
                        os.remove(out_file)
                        sys.exit(4)
            hdrs_idx = col_names

        # Write the header line, preserving first-file column order:
        headings_to_write = []
        for cti in range(0, len(headings)):
            if cti in hdrs_idx:
                headings_to_write.append(headings[cti])
                ff_order.append(headings[cti])
        outf.write("\t".join(headings_to_write) + "\n")

        # DATA
        for infile in in_files:
            with open(infile, "r") as inf:
                hdgs = inf.readline().strip().split("\t")
            # Index of the kept columns within THIS file (column order may
            # differ between files).
            # NOTE(review): this lookup is case-sensitive even though
            # compare_headers is case-insensitive — files whose headers
            # differ only in case will fail here; preserved from the
            # original, TODO confirm.
            hdgs_idx = []
            for ctc in ff_order:
                hdgs_idx.append(int(hdgs.index(ctc)))
            if col_names:
                for iy in col_names:
                    if iy not in hdgs_idx:
                        nb_errors += 1
                        sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
                                                   "does not exist in all files or has a different header.\n"]))
                        if nb_errors == max_error:
                            sys.stderr.write("Run aborted - too many errors.")
                            os.remove(out_file)
                            sys.exit(4)
                hdgs_idx = col_names

            # BUG FIX: the original read with usecols=hdrs_idx (the FIRST
            # file's indices) instead of this file's own hdgs_idx, selecting
            # wrong columns when files order their columns differently.
            # BUG FIX: pd.read_table was removed in pandas 2.0.
            df = pd.read_csv(infile, sep="\t", usecols=hdgs_idx)
            # Guard against small files: sample() raises when asked for
            # more rows than the frame has.
            df_ds = df.sample(min(nb_per_file, len(df.index)), replace=False)

            # A column count lower than its length means NaNs, i.e. cells
            # that failed to parse as numbers.
            for cols in df_ds.columns.values:
                if df_ds[cols].count() != len(df_ds[cols]):
                    sys.stderr.write(infile + "contains non-numeric data\n")

            # Full scan of the raw file for non-numeric cells; abort on
            # the first offending line.
            with open(infile, "r") as checkfile:
                checkfile.readline()  # skip header
                count_lines = 1
                for checklines in checkfile:
                    to_check = checklines.strip().split("\t")
                    count_lines += 1
                    for item in to_check:
                        if not is_number(item):
                            sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
                                                       "in", infile, "contains non-numeric results\n"]))
                            sys.exit(2)

            # BUG FIX: DataFrame.ix was removed from pandas; .loc is the
            # supported label-based equivalent.
            df_ds = df_ds.loc[:, ff_order]
            df_ds.to_csv(outf, sep="\t", header=False, index=False)

    if nb_errors > 0:
        # Warnings were emitted but stayed below max_error (the abort
        # cases above exit(4) immediately).
        sys.exit(3)
    return
172 | |
173 | |
if __name__ == "__main__":
    parser = ArgumentParser(
        prog="FCStxtmerge",
        description="Merge based on headers text-converted FCS files into one text file.")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File location for the text files.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="Name of the output file.")

    parser.add_argument(
        '-c',
        dest="columns",
        help="Specify which column to keep in output file")

    parser.add_argument(
        '-d',
        dest="downsampling_factor",
        help="How much of each file to keep")

    args = parser.parse_args()

    # Columns to merge on, if any.  Placeholder/default strings coming
    # from the Galaxy form mean "no explicit columns".
    default_value_col = ["i.e.:1,2,5", "default", "Default"]
    columns = []
    if args.columns and args.columns not in default_value_col:
        fields = args.columns.split(",")
        if len(fields) == 1:
            token = fields[0].strip()
            if not token:
                columns = []
            elif is_integer(token):
                # 1-based on the command line, 0-based internally.
                columns.append(int(token) - 1)
            else:
                sys.exit(7)
        else:
            for field in fields:
                token = field.strip()
                if not is_integer(token):
                    sys.exit(6)
                columns.append(int(token) - 1)

    # Downsampling factor, if any (defaults to 10%).
    # Note: change '%' to 'X' because somehow that's what Galaxy passes?
    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
    ds_factor = 0.1
    if args.downsampling_factor and args.downsampling_factor not in default_value_ds:
        raw_factor = args.downsampling_factor.strip().rstrip("X")
        if not is_number(raw_factor):
            sys.exit(8)
        ds_factor = float(raw_factor)
        if 1 < ds_factor <= 100:
            # A value like "10" or "10X" is a percentage.
            ds_factor = float(raw_factor) / 100
        elif ds_factor > 100 or ds_factor <= 0:
            sys.stderr.write(str(ds_factor))
            sys.exit(8)

    input_files = list(args.input_files)
    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)