genFlowOverview.py @ 1:b5453d07f740 (draft, default, tip)
commit message: "planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
author: azomics
date:   Wed, 29 Jul 2020 17:03:53 -0400
comparing 0:8283ff163ba6 with 1:b5453d07f740
#!/usr/bin/env python
######################################################################
# Copyright (c) 2016 Northrop Grumman.
# All rights reserved.
######################################################################

# version 1.1 - August 2017
# added upper limit to nb of clusters (40)
#
import sys
import os
import pandas as pd
import logging
import fileinput

from argparse import ArgumentParser
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict

from color_palette import color_palette
from flowstatlib import gen_overview_stats
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


profile_key = {
    "1": "-",
    "2": "lo",
    "3": "+",
    "4": "hi"
}


# flow CL functions
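# run_flowCL shells out to the getOntology.R script found in the tool
# directory: it passes the marker phenotype string on the command line, lets
# the script write its text results to output_txt, then moves the PDF that
# flowCL leaves under flowCL_results/ to output_pdf.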
def run_flowCL(phenotype, output_txt, output_pdf, tool):
    run_command = " ".join(["Rscript --slave --vanilla", tool, output_txt,
                            phenotype])
    os.system(run_command)

    get_graph = " ".join(["mv flowCL_results/*.pdf", output_pdf])
    os.system(get_graph)
    return


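# generate_flowCL_query turns one FLOCK profile row into a flowCL query:
# scatter channels (FSC*/SSC*) are dropped, every remaining marker name is
# upper-cased and concatenated with its expression level via profile_key,
# e.g. a marker "cd4" with level "4" contributes "CD4hi" to the query string.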
def generate_flowCL_query(list_markers, list_types):
    if (len(list_markers) != len(list_types)):
        return("pb with headers")
    query = []
    # go through both lists, remove fsc/ssc
    for i in range(1, len(list_markers)):
        if not list_markers[i].startswith("FSC") and not list_markers[i].startswith("SSC"):
            query.append(list_markers[i].upper())
            query.append(profile_key[list_types[i]])
    # return concatenated string
    return("".join(query))


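# translate_profiles reads the FLOCK population score table, runs flowCL once
# per distinct marker profile (results are cached in `queries`), and writes an
# HTML-ready summary table to <html_dir>/CLprofiles.txt alongside a copy of
# the input scores.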
def translate_profiles(input_file, tool_dir, html_dir):
    tool = "/".join([tool_dir, "getOntology.R"])
    html_table = "".join([html_dir, "/CLprofiles.txt"])
    score_table = "".join(["cp ", input_file, " ", html_dir, "/scores.txt"])
    os.system(score_table)

    # read profile
    with open(input_file, "r") as flock_profiles, open(html_table, "w") as out:
        headers = flock_profiles.readline()
        headers = headers.strip()
        markers = headers.split("\t")[:-2]
        counter = 0

        out.write("Population\tFlowCL Query\tNb Results\tLink to PDF\t")
        out.write("Top Result Label\tTop Result Score\tTop Result CL\n")
        queries = {}
        # create marker query for each population
        for lines in flock_profiles:
            lines = lines.strip("\n")
            pop_profile = lines.split("\t")[:-2]
            flowcl_query = generate_flowCL_query(markers, pop_profile)
            counter += 1
            nb_results = "0"
            top_label = "no_match"
            top_score = "NA"
            top_CL = "NA"
            pdf_link = "NA"

            # check if query was run before
            if flowcl_query not in queries:
                # create filenames for results & graphs
                txt = "".join(["flowcl_pop", str(counter).zfill(2), ".txt"])
                text_result = "/".join([html_dir, txt])
                graph = "".join(["flowcl_pop", str(counter).zfill(2), ".pdf"])
                graph_output = "/".join([html_dir, graph])
                # run flowCL for each marker profile
                run_flowCL(flowcl_query, text_result, graph_output, tool)

                # test that text file exists if not results are all NAs:
                if os.path.isfile(text_result):
                    with open(text_result, "r") as res:
                        for line in res:
                            if line.startswith("Score"):
                                data = line.split(") ")
                                top_score = data[2][:-2]
                                tot_results = len(data) - 2
                                nb_results = str(tot_results)
                                if tot_results == 5:
                                    if len(data[6].split("+")) > 1:
                                        nb_results = "5+"
                            elif line.startswith("Cell ID"):
                                prep_link = line.split(") ")[1][:-2]
                                cl = prep_link.replace("_", ":")
                                link = "".join(['<a href="http://www.immport-labs.org/immport-ontology/public/home/home/', cl, '" target="_blank">'])
                                top_CL = "".join([link, prep_link, "</a>"])
                            elif line.startswith("Cell Label"):
                                top_label = line.split(") ")[1][:-2]
                                pdf_link = "".join(['<a href="', graph, '" target="_blank">PDF</a>'])
                tmpflowcl_query = "".join(['<a href="', txt, '" target="_blank">', flowcl_query, '</a>'])
                queries[flowcl_query] = {
                    "query": tmpflowcl_query,
                    "results": nb_results,
                    "pdf": pdf_link,
                    "label": top_label,
                    "score": top_score,
                    "CL": top_CL
                }

            # write query results to CLprofiles.txt
            out.write("\t".join([pop_profile[0],
                                 queries[flowcl_query]["query"],
                                 queries[flowcl_query]["results"],
                                 queries[flowcl_query]["pdf"],
                                 queries[flowcl_query]["label"],
                                 queries[flowcl_query]["score"],
                                 queries[flowcl_query]["CL"]]) + "\n")


# boxplots data massaging
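# panel_to_json_string serialises a DataFrame carrying a two-level row index
# into one JSON object of the form {"<level-0 key>": <sub-frame as JSON>, ...};
# get_boxplot_stats below uses it with keys q1/q2/q3/upper/lower/outliers/mfi.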
def panel_to_json_string(df):
    # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
    def __merge_stream(key, stream):
        return '"' + key + '"' + ': ' + stream + ', '
    try:
        if 'Unnamed: 0' in df.columns:
            df = df.drop(['Unnamed: 0'], axis=1)
        stream = '{'
        for index, subdf in df.groupby(level=0):
            stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
        # take out extra last comma
        stream = stream[:-2]
        # add the final paren
        stream += '}'
    except Exception:
        logging.exception('Panel Encoding did not work')
    return stream


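# get_outliers is applied per population group: for every marker it returns
# the values that fall outside the whisker fences passed in as `upper` and
# `lower`.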
def get_outliers(group, upper, lower):
    cat = group.name
    out = {}
    for marker in group:
        # skip Population: upper and lower don't contain it because they were
        # computed after a groupby on Population
        if marker != 'Population':
            out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
    return out


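# get_boxplot_stats computes, per population, the quartiles, the 1.5*IQR
# whisker fences and the outlying events (downsampled to at most 100 per
# marker), then bundles them with the MFI table into the JSON file given as
# output_json. It returns True when any outlier list had to be downsampled.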
def get_boxplot_stats(all_data, mfi_file, output_json):
    # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
    # Get initial MFI values
    mfi = pd.read_table(mfi_file)
    mfi = mfi.set_index('Population')

    df = pd.read_table(all_data)
    # check if ever some pops not in cs_files
    missing_pop = [x for x in mfi.index if x not in set(df.Population)]

    if (missing_pop):
        zeros = {}
        for m in df.columns:
            zeros[m] = [0 for x in missing_pop]
        tmpdf = pd.DataFrame(zeros)
        tmpdf.Population = missing_pop
        df = df.append(tmpdf)

    pops = df.groupby('Population')
    q1 = pops.quantile(q=0.25)
    q2 = pops.quantile(q=0.5)
    q3 = pops.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr
    resampled = False
    # get outliers
    out = pops.apply(get_outliers, upper, lower).dropna()
    outliers = defaultdict(dict)
    for population in set(df.Population):
        for marker in df.columns:
            if marker != 'Population':
                tmp_outliers = list(out[population][marker])
                if (len(list(out[population][marker])) > 100):
                    tmp_outliers = list(out[population][marker].sample(n=100))
                    resampled = True
                outliers[population][marker] = tmp_outliers
    outdf = pd.DataFrame(outliers)

    data = pd.concat({'q1': q1,
                      'q2': q2,
                      'q3': q3,
                      'upper': upper,
                      'lower': lower,
                      'outliers': outdf.T,
                      'mfi': mfi}, keys=['q1', 'q2', 'q3', 'upper', 'lower', 'outliers', 'mfi'])

    with open(output_json, "w") as js_all:
        js_all.write(panel_to_json_string(data))

    return resampled


# html generation
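# gen_flow_overview drives the whole report: it computes the overview
# statistics (refusing inputs with more than 40 populations), optionally
# annotates FLOCK profiles through flowCL, renders the Jinja2 template to the
# output HTML file, writes the tab-separated flow.sample / flow.mfi /
# flow.mfi_pop files plus boxplotData.json, draws the pairwise scatter-plot
# PNGs, and builds the flow.overview table that displays them.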
def gen_flow_overview(args):
    flow_stats = gen_overview_stats(args.input_file)
    if len(set(flow_stats['population'])) > 40:
        nbpop = str(len(set(flow_stats['population'])))
        sys.stderr.write("There are " + nbpop + " populations in the input file; a maximum of 40 is supported.\n")
        sys.exit(3)

    os.mkdir(args.output_directory)
    html_template = "genOverview.template"

    if args.scores:
        translate_profiles(args.scores, args.tool_directory, args.output_directory)
        html_template = "genOverviewCL.template"

    env = Environment(loader=FileSystemLoader(args.tool_directory + "/templates"))
    template = env.get_template(html_template)

    real_directory = args.output_directory.replace("/job_working_directory", "")
    context = {'outputDirectory': real_directory}
    overview = template.render(**context)
    with open(args.output_file, "w") as f:
        f.write(overview)

    flow_sample_file_name = args.output_directory + "/flow.sample"
    with open(flow_sample_file_name, "w") as flow_sample_file:
        flow_stats['sample'].to_csv(flow_sample_file, sep="\t", index=False, float_format='%.0f')

    flow_mfi_file_name = args.output_directory + "/flow.mfi"
    with open(flow_mfi_file_name, "w") as flow_mfi_file:
        flow_stats[args.mfi_calc].to_csv(flow_mfi_file, sep="\t", float_format='%.0f')

    mpop = "_".join([args.mfi_calc, "pop"])
    flow_mfi_pop_file_name = args.output_directory + "/flow.mfi_pop"
    with open(flow_mfi_pop_file_name, "w") as flow_mfi_pop_file:
        flow_stats[mpop].to_csv(flow_mfi_pop_file, sep="\t", index=False, float_format="%.2f")

    # box plot data
    boxplot_data = args.output_directory + "/boxplotData.json"
    resampled = get_boxplot_stats(args.input_file, flow_mfi_file_name, boxplot_data)

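    # One 90x90 px scatter plot is produced for every ordered pair of markers,
    # coloured by population through color_palette and saved in the output
    # directory as m<i>_m<j>_90X90.png; flow.overview below arranges these
    # images in an HTML table.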
    # Generate the Images -- eventually we should change that over to D3
    fcm = flow_stats['sample_data'].values
    colors = []
    for i, j in enumerate(flow_stats['sample_population']):
        colors.append(color_palette[j])

    for i in range(flow_stats['columns']):
        for j in range(flow_stats['columns']):
            file_name = "m" + str(i) + "_m" + str(j)
            ax = plt.subplot(1, 1, 1)
            plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=1.0, wspace=0.0, hspace=0.0)
            plt.scatter(fcm[:, i], fcm[:, j], s=1, c=colors, edgecolors='none')
            plt.axis([0, 1024, 0, 1024])
            plt.xticks([])
            plt.yticks([])
            F = plt.gcf()
            F.set_size_inches(1, 1)
            F.set_dpi(90)
            png_file = file_name + "_90X90.png"
            F.savefig(args.output_directory + "/" + png_file)
            plt.clf()

    flow_overview_file_name = args.output_directory + "/flow.overview"
    with open(flow_overview_file_name, "w") as flow_overview_file:
        flow_overview_file.write("<table>\n")
        flow_overview_file.write("<tr><td> </td>\n")
        for i in range(flow_stats['columns']):
            flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
        flow_overview_file.write("</tr>\n")

        for i in range(flow_stats['columns']):
            flow_overview_file.write("<tr>\n")
            flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
            for j in range(flow_stats['columns']):
                file_name = "m" + str(j) + "_m" + str(i)
                image_file = file_name + "_90X90.png"
                flow_overview_file.write('<td><img src="' + image_file + '"/></td>')
            flow_overview_file.write("</tr>\n")

        flow_overview_file.write("</table>\n</body>\n</html>\n")

    if resampled:
        to_find = '<div id="outlierWarning" style="display:none;">'
        to_replace = '<div id="outlierWarning">'
        ## yay python 2.7
        ro = fileinput.input(args.output_file, inplace=True, backup=".bak")
        for roline in ro:
            print(roline.replace(to_find, to_replace), end='')
        ro.close()


if __name__ == "__main__":
    parser = ArgumentParser(
        prog="genOverview",
        description="Generate an overview plot of Flow results.")

    parser.add_argument(
        '-i',
        dest="input_file",
        required=True,
        help="File location for the Flow Text file.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="File location for the HTML output file.")

    parser.add_argument(
        '-d',
        dest="output_directory",
        required=True,
        help="Directory location for the Flow Plot.")

    parser.add_argument(
        '-M',
        dest="mfi_calc",
        required=True,
        help="What to calculate for centroids.")

    parser.add_argument(
        '-p',
        dest="scores",
        help="File location for FLOCK population scores.")

    parser.add_argument(
        '-t',
        dest="tool_directory",
        required=True,
        help="Location of the Tool Directory.")

    args = parser.parse_args()

    gen_flow_overview(args)
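# Example invocation (illustrative only: the file names, output paths and the
# "MFI" centroid statistic below are assumptions, not values mandated by the
# tool):
#   python genFlowOverview.py -i flow_text.txt -o overview.html \
#       -d overview_dir -M MFI -t /path/to/flow_overview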