immport-devteam / flow_overview: comparison view of genFlowOverview.py @ 1:b5453d07f740 (draft, default, tip)
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
| | |
|---|---|
| author | azomics |
| date | Wed, 29 Jul 2020 17:03:53 -0400 |
| parents | |
| children | |
Comparing revisions 0:8283ff163ba6 and 1:b5453d07f740:
```python
#!/usr/bin/env python
######################################################################
# Copyright (c) 2016 Northrop Grumman.
# All rights reserved.
######################################################################

# version 1.1 - August 2017
# added upper limit to nb of clusters (40)
#
import sys
import os
import pandas as pd
import logging
import fileinput

from argparse import ArgumentParser
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict

from color_palette import color_palette
from flowstatlib import gen_overview_stats
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


profile_key = {
    "1": "-",
    "2": "lo",
    "3": "+",
    "4": "hi"
}

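# Illustration (added note, not in the original script): generate_flowCL_query()
# below concatenates each marker name with its profile_key code, skipping
# FSC/SSC channels. For a hypothetical FLOCK profile row where CD3 is "4" and
# CD4 is "3", the query passed to flowCL would be "CD3hiCD4+".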

# flow CL functions
def run_flowCL(phenotype, output_txt, output_pdf, tool):
    run_command = " ".join(["Rscript --slave --vanilla", tool, output_txt,
                            phenotype])
    os.system(run_command)

    get_graph = " ".join(["mv flowCL_results/*.pdf", output_pdf])
    os.system(get_graph)
    return


def generate_flowCL_query(list_markers, list_types):
    if (len(list_markers) != len(list_types)):
        return("problem with headers")
    query = []
    # go through both lists, remove fsc/ssc
    for i in range(1, len(list_markers)):
        if not list_markers[i].startswith("FSC") and not list_markers[i].startswith("SSC"):
            query.append(list_markers[i].upper())
            query.append(profile_key[list_types[i]])
    # return concatenated string
    return("".join(query))


def translate_profiles(input_file, tool_dir, html_dir):
    tool = "/".join([tool_dir, "getOntology.R"])
    html_table = "".join([html_dir, "/CLprofiles.txt"])
    score_table = "".join(["cp ", input_file, " ", html_dir, "/scores.txt"])
    os.system(score_table)

    # read profile
    with open(input_file, "r") as flock_profiles, open(html_table, "w") as out:
        headers = flock_profiles.readline()
        headers = headers.strip()
        markers = headers.split("\t")[:-2]
        counter = 0

        out.write("Population\tFlowCL Query\tNb Results\tLink to PDF\t")
        out.write("Top Result Label\tTop Result Score\tTop Result CL\n")
        queries = {}
        # create marker query for each population
        for lines in flock_profiles:
            lines = lines.strip("\n")
            pop_profile = lines.split("\t")[:-2]
            flowcl_query = generate_flowCL_query(markers, pop_profile)
            counter += 1
            nb_results = "0"
            top_label = "no_match"
            top_score = "NA"
            top_CL = "NA"
            pdf_link = "NA"

            # check if query was run before
            if flowcl_query not in queries:
                # create filenames for results & graphs
                txt = "".join(["flowcl_pop", str(counter).zfill(2), ".txt"])
                text_result = "/".join([html_dir, txt])
                graph = "".join(["flowcl_pop", str(counter).zfill(2), ".pdf"])
                graph_output = "/".join([html_dir, graph])
                # run flowCL for each marker profile
                run_flowCL(flowcl_query, text_result, graph_output, tool)

                # check that the text file exists; if it does not, the results stay as NAs
                if os.path.isfile(text_result):
                    with open(text_result, "r") as res:
                        for line in res:
                            if line.startswith("Score"):
                                data = line.split(") ")
                                top_score = data[2][:-2]
                                tot_results = len(data) - 2
                                nb_results = str(tot_results)
                                if tot_results == 5:
                                    if len(data[6].split("+")) > 1:
                                        nb_results = "5+"
                            elif line.startswith("Cell ID"):
                                prep_link = line.split(") ")[1][:-2]
                                cl = prep_link.replace("_", ":")
                                link = "".join(['<a href="http://www.immport-labs.org/immport-ontology/public/home/home/', cl, '" target="_blank">'])
                                top_CL = "".join([link, prep_link, "</a>"])
                            elif line.startswith("Cell Label"):
                                top_label = line.split(") ")[1][:-2]
                                pdf_link = "".join(['<a href="', graph, '" target="_blank">PDF</a>'])
                tmpflowcl_query = "".join(['<a href="', txt, '" target="_blank">', flowcl_query, '</a>'])
                queries[flowcl_query] = {
                    "query": tmpflowcl_query,
                    "results": nb_results,
                    "pdf": pdf_link,
                    "label": top_label,
                    "score": top_score,
                    "CL": top_CL
                }

            # write query results to CLprofiles.txt
            out.write("\t".join([pop_profile[0],
                                 queries[flowcl_query]["query"],
                                 queries[flowcl_query]["results"],
                                 queries[flowcl_query]["pdf"],
                                 queries[flowcl_query]["label"],
                                 queries[flowcl_query]["score"],
                                 queries[flowcl_query]["CL"]]) + "\n")


# boxplots data massaging
def panel_to_json_string(df):
    # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
    def __merge_stream(key, stream):
        return '"' + key + '"' + ': ' + stream + ', '
    # initialise before the try block so the return below cannot hit an
    # unbound variable if the encoding fails early
    stream = '{'
    try:
        if 'Unnamed: 0' in df.columns:
            df = df.drop(['Unnamed: 0'], axis=1)
        for index, subdf in df.groupby(level=0):
            stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
        # take out extra last comma
        stream = stream[:-2]
        # add the closing brace
        stream += '}'
    except Exception:
        logging.exception('Panel Encoding did not work')
    return stream

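# Note (added for clarity, not in the original script): for the MultiIndex
# frame built in get_boxplot_stats() below, panel_to_json_string() returns one
# JSON object per level-0 key, e.g. '{"q1": {...}, "q2": {...}, ..., "mfi": {...}}',
# where each sub-object is that statistic's sub-frame serialised by
# DataFrame.to_json() (marker -> population -> value).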

def get_outliers(group, upper, lower):
    cat = group.name
    out = {}
    for marker in group:
        # skip the Population column: upper and lower were computed after a
        # groupby on Population, so they do not contain it
        if marker != 'Population':
            out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
    return out


def get_boxplot_stats(all_data, mfi_file, output_json):
    # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
    # Get initial MFI values
    mfi = pd.read_table(mfi_file)
    mfi = mfi.set_index('Population')

    df = pd.read_table(all_data)
    # check whether some populations in the MFI file are missing from the event-level data
    missing_pop = [x for x in mfi.index if x not in set(df.Population)]

    if (missing_pop):
        zeros = {}
        for m in df.columns:
            zeros[m] = [0 for x in missing_pop]
        tmpdf = pd.DataFrame(zeros)
        tmpdf.Population = missing_pop
        df = df.append(tmpdf)

    pops = df.groupby('Population')
    q1 = pops.quantile(q=0.25)
    q2 = pops.quantile(q=0.5)
    q3 = pops.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr
    resampled = False
    # get outliers
    out = pops.apply(get_outliers, upper, lower).dropna()
    outliers = defaultdict(dict)
    for population in set(df.Population):
        for marker in df.columns:
            if marker != 'Population':
                tmp_outliers = list(out[population][marker])
                if (len(tmp_outliers) > 100):
                    tmp_outliers = list(out[population][marker].sample(n=100))
                    resampled = True
                outliers[population][marker] = tmp_outliers
    outdf = pd.DataFrame(outliers)

    data = pd.concat({'q1': q1,
                      'q2': q2,
                      'q3': q3,
                      'upper': upper,
                      'lower': lower,
                      'outliers': outdf.T,
                      'mfi': mfi}, keys=['q1', 'q2', 'q3', 'upper', 'lower', 'outliers', 'mfi'])

    with open(output_json, "w") as js_all:
        js_all.write(panel_to_json_string(data))

    return resampled

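# Worked example (added note, not in the original script): with q1 = 200 and
# q3 = 400 for some marker, IQR = 200, so the whiskers are lower = -100 and
# upper = 700; events outside that range are reported as outliers, capped at
# 100 randomly sampled values per population/marker (resampled flag set).
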
# html generation
def gen_flow_overview(args):
    flow_stats = gen_overview_stats(args.input_file)
    if len(set(flow_stats['population'])) > 40:
        nbpop = str(len(set(flow_stats['population'])))
        sys.stderr.write("There are " + nbpop + " populations in the input file; the maximum supported is 40.\n")
        sys.exit(3)

    os.mkdir(args.output_directory)
    html_template = "genOverview.template"

    if args.scores:
        translate_profiles(args.scores, args.tool_directory, args.output_directory)
        html_template = "genOverviewCL.template"

    env = Environment(loader=FileSystemLoader(args.tool_directory + "/templates"))
    template = env.get_template(html_template)

    real_directory = args.output_directory.replace("/job_working_directory", "")
    context = {'outputDirectory': real_directory}
    overview = template.render(**context)
    with open(args.output_file, "w") as f:
        f.write(overview)

    flow_sample_file_name = args.output_directory + "/flow.sample"
    with open(flow_sample_file_name, "w") as flow_sample_file:
        flow_stats['sample'].to_csv(flow_sample_file, sep="\t", index=False, float_format='%.0f')

    flow_mfi_file_name = args.output_directory + "/flow.mfi"
    with open(flow_mfi_file_name, "w") as flow_mfi_file:
        flow_stats[args.mfi_calc].to_csv(flow_mfi_file, sep="\t", float_format='%.0f')

    mpop = "_".join([args.mfi_calc, "pop"])
    flow_mfi_pop_file_name = args.output_directory + "/flow.mfi_pop"
    with open(flow_mfi_pop_file_name, "w") as flow_mfi_pop_file:
        flow_stats[mpop].to_csv(flow_mfi_pop_file, sep="\t", index=False, float_format="%.2f")

    # box plot data
    boxplot_data = args.output_directory + "/boxplotData.json"
    resampled = get_boxplot_stats(args.input_file, flow_mfi_file_name, boxplot_data)

    # Generate the Images -- eventually we should change that over to D3
    fcm = flow_stats['sample_data'].values
    colors = []
    for i, j in enumerate(flow_stats['sample_population']):
        colors.append(color_palette[j])

    for i in range(flow_stats['columns']):
        for j in range(flow_stats['columns']):
            file_name = "m" + str(i) + "_m" + str(j)
            ax = plt.subplot(1, 1, 1)
            plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=1.0, wspace=0.0, hspace=0.0)
            plt.scatter(fcm[:, i], fcm[:, j], s=1, c=colors, edgecolors='none')
            plt.axis([0, 1024, 0, 1024])
            plt.xticks([])
            plt.yticks([])
            F = plt.gcf()
            F.set_size_inches(1, 1)
            F.set_dpi(90)
            png_file = file_name + "_90X90.png"
            F.savefig(args.output_directory + "/" + png_file)
            plt.clf()

    flow_overview_file_name = args.output_directory + "/flow.overview"
    with open(flow_overview_file_name, "w") as flow_overview_file:
        flow_overview_file.write("<table>\n")
        flow_overview_file.write("<tr><td> </td>\n")
        for i in range(flow_stats['columns']):
            flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
        flow_overview_file.write("</tr>\n")

        for i in range(flow_stats['columns']):
            flow_overview_file.write("<tr>\n")
            flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n")
            for j in range(flow_stats['columns']):
                file_name = "m" + str(j) + "_m" + str(i)
                image_file = file_name + "_90X90.png"
                flow_overview_file.write('<td><img src="' + image_file + '"/></td>')
            flow_overview_file.write("</tr>\n")

        flow_overview_file.write("</table>\n</body>\n</html>\n")

    if resampled:
        to_find = '<div id="outlierWarning" style="display:none;">'
        to_replace = '<div id="outlierWarning">'
        # edit the generated HTML in place to un-hide the outlier warning
        ro = fileinput.input(args.output_file, inplace=True, backup=".bak")
        for roline in ro:
            print(roline.replace(to_find, to_replace), end='')
        ro.close()


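# Summary (added note, not in the original script): gen_flow_overview() writes
# flow.sample, flow.mfi, flow.mfi_pop, boxplotData.json, one m<i>_m<j>_90X90.png
# scatter per marker pair and flow.overview (an HTML table of those images)
# into args.output_directory, and the rendered Jinja2 template into
# args.output_file.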
if __name__ == "__main__":
    parser = ArgumentParser(
        prog="genOverview",
        description="Generate an overview plot of Flow results.")

    parser.add_argument(
        '-i',
        dest="input_file",
        required=True,
        help="File location for the Flow Text file.")

    parser.add_argument(
        '-o',
        dest="output_file",
        required=True,
        help="File location for the HTML output file.")

    parser.add_argument(
        '-d',
        dest="output_directory",
        required=True,
        help="Directory location for the Flow Plot.")

    parser.add_argument(
        '-M',
        dest="mfi_calc",
        required=True,
        help="What to calculate for centroids.")

    parser.add_argument(
        '-p',
        dest="scores",
        help="File location for FLOCK population scores.")

    parser.add_argument(
        '-t',
        dest="tool_directory",
        required=True,
        help="Location of the Tool Directory.")

    args = parser.parse_args()

    gen_flow_overview(args)
```

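For reference, a minimal usage sketch (not part of the repository) is shown below. The file names and the `mfi_calc` value are hypothetical; the attribute names mirror the argparse flags defined above (`-i`, `-o`, `-d`, `-M`, `-p`, `-t`), and the script expects `color_palette.py`, `flowstatlib.py` and a `templates/` directory alongside it in the tool directory.

```python
# Hypothetical invocation sketch; paths and the mfi_calc value are assumptions.
from argparse import Namespace

args = Namespace(
    input_file="flow.flowtext",      # tab-separated events with a Population column
    output_file="overview.html",     # receives the rendered genOverview(.CL).template
    output_directory="flowplots",    # created by gen_flow_overview(); must not already exist
    mfi_calc="mfi",                  # key looked up in gen_overview_stats() output (assumed)
    scores=None,                     # or a FLOCK score/profile file to trigger flowCL queries
    tool_directory="/path/to/flowtools/flow_overview",
)
gen_flow_overview(args)
```

The command-line equivalent would be `python genFlowOverview.py -i flow.flowtext -o overview.html -d flowplots -M mfi -t /path/to/flowtools/flow_overview`.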