Mercurial > repos > immport-devteam > flow_overview
view genFlowOverview.py @ 1:b5453d07f740 draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
author | azomics |
---|---|
date | Wed, 29 Jul 2020 17:03:53 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ###################################################################### # Copyright (c) 2016 Northrop Grumman. # All rights reserved. ###################################################################### # version 1.1 - August 2017 # added upper limit to nb of clusters (40) # import sys import os import pandas as pd import logging import fileinput from argparse import ArgumentParser from jinja2 import Environment, FileSystemLoader from collections import defaultdict from color_palette import color_palette from flowstatlib import gen_overview_stats import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt profile_key = { "1": "-", "2": "lo", "3": "+", "4": "hi" } # flow CL functions def run_flowCL(phenotype, output_txt, output_pdf, tool): run_command = " ". join(["Rscript --slave --vanilla", tool, output_txt, phenotype]) os.system(run_command) get_graph = " ".join(["mv flowCL_results/*.pdf", output_pdf]) os.system(get_graph) return def generate_flowCL_query(list_markers, list_types): if (len(list_markers) != len(list_types)): return("pb with headers") query = [] # go through both lists, remove fsc/ssc for i in range(1, len(list_markers)): if not list_markers[i].startswith("FSC") and not list_markers[i].startswith("SSC"): query.append(list_markers[i].upper()) query.append(profile_key[list_types[i]]) # return concatenated string return("".join(query)) def translate_profiles(input_file, tool_dir, html_dir): tool = "/".join([tool_dir, "getOntology.R"]) html_table = "".join([html_dir, "/CLprofiles.txt"]) score_table = "".join(["cp ", input_file, " ", html_dir, "/scores.txt"]) os.system(score_table) # read profile with open(input_file, "r") as flock_profiles, open(html_table, "w") as out: headers = flock_profiles.readline() headers = headers.strip() markers = headers.split("\t")[:-2] counter = 0 out.write("Population\tFlowCL Query\tNb Results\tLink to PDF\t") out.write("Top Result Label\tTop Result Score\tTop Result CL\n") queries = {} # create marker query for each population for lines in flock_profiles: lines = lines.strip("\n") pop_profile = lines.split("\t")[:-2] flowcl_query = generate_flowCL_query(markers, pop_profile) counter += 1 nb_results = "0" top_label = "no_match" top_score = "NA" top_CL = "NA" pdf_link = "NA" # check if query was run before if flowcl_query not in queries: # create filenames for results & graphs txt = "".join(["flowcl_pop", str(counter).zfill(2), ".txt"]) text_result = "/".join([html_dir, txt]) graph = "".join(["flowcl_pop", str(counter).zfill(2), ".pdf"]) graph_output = "/".join([html_dir, graph]) # run flowCL for each marker profile run_flowCL(flowcl_query, text_result, graph_output, tool) # test that text file exists if not results are all NAs: if os.path.isfile(text_result): with open(text_result, "r") as res: for line in res: if line.startswith("Score"): data = line.split(") ") top_score = data[2][:-2] tot_results = len(data) - 2 nb_results = str(tot_results) if tot_results == 5: if len(data[6].split("+")) > 1: nb_results = "5+" elif line.startswith("Cell ID"): prep_link = line.split(") ")[1][:-2] cl = prep_link.replace("_", ":") link = "".join(['<a href="http://www.immport-labs.org/immport-ontology/public/home/home/', cl, '" target="_blank">']) top_CL = "".join([link, prep_link, "</a>"]) elif line.startswith("Cell Label"): top_label = line.split(") ")[1][:-2] pdf_link = "".join(['<a href="', graph, '" target="_blank">PDF</a>']) tmpflowcl_query = "".join(['<a href="', txt, '" target="_blank">', flowcl_query, '</a>']) queries[flowcl_query] = { "query": tmpflowcl_query, "results": nb_results, "pdf": pdf_link, "label": top_label, "score": top_score, "CL": top_CL } # write query results to CLprofiles.txt out.write("\t".join([pop_profile[0], queries[flowcl_query]["query"], queries[flowcl_query]["results"], queries[flowcl_query]["pdf"], queries[flowcl_query]["label"], queries[flowcl_query]["score"], queries[flowcl_query]["CL"]]) + "\n") # boxplots data massaging def panel_to_json_string(df): # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs def __merge_stream(key, stream): return '"' + key + '"' + ': ' + stream + ', ' try: if 'Unnamed: 0' in df.columns: df = df.drop(['Unnamed: 0'], axis=1) stream = '{' for index, subdf in df.groupby(level=0): stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json()) # take out extra last comma stream = stream[:-2] # add the final paren stream += '}' except: logging.exception('Panel Encoding did not work') return stream def get_outliers(group, upper, lower): cat = group.name out = {} for marker in group: # skip population since upper and lower don't contain it, since it was made after a group by Population if marker != 'Population': out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker] return out def get_boxplot_stats(all_data, mfi_file, output_json): # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html # Get initial MFI values mfi = pd.read_table(mfi_file) mfi = mfi.set_index('Population') df = pd.read_table(all_data) # check if ever some pops not in cs_files missing_pop = [x for x in mfi.index if x not in set(df.Population)] if (missing_pop): zeros = {} for m in df.columns: zeros[m] = [0 for x in missing_pop] tmpdf = pd.DataFrame(zeros) tmpdf.Population = missing_pop df = df.append(tmpdf) pops = df.groupby('Population') q1 = pops.quantile(q=0.25) q2 = pops.quantile(q=0.5) q3 = pops.quantile(q=0.75) iqr = q3 - q1 upper = q3 + 1.5*iqr lower = q1 - 1.5*iqr resampled = False # get outliers out = pops.apply(get_outliers, upper, lower).dropna() outliers = defaultdict(dict) for population in set(df.Population): for marker in df.columns: if marker != 'Population': tmp_outliers = list(out[population][marker]) if (len(list(out[population][marker]))> 100): tmp_outliers = list(out[population][marker].sample(n=100)) resampled = True outliers[population][marker] = tmp_outliers outdf = pd.DataFrame(outliers) data = pd.concat({'q1': q1, 'q2': q2, 'q3': q3, 'upper': upper, 'lower': lower, 'outliers': outdf.T, 'mfi': mfi}, keys=['q1','q2','q3','upper','lower','outliers','mfi']) with open(output_json, "w") as js_all: js_all.write(panel_to_json_string(data)) return resampled # html generation def gen_flow_overview(args): flow_stats = gen_overview_stats(args.input_file) if len(set(flow_stats['population'])) > 40: nbpop = str(len(set(flow_stats['population']))) sys.stderr.write("There are " + nbpop + " in the input file.") sys.exit(3) os.mkdir(args.output_directory) html_template = "genOverview.template" if args.scores: translate_profiles(args.scores, args.tool_directory, args.output_directory) html_template = "genOverviewCL.template" env = Environment(loader=FileSystemLoader(args.tool_directory + "/templates")) template = env.get_template(html_template) real_directory = args.output_directory.replace("/job_working_directory", "") context = {'outputDirectory': real_directory} overview = template.render(**context) with open(args.output_file, "w") as f: f.write(overview) flow_sample_file_name = args.output_directory + "/flow.sample" with open(flow_sample_file_name, "w") as flow_sample_file: flow_stats['sample'].to_csv(flow_sample_file, sep="\t", index=False, float_format='%.0f') flow_mfi_file_name = args.output_directory + "/flow.mfi" with open(flow_mfi_file_name, "w") as flow_mfi_file: flow_stats[args.mfi_calc].to_csv(flow_mfi_file, sep="\t", float_format='%.0f') mpop = "_".join([args.mfi_calc, "pop"]) flow_mfi_pop_file_name = args.output_directory + "/flow.mfi_pop" with open(flow_mfi_pop_file_name, "w") as flow_mfi_pop_file: flow_stats[mpop].to_csv(flow_mfi_pop_file, sep="\t", index=False, float_format="%.2f") # box plot data boxplot_data = args.output_directory + "/boxplotData.json" resampled = get_boxplot_stats(args.input_file, flow_mfi_file_name, boxplot_data) # Generate the Images -- eventually we should change that over to D3 fcm = flow_stats['sample_data'].values colors = [] for i, j in enumerate(flow_stats['sample_population']): colors.append(color_palette[j]) for i in range(flow_stats['columns']): for j in range(flow_stats['columns']): file_name = "m" + str(i) + "_m" + str(j) ax = plt.subplot(1, 1, 1) plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=1.0, wspace=0.0, hspace=0.0) plt.scatter(fcm[:, i], fcm[:, j], s=1, c=colors, edgecolors='none') plt.axis([0, 1024, 0, 1024]) plt.xticks([]) plt.yticks([]) F = plt.gcf() F.set_size_inches(1, 1) F.set_dpi(90) png_file = file_name + "_90X90.png" F.savefig(args.output_directory + "/" + png_file) plt.clf() flow_overview_file_name = args.output_directory + "/flow.overview" with open(flow_overview_file_name, "w") as flow_overview_file: flow_overview_file.write("<table>\n") flow_overview_file.write("<tr><td> </td>\n") for i in range(flow_stats['columns']): flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n") for i in range(flow_stats['columns']): flow_overview_file.write("<tr>\n") flow_overview_file.write("<td>" + flow_stats['markers'][i] + "</td>\n") for j in range(flow_stats['columns']): file_name = "m" + str(j) + "_m" + str(i) image_file = file_name + "_90X90.png" flow_overview_file.write('<td><img src="' + image_file + '"/></td>') flow_overview_file.write("</tr>\n") flow_overview_file.write("</table>\n</body>\n<html>\n") if resampled: to_find = '<div id="outlierWarning" style="display:none;">' to_replace = '<div id="outlierWarning">' ## yay python 2.7 ro = fileinput.input(args.output_file, inplace=True, backup=".bak") for roline in ro: print(roline.replace(to_find, to_replace), end='') ro.close() if __name__ == "__main__": parser = ArgumentParser( prog="genOverview", description="Generate an overview plot of Flow results.") parser.add_argument( '-i', dest="input_file", required=True, help="File location for the Flow Text file.") parser.add_argument( '-o', dest="output_file", required=True, help="File location for the HTML output file.") parser.add_argument( '-d', dest="output_directory", required=True, help="Directory location for the Flow Plot.") parser.add_argument( '-M', dest="mfi_calc", required=True, help="what to calculate for centroids.") parser.add_argument( '-p', dest="scores", help="File location for FLOCK population scores.") parser.add_argument( '-t', dest="tool_directory", required=True, help="Location of the Tool Directory.") args = parser.parse_args() gen_flow_overview(args)