# HG changeset patch
# User azomics
# Date 1595939556 14400
# Node ID a64ece32a01a1e8686220d8fed3a10a7f8cb3b75
# Parent bca68066a9573eef0afaa7b43a116cc70ab78a5f
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cs_overview commit a46097db0b6056e1125237393eb6974cfd51eb41"

diff -r bca68066a957 -r a64ece32a01a crossSampleOverview.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crossSampleOverview.py	Tue Jul 28 08:32:36 2020 -0400
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+
+######################################################################
+# Copyright (c) 2016 Northrop Grumman.
+# All rights reserved.
+######################################################################
+
+# version 1.1 -- August 2017
+# added checks for consistency between input files
+# and an upper limit on the number of clusters to look at
+
+from __future__ import print_function
+import sys
+import os
+import logging
+import fileinput
+import pandas as pd
+from argparse import ArgumentParser
+from jinja2 import Environment, FileSystemLoader
+from shutil import copyfile
+from collections import defaultdict
+
+
+def check_pops(mfi_file, stat1):
+    # sanity check: the MFI file and the first stats file must describe the same populations
+    df = pd.read_table(mfi_file)
+    df1 = pd.read_table(stat1)
+    nb_pop = len(set(df.Population))
+    nb_pop1 = len(df1.columns) - 2
+    if (nb_pop > 40):
+        sys.stderr.write("There are " + str(nb_pop) + " populations in the input file; a maximum of 40 is supported.")
+        sys.exit(1)
+    if (nb_pop != nb_pop1):
+        sys.exit(2)
+
+
+def panel_to_json_string(df):
+    # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs
+    def __merge_stream(key, stream):
+        return '"' + key + '"' + ': ' + stream + ', '
+    try:
+        if 'Unnamed: 0' in df.columns:
+            df = df.drop(['Unnamed: 0'], axis=1)
+        stream = '{'
+        for index, subdf in df.groupby(level=0):
+            stream += __merge_stream(index, df.loc[index, :, :].droplevel(0).to_json())
+        # take out the trailing comma and space
+        stream = stream[:-2]
+        # add the closing brace
+        stream += '}'
+    except:
+        logging.exception('Panel Encoding did not work')
+    return stream
+
+
+def get_outliers(group, upper, lower):
+    cat = group.name
+    out = {}
+    for marker in group:
+        # skip Population: upper and lower don't contain it since they were computed after a groupby on Population
+        if marker != 'Population':
+            out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker]
+    return out
+
+
+def get_boxplot_stats(all_data, mfi_file, output_json):
+    # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html
+    # Get initial MFI values
+    mfi = pd.read_table(mfi_file)
+    mfi = mfi.set_index('Population')
+
+    df = pd.read_table(all_data)
+    # check whether some populations are missing from the cs_files
+    missing_pop = [x for x in mfi.index if x not in set(df.Population)]
+
+    if (missing_pop):
+        zeros = {}
+        for m in df.columns:
+            zeros[m] = [0 for x in missing_pop]
+        tmpdf = pd.DataFrame(zeros)
+        tmpdf.Population = missing_pop
+        df = df.append(tmpdf)
+
+    pops = df.groupby('Population')
+    q1 = pops.quantile(q=0.25)
+    q2 = pops.quantile(q=0.5)
+    q3 = pops.quantile(q=0.75)
+    iqr = q3 - q1
+    upper = q3 + 1.5*iqr
+    lower = q1 - 1.5*iqr
+    resampled = False
+    # get outliers
+    out = pops.apply(get_outliers, upper, lower).dropna()
+    outliers = defaultdict(dict)
+    for population in set(df.Population):
+        for marker in df.columns:
+            if marker != 'Population':
+                tmp_outliers = list(out[population][marker])
+                if (len(list(out[population][marker])) > 100):
+                    # cap the number of outliers reported per marker at 100
+                    tmp_outliers = list(out[population][marker].sample(n=100))
+                    resampled = True
+                outliers[population][marker] = tmp_outliers
+    outdf = pd.DataFrame(outliers)
+
+    data = pd.concat({'q1': q1,
+                      'q2': q2,
+                      'q3': q3,
+                      'upper': upper,
+                      'lower': lower,
+                      'outliers': outdf.T,
+                      'mfi': mfi}, keys=['q1', 'q2', 'q3', 'upper', 'lower', 'outliers', 'mfi'])
+
+    with open(output_json, "w") as js_all:
+        js_all.write(panel_to_json_string(data))
+
+    return resampled
+
+
+def cs_overview(input_file, input_mfi, init_mfi, output_file, output_dir, tools_dir, cs_files):
+    os.mkdir(output_dir)
+
+    env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
+    template = env.get_template("csOverview.template")
+
+    real_directory = output_dir.replace("/job_working_directory", "")
+    context = {'outputDirectory': real_directory}
+    overview = template.render(**context)
+    with open(output_file, "w") as outf:
+        outf.write(overview)
+
+    cs_overview_file = output_dir + "/csOverview.tsv"
+    copyfile(input_file, cs_overview_file)
+
+    cs_overview_mfis = output_dir + "/csAllMFIs.tsv"
+    copyfile(input_mfi, cs_overview_mfis)
+
+    # Get all the data to calculate quantiles, IQR and outliers.
+    tmp_all_data = "csAllData.tsv"
+    with open(tmp_all_data, "a") as alldata:
+        # assumes the files have been run through flock and CS and therefore have the same headers
+        df1 = pd.read_table(cs_files[0])
+        df1.to_csv(alldata, sep="\t", header=True, index=False)
+        for i in range(1, len(cs_files)):
+            df = pd.read_table(cs_files[i])
+            df.to_csv(alldata, sep="\t", header=False, index=False)
+
+    cs_boxplot_data = output_dir + "/csBoxplotData.json"
+    resampled = get_boxplot_stats(tmp_all_data, init_mfi, cs_boxplot_data)
+    if resampled:
+        to_find = '
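
For reference, the statistics serialized to csBoxplotData.json follow the standard boxplot convention used in get_boxplot_stats(): per-population quartiles, whiskers at 1.5 x IQR, and values beyond the whiskers reported as outliers (downsampled to 100 per marker when needed). The short sketch below illustrates that logic on a small, made-up data frame; the 'CD4' marker and its values are invented for illustration only and are not part of the patch above.

    # minimal sketch of the quartile/IQR/outlier logic, on hypothetical data
    import pandas as pd

    df = pd.DataFrame({
        'Population': [1, 1, 1, 1, 2, 2, 2, 2],
        'CD4':        [10, 12, 11, 90, 55, 54, 53, 52],  # 90 is far from the rest of population 1
    })

    pops = df.groupby('Population')
    q1 = pops.quantile(q=0.25)
    q3 = pops.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr   # upper whisker per population
    lower = q1 - 1.5 * iqr   # lower whisker per population

    # values outside the whiskers are the outliers, e.g. for population 1:
    cd4 = df[df.Population == 1]['CD4']
    print(list(cd4[(cd4 > upper.loc[1, 'CD4']) | (cd4 < lower.loc[1, 'CD4'])]))  # -> [90]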