Mercurial > repos > immport-devteam > cross_sample
diff runCrossSample.py @ 4:e80b0f62ffb3 draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cross_sample commit e7eab2dca0c1f73f580362f61425a78d4c8892ce"
author: azomics
date: Wed, 29 Jul 2020 13:32:17 -0400
parents: (none)
children: (none)
line wrap: on
line diff
#!/usr/bin/env python
######################################################################
# Copyright (c) 2016 Northrop Grumman.
# All rights reserved.
######################################################################
import os
import subprocess
import sys
from argparse import ArgumentParser
from collections import defaultdict

import pandas as pd
from scipy.stats import gmean

#
# version 1.1 -- April 2016 -- C. Thomas
# modified to read in several input files and output to a directory
# + generates summary statistics
# also checks before running that input files are consistent with centroid file
#


def compare_MFIs(input_files, f_names, mfi_file):
    """Exit with status 2 unless every flow file's header line matches the
    marker columns of the FLOCK centroid (MFI) file.

    input_files -- list of flow text file paths
    f_names     -- display names parallel to input_files (used in error text)
    mfi_file    -- centroid file path; its first column is the population id
                   and is skipped when building the reference header
    """
    with open(mfi_file, "r") as mfi_check:
        mfi_fl = mfi_check.readline().split("\t")
        # Drop the leading population column; the trailing "\n" is kept on the
        # last field so the string compares equal to a raw readline() below.
        header_MFIs = "\t".join(mfi_fl[1:])

    flag_error = False
    for hh, files in enumerate(input_files):
        with open(files, "r") as inf:
            hdrs = inf.readline()
            if hdrs != header_MFIs:
                sys.stderr.write(hdrs + "headers in " + f_names[hh] + " are not consistent with FLOCK centroid file:\n" + header_MFIs + "\n")
                flag_error = True
    if flag_error:
        sys.exit(2)


def stats_MFIs(cs_df, ctr, mfi_calc):
    """Summarize one clustered sample per population.

    cs_df    -- DataFrame of marker values plus a 'Population' column
    ctr      -- 1-based sample counter, used to build the SampleName label
    mfi_calc -- "mfi" (mean), "gmfi" (geometric mean); anything else -> median

    Returns a DataFrame indexed by population with the per-population marker
    statistic, plus 'Percentage', 'Population' and 'SampleName' columns.
    """
    grouped = cs_df.groupby('Population')
    if mfi_calc == "mfi":
        # numeric_only keeps behavior stable across pandas versions
        MFIs = grouped.mean(numeric_only=True).round(decimals=2)
    elif mfi_calc == "gmfi":
        MFIs = grouped.agg(lambda x: gmean(list(x))).round(decimals=2)
    else:
        MFIs = grouped.median(numeric_only=True).round(decimals=2)
    # Population frequencies as percentages, aligned to the grouped index.
    pop_freq = (cs_df.Population.value_counts(normalize=True) * 100).round(decimals=2)
    MFIs['Percentage'] = pop_freq.sort_index()
    MFIs['Population'] = MFIs.index
    MFIs['SampleName'] = "".join(["Sample", str(ctr).zfill(2)])
    return MFIs


def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc):
    """Write per-sample population proportions and per-population MFI stats.

    input_files  -- dict {clustered output path: display name} (see
                    run_cross_sample's `outputs`)
    summary_stat -- path of the population-proportion summary to write
    mfi_stats    -- path of the MFI statistics file to append to
    marker_names -- tab-separated header whose first field is dropped
    mfi_calc     -- statistic selector passed through to stats_MFIs
    """
    mrk = marker_names.strip().split("\t")
    markers = "\t".join(mrk[1:])

    pop_count = defaultdict(dict)
    tot = {}
    nb_pop = 0
    with open(mfi_stats, "a") as mfis:
        mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n")
        for ctr_mfi, files in enumerate(input_files, start=1):
            cs = pd.read_csv(files, sep="\t")
            tot[files] = len(cs.index)
            for pops in cs.Population:
                pop_count[files][pops] = pop_count[files].get(pops, 0) + 1
            # Track the highest population id seen across all samples.
            nb_pop = max(nb_pop, max(cs.Population))
            cs_stats = stats_MFIs(cs, ctr_mfi, mfi_calc)
            cs_stats.to_csv(mfis, sep="\t", header=False, index=False)

    with open(summary_stat, "w") as outf:
        cols = "\t".join(str(x) for x in range(1, nb_pop + 1))
        outf.write("FileID\tSampleName\t" + cols + "\n")
        for ctr, eachfile in enumerate(pop_count, start=1):
            tmp = []
            for num in range(1, nb_pop + 1):
                # Populations absent from this sample contribute 0%.
                count = pop_count[eachfile].get(num, 0)
                tmp.append(str((count / float(tot[eachfile])) * 100))
            props = "\t".join(tmp)
            sample_name = "".join(["Sample", str(ctr).zfill(2)])
            outf.write("\t".join([input_files[eachfile], sample_name, props]) + "\n")


def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat,
                     mfi_stats, mfi_calc):
    """Run cent_adjust on each flow file against the centroid file, merge the
    resulting population ids back into each file, then emit summary stats.

    NOTE(review): relies on the external `cent_adjust` binary being on PATH
    and on it writing `population_id.txt` (one id per event) to the cwd.
    """
    # Strip the header off the centroid file: cent_adjust wants it headerless.
    with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out:
        markers = mfi_in.readline().strip("\n")
        for line in mfi_in:
            mfi_out.write(line)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    outputs = {}
    for nm, flow_file in enumerate(input_files):
        # Argument-list form (no shell) is robust to spaces in file paths.
        run_command = ["cent_adjust", "mfi.txt", flow_file]
        print(" ".join(run_command))
        subprocess.run(run_command)
        outfile = os.path.join(output_dir, f_names[nm] + ".flowclr")
        outputs[outfile] = f_names[nm]
        # Append the per-event population id as a new last column.
        with open(flow_file, "r") as flowf, \
                open("population_id.txt", "r") as popf, \
                open(outfile, "w") as outf:
            outf.write(flowf.readline().rstrip() + "\tPopulation\n")
            for line in flowf:
                outf.write(line.rstrip() + "\t" + popf.readline().rstrip() + "\n")
    get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc)
    return


def generate_CS_stats(mfi_stats, all_stats):
    """Write cross-sample mean/median/stdev per population for each marker.

    mfi_stats -- path of the per-sample MFI statistics table (read)
    all_stats -- path of the aggregated statistics file to write
    """
    df = pd.read_csv(mfi_stats, sep="\t")
    by_pop = df.groupby('Population')
    # numeric_only=True: 'SampleName' is a string column; without it pandas>=2
    # raises instead of silently dropping the column as older pandas did.
    means = by_pop.mean(numeric_only=True).round(decimals=2)
    medians = by_pop.median(numeric_only=True).round(decimals=2)
    stdev = by_pop.std(numeric_only=True).round(decimals=2)
    with open(mfi_stats, "r") as ms:
        # Header is markers... + Percentage + Population + SampleName; keep
        # everything except the last two label columns.
        all_markers = ms.readline().strip().split("\t")[0:-2]

    with open(all_stats, "w") as mstats:
        hdgs = ["\t".join(["_".join([mrs, "mean"]), "_".join([mrs, "median"]), "_".join([mrs, "stdev"])]) for mrs in all_markers]
        mstats.write("Population\t")
        mstats.write("\t".join(hdgs) + "\n")
        # sorted() makes the row order deterministic across Python builds.
        for pops in sorted(set(df.Population)):
            tmp_line = ["\t".join([str(means.loc[pops, mar]), str(medians.loc[pops, mar]), str(stdev.loc[pops, mar])]) for mar in all_markers]
            mstats.write(str(pops) + "\t")
            mstats.write("\t".join(tmp_line) + "\n")


if __name__ == "__main__":
    parser = ArgumentParser(
        prog="runCrossSample",
        description="Run CrossSample on Flow file")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File locations for flow text files.")

    parser.add_argument(
        '-n',
        dest="filenames",
        required=True,
        action='append',
        help="Filenames")

    parser.add_argument(
        '-m',
        dest="mfi",
        required=True,
        help="File location for the MFI text file.")

    parser.add_argument(
        '-o',
        dest="out_path",
        required=True,
        help="Path to the directory for the output files.")

    parser.add_argument(
        '-M',
        dest="mfi_calc",
        required=True,
        help="what to calculate for centroids.")

    parser.add_argument(
        '-s',
        dest="sstat",
        required=True,
        help="File location for the summary statistics.")

    parser.add_argument(
        '-S',
        dest="mfi_stat",
        required=True,
        help="File location for the MFI summary statistics.")

    parser.add_argument(
        '-a',
        dest="all_stats",
        required=True,
        help="File location for stats on all markers.")

    args = parser.parse_args()

    input_files = list(args.input_files)
    input_names = list(args.filenames)
    compare_MFIs(input_files, input_names, args.mfi)
    run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.mfi_calc)
    generate_CS_stats(args.mfi_stat, args.all_stats)