Mercurial > repos > immport-devteam > cross_sample
view runCrossSample.py @ 4:e80b0f62ffb3 draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cross_sample commit e7eab2dca0c1f73f580362f61425a78d4c8892ce"
author | azomics |
---|---|
date | Wed, 29 Jul 2020 13:32:17 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ###################################################################### # Copyright (c) 2016 Northrop Grumman. # All rights reserved. ###################################################################### import sys import os from scipy.stats import gmean from argparse import ArgumentParser from collections import defaultdict import pandas as pd # # version 1.1 -- April 2016 -- C. Thomas # modified to read in several input files and output to a directory # + generates summary statistics # also checks before running that input files are consistent with centroid file # def compare_MFIs(input_files, f_names, mfi_file): header_MFIs = "" flag_error = False with open(mfi_file, "r") as mfi_check: mfi_fl = mfi_check.readline().split("\t") header_MFIs = "\t".join([mfi_fl[h] for h in range(1, len(mfi_fl))]) for hh, files in enumerate(input_files): with open(files, "r") as inf: hdrs = inf.readline() if hdrs != header_MFIs: sys.stderr.write(hdrs + "headers in " + f_names[hh] + " are not consistent with FLOCK centroid file:\n" + header_MFIs + "\n") flag_error = True if flag_error: sys.exit(2) def stats_MFIs(cs_df, ctr, mfi_calc): if mfi_calc == "mfi": MFIs = cs_df.groupby('Population').mean().round(decimals=2) elif mfi_calc == "gmfi": MFIs = cs_df.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2) else: MFIs = cs_df.groupby('Population').median().round(decimals=2) pop_freq = (cs_df.Population.value_counts(normalize=True) * 100).round(decimals=2) sorted_pop_freq = pop_freq.sort_index() MFIs['Percentage'] = sorted_pop_freq MFIs['Population'] = MFIs.index MFIs['SampleName'] = "".join(["Sample", str(ctr).zfill(2)]) return MFIs def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc): pop_count = defaultdict(dict) mrk = marker_names.strip().split("\t") markers = "\t".join([mrk[m] for m in range(1, len(mrk))]) ctr_mfi = 0 nb_pop = 0 tot = {} with open(mfi_stats, "a") as mfis: mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n") for files in input_files: cs = pd.read_table(files) tot[files] = len(cs.index) for pops in cs.Population: if pops in pop_count[files]: pop_count[files][pops] += 1 else: pop_count[files][pops] = 1 max_nb_pop = max(set(cs.Population)) if (max_nb_pop > nb_pop): nb_pop = max_nb_pop ctr_mfi += 1 cs_stats = stats_MFIs(cs, ctr_mfi, mfi_calc) cs_stats.to_csv(mfis, sep="\t", header=False, index=False) ctr = 0 with open(summary_stat, "w") as outf: itpop = [str(x) for x in range(1, nb_pop + 1)] cols = "\t".join(itpop) outf.write("FileID\tSampleName\t" + cols + "\n") for eachfile in pop_count: tmp = [] for num in range(1, nb_pop + 1): if num not in pop_count[eachfile]: pop_count[eachfile][num] = 0 tmp.append(str((pop_count[eachfile][num] / float(tot[eachfile])) * 100)) props = "\t".join(tmp) ctr += 1 sample_name = "".join(["Sample", str(ctr).zfill(2)]) outf.write("\t".join([input_files[eachfile], sample_name, props]) + "\n") def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat, mfi_stats, mfi_calc): markers = "" # Strip off Header Line with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out: markers = mfi_in.readline().strip("\n") for line in mfi_in: mfi_out.write(line) # Create output directory if not os.path.exists(output_dir): os.makedirs(output_dir) outputs = {} # Run cent_adjust for nm, flow_file in enumerate(input_files): run_command = "cent_adjust mfi.txt " + flow_file print(run_command) os.system(run_command) flow_name = os.path.split(flow_file)[1] outfile = os.path.join(output_dir, f_names[nm] + ".flowclr") outputs[outfile] = f_names[nm] with open(flow_file, "r") as flowf, open("population_id.txt", "r") as popf, open(outfile, "w") as outf: f_line = flowf.readline() f_line = f_line.rstrip() f_line = f_line + "\tPopulation\n" outf.write(f_line) for line in flowf: line = line.rstrip() pop_line = popf.readline() pop_line = pop_line.rstrip() line = line + "\t" + pop_line + "\n" outf.write(line) get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc) return def generate_CS_stats(mfi_stats, all_stats): df = pd.read_table(mfi_stats) means = df.groupby('Population').mean().round(decimals=2) medians = df.groupby('Population').median().round(decimals=2) stdev = df.groupby('Population').std().round(decimals=2) all_markers = [] with open(mfi_stats, "r") as ms: ms_fl = ms.readline().strip() all_markers = ms_fl.split("\t")[0:-2] with open(all_stats, "w") as mstats: hdgs = ["\t".join(["_".join([mrs, "mean"]), "_".join([mrs, "median"]), "_".join([mrs, "stdev"])]) for mrs in all_markers] mstats.write("Population\t") mstats.write("\t".join(hdgs) + "\n") for pops in set(df.Population): tmp_line = [] for mar in all_markers: tmp_line.append("\t".join([str(means.loc[pops, mar]), str(medians.loc[pops, mar]), str(stdev.loc[pops, mar])])) mstats.write(str(pops) + "\t") mstats.write("\t".join(tmp_line) + "\n") if __name__ == "__main__": parser = ArgumentParser( prog="runCrossSample", description="Run CrossSample on Flow file") parser.add_argument( '-i', dest="input_files", required=True, action='append', help="File locations for flow text files.") parser.add_argument( '-n', dest="filenames", required=True, action='append', help="Filenames") parser.add_argument( '-m', dest="mfi", required=True, help="File location for the MFI text file.") parser.add_argument( '-o', dest="out_path", required=True, help="Path to the directory for the output files.") parser.add_argument( '-M', dest="mfi_calc", required=True, help="what to calculate for centroids.") parser.add_argument( '-s', dest="sstat", required=True, help="File location for the summary statistics.") parser.add_argument( '-S', dest="mfi_stat", required=True, help="File location for the MFI summary statistics.") parser.add_argument( '-a', dest="all_stats", required=True, help="File location for stats on all markers.") args = parser.parse_args() input_files = [f for f in args.input_files] input_names = [n for n in args.filenames] compare_MFIs(input_files, input_names, args.mfi) run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.mfi_calc) generate_CS_stats(args.mfi_stat, args.all_stats)