Mercurial > repos > immport-devteam > cross_sample
comparison runCrossSample.py @ 4:e80b0f62ffb3 draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/cross_sample commit e7eab2dca0c1f73f580362f61425a78d4c8892ce"
author | azomics |
---|---|
date | Wed, 29 Jul 2020 13:32:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:5f670146a9af | 4:e80b0f62ffb3 |
---|---|
1 #!/usr/bin/env python | |
2 ###################################################################### | |
3 # Copyright (c) 2016 Northrop Grumman. | |
4 # All rights reserved. | |
5 ###################################################################### | |
6 import sys | |
7 import os | |
8 from scipy.stats import gmean | |
9 from argparse import ArgumentParser | |
10 from collections import defaultdict | |
11 import pandas as pd | |
12 | |
13 # | |
14 # version 1.1 -- April 2016 -- C. Thomas | |
15 # modified to read in several input files and output to a directory | |
16 # + generates summary statistics | |
17 # also checks before running that input files are consistent with centroid file | |
18 # | |
19 | |
20 | |
def compare_MFIs(input_files, f_names, mfi_file):
    """Check that every flow file's header matches the centroid file header.

    The expected header is the first line of ``mfi_file`` with its first
    tab-separated field removed.  The first line of each file in
    ``input_files`` is compared to it verbatim (line terminator included).

    Args:
        input_files: paths of the flow text files to check.
        f_names: display names, parallel to ``input_files``; only used in
            the error message.
        mfi_file: path of the centroid (MFI) file.

    Every mismatching file is reported on stderr; if at least one file
    mismatched the process then exits with status 2.
    """
    with open(mfi_file, "r") as centroid_fh:
        centroid_fields = centroid_fh.readline().split("\t")
    # Drop the leading field; the remaining fields are the marker columns.
    expected = "\t".join(centroid_fields[1:])

    mismatch_found = False
    for idx, path in enumerate(input_files):
        with open(path, "r") as flow_fh:
            found = flow_fh.readline()
        if found == expected:
            continue
        sys.stderr.write(found + "headers in " + f_names[idx] + " are not consistent with FLOCK centroid file:\n" + expected + "\n")
        mismatch_found = True

    # Exit only after all files were checked so every problem is reported.
    if mismatch_found:
        sys.exit(2)
36 | |
37 | |
def stats_MFIs(cs_df, ctr, mfi_calc):
    """Summarise one sample's events per population.

    Args:
        cs_df: DataFrame of events with a ``Population`` column; every other
            column is aggregated per population.
        ctr: sample counter, used to build the ``SampleNN`` label.
        mfi_calc: ``"mfi"`` for the mean, ``"gmfi"`` for the geometric mean;
            any other value selects the median.

    Returns:
        A DataFrame indexed by population holding the aggregated columns
        (rounded to 2 decimals) followed by ``Percentage`` (share of events
        in the population, in percent, rounded to 2 decimals),
        ``Population`` and ``SampleName``.
    """
    by_population = cs_df.groupby('Population')
    if mfi_calc == "mfi":
        centroids = by_population.mean()
    elif mfi_calc == "gmfi":
        centroids = by_population.agg(lambda values: gmean(list(values)))
    else:
        centroids = by_population.median()
    centroids = centroids.round(decimals=2)

    shares = cs_df.Population.value_counts(normalize=True) * 100
    # Assignment aligns on the population index.
    centroids['Percentage'] = shares.round(decimals=2).sort_index()
    centroids['Population'] = centroids.index
    centroids['SampleName'] = "Sample" + str(ctr).zfill(2)
    return centroids
51 | |
52 | |
def get_pop_prop(input_files, summary_stat, mfi_stats, marker_names, mfi_calc):
    """Write per-sample centroid statistics and population proportions.

    Args:
        input_files: mapping of annotated flow file path -> file name.  It is
            iterated for the paths and indexed by path for the ``FileID``
            column, so a plain list will not work here.
        summary_stat: path of the population-proportion table (overwritten).
        mfi_stats: path of the per-sample centroid table (opened in append
            mode).
        marker_names: tab-separated header line of the centroid file; its
            first field is dropped.
        mfi_calc: aggregation selector handed through to ``stats_MFIs``.
    """
    marker_fields = marker_names.strip().split("\t")
    markers = "\t".join(marker_fields[1:])

    event_counts = defaultdict(dict)
    totals = {}
    highest_pop = 0
    with open(mfi_stats, "a") as mfis:
        mfis.write("\t".join([markers, "Percentage", "Population", "SampleName"]) + "\n")
        for sample_no, path in enumerate(input_files, start=1):
            events = pd.read_table(path)
            totals[path] = len(events.index)
            # Tally how many events fall into each population of this file.
            for pop in events.Population:
                per_file = event_counts[path]
                per_file[pop] = per_file.get(pop, 0) + 1
            # Track the largest population id seen across all files.
            highest_pop = max(highest_pop, max(set(events.Population)))
            sample_stats = stats_MFIs(events, sample_no, mfi_calc)
            sample_stats.to_csv(mfis, sep="\t", header=False, index=False)

    with open(summary_stat, "w") as outf:
        pop_ids = range(1, highest_pop + 1)
        outf.write("FileID\tSampleName\t" + "\t".join(str(p) for p in pop_ids) + "\n")
        for sample_no, path in enumerate(event_counts, start=1):
            per_file = event_counts[path]
            # Populations absent from a file count as 0 %.
            props = [
                str((per_file.get(p, 0) / float(totals[path])) * 100)
                for p in pop_ids
            ]
            label = "Sample" + str(sample_no).zfill(2)
            outf.write("\t".join([input_files[path], label, "\t".join(props)]) + "\n")
93 | |
94 | |
def run_cross_sample(input_files, f_names, mfi_file, output_dir, summary_stat,
                     mfi_stats, mfi_calc):
    """Run ``cent_adjust`` on every flow file and collect the results.

    For each flow file the external ``cent_adjust`` program is invoked with
    the header-less centroid file, then the flow file is copied to
    ``<output_dir>/<name>.flowclr`` with one extra ``Population`` column
    taken line by line from ``population_id.txt`` (read from the working
    directory right after the run -- presumably written by ``cent_adjust``).
    Finally ``get_pop_prop`` is called on the annotated files.

    Args:
        input_files: paths of the flow text files.
        f_names: output base names, parallel to ``input_files``.
        mfi_file: path of the centroid (MFI) file, header line included.
        output_dir: directory for the ``.flowclr`` files; created if missing.
        summary_stat: path handed to ``get_pop_prop``.
        mfi_stats: path handed to ``get_pop_prop``.
        mfi_calc: aggregation selector handed to ``get_pop_prop``.

    Raises:
        subprocess.CalledProcessError: if ``cent_adjust`` exits non-zero.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    # Strip off the header line: mfi.txt holds the centroid rows only.
    with open(mfi_file, "r") as mfi_in, open("mfi.txt", "w") as mfi_out:
        markers = mfi_in.readline().strip("\n")
        for line in mfi_in:
            mfi_out.write(line)

    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    outputs = {}
    for nm, flow_file in enumerate(input_files):
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; check_call stops on failure instead of
        # silently merging a stale population_id.txt from a previous run.
        command = ["cent_adjust", "mfi.txt", flow_file]
        print(" ".join(command))
        subprocess.check_call(command)

        outfile = os.path.join(output_dir, f_names[nm] + ".flowclr")
        outputs[outfile] = f_names[nm]
        with open(flow_file, "r") as flowf, open("population_id.txt", "r") as popf, open(outfile, "w") as outf:
            outf.write(flowf.readline().rstrip() + "\tPopulation\n")
            for line in flowf:
                # One population id per event line, consumed in lockstep.
                pop_line = popf.readline().rstrip()
                outf.write(line.rstrip() + "\t" + pop_line + "\n")
    get_pop_prop(outputs, summary_stat, mfi_stats, markers, mfi_calc)
131 | |
132 | |
def generate_CS_stats(mfi_stats, all_stats):
    """Write mean, median and standard deviation per population and column.

    Args:
        mfi_stats: path of a tab-separated table whose last two columns are
            not aggregated (``get_pop_prop`` writes ``Population`` and
            ``SampleName`` there) and which has a ``Population`` column.
        all_stats: path of the output table (overwritten).  It has one row
            per population, in ascending population order, and three columns
            (``_mean``, ``_median``, ``_stdev``) per aggregated input column,
            all rounded to 2 decimals.
    """
    df = pd.read_table(mfi_stats)
    # Every column but the last two is summarised.
    all_markers = list(df.columns[:-2])

    # Select the columns to aggregate *before* reducing: recent pandas
    # raises on mean/median/std over the string SampleName column instead
    # of silently dropping it.
    grouped = df.groupby('Population')[all_markers]
    means = grouped.mean().round(decimals=2)
    medians = grouped.median().round(decimals=2)
    stdev = grouped.std().round(decimals=2)

    with open(all_stats, "w") as mstats:
        hdgs = [
            "\t".join([mrs + "_mean", mrs + "_median", mrs + "_stdev"])
            for mrs in all_markers
        ]
        mstats.write("Population\t" + "\t".join(hdgs) + "\n")
        # The groupby index is sorted, giving a deterministic row order.
        for pops in means.index:
            cells = []
            for mar in all_markers:
                cells.append(str(means.loc[pops, mar]))
                cells.append(str(medians.loc[pops, mar]))
                cells.append(str(stdev.loc[pops, mar]))
            mstats.write(str(pops) + "\t" + "\t".join(cells) + "\n")
153 | |
154 | |
if __name__ == "__main__":
    # Command-line entry point: validate the inputs, run cent_adjust on each
    # flow file, then summarise the per-sample statistics.
    parser = ArgumentParser(
        prog="runCrossSample",
        description="Run CrossSample on Flow file")

    parser.add_argument(
        '-i',
        dest="input_files",
        required=True,
        action='append',
        help="File locations for flow text files.")

    parser.add_argument(
        '-n',
        dest="filenames",
        required=True,
        action='append',
        help="Filenames")

    parser.add_argument(
        '-m',
        dest="mfi",
        required=True,
        help="File location for the MFI text file.")

    parser.add_argument(
        '-o',
        dest="out_path",
        required=True,
        help="Path to the directory for the output files.")

    parser.add_argument(
        '-M',
        dest="mfi_calc",
        required=True,
        help="what to calculate for centroids.")

    parser.add_argument(
        '-s',
        dest="sstat",
        required=True,
        help="File location for the summary statistics.")

    parser.add_argument(
        '-S',
        dest="mfi_stat",
        required=True,
        help="File location for the MFI summary statistics.")

    parser.add_argument(
        '-a',
        dest="all_stats",
        required=True,
        help="File location for stats on all markers.")

    args = parser.parse_args()

    input_files = list(args.input_files)
    input_names = list(args.filenames)
    # Files and names are paired by position; reject a mismatch up front
    # rather than failing with an IndexError after processing has started.
    if len(input_files) != len(input_names):
        parser.error(
            "got %d -i flow files but %d -n filenames; the counts must match"
            % (len(input_files), len(input_names)))

    compare_MFIs(input_files, input_names, args.mfi)
    run_cross_sample(input_files, input_names, args.mfi, args.out_path, args.sstat, args.mfi_stat, args.mfi_calc)
    generate_CS_stats(args.mfi_stat, args.all_stats)