0
|
1 ######################################################################
|
|
2 # Copyright (c) 2016 Northrop Grumman.
|
|
3 # All rights reserved.
|
|
4 ######################################################################
|
|
5 from __future__ import print_function
|
|
6 import sys
|
|
7 import pandas as pd
|
|
8 from scipy.stats import gmean
|
|
9 from argparse import ArgumentParser
|
|
10
|
|
11
|
|
def _attach_population_counts(per_population, population_all):
    """Join a per-population statistics table with the count/percentage table.

    Both frames are indexed by population label. The result additionally
    carries the label as a regular 'Population' column.
    """
    merged = pd.merge(per_population, population_all,
                      left_index=True, right_index=True)
    merged['Population'] = merged.index
    return merged


def _downsample(fcs, pop_col, quotas):
    """Keep the first events of each population, up to its quota.

    The first event of every population is always kept, even when its
    rounded quota is 0, so no population disappears from the sample.
    Rows keep their original relative order and their column dtypes.
    """
    # 0-based position of each event within its own population.
    rank = fcs.groupby(pop_col).cumcount()
    # Quota of the population each event belongs to.
    quota = fcs[pop_col].map(quotas)
    keep = (rank == 0) | (rank < quota)
    return fcs[keep].reset_index(drop=True)


def gen_overview_stats(file_name, sample_size=20000):
    """Compute overview statistics for a table of events.

    The file is read with ``pandas.read_table``. The LAST column is treated
    as the population label of each event; every other column is treated as
    a marker measurement.

    Args:
        file_name: path (or anything ``pandas.read_table`` accepts) of the
            input table.
        sample_size: approximate maximum number of events to return in the
            ``'sample'`` entry. Defaults to 20000.

    Returns:
        dict with the following keys:
            'fcs'                  the full table
            'events'               number of rows
            'columns'              number of marker columns (label excluded)
            'data'                 marker columns only
            'population'           label column
            'population_freq'      events per population
            'population_sample'    per-population sampling quota (rounded)
            'population_freq_sort' 'population_freq' sorted by label
            'population_per'       percentage of events per population
            'population_per_sort'  'population_per' sorted by label
            'population_all'       'Count' / 'Percentage' table by label
            'min', 'max'           extreme values over all marker columns
            'markers'              list of marker column names
            'mfi', 'gmfi', 'mdfi'  per-population mean, geometric mean and
                                   median, rounded to 2 decimals
            'mfi_pop', 'gmfi_pop', 'mdfi_pop'
                                   the above joined with 'population_all'
                                   plus a 'Population' column
            'sample'               full table, or a down-sampled copy
            'sample_data'          marker columns of 'sample'
            'sample_population'    label column of 'sample'

    Raises:
        ZeroDivisionError: if the file contains no data rows.
    """
    fcs = pd.read_table(file_name)
    events, columns = fcs.shape

    # The label column is identified by position everywhere, so group by
    # its actual name instead of assuming it is called 'Population'.
    pop_col = fcs.columns[-1]
    data = fcs.iloc[:, :-1]
    population = fcs.iloc[:, -1]

    population_freq = population.value_counts()
    # Share of the sample budget given to each population.
    population_sample = (
        population_freq * (sample_size / float(events))).round(decimals=0)
    population_freq_sort = population_freq.sort_index()
    population_per = (
        population.value_counts(normalize=True) * 100).round(decimals=2)
    population_per_sort = population_per.sort_index()
    population_all = pd.concat(
        [population_freq_sort, population_per_sort], axis=1)
    population_all.columns = ['Count', 'Percentage']

    by_population = fcs.groupby(pop_col)
    mfi = by_population.mean().round(decimals=2)
    gmfi = by_population.agg(lambda x: gmean(list(x))).round(decimals=2)
    mdfi = by_population.median().round(decimals=2)

    flow_stats = {
        'fcs': fcs,
        'events': events,
        'columns': columns - 1,
        'data': data,
        'population': population,
        'population_freq': population_freq,
        'population_sample': population_sample,
        'population_freq_sort': population_freq_sort,
        'population_per': population_per,
        'population_per_sort': population_per_sort,
        'population_all': population_all,
        'min': data.values.min(),
        'max': data.values.max(),
        'markers': list(data.columns),
        'mfi': mfi,
        'mfi_pop': _attach_population_counts(mfi, population_all),
        'gmfi': gmfi,
        'gmfi_pop': _attach_population_counts(gmfi, population_all),
        'mdfi': mdfi,
        'mdfi_pop': _attach_population_counts(mdfi, population_all),
    }

    #
    # If the number of events does not exceed sample_size, return the
    # complete data set. Otherwise keep only the leading events of each
    # population so the sample holds roughly sample_size events.
    if events <= sample_size:
        flow_stats['sample'] = fcs
    else:
        flow_stats['sample'] = _downsample(fcs, pop_col, population_sample)

    flow_stats['sample_data'] = flow_stats['sample'].iloc[:, :-1]
    flow_stats['sample_population'] = flow_stats['sample'].iloc[:, -1]

    return flow_stats
|
|
67
|
|
68
|
|
if __name__ == '__main__':
    # Command-line entry point: compute the overview statistics for the
    # input file and dump a plain-text report to the output file.
    parser = ArgumentParser(
        prog="flowstats",
        description="Gets statistics on FLOCK run")

    parser.add_argument(
        '-i',
        dest="input_file",
        required=True,
        help="File locations for flow clr file.")

    parser.add_argument(
        '-o',
        dest="out_file",
        required=True,
        help="Path to the directory for the output file.")
    args = parser.parse_args()

    flow_stats = gen_overview_stats(args.input_file)

    # (label, flow_stats key) pairs, in report order.
    report_lines = (
        ("Events: ", 'events'),
        ("Min: ", 'min'),
        ("Max: ", 'max'),
        ("Columns: ", 'columns'),
        ("Markers: ", 'markers'),
        ("Population: ", 'population'),
        ("Population Freq: ", 'population_freq'),
        ("Population Sample: ", 'population_sample'),
        ("Population Per: ", 'population_per'),
    )

    with open(args.out_file, "w") as outf:
        # file.write() accepts exactly one string, so passing a label and a
        # value raised TypeError. print(..., file=outf) stringifies every
        # argument and ends each entry with a newline.
        for label, key in report_lines:
            print(label, flow_stats[key], file=outf)
        print("Sample Data contains ", len(flow_stats['sample']), " events",
              file=outf)
        print("MIF_POP ", flow_stats['mfi_pop'], file=outf)
    sys.exit(0)
|