Mercurial > repos > immport-devteam > flow_overview
comparison flowstatlib.py @ 1:b5453d07f740 draft default tip
"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
| author | azomics |
|---|---|
| date | Wed, 29 Jul 2020 17:03:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:8283ff163ba6 | 1:b5453d07f740 |
|---|---|
| 1 ###################################################################### | |
| 2 # Copyright (c) 2016 Northrop Grumman. | |
| 3 # All rights reserved. | |
| 4 ###################################################################### | |
| 5 import pandas as pd | |
| 6 from scipy.stats import gmean | |
| 7 from argparse import ArgumentParser | |
| 8 | |
| 9 | |
def _attach_population_counts(per_population_stat, population_all):
    """Join a per-population statistic with the Count/Percentage table.

    Both frames are indexed by population label. A 'Population' column
    holding that label is appended to the merged result.
    """
    merged = pd.merge(per_population_stat, population_all,
                      left_index=True, right_index=True)
    merged['Population'] = merged.index
    return merged


def _sample_events(fcs, population, quota):
    """Keep, in file order, the first ``max(1, quota)`` events of each population.

    Rows are taken from ``fcs.values`` so the result has a fresh 0..n-1
    index and the same dtype coercion as a row-by-row copy of the array.
    Rows whose population label is missing are always kept.
    """
    # 0-based position of every event within its own population.
    rank = population.groupby(population).cumcount()
    # The first event of a population is always kept, even when its
    # rounded quota is 0, hence the lower bound of 1.
    limit = population.map(quota).clip(lower=1)
    keep = (rank < limit) | population.isnull()
    return pd.DataFrame(fcs.values[keep.values], columns=fcs.columns)


def gen_overview_stats(file_name, sample_size=20000):
    """Compute overview statistics for a tab-delimited flow data table.

    The last column of the table is treated as the population label,
    whatever its header is; every other column is treated as a marker.

    Args:
        file_name: path (or file-like object) handed to ``pd.read_table``.
        sample_size: maximum number of events targeted for the
            down-sampled table. Each population receives a share
            proportional to its frequency, and at least one event.

    Returns:
        dict with the keys:
          'fcs', 'events', 'columns', 'data', 'population',
          'population_freq', 'population_sample', 'population_freq_sort',
          'population_per', 'population_per_sort', 'population_all',
          'min', 'max', 'markers',
          'mfi', 'gmfi', 'mdfi' (mean / geometric mean / median per
          population, rounded to 2 decimals) and their '*_pop' variants
          merged with the Count/Percentage table,
          'sample', 'sample_data', 'sample_population'.

    Raises:
        ValueError: if the table has a header but no events.
    """
    fcs = pd.read_table(file_name)
    events, columns = fcs.shape
    if events == 0:
        # Fail early with a clear message; otherwise the per-population
        # quota below would divide by zero.
        raise ValueError("No events found in input: %s" % (file_name,))

    data = fcs.iloc[:, :-1]
    population = fcs.iloc[:, -1]

    population_freq = population.value_counts()
    population_per = (population.value_counts(normalize=True) * 100).round(decimals=2)
    population_freq_sort = population_freq.sort_index()
    population_per_sort = population_per.sort_index()
    population_all = pd.concat([population_freq_sort, population_per_sort], axis=1)
    population_all.columns = ['Count', 'Percentage']

    # Group the marker columns by the positional population column so the
    # statistics do not depend on that column being named 'Population'.
    by_population = data.groupby(population)
    mfi = by_population.mean().round(decimals=2)
    gmfi = by_population.agg(lambda x: gmean(list(x))).round(decimals=2)
    mdfi = by_population.median().round(decimals=2)

    flow_stats = {}
    flow_stats['fcs'] = fcs
    flow_stats['events'] = events
    flow_stats['columns'] = columns - 1
    flow_stats['data'] = data
    flow_stats['population'] = population
    flow_stats['population_freq'] = population_freq
    # Number of events each population contributes to the sample.
    flow_stats['population_sample'] = (
        population_freq * (sample_size / float(events))).round(decimals=0)
    flow_stats['population_freq_sort'] = population_freq_sort
    flow_stats['population_per'] = population_per
    flow_stats['population_per_sort'] = population_per_sort
    flow_stats['population_all'] = population_all
    flow_stats['min'] = data.values.min()
    flow_stats['max'] = data.values.max()
    flow_stats['markers'] = list(data.columns)
    flow_stats['mfi'] = mfi
    flow_stats['mfi_pop'] = _attach_population_counts(mfi, population_all)
    flow_stats['gmfi'] = gmfi
    flow_stats['gmfi_pop'] = _attach_population_counts(gmfi, population_all)
    flow_stats['mdfi'] = mdfi
    flow_stats['mdfi_pop'] = _attach_population_counts(mdfi, population_all)

    # If the number of events does not exceed sample_size, return the
    # complete data set; otherwise down-sample it per population.
    if events <= sample_size:
        flow_stats['sample'] = fcs
    else:
        flow_stats['sample'] = _sample_events(
            fcs, population, flow_stats['population_sample'])

    flow_stats['sample_data'] = flow_stats['sample'].iloc[:, :-1]
    flow_stats['sample_population'] = flow_stats['sample'].iloc[:, -1]

    return flow_stats
| 65 | |
| 66 | |
if __name__ == '__main__':
    # Command-line entry point: compute the overview statistics for one
    # input table and write a plain-text report to the output file.
    parser = ArgumentParser(
        prog="flowstats",
        description="Gets statistics on FLOCK run")

    parser.add_argument(
        '-i',
        dest="input_file",
        required=True,
        help="File locations for flow clr file.")

    # The value is opened directly as a file below, so it is a file path,
    # not a directory.
    parser.add_argument(
        '-o',
        dest="out_file",
        required=True,
        help="Path to the output file.")
    args = parser.parse_args()

    flow_stats = gen_overview_stats(args.input_file)

    # (label, value) pairs, written one per line in this order.
    report = [
        ("Events: ", flow_stats['events']),
        ("Min: ", flow_stats['min']),
        ("Max: ", flow_stats['max']),
        ("Columns: ", flow_stats['columns']),
        ("Markers: ", flow_stats['markers']),
        ("Population: ", flow_stats['population']),
        ("Population Freq: ", flow_stats['population_freq']),
        ("Population Sample: ", flow_stats['population_sample']),
        ("Population Per: ", flow_stats['population_per']),
    ]
    with open(args.out_file, "w") as outf:
        # file.write() accepts exactly one string, so each entry is
        # formatted into a single newline-terminated line first.
        for label, value in report:
            outf.write("{}{}\n".format(label, value))
        outf.write("Sample Data contains {} events\n".format(len(flow_stats['sample'])))
        outf.write("MIF_POP {}\n".format(flow_stats['mfi_pop']))
