Mercurial > repos > abims-sbr > mutcount
comparison scripts/S02b_extreme_2states.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:28:50 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc3674e515b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #coding: utf-8 | |
| 3 #Author : Eric Fontanillas (2010) - Victor Mataigne (2018) | |
| 4 | |
| 5 import pandas as pd | |
| 6 import argparse, os | |
| 7 | |
| 8 def loop_on_elems(list_of_elems, path_in, path_out, sps_group_1, sps_group_2, colnames): | |
| 9 | |
| 10 # sub-routine | |
| 11 | |
| 12 def tableu(fileu, sps_group_1, sps_group_2): | |
| 13 """ a | |
| 14 | |
| 15 Args : | |
| 16 fileu : input file with counts of AAs / AAs types per orthogorup | |
| 17 sps_group_1 : species for condition 1 (ex : hot water species) | |
| 18 sps_group_2 : species for condition 2 (ex : cold water species) | |
| 19 | |
| 20 Returns : | |
| 21 greater_dict : | |
| 22 lower_dict : | |
| 23 | |
| 24 """ | |
| 25 df = pd.read_csv(fileu, sep=',', index_col=0, header=0) | |
| 26 # species = list(df) #columns names = species names | |
| 27 | |
| 28 # initialize counts | |
| 29 greater_dict = {} | |
| 30 lower_dict = {} | |
| 31 | |
| 32 for specie in sps_group_1+sps_group_2: | |
| 33 greater_dict[specie] = 0 | |
| 34 lower_dict[specie] = 0 | |
| 35 | |
| 36 #nb_trials = 0 | |
| 37 for (index, row) in df.iterrows(): | |
| 38 # min and max counts for each condition | |
| 39 if not df.loc[index, sps_group_1+sps_group_2].isnull().values.any() : | |
| 40 #nb_trials += 1 | |
| 41 | |
| 42 max_cat1 = max(df.loc[index, sps_group_1]) # species in category 1 (ex : hots) | |
| 43 min_cat1 = min(df.loc[index, sps_group_1]) | |
| 44 max_cat2 = max(df.loc[index, sps_group_2]) # species in category 2 (ex : colds) | |
| 45 min_cat2 = min(df.loc[index, sps_group_2]) | |
| 46 | |
| 47 for specie in sps_group_1: | |
| 48 if df.loc[index, specie] > max_cat2 : | |
| 49 greater_dict[specie] += 1 | |
| 50 elif df.loc[index, specie] < min_cat2 : | |
| 51 lower_dict[specie] += 1 | |
| 52 | |
| 53 for specie in sps_group_2: | |
| 54 if df.loc[index, specie] > max_cat1 : | |
| 55 greater_dict[specie] += 1 | |
| 56 elif df.loc[index, specie] < min_cat1 : | |
| 57 lower_dict[specie] += 1 | |
| 58 | |
| 59 return greater_dict, lower_dict#, nb_trials | |
| 60 | |
| 61 # Function ------------------------------------------------------ | |
| 62 | |
| 63 for variable in list_of_elems: | |
| 64 print 'Processing : {} ...'.format(variable) | |
| 65 file_in = "{}/{}.csv".format(path_in, variable) | |
| 66 file_out = open('{}/{}.csv'.format(path_out,variable), 'w') | |
| 67 | |
| 68 # Compute succeses and fails on each variable | |
| 69 greater_dict, lower_dict = tableu(file_in, sps_group_1, sps_group_2) | |
| 70 | |
| 71 # totals and diffs | |
| 72 diff_dict = {} | |
| 73 total_dict = {} | |
| 74 for key in greater_dict.keys(): | |
| 75 diff_dict[key] = greater_dict[key] - lower_dict[key] | |
| 76 total_dict[key] = greater_dict[key] + lower_dict[key] | |
| 77 #total_dict[key] = number_trials | |
| 78 | |
| 79 # results frame | |
| 80 df = pd.DataFrame([greater_dict, lower_dict, diff_dict, total_dict]) | |
| 81 df = df.rename({0:'Greater',1:'Lower',2:'Difference',3:'Trial_Number'}) #, axis='index' if pandas 0.15 | |
| 82 df = df.rename(index=str, columns=colnames) | |
| 83 | |
| 84 df.to_csv("{}/{}.csv".format(path_out, variable), sep=",", encoding="utf-8") | |
| 85 | |
| 86 def main(): | |
| 87 parser = argparse.ArgumentParser() | |
| 88 parser.add_argument("sps_group_1", help="List of species separated by commas") | |
| 89 parser.add_argument("sps_group_2", help="List of species separated by commas") | |
| 90 parser.add_argument("format", choices=['nucleic', 'proteic'], help="input files format") | |
| 91 args = parser.parse_args() | |
| 92 | |
| 93 # used only if format = nucleic | |
| 94 LN =['A','C','T','G'] | |
| 95 Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000'] | |
| 96 | |
| 97 # used only if format = proteic | |
| 98 LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] | |
| 99 LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP', | |
| 100 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC', | |
| 101 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM'] | |
| 102 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] | |
| 103 | |
| 104 # inputs and outputs paths | |
| 105 if args.format == 'nucleic': | |
| 106 input_path_elem = '02_tables_per_nucleotide' | |
| 107 input_path_var = '02_tables_per_nuc_variable' | |
| 108 out_path_elem = '03_tables_counts_signTest_nucleotides' | |
| 109 out_path_var = '03_tables_counts_signTest_nuc_variables' | |
| 110 elif args.format == 'proteic': | |
| 111 input_path_elem = '02_tables_per_aa' | |
| 112 input_path_var = '02_tables_per_aa_variable' | |
| 113 out_path_elem = '03_tables_counts_signTest_aa' | |
| 114 out_path_var = '03_tables_counts_signTest_aa_variables' | |
| 115 | |
| 116 os.mkdir(out_path_elem) | |
| 117 os.mkdir(out_path_var) | |
| 118 | |
| 119 sps_group_1 = args.sps_group_1.split(',') | |
| 120 sps_group_2 = args.sps_group_2.split(',') | |
| 121 | |
| 122 # Prepare colnames for final frames | |
| 123 colnames = {} | |
| 124 # for specie in sps_group_1: | |
| 125 # colnames[specie] = '{}_vs_condition_1'.format(specie) | |
| 126 # for specie in sps_group_2: | |
| 127 # colnames[specie] = '{}_vs_condition_2'.format(specie) | |
| 128 | |
| 129 for specie in sps_group_1: | |
| 130 colnames[specie] = '{}_vs_{}'.format(specie, args.sps_group_2.replace(',','')) | |
| 131 for specie in sps_group_2: | |
| 132 colnames[specie] = '{}_vs_{}'.format(specie, args.sps_group_1.replace(',','')) | |
| 133 | |
| 134 # Building tables | |
| 135 if args.format == 'nucleic': | |
| 136 loop_on_elems(LN, input_path_elem, out_path_elem, sps_group_1, sps_group_2, colnames) | |
| 137 loop_on_elems(Lratios, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames) | |
| 138 elif args.format == 'proteic': | |
| 139 loop_on_elems(LAA, input_path_elem, out_path_elem, sps_group_1, sps_group_2, colnames) | |
| 140 loop_on_elems(LV, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames) | |
| 141 loop_on_elems(LS, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames) | |
| 142 | |
| 143 # Final R script launching sign test | |
| 144 print 'Processing : binomial sign tests ...' | |
| 145 | |
| 146 if args.format == 'nucleic': | |
| 147 final_output_elem = '04_outputs_nucleotides' | |
| 148 final_output_var = '04_outputs_nuc_variables' | |
| 149 elif args.format == 'proteic': | |
| 150 final_output_elem = '04_outputs_aa' | |
| 151 final_output_var = '04_outputs_aa_variables' | |
| 152 | |
| 153 os.mkdir(final_output_elem) | |
| 154 os.mkdir(final_output_var) | |
| 155 os.system('Rscript S03b_sign_test_binomial.R --indir %s --outdir %s' %(out_path_elem, final_output_elem)) | |
| 156 os.system('Rscript S03b_sign_test_binomial.R --indir %s --outdir %s' %(out_path_var, final_output_var)) | |
| 157 | |
| 158 if __name__ == '__main__': | |
| 159 main() |
