Mercurial > repos > jay > pdaug_addclasslabel
comparison PDAUG_Fishers_Plot/PDAUG_Fishers_Plot.py @ 0:2df11ea23f10 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
| author | jay |
|---|---|
| date | Wed, 28 Oct 2020 02:36:27 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2df11ea23f10 |
|---|---|
| 1 import matplotlib | |
| 2 matplotlib.use('Agg') | |
| 3 import os | |
| 4 import sys | |
| 5 sys.path.insert(0, os.path.abspath('..')) | |
| 6 | |
| 7 import quantiprot | |
| 8 from quantiprot.utils.io import load_fasta_file | |
| 9 from quantiprot.utils.feature import Feature, FeatureSet | |
| 10 from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy | |
| 11 from quantiprot.metrics.basic import average | |
| 12 | |
| 13 from matplotlib import pyplot as plt | |
| 14 | |
| 15 | |
| 16 from math import log10, floor | |
| 17 import numpy as np | |
| 18 from matplotlib import pyplot as plt | |
| 19 from scipy.stats import fisher_exact | |
| 20 from quantiprot.utils.sequence import SequenceSet, compact | |
| 21 | |
| 22 | |
| 23 def _count_frame(data, frame_range, num_bins): | |
| 24 """ | |
| 25 Count instances in a 2D frame | |
| 26 | |
| 27 The function discretizes the feature space into a grid of cells. | |
| 28 Then it counts the number of instances that fall into each cell. | |
| 29 An efficient method for counting instances is used. It performs parallel | |
| 30 logical comparisons of data instances to vectors that hold information on | |
| 31 grid lines. | |
| 32 | |
| 33 Args: | |
| 34 data (numpy.matrix): a Nx2 data matrix | |
| 35 frame_range (numpy.matrix): a 2x2 matrix which defines feature ranges | |
| 36 num_bins (list): a pair defining the resolution of the 2D grid | |
| 37 Returns: | |
| 38 cell_counts (numpy.matrix): a matrix holding counts of instances in | |
| 39 each grid cell | |
| 40 bin_ranges (tuple): a pair of numpy matrices holding information on | |
| 41 bin(grid_cell) ranges | |
| 42 """ | |
| 43 grid_x = np.linspace(start=frame_range[0, 0], stop=frame_range[1, 0],\ | |
| 44 num=num_bins[0]+1, endpoint=True) | |
| 45 grid_y = np.linspace(start=frame_range[0, 1], stop=frame_range[1, 1],\ | |
| 46 num=num_bins[1]+1, endpoint=True) | |
| 47 # copy because we add ones in the next lines | |
| 48 bin_ranges = (np.copy(grid_x), np.copy(grid_y)) | |
| 49 | |
| 50 | |
| 51 #Count points in each grid cell | |
| 52 grid_x[-1] += 1 # the last cell has to contain data at the border | |
| 53 grid_y[-1] += 1 # the last cell has to contain data at the border | |
| 54 | |
| 55 gte_x = np.matrix(data[:, 0] >= grid_x, dtype='float64') | |
| 56 lt_x = np.matrix(data[:, 0] < grid_x, dtype='float64') | |
| 57 gte_y = np.matrix(data[:, 1] >= grid_y, dtype='float64') | |
| 58 lt_y = np.matrix(data[:, 1] < grid_y, dtype='float64') | |
| 59 | |
| 60 dif_x = gte_x - lt_x | |
| 61 dif_y = gte_y - lt_y | |
| 62 | |
| 63 bins_x = dif_x.argmin(axis=1) - 1 | |
| 64 bins_y = dif_y.argmin(axis=1) - 1 | |
| 65 | |
| 66 coords = np.concatenate((bins_x, bins_y), axis=1) | |
| 67 | |
| 68 cell_counts = np.zeros(shape=(len(grid_x)-1, len(grid_y)-1)) | |
| 69 | |
| 70 for i in range(coords.shape[0]): | |
| 71 cell_counts[coords[i, 0], coords[i, 1]] += 1 | |
| 72 | |
| 73 return cell_counts, bin_ranges | |
| 74 | |
| 75 | |
| 76 def local_fisher_2d(set1, set2, features=None, \ | |
| 77 windows_per_frame=10, overlap_factor=1, frame_range=None): | |
| 78 """ | |
| 79 Compare local and global distribution of samples from two populations | |
| 80 in the 2d feature space using the Fisher's exact test. | |
| 81 | |
| 82 The function performs the Fisher Exact Test for comparing local and global | |
| 83 ratia of instance counts from two different populations. It uses the | |
| 84 '_count_frame' function to discretize the feature space and get instance | |
| 85 counts. Then it scans the 2d feature space with a sliding window and | |
| 86 performs the Fisher Exact test. | |
| 87 | |
| 88 Args: | |
| 89 set1 (SequenceSet or numpy.matrix): the first set with at least | |
| 90 2 sequence features. | |
| 91 set2 (SequenceSet or numpy.matrix): the second set with at least | |
| 92 2 sequence features. | |
| 93 features (tuple or list): strings with feature names for running | |
| 94 the 2d Fisher test. If None then the first two features are | |
| 95 used. Relevant only if 'set1' or 'set2' are SequenceSets. | |
| 96 windows_per_frame (int): ratio between the whole feature space and | |
| 97 the sliding window (default 10). | |
| 98 overlap_factor (int):ratio between the size of a sliding window | |
| 99 and a discretization grid cell (default 1). | |
| 100 frame_range(numpy.matrix): 2x2 matrix with range of features | |
| 101 in both dimensions. | |
| 102 | |
| 103 Returns final_res (dict): a dictionary including: | |
| 104 'odds_ratio' (numpy.matrix): a matrix of odds_ratios obtained | |
| 105 in each sliding window position. | |
| 106 'p_value' (numpy.matrix): a matrix containing Fisher test outcome | |
| 107 pvalues in each sliding window position. | |
| 108 'w_counts1' (numpy.matrix): a matrix with first population instance | |
| 109 counts in each sliding window position. | |
| 110 'w_counts2' (numpy.matrix): a matrix with second population instance | |
| 111 counts in each sliding window position. | |
| 112 'w_center_x' (numpy.matrix): matrix containing coordinates of window | |
| 113 centers in the X dimension. | |
| 114 'w_center_y' (numpy.matrix): matrix containing coordinates of window | |
| 115 centers in the Y dimension. | |
| 116 '_bin_ranges_x' (numpy.matrix): matrix containing bin(grid_cell) | |
| 117 ranges in the X dimension. | |
| 118 '_bin_ranges_y' (numpy.matrix): matrix containing bin(grid_cell) | |
| 119 ranges in the Y dimension. | |
| 120 """ | |
| 121 | |
| 122 if isinstance(set1, SequenceSet): | |
| 123 mat1 = np.transpose(np.matrix(compact(set1, | |
| 124 features=features).columns())) | |
| 125 if isinstance(set2, SequenceSet): | |
| 126 mat2 = np.transpose(np.matrix(compact(set2, | |
| 127 features=features).columns())) | |
| 128 | |
| 129 #Deal with window_per_frame and overlap_factor | |
| 130 #given either as a scalar or as a list-like | |
| 131 if not hasattr(windows_per_frame, "__len__"): | |
| 132 w_per_frame = (windows_per_frame, windows_per_frame) | |
| 133 else: | |
| 134 w_per_frame = (windows_per_frame[0], windows_per_frame[1]) | |
| 135 | |
| 136 if not hasattr(overlap_factor, "__len__"): | |
| 137 w_size = (overlap_factor, overlap_factor) | |
| 138 else: | |
| 139 w_size = (overlap_factor[0], overlap_factor[1]) | |
| 140 | |
| 141 num_bins = (w_per_frame[0]*w_size[0], w_per_frame[1]*w_size[1]) | |
| 142 | |
| 143 if frame_range is None: | |
| 144 #Evaluate the range of features in both populations. | |
| 145 | |
| 146 frame_range = np.concatenate((np.minimum(mat1.min(0), mat2.min(0)),\ | |
| 147 np.maximum(mat1.max(0), mat2.max(0)))) | |
| 148 | |
| 149 margin_x = (frame_range[1, 0] - frame_range[0, 0])/w_per_frame[0] | |
| 150 margin_y = (frame_range[1, 1] - frame_range[0, 1])/w_per_frame[1] | |
| 151 | |
| 152 frame_range[0, 0] -= margin_x | |
| 153 frame_range[1, 0] += margin_x | |
| 154 | |
| 155 frame_range[0, 1] -= margin_y | |
| 156 frame_range[1, 1] += margin_y | |
| 157 | |
| 158 #Discretize feature space into NxM grid, | |
| 159 #where N = w_per_frame[0]*w_size[0]. | |
| 160 # M = w_per_frame[1]*w_size[1]. | |
| 161 #count instances of population1 and population2 in each grid cell. | |
| 162 #both bin ranges are always the same because the frame range is common. | |
| 163 cell_counts1, bin_ranges = _count_frame(mat1, frame_range=frame_range,\ | |
| 164 num_bins=num_bins) | |
| 165 cell_counts2, _ = _count_frame(mat2, frame_range=frame_range,\ | |
| 166 num_bins=num_bins) | |
| 167 | |
| 168 #Number of windows that fit in a single row/column of a frame | |
| 169 w_number = (cell_counts1.shape[0]-w_size[0]+1, | |
| 170 cell_counts1.shape[1]-w_size[1]+1) | |
| 171 | |
| 172 #Initialize matrices holding counts at scanning window positions. | |
| 173 window_counts1 = np.zeros(shape=w_number) | |
| 174 window_counts2 = np.zeros(shape=w_number) | |
| 175 | |
| 176 #Initialize matrices holding window coordinates | |
| 177 window_center_x = np.zeros(shape=w_number[0]) | |
| 178 window_center_y = np.zeros(shape=w_number[1]) | |
| 179 | |
| 180 #Initialize matrices holding Fisher Exact test results | |
| 181 fisher_pv = np.ones(shape=w_number) | |
| 182 odds_ratio = np.ones(shape=w_number) | |
| 183 | |
| 184 #Calculate population totals in the whole feature space | |
| 185 all1 = cell_counts1.sum() | |
| 186 all2 = cell_counts2.sum() | |
| 187 | |
| 188 #Calculate window centers | |
| 189 for start_x in range(0, w_number[0]): | |
| 190 window_center_x[start_x] = (bin_ranges[0][start_x]+ \ | |
| 191 bin_ranges[0][start_x+w_size[0]])/2 | |
| 192 for start_y in range(0, w_number[1]): | |
| 193 window_center_y[start_y] = (bin_ranges[1][start_y]+ \ | |
| 194 bin_ranges[1][start_y+w_size[1]])/2 | |
| 195 | |
| 196 #Scan the feature space with a step of 1 cell. | |
| 197 for start_x in range(0, w_number[0]): | |
| 198 | |
| 199 for start_y in range(0, w_number[1]): | |
| 200 #Count instances of each population in the window | |
| 201 window_counts1[start_x, start_y] = \ | |
| 202 cell_counts1[start_x:(start_x+w_size[0]), \ | |
| 203 start_y:(start_y+w_size[1])].sum() | |
| 204 window_counts2[start_x, start_y] = \ | |
| 205 cell_counts2[start_x:(start_x+w_size[0]), \ | |
| 206 start_y:(start_y+w_size[1])].sum() | |
| 207 #Perform the Fisher Exact Test against | |
| 208 #h0: population ratio in the window the same as in the whole space. | |
| 209 odds_ratio[start_x, start_y], fisher_pv[start_x, start_y] =\ | |
| 210 fisher_exact([[all1, window_counts1[start_x, start_y]],\ | |
| 211 [all2, window_counts2[start_x, start_y]]]) | |
| 212 | |
| 213 fisher_res = {'p_value':fisher_pv, 'odds_ratio':odds_ratio,\ | |
| 214 'w_counts1':window_counts1, 'w_counts2':window_counts2,\ | |
| 215 'w_center_x':window_center_x, 'w_center_y':window_center_y,\ | |
| 216 '_bin_ranges_x':bin_ranges[0], '_bin_ranges_y':bin_ranges[1]} | |
| 217 | |
| 218 return fisher_res | |
| 219 | |
| 220 | |
| 221 def _plot_local_fisher_2d(fisher_res, xlabel="feat_1", ylabel="feat_2", | |
| 222 pop1_label="pop_1", pop2_label="pop_2", out_file_path=None, fig_width=8, fig_hight=8, fig_hspace=0.35, fig_wspace=0.25): | |
| 223 """ | |
| 224 Plot results of the local Fisher's extact test in the 2d space. | |
| 225 | |
| 226 Args: | |
| 227 fisher_res (dict): output from 'fisher_local_2d'. | |
| 228 xlabel (str): name of the 1st feature to appear in the plots | |
| 229 (default: "feat_1") | |
| 230 ylabel (str): name of the 2nd feature to appear in the plots | |
| 231 (default: "feat_2") | |
| 232 pop1_label (str): name of the 1st population to appear in the plots | |
| 233 (default: "pop_1") | |
| 234 pop2_label (str): name of the 2nd population to appear in the plots | |
| 235 (default: "pop_2") | |
| 236 """ | |
| 237 fisher_or = fisher_res["odds_ratio"] | |
| 238 fisher_c1 = fisher_res["w_counts1"] | |
| 239 fisher_c2 = fisher_res["w_counts2"] | |
| 240 fisher_pv = fisher_res["p_value"] | |
| 241 | |
| 242 for pos_x in range(len(fisher_or)): | |
| 243 for pos_y in range(len(fisher_or[0])): | |
| 244 if fisher_c1[pos_x][pos_y] == 0 and fisher_c2[pos_x][pos_y] == 0: | |
| 245 fisher_or[pos_x][pos_y] = np.nan | |
| 246 elif fisher_c1[pos_x][pos_y] == 0: | |
| 247 fisher_or[pos_x][pos_y] = np.inf | |
| 248 elif fisher_c2[pos_x][pos_y] == 0: | |
| 249 fisher_or[pos_x][pos_y] = -np.inf | |
| 250 elif fisher_or[pos_x][pos_y] < 1: | |
| 251 fisher_or[pos_x][pos_y] = -1.0/fisher_or[pos_x][pos_y] | |
| 252 | |
| 253 vmax_abs = np.nanmax(np.abs([x for x in np.array(fisher_or).flatten() | |
| 254 if x > -np.inf and x < np.inf])) | |
| 255 | |
| 256 for pos_x in range(len(fisher_or)): | |
| 257 for pos_y in range(len(fisher_or[0])): | |
| 258 if abs(fisher_or[pos_x][pos_y]) == np.inf: | |
| 259 fisher_or[pos_x][pos_y] = np.sign(fisher_or[pos_x][pos_y])*vmax_abs | |
| 260 | |
| 261 ##### Extra Fig perimeters added ################################ | |
| 262 plt.figure(figsize=(fig_width, fig_hight)) # Figure size | |
| 263 plt.subplots_adjust(hspace = fig_hspace, wspace = fig_wspace) # space between the subplots. | |
| 264 ################################################################## | |
| 265 | |
| 266 plt.subplot(221) | |
| 267 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
| 268 np.ma.masked_invalid(fisher_c1).T, cmap="Reds") | |
| 269 plt.colorbar() | |
| 270 plt.xlabel(xlabel) | |
| 271 plt.ylabel(ylabel) | |
| 272 plt.title("Counts "+pop1_label) | |
| 273 | |
| 274 plt.subplot(222) | |
| 275 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
| 276 np.ma.masked_invalid(fisher_c2).T, cmap="Reds") | |
| 277 plt.colorbar() | |
| 278 plt.xlabel(xlabel) | |
| 279 plt.ylabel(ylabel) | |
| 280 plt.title("Counts "+pop2_label) | |
| 281 | |
| 282 cmap = plt.get_cmap('RdBu') | |
| 283 cmap.set_bad(color='k', alpha=1.) | |
| 284 | |
| 285 cbar_lo = 1.0/vmax_abs | |
| 286 cbar_lo_places = max(0, -floor(log10(cbar_lo))+1) | |
| 287 cbar_hi = vmax_abs | |
| 288 cbar_hi_places = max(0, -floor(log10(cbar_hi))+1) | |
| 289 | |
| 290 plt.subplot(223) | |
| 291 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
| 292 np.ma.masked_invalid(fisher_or).T, cmap=cmap, | |
| 293 vmin=-vmax_abs, vmax=vmax_abs) | |
| 294 cbar = plt.colorbar(ticks=([-vmax_abs, 0, vmax_abs])) | |
| 295 cbar.ax.set_yticklabels(['< '+str(round(cbar_lo, int(cbar_lo_places))), '1', | |
| 296 '> '+str(round(cbar_hi, int(cbar_hi_places)))]) | |
| 297 plt.xlabel(xlabel) | |
| 298 plt.ylabel(ylabel) | |
| 299 plt.title("Odds ratio") | |
| 300 | |
| 301 plt.subplot(224) | |
| 302 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
| 303 np.log10(np.ma.masked_invalid(fisher_pv)).T, cmap="RdGy") | |
| 304 plt.colorbar() | |
| 305 plt.xlabel(xlabel) | |
| 306 plt.ylabel(ylabel) | |
| 307 plt.title("Fisher test\np-value (logarithm of 10)") | |
| 308 | |
| 309 #Savefig function added with preserving default behavior | |
| 310 | |
| 311 if out_file_path==None: | |
| 312 plt.show() | |
| 313 else: | |
| 314 plt.savefig(out_file_path,dpi=300) | |
| 315 | |
| 316 | |
| 317 def HTML_Gen(html): | |
| 318 | |
| 319 out_html = open(html,'w') | |
| 320 part_1 = """ | |
| 321 | |
| 322 <!DOCTYPE html> | |
| 323 <html lang="en"> | |
| 324 <head> | |
| 325 <title>Bootstrap Example</title> | |
| 326 <meta charset="utf-8"> | |
| 327 <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| 328 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css"> | |
| 329 <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script> | |
| 330 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script> | |
| 331 <body> | |
| 332 <style> | |
| 333 div.container_1 { | |
| 334 width:600px; | |
| 335 margin: auto; | |
| 336 padding-right: 10; | |
| 337 } | |
| 338 div.table { | |
| 339 width:600px; | |
| 340 margin: auto; | |
| 341 padding-right: 10; | |
| 342 } | |
| 343 </style> | |
| 344 </head> | |
| 345 <div class="jumbotron text-center"> | |
| 346 <h1> Fisher's Plot </h1> | |
| 347 </div> | |
| 348 <div class="container"> | |
| 349 <div class="row"> | |
| 350 <div class="col-sm-4"> | |
| 351 <img src="1.png" alt="Smiley face" height="800" width="800"> | |
| 352 </div> | |
| 353 | |
| 354 </div> | |
| 355 </div> | |
| 356 </body> | |
| 357 </html> | |
| 358 """ | |
| 359 out_html.write(part_1) | |
| 360 out_html.close() | |
| 361 # Load sets of amyloidogenic and non-amyloidogenic peptides: | |
| 362 | |
| 363 def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel, pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath): | |
| 364 | |
| 365 if not os.path.exists(htmlOutDir): | |
| 366 os.makedirs(htmlOutDir) | |
| 367 | |
| 368 amyload_pos_seq = load_fasta_file(Fasta1) | |
| 369 amyload_neg_seq = load_fasta_file(Fasta2) | |
| 370 | |
| 371 # Calculate quantitive features: volume and hydropathy | |
| 372 mean_volume = Feature(get_aa2volume()).then(average) | |
| 373 mean_hydropathy = Feature(get_aa2hydropathy()).then(average) | |
| 374 | |
| 375 fs = FeatureSet("volume'n'hydropathy") | |
| 376 fs.add(mean_volume) | |
| 377 fs.add(mean_hydropathy) | |
| 378 | |
| 379 amyload_pos_conv_seq = fs(amyload_pos_seq) | |
| 380 amyload_neg_conv_seq = fs(amyload_neg_seq) | |
| 381 | |
| 382 # Do local Fisher: | |
| 383 result = local_fisher_2d(amyload_pos_conv_seq, amyload_neg_conv_seq, | |
| 384 windows_per_frame=int(windows_per_frame), overlap_factor=int(overlap_factor)) | |
| 385 | |
| 386 # Plot local Fisher: | |
| 387 _plot_local_fisher_2d(result, xlabel=xlabel, | |
| 388 ylabel=ylabel, | |
| 389 pop1_label=pop1_label, | |
| 390 pop2_label=pop2_label, | |
| 391 out_file_path =os.path.join(os.getcwd(), "out.png") | |
| 392 ) | |
| 393 | |
| 394 | |
| 395 # plt.savefig(os.path.join(Workdirpath, htmlOutDir, "1.png")) | |
| 396 | |
| 397 HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname)) | |
| 398 | |
| 399 if __name__=="__main__": | |
| 400 | |
| 401 | |
| 402 import argparse | |
| 403 | |
| 404 parser = argparse.ArgumentParser() | |
| 405 | |
| 406 parser.add_argument("-f1", "--Fasta1", required=True, default=None, help="First fasta file ") | |
| 407 parser.add_argument("-f2", "--Fasta2", required=True, default=None, help="Second fasta file") | |
| 408 parser.add_argument("-o", "--overlap_factor", required=False, default=5, help="Overlap factor") | |
| 409 parser.add_argument("-w", "--windows_per_frame", required=False, default=5, help="Windows per frame") | |
| 410 parser.add_argument("-x", "--xlabel", required=True, default=None, help="X label") | |
| 411 parser.add_argument("-y", "--ylabel", required=True, default=None, help="Y label") | |
| 412 parser.add_argument("-p1", "--pop1_label", required=True, default=None, help="First population label") | |
| 413 parser.add_argument("-p2", "--pop2_label", required=True, default=None, help="Second population label") | |
| 414 parser.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="Path to html dir") | |
| 415 parser.add_argument("--htmlFname", required=False, help="html output file", default="report.html") | |
| 416 parser.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Path to output Working Directory") | |
| 417 args = parser.parse_args() | |
| 418 | |
| 419 run(args.Fasta1, args.Fasta2, args.windows_per_frame, args.overlap_factor, args.xlabel, args.ylabel, args.pop1_label, args.pop2_label, args.htmlOutDir, args.htmlFname, args.Workdirpath) | |
| 420 |
