Mercurial > repos > jay > pdaug_merge_dataframes
diff PDAUG_Fishers_Plot/PDAUG_Fishers_Plot.py @ 0:5bb52d4bf172 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 01:54:31 +0000 |
parents | |
children |
line wrap: on
line diff
"""Plot a local Fisher's exact test comparison of two FASTA sequence sets.

Each sequence is described by two features (mean residue volume and mean
hydropathy).  The 2D feature space is scanned with a sliding window and, for
every window position, the local population ratio is tested against the
global one.  The result is drawn as a four-panel figure and a small HTML
report page is written.
"""
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('Agg')
import copy
import os
import sys
# Make packages located in the parent of the working directory importable.
sys.path.insert(0, os.path.abspath('..'))

import quantiprot
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
from quantiprot.metrics.basic import average

from math import log10, floor
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import fisher_exact
from quantiprot.utils.sequence import SequenceSet, compact


def _count_frame(data, frame_range, num_bins):
    """
    Count instances in a 2D frame

    The function discretizes the feature space into a grid of cells.
    Then it counts the number of instances that fall into each cell.
    Every instance is compared against all grid lines at once; the first
    grid line that is greater than the instance identifies its cell.

    Note: instances lying outside 'frame_range' are not rejected. Their
    computed cell index is -1, so they are counted in the last cell along
    the affected axis.

    Args:
        data (numpy.matrix): a Nx2 data matrix
        frame_range (numpy.matrix): a 2x2 matrix which defines feature ranges
        num_bins (list): a pair defining the resolution of the 2D grid
    Returns:
        cell_counts (numpy.matrix): a matrix holding counts of instances in
            each grid cell
        bin_ranges (tuple): a pair of numpy matrices holding information on
            bin(grid_cell) ranges
    """
    # The comparisons below rely on matrix semantics (data[:, 0] being an
    # Nx1 column that broadcasts against the grid vector), so enforce them.
    data = np.asmatrix(data)

    grid_x = np.linspace(start=frame_range[0, 0], stop=frame_range[1, 0],
                         num=num_bins[0]+1, endpoint=True)
    grid_y = np.linspace(start=frame_range[0, 1], stop=frame_range[1, 1],
                         num=num_bins[1]+1, endpoint=True)
    # copy because we add ones in the next lines
    bin_ranges = (np.copy(grid_x), np.copy(grid_y))

    # Count points in each grid cell
    grid_x[-1] += 1  # the last cell has to contain data at the border
    grid_y[-1] += 1  # the last cell has to contain data at the border

    gte_x = np.matrix(data[:, 0] >= grid_x, dtype='float64')
    lt_x = np.matrix(data[:, 0] < grid_x, dtype='float64')
    gte_y = np.matrix(data[:, 1] >= grid_y, dtype='float64')
    lt_y = np.matrix(data[:, 1] < grid_y, dtype='float64')

    # +1 where the grid line is <= the instance, -1 where it is greater.
    dif_x = gte_x - lt_x
    dif_y = gte_y - lt_y

    # argmin finds the first grid line above the instance; the cell index
    # is the one just before it.
    bins_x = dif_x.argmin(axis=1) - 1
    bins_y = dif_y.argmin(axis=1) - 1

    coords = np.concatenate((bins_x, bins_y), axis=1)

    cell_counts = np.zeros(shape=(len(grid_x)-1, len(grid_y)-1))

    for i in range(coords.shape[0]):
        cell_counts[coords[i, 0], coords[i, 1]] += 1

    return cell_counts, bin_ranges


def _as_feature_matrix(data, features):
    """Return 'data' as an NxF numpy.matrix (one row per instance)."""
    if isinstance(data, SequenceSet):
        return np.transpose(np.matrix(compact(data,
                                              features=features).columns()))
    # Already numeric: accept a numpy.matrix or any 2D array-like. Floats are
    # required because the frame range derived from it is padded in place.
    return np.asmatrix(data, dtype='float64')


def _as_pair(value):
    """Expand a scalar to a (value, value) pair; pass list-likes through."""
    if hasattr(value, "__len__"):
        return (value[0], value[1])
    return (value, value)


def local_fisher_2d(set1, set2, features=None, \
                    windows_per_frame=10, overlap_factor=1, frame_range=None):
    """
    Compare local and global distribution of samples from two populations
    in the 2d feature space using the Fisher's exact test.

    The function performs the Fisher Exact Test for comparing local and global
    ratios of instance counts from two different populations. It uses the
    '_count_frame' function to discretize the feature space and get instance
    counts. Then it scans the 2d feature space with a sliding window and
    performs the Fisher Exact test.

    Args:
        set1 (SequenceSet or numpy.matrix): the first set with at least
            2 sequence features.
        set2 (SequenceSet or numpy.matrix): the second set with at least
            2 sequence features.
        features (tuple or list): strings with feature names for running
            the 2d Fisher test. If None then the first two features are
            used. Relevant only if 'set1' or 'set2' are SequenceSets.
        windows_per_frame (int): ratio between the whole feature space and
            the sliding window (default 10).
        overlap_factor (int):ratio between the size of a sliding window
            and a discretization grid cell (default 1).
        frame_range(numpy.matrix): 2x2 matrix with range of features
            in both dimensions. If None, the range is taken from the data
            and padded by one window size on every side.

    Returns final_res (dict): a dictionary including:
        'odds_ratio' (numpy.matrix): a matrix of odds_ratios obtained
            in each sliding window position.
        'p_value' (numpy.matrix): a matrix containing Fisher test outcome
            pvalues in each sliding window position.
        'w_counts1' (numpy.matrix): a matrix with first population instance
            counts in each sliding window position.
        'w_counts2' (numpy.matrix): a matrix with second population instance
            counts in each sliding window position.
        'w_center_x' (numpy.matrix): matrix containing coordinates of window
            centers in the X dimension.
        'w_center_y' (numpy.matrix): matrix containing coordinates of window
            centers in the Y dimension.
        '_bin_ranges_x' (numpy.matrix): matrix containing bin(grid_cell)
            ranges in the X dimension.
        '_bin_ranges_y' (numpy.matrix): matrix containing bin(grid_cell)
            ranges in the Y dimension.
    """
    mat1 = _as_feature_matrix(set1, features)
    mat2 = _as_feature_matrix(set2, features)

    # Deal with window_per_frame and overlap_factor
    # given either as a scalar or as a list-like
    w_per_frame = _as_pair(windows_per_frame)
    w_size = _as_pair(overlap_factor)

    num_bins = (w_per_frame[0]*w_size[0], w_per_frame[1]*w_size[1])

    if frame_range is None:
        # Evaluate the range of features in both populations.
        frame_range = np.concatenate((np.minimum(mat1.min(0), mat2.min(0)),
                                      np.maximum(mat1.max(0), mat2.max(0))))

        # Pad the data range by one window on each side.
        margin_x = (frame_range[1, 0] - frame_range[0, 0])/w_per_frame[0]
        margin_y = (frame_range[1, 1] - frame_range[0, 1])/w_per_frame[1]

        frame_range[0, 0] -= margin_x
        frame_range[1, 0] += margin_x

        frame_range[0, 1] -= margin_y
        frame_range[1, 1] += margin_y

    # Discretize feature space into NxM grid,
    # where N = w_per_frame[0]*w_size[0].
    #       M = w_per_frame[1]*w_size[1].
    # count instances of population1 and population2 in each grid cell.
    # both bin ranges are always the same because the frame range is common.
    cell_counts1, bin_ranges = _count_frame(mat1, frame_range=frame_range,
                                            num_bins=num_bins)
    cell_counts2, _ = _count_frame(mat2, frame_range=frame_range,
                                   num_bins=num_bins)

    # Number of windows that fit in a single row/column of a frame
    w_number = (cell_counts1.shape[0]-w_size[0]+1,
                cell_counts1.shape[1]-w_size[1]+1)

    # Matrices holding counts at scanning window positions.
    window_counts1 = np.zeros(shape=w_number)
    window_counts2 = np.zeros(shape=w_number)

    # Matrices holding Fisher Exact test results
    fisher_pv = np.ones(shape=w_number)
    odds_ratio = np.ones(shape=w_number)

    # Population totals in the whole feature space
    all1 = cell_counts1.sum()
    all2 = cell_counts2.sum()

    # Window centers: midpoint between the first and last edge of a window.
    # There are num_bins+1 edges, so both slices have exactly w_number items.
    edges_x, edges_y = bin_ranges
    window_center_x = (edges_x[:w_number[0]] + edges_x[w_size[0]:])/2
    window_center_y = (edges_y[:w_number[1]] + edges_y[w_size[1]:])/2

    # Scan the feature space with a step of 1 cell.
    for start_x in range(w_number[0]):
        span_x = slice(start_x, start_x+w_size[0])
        for start_y in range(w_number[1]):
            span_y = slice(start_y, start_y+w_size[1])
            # Count instances of each population in the window
            in_window1 = cell_counts1[span_x, span_y].sum()
            in_window2 = cell_counts2[span_x, span_y].sum()
            window_counts1[start_x, start_y] = in_window1
            window_counts2[start_x, start_y] = in_window2
            # Perform the Fisher Exact Test against
            # h0: population ratio in the window the same as in the whole space.
            odds_ratio[start_x, start_y], fisher_pv[start_x, start_y] = \
                fisher_exact([[all1, in_window1],
                              [all2, in_window2]])

    fisher_res = {'p_value': fisher_pv, 'odds_ratio': odds_ratio,
                  'w_counts1': window_counts1, 'w_counts2': window_counts2,
                  'w_center_x': window_center_x, 'w_center_y': window_center_y,
                  '_bin_ranges_x': bin_ranges[0], '_bin_ranges_y': bin_ranges[1]}

    return fisher_res


def _label_panel(xlabel, ylabel, title):
    """Set axis labels and title on the current subplot."""
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)


def _plot_local_fisher_2d(fisher_res, xlabel="feat_1", ylabel="feat_2",
                          pop1_label="pop_1", pop2_label="pop_2", out_file_path=None, fig_width=8, fig_hight=8, fig_hspace=0.35, fig_wspace=0.25):
    """
    Plot results of the local Fisher's exact test in the 2d space.

    Four panels are drawn: window counts of each population, the odds ratio
    and the base-10 logarithm of the p-value.

    Args:
        fisher_res (dict): output from 'local_fisher_2d'. It is not modified.
        xlabel (str): name of the 1st feature to appear in the plots
            (default: "feat_1")
        ylabel (str): name of the 2nd feature to appear in the plots
            (default: "feat_2")
        pop1_label (str): name of the 1st population to appear in the plots
            (default: "pop_1")
        pop2_label (str): name of the 2nd population to appear in the plots
            (default: "pop_2")
        out_file_path (str): where to save the figure (at 300 dpi). If None
            the figure is shown with 'plt.show()' instead.
        fig_width (float): figure width passed to 'figsize' (default 8).
        fig_hight (float): figure height passed to 'figsize' (default 8).
        fig_hspace (float): vertical space between subplots.
        fig_wspace (float): horizontal space between subplots.
    """
    # Work on a copy so the caller's result dictionary stays untouched.
    fisher_or = np.array(fisher_res["odds_ratio"], dtype='float64')
    fisher_c1 = np.asarray(fisher_res["w_counts1"])
    fisher_c2 = np.asarray(fisher_res["w_counts2"])
    fisher_pv = fisher_res["p_value"]

    # Map odds ratios onto a scale that is symmetric around "no difference":
    # ratios below 1 become -1/ratio; windows empty for one population are
    # +/- infinity; windows empty for both are undefined (NaN).
    # Later assignments take precedence over earlier ones.
    c1_empty = fisher_c1 == 0
    c2_empty = fisher_c2 == 0
    invert = ~c1_empty & ~c2_empty & (fisher_or < 1)
    fisher_or[invert] = -1.0/fisher_or[invert]
    fisher_or[c2_empty] = -np.inf
    fisher_or[c1_empty] = np.inf
    fisher_or[c1_empty & c2_empty] = np.nan

    # Colour scale limit: the largest finite magnitude. Fall back to 1.0
    # (odds ratio of 1) when no window has a finite value.
    finite_vals = fisher_or[np.isfinite(fisher_or)]
    vmax_abs = np.abs(finite_vals).max() if finite_vals.size else 1.0

    # Clip infinities to the ends of the colour scale.
    infinite = np.isinf(fisher_or)
    fisher_or[infinite] = np.sign(fisher_or[infinite])*vmax_abs

    plt.figure(figsize=(fig_width, fig_hight))  # Figure size
    # space between the subplots.
    plt.subplots_adjust(hspace=fig_hspace, wspace=fig_wspace)

    centers_x = fisher_res["w_center_x"]
    centers_y = fisher_res["w_center_y"]

    plt.subplot(221)
    plt.pcolormesh(centers_x, centers_y,
                   np.ma.masked_invalid(fisher_c1).T, cmap="Reds")
    plt.colorbar()
    _label_panel(xlabel, ylabel, "Counts "+pop1_label)

    plt.subplot(222)
    plt.pcolormesh(centers_x, centers_y,
                   np.ma.masked_invalid(fisher_c2).T, cmap="Reds")
    plt.colorbar()
    _label_panel(xlabel, ylabel, "Counts "+pop2_label)

    # Copy before customising so the registered colormap is left unchanged.
    cmap = copy.copy(plt.get_cmap('RdBu'))
    cmap.set_bad(color='k', alpha=1.)

    # Number of decimal places needed to print the colorbar end labels.
    cbar_lo = 1.0/vmax_abs
    cbar_lo_places = max(0, -floor(log10(cbar_lo))+1)
    cbar_hi = vmax_abs
    cbar_hi_places = max(0, -floor(log10(cbar_hi))+1)

    plt.subplot(223)
    plt.pcolormesh(centers_x, centers_y,
                   np.ma.masked_invalid(fisher_or).T, cmap=cmap,
                   vmin=-vmax_abs, vmax=vmax_abs)
    cbar = plt.colorbar(ticks=([-vmax_abs, 0, vmax_abs]))
    cbar.ax.set_yticklabels(['< '+str(round(cbar_lo, int(cbar_lo_places))), '1',
                             '> '+str(round(cbar_hi, int(cbar_hi_places)))])
    _label_panel(xlabel, ylabel, "Odds ratio")

    plt.subplot(224)
    plt.pcolormesh(centers_x, centers_y,
                   np.log10(np.ma.masked_invalid(fisher_pv)).T, cmap="RdGy")
    plt.colorbar()
    _label_panel(xlabel, ylabel, "Fisher test\np-value (logarithm of 10)")

    # Save to file when a path is given, otherwise keep the default
    # behavior of showing the figure.
    if out_file_path is None:
        plt.show()
    else:
        plt.savefig(out_file_path, dpi=300)


# Static report page. It embeds an image named "1.png" that is expected to
# sit next to the HTML file.
_REPORT_HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
  <title>Bootstrap Example</title>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
  <style>
  div.container_1 {
    width:600px;
    margin: auto;
    padding-right: 10;
  }
  div.table {
    width:600px;
    margin: auto;
    padding-right: 10;
  }
  </style>
</head>
<body>
  <div class="jumbotron text-center">
    <h1> Fisher's Plot </h1>
  </div>
  <div class="container">
    <div class="row">
      <div class="col-sm-4">
        <img src="1.png" alt="Smiley face" height="800" width="800">
      </div>

    </div>
  </div>
</body>
</html>
"""


def HTML_Gen(html):
    """
    Write the static Fisher's Plot report page.

    Args:
        html (str): path of the HTML file to create (overwritten if it
            exists). The page references an image "1.png"; this function
            does not create that image.
    """
    with open(html, 'w') as out_html:
        out_html.write(_REPORT_HTML)


def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel, pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):
    """
    Compute and plot the local Fisher's exact test for two FASTA files.

    Args:
        Fasta1 (str): path to the first FASTA file.
        Fasta2 (str): path to the second FASTA file.
        windows_per_frame (int or str): passed to 'local_fisher_2d' as int.
        overlap_factor (int or str): passed to 'local_fisher_2d' as int.
        xlabel (str): X axis label.
        ylabel (str): Y axis label.
        pop1_label (str): label of the first population.
        pop2_label (str): label of the second population.
        htmlOutDir (str): report directory (joined onto 'Workdirpath').
        htmlFname (str): file name of the HTML report.
        Workdirpath (str): base directory for the report.
    """
    # Create the directory the report is actually written to.
    report_dir = os.path.join(Workdirpath, htmlOutDir)
    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)

    # Load both sets of peptides:
    pop1_seqs = load_fasta_file(Fasta1)
    pop2_seqs = load_fasta_file(Fasta2)

    # Calculate quantitative features: volume and hydropathy
    mean_volume = Feature(get_aa2volume()).then(average)
    mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

    fs = FeatureSet("volume'n'hydropathy")
    fs.add(mean_volume)
    fs.add(mean_hydropathy)

    pop1_features = fs(pop1_seqs)
    pop2_features = fs(pop2_seqs)

    # Do local Fisher:
    result = local_fisher_2d(pop1_features, pop2_features,
                             windows_per_frame=int(windows_per_frame),
                             overlap_factor=int(overlap_factor))

    # Plot local Fisher:
    # NOTE(review): the figure is saved as "out.png" in the current working
    # directory, while the HTML report embeds "1.png" from the report
    # directory - confirm the caller copies/renames the image.
    _plot_local_fisher_2d(result, xlabel=xlabel,
                          ylabel=ylabel,
                          pop1_label=pop1_label,
                          pop2_label=pop2_label,
                          out_file_path=os.path.join(os.getcwd(), "out.png"))

    HTML_Gen(os.path.join(report_dir, htmlFname))


if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("-f1", "--Fasta1", required=True, default=None, help="First fasta file ")
    parser.add_argument("-f2", "--Fasta2", required=True, default=None, help="Second fasta file")
    parser.add_argument("-o", "--overlap_factor", required=False, type=int, default=5, help="Overlap factor")
    parser.add_argument("-w", "--windows_per_frame", required=False, type=int, default=5, help="Windows per frame")
    parser.add_argument("-x", "--xlabel", required=True, default=None, help="X label")
    parser.add_argument("-y", "--ylabel", required=True, default=None, help="Y label")
    parser.add_argument("-p1", "--pop1_label", required=True, default=None, help="First population label")
    parser.add_argument("-p2", "--pop2_label", required=True, default=None, help="Second population label")
    parser.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(), 'report_dir'), help="Path to html dir")
    parser.add_argument("--htmlFname", required=False, help="html output file", default="report.html")
    parser.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Path to output Working Directory")
    args = parser.parse_args()

    run(args.Fasta1, args.Fasta2, args.windows_per_frame, args.overlap_factor, args.xlabel, args.ylabel, args.pop1_label, args.pop2_label, args.htmlOutDir, args.htmlFname, args.Workdirpath)