Mercurial > repos > chemteam > biomd_rmsd_clustering
diff rmsd_clustering.py @ 0:ee1f38eb220e draft
"planemo upload for repository https://github.com/galaxycomputationalchemistry/galaxy-tools-compchem/ commit 1b23e024af45cc0999d9142d07de6897d4189ec2"
author | chemteam |
---|---|
date | Mon, 24 Aug 2020 06:08:17 -0400 |
parents | |
children | b9c46dbe9605 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rmsd_clustering.py Mon Aug 24 06:08:17 2020 -0400 @@ -0,0 +1,114 @@ +import argparse +import json + +import matplotlib.pyplot as plt + +import numpy as np + +from scipy.cluster.hierarchy import cophenet, dendrogram, linkage +from scipy.spatial.distance import pdist + + +def json_to_np(fname, start=None, end=None): + """ + Load json file and convert to numpy array + """ + with open(fname) as f: + k = json.load(f) + print(np.array(k)[:, :, start:end].shape) + return np.array(k)[:, :, start:end] + + +def flatten_tensor(tensor, normalize=True): + """ + Flatten tensor to a 2D matrix along the time axis + """ + av = np.mean(tensor, axis=(0, 1)) if normalize else 1 + return np.mean(tensor/av, axis=2) + + +def get_cluster_linkage_array(mat, clustering_method='average'): + Z = linkage(mat, clustering_method) + c, coph_dists = cophenet(Z, pdist(mat)) + print('Cophenetic correlation coefficient: {}'.format(c)) + return Z + + +def plot_dist_mat(mat, output, cmap='plasma'): + """ + Plot distance matrix as heatmap + """ + fig, ax = plt.subplots(1) + p = ax.pcolormesh(mat, cmap=cmap) + plt.xlabel('Trajectory number') + plt.ylabel('Trajectory number') + plt.colorbar(p) + plt.draw() + plt.savefig(output, format='png') + + +def plot_dendrogram(Z, output): + plt.figure(figsize=(25, 10)) + plt.title('Hierarchical Clustering Dendrogram') + plt.xlabel('Trajectory index') + plt.ylabel('distance') + dendrogram( + Z, + leaf_rotation=90., # rotates the x axis labels + leaf_font_size=8., # font size for the x axis labels + ) + plt.savefig(output, format='png') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--json', help='JSON input file (for 3D tensor).') + parser.add_argument('--mat', help='Input tabular file (for 2D matrix).') + parser.add_argument('--outp-mat', help='Tabular output file.') + parser.add_argument('--Z', required=True, + help='File for cluster linkage array.') + parser.add_argument('--dendrogram', + help="Path to the output dendrogram file") + parser.add_argument('--heatmap', + help="Path to the output distance matrix file") + parser.add_argument('--clustering-method', default='average', + choices=['single', 'complete', 'average', + 'centroid', 'median', 'ward', 'weighted'], + help="Method to use for clustering.") + parser.add_argument('--cmap', type=str, default='plasma', + help="Matplotlib colormap to use" + "for plotting distance matrix.") + parser.add_argument('--start', type=int, + help="First trajectory frame to" + "calculate distance matrix") + parser.add_argument('--end', type=int, + help="Last trajectory frame to" + "calculate distance matrix") + parser.add_argument('--normalize', action="store_true", + help="Normalize the RMSD variation over" + "the trajectories before averaging.") + args = parser.parse_args() + + print(args) + if args.json: + tensor = json_to_np(args.json, args.start, args.end) + mat = flatten_tensor(tensor, args.normalize) + np.savetxt(args.outp_mat, mat) + elif args.mat: + mat = np.loadtxt(args.mat) + else: + print("Either --json or --mat must be specified.") + exit(1) + + Z = get_cluster_linkage_array(mat, args.clustering_method) + np.savetxt(args.Z, Z) + + if args.heatmap: + plot_dist_mat(mat, args.heatmap, args.cmap) + + if args.dendrogram: + plot_dendrogram(Z, args.dendrogram) + + +if __name__ == "__main__": + main()