diff tools/mentalist_tree/mentalist_tree @ 8:f29e7738bb64 draft

planemo upload for repository https://github.com/WGS-TB/MentaLiST/tree/master/galaxy commit d6aa0144475c93626400a46e933acbbd77fefb01-dirty
author dfornika
date Mon, 18 Jun 2018 15:52:32 -0400
parents
children a6cd59f35832
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mentalist_tree/mentalist_tree	Mon Jun 18 15:52:32 2018 -0400
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import numpy as np
+
+import Bio.Phylo
+from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor
+
+def usage():
+  print("usage: mentalist_tree <input.tsv>\n")
+
+def process_input_matrix(input_matrix):
+    """ Converts an array-of-arrays containting sample IDs and distances
+        into a BioPython DistanceMatrix object
+    """
+    input_matrix.pop(0)
+    sample_names = [row[0] for row in input_matrix]
+    for row in input_matrix:
+        row.pop(0)
+    distance_matrix = []
+    for input_matrix_row in input_matrix:
+        distance_matrix.append([int(i) for i in input_matrix_row])
+    """ np.tril() converts a matrix like this: [[0 1 2]
+                                                [1 0 1]
+                                                [2 1 0]]
+        ...into this: [[0 0 0]
+                       [1 0 0]
+                       [2 1 0]]
+        ...but what we need to pass to DistanceMatrix() is this: [[0]
+                                                                  [1 0]
+                                                                  [2 1 0]]
+        ...so that's what the (somewhat cryptic) code below does.
+    """
+    distance_matrix = np.tril(np.array(distance_matrix))
+    num_rows = distance_matrix.shape[0]
+    """ masking the distance matrix with tril_indices gives a linearized
+        distance matrix [0 1 0 2 1 0] that we need to re-construct into [[0], [1, 0], [2, 1, 0]]
+    """
+    lower_triangular_idx_mask = np.tril_indices(num_rows)
+    linear_distance_matrix = distance_matrix[lower_triangular_idx_mask]
+    distance_matrix = []
+    min = 0
+    max = 1
+    for i in range(num_rows):
+        distance_matrix.append(linear_distance_matrix[min:max].tolist())
+        min = max
+        max = max + (i + 2) 
+    distance_matrix = DistanceMatrix(names=sample_names, matrix=distance_matrix)
+    return distance_matrix
+
+def main():
+    if len(sys.argv) < 2:
+        usage()
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    reader = csv.reader(open(input_file, "r"), delimiter="\t")
+    input_matrix = list(reader)
+    distance_matrix = process_input_matrix(input_matrix)
+    constructor = DistanceTreeConstructor()
+    tree = constructor.nj(distance_matrix)
+    Bio.Phylo.write(tree, sys.stdout, 'newick')
+
+if __name__ == '__main__':
+    main()