annotate src/edu/unc/genomics/visualization/KMeans.java @ 2:e16016635b2a

Uploaded
author timpalpant
date Mon, 13 Feb 2012 22:12:06 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
1 package edu.unc.genomics.visualization;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
2
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
3 import java.io.BufferedReader;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
4 import java.io.BufferedWriter;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
5 import java.io.IOException;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
6 import java.nio.charset.Charset;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
7 import java.nio.file.Files;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
8 import java.nio.file.Path;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
9 import java.util.ArrayList;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
10 import java.util.Arrays;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
11 import java.util.HashMap;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
12 import java.util.List;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
13 import java.util.Map;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
14 import java.util.Random;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
15
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
16 import org.apache.commons.lang3.StringUtils;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
17 import org.apache.commons.math.stat.clustering.Cluster;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
18 import org.apache.commons.math.stat.clustering.KMeansPlusPlusClusterer;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
19 import org.apache.log4j.Logger;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
20
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
21 import com.beust.jcommander.Parameter;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
22
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
23 import edu.unc.genomics.CommandLineTool;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
24 import edu.unc.genomics.ReadablePathValidator;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
25 import edu.unc.genomics.io.IntervalFileSnifferException;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
26 import edu.unc.genomics.io.WigFileException;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
27
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
28 public class KMeans extends CommandLineTool {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
29
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
30 private static final Logger log = Logger.getLogger(KMeans.class);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
31
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
32 @Parameter(names = {"-i", "--input"}, description = "Input file (matrix2png format)", required = true, validateWith = ReadablePathValidator.class)
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
33 public Path inputFile;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
34 @Parameter(names = {"-k", "--clusters"}, description = "Number of clusters")
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
35 public int k = 10;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
36 @Parameter(names = {"-1", "--min"}, description = "Minimum column to use for clustering")
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
37 public int minCol = 1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
38 @Parameter(names = {"-2", "--max"}, description = "Maximum column to use for clustering")
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
39 public Integer maxCol;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
40 @Parameter(names = {"-o", "--output"}, description = "Output file (clustered matrix2png format)", required = true)
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
41 public Path outputFile;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
42
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
43 private Map<String, String> rows = new HashMap<String, String>();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
44 private List<KMeansRow> data = new ArrayList<KMeansRow>();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
45
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
46 @Override
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
47 public void run() throws IOException {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
48 log.debug("Loading data from the input matrix");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
49 String headerLine = "";
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
50 try (BufferedReader reader = Files.newBufferedReader(inputFile, Charset.defaultCharset())) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
51 // Header line
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
52 int lineNum = 1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
53 headerLine = reader.readLine();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
54 int numColsInMatrix = StringUtils.countMatches(headerLine, "\t");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
55
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
56 // Validate the range info
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
57 if (maxCol != null) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
58 if (maxCol > numColsInMatrix) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
59 throw new RuntimeException("Invalid range of data specified for clustering ("+maxCol+" > "+numColsInMatrix+")");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
60 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
61 } else {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
62 maxCol = numColsInMatrix;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
63 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
64
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
65 // Loop over the rows and load the data
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
66 String line;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
67 while ((line = reader.readLine()) != null) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
68 lineNum++;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
69 if (StringUtils.countMatches(line, "\t") != numColsInMatrix) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
70 throw new RuntimeException("Irregular input matrix does not have same number of columns on line " + lineNum);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
71 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
72
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
73 int delim = line.indexOf('\t');
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
74 String id = line.substring(0, delim);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
75 String[] row = line.substring(delim+1).split("\t");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
76 String[] subset = Arrays.copyOfRange(row, minCol, maxCol);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
77 float[] rowData = new float[subset.length];
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
78 for (int i = 0; i < subset.length; i++) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
79 try {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
80 rowData[i] = Float.parseFloat(subset[i]);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
81 } catch (NumberFormatException e) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
82 rowData[i] = Float.NaN;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
83 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
84 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
85 data.add(new KMeansRow(id, rowData));
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
86 rows.put(id, line);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
87 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
88 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
89
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
90 // Perform the clustering
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
91 log.debug("Clustering the data");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
92 Random rng = new Random();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
93 KMeansPlusPlusClusterer<KMeansRow> clusterer = new KMeansPlusPlusClusterer<KMeansRow>(rng);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
94 List<Cluster<KMeansRow>> clusters = clusterer.cluster(data, k, 50);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
95
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
96 // Write to output
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
97 log.debug("Writing clustered data to output file");
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
98 try (BufferedWriter writer = Files.newBufferedWriter(outputFile, Charset.defaultCharset())) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
99 writer.write(headerLine);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
100 writer.newLine();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
101 int n = 1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
102 int count = 1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
103 for (Cluster<KMeansRow> cluster : clusters) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
104 int numRowsInCluster = cluster.getPoints().size();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
105 int stop = count + numRowsInCluster - 1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
106 log.info("Cluster "+(n++)+": rows "+count+"-"+stop);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
107 count = stop+1;
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
108 for (KMeansRow row : cluster.getPoints()) {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
109 writer.write(rows.get(row.getId()));
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
110 writer.newLine();
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
111 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
112 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
113 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
114 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
115
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
116 public static void main(String[] args) throws IOException, WigFileException, IntervalFileSnifferException {
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
117 new KMeans().instanceMain(args);
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
118 }
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
119
e16016635b2a Uploaded
timpalpant
parents:
diff changeset
120 }