annotate R_functions/outlier_trimming.R @ 0:cb54350e76ae draft default tip

Uploaded
author jason-ellul
date Wed, 01 Jun 2016 03:24:56 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
1 # Finding outliers by standard deviation
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
2
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
# Collect the samples that are outliers on either of the first two principal
# components: a sample is reported when its value in column 1 OR column 2 of
# 'pca_data$values' lies more than 'numsds' standard deviations away from
# that column's median.
#
# pca_data: object whose 'values' element is indexed as a matrix-like table
#           with at least two columns (one per principal component).
# xsamples: accepted for interface compatibility; not used by this function.
# numsds:   number of standard deviations tolerated on either side of the
#           median, passed through to find_outliers().
#
# Returns the union (duplicates removed) of the row indices flagged by
# find_outliers() for column 1 and for column 2.
outliers_by_sd <- function(pca_data, xsamples, numsds) {
  scores <- pca_data$values
  flagged_first <- find_outliers(scores[, 1], numsds)
  flagged_second <- find_outliers(scores[, 2], numsds)
  union(flagged_first, flagged_second)
}
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
11
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
# Compute outliers.
# Returns the indices of all elements of 'input_data' that lie more than
# 'numsds' standard deviations away from the sample median.
#
# input_data: numeric vector of values to screen.
# numsds:     number of standard deviations tolerated on either side of the
#             median.
#
# Missing values are skipped when computing the median and the standard
# deviation, and are never reported as outliers. Without na.rm = TRUE a
# single NA would turn both bounds into NA and the function would silently
# report no outliers at all.
#
# Returns an integer vector of positions in 'input_data' (empty when nothing
# falls outside the bounds).
find_outliers <- function(input_data, numsds) {
  centre <- median(input_data, na.rm = TRUE)
  # The spread is the same for both bounds, so compute sd() only once.
  spread <- numsds * sd(input_data, na.rm = TRUE)
  lower <- centre - spread
  upper <- centre + spread

  # which() ignores NA comparisons, so missing entries are never flagged.
  which(input_data < lower | input_data > upper)
}