comparison w4mcorcov_salience.R @ 13:2ae2d26e3270 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author eschen42
date Wed, 12 Dec 2018 09:20:02 -0500
parents 06c51af11531
children
comparison
equal deleted inserted replaced
12:ddaf84e15d06 13:2ae2d26e3270
17 t_data_matrix <- t(data_matrix) 17 t_data_matrix <- t(data_matrix)
18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { 18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) {
19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") 19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric")
20 return (NULL) 20 return (NULL)
21 } 21 }
22
23 feature_names <- colnames(t_data_matrix)
24
22 n_features <- ncol(t_data_matrix) 25 n_features <- ncol(t_data_matrix)
23 n_features_plus_1 <- 1 + n_features
24 features <- colnames(t_data_matrix)
25 n_samples <- nrow(t_data_matrix) 26 n_samples <- nrow(t_data_matrix)
26 if ( length(sample_class) != n_samples ) { 27 if ( length(sample_class) != n_samples ) {
27 strF(data_matrix) 28 strF(data_matrix)
28 strF(sample_class) 29 strF(sample_class)
29 failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class))) 30 failure_action(
31 sprintf(
32 "w4msalience: The data_matrix has %d samples but sample_class has %d"
33 , n_samples
34 , length(sample_class)
35 )
36 )
30 return (NULL) 37 return (NULL)
31 } 38 }
32 # end sanity checks 39 # end sanity checks
33 40
34 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." 41 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'."
37 x = as.data.frame(t_data_matrix) 44 x = as.data.frame(t_data_matrix)
38 , by = list(sample_class) 45 , by = list(sample_class)
39 , FUN = "median" 46 , FUN = "median"
40 ) 47 )
41 48
42 # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'."
43 maxOfFeatureBySampleClassLevel <- aggregate(
44 x = as.data.frame(t_data_matrix)
45 , by = list(sample_class)
46 , FUN = "max"
47 )
48
49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." 49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'."
50 # cv is less robust; deviation from normality degrades performance 50 # cv is less robust; deviation from normality degrades performance
51 # cv(x) == sd(x) / mean(x) 51 # cv(x) == sd(x) / mean(x)
52 # rcv is a "robust" coefficient of variation, expressed as a proportion 52 # rcv is a "robust" coefficient of variation, expressed as a proportion
53 # rcv(x) == mad(x) / median(x) 53 # rcv(x) == mad(x) / median(x)
54 madOfFeatureBySampleClassLevel <- aggregate( 54 madOfFeatureBySampleClassLevel <- aggregate(
55 x = as.data.frame(t_data_matrix) 55 x = as.data.frame(t_data_matrix)
56 , by = list(sample_class) 56 , by = list(sample_class)
57 , FUN = "mad" 57 , FUN = "mad"
58 ) 58 )
59 rcvOfFeatureBySampleClassLevel <- as.matrix( 59
60 madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1] 60 # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))`
61 # produces a matrix with two rows and ten columns
62
63 my_list <- apply(
64 X = array(1:n_features)
65 , MARGIN = 1
66 , FUN = function(x) {
67 my_df <- data.frame(
68 median = medianOfFeatureBySampleClassLevel[ , 1 + x]
69 , mad = madOfFeatureBySampleClassLevel[ , 1 + x]
70 )
71 my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1]
72 my_df <- my_df[ order(my_df$median, decreasing = TRUE), ]
73 my_dist_df <- my_df[ 1:2, ]
74 # "robust coefficient of variation", i.e.,
75 # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
76 rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1]
77 dist_result <-
78 ( my_dist_df$median[1] - my_dist_df$median[2] ) /
79 sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] )
80 if (is.infinite(dist_result) || is.nan(dist_result))
81 dist_result <- 0
82 mean_median <- mean(my_df$median)
83 salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0
84 return (
85 data.frame(
86 dist_result = dist_result
87 , max_median = my_df$median[1]
88 , mean_median = mean_median
89 , salience_result = salience_result
90 , salient_level = my_df$salient_level[1]
91 , rcv_result = rcv_result
92 )
93 )
94 }
61 ) 95 )
62 rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) 96 results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]])
97 results_df <- as.data.frame(t(results_matrix))
63 98
64 # "For each feature, 'select max(max_feature_intensity) from feature'." 99 relative_salient_distance <- unlist(results_df$dist_result)
65 maxApplyMaxOfFeatureBySampleClassLevel <- sapply( 100 salience <- unlist(results_df$salience_result)
66 X = 1:n_features 101 salient_level <- unlist(results_df$salient_level)
67 , FUN = function(i) { 102 max_median <- unlist(results_df$max_median)
68 match( 103 mean_median <- unlist(results_df$mean_median)
69 max(maxOfFeatureBySampleClassLevel[, i + 1]) 104 rcv_result <- unlist(results_df$rcv_result)
70 , maxOfFeatureBySampleClassLevel[, i + 1] 105
71 ) 106 salience_df <-
72 } 107 data.frame(
108 # the feature name
109 feature = feature_names
110 # the name (or factor-level) of the class-level with the highest median intensity for the feature
111 , max_level = salient_level
112 # the median intensity for the feature and the level max_level
113 , max_median = max_median
114 # the distance between the maximum intensities for the feature at the two highest levels
115 , relative_salient_distance = relative_salient_distance
116 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
117 , salience_rcv = rcv_result
118 # the mean of the medians of intensity for all class-levels for the feature
119 , mean_median = mean_median
120 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
121 , salience = salience
122 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
123 , stringsAsFactors = FALSE
73 ) 124 )
74
75 # "For each feature, 'select mean(median_feature_intensity) from feature'."
76 meanApplyMedianOfFeatureBySampleClassLevel <- sapply(
77 X = 1:n_features
78 , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1])
79 )
80
81 # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature
82 # is for one particular class-level relative to the intensity across all class-levels.
83 salience_df <- data.frame(
84 # the feature name
85 feature = features
86 # the name (or factor-level) of the class-level with the highest median intensity for the feature
87 , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1]
88 # the median intensity for the feature and the level max_level
89 , max_median = sapply(
90 X = 1:n_features
91 , FUN = function(i) {
92 maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i]
93 }
94 )
95 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
96 , max_rcv = sapply(
97 X = 1:n_features
98 , FUN = function(i) {
99 rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i]
100 }
101 )
102 # the mean of the medians of intensity for all class-levels for the feature
103 , mean_median = meanApplyMedianOfFeatureBySampleClassLevel
104 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
105 , stringsAsFactors = FALSE
106 )
107 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
108 salience_df$salience <- sapply(
109 X = 1:nrow(salience_df)
110 , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } )
111 )
112 # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
113 salience_df$salient_rcv <- salience_df$max_rcv
114 125
115 return (salience_df) 126 return (salience_df)
116 } 127 }
117 128