0
|
1 #==================================================================================
|
|
2 # About the script
|
|
3 #==================================================================================
|
|
4 # Version: V1
|
|
5 # This script works for single sample only
|
|
6 # It takes GE (Gene Expression) and PE (Protein expression) data of one sample and perform correlation, regression analysis between PE and GE data
|
|
7 # Input data can be of tsv format
|
|
8 # Script also need a parameter or option file
|
|
9
|
|
10 #==================================================================================
|
|
11 # Dependencies
|
|
12 #==================================================================================
|
|
13 # Following R package has to be installed.
|
|
14 # data.table
|
|
15 # gplots
|
|
16 # MASS
|
|
17 # DMwR
|
|
18 # mgcv
|
|
19 # It can be installed by following R command in R session. e.g. install.packages("data.table")
|
|
20
|
|
21 #==================================================================================
|
|
22 # How to Run
|
|
23 #==================================================================================
|
|
24 # Rscript PE_GE_association_singleSample_V1.r <PE_file> <GE_file> <Option_file containing tool parameters> <Ensembl map file containing directory path> <outdir>
|
|
25
|
|
26 #==================================================================================
|
|
27 # Arguments
|
|
28 #==================================================================================
|
|
29 # Arg1. <PE file>: PE data (tsv format)
|
|
30 # Arg2. <GE file>: GE data (tsv format)
|
|
31 # Arg3. <Option file>: tsv format, key\tvalue
|
|
32 # Options are
|
|
33 # PE_idcolno: Column number of PE file containing protein IDs
|
|
34 # GE_idcolno: Column number of GE file containing transcript IDs
|
|
35 # PE_expcolno: Column number of PE file containing protein expression values
|
|
36 # GE_expcolno: Column number of GE file containing transcript expression values
|
|
37 # PE_idtype: protein id type. It can be either Uniprot or Ensembl or HGNC_symbol
|
|
38 # GE_idtype: transcript id type. At present it is only one type i.e. Ensembl or HGNC_symbol
|
|
39 # Organism: Organism
|
|
40 # writeMapUnmap: Whether to write mapped and unmapped data in input data format. It takes value as 1 or 0. If 1, mapped and unmapped data is written. Default is 1.
|
|
41 # doscale: Whether perform scaling to input data or not. If yet, abundance values are normalized by standard normalization. Default 1
|
|
42 # Arg4. <Ensembl map file containg directory>: Path to Ensembl map file containg directory e.g. /home/user/Ensembl/mapfiles
|
|
43 # Arg5. <Outdir>: output directory (e.g. /home/user/out1)
|
|
44
|
|
45 #==================================================================================
|
|
46 # Sample option file
|
|
47 #==================================================================================
|
|
48 #PE_idcolno 7
|
|
49 #GE_idcolno 1
|
|
50 #PE_expcolno 2
|
|
51 #GE_expcolno 3
|
|
52 #PE_idtype Ensembl
|
|
53 #GE_idtype Ensembl
|
|
54 #Organism mmusculus
|
|
55 #writeMapUnmap 1
|
|
56 #doscale 1
|
|
57
|
|
58 #==================================================================================
|
|
59 # Output
|
|
60 #==================================================================================
|
|
61 # The script outputs image and data folder along with Correlation_result.html and Result.log file
|
|
62 # Result.log: Log file
|
|
63 # Correlation_result.html; main result file in html format
|
|
64
|
|
65 # data folder contains following output files
|
|
66
|
|
67 # PE_abundance.tsv: 2 column tsv file containing mapped id and protein expression values
|
|
68 # GE_abundance.tsv: 2 column tsv file containing mapped id and transcript expression values
|
|
69
|
|
70 # If writeMapUnmap is 1 i.e. to write mapped and unmapped data, 4 additional file will be written
|
|
71 # PE_unmapped.tsv: Output format is same as input, PE unmapped data is written
|
|
72 # GE_unmapped.tsv: Output format is same as input, GE unmapped data is written
|
|
73 # PE_mapped.tsv: Output format is same as input, PE mapped data is written
|
|
74 # GE_mapped.tsv: Output format is same as input, GE mapped data is written
|
|
75
|
|
76 # PE_GE_influential_observation.tsv: Influential observations based on cook's distance
|
|
77 # PE_GE_kmeans_clusterpoints.txt: Observations clustered based on kmeans clustering. File contains cluster assignment of each observations
|
|
78
|
|
79 #==================================================================================
|
|
80 # ............................SCRIPT STARTS FROM HERE .............................
|
|
81 #==================================================================================
|
|
82 # Warning Off
|
5
|
83 # oldw <- getOption("warn")
|
|
84 # options(warn = -1)
|
0
|
85 #=============================================================================================================
|
|
86 # Functions
|
|
87 #=============================================================================================================
|
|
88 usage <- function()
|
|
89 {
|
|
90 cat("\n\n###########\nERROR: Rscript PE_GE_association_singleSample_V1.r <PE file> <GE file> <Option file containing tool parameters> <Ensembl map file containing directory path> <outdir>\n###########\n\n");
|
|
91 }
|
|
92
|
|
93 #=============================================================================================================
|
|
94 # Global variables
|
|
95 #=============================================================================================================
|
|
96 noargs = 13;
|
|
97
|
|
98 #=============================================================================================================
|
|
99 # Parse command line arguments in args object
|
|
100 #=============================================================================================================
|
|
101 args = commandArgs(trailingOnly = TRUE);
|
|
102
|
|
103
|
|
104 #=============================================================================================================
|
|
105 # Check for No of arguments
|
|
106 #=============================================================================================================
|
|
107 if(length(args) != noargs)
|
|
108 {
|
|
109 usage();
|
|
110 stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse=""));
|
|
111 }
|
|
112
|
|
113 #======================================================================
|
|
114 # Load libraries
|
|
115 #======================================================================
|
2
|
116 library(data.table);
|
|
117 library(lattice);
|
|
118 library(grid);
|
|
119 library(nlme);
|
|
120 library(gplots);
|
|
121 library(MASS);
|
|
122 library(DMwR);
|
|
123 library(mgcv);
|
0
|
124
|
|
125
|
|
126 #=============================================================================================================
|
|
127 # Set variables
|
|
128 #=============================================================================================================
|
|
129 #PE_file = args[1]; # Protein abundance data file
|
|
130 #GE_file = args[2]; # Gene expression data file
|
|
131 #option_file = args[3]; # Option file containing various parameters
|
|
132 #biomartdir = args[4]; # Biomart map file containing directory path in local system
|
|
133 #outdir = args[5]; # output directory
|
|
134
|
|
135 PE_file = args[1]; # Protein abundance data file
|
|
136 GE_file = args[2]; # Gene expression data file
|
|
137 #option_file = args[3]; # Option file containing various parameters
|
|
138 #biomartdir = args[4]; # Biomart map file containing directory path in local system
|
|
139 outdir = args[13]; # output directory
|
|
140
|
|
141 #imagesubdirprefix = "image";
|
|
142 #datasubdirprefix = "data";
|
|
143 #htmloutfile = "Correlation_result.html";
|
|
144 htmloutfile = args[12]
|
|
145
|
|
146 #logfile = "Result.log";
|
|
147 PE_outfile_mapped = "PE_mapped.tsv";
|
|
148 GE_outfile_mapped = "GE_mapped.tsv";
|
|
149 PE_outfile_unmapped = "PE_unmapped.tsv";
|
|
150 GE_outfile_unmapped = "GE_unmapped.tsv";
|
|
151 PE_expfile = "PE_abundance.tsv";
|
|
152 GE_expfile = "GE_abundance.tsv";
|
|
153 PE_outfile_excluded_naInf = "PE_excluded_NA_Inf.tsv";
|
|
154 GE_outfile_excluded_naInf = "GE_excluded_NA_Inf.tsv";
|
|
155
|
|
156 #=============================================================================================================
|
|
157 # Check input files existance
|
|
158 #=============================================================================================================
|
|
159 if(! file.exists(PE_file))
|
|
160 {
|
|
161 usage();
|
|
162 stop(paste("Input PE_file file does not exists. Path given: ",PE_file,sep="",collapse=""));
|
|
163 }
|
|
164 if(! file.exists(GE_file))
|
|
165 {
|
|
166 usage();
|
|
167 stop(paste("Input GE_file does not exists. Path given: ",GE_file,sep="",collapse=""));
|
|
168 }
|
|
169 #if(! file.exists(option_file))
|
|
170 #{
|
|
171 # usage();
|
|
172 # stop(paste("Input option_file does not exists. Path given: ",option_file,sep="",collapse=""));
|
|
173 #}
|
|
174
|
|
175 #=============================================================================================================
|
|
176 # Read param_file and set parameter/option data frame
|
|
177 #=============================================================================================================
|
|
178 #optiondf = read.table(option_file, header = FALSE, stringsAsFactors = FALSE)
|
|
179 #rownames(optiondf) = optiondf[,1];
|
|
180
|
|
181 #=============================================================================================================
|
|
182 # Define option variables
|
|
183 #=============================================================================================================
|
|
184 #PE_idcolno = as.numeric(optiondf["PE_idcolno",2]);
|
|
185 #GE_idcolno = as.numeric(optiondf["GE_idcolno",2]);
|
|
186 #PE_expcolno = as.numeric(optiondf["PE_expcolno",2]);
|
|
187 #GE_expcolno = as.numeric(optiondf["GE_expcolno",2]);
|
|
188 #PE_idtype = optiondf["PE_idtype",2];
|
|
189 #GE_idtype = optiondf["GE_idtype",2];
|
|
190 #Organism = optiondf["Organism",2];
|
|
191 #writeMapUnmap = as.logical(as.numeric(optiondf["writeMapUnmap",2]));
|
|
192 #doscale = as.logical(as.numeric(optiondf["doscale",2]));
|
|
193
|
|
194 PE_idcolno = as.numeric(args[3])
|
|
195 GE_idcolno = as.numeric(args[4])
|
|
196 PE_expcolno = as.numeric(args[5])
|
|
197 GE_expcolno = as.numeric(args[6])
|
|
198 PE_idtype = args[7]
|
|
199 GE_idtype = args[8]
|
|
200 #Organism = args[9]
|
|
201 writeMapUnmap = as.logical(as.numeric(args[10]));
|
|
202 doscale = as.logical(as.numeric(args[11]));
|
|
203
|
|
204
|
|
205 #1 PE_file = "test_data/PE_mouse_singlesample.txt"
|
|
206 #2 GE_file = "test_data/GE_mouse_singlesample.txt"
|
|
207 #3 PE_idcolno = 7
|
|
208 #4 GE_idcolno = 1
|
|
209 #5 PE_expcolno = 13
|
|
210 #6 GE_expcolno = 10
|
|
211 #7 PE_idtype = "Ensembl_with_version"
|
|
212 #8 GE_idtype = "Ensembl_with_version"
|
|
213 #10 writeMapUnmap = 1
|
|
214 #11 doscale = 1
|
|
215 #9 biomart_mapfile = "test_data/mmusculus_gene_ensembl__GRCm38.p6.map"
|
|
216 #12 htmloutfile = "html_out.html"
|
|
217 #13 outdir = "output_elements"
|
|
218
|
|
219 #=============================================================================================================
|
|
220 # Set column name of biomart map file (idtype) based on whether Ensembl or Uniprot
|
|
221 #=============================================================================================================
|
|
222 if(PE_idtype == "Ensembl")
|
|
223 {
|
|
224 PE_idtype = "ensembl_peptide_id";
|
|
225 }else
|
|
226 {
|
|
227 if(PE_idtype == "Ensembl_with_version")
|
|
228 {
|
|
229 PE_idtype = "ensembl_peptide_id_version";
|
|
230 }else{
|
|
231 if(PE_idtype == "HGNC_symbol")
|
|
232 {
|
|
233 PE_idtype = "hgnc_symbol";
|
|
234 }
|
|
235 }
|
|
236 }
|
|
237
|
|
238 if(GE_idtype == "Ensembl")
|
|
239 {
|
|
240 GE_idtype = "ensembl_transcript_id";
|
|
241 }else
|
|
242 {
|
|
243 if(GE_idtype == "Ensembl_with_version")
|
|
244 {
|
|
245 GE_idtype = "ensembl_transcript_id_version";
|
|
246 }else{
|
|
247 if(GE_idtype == "HGNC_symbol")
|
|
248 {
|
|
249 GE_idtype = "hgnc_symbol";
|
|
250 }
|
|
251 }
|
|
252 }
|
|
253 #=============================================================================================================
|
|
254 # Identify biomart mapping file
|
|
255 #=============================================================================================================
|
|
256 #biomartdir = gsub(biomartdir, pattern="/$", replacement="")
|
|
257 #biomart_mapfilename = list.files(path = biomartdir, pattern = Organism);
|
|
258 #biomart_mapfile = paste(biomartdir,"/",biomart_mapfilename,sep="",collapse="");
|
|
259 #print(biomart_mapfile);
|
|
260 biomart_mapfile = args[9];
|
|
261 #=============================================================================================================
|
|
262 # Parse PE, GE, biomart file file
|
|
263 #=============================================================================================================
|
|
264 PE_df = as.data.frame(fread(input=PE_file, header=T, sep="\t"));
|
|
265 GE_df = as.data.frame(fread(input=GE_file, header=T, sep="\t"));
|
|
266 biomart_df = as.data.frame(fread(input=biomart_mapfile, header=T, sep="\t"));
|
|
267
|
|
268 #=============================================================================================================
|
|
269 # Create directory structures and then set the working directory to output directory
|
|
270 #=============================================================================================================
|
|
271 if(! file.exists(outdir))
|
|
272 {
|
|
273 dir.create(outdir);
|
|
274 }
|
|
275
|
|
276 #tempdir = paste(outdir,"/",imagesubdirprefix,sep="",collapse="");
|
|
277 #if(! file.exists(tempdir))
|
|
278 #{
|
|
279 # dir.create(tempdir);
|
|
280 #}
|
|
281
|
|
282 #tempdir = paste(outdir,"/",datasubdirprefix,sep="",collapse="");
|
|
283 #if(! file.exists(tempdir))
|
|
284 #{
|
|
285 # dir.create(tempdir);
|
|
286 #}
|
|
287
|
|
288 #setwd(outdir);
|
|
289 logfile = paste(outdir,"/", "Result.log",sep="",collapse="");
|
|
290
|
|
291 #=============================================================================================================
|
|
292 # Write initial data summary in html outfile
|
|
293 #=============================================================================================================
|
|
294 cat("<html><body>\n", file = htmloutfile);
|
|
295 cat("<h1>Association between proteomics and transcriptomics data</h1>\n",
|
|
296 "<font color='blue'><h3>Input data summary</h3></font>",
|
|
297 "<ul>",
|
|
298 "<li>Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)","</li>",
|
|
299 "<li>Input PE data dimension (Row Column): ", dim(PE_df),"</li>",
|
|
300 "<li>Input GE data dimension (Row Column): ", dim(GE_df),"</li>",
|
|
301 #"<li>Organism selected: ", Organism,"</li>",
|
|
302 "<li>Protein ID fetched from column: ", PE_idcolno,"</li>",
|
|
303 "<li>Transcript ID fetched from column: ", GE_idcolno,"</li>",
|
|
304 "<li>Protein ID type: ", PE_idtype,"</li>",
|
|
305 "<li>Transcript ID type: ", GE_idtype,"</li>",
|
|
306 "<li>Protein expression data fetched from column: ", PE_expcolno,"</li>",
|
|
307 "<li>Transcript expression data fetched from column: ", GE_expcolno,"</li>",
|
|
308 file = htmloutfile, append = TRUE);
|
|
309
|
|
310 #=============================================================================================================
|
|
311 # Write initial data summary in logfile
|
|
312 #=============================================================================================================
|
|
313 #cat("Current work dir:", outdir,"\n");
|
|
314 cat("Processing started\n---------------------\n", file=logfile);
|
|
315 cat("Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)\n", file=logfile, append=T);
|
|
316 cat("Input PE data dimension (Row Column): ", dim(PE_df),"\n", file=logfile, append=T)
|
|
317 cat("Input GE data dimension (Row Column): ", dim(GE_df),"\n", file=logfile, append=T)
|
|
318 #cat("Organism selected: ", Organism,"\n", file=logfile, append=T)
|
|
319 #cat("Biomart map file used: ", biomart_mapfilename,"\n", file=logfile, append=T)
|
|
320 cat("Ensembl Biomart mapping data dimension (Row Column): ", dim(biomart_df),"\n", file=logfile, append=T)
|
|
321 cat("\n\nProtein ID to Transcript ID mapping started\n----------------\n", file=logfile, append=T)
|
|
322 cat("Protein ID fetched from column:", PE_idcolno,"\n", file=logfile, append=T)
|
|
323 cat("Transcript ID fetched from column:", GE_idcolno, "\n",file=logfile, append=T)
|
|
324 cat("Protein ID type:", PE_idtype, "\n",file=logfile, append=T)
|
|
325 cat("Transcript ID type:", GE_idtype,"\n", file=logfile, append=T);
|
|
326 cat("Protein expression data fetched from column:", PE_expcolno,"\n", file=logfile, append=T)
|
|
327 cat("Transcript expression data fetched from column:", GE_expcolno, "\n",file=logfile, append=T)
|
|
328
|
|
329 #=============================================================================================================
|
|
330 # Mapping starts here
|
|
331 # Pseudocode
|
|
332 # Loop over each row of PE file, fetch protein_id
|
|
333 # Search the biomartmap file and obtain corresponding transcript_id
|
|
334 # Take the mapped transcript_id and search in GE file, which row it correspond
|
|
335 # Store the PE rowno and GE rowno in rowpair object
|
|
336 #=============================================================================================================
|
|
337 rowpair = data.frame(PE_rowno = 0, GE_rowno = 0);
|
|
338 cat("Total rows:", nrow(PE_df),"\n");
|
|
339 cat("\n\nTotal protein ids to be mapped: ", nrow(PE_df),"\n", file=logfile, append=T);
|
|
340 messagelog = "\n\nBelow are protein IDs, for which no match is observed in Ensembl Biomart Map file.\n\n";
|
|
341
|
|
342 # GE_id column
|
|
343 GE_ids = GE_df[,GE_idcolno];
|
|
344 GE_ids = gsub(x=GE_ids, pattern=".\\d+$", replacement=""); # Remove version
|
|
345
|
|
346 # Loop over every row of PE data (PE_df)
|
|
347 for(i in 1:nrow(PE_df))
|
|
348 {
|
|
349
|
|
350 if(i%%100 ==0)
|
|
351 {
|
|
352 cat("Total rows processed ", i,"\n", file=logfile, append=T);
|
|
353 print(i);
|
|
354 }
|
|
355
|
|
356 queryid = PE_df[i,PE_idcolno];
|
|
357 #queryid = gsub(x=queryid, pattern=".\\d+$", replacement=""); # Remove version
|
|
358 #print(queryid);
|
|
359
|
|
360 PE_df_matchrowno = i; # Row number in PE_df which matches queryid
|
|
361
|
|
362 if(PE_idtype == "Uniprot")
|
|
363 {
|
|
364 biomart_matchrowno = which(biomart_df[,8] == queryid | biomart_df[,9] == queryid); # Row number of biomart_df which matches queryid
|
|
365 }else{
|
|
366 biomart_matchrowno = which(biomart_df[,PE_idtype] == queryid); # Row number of biomart_df which matches queryid
|
|
367 }
|
|
368
|
|
369 # If match found, map protein id to GE id and find corresponding match row number of GE_df
|
|
370 if(length(biomart_matchrowno)>0)
|
|
371 {
|
|
372 GE_df_matchrowno = which(GE_ids %in% biomart_df[biomart_matchrowno[1],GE_idtype]);
|
|
373 rowpair = rbind( rowpair, c(PE_df_matchrowno, GE_df_matchrowno));
|
|
374 if(length(GE_df_matchrowno) > 1)
|
|
375 {
|
|
376 cat("\nFor protein ID ", i," multiple transcript mapping found\n", file=logfile, append=T);
|
|
377
|
|
378 cat(
|
|
379 "<br><font color=",'red',">For protein ID", i," multiple transcript mapping found</font><br>",file = htmloutfile, append = TRUE);
|
|
380 }
|
|
381 }else{
|
|
382 messagelog = paste(messagelog, queryid, "\n",sep="", collapse="");
|
|
383 }
|
|
384 }
|
|
385 rowpair = rowpair[-1,];
|
|
386
|
|
387 #=============================================================================================================
|
|
388 # Write mapping summary, mapped and unmapped data
|
|
389 #=============================================================================================================
|
|
390 cat(
|
|
391 "<li>Total Protein ID mapped: ", length(intersect(1:nrow(PE_df), rowpair[,1])),"</li>",
|
|
392 "<li>Total Protein ID unmapped: ", length(setdiff(1:nrow(PE_df), rowpair[,1])),"</li>",
|
|
393 "<li>Total Transcript ID mapped: ", length(intersect(1:nrow(GE_df), rowpair[,2])),"</li>",
|
|
394 "<li>Total Transcript ID unmapped: ", length(setdiff(1:nrow(GE_df), rowpair[,2])),"</li></ul>",
|
|
395 file = htmloutfile, append = TRUE);
|
|
396
|
|
397 cat("\n\nMapping Statistics\n---------------------\n", file=logfile, append=T);
|
|
398 cat("Total Protein ID mapped:", length(intersect(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T)
|
|
399 cat("Total Protein ID unmapped:", length(setdiff(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T)
|
|
400 cat("Total Transcript ID mapped:", length(intersect(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T)
|
|
401 cat("Total Transcript ID unmapped:", length(setdiff(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T)
|
|
402
|
|
403 cat(messagelog,"\n",file=logfile, append=T);
|
|
404
|
|
405 if(writeMapUnmap)
|
|
406 {
|
|
407 write.table(PE_df[rowpair[,1], ], file=paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
408 write.table(GE_df[rowpair[,2], ], file= paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
409 write.table(PE_df[-rowpair[,1], ], file= paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
410 write.table(GE_df[-rowpair[,2], ], file=paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n");
|
|
411
|
|
412 cat(
|
|
413 "<font color='blue'><h3>Download mapped unmapped data</h3></font>",
|
5
|
414 "<ul><li>Protein mapped data: ", '<a href="',paste(PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
|
415 "<li>Protein unmapped data: ", '<a href="',paste(PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
|
416 "<li>Transcript mapped data: ", '<a href="',paste(GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
|
417 "<li>Transcript unmapped data: ", '<a href="',paste(GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
0
|
418 file = htmloutfile, append = TRUE);
|
|
419 }
|
|
420
|
|
421 write.table(PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)], file=paste(outdir,"/",PE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
422 write.table(GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)], file=paste(outdir,"/",GE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
423
|
|
424 cat(
|
5
|
425 "<li>Protein abundance data: ", '<a href="',paste(PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
|
426 "<li>Transcript abundance data: ", '<a href="',paste(GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
|
0
|
427 file = htmloutfile, append = TRUE);
|
|
428
|
|
429 #==========================================================================================
|
|
430 # Analysis (correlation and regression) starts here
|
|
431 #==========================================================================================
|
|
432 cat("Analysis started\n---------------------------\n",file=logfile, append=T);
|
|
433 proteome_df = PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)];
|
|
434 transcriptome_df = GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)];
|
|
435 nPE = nrow(proteome_df);
|
|
436 nGE = nrow(transcriptome_df)
|
|
437
|
|
438 cat("Total Protein ID: ",nPE,"\n",file=logfile, append=T);
|
|
439 cat("Total Transcript ID: ",nGE,"\n",file=logfile, append=T);
|
|
440
|
|
441 #==========================================================================================
|
|
442 # Data summary
|
|
443 #==========================================================================================
|
|
444 cat(
|
|
445 "<ul>",
|
|
446 "<li>Number of entries in Transcriptome data used for correlation: ",nPE,"</li>",
|
|
447 "<li>Number of entries in Proteome data used for correlation: ",nGE,"</li></ul>",
|
|
448 file = htmloutfile, append = TRUE);
|
|
449
|
|
450 #=============================================================================================================
|
|
451 # Remove entries with NA or Inf or -Inf in Transcriptome and Proteome data which will create problem in correlation analysis
|
|
452 #=============================================================================================================
|
|
453 totna = sum(is.na(transcriptome_df[,2]) | is.na(proteome_df[,2]));
|
|
454 totinf = sum(is.infinite(transcriptome_df[,2]) | is.infinite(proteome_df[,2]));
|
|
455
|
|
456 cat("<font color='blue'><h3>Filtering</h3></font>","Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry<br>","<ul>","<li>Number of NA found: ",totna,"</li>","<li>Number of Inf or -Inf found: ",totinf,"</li></ul>",file = htmloutfile, append = TRUE);
|
|
457
|
|
458 cat("Total NA observed in either Transcriptome or Proteome data: ",totna,"\n",file=logfile, append=T);
|
|
459 cat("Total Inf or -Inf observed in either Transcriptome or Proteome data: ",totinf,"\n",file=logfile, append=T);
|
|
460
|
|
461 if(totna > 0 | totinf > 0)
|
|
462 {
|
|
463 excludeIndices_PE = which(is.na(proteome_df[,2]) | is.infinite(proteome_df[,2]));
|
|
464 excludeIndices_GE = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2]));
|
|
465 excludeIndices = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2]) | is.na(proteome_df[,2]) | is.infinite(proteome_df[,2]));
|
|
466
|
|
467 # Write excluded transcriptomics and proteomics data to file
|
|
468 write.table(proteome_df[excludeIndices_PE,], file=paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
469 write.table(transcriptome_df[excludeIndices_GE,], file=paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
|
|
470
|
|
471 # Write excluded transcriptomics and proteomics data link to html file
|
|
472 cat(
|
5
|
473 "<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
|
|
474 "<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
|
0
|
475 file = htmloutfile, append = TRUE);
|
|
476
|
|
477 # Keep the unexcluded entries only
|
|
478 transcriptome_df = transcriptome_df[-excludeIndices,];
|
|
479 proteome_df = proteome_df[-excludeIndices,];
|
|
480 nPE = nrow(proteome_df);
|
|
481 nGE = nrow(transcriptome_df)
|
|
482
|
|
483 cat("<font color='blue'><h3>Filtered data summary</h3></font>",
|
|
484 "Excluding entires with abundance values: NA/Inf/-Inf<br>",
|
|
485 "<ul>",
|
|
486 "<li>Number of entries in Transcriptome data remained: ",nrow(transcriptome_df),"</li>",
|
|
487 "<li>Number of entries in Proteome data remained: ",nrow(proteome_df),"</li></ul>", file = htmloutfile, append = TRUE);
|
|
488
|
|
489 cat("Excluding entires with abundance values: NA/Inf/-Inf","\n",file=logfile, append=T);
|
|
490
|
|
491 cat("Total Protein ID after filtering: ",nPE,"\n",file=logfile, append=T);
|
|
492 cat("Total Transcript ID after filtering: ",nGE,"\n",file=logfile, append=T);
|
|
493
|
|
494 }
|
|
495
|
|
496 #==========================================================================================
|
|
497 # Scaling of data
|
|
498 #==========================================================================================
|
|
499 if(doscale)
|
|
500 {
|
|
501 proteome_df[,2] = scale(proteome_df[,2], center = TRUE, scale = TRUE);
|
|
502 transcriptome_df[,2] = scale(transcriptome_df[,2], center = TRUE, scale = TRUE);
|
|
503 }
|
|
504
|
|
505 #=============================================================================================================
|
|
506 # Proteome and Transcriptome data summary
|
|
507 #=============================================================================================================
|
|
508 cat("Calculating summary of PE and GE data\n",file=logfile, append=T);
|
|
509 s1 = summary(proteome_df[,2]);
|
|
510 s2 = summary(transcriptome_df[,2])
|
|
511
|
|
512 cat(
|
|
513 "<font color='blue'><h3>Proteome data summary</h3></font>\n",
|
|
514 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
|
|
515 file = htmloutfile, append = TRUE);
|
|
516
|
|
517 for(i in 1:length(s1))
|
|
518 {
|
|
519 cat("<tr><td>",names(s1[i]),"</td><td>", s1[i],"</td></tr>\n", file = htmloutfile, append = TRUE)
|
|
520 }
|
|
521 cat("</table>\n", file = htmloutfile, append = TRUE)
|
|
522
|
|
523 cat(
|
|
524 "<font color='blue'><h3>Transcriptome data summary</h3></font>\n",
|
|
525 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
|
|
526 file = htmloutfile, append = TRUE);
|
|
527
|
|
528 for(i in 1:length(s2))
|
|
529 {
|
|
530 cat("<tr><td>",names(s2[i]),"</td><td>", s2[i],"</td></tr>\n", file = htmloutfile, append = TRUE)
|
|
531 }
|
|
532 cat("</table>\n", file = htmloutfile, append = TRUE)
|
|
533
|
|
534 #=============================================================================================================
|
|
535 # Distribution of proteome and transcriptome abundance (Box and Density plot)
|
|
536 #=============================================================================================================
|
|
537 cat("Generating Box and Density plot\n",file=logfile, append=T);
|
5
|
538 outplot = paste(outdir,"/AbundancePlot.png",sep="",collapse="");
|
6
|
539 #png(outplot);
|
|
540 bitmap(outplot, "png16m");
|
0
|
541 par(mfrow=c(2,2));
|
|
542 boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5);
|
|
543 plot(density(proteome_df[,2]), xlab="Protein Abundance", ylab="Density", main="Proteome abundance", cex.lab=1.5);
|
|
544 boxplot(transcriptome_df[,2], ylab="Abundance", main="Transcriptome abundance", cex.lab=1.5);
|
|
545 plot(density(transcriptome_df[,2]), xlab="Transcript Abundance", ylab="Density", main="Transcriptome abundance", cex.lab=1.5);
|
|
546 dev.off();
|
|
547
|
|
548 cat(
|
|
549 "<font color='blue'><h3>Distribution of Proteome and Transcripome abundance (Box plot and Density plot)</h3></font>\n",
|
|
550 '<img src="AbundancePlot.png">',
|
|
551 file = htmloutfile, append = TRUE);
|
|
552
|
|
553 #=============================================================================================================
|
|
554 # Scatter plot
|
|
555 #=============================================================================================================
|
|
556 cat("Generating scatter plot\n",file=logfile, append=T);
|
5
|
557 outplot = paste(outdir,"/AbundancePlot_scatter.png",sep="",collapse="");
|
6
|
558 #png(outplot);
|
|
559 bitmap(outplot,"png16m")
|
0
|
560 par(mfrow=c(1,1));
|
|
561 scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
|
|
562
|
|
563 cat(
|
|
564 "<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>\n",
|
|
565 '<img src="AbundancePlot_scatter.png">',
|
|
566 file = htmloutfile, append = TRUE);
|
|
567
|
|
568 #=============================================================================================================
|
|
569 # Correlation testing
|
|
570 #=============================================================================================================
|
|
571 cat("Estimating correlation\n",file=logfile, append=T);
|
|
572 cor_result_pearson = cor.test(transcriptome_df[,2], proteome_df[,2], method = "pearson");
|
|
573 cor_result_spearman = cor.test(transcriptome_df[,2], proteome_df[,2], method = "spearman");
|
|
574 cor_result_kendall = cor.test(transcriptome_df[,2], proteome_df[,2], method = "kendall");
|
|
575
|
|
576 cat(
|
|
577 "<font color='blue'><h3>Correlation with all data</h3></font>\n",
|
|
578 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>',
|
|
579 file = htmloutfile, append = TRUE);
|
|
580
|
|
581 cat(
|
|
582 "<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>",
|
|
583 "<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>",
|
|
584 "<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>",
|
|
585 file = htmloutfile, append = TRUE)
|
|
586 cat("</table>\n", file = htmloutfile, append = TRUE)
|
|
587
|
|
588 cat(
|
|
589 '<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>cook\'s distance based approach</u> to identify such influential observations.</font>',
|
|
590 file = htmloutfile, append = TRUE);
|
|
591
|
|
592 #=============================================================================================================
|
|
593 # Linear Regression
|
|
594 #=============================================================================================================
|
|
595 cat("Fitting linear regression model\n",file=logfile, append=T);
|
|
596 PE_GE_data = proteome_df;
|
|
597 PE_GE_data = cbind(PE_GE_data, transcriptome_df);
|
|
598 colnames(PE_GE_data) = c("PE_ID","PE_abundance","GE_ID","GE_abundance");
|
|
599
|
|
600 regmodel = lm(PE_abundance~GE_abundance, data=PE_GE_data);
|
|
601 regmodel_summary = summary(regmodel);
|
|
602
|
|
603 cat(
|
|
604 "<font color='blue'><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>\n",
|
|
605 "<p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>\n",
|
|
606 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
|
|
607 file = htmloutfile, append = TRUE);
|
|
608
|
|
609 cat(
|
|
610 "<tr><td>Formula</td><td>","PE_abundance~GE_abundance","</td></tr>\n",
|
|
611 "<tr><td colspan='2' align='center'> <b>Coefficients</b></td>","</tr>\n",
|
|
612 "<tr><td>",names(regmodel$coefficients[1]),"</td><td>",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","</td></tr>\n",
|
|
613 "<tr><td>",names(regmodel$coefficients[2]),"</td><td>",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","</td></tr>\n",
|
|
614 "<tr><td colspan='2' align='center'> <b>Model parameters</b></td>","</tr>\n",
|
|
615 "<tr><td>Residual standard error</td><td>",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)</td></tr>\n",
|
|
616 "<tr><td>F-statistic</td><td>",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and ",regmodel_summary$fstatistic[3]," degree of freedom)</td></tr>\n",
|
|
617 "<tr><td>R-squared</td><td>",regmodel_summary$r.squared,"</td></tr>\n",
|
|
618 "<tr><td>Adjusted R-squared</td><td>",regmodel_summary$adj.r.squared,"</td></tr>\n",
|
|
619 file = htmloutfile, append = TRUE)
|
|
620 cat("</table>\n", file = htmloutfile, append = TRUE)
|
|
621
|
|
622 #=============================================================================================================
|
|
623 # Plotting various regression diagnostics plots
|
|
624 #=============================================================================================================
|
5
|
625 outplot1 = paste(outdir,"/PE_GE_modelfit.pdf",sep="",collapse="");
|
0
|
626 pdf(outplot1);
|
|
627 devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1
|
|
628 print(.Devices)
|
|
629 print(c(devnum,"+++"));
|
|
630
|
|
631 regmodel_predictedy = predict(regmodel, PE_GE_data);
|
|
632 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with all data");
|
|
633 points(PE_GE_data[,"GE_abundance"], regmodel_predictedy, col="red");
|
|
634
|
|
635 cat("Generating regression diagnostics plot\n",file=logfile, append=T);
|
|
636 cat(
|
|
637 "<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>\n",
|
|
638 file = htmloutfile, append = TRUE);
|
|
639
|
5
|
640 outplot = paste(outdir,"/PE_GE_lm_1.png",sep="",collapse="");
|
9
|
641 #png(outplot);
|
|
642 bitmap(outplot,"png16m");
|
0
|
643 par(mfrow=c(1,1));
|
|
644 plot(regmodel, 1, cex.lab=1.5);
|
|
645 dev.off();
|
|
646
|
5
|
647 outplot = paste(outdir,"/PE_GE_lm_2.png",sep="",collapse="");
|
9
|
648 #png(outplot);
|
|
649 bitmap(outplot,"png16m");
|
0
|
650 par(mfrow=c(1,1));
|
|
651 plot(regmodel, 2, cex.lab=1.5);
|
|
652 dev.off();
|
|
653
|
5
|
654 outplot = paste(outdir,"/PE_GE_lm_3.png",sep="",collapse="");
|
9
|
655 #png(outplot);
|
|
656 bitmap(outplot,"png16m");
|
0
|
657 par(mfrow=c(1,1));
|
|
658 plot(regmodel, 3, cex.lab=1.5);
|
|
659 dev.off();
|
|
660
|
5
|
661 outplot = paste(outdir,"/PE_GE_lm_5.png",sep="",collapse="");
|
9
|
662 #png(outplot);
|
|
663 bitmap(outplot,"png16m");
|
0
|
664 par(mfrow=c(1,1));
|
|
665 plot(regmodel, 5, cex.lab=1.5);
|
|
666 dev.off();
|
|
667
|
5
|
668 outplot = paste(outdir,"/PE_GE_lm.pdf",sep="",collapse="");
|
0
|
669 pdf(outplot);
|
|
670 plot(regmodel);
|
|
671 dev.off();
|
|
672 regmodel_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel$fitted.values)
|
|
673
|
|
674
|
|
675 cat(
|
|
676 "<u><font color='brown'><h4>Residuals vs Fitted plot</h4></font></u>\n",
|
|
677 '<img src="PE_GE_lm_1.png">',
|
|
678 '<br><br>This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship<br>',
|
|
679 file = htmloutfile, append = TRUE);
|
|
680
|
|
681 cat(
|
|
682 "<u><font color='brown'><h4>Normal Q-Q plot of residuals</h4></font></u>\n",
|
|
683 '<img src="PE_GE_lm_2.png">',
|
|
684 '<br><br>This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.<br>',
|
|
685 file = htmloutfile, append = TRUE);
|
|
686
|
|
687 cat(
|
|
688 "<u><font color='brown'><h4>Scale-Location (or Spread-Location) plot</h4></font></u>\n",
|
|
689 '<img src="PE_GE_lm_3.png">',
|
|
690 '<br><br>This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.<br>',
|
|
691 file = htmloutfile, append = TRUE);
|
|
692
|
|
693 cat(
|
|
694 "<u><font color='brown'><h4>Residuals vs Leverage plot</h4></font></u>\n",
|
|
695 '<img src="PE_GE_lm_3.png">',
|
|
696 '<br><br>This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.<br>',
|
|
697 file = htmloutfile, append = TRUE);
|
|
698
|
|
699 #=============================================================================================================
|
|
700 # Identification of influential observations
|
|
701 #=============================================================================================================
|
|
702 cat("Identifying influential observations\n",file=logfile, append=T);
|
|
703 cat(
|
|
704 "<font color='blue'><h3>Identify influential observations</h3></font>\n",
|
|
705 file = htmloutfile, append = TRUE);
|
|
706 cat(
|
|
707 '<p><b>Cook’s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>cook’s distance > than 4 times the mean</b> may be classified as <b>influential.</b></p>',
|
|
708 file = htmloutfile, append = TRUE);
|
|
709
|
|
710 cooksd <- cooks.distance(regmodel);
|
|
711
|
|
712 cat("Generating cooksd plot\n",file=logfile, append=T);
|
5
|
713 outplot = paste(outdir,"/PE_GE_lm_cooksd.png",sep="",collapse="");
|
9
|
714 #png(outplot);
|
|
715 bitmap(outplot,"png16m");
|
0
|
716 par(mfrow=c(1,1));
|
|
717 plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations") # plot cooks distance
|
|
718 abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line
|
|
719 #text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2) # add labels
|
|
720 dev.off();
|
|
721
|
|
722 cat(
|
|
723 '<img src="PE_GE_lm_cooksd.png">',
|
|
724 '<br>In the above plot, observations above red line (4*mean cook\'s distance) are influential, marked in <b>*</b>. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br>',
|
|
725 file = htmloutfile, append = TRUE);
|
|
726
|
|
727 tempind = which(cooksd>4*mean(cooksd, na.rm=T));
|
|
728 PE_GE_data_no_outlier = PE_GE_data[-tempind,];
|
|
729 PE_GE_data_no_outlier$cooksd = cooksd[-tempind]
|
|
730 PE_GE_data_outlier = PE_GE_data[tempind,];
|
|
731 PE_GE_data_outlier$cooksd = cooksd[tempind]
|
|
732
|
|
733 cat(
|
|
734 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
|
|
735 file = htmloutfile, append = TRUE);
|
|
736
|
|
737 cat(
|
|
738 "<tr><td>Mean cook\'s distance</td><td>",mean(cooksd, na.rm=T),"</td></tr>\n",
|
|
739 "<tr><td>Total influential observations (cook\'s distance > 4 * mean cook\'s distance)</td><td>",length(tempind),"</td></tr>\n",
|
|
740 "<tr><td>Total influential observations (cook\'s distance > 3 * mean cook\'s distance)</td><td>",length(which(cooksd>3*mean(cooksd, na.rm=T))),"</td></tr>\n",
|
|
741 "</table>",
|
|
742 '<font color="brown"><h4>Top 10 influential observations (cook\'s distance > 4 * mean cook\'s distance)</h4></font>',
|
|
743 file = htmloutfile, append = TRUE);
|
|
744
|
|
745 cat("Writing influential observations\n",file=logfile, append=T);
|
|
746
|
5
|
747 outdatafile = paste(outdir,"/PE_GE_influential_observation.tsv", sep="", collapse="");
|
0
|
748 cat('<a href="',outdatafile, '" target="_blank">Download entire list</a>',file = htmloutfile, append = TRUE);
|
|
749 write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F);
|
|
750
|
|
751 cat(
|
|
752 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>PE_ID</th><th>PE_abundance</th><th>GE_ID</th><th>GE_abundance</th><th>cooksd</th></tr>',
|
|
753 file = htmloutfile, append = TRUE);
|
|
754
|
|
755 for(i in 1:10)
|
|
756 {
|
|
757 cat(
|
|
758 '<tr>','<td>',PE_GE_data_outlier[i,1],'</td>',
|
|
759 '<td>',PE_GE_data_outlier[i,2],'</td>',
|
|
760 '<td>',PE_GE_data_outlier[i,3],'</td>',
|
|
761 '<td>',PE_GE_data_outlier[i,4],'</td>',
|
|
762 '<td>',PE_GE_data_outlier[i,5],'</td></tr>',
|
|
763 file = htmloutfile, append = TRUE);
|
|
764 }
|
|
765 cat('</table>',file = htmloutfile, append = TRUE);
|
|
766
|
|
767 #=============================================================================================================
|
|
768 # Correlation with removal of outliers
|
|
769 #=============================================================================================================
|
|
770
|
|
771 #=============================================================================================================
|
|
772 # Scatter plot
|
|
773 #=============================================================================================================
|
5
|
774 outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse="");
|
9
|
775 #png(outplot);
|
|
776 bitmap(outplot,"png16m");
|
0
|
777 par(mfrow=c(1,1));
|
|
778 scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
|
|
779
|
|
780 cat(
|
|
781 "<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations</h3></font>\n",
|
|
782 '<img src="AbundancePlot_scatter_without_outliers.png">',
|
|
783 file = htmloutfile, append = TRUE);
|
|
784
|
|
785 #=============================================================================================================
|
|
786 # Correlation
|
|
787 #=============================================================================================================
|
|
788 cat("Estimating orrelation with removal of outliers \n",file=logfile, append=T);
|
|
789 cat(
|
|
790 "<font color='blue'><h3>Correlation with removal of outliers / influential observations</h3></font>\n",
|
|
791 '<p>We removed the influential observations and reestimated the correlation values.</p>',
|
|
792 file = htmloutfile, append = TRUE);
|
|
793
|
|
794 cor_result_pearson = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "pearson");
|
|
795 cor_result_spearman = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "spearman");
|
|
796 cor_result_kendall = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "kendall");
|
|
797
|
|
798 cat(
|
|
799 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>',
|
|
800 file = htmloutfile, append = TRUE);
|
|
801
|
|
802 cat(
|
|
803 "<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>",
|
|
804 "<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>",
|
|
805 "<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>",
|
|
806 file = htmloutfile, append = TRUE)
|
|
807 cat("</table>\n", file = htmloutfile, append = TRUE)
|
|
808
|
|
809 #=============================================================================================================
|
|
810 # Heatmap
|
|
811 #=============================================================================================================
|
|
812 cat(
|
|
813 "<font color='blue'><h3>Heatmap of PE and GE abundance values</h3></font>\n",
|
|
814 file = htmloutfile, append = TRUE);
|
|
815
|
|
816 cat("Generating heatmap plot\n",file=logfile, append=T);
|
5
|
817 outplot = paste(outdir,"/PE_GE_heatmap.png",sep="",collapse="");
|
9
|
818 #png(outplot);
|
|
819 bitmap(outplot,"png16m");
|
0
|
820 par(mfrow=c(1,1));
|
5
|
821 #heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col");
|
|
822 my_palette <- colorRampPalette(c("green", "white", "red"))(299);
|
|
823 heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=my_palette ,Colv=F, labCol=c("PE","GE"), scale="col", dendrogram = "row");
|
0
|
824 dev.off();
|
|
825
|
|
826 cat(
|
|
827 '<img src="PE_GE_heatmap.png">',
|
|
828 file = htmloutfile, append = TRUE);
|
|
829
|
|
830 #=============================================================================================================
|
|
831 # kmeans clustering
|
|
832 #=============================================================================================================
|
|
833
|
|
834 PE_GE_data_kdata = PE_GE_data;
|
|
835
|
|
836
|
|
837 k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5);
|
|
838 cat("Generating kmeans plot\n",file=logfile, append=T);
|
5
|
839 outplot = paste(outdir,"/PE_GE_kmeans.png",sep="",collapse="");
|
9
|
840 #png(outplot);
|
|
841 bitmap(outplot,"png16m");
|
0
|
842 par(mfrow=c(1,1));
|
|
843 scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
|
|
844
|
|
845 ind=which(k1$cluster==1);
|
|
846 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="red", pch=16);
|
|
847 ind=which(k1$cluster==2);
|
|
848 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="green", pch=16);
|
|
849 ind=which(k1$cluster==3);
|
|
850 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="blue", pch=16);
|
|
851 ind=which(k1$cluster==4);
|
|
852 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16);
|
|
853 ind=which(k1$cluster==5);
|
|
854 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="black", pch=16);
|
|
855 dev.off();
|
|
856
|
|
857 cat(
|
|
858 "<font color='blue'><h3>Kmean clustering</h3></font>\nNumber of Clusters: 5<br>",
|
|
859 file = htmloutfile, append = TRUE);
|
|
860
|
|
861 tempind = order(k1$cluster);
|
|
862 tempoutfile = paste(outdir,"/","PE_GE_kmeans_clusterpoints.txt",sep="",collapse="");
|
|
863
|
|
864 write.table(data.frame(PE_GE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n")
|
|
865
|
|
866 cat('<a href="',tempoutfile, '" target="_blank">Download cluster list</a><br>',file = htmloutfile, append = TRUE);
|
|
867
|
|
868 cat(
|
|
869 '<img src="PE_GE_kmeans.png">',
|
|
870 file = htmloutfile, append = TRUE);
|
|
871
|
|
872 #=============================================================================================================
|
|
873 # Other Regression fit
|
|
874 #=============================================================================================================
|
|
875 dev.set(devnum);
|
|
876 # Linear regression with removal of outliers
|
|
877 regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier);
|
|
878 regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier);
|
5
|
879 outplot = paste(outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse="");
|
0
|
880 plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers");
|
|
881 points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red");
|
|
882
|
|
883 pdf(outplot);
|
|
884 plot(regmodel_no_outlier);
|
|
885 dev.off();
|
|
886 regmodel_no_outlier_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_no_outlier$fitted.values)
|
|
887
|
|
888 # Resistant regression (lqs / least trimmed squares method)
|
|
889 regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data);
|
|
890 regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data);
|
5
|
891 outplot = paste(outdir,"/PE_GE_lqs.pdf",sep="",collapse="");
|
0
|
892 pdf(outplot);
|
|
893 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)");
|
|
894 points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red");
|
|
895 #plot(regmodel_lqs);
|
|
896 dev.off();
|
|
897 dev.set(devnum);
|
|
898 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)");
|
|
899 points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red");
|
|
900 regmodel_lqs_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_lqs$fitted.values)
|
|
901
|
|
902 # Robust regression (rlm / Huber M-estimator method)
|
|
903 regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data);
|
|
904 regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data);
|
5
|
905 outplot = paste(outdir,"/PE_GE_rlm.pdf",sep="",collapse="");
|
0
|
906 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm / Huber M-estimator method)");
|
|
907 points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red");
|
|
908 pdf(outplot);
|
|
909 plot(regmodel_rlm);
|
|
910 dev.off();
|
|
911 regmodel_rlm_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_rlm$fitted.values)
|
|
912
|
|
913 # polynomical reg
|
|
914 regmodel_poly2 = lm(PE_abundance ~ poly(GE_abundance, 2, raw = TRUE), data = PE_GE_data)
|
|
915 regmodel_poly3 = lm(PE_abundance ~ poly(GE_abundance, 3, raw = TRUE), data = PE_GE_data)
|
|
916 regmodel_poly4 = lm(PE_abundance ~ poly(GE_abundance, 4, raw = TRUE), data = PE_GE_data)
|
|
917 regmodel_poly5 = lm(PE_abundance ~ poly(GE_abundance, 5, raw = TRUE), data = PE_GE_data)
|
|
918 regmodel_poly6 = lm(PE_abundance ~ poly(GE_abundance, 6, raw = TRUE), data = PE_GE_data)
|
|
919 regmodel_poly2_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly2$fitted.values)
|
|
920 regmodel_poly3_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly3$fitted.values)
|
|
921 regmodel_poly4_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly4$fitted.values)
|
|
922 regmodel_poly5_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly5$fitted.values)
|
|
923 regmodel_poly6_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly6$fitted.values)
|
|
924 regmodel_poly2_predictedy = predict(regmodel_poly2, PE_GE_data);
|
|
925 regmodel_poly3_predictedy = predict(regmodel_poly3, PE_GE_data);
|
|
926 regmodel_poly4_predictedy = predict(regmodel_poly4, PE_GE_data);
|
|
927 regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data);
|
|
928 regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data);
|
|
929
|
5
|
930 outplot = paste(outdir,"/PE_GE_poly2.pdf",sep="",collapse="");
|
0
|
931 dev.set(devnum);
|
|
932 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2");
|
|
933 points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red");
|
|
934 pdf(outplot);
|
|
935 plot(regmodel_poly2);
|
|
936 dev.off();
|
|
937
|
5
|
938 outplot = paste(outdir,"/PE_GE_poly3.pdf",sep="",collapse="");
|
0
|
939 dev.set(devnum);
|
|
940 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3");
|
|
941 points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red");
|
|
942 pdf(outplot);
|
|
943 plot(regmodel_poly3);
|
|
944 dev.off();
|
|
945
|
5
|
946 outplot = paste(outdir,"/PE_GE_poly4.pdf",sep="",collapse="");
|
0
|
947 dev.set(devnum);
|
|
948 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4");
|
|
949 points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red");
|
|
950 pdf(outplot);
|
|
951 plot(regmodel_poly4);
|
|
952 dev.off();
|
|
953
|
5
|
954 outplot = paste(outdir,"/PE_GE_poly5.pdf",sep="",collapse="");
|
0
|
955 dev.set(devnum);
|
|
956 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5");
|
|
957 points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red");
|
|
958 pdf(outplot);
|
|
959 plot(regmodel_poly5);
|
|
960 dev.off();
|
|
961
|
5
|
962 outplot = paste(outdir,"/PE_GE_poly6.pdf",sep="",collapse="");
|
0
|
963 dev.set(devnum);
|
|
964 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6");
|
|
965 points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red");
|
|
966 pdf(outplot);
|
|
967 plot(regmodel_poly6);
|
|
968 dev.off();
|
|
969
|
|
970 # GAM Generalized additive models
|
|
971 regmodel_gam <- gam(PE_abundance ~ s(GE_abundance), data = PE_GE_data)
|
|
972 regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data);
|
|
973
|
|
974 regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values)
|
5
|
975 outplot = paste(outdir,"/PE_GE_gam.pdf",sep="",collapse="");
|
0
|
976 dev.set(devnum);
|
|
977 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models");
|
|
978 points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red");
|
|
979 pdf(outplot);
|
|
980 plot(regmodel_gam,pages=1,residuals=TRUE); ## show partial residuals
|
|
981 plot(regmodel_gam,pages=1,seWithMean=TRUE) ## `with intercept' CIs
|
|
982 dev.off();
|
|
983 dev.off(devnum);
|
|
984
|
|
985 cat(
|
|
986 "<font color='blue'><h3>Other regression model fitting</h3></font>\n",
|
|
987 file = htmloutfile, append = TRUE);
|
|
988
|
|
989 cat(
|
|
990 "<ul>
|
|
991 <li>MAE:mean absolute error</li>
|
|
992 <li>MSE: mean squared error</li>
|
|
993 <li>RMSE:root mean squared error ( sqrt(MSE) )</li>
|
|
994 <li>MAPE:mean absolute percentage error</li>
|
|
995 </ul>
|
|
996 ",
|
|
997 file = htmloutfile, append = TRUE);
|
|
998
|
|
999 cat(
|
|
1000 '<h4><a href="PE_GE_modelfit.pdf" target="_blank">Comparison of model fits</a></h4>',
|
|
1001 file = htmloutfile, append = TRUE);
|
|
1002
|
|
1003 cat(
|
|
1004 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Model</th><th>MAE</th><th>MSE</th><th>RMSE</th><th>MAPE</th><th>Diagnostics Plot</th></tr>',
|
|
1005 file = htmloutfile, append = TRUE);
|
|
1006
|
|
1007 cat(
|
|
1008 "<tr><td>Linear regression with all data</td><td>",regmodel_metrics[1],"</td><td>",regmodel_metrics[2],"</td><td>",regmodel_metrics[3],"</td><td>",regmodel_metrics[4],"</td><td>",'<a href="PE_GE_lm.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1009
|
|
1010 "<tr><td>Linear regression with removal of outliers</td><td>",regmodel_no_outlier_metrics[1],"</td><td>",regmodel_no_outlier_metrics[2],"</td><td>",regmodel_no_outlier_metrics[3],"</td><td>",regmodel_no_outlier_metrics[4],"</td><td>",'<a href="PE_GE_lm_without_outliers.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1011
|
|
1012 "<tr><td>Resistant regression (lqs / least trimmed squares method)</td><td>",regmodel_lqs_metrics[1],"</td><td>",regmodel_lqs_metrics[2],"</td><td>",regmodel_lqs_metrics[3],"</td><td>",regmodel_lqs_metrics[4],"</td><td>", '<a href="PE_GE_lqs.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1013
|
|
1014 "<tr><td>Robust regression (rlm / Huber M-estimator method)</td><td>",regmodel_rlm_metrics[1],"</td><td>",regmodel_rlm_metrics[2],"</td><td>",regmodel_rlm_metrics[3],"</td><td>",regmodel_rlm_metrics[4],"</td><td>",'<a href="PE_GE_rlm.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1015
|
|
1016
|
|
1017 "<tr><td>Polynomial regression with degree 2</td><td>",regmodel_poly2_metrics[1],"</td><td>",regmodel_poly2_metrics[2],"</td><td>",regmodel_poly2_metrics[3],"</td><td>",regmodel_poly2_metrics[4],"</td><td>",'<a href="PE_GE_poly2.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1018
|
|
1019 "<tr><td>Polynomial regression with degree 3</td><td>",regmodel_poly3_metrics[1],"</td><td>",regmodel_poly3_metrics[2],"</td><td>",regmodel_poly3_metrics[3],"</td><td>",regmodel_poly3_metrics[4],"</td><td>",'<a href="PE_GE_poly3.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1020
|
|
1021 "<tr><td>Polynomial regression with degree 4</td><td>",regmodel_poly4_metrics[1],"</td><td>",regmodel_poly4_metrics[2],"</td><td>",regmodel_poly4_metrics[3],"</td><td>",regmodel_poly4_metrics[4],"</td><td>",'<a href="PE_GE_poly4.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1022
|
|
1023 "<tr><td>Polynomial regression with degree 5</td><td>",regmodel_poly5_metrics[1],"</td><td>",regmodel_poly5_metrics[2],"</td><td>",regmodel_poly5_metrics[3],"</td><td>",regmodel_poly5_metrics[4],"</td><td>",'<a href="PE_GE_poly5.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1024
|
|
1025 "<tr><td>Polynomial regression with degree 6</td><td>",regmodel_poly6_metrics[1],"</td><td>",regmodel_poly6_metrics[2],"</td><td>",regmodel_poly6_metrics[3],"</td><td>",regmodel_poly6_metrics[4],"</td><td>",'<a href="PE_GE_poly6.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1026
|
|
1027 "<tr><td>Generalized additive models</td><td>",regmodel_gam_metrics[1],"</td><td>",regmodel_gam_metrics[2],"</td><td>",regmodel_gam_metrics[3],"</td><td>",regmodel_gam_metrics[4],"</td><td>",'<a href="PE_GE_gam.pdf" target="_blank">Link</a>',"</td></tr>",
|
|
1028
|
|
1029 "</table>",
|
|
1030 file = htmloutfile, append = TRUE);
|
|
1031
|
|
1032
|
|
1033 # Warning On
|
5
|
1034 # options(warn = oldw)
|
0
|
1035
|
|
1036
|
|
1037 |