Mercurial > repos > pravs > protein_rna_correlation
comparison protein_rna_correlation.r @ 0:fc89f8c3b777 draft
planemo upload
author | pravs |
---|---|
date | Sun, 17 Jun 2018 04:20:06 -0400 |
parents | |
children | 412403eec79c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fc89f8c3b777 |
---|---|
1 #================================================================================== | |
2 # About the script | |
3 #================================================================================== | |
4 # Version: V1 | |
5 # This script works for single sample only | |
6 # It takes GE (Gene Expression) and PE (Protein expression) data of one sample and perform correlation, regression analysis between PE and GE data | |
7 # Input data can be of tsv format | |
8 # Script also need a parameter or option file | |
9 | |
10 #================================================================================== | |
11 # Dependencies | |
12 #================================================================================== | |
13 # Following R package has to be installed. | |
14 # data.table | |
15 # gplots | |
16 # MASS | |
17 # DMwR | |
18 # mgcv | |
19 # It can be installed by following R command in R session. e.g. install.packages("data.table") | |
20 | |
21 #================================================================================== | |
22 # How to Run | |
23 #================================================================================== | |
24 # Rscript PE_GE_association_singleSample_V1.r <PE_file> <GE_file> <Option_file containing tool parameters> <Ensembl map file containing directory path> <outdir> | |
25 | |
26 #================================================================================== | |
27 # Arguments | |
28 #================================================================================== | |
29 # Arg1. <PE file>: PE data (tsv format) | |
30 # Arg2. <GE file>: GE data (tsv format) | |
31 # Arg3. <Option file>: tsv format, key\tvalue | |
32 # Options are | |
33 # PE_idcolno: Column number of PE file containing protein IDs | |
34 # GE_idcolno: Column number of GE file containing transcript IDs | |
35 # PE_expcolno: Column number of PE file containing protein expression values | |
36 # GE_expcolno: Column number of GE file containing transcript expression values | |
37 # PE_idtype: protein id type. It can be either Uniprot or Ensembl or HGNC_symbol | |
38 # GE_idtype: transcript id type. At present it is only one type i.e. Ensembl or HGNC_symbol | |
39 # Organism: Organism | |
40 # writeMapUnmap: Whether to write mapped and unmapped data in input data format. It takes value as 1 or 0. If 1, mapped and unmapped data is written. Default is 1. | |
41 # doscale: Whether perform scaling to input data or not. If yet, abundance values are normalized by standard normalization. Default 1 | |
42 # Arg4. <Ensembl map file containg directory>: Path to Ensembl map file containg directory e.g. /home/user/Ensembl/mapfiles | |
43 # Arg5. <Outdir>: output directory (e.g. /home/user/out1) | |
44 | |
45 #================================================================================== | |
46 # Sample option file | |
47 #================================================================================== | |
48 #PE_idcolno 7 | |
49 #GE_idcolno 1 | |
50 #PE_expcolno 2 | |
51 #GE_expcolno 3 | |
52 #PE_idtype Ensembl | |
53 #GE_idtype Ensembl | |
54 #Organism mmusculus | |
55 #writeMapUnmap 1 | |
56 #doscale 1 | |
57 | |
58 #================================================================================== | |
59 # Output | |
60 #================================================================================== | |
61 # The script outputs image and data folder along with Correlation_result.html and Result.log file | |
62 # Result.log: Log file | |
63 # Correlation_result.html; main result file in html format | |
64 | |
65 # data folder contains following output files | |
66 | |
67 # PE_abundance.tsv: 2 column tsv file containing mapped id and protein expression values | |
68 # GE_abundance.tsv: 2 column tsv file containing mapped id and transcript expression values | |
69 | |
70 # If writeMapUnmap is 1 i.e. to write mapped and unmapped data, 4 additional file will be written | |
71 # PE_unmapped.tsv: Output format is same as input, PE unmapped data is written | |
72 # GE_unmapped.tsv: Output format is same as input, GE unmapped data is written | |
73 # PE_mapped.tsv: Output format is same as input, PE mapped data is written | |
74 # GE_mapped.tsv: Output format is same as input, GE mapped data is written | |
75 | |
76 # PE_GE_influential_observation.tsv: Influential observations based on cook's distance | |
77 # PE_GE_kmeans_clusterpoints.txt: Observations clustered based on kmeans clustering. File contains cluster assignment of each observations | |
78 | |
79 #================================================================================== | |
80 # ............................SCRIPT STARTS FROM HERE ............................. | |
81 #================================================================================== | |
82 # Warning Off | |
83 oldw <- getOption("warn") | |
84 options(warn = -1) | |
85 #============================================================================================================= | |
86 # Functions | |
87 #============================================================================================================= | |
88 usage <- function() | |
89 { | |
90 cat("\n\n###########\nERROR: Rscript PE_GE_association_singleSample_V1.r <PE file> <GE file> <Option file containing tool parameters> <Ensembl map file containing directory path> <outdir>\n###########\n\n"); | |
91 } | |
92 | |
93 #============================================================================================================= | |
94 # Global variables | |
95 #============================================================================================================= | |
96 noargs = 13; | |
97 | |
98 #============================================================================================================= | |
99 # Parse command line arguments in args object | |
100 #============================================================================================================= | |
101 args = commandArgs(trailingOnly = TRUE); | |
102 | |
103 | |
104 #============================================================================================================= | |
105 # Check for No of arguments | |
106 #============================================================================================================= | |
107 if(length(args) != noargs) | |
108 { | |
109 usage(); | |
110 stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse="")); | |
111 } | |
112 | |
113 #====================================================================== | |
114 # Load libraries | |
115 #====================================================================== | |
116 library(data.table, warn.conflicts=FALSE); | |
117 library(lattice, warn.conflicts=FALSE); | |
118 library(grid, warn.conflicts=FALSE); | |
119 library(nlme, warn.conflicts=FALSE); | |
120 library(gplots, warn.conflicts=FALSE); | |
121 library(MASS, warn.conflicts=FALSE); | |
122 library(DMwR, warn.conflicts=FALSE); | |
123 library(mgcv, warn.conflicts=FALSE); | |
124 | |
125 | |
126 #============================================================================================================= | |
127 # Set variables | |
128 #============================================================================================================= | |
129 #PE_file = args[1]; # Protein abundance data file | |
130 #GE_file = args[2]; # Gene expression data file | |
131 #option_file = args[3]; # Option file containing various parameters | |
132 #biomartdir = args[4]; # Biomart map file containing directory path in local system | |
133 #outdir = args[5]; # output directory | |
134 | |
135 PE_file = args[1]; # Protein abundance data file | |
136 GE_file = args[2]; # Gene expression data file | |
137 #option_file = args[3]; # Option file containing various parameters | |
138 #biomartdir = args[4]; # Biomart map file containing directory path in local system | |
139 outdir = args[13]; # output directory | |
140 | |
141 #imagesubdirprefix = "image"; | |
142 #datasubdirprefix = "data"; | |
143 #htmloutfile = "Correlation_result.html"; | |
144 htmloutfile = args[12] | |
145 | |
146 #logfile = "Result.log"; | |
147 PE_outfile_mapped = "PE_mapped.tsv"; | |
148 GE_outfile_mapped = "GE_mapped.tsv"; | |
149 PE_outfile_unmapped = "PE_unmapped.tsv"; | |
150 GE_outfile_unmapped = "GE_unmapped.tsv"; | |
151 PE_expfile = "PE_abundance.tsv"; | |
152 GE_expfile = "GE_abundance.tsv"; | |
153 PE_outfile_excluded_naInf = "PE_excluded_NA_Inf.tsv"; | |
154 GE_outfile_excluded_naInf = "GE_excluded_NA_Inf.tsv"; | |
155 | |
156 #============================================================================================================= | |
157 # Check input files existance | |
158 #============================================================================================================= | |
159 if(! file.exists(PE_file)) | |
160 { | |
161 usage(); | |
162 stop(paste("Input PE_file file does not exists. Path given: ",PE_file,sep="",collapse="")); | |
163 } | |
164 if(! file.exists(GE_file)) | |
165 { | |
166 usage(); | |
167 stop(paste("Input GE_file does not exists. Path given: ",GE_file,sep="",collapse="")); | |
168 } | |
169 #if(! file.exists(option_file)) | |
170 #{ | |
171 # usage(); | |
172 # stop(paste("Input option_file does not exists. Path given: ",option_file,sep="",collapse="")); | |
173 #} | |
174 | |
175 #============================================================================================================= | |
176 # Read param_file and set parameter/option data frame | |
177 #============================================================================================================= | |
178 #optiondf = read.table(option_file, header = FALSE, stringsAsFactors = FALSE) | |
179 #rownames(optiondf) = optiondf[,1]; | |
180 | |
181 #============================================================================================================= | |
182 # Define option variables | |
183 #============================================================================================================= | |
184 #PE_idcolno = as.numeric(optiondf["PE_idcolno",2]); | |
185 #GE_idcolno = as.numeric(optiondf["GE_idcolno",2]); | |
186 #PE_expcolno = as.numeric(optiondf["PE_expcolno",2]); | |
187 #GE_expcolno = as.numeric(optiondf["GE_expcolno",2]); | |
188 #PE_idtype = optiondf["PE_idtype",2]; | |
189 #GE_idtype = optiondf["GE_idtype",2]; | |
190 #Organism = optiondf["Organism",2]; | |
191 #writeMapUnmap = as.logical(as.numeric(optiondf["writeMapUnmap",2])); | |
192 #doscale = as.logical(as.numeric(optiondf["doscale",2])); | |
193 | |
194 PE_idcolno = as.numeric(args[3]) | |
195 GE_idcolno = as.numeric(args[4]) | |
196 PE_expcolno = as.numeric(args[5]) | |
197 GE_expcolno = as.numeric(args[6]) | |
198 PE_idtype = args[7] | |
199 GE_idtype = args[8] | |
200 #Organism = args[9] | |
201 writeMapUnmap = as.logical(as.numeric(args[10])); | |
202 doscale = as.logical(as.numeric(args[11])); | |
203 | |
204 | |
205 #1 PE_file = "test_data/PE_mouse_singlesample.txt" | |
206 #2 GE_file = "test_data/GE_mouse_singlesample.txt" | |
207 #3 PE_idcolno = 7 | |
208 #4 GE_idcolno = 1 | |
209 #5 PE_expcolno = 13 | |
210 #6 GE_expcolno = 10 | |
211 #7 PE_idtype = "Ensembl_with_version" | |
212 #8 GE_idtype = "Ensembl_with_version" | |
213 #10 writeMapUnmap = 1 | |
214 #11 doscale = 1 | |
215 #9 biomart_mapfile = "test_data/mmusculus_gene_ensembl__GRCm38.p6.map" | |
216 #12 htmloutfile = "html_out.html" | |
217 #13 outdir = "output_elements" | |
218 | |
219 #============================================================================================================= | |
220 # Set column name of biomart map file (idtype) based on whether Ensembl or Uniprot | |
221 #============================================================================================================= | |
222 if(PE_idtype == "Ensembl") | |
223 { | |
224 PE_idtype = "ensembl_peptide_id"; | |
225 }else | |
226 { | |
227 if(PE_idtype == "Ensembl_with_version") | |
228 { | |
229 PE_idtype = "ensembl_peptide_id_version"; | |
230 }else{ | |
231 if(PE_idtype == "HGNC_symbol") | |
232 { | |
233 PE_idtype = "hgnc_symbol"; | |
234 } | |
235 } | |
236 } | |
237 | |
238 if(GE_idtype == "Ensembl") | |
239 { | |
240 GE_idtype = "ensembl_transcript_id"; | |
241 }else | |
242 { | |
243 if(GE_idtype == "Ensembl_with_version") | |
244 { | |
245 GE_idtype = "ensembl_transcript_id_version"; | |
246 }else{ | |
247 if(GE_idtype == "HGNC_symbol") | |
248 { | |
249 GE_idtype = "hgnc_symbol"; | |
250 } | |
251 } | |
252 } | |
253 #============================================================================================================= | |
254 # Identify biomart mapping file | |
255 #============================================================================================================= | |
256 #biomartdir = gsub(biomartdir, pattern="/$", replacement="") | |
257 #biomart_mapfilename = list.files(path = biomartdir, pattern = Organism); | |
258 #biomart_mapfile = paste(biomartdir,"/",biomart_mapfilename,sep="",collapse=""); | |
259 #print(biomart_mapfile); | |
260 biomart_mapfile = args[9]; | |
261 #============================================================================================================= | |
262 # Parse PE, GE, biomart file file | |
263 #============================================================================================================= | |
264 PE_df = as.data.frame(fread(input=PE_file, header=T, sep="\t")); | |
265 GE_df = as.data.frame(fread(input=GE_file, header=T, sep="\t")); | |
266 biomart_df = as.data.frame(fread(input=biomart_mapfile, header=T, sep="\t")); | |
267 | |
268 #============================================================================================================= | |
269 # Create directory structures and then set the working directory to output directory | |
270 #============================================================================================================= | |
271 if(! file.exists(outdir)) | |
272 { | |
273 dir.create(outdir); | |
274 } | |
275 | |
276 #tempdir = paste(outdir,"/",imagesubdirprefix,sep="",collapse=""); | |
277 #if(! file.exists(tempdir)) | |
278 #{ | |
279 # dir.create(tempdir); | |
280 #} | |
281 | |
282 #tempdir = paste(outdir,"/",datasubdirprefix,sep="",collapse=""); | |
283 #if(! file.exists(tempdir)) | |
284 #{ | |
285 # dir.create(tempdir); | |
286 #} | |
287 | |
288 #setwd(outdir); | |
289 logfile = paste(outdir,"/", "Result.log",sep="",collapse=""); | |
290 | |
291 #============================================================================================================= | |
292 # Write initial data summary in html outfile | |
293 #============================================================================================================= | |
294 cat("<html><body>\n", file = htmloutfile); | |
295 cat("<h1>Association between proteomics and transcriptomics data</h1>\n", | |
296 "<font color='blue'><h3>Input data summary</h3></font>", | |
297 "<ul>", | |
298 "<li>Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)","</li>", | |
299 "<li>Input PE data dimension (Row Column): ", dim(PE_df),"</li>", | |
300 "<li>Input GE data dimension (Row Column): ", dim(GE_df),"</li>", | |
301 #"<li>Organism selected: ", Organism,"</li>", | |
302 "<li>Protein ID fetched from column: ", PE_idcolno,"</li>", | |
303 "<li>Transcript ID fetched from column: ", GE_idcolno,"</li>", | |
304 "<li>Protein ID type: ", PE_idtype,"</li>", | |
305 "<li>Transcript ID type: ", GE_idtype,"</li>", | |
306 "<li>Protein expression data fetched from column: ", PE_expcolno,"</li>", | |
307 "<li>Transcript expression data fetched from column: ", GE_expcolno,"</li>", | |
308 file = htmloutfile, append = TRUE); | |
309 | |
310 #============================================================================================================= | |
311 # Write initial data summary in logfile | |
312 #============================================================================================================= | |
313 #cat("Current work dir:", outdir,"\n"); | |
314 cat("Processing started\n---------------------\n", file=logfile); | |
315 cat("Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)\n", file=logfile, append=T); | |
316 cat("Input PE data dimension (Row Column): ", dim(PE_df),"\n", file=logfile, append=T) | |
317 cat("Input GE data dimension (Row Column): ", dim(GE_df),"\n", file=logfile, append=T) | |
318 #cat("Organism selected: ", Organism,"\n", file=logfile, append=T) | |
319 #cat("Biomart map file used: ", biomart_mapfilename,"\n", file=logfile, append=T) | |
320 cat("Ensembl Biomart mapping data dimension (Row Column): ", dim(biomart_df),"\n", file=logfile, append=T) | |
321 cat("\n\nProtein ID to Transcript ID mapping started\n----------------\n", file=logfile, append=T) | |
322 cat("Protein ID fetched from column:", PE_idcolno,"\n", file=logfile, append=T) | |
323 cat("Transcript ID fetched from column:", GE_idcolno, "\n",file=logfile, append=T) | |
324 cat("Protein ID type:", PE_idtype, "\n",file=logfile, append=T) | |
325 cat("Transcript ID type:", GE_idtype,"\n", file=logfile, append=T); | |
326 cat("Protein expression data fetched from column:", PE_expcolno,"\n", file=logfile, append=T) | |
327 cat("Transcript expression data fetched from column:", GE_expcolno, "\n",file=logfile, append=T) | |
328 | |
329 #============================================================================================================= | |
330 # Mapping starts here | |
331 # Pseudocode | |
332 # Loop over each row of PE file, fetch protein_id | |
333 # Search the biomartmap file and obtain corresponding transcript_id | |
334 # Take the mapped transcript_id and search in GE file, which row it correspond | |
335 # Store the PE rowno and GE rowno in rowpair object | |
336 #============================================================================================================= | |
337 rowpair = data.frame(PE_rowno = 0, GE_rowno = 0); | |
338 cat("Total rows:", nrow(PE_df),"\n"); | |
339 cat("\n\nTotal protein ids to be mapped: ", nrow(PE_df),"\n", file=logfile, append=T); | |
340 messagelog = "\n\nBelow are protein IDs, for which no match is observed in Ensembl Biomart Map file.\n\n"; | |
341 | |
342 # GE_id column | |
343 GE_ids = GE_df[,GE_idcolno]; | |
344 GE_ids = gsub(x=GE_ids, pattern=".\\d+$", replacement=""); # Remove version | |
345 | |
346 # Loop over every row of PE data (PE_df) | |
347 for(i in 1:nrow(PE_df)) | |
348 { | |
349 | |
350 if(i%%100 ==0) | |
351 { | |
352 cat("Total rows processed ", i,"\n", file=logfile, append=T); | |
353 print(i); | |
354 } | |
355 | |
356 queryid = PE_df[i,PE_idcolno]; | |
357 #queryid = gsub(x=queryid, pattern=".\\d+$", replacement=""); # Remove version | |
358 #print(queryid); | |
359 | |
360 PE_df_matchrowno = i; # Row number in PE_df which matches queryid | |
361 | |
362 if(PE_idtype == "Uniprot") | |
363 { | |
364 biomart_matchrowno = which(biomart_df[,8] == queryid | biomart_df[,9] == queryid); # Row number of biomart_df which matches queryid | |
365 }else{ | |
366 biomart_matchrowno = which(biomart_df[,PE_idtype] == queryid); # Row number of biomart_df which matches queryid | |
367 } | |
368 | |
369 # If match found, map protein id to GE id and find corresponding match row number of GE_df | |
370 if(length(biomart_matchrowno)>0) | |
371 { | |
372 GE_df_matchrowno = which(GE_ids %in% biomart_df[biomart_matchrowno[1],GE_idtype]); | |
373 rowpair = rbind( rowpair, c(PE_df_matchrowno, GE_df_matchrowno)); | |
374 if(length(GE_df_matchrowno) > 1) | |
375 { | |
376 cat("\nFor protein ID ", i," multiple transcript mapping found\n", file=logfile, append=T); | |
377 | |
378 cat( | |
379 "<br><font color=",'red',">For protein ID", i," multiple transcript mapping found</font><br>",file = htmloutfile, append = TRUE); | |
380 } | |
381 }else{ | |
382 messagelog = paste(messagelog, queryid, "\n",sep="", collapse=""); | |
383 } | |
384 } | |
385 rowpair = rowpair[-1,]; | |
386 | |
387 #============================================================================================================= | |
388 # Write mapping summary, mapped and unmapped data | |
389 #============================================================================================================= | |
390 cat( | |
391 "<li>Total Protein ID mapped: ", length(intersect(1:nrow(PE_df), rowpair[,1])),"</li>", | |
392 "<li>Total Protein ID unmapped: ", length(setdiff(1:nrow(PE_df), rowpair[,1])),"</li>", | |
393 "<li>Total Transcript ID mapped: ", length(intersect(1:nrow(GE_df), rowpair[,2])),"</li>", | |
394 "<li>Total Transcript ID unmapped: ", length(setdiff(1:nrow(GE_df), rowpair[,2])),"</li></ul>", | |
395 file = htmloutfile, append = TRUE); | |
396 | |
397 cat("\n\nMapping Statistics\n---------------------\n", file=logfile, append=T); | |
398 cat("Total Protein ID mapped:", length(intersect(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T) | |
399 cat("Total Protein ID unmapped:", length(setdiff(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T) | |
400 cat("Total Transcript ID mapped:", length(intersect(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T) | |
401 cat("Total Transcript ID unmapped:", length(setdiff(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T) | |
402 | |
403 cat(messagelog,"\n",file=logfile, append=T); | |
404 | |
405 if(writeMapUnmap) | |
406 { | |
407 write.table(PE_df[rowpair[,1], ], file=paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
408 write.table(GE_df[rowpair[,2], ], file= paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
409 write.table(PE_df[-rowpair[,1], ], file= paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
410 write.table(GE_df[-rowpair[,2], ], file=paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n"); | |
411 | |
412 cat( | |
413 "<font color='blue'><h3>Download mapped unmapped data</h3></font>", | |
414 "<ul><li>Protein mapped data: ", '<a href="',paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
415 "<li>Protein unmapped data: ", '<a href="',paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
416 "<li>Transcript mapped data: ", '<a href="',paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
417 "<li>Transcript unmapped data: ", '<a href="',paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
418 file = htmloutfile, append = TRUE); | |
419 } | |
420 | |
421 write.table(PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)], file=paste(outdir,"/",PE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
422 write.table(GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)], file=paste(outdir,"/",GE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
423 | |
424 cat( | |
425 "<li>Protein abundance data: ", '<a href="',paste(outdir,"/",PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
426 "<li>Transcript abundance data: ", '<a href="',paste(outdir,"/",GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", | |
427 file = htmloutfile, append = TRUE); | |
428 | |
429 #========================================================================================== | |
430 # Analysis (correlation and regression) starts here | |
431 #========================================================================================== | |
432 cat("Analysis started\n---------------------------\n",file=logfile, append=T); | |
433 proteome_df = PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)]; | |
434 transcriptome_df = GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)]; | |
435 nPE = nrow(proteome_df); | |
436 nGE = nrow(transcriptome_df) | |
437 | |
438 cat("Total Protein ID: ",nPE,"\n",file=logfile, append=T); | |
439 cat("Total Transcript ID: ",nGE,"\n",file=logfile, append=T); | |
440 | |
441 #========================================================================================== | |
442 # Data summary | |
443 #========================================================================================== | |
444 cat( | |
445 "<ul>", | |
446 "<li>Number of entries in Transcriptome data used for correlation: ",nPE,"</li>", | |
447 "<li>Number of entries in Proteome data used for correlation: ",nGE,"</li></ul>", | |
448 file = htmloutfile, append = TRUE); | |
449 | |
450 #============================================================================================================= | |
451 # Remove entries with NA or Inf or -Inf in Transcriptome and Proteome data which will create problem in correlation analysis | |
452 #============================================================================================================= | |
453 totna = sum(is.na(transcriptome_df[,2]) | is.na(proteome_df[,2])); | |
454 totinf = sum(is.infinite(transcriptome_df[,2]) | is.infinite(proteome_df[,2])); | |
455 | |
456 cat("<font color='blue'><h3>Filtering</h3></font>","Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry<br>","<ul>","<li>Number of NA found: ",totna,"</li>","<li>Number of Inf or -Inf found: ",totinf,"</li></ul>",file = htmloutfile, append = TRUE); | |
457 | |
458 cat("Total NA observed in either Transcriptome or Proteome data: ",totna,"\n",file=logfile, append=T); | |
459 cat("Total Inf or -Inf observed in either Transcriptome or Proteome data: ",totinf,"\n",file=logfile, append=T); | |
460 | |
461 if(totna > 0 | totinf > 0) | |
462 { | |
463 excludeIndices_PE = which(is.na(proteome_df[,2]) | is.infinite(proteome_df[,2])); | |
464 excludeIndices_GE = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2])); | |
465 excludeIndices = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2]) | is.na(proteome_df[,2]) | is.infinite(proteome_df[,2])); | |
466 | |
467 # Write excluded transcriptomics and proteomics data to file | |
468 write.table(proteome_df[excludeIndices_PE,], file=paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
469 write.table(transcriptome_df[excludeIndices_GE,], file=paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") | |
470 | |
471 # Write excluded transcriptomics and proteomics data link to html file | |
472 cat( | |
473 "<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", | |
474 "<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", | |
475 file = htmloutfile, append = TRUE); | |
476 | |
477 # Keep the unexcluded entries only | |
478 transcriptome_df = transcriptome_df[-excludeIndices,]; | |
479 proteome_df = proteome_df[-excludeIndices,]; | |
480 nPE = nrow(proteome_df); | |
481 nGE = nrow(transcriptome_df) | |
482 | |
483 cat("<font color='blue'><h3>Filtered data summary</h3></font>", | |
484 "Excluding entires with abundance values: NA/Inf/-Inf<br>", | |
485 "<ul>", | |
486 "<li>Number of entries in Transcriptome data remained: ",nrow(transcriptome_df),"</li>", | |
487 "<li>Number of entries in Proteome data remained: ",nrow(proteome_df),"</li></ul>", file = htmloutfile, append = TRUE); | |
488 | |
489 cat("Excluding entires with abundance values: NA/Inf/-Inf","\n",file=logfile, append=T); | |
490 | |
491 cat("Total Protein ID after filtering: ",nPE,"\n",file=logfile, append=T); | |
492 cat("Total Transcript ID after filtering: ",nGE,"\n",file=logfile, append=T); | |
493 | |
494 } | |
495 | |
496 #========================================================================================== | |
497 # Scaling of data | |
498 #========================================================================================== | |
499 if(doscale) | |
500 { | |
501 proteome_df[,2] = scale(proteome_df[,2], center = TRUE, scale = TRUE); | |
502 transcriptome_df[,2] = scale(transcriptome_df[,2], center = TRUE, scale = TRUE); | |
503 } | |
504 | |
505 #============================================================================================================= | |
506 # Proteome and Transcriptome data summary | |
507 #============================================================================================================= | |
508 cat("Calculating summary of PE and GE data\n",file=logfile, append=T); | |
509 s1 = summary(proteome_df[,2]); | |
510 s2 = summary(transcriptome_df[,2]) | |
511 | |
512 cat( | |
513 "<font color='blue'><h3>Proteome data summary</h3></font>\n", | |
514 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>', | |
515 file = htmloutfile, append = TRUE); | |
516 | |
517 for(i in 1:length(s1)) | |
518 { | |
519 cat("<tr><td>",names(s1[i]),"</td><td>", s1[i],"</td></tr>\n", file = htmloutfile, append = TRUE) | |
520 } | |
521 cat("</table>\n", file = htmloutfile, append = TRUE) | |
522 | |
523 cat( | |
524 "<font color='blue'><h3>Transcriptome data summary</h3></font>\n", | |
525 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>', | |
526 file = htmloutfile, append = TRUE); | |
527 | |
528 for(i in 1:length(s2)) | |
529 { | |
530 cat("<tr><td>",names(s2[i]),"</td><td>", s2[i],"</td></tr>\n", file = htmloutfile, append = TRUE) | |
531 } | |
532 cat("</table>\n", file = htmloutfile, append = TRUE) | |
533 | |
534 #============================================================================================================= | |
535 # Distribution of proteome and transcriptome abundance (Box and Density plot) | |
536 #============================================================================================================= | |
537 cat("Generating Box and Density plot\n",file=logfile, append=T); | |
538 outplot = paste("./",outdir,"/AbundancePlot.png",sep="",collapse=""); | |
539 png(outplot); | |
540 par(mfrow=c(2,2)); | |
541 boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5); | |
542 plot(density(proteome_df[,2]), xlab="Protein Abundance", ylab="Density", main="Proteome abundance", cex.lab=1.5); | |
543 boxplot(transcriptome_df[,2], ylab="Abundance", main="Transcriptome abundance", cex.lab=1.5); | |
544 plot(density(transcriptome_df[,2]), xlab="Transcript Abundance", ylab="Density", main="Transcriptome abundance", cex.lab=1.5); | |
545 dev.off(); | |
546 | |
547 cat( | |
548 "<font color='blue'><h3>Distribution of Proteome and Transcripome abundance (Box plot and Density plot)</h3></font>\n", | |
549 '<img src="AbundancePlot.png">', | |
550 file = htmloutfile, append = TRUE); | |
551 | |
552 #============================================================================================================= | |
553 # Scatter plot | |
554 #============================================================================================================= | |
555 cat("Generating scatter plot\n",file=logfile, append=T); | |
556 outplot = paste("./",outdir,"/AbundancePlot_scatter.png",sep="",collapse=""); | |
557 png(outplot); | |
558 par(mfrow=c(1,1)); | |
559 scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); | |
560 | |
561 cat( | |
562 "<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>\n", | |
563 '<img src="AbundancePlot_scatter.png">', | |
564 file = htmloutfile, append = TRUE); | |
565 | |
566 #============================================================================================================= | |
567 # Correlation testing | |
568 #============================================================================================================= | |
569 cat("Estimating correlation\n",file=logfile, append=T); | |
570 cor_result_pearson = cor.test(transcriptome_df[,2], proteome_df[,2], method = "pearson"); | |
571 cor_result_spearman = cor.test(transcriptome_df[,2], proteome_df[,2], method = "spearman"); | |
572 cor_result_kendall = cor.test(transcriptome_df[,2], proteome_df[,2], method = "kendall"); | |
573 | |
574 cat( | |
575 "<font color='blue'><h3>Correlation with all data</h3></font>\n", | |
576 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>', | |
577 file = htmloutfile, append = TRUE); | |
578 | |
579 cat( | |
580 "<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>", | |
581 "<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>", | |
582 "<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>", | |
583 file = htmloutfile, append = TRUE) | |
584 cat("</table>\n", file = htmloutfile, append = TRUE) | |
585 | |
586 cat( | |
587 '<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>cook\'s distance based approach</u> to identify such influential observations.</font>', | |
588 file = htmloutfile, append = TRUE); | |
589 | |
590 #============================================================================================================= | |
591 # Linear Regression | |
592 #============================================================================================================= | |
593 cat("Fitting linear regression model\n",file=logfile, append=T); | |
594 PE_GE_data = proteome_df; | |
595 PE_GE_data = cbind(PE_GE_data, transcriptome_df); | |
596 colnames(PE_GE_data) = c("PE_ID","PE_abundance","GE_ID","GE_abundance"); | |
597 | |
598 regmodel = lm(PE_abundance~GE_abundance, data=PE_GE_data); | |
599 regmodel_summary = summary(regmodel); | |
600 | |
601 cat( | |
602 "<font color='blue'><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>\n", | |
603 "<p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>\n", | |
604 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>', | |
605 file = htmloutfile, append = TRUE); | |
606 | |
607 cat( | |
608 "<tr><td>Formula</td><td>","PE_abundance~GE_abundance","</td></tr>\n", | |
609 "<tr><td colspan='2' align='center'> <b>Coefficients</b></td>","</tr>\n", | |
610 "<tr><td>",names(regmodel$coefficients[1]),"</td><td>",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","</td></tr>\n", | |
611 "<tr><td>",names(regmodel$coefficients[2]),"</td><td>",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","</td></tr>\n", | |
612 "<tr><td colspan='2' align='center'> <b>Model parameters</b></td>","</tr>\n", | |
613 "<tr><td>Residual standard error</td><td>",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)</td></tr>\n", | |
614 "<tr><td>F-statistic</td><td>",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and ",regmodel_summary$fstatistic[3]," degree of freedom)</td></tr>\n", | |
615 "<tr><td>R-squared</td><td>",regmodel_summary$r.squared,"</td></tr>\n", | |
616 "<tr><td>Adjusted R-squared</td><td>",regmodel_summary$adj.r.squared,"</td></tr>\n", | |
617 file = htmloutfile, append = TRUE) | |
618 cat("</table>\n", file = htmloutfile, append = TRUE) | |
619 | |
620 #============================================================================================================= | |
621 # Plotting various regression diagnostics plots | |
622 #============================================================================================================= | |
623 outplot1 = paste("./",outdir,"/PE_GE_modelfit.pdf",sep="",collapse=""); | |
624 pdf(outplot1); | |
625 devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1 | |
626 print(.Devices) | |
627 print(c(devnum,"+++")); | |
628 | |
629 regmodel_predictedy = predict(regmodel, PE_GE_data); | |
630 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with all data"); | |
631 points(PE_GE_data[,"GE_abundance"], regmodel_predictedy, col="red"); | |
632 | |
633 cat("Generating regression diagnostics plot\n",file=logfile, append=T); | |
634 cat( | |
635 "<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>\n", | |
636 file = htmloutfile, append = TRUE); | |
637 | |
638 outplot = paste("./",outdir,"/PE_GE_lm_1.png",sep="",collapse=""); | |
639 png(outplot); | |
640 par(mfrow=c(1,1)); | |
641 plot(regmodel, 1, cex.lab=1.5); | |
642 dev.off(); | |
643 | |
644 outplot = paste("./",outdir,"/PE_GE_lm_2.png",sep="",collapse=""); | |
645 png(outplot); | |
646 par(mfrow=c(1,1)); | |
647 plot(regmodel, 2, cex.lab=1.5); | |
648 dev.off(); | |
649 | |
650 outplot = paste("./",outdir,"/PE_GE_lm_3.png",sep="",collapse=""); | |
651 png(outplot); | |
652 par(mfrow=c(1,1)); | |
653 plot(regmodel, 3, cex.lab=1.5); | |
654 dev.off(); | |
655 | |
656 outplot = paste("./",outdir,"/PE_GE_lm_5.png",sep="",collapse=""); | |
657 png(outplot); | |
658 par(mfrow=c(1,1)); | |
659 plot(regmodel, 5, cex.lab=1.5); | |
660 dev.off(); | |
661 | |
662 outplot = paste("./",outdir,"/PE_GE_lm.pdf",sep="",collapse=""); | |
663 pdf(outplot); | |
664 plot(regmodel); | |
665 dev.off(); | |
666 regmodel_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel$fitted.values) | |
667 | |
668 | |
669 cat( | |
670 "<u><font color='brown'><h4>Residuals vs Fitted plot</h4></font></u>\n", | |
671 '<img src="PE_GE_lm_1.png">', | |
672 '<br><br>This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship<br>', | |
673 file = htmloutfile, append = TRUE); | |
674 | |
675 cat( | |
676 "<u><font color='brown'><h4>Normal Q-Q plot of residuals</h4></font></u>\n", | |
677 '<img src="PE_GE_lm_2.png">', | |
678 '<br><br>This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.<br>', | |
679 file = htmloutfile, append = TRUE); | |
680 | |
681 cat( | |
682 "<u><font color='brown'><h4>Scale-Location (or Spread-Location) plot</h4></font></u>\n", | |
683 '<img src="PE_GE_lm_3.png">', | |
684 '<br><br>This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.<br>', | |
685 file = htmloutfile, append = TRUE); | |
686 | |
687 cat( | |
688 "<u><font color='brown'><h4>Residuals vs Leverage plot</h4></font></u>\n", | |
689 '<img src="PE_GE_lm_3.png">', | |
690 '<br><br>This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.<br>', | |
691 file = htmloutfile, append = TRUE); | |
692 | |
693 #============================================================================================================= | |
694 # Identification of influential observations | |
695 #============================================================================================================= | |
696 cat("Identifying influential observations\n",file=logfile, append=T); | |
697 cat( | |
698 "<font color='blue'><h3>Identify influential observations</h3></font>\n", | |
699 file = htmloutfile, append = TRUE); | |
700 cat( | |
701 '<p><b>Cook’s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>cook’s distance > than 4 times the mean</b> may be classified as <b>influential.</b></p>', | |
702 file = htmloutfile, append = TRUE); | |
703 | |
704 cooksd <- cooks.distance(regmodel); | |
705 | |
706 cat("Generating cooksd plot\n",file=logfile, append=T); | |
707 outplot = paste("./",outdir,"/PE_GE_lm_cooksd.png",sep="",collapse=""); | |
708 png(outplot); | |
709 par(mfrow=c(1,1)); | |
710 plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations") # plot cooks distance | |
711 abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line | |
712 #text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2) # add labels | |
713 dev.off(); | |
714 | |
715 cat( | |
716 '<img src="PE_GE_lm_cooksd.png">', | |
717 '<br>In the above plot, observations above red line (4*mean cook\'s distance) are influential, marked in <b>*</b>. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br>', | |
718 file = htmloutfile, append = TRUE); | |
719 | |
720 tempind = which(cooksd>4*mean(cooksd, na.rm=T)); | |
721 PE_GE_data_no_outlier = PE_GE_data[-tempind,]; | |
722 PE_GE_data_no_outlier$cooksd = cooksd[-tempind] | |
723 PE_GE_data_outlier = PE_GE_data[tempind,]; | |
724 PE_GE_data_outlier$cooksd = cooksd[tempind] | |
725 | |
726 cat( | |
727 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>', | |
728 file = htmloutfile, append = TRUE); | |
729 | |
730 cat( | |
731 "<tr><td>Mean cook\'s distance</td><td>",mean(cooksd, na.rm=T),"</td></tr>\n", | |
732 "<tr><td>Total influential observations (cook\'s distance > 4 * mean cook\'s distance)</td><td>",length(tempind),"</td></tr>\n", | |
733 "<tr><td>Total influential observations (cook\'s distance > 3 * mean cook\'s distance)</td><td>",length(which(cooksd>3*mean(cooksd, na.rm=T))),"</td></tr>\n", | |
734 "</table>", | |
735 '<font color="brown"><h4>Top 10 influential observations (cook\'s distance > 4 * mean cook\'s distance)</h4></font>', | |
736 file = htmloutfile, append = TRUE); | |
737 | |
738 cat("Writing influential observations\n",file=logfile, append=T); | |
739 | |
740 outdatafile = paste("./",outdir,"/PE_GE_influential_observation.tsv", sep="", collapse=""); | |
741 cat('<a href="',outdatafile, '" target="_blank">Download entire list</a>',file = htmloutfile, append = TRUE); | |
742 write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F); | |
743 | |
744 cat( | |
745 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>PE_ID</th><th>PE_abundance</th><th>GE_ID</th><th>GE_abundance</th><th>cooksd</th></tr>', | |
746 file = htmloutfile, append = TRUE); | |
747 | |
748 for(i in 1:10) | |
749 { | |
750 cat( | |
751 '<tr>','<td>',PE_GE_data_outlier[i,1],'</td>', | |
752 '<td>',PE_GE_data_outlier[i,2],'</td>', | |
753 '<td>',PE_GE_data_outlier[i,3],'</td>', | |
754 '<td>',PE_GE_data_outlier[i,4],'</td>', | |
755 '<td>',PE_GE_data_outlier[i,5],'</td></tr>', | |
756 file = htmloutfile, append = TRUE); | |
757 } | |
758 cat('</table>',file = htmloutfile, append = TRUE); | |
759 | |
760 #============================================================================================================= | |
761 # Correlation with removal of outliers | |
762 #============================================================================================================= | |
763 | |
764 #============================================================================================================= | |
765 # Scatter plot | |
766 #============================================================================================================= | |
767 outplot = paste("./",outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); | |
768 png(outplot); | |
769 par(mfrow=c(1,1)); | |
770 scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); | |
771 | |
772 cat( | |
773 "<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations</h3></font>\n", | |
774 '<img src="AbundancePlot_scatter_without_outliers.png">', | |
775 file = htmloutfile, append = TRUE); | |
776 | |
777 #============================================================================================================= | |
778 # Correlation | |
779 #============================================================================================================= | |
780 cat("Estimating orrelation with removal of outliers \n",file=logfile, append=T); | |
781 cat( | |
782 "<font color='blue'><h3>Correlation with removal of outliers / influential observations</h3></font>\n", | |
783 '<p>We removed the influential observations and reestimated the correlation values.</p>', | |
784 file = htmloutfile, append = TRUE); | |
785 | |
786 cor_result_pearson = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "pearson"); | |
787 cor_result_spearman = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "spearman"); | |
788 cor_result_kendall = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "kendall"); | |
789 | |
790 cat( | |
791 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>', | |
792 file = htmloutfile, append = TRUE); | |
793 | |
794 cat( | |
795 "<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>", | |
796 "<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>", | |
797 "<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>", | |
798 file = htmloutfile, append = TRUE) | |
799 cat("</table>\n", file = htmloutfile, append = TRUE) | |
800 | |
801 #============================================================================================================= | |
802 # Heatmap | |
803 #============================================================================================================= | |
804 cat( | |
805 "<font color='blue'><h3>Heatmap of PE and GE abundance values</h3></font>\n", | |
806 file = htmloutfile, append = TRUE); | |
807 | |
808 cat("Generating heatmap plot\n",file=logfile, append=T); | |
809 outplot = paste("./",outdir,"/PE_GE_heatmap.png",sep="",collapse=""); | |
810 png(outplot); | |
811 par(mfrow=c(1,1)); | |
812 heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col"); | |
813 dev.off(); | |
814 | |
815 cat( | |
816 '<img src="PE_GE_heatmap.png">', | |
817 file = htmloutfile, append = TRUE); | |
818 | |
819 #============================================================================================================= | |
820 # kmeans clustering | |
821 #============================================================================================================= | |
822 | |
823 PE_GE_data_kdata = PE_GE_data; | |
824 | |
825 | |
826 k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5); | |
827 cat("Generating kmeans plot\n",file=logfile, append=T); | |
828 outplot = paste("./",outdir,"/PE_GE_kmeans.png",sep="",collapse=""); | |
829 png(outplot); | |
830 par(mfrow=c(1,1)); | |
831 scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); | |
832 | |
833 ind=which(k1$cluster==1); | |
834 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="red", pch=16); | |
835 ind=which(k1$cluster==2); | |
836 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="green", pch=16); | |
837 ind=which(k1$cluster==3); | |
838 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="blue", pch=16); | |
839 ind=which(k1$cluster==4); | |
840 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16); | |
841 ind=which(k1$cluster==5); | |
842 points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="black", pch=16); | |
843 dev.off(); | |
844 | |
845 cat( | |
846 "<font color='blue'><h3>Kmean clustering</h3></font>\nNumber of Clusters: 5<br>", | |
847 file = htmloutfile, append = TRUE); | |
848 | |
849 tempind = order(k1$cluster); | |
850 tempoutfile = paste(outdir,"/","PE_GE_kmeans_clusterpoints.txt",sep="",collapse=""); | |
851 | |
852 write.table(data.frame(PE_GE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n") | |
853 | |
854 cat('<a href="',tempoutfile, '" target="_blank">Download cluster list</a><br>',file = htmloutfile, append = TRUE); | |
855 | |
856 cat( | |
857 '<img src="PE_GE_kmeans.png">', | |
858 file = htmloutfile, append = TRUE); | |
859 | |
860 #============================================================================================================= | |
861 # Other Regression fit | |
862 #============================================================================================================= | |
863 dev.set(devnum); | |
864 # Linear regression with removal of outliers | |
865 regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier); | |
866 regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier); | |
867 outplot = paste("./",outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse=""); | |
868 plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers"); | |
869 points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red"); | |
870 | |
871 pdf(outplot); | |
872 plot(regmodel_no_outlier); | |
873 dev.off(); | |
874 regmodel_no_outlier_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_no_outlier$fitted.values) | |
875 | |
876 # Resistant regression (lqs / least trimmed squares method) | |
877 regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data); | |
878 regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data); | |
879 outplot = paste("./",outdir,"/PE_GE_lqs.pdf",sep="",collapse=""); | |
880 pdf(outplot); | |
881 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)"); | |
882 points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red"); | |
883 #plot(regmodel_lqs); | |
884 dev.off(); | |
885 dev.set(devnum); | |
886 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)"); | |
887 points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red"); | |
888 regmodel_lqs_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_lqs$fitted.values) | |
889 | |
890 # Robust regression (rlm / Huber M-estimator method) | |
891 regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data); | |
892 regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data); | |
893 outplot = paste("./",outdir,"/PE_GE_rlm.pdf",sep="",collapse=""); | |
894 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm / Huber M-estimator method)"); | |
895 points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red"); | |
896 pdf(outplot); | |
897 plot(regmodel_rlm); | |
898 dev.off(); | |
899 regmodel_rlm_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_rlm$fitted.values) | |
900 | |
901 # polynomical reg | |
902 regmodel_poly2 = lm(PE_abundance ~ poly(GE_abundance, 2, raw = TRUE), data = PE_GE_data) | |
903 regmodel_poly3 = lm(PE_abundance ~ poly(GE_abundance, 3, raw = TRUE), data = PE_GE_data) | |
904 regmodel_poly4 = lm(PE_abundance ~ poly(GE_abundance, 4, raw = TRUE), data = PE_GE_data) | |
905 regmodel_poly5 = lm(PE_abundance ~ poly(GE_abundance, 5, raw = TRUE), data = PE_GE_data) | |
906 regmodel_poly6 = lm(PE_abundance ~ poly(GE_abundance, 6, raw = TRUE), data = PE_GE_data) | |
907 regmodel_poly2_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly2$fitted.values) | |
908 regmodel_poly3_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly3$fitted.values) | |
909 regmodel_poly4_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly4$fitted.values) | |
910 regmodel_poly5_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly5$fitted.values) | |
911 regmodel_poly6_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly6$fitted.values) | |
912 regmodel_poly2_predictedy = predict(regmodel_poly2, PE_GE_data); | |
913 regmodel_poly3_predictedy = predict(regmodel_poly3, PE_GE_data); | |
914 regmodel_poly4_predictedy = predict(regmodel_poly4, PE_GE_data); | |
915 regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data); | |
916 regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data); | |
917 | |
918 outplot = paste("./",outdir,"/PE_GE_poly2.pdf",sep="",collapse=""); | |
919 dev.set(devnum); | |
920 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2"); | |
921 points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red"); | |
922 pdf(outplot); | |
923 plot(regmodel_poly2); | |
924 dev.off(); | |
925 | |
926 outplot = paste("./",outdir,"/PE_GE_poly3.pdf",sep="",collapse=""); | |
927 dev.set(devnum); | |
928 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3"); | |
929 points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red"); | |
930 pdf(outplot); | |
931 plot(regmodel_poly3); | |
932 dev.off(); | |
933 | |
934 outplot = paste("./",outdir,"/PE_GE_poly4.pdf",sep="",collapse=""); | |
935 dev.set(devnum); | |
936 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4"); | |
937 points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red"); | |
938 pdf(outplot); | |
939 plot(regmodel_poly4); | |
940 dev.off(); | |
941 | |
942 outplot = paste("./",outdir,"/PE_GE_poly5.pdf",sep="",collapse=""); | |
943 dev.set(devnum); | |
944 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5"); | |
945 points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red"); | |
946 pdf(outplot); | |
947 plot(regmodel_poly5); | |
948 dev.off(); | |
949 | |
950 outplot = paste("./",outdir,"/PE_GE_poly6.pdf",sep="",collapse=""); | |
951 dev.set(devnum); | |
952 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6"); | |
953 points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red"); | |
954 pdf(outplot); | |
955 plot(regmodel_poly6); | |
956 dev.off(); | |
957 | |
958 # GAM Generalized additive models | |
959 regmodel_gam <- gam(PE_abundance ~ s(GE_abundance), data = PE_GE_data) | |
960 regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data); | |
961 | |
962 regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values) | |
963 outplot = paste("./",outdir,"/PE_GE_gam.pdf",sep="",collapse=""); | |
964 dev.set(devnum); | |
965 plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models"); | |
966 points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red"); | |
967 pdf(outplot); | |
968 plot(regmodel_gam,pages=1,residuals=TRUE); ## show partial residuals | |
969 plot(regmodel_gam,pages=1,seWithMean=TRUE) ## `with intercept' CIs | |
970 dev.off(); | |
971 dev.off(devnum); | |
972 | |
973 cat( | |
974 "<font color='blue'><h3>Other regression model fitting</h3></font>\n", | |
975 file = htmloutfile, append = TRUE); | |
976 | |
977 cat( | |
978 "<ul> | |
979 <li>MAE:mean absolute error</li> | |
980 <li>MSE: mean squared error</li> | |
981 <li>RMSE:root mean squared error ( sqrt(MSE) )</li> | |
982 <li>MAPE:mean absolute percentage error</li> | |
983 </ul> | |
984 ", | |
985 file = htmloutfile, append = TRUE); | |
986 | |
987 cat( | |
988 '<h4><a href="PE_GE_modelfit.pdf" target="_blank">Comparison of model fits</a></h4>', | |
989 file = htmloutfile, append = TRUE); | |
990 | |
991 cat( | |
992 '<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Model</th><th>MAE</th><th>MSE</th><th>RMSE</th><th>MAPE</th><th>Diagnostics Plot</th></tr>', | |
993 file = htmloutfile, append = TRUE); | |
994 | |
995 cat( | |
996 "<tr><td>Linear regression with all data</td><td>",regmodel_metrics[1],"</td><td>",regmodel_metrics[2],"</td><td>",regmodel_metrics[3],"</td><td>",regmodel_metrics[4],"</td><td>",'<a href="PE_GE_lm.pdf" target="_blank">Link</a>',"</td></tr>", | |
997 | |
998 "<tr><td>Linear regression with removal of outliers</td><td>",regmodel_no_outlier_metrics[1],"</td><td>",regmodel_no_outlier_metrics[2],"</td><td>",regmodel_no_outlier_metrics[3],"</td><td>",regmodel_no_outlier_metrics[4],"</td><td>",'<a href="PE_GE_lm_without_outliers.pdf" target="_blank">Link</a>',"</td></tr>", | |
999 | |
1000 "<tr><td>Resistant regression (lqs / least trimmed squares method)</td><td>",regmodel_lqs_metrics[1],"</td><td>",regmodel_lqs_metrics[2],"</td><td>",regmodel_lqs_metrics[3],"</td><td>",regmodel_lqs_metrics[4],"</td><td>", '<a href="PE_GE_lqs.pdf" target="_blank">Link</a>',"</td></tr>", | |
1001 | |
1002 "<tr><td>Robust regression (rlm / Huber M-estimator method)</td><td>",regmodel_rlm_metrics[1],"</td><td>",regmodel_rlm_metrics[2],"</td><td>",regmodel_rlm_metrics[3],"</td><td>",regmodel_rlm_metrics[4],"</td><td>",'<a href="PE_GE_rlm.pdf" target="_blank">Link</a>',"</td></tr>", | |
1003 | |
1004 | |
1005 "<tr><td>Polynomial regression with degree 2</td><td>",regmodel_poly2_metrics[1],"</td><td>",regmodel_poly2_metrics[2],"</td><td>",regmodel_poly2_metrics[3],"</td><td>",regmodel_poly2_metrics[4],"</td><td>",'<a href="PE_GE_poly2.pdf" target="_blank">Link</a>',"</td></tr>", | |
1006 | |
1007 "<tr><td>Polynomial regression with degree 3</td><td>",regmodel_poly3_metrics[1],"</td><td>",regmodel_poly3_metrics[2],"</td><td>",regmodel_poly3_metrics[3],"</td><td>",regmodel_poly3_metrics[4],"</td><td>",'<a href="PE_GE_poly3.pdf" target="_blank">Link</a>',"</td></tr>", | |
1008 | |
1009 "<tr><td>Polynomial regression with degree 4</td><td>",regmodel_poly4_metrics[1],"</td><td>",regmodel_poly4_metrics[2],"</td><td>",regmodel_poly4_metrics[3],"</td><td>",regmodel_poly4_metrics[4],"</td><td>",'<a href="PE_GE_poly4.pdf" target="_blank">Link</a>',"</td></tr>", | |
1010 | |
1011 "<tr><td>Polynomial regression with degree 5</td><td>",regmodel_poly5_metrics[1],"</td><td>",regmodel_poly5_metrics[2],"</td><td>",regmodel_poly5_metrics[3],"</td><td>",regmodel_poly5_metrics[4],"</td><td>",'<a href="PE_GE_poly5.pdf" target="_blank">Link</a>',"</td></tr>", | |
1012 | |
1013 "<tr><td>Polynomial regression with degree 6</td><td>",regmodel_poly6_metrics[1],"</td><td>",regmodel_poly6_metrics[2],"</td><td>",regmodel_poly6_metrics[3],"</td><td>",regmodel_poly6_metrics[4],"</td><td>",'<a href="PE_GE_poly6.pdf" target="_blank">Link</a>',"</td></tr>", | |
1014 | |
1015 "<tr><td>Generalized additive models</td><td>",regmodel_gam_metrics[1],"</td><td>",regmodel_gam_metrics[2],"</td><td>",regmodel_gam_metrics[3],"</td><td>",regmodel_gam_metrics[4],"</td><td>",'<a href="PE_GE_gam.pdf" target="_blank">Link</a>',"</td></tr>", | |
1016 | |
1017 "</table>", | |
1018 file = htmloutfile, append = TRUE); | |
1019 | |
1020 | |
1021 # Warning On | |
1022 options(warn = oldw) | |
1023 | |
1024 | |
1025 |