Mercurial > repos > pravs > protein_rna_correlation
changeset 5:6bf0203ee17e draft
planemo upload
author | pravs |
---|---|
date | Wed, 20 Jun 2018 20:43:56 -0400 |
parents | b014014e685a |
children | 8e9428eca82c |
files | protein_rna_correlation.r protein_rna_correlation.xml test_data/PE_abundance_GE_abundance_pearson.html tool_dependencies.xml |
diffstat | 4 files changed, 171 insertions(+), 68 deletions(-) [+] |
line wrap: on
line diff
--- a/protein_rna_correlation.r Sun Jun 17 05:06:18 2018 -0400 +++ b/protein_rna_correlation.r Wed Jun 20 20:43:56 2018 -0400 @@ -80,8 +80,8 @@ # ............................SCRIPT STARTS FROM HERE ............................. #================================================================================== # Warning Off -oldw <- getOption("warn") -options(warn = -1) +# oldw <- getOption("warn") +# options(warn = -1) #============================================================================================================= # Functions #============================================================================================================= @@ -411,10 +411,10 @@ cat( "<font color='blue'><h3>Download mapped unmapped data</h3></font>", - "<ul><li>Protein mapped data: ", '<a href="',paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", - "<li>Protein unmapped data: ", '<a href="',paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", - "<li>Transcript mapped data: ", '<a href="',paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", - "<li>Transcript unmapped data: ", '<a href="',paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<ul><li>Protein mapped data: ", '<a href="',paste(PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<li>Protein unmapped data: ", '<a href="',paste(PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<li>Transcript mapped data: ", '<a href="',paste(GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<li>Transcript unmapped data: ", '<a href="',paste(GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", file = htmloutfile, append = TRUE); } @@ -422,8 +422,8 @@ write.table(GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)], file=paste(outdir,"/",GE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n") cat( - "<li>Protein abundance data: ", '<a href="',paste(outdir,"/",PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", - "<li>Transcript abundance data: ", '<a href="',paste(outdir,"/",GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", + "<li>Protein abundance data: ", '<a href="',paste(PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<li>Transcript abundance data: ", '<a href="',paste(GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", file = htmloutfile, append = TRUE); #========================================================================================== @@ -470,8 +470,8 @@ # Write excluded transcriptomics and proteomics data link to html file cat( - "<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", - "<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", + "<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>", + "<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>", file = htmloutfile, append = TRUE); # Keep the unexcluded entries only @@ -535,7 +535,7 @@ # Distribution of proteome and transcriptome abundance (Box and Density plot) #============================================================================================================= cat("Generating Box and Density plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/AbundancePlot.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot.png",sep="",collapse=""); png(outplot); par(mfrow=c(2,2)); boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5); @@ -553,7 +553,7 @@ # Scatter plot #============================================================================================================= cat("Generating scatter plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/AbundancePlot_scatter.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot_scatter.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -620,7 +620,7 @@ #============================================================================================================= # Plotting various regression diagnostics plots #============================================================================================================= - outplot1 = paste("./",outdir,"/PE_GE_modelfit.pdf",sep="",collapse=""); + outplot1 = paste(outdir,"/PE_GE_modelfit.pdf",sep="",collapse=""); pdf(outplot1); devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1 print(.Devices) @@ -635,31 +635,31 @@ "<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>\n", file = htmloutfile, append = TRUE); - outplot = paste("./",outdir,"/PE_GE_lm_1.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_1.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 1, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_2.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_2.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 2, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_3.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_3.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 3, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_5.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_5.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 5, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm.pdf",sep="",collapse=""); pdf(outplot); plot(regmodel); dev.off(); @@ -704,7 +704,7 @@ cooksd <- cooks.distance(regmodel); cat("Generating cooksd plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_lm_cooksd.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_cooksd.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations") # plot cooks distance @@ -737,7 +737,7 @@ cat("Writing influential observations\n",file=logfile, append=T); - outdatafile = paste("./",outdir,"/PE_GE_influential_observation.tsv", sep="", collapse=""); + outdatafile = paste(outdir,"/PE_GE_influential_observation.tsv", sep="", collapse=""); cat('<a href="',outdatafile, '" target="_blank">Download entire list</a>',file = htmloutfile, append = TRUE); write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F); @@ -764,7 +764,7 @@ #============================================================================================================= # Scatter plot #============================================================================================================= - outplot = paste("./",outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -806,10 +806,12 @@ file = htmloutfile, append = TRUE); cat("Generating heatmap plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_heatmap.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_heatmap.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); - heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col"); + #heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col"); + my_palette <- colorRampPalette(c("green", "white", "red"))(299); + heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=my_palette ,Colv=F, labCol=c("PE","GE"), scale="col", dendrogram = "row"); dev.off(); cat( @@ -825,7 +827,7 @@ k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5); cat("Generating kmeans plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_kmeans.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_kmeans.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -864,7 +866,7 @@ # Linear regression with removal of outliers regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier); regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier); - outplot = paste("./",outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse=""); plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers"); points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red"); @@ -876,7 +878,7 @@ # Resistant regression (lqs / least trimmed squares method) regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data); regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_lqs.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lqs.pdf",sep="",collapse=""); pdf(outplot); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)"); points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red"); @@ -890,7 +892,7 @@ # Robust regression (rlm / Huber M-estimator method) regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data); regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_rlm.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_rlm.pdf",sep="",collapse=""); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm / Huber M-estimator method)"); points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red"); pdf(outplot); @@ -915,7 +917,7 @@ regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data); regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_poly2.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly2.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2"); points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red"); @@ -923,7 +925,7 @@ plot(regmodel_poly2); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly3.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly3.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3"); points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red"); @@ -931,7 +933,7 @@ plot(regmodel_poly3); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly4.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly4.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4"); points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red"); @@ -939,7 +941,7 @@ plot(regmodel_poly4); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly5.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly5.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5"); points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red"); @@ -947,7 +949,7 @@ plot(regmodel_poly5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly6.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly6.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6"); points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red"); @@ -960,7 +962,7 @@ regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data); regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values) - outplot = paste("./",outdir,"/PE_GE_gam.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_gam.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models"); points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red"); @@ -1019,7 +1021,7 @@ # Warning On - options(warn = oldw) + # options(warn = oldw) \ No newline at end of file
--- a/protein_rna_correlation.xml Sun Jun 17 05:06:18 2018 -0400 +++ b/protein_rna_correlation.xml Wed Jun 20 20:43:56 2018 -0400 @@ -1,28 +1,20 @@ -<tool id="protein_rna_correlation" name="protein_rna_correlation" version="0.1.1"> - <description>Correlation between protein and rna expression (Single Sample)</description> +<tool id="protein_rna_correlation" name="protein_rna_correlation" version="0.0.2"> + <description>Correlation between protein and transcript expression </description> <requirements> - <requirement type="package" version="3.3.1">r-base</requirement> - <requirement type="package" version="1.18.0">bioconductor-rgalaxy</requirement> - <requirement type="package" version="1.21.0">bioconductor-biocinstaller</requirement> - <requirement type="package" version="1.9">rmarkdown</requirement> - <requirement type="package" version="1.9">MASS</requirement> - <requirement type="package" version="1.8-23">mgcv</requirement> - <requirement type="package" version="0.4.1">DMwR</requirement> - <requirement type="package" version="1.11.4">data.table</requirement> - <requirement type="package" version="3.0.1">gplots</requirement> - <requirement type="package" version="0.20-35">lattice</requirement> - <requirement type="package" version="3.5.0">grid</requirement> - <requirement type="package" version="3.1-137">nlme</requirement> + <requirement type="package" version="3.2.1">R</requirement> + <requirement type="package" version="0.0.1">r-protrnacorr</requirement> + <requirement type="package" version="1.5.0">x11</requirement> + </requirements> <command detect_errors="exit_code" interpreter="Rscript"><![CDATA[protein_rna_correlation.r $pe_exp $ge_exp $pe_idcol $ge_idcol $pe_expcol $ge_expcol $pe_idtype $ge_idtype $organism_map $writeMapUnmap $doScale $html_file $html_file.files_path]]></command> <inputs> <param name="pe_exp" type="data" format="tabular"> <label>Input Protein Expression File</label> </param> - <param name="pe_idcol" type="integer"> + <param name="pe_idcol" type="integer" value="1"> <label>Column: Protein/Gene ID</label> </param> - <param name="pe_expcol" type="integer"> + <param name="pe_expcol" type="integer" value="1"> <label>Column: Protein Expression Values</label> </param> @@ -30,41 +22,41 @@ <label>Input RNA Expression File</label> </param> - <param name="ge_idcol" type="integer"> + <param name="ge_idcol" type="integer" value="1"> <label>Column: RNA/Gene ID</label> </param> - <param name="ge_expcol" type="integer"> + <param name="ge_expcol" type="integer" value="1"> <label>Column: RNA Expression Values</label> </param> - <param name="pe_idtype" type="select"> - <option value='ensembl' selected>Ensembl</option> - <option value='uniprot'>Uniprot</option> - <option value='hgnc'>HGNC</option> + <param name="pe_idtype" type="select" label="Protein ID type"> + <option value="Ensembl_with_version" selected="True">Ensembl</option> + <option value="uniprot">Uniprot</option> + <option value="hgnc">HGNC</option> </param> - <param name="ge_idtype" type="select"> - <option value='ensembl' selected>Ensembl</option> - <option value='uniprot'>Uniprot</option> - <option value='hgnc'>HGNC</option> + <param name="ge_idtype" type="select" label="Transcript ID type"> + <option value="Ensembl_with_version" selected="True">Ensembl</option> + <option value="uniprot">Uniprot</option> + <option value="hgnc">HGNC</option> </param> <param name="organism_map" type="data" format="tabular"> <label>Biomart ID Mapping file (.map)</label> </param> - <param name="writeMapUnmap" type="boolean"> + <param name="writeMapUnmap" type="boolean" checked="true" truevalue="1" falsevalue="0"> <label>Create the list of Mapped and Unmapped Identifiers in HTML</label> </param> - <param name="doScale" type="boolean"> + <param name="doScale" type="boolean" checked="true" truevalue="1" falsevalue="0"> <label>Scale the abundance values</label> </param> </inputs> <outputs> - <data format="html" name="html_file" label="protein_rna_corr_${tool_name}.html"/> + <data format="html" name="html_file" label="protein_rna_corr_${tool.name}.html"/> </outputs> <tests> @@ -78,14 +70,23 @@ <param name="pe_idtype" value="Ensembl_with_version"/> <param name="ge_idtype" value="Ensembl_with_version"/> <param name="organism_map" value="mmusculus_gene_ensembl__GRCm38.p6.map"/> - <param name="writeMapUnmap" value="1"/> - <param name="doScale" value="1"/> - <output name="html_file" file="PE_abundance_GE_abundance_pearson.html"/> + <param name="writeMapUnmap" value="true"/> + <param name="doScale" value="true"/> + <output name="html_file" file="protein_rna_corr_protein_transcript_correlation.html"/> </test> </tests> - <help> + <help><![CDATA[ Proteome Transcriptome Correlation Developer: Priyabrata Panigrahi - </help> + ]]></help> + <citations> + <citation type="bibtex"> +@misc{protein_rna_correlation, + author={Panigrahi, Priyabrata}, + year={2018}, + title={ProteinRNACorrelation} +} + </citation> + </citations> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_data/PE_abundance_GE_abundance_pearson.html Wed Jun 20 20:43:56 2018 -0400 @@ -0,0 +1,56 @@ +<html><body> +<h1>Association between proteomics and transcriptomics data</h1> + <font color='blue'><h3>Input data summary</h3></font> <ul> <li>Abbrebiations used: PE (Proteomics) and GE (Transcriptomics) </li> <li>Input PE data dimension (Row Column): 3597 58 </li> <li>Input GE data dimension (Row Column): 191650 14 </li> <li>Protein ID fetched from column: 7 </li> <li>Transcript ID fetched from column: 1 </li> <li>Protein ID type: ensembl_peptide_id_version </li> <li>Transcript ID type: ensembl_transcript_id_version </li> <li>Protein expression data fetched from column: 13 </li> <li>Transcript expression data fetched from column: 10 </li><li>Total Protein ID mapped: 3582 </li> <li>Total Protein ID unmapped: 15 </li> <li>Total Transcript ID mapped: 3582 </li> <li>Total Transcript ID unmapped: 188068 </li></ul><font color='blue'><h3>Download mapped unmapped data</h3></font> <ul><li>Protein mapped data: <a href=" output_fold/PE_mapped.tsv " target="_blank"> Link</a> </li> <li>Protein unmapped data: <a href=" output_fold/PE_unmapped.tsv " target="_blank"> Link</a> </li> <li>Transcript mapped data: <a href=" output_fold/GE_mapped.tsv " target="_blank"> Link</a> </li> <li>Transcript unmapped data: <a href=" output_fold/GE_unmapped.tsv " target="_blank"> Link</a> </li><li>Protein abundance data: <a href=" output_fold/PE_abundance.tsv " target="_blank"> Link</a> </li> <li>Transcript abundance data: <a href=" output_fold/GE_abundance.tsv " target="_blank"> Link</a> </li></ul><ul> <li>Number of entries in Transcriptome data used for correlation: 3582 </li> <li>Number of entries in Proteome data used for correlation: 3582 </li></ul><font color='blue'><h3>Filtering</h3></font> Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry<br> <ul> <li>Number of NA found: 88 </li> <li>Number of Inf or -Inf found: 559 </li></ul><ul><li>Protein excluded data with NA or Inf or -Inf: <a href=" output_fold/PE_excluded_NA_Inf.tsv " target="_blank"> Link</a> </li> <li>Transcript excluded data with NA or Inf or -Inf: <a href=" output_fold/GE_excluded_NA_Inf.tsv " target="_blank"> Link</a> </li></ul><font color='blue'><h3>Filtered data summary</h3></font> Excluding entires with abundance values: NA/Inf/-Inf<br> <ul> <li>Number of entries in Transcriptome data remained: 2949 </li> <li>Number of entries in Proteome data remained: 2949 </li></ul><font color='blue'><h3>Proteome data summary</h3></font> + <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td> </td><td> Min. :-2.98277 </td></tr> +<tr><td> </td><td> 1st Qu.:-0.40393 </td></tr> +<tr><td> </td><td> Median :-0.07986 </td></tr> +<tr><td> </td><td> Mean : 0.00000 </td></tr> +<tr><td> </td><td> 3rd Qu.: 0.26061 </td></tr> +<tr><td> </td><td> Max. :15.13211 </td></tr> +</table> +<font color='blue'><h3>Transcriptome data summary</h3></font> + <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td> </td><td> Min. :-8.33003 </td></tr> +<tr><td> </td><td> 1st Qu.:-0.06755 </td></tr> +<tr><td> </td><td> Median : 0.09635 </td></tr> +<tr><td> </td><td> Mean : 0.00000 </td></tr> +<tr><td> </td><td> 3rd Qu.: 0.18103 </td></tr> +<tr><td> </td><td> Max. : 8.50430 </td></tr> +</table> +<font color='blue'><h3>Distribution of Proteome and Transcripome abundance (Box plot and Density plot)</h3></font> + <img src="AbundancePlot.png"><font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font> + <img src="AbundancePlot_scatter.png"><font color='blue'><h3>Correlation with all data</h3></font> + <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr><tr><td>Correlation method used</td><td> Pearson's product-moment correlation </td><td> Spearman's rank correlation rho </td><td> Kendall's rank correlation tau </td></tr> <tr><td>Correlation</td><td> -0.003584536 </td><td> 0.01866248 </td><td> 0.01280742 </td></tr> <tr><td>Pvalue</td><td> 0.8457255 </td><td> 0.3110035 </td><td> 0.314683 </td></tr></table> +<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>cook's distance based approach</u> to identify such influential observations.</font><font color='blue'><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font> + <p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p> + <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td>Formula</td><td> PE_abundance~GE_abundance </td></tr> + <tr><td colspan='2' align='center'> <b>Coefficients</b></td> </tr> + <tr><td> (Intercept) </td><td> 1.727289e-16 (Pvalue: 1 ) </td></tr> + <tr><td> GE_abundance </td><td> -0.003584536 (Pvalue: 0.8457255 ) </td></tr> + <tr><td colspan='2' align='center'> <b>Model parameters</b></td> </tr> + <tr><td>Residual standard error</td><td> 1.000163 ( 2947 degree of freedom)</td></tr> + <tr><td>F-statistic</td><td> 0.0378662 ( on 1 and 2947 degree of freedom)</td></tr> + <tr><td>R-squared</td><td> 1.28489e-05 </td></tr> + <tr><td>Adjusted R-squared</td><td> -0.0003264749 </td></tr> +</table> +<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font> +<u><font color='brown'><h4>Residuals vs Fitted plot</h4></font></u> + <img src="PE_GE_lm_1.png"> <br><br>This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship<br><u><font color='brown'><h4>Normal Q-Q plot of residuals</h4></font></u> + <img src="PE_GE_lm_2.png"> <br><br>This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.<br><u><font color='brown'><h4>Scale-Location (or Spread-Location) plot</h4></font></u> + <img src="PE_GE_lm_3.png"> <br><br>This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.<br><u><font color='brown'><h4>Residuals vs Leverage plot</h4></font></u> + <img src="PE_GE_lm_3.png"> <br><br>This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.<br><font color='blue'><h3>Identify influential observations</h3></font> +<p><b>Cook’s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>cook’s distance > than 4 times the mean</b> may be classified as <b>influential.</b></p><img src="PE_GE_lm_cooksd.png"> <br>In the above plot, observations above red line (4*mean cook's distance) are influential, marked in <b>*</b>. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td>Mean cook's distance</td><td> 0.0002988385 </td></tr> + <tr><td>Total influential observations (cook's distance > 4 * mean cook's distance)</td><td> 90 </td></tr> + <tr><td>Total influential observations (cook's distance > 3 * mean cook's distance)</td><td> 116 </td></tr> + </table> <font color="brown"><h4>Top 10 influential observations (cook's distance > 4 * mean cook's distance)</h4></font><a href=" ./output_fold/PE_GE_influential_observation.tsv " target="_blank">Download entire list</a><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>PE_ID</th><th>PE_abundance</th><th>GE_ID</th><th>GE_abundance</th><th>cooksd</th></tr><tr> <td> ENSMUSP00000107109.2 </td> <td> -0.4719799 </td> <td> ENSMUST00000001126 </td> <td> -5.301664 </td> <td> 0.001213545 </td></tr><tr> <td> ENSMUSP00000151536.1 </td> <td> 3.113811 </td> <td> ENSMUST00000001256 </td> <td> -0.6348804 </td> <td> 0.00230483 </td></tr><tr> <td> ENSMUSP00000150261.1 </td> <td> 2.914045 </td> <td> ENSMUST00000001583 </td> <td> 0.4988006 </td> <td> 0.001801232 </td></tr><tr> <td> ENSMUSP00000111204.1 </td> <td> 2.850989 </td> <td> ENSMUST00000002073 </td> <td> 0.09635024 </td> <td> 0.001391751 </td></tr><tr> <td> ENSMUSP00000089336.4 </td> <td> 1.219945 </td> <td> ENSMUST00000002391 </td> <td> -2.47573 </td> <td> 0.001781417 </td></tr><tr> <td> ENSMUSP00000030805.7 </td> <td> -0.8313093 </td> <td> ENSMUST00000003469 </td> <td> 3.660597 </td> <td> 0.001650483 </td></tr><tr> <td> ENSMUSP00000011492.8 </td> <td> -0.3735374 </td> <td> ENSMUST00000004326 </td> <td> -7.366491 </td> <td> 0.001556623 </td></tr><tr> <td> ENSMUSP00000029658.7 </td> <td> 9.120211 </td> <td> ENSMUST00000004473 </td> <td> 0.09635024 </td> <td> 0.01423993 </td></tr><tr> <td> ENSMUSP00000099904.4 </td> <td> -1.913743 </td> <td> ENSMUST00000004673 </td> <td> 0.9756628 </td> <td> 0.001209039 </td></tr><tr> <td> ENSMUSP00000081956.8 </td> <td> 3.674308 </td> <td> ENSMUST00000005607 </td> <td> 1.306612 </td> <td> 0.006223403 </td></tr></table><font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations</h3></font> + <img src="AbundancePlot_scatter_without_outliers.png"><font color='blue'><h3>Correlation with removal of outliers / influential observations</h3></font> + <p>We removed the influential observations and reestimated the correlation values.</p><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr><tr><td>Correlation method used</td><td> Pearson's product-moment correlation </td><td> Spearman's rank correlation rho </td><td> Kendall's rank correlation tau </td></tr> <tr><td>Correlation</td><td> 0.01485058 </td><td> 0.0246989 </td><td> 0.01689519 </td></tr> <tr><td>Pvalue</td><td> 0.4273403 </td><td> 0.1867467 </td><td> 0.1918906 </td></tr></table> +<font color='blue'><h3>Heatmap of PE and GE abundance values</h3></font> +<img src="PE_GE_heatmap.png"><font color='blue'><h3>Kmean clustering</h3></font> +Number of Clusters: 5<br><a href=" output_fold/PE_GE_kmeans_clusterpoints.txt " target="_blank">Download cluster list</a><br><img src="PE_GE_kmeans.png"><font color='blue'><h3>Other regression model fitting</h3></font> +<ul> + <li>MAE:mean absolute error</li> + <li>MSE: mean squared error</li> + <li>RMSE:root mean squared error ( sqrt(MSE) )</li> + <li>MAPE:mean absolute percentage error</li> + </ul> + <h4><a href="PE_GE_modelfit.pdf" target="_blank">Comparison of model fits</a></h4><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Model</th><th>MAE</th><th>MSE</th><th>RMSE</th><th>MAPE</th><th>Diagnostics Plot</th></tr><tr><td>Linear regression with all data</td><td> 0.5463329 </td><td> 0.9996481 </td><td> 0.999824 </td><td> 0.9996321 </td><td> <a href="PE_GE_lm.pdf" target="_blank">Link</a> </td></tr> <tr><td>Linear regression with removal of outliers</td><td> 0.5404805 </td><td> 1.006281 </td><td> 1.003136 </td><td> 1.455637 </td><td> <a href="PE_GE_lm_without_outliers.pdf" target="_blank">Link</a> </td></tr> <tr><td>Resistant regression (lqs / least trimmed squares method)</td><td> 0.5407598 </td><td> 1.007932 </td><td> 1.003958 </td><td> 1.537172 </td><td> <a href="PE_GE_lqs.pdf" target="_blank">Link</a> </td></tr> <tr><td>Robust regression (rlm / Huber M-estimator method)</td><td> 0.5404879 </td><td> 1.005054 </td><td> 1.002524 </td><td> 1.411806 </td><td> <a href="PE_GE_rlm.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 2</td><td> 0.546322 </td><td> 0.9996472 </td><td> 0.9998236 </td><td> 0.9993865 </td><td> <a href="PE_GE_poly2.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 3</td><td> 0.5469588 </td><td> 0.9976384 </td><td> 0.9988185 </td><td> 1.043158 </td><td> <a href="PE_GE_poly3.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 4</td><td> 0.5467885 </td><td> 0.9975077 </td><td> 0.9987531 </td><td> 1.041541 </td><td> <a href="PE_GE_poly4.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 5</td><td> 0.5467813 </td><td> 0.9975076 </td><td> 0.998753 </td><td> 1.041209 </td><td> <a href="PE_GE_poly5.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 6</td><td> 0.5465911 </td><td> 0.996652 </td><td> 0.9983246 </td><td> 1.056632 </td><td> <a href="PE_GE_poly6.pdf" target="_blank">Link</a> </td></tr> <tr><td>Generalized additive models</td><td> 0.5463695 </td><td> 0.9976796 </td><td> 0.9988391 </td><td> 1.032766 </td><td> <a href="PE_GE_gam.pdf" target="_blank">Link</a> </td></tr> </table> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Jun 20 20:43:56 2018 -0400 @@ -0,0 +1,44 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="R" version="3.2.1"> + <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="r-protrnacorr" version="0.0.1"> + <install version="1.0"> + <actions> + <action type="setup_r_environment"> + <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.2.1" /> + </repository> + <package>https://cran.r-project.org/src/contrib/MASS_7.3-50.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/mgcv_1.8-24.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/zoo_1.8-2.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/xts_0.10-2.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/curl_3.2.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/TTR_0.23-3.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/quantmod_0.4-13.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/abind_1.4-5.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/gtools_3.5.0.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/gdata_2.18.0.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/bitops_1.0-6.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/caTools_1.17.1.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/gplots_3.0.1.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/ROCR_1.0-7.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/lattice_0.20-35.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/DMwR_0.4.1.tar.gz</package> + <package>https://cran.r-project.org/src/contrib/data.table_1.11.4.tar.gz</package> + <!--<package>https://cran.r-project.org/src/contrib/Archive/grid/grid_0.7-3.tar.gz</package>--> + <package>https://cran.r-project.org/src/contrib/Archive/nlme/nlme_3.1-120.tar.gz</package> + <!--<package>https://cran.r-project.org/src/contrib/nlme_3.1-137.tar.gz</package>--> + </action> + </actions> + </install> + <readme> + Contains a tool dependency definition that downloads and installs protein rna correlation tool package for R library. + </readme> + </package> + <package name="x11" version="1.5.0"> + <repository changeset_revision="2ecb3f1f8fbb" name="package_libx11_1_5_0" owner="devteam" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>