Mercurial > repos > pravs > protein_rna_correlation

--- a/protein_rna_correlation.r	Sun Jun 17 05:06:18 2018 -0400
+++ b/protein_rna_correlation.r	Wed Jun 20 20:43:56 2018 -0400
@@ -80,8 +80,8 @@
 # ............................SCRIPT STARTS FROM HERE .............................
 #==================================================================================
 # Warning Off
-oldw <- getOption("warn")
-options(warn = -1)
+# oldw <- getOption("warn")
+# options(warn = -1)
 #=============================================================================================================
 # Functions
 #=============================================================================================================
@@ -411,10 +411,10 @@

 			cat(
 		"<font color='blue'><h3>Download mapped unmapped data</h3></font>",
-		"<ul><li>Protein mapped data: ", '<a href="',paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
-		"<li>Protein unmapped data: ", '<a href="',paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
-		"<li>Transcript mapped data: ", '<a href="',paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
-		"<li>Transcript unmapped data: ", '<a href="',paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+		"<ul><li>Protein mapped data: ", '<a href="',paste(PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+		"<li>Protein unmapped data: ", '<a href="',paste(PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+		"<li>Transcript mapped data: ", '<a href="',paste(GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+		"<li>Transcript unmapped data: ", '<a href="',paste(GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
 		file = htmloutfile, append = TRUE);
 	}

@@ -422,8 +422,8 @@
 	write.table(GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)], file=paste(outdir,"/",GE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")

 		cat(
-	"<li>Protein abundance data: ", '<a href="',paste(outdir,"/",PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
-	"<li>Transcript abundance data: ", '<a href="',paste(outdir,"/",GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
+	"<li>Protein abundance data: ", '<a href="',paste(PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+	"<li>Transcript abundance data: ", '<a href="',paste(GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
 	file = htmloutfile, append = TRUE);

 #==========================================================================================
@@ -470,8 +470,8 @@

 		# Write excluded transcriptomics and proteomics data link to html file
 		cat(
-		"<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
-		"<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
+		"<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
+		"<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
 		file = htmloutfile, append = TRUE);

 		# Keep the unexcluded entries only
@@ -535,7 +535,7 @@
 # Distribution of proteome and transcriptome abundance (Box and Density plot)
 #=============================================================================================================
 	cat("Generating Box and Density plot\n",file=logfile, append=T);
-	outplot = paste("./",outdir,"/AbundancePlot.png",sep="",collapse="");
+	outplot = paste(outdir,"/AbundancePlot.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(2,2));
 	boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5);
@@ -553,7 +553,7 @@
 # Scatter plot
 #=============================================================================================================
 	cat("Generating scatter plot\n",file=logfile, append=T);
-	outplot = paste("./",outdir,"/AbundancePlot_scatter.png",sep="",collapse="");
+	outplot = paste(outdir,"/AbundancePlot_scatter.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
@@ -620,7 +620,7 @@
 #=============================================================================================================
 # Plotting various regression diagnostics plots
 #=============================================================================================================
-	outplot1 = paste("./",outdir,"/PE_GE_modelfit.pdf",sep="",collapse="");
+	outplot1 = paste(outdir,"/PE_GE_modelfit.pdf",sep="",collapse="");
 	pdf(outplot1);
 	devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1
 	print(.Devices)
@@ -635,31 +635,31 @@
 	"<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>\n",
 	file = htmloutfile, append = TRUE);

-	outplot = paste("./",outdir,"/PE_GE_lm_1.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_1.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	plot(regmodel, 1, cex.lab=1.5);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_lm_2.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_2.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	plot(regmodel, 2, cex.lab=1.5);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_lm_3.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_3.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	plot(regmodel, 3, cex.lab=1.5);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_lm_5.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_5.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	plot(regmodel, 5, cex.lab=1.5);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_lm.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm.pdf",sep="",collapse="");
 	pdf(outplot);
 	plot(regmodel);
 	dev.off();
@@ -704,7 +704,7 @@
 	cooksd <- cooks.distance(regmodel);

 	cat("Generating cooksd plot\n",file=logfile, append=T);
-	outplot = paste("./",outdir,"/PE_GE_lm_cooksd.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_cooksd.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations")  # plot cooks distance
@@ -737,7 +737,7 @@

 	cat("Writing influential observations\n",file=logfile, append=T);

-	outdatafile = paste("./",outdir,"/PE_GE_influential_observation.tsv", sep="", collapse="");
+	outdatafile = paste(outdir,"/PE_GE_influential_observation.tsv", sep="", collapse="");
 	cat('<a href="',outdatafile, '" target="_blank">Download entire list</a>',file = htmloutfile, append = TRUE);
 	write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F);

@@ -764,7 +764,7 @@
 	#=============================================================================================================
 	# Scatter plot
 	#=============================================================================================================
-	outplot = paste("./",outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse="");
+	outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
@@ -806,10 +806,12 @@
 	file = htmloutfile, append = TRUE);

 	cat("Generating heatmap plot\n",file=logfile, append=T);
-	outplot = paste("./",outdir,"/PE_GE_heatmap.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_heatmap.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
-	heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col");
+	#heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col");
+	my_palette <- colorRampPalette(c("green", "white", "red"))(299);
+	heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=my_palette ,Colv=F, labCol=c("PE","GE"), scale="col", dendrogram = "row");
 	dev.off();

 	cat(
@@ -825,7 +827,7 @@

 	k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5);
 	cat("Generating kmeans plot\n",file=logfile, append=T);
-	outplot = paste("./",outdir,"/PE_GE_kmeans.png",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_kmeans.png",sep="",collapse="");
 	png(outplot);
 	par(mfrow=c(1,1));
 	scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
@@ -864,7 +866,7 @@
 	# Linear regression with removal of outliers
 	regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier);
 	regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier);
-	outplot = paste("./",outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse="");
 	plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers");
 	points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red");

@@ -876,7 +878,7 @@
 	# Resistant regression (lqs / least trimmed squares method)
 	regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data);
 	regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data);
-	outplot = paste("./",outdir,"/PE_GE_lqs.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_lqs.pdf",sep="",collapse="");
 	pdf(outplot);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)");
 	points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red");
@@ -890,7 +892,7 @@
 	# Robust regression (rlm /  Huber M-estimator method)
 	regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data);
 	regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data);
-	outplot = paste("./",outdir,"/PE_GE_rlm.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_rlm.pdf",sep="",collapse="");
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm /  Huber M-estimator method)");
 	points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red");
 	pdf(outplot);
@@ -915,7 +917,7 @@
 	regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data);
 	regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data);

-	outplot = paste("./",outdir,"/PE_GE_poly2.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_poly2.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2");
 	points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red");
@@ -923,7 +925,7 @@
 	plot(regmodel_poly2);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_poly3.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_poly3.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3");
 	points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red");
@@ -931,7 +933,7 @@
 	plot(regmodel_poly3);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_poly4.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_poly4.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4");
 	points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red");
@@ -939,7 +941,7 @@
 	plot(regmodel_poly4);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_poly5.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_poly5.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5");
 	points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red");
@@ -947,7 +949,7 @@
 	plot(regmodel_poly5);
 	dev.off();

-	outplot = paste("./",outdir,"/PE_GE_poly6.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_poly6.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6");
 	points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red");
@@ -960,7 +962,7 @@
 	regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data);

 	regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values)
-	outplot = paste("./",outdir,"/PE_GE_gam.pdf",sep="",collapse="");
+	outplot = paste(outdir,"/PE_GE_gam.pdf",sep="",collapse="");
 	dev.set(devnum);
 	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models");
 	points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red");
@@ -1019,7 +1021,7 @@


 	# Warning On
-	options(warn = oldw)
+	# options(warn = oldw)


\ No newline at end of file
--- a/protein_rna_correlation.xml	Sun Jun 17 05:06:18 2018 -0400
+++ b/protein_rna_correlation.xml	Wed Jun 20 20:43:56 2018 -0400
@@ -1,28 +1,20 @@
-<tool id="protein_rna_correlation" name="protein_rna_correlation" version="0.1.1">
-  <description>Correlation between protein and rna expression (Single Sample)</description>
+<tool id="protein_rna_correlation" name="protein_rna_correlation" version="0.0.2">
+  <description>Correlation between protein and transcript expression  </description>
   <requirements>
-    <requirement type="package" version="3.3.1">r-base</requirement>
-    <requirement type="package" version="1.18.0">bioconductor-rgalaxy</requirement>
-    <requirement type="package" version="1.21.0">bioconductor-biocinstaller</requirement>
-    <requirement type="package" version="1.9">rmarkdown</requirement>
-    <requirement type="package" version="1.9">MASS</requirement>
-    <requirement type="package" version="1.8-23">mgcv</requirement>
-    <requirement type="package" version="0.4.1">DMwR</requirement>
-    <requirement type="package" version="1.11.4">data.table</requirement>
-    <requirement type="package" version="3.0.1">gplots</requirement>
-    <requirement type="package" version="0.20-35">lattice</requirement>
-    <requirement type="package" version="3.5.0">grid</requirement>
-    <requirement type="package" version="3.1-137">nlme</requirement>
+    <requirement type="package" version="3.2.1">R</requirement>
+    <requirement type="package" version="0.0.1">r-protrnacorr</requirement>
+    <requirement type="package" version="1.5.0">x11</requirement>
+
   </requirements>
   <command detect_errors="exit_code" interpreter="Rscript"><![CDATA[protein_rna_correlation.r $pe_exp $ge_exp $pe_idcol $ge_idcol $pe_expcol $ge_expcol $pe_idtype $ge_idtype $organism_map $writeMapUnmap $doScale $html_file $html_file.files_path]]></command>
   <inputs>
     <param name="pe_exp" type="data" format="tabular">
       <label>Input Protein Expression File</label>
     </param>
-    <param name="pe_idcol" type="integer">
+    <param name="pe_idcol" type="integer" value="1">
       <label>Column: Protein/Gene ID</label>
     </param>
-    <param name="pe_expcol" type="integer">
+    <param name="pe_expcol" type="integer" value="1">
       <label>Column: Protein Expression Values</label>
     </param>

@@ -30,41 +22,41 @@
       <label>Input RNA Expression File</label>
     </param>

-    <param name="ge_idcol" type="integer">
+    <param name="ge_idcol" type="integer" value="1">
       <label>Column: RNA/Gene ID</label>
     </param>
-    <param name="ge_expcol" type="integer">
+    <param name="ge_expcol" type="integer" value="1">
       <label>Column: RNA Expression Values</label>
     </param>

-    <param name="pe_idtype" type="select">
-        <option value='ensembl' selected>Ensembl</option>
-        <option value='uniprot'>Uniprot</option>
-        <option value='hgnc'>HGNC</option>
+    <param name="pe_idtype" type="select" label="Protein ID type">
+        <option value="Ensembl_with_version" selected="True">Ensembl</option>
+        <option value="uniprot">Uniprot</option>
+        <option value="hgnc">HGNC</option>
     </param>

-    <param name="ge_idtype" type="select">
-        <option value='ensembl' selected>Ensembl</option>
-        <option value='uniprot'>Uniprot</option>
-        <option value='hgnc'>HGNC</option>
+    <param name="ge_idtype" type="select" label="Transcript ID type">
+        <option value="Ensembl_with_version" selected="True">Ensembl</option>
+        <option value="uniprot">Uniprot</option>
+        <option value="hgnc">HGNC</option>
     </param>

     <param name="organism_map" type="data" format="tabular">
         <label>Biomart ID Mapping file (.map)</label>
     </param>

-    <param name="writeMapUnmap" type="boolean">
+    <param name="writeMapUnmap" type="boolean" checked="true" truevalue="1" falsevalue="0">
       <label>Create the list of Mapped and Unmapped Identifiers in HTML</label>
     </param>

-    <param name="doScale" type="boolean">
+    <param name="doScale" type="boolean" checked="true" truevalue="1" falsevalue="0">
       <label>Scale the abundance values</label>
     </param>

   </inputs>

   <outputs>
-    <data format="html" name="html_file" label="protein_rna_corr_${tool_name}.html"/>
+    <data format="html" name="html_file" label="protein_rna_corr_${tool.name}.html"/>
   </outputs>

   <tests>
@@ -78,14 +70,23 @@
             <param name="pe_idtype" value="Ensembl_with_version"/>
             <param name="ge_idtype" value="Ensembl_with_version"/>
             <param name="organism_map" value="mmusculus_gene_ensembl__GRCm38.p6.map"/>
-            <param name="writeMapUnmap" value="1"/>
-            <param name="doScale" value="1"/>
-            <output name="html_file" file="PE_abundance_GE_abundance_pearson.html"/>
+            <param name="writeMapUnmap" value="true"/>
+            <param name="doScale" value="true"/>
+            <output name="html_file" file="protein_rna_corr_protein_transcript_correlation.html"/>
         </test>
   </tests>

-  <help>
+  <help><![CDATA[
 Proteome Transcriptome Correlation
 Developer: Priyabrata Panigrahi
-  </help>
+  ]]></help>
+  <citations>
+      <citation type="bibtex">
+@misc{protein_rna_correlation,
+    author={Panigrahi, Priyabrata},
+    year={2018},
+    title={ProteinRNACorrelation}
+}
+      </citation>
+    </citations>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_data/PE_abundance_GE_abundance_pearson.html	Wed Jun 20 20:43:56 2018 -0400
@@ -0,0 +1,56 @@
+<html><body>
+<h1>Association between proteomics and transcriptomics data</h1>
+ <font color='blue'><h3>Input data summary</h3></font> <ul> <li>Abbrebiations used: PE (Proteomics) and GE (Transcriptomics) </li> <li>Input PE data dimension (Row Column):  3597 58 </li> <li>Input GE data dimension (Row Column):  191650 14 </li> <li>Protein ID fetched from column:  7 </li> <li>Transcript ID fetched from column:  1 </li> <li>Protein ID type:  ensembl_peptide_id_version </li> <li>Transcript ID type:  ensembl_transcript_id_version </li> <li>Protein expression data fetched from column:  13 </li> <li>Transcript expression data fetched from column:  10 </li><li>Total Protein ID mapped:  3582 </li> <li>Total Protein ID unmapped:  15 </li> <li>Total Transcript ID mapped:  3582 </li> <li>Total Transcript ID unmapped:  188068 </li></ul><font color='blue'><h3>Download mapped unmapped data</h3></font> <ul><li>Protein mapped data:  <a href=" output_fold/PE_mapped.tsv " target="_blank"> Link</a> </li> <li>Protein unmapped data:  <a href=" output_fold/PE_unmapped.tsv " target="_blank"> Link</a> </li> <li>Transcript mapped data:  <a href=" output_fold/GE_mapped.tsv " target="_blank"> Link</a> </li> <li>Transcript unmapped data:  <a href=" output_fold/GE_unmapped.tsv " target="_blank"> Link</a> </li><li>Protein abundance data:  <a href=" output_fold/PE_abundance.tsv " target="_blank"> Link</a> </li> <li>Transcript abundance data:  <a href=" output_fold/GE_abundance.tsv " target="_blank"> Link</a> </li></ul><ul> <li>Number of entries in Transcriptome data used for correlation:  3582 </li> <li>Number of entries in Proteome data used for correlation:  3582 </li></ul><font color='blue'><h3>Filtering</h3></font> Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry<br> <ul> <li>Number of NA found:  88 </li> <li>Number of Inf or -Inf found:  559 </li></ul><ul><li>Protein excluded data with NA or Inf or -Inf:  <a href=" output_fold/PE_excluded_NA_Inf.tsv " target="_blank"> Link</a> </li> <li>Transcript excluded data with NA or Inf or -Inf:  <a href=" output_fold/GE_excluded_NA_Inf.tsv " target="_blank"> Link</a> </li></ul><font color='blue'><h3>Filtered data summary</h3></font> Excluding entires with abundance values: NA/Inf/-Inf<br> <ul> <li>Number of entries in Transcriptome data remained:  2949 </li> <li>Number of entries in Proteome data remained:  2949 </li></ul><font color='blue'><h3>Proteome data summary</h3></font>
+ <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td> </td><td> Min.   :-2.98277   </td></tr>
+<tr><td> </td><td> 1st Qu.:-0.40393   </td></tr>
+<tr><td> </td><td> Median :-0.07986   </td></tr>
+<tr><td> </td><td> Mean   : 0.00000   </td></tr>
+<tr><td> </td><td> 3rd Qu.: 0.26061   </td></tr>
+<tr><td> </td><td> Max.   :15.13211   </td></tr>
+</table>
+<font color='blue'><h3>Transcriptome data summary</h3></font>
+ <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td> </td><td> Min.   :-8.33003   </td></tr>
+<tr><td> </td><td> 1st Qu.:-0.06755   </td></tr>
+<tr><td> </td><td> Median : 0.09635   </td></tr>
+<tr><td> </td><td> Mean   : 0.00000   </td></tr>
+<tr><td> </td><td> 3rd Qu.: 0.18103   </td></tr>
+<tr><td> </td><td> Max.   : 8.50430   </td></tr>
+</table>
+<font color='blue'><h3>Distribution of Proteome and Transcripome abundance (Box plot and Density plot)</h3></font>
+ <img src="AbundancePlot.png"><font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>
+ <img src="AbundancePlot_scatter.png"><font color='blue'><h3>Correlation with all data</h3></font>
+ <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr><tr><td>Correlation method used</td><td> Pearson's product-moment correlation </td><td> Spearman's rank correlation rho </td><td> Kendall's rank correlation tau </td></tr> <tr><td>Correlation</td><td> -0.003584536 </td><td> 0.01866248 </td><td> 0.01280742 </td></tr> <tr><td>Pvalue</td><td> 0.8457255 </td><td> 0.3110035 </td><td> 0.314683 </td></tr></table>
+<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>cook's distance based approach</u> to identify such influential observations.</font><font color='blue'><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>
+ <p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>
+ <table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td>Formula</td><td> PE_abundance~GE_abundance </td></tr>
+ <tr><td colspan='2' align='center'> <b>Coefficients</b></td> </tr>
+ <tr><td> (Intercept) </td><td> 1.727289e-16  (Pvalue: 1 ) </td></tr>
+ <tr><td> GE_abundance </td><td> -0.003584536  (Pvalue: 0.8457255 ) </td></tr>
+ <tr><td colspan='2' align='center'> <b>Model parameters</b></td> </tr>
+ <tr><td>Residual standard error</td><td> 1.000163  ( 2947  degree of freedom)</td></tr>
+ <tr><td>F-statistic</td><td> 0.0378662  ( on  1  and   2947  degree of freedom)</td></tr>
+ <tr><td>R-squared</td><td> 1.28489e-05 </td></tr>
+ <tr><td>Adjusted R-squared</td><td> -0.0003264749 </td></tr>
+</table>
+<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>
+<u><font color='brown'><h4>Residuals vs Fitted plot</h4></font></u>
+ <img src="PE_GE_lm_1.png"> <br><br>This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship<br><u><font color='brown'><h4>Normal Q-Q plot of residuals</h4></font></u>
+ <img src="PE_GE_lm_2.png"> <br><br>This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.<br><u><font color='brown'><h4>Scale-Location (or Spread-Location) plot</h4></font></u>
+ <img src="PE_GE_lm_3.png"> <br><br>This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.<br><u><font color='brown'><h4>Residuals vs Leverage plot</h4></font></u>
+ <img src="PE_GE_lm_3.png"> <br><br>This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.<br><font color='blue'><h3>Identify influential observations</h3></font>
+<p><b>Cook’s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>cook’s distance > than 4 times the mean</b> may be classified as <b>influential.</b></p><img src="PE_GE_lm_cooksd.png"> <br>In the above plot, observations above red line (4*mean cook's distance) are influential, marked in <b>*</b>. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr><tr><td>Mean cook's distance</td><td> 0.0002988385 </td></tr>
+ <tr><td>Total influential observations (cook's distance > 4 * mean cook's distance)</td><td> 90 </td></tr>
+ <tr><td>Total influential observations (cook's distance > 3 * mean cook's distance)</td><td> 116 </td></tr>
+ </table> <font color="brown"><h4>Top 10 influential observations (cook's distance > 4 * mean cook's distance)</h4></font><a href=" ./output_fold/PE_GE_influential_observation.tsv " target="_blank">Download entire list</a><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>PE_ID</th><th>PE_abundance</th><th>GE_ID</th><th>GE_abundance</th><th>cooksd</th></tr><tr> <td> ENSMUSP00000107109.2 </td> <td> -0.4719799 </td> <td> ENSMUST00000001126 </td> <td> -5.301664 </td> <td> 0.001213545 </td></tr><tr> <td> ENSMUSP00000151536.1 </td> <td> 3.113811 </td> <td> ENSMUST00000001256 </td> <td> -0.6348804 </td> <td> 0.00230483 </td></tr><tr> <td> ENSMUSP00000150261.1 </td> <td> 2.914045 </td> <td> ENSMUST00000001583 </td> <td> 0.4988006 </td> <td> 0.001801232 </td></tr><tr> <td> ENSMUSP00000111204.1 </td> <td> 2.850989 </td> <td> ENSMUST00000002073 </td> <td> 0.09635024 </td> <td> 0.001391751 </td></tr><tr> <td> ENSMUSP00000089336.4 </td> <td> 1.219945 </td> <td> ENSMUST00000002391 </td> <td> -2.47573 </td> <td> 0.001781417 </td></tr><tr> <td> ENSMUSP00000030805.7 </td> <td> -0.8313093 </td> <td> ENSMUST00000003469 </td> <td> 3.660597 </td> <td> 0.001650483 </td></tr><tr> <td> ENSMUSP00000011492.8 </td> <td> -0.3735374 </td> <td> ENSMUST00000004326 </td> <td> -7.366491 </td> <td> 0.001556623 </td></tr><tr> <td> ENSMUSP00000029658.7 </td> <td> 9.120211 </td> <td> ENSMUST00000004473 </td> <td> 0.09635024 </td> <td> 0.01423993 </td></tr><tr> <td> ENSMUSP00000099904.4 </td> <td> -1.913743 </td> <td> ENSMUST00000004673 </td> <td> 0.9756628 </td> <td> 0.001209039 </td></tr><tr> <td> ENSMUSP00000081956.8 </td> <td> 3.674308 </td> <td> ENSMUST00000005607 </td> <td> 1.306612 </td> <td> 0.006223403 </td></tr></table><font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations</h3></font>
+ <img src="AbundancePlot_scatter_without_outliers.png"><font color='blue'><h3>Correlation with removal of outliers / influential observations</h3></font>
+ <p>We removed the influential observations and reestimated the correlation values.</p><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr><tr><td>Correlation method used</td><td> Pearson's product-moment correlation </td><td> Spearman's rank correlation rho </td><td> Kendall's rank correlation tau </td></tr> <tr><td>Correlation</td><td> 0.01485058 </td><td> 0.0246989 </td><td> 0.01689519 </td></tr> <tr><td>Pvalue</td><td> 0.4273403 </td><td> 0.1867467 </td><td> 0.1918906 </td></tr></table>
+<font color='blue'><h3>Heatmap of PE and GE abundance values</h3></font>
+<img src="PE_GE_heatmap.png"><font color='blue'><h3>Kmean clustering</h3></font>
+Number of Clusters: 5<br><a href=" output_fold/PE_GE_kmeans_clusterpoints.txt " target="_blank">Download cluster list</a><br><img src="PE_GE_kmeans.png"><font color='blue'><h3>Other regression model fitting</h3></font>
+<ul>
+	<li>MAE:mean absolute error</li>
+	<li>MSE: mean squared error</li>
+	<li>RMSE:root mean squared error ( sqrt(MSE) )</li>
+	<li>MAPE:mean absolute percentage error</li>
+	</ul>
+	<h4><a href="PE_GE_modelfit.pdf" target="_blank">Comparison of model fits</a></h4><table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Model</th><th>MAE</th><th>MSE</th><th>RMSE</th><th>MAPE</th><th>Diagnostics Plot</th></tr><tr><td>Linear regression with all data</td><td> 0.5463329 </td><td> 0.9996481 </td><td> 0.999824 </td><td> 0.9996321 </td><td> <a href="PE_GE_lm.pdf" target="_blank">Link</a> </td></tr> <tr><td>Linear regression with removal of outliers</td><td> 0.5404805 </td><td> 1.006281 </td><td> 1.003136 </td><td> 1.455637 </td><td> <a href="PE_GE_lm_without_outliers.pdf" target="_blank">Link</a> </td></tr> <tr><td>Resistant regression (lqs / least trimmed squares method)</td><td> 0.5407598 </td><td> 1.007932 </td><td> 1.003958 </td><td> 1.537172 </td><td> <a href="PE_GE_lqs.pdf" target="_blank">Link</a> </td></tr> <tr><td>Robust regression (rlm /  Huber M-estimator method)</td><td> 0.5404879 </td><td> 1.005054 </td><td> 1.002524 </td><td> 1.411806 </td><td> <a href="PE_GE_rlm.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 2</td><td> 0.546322 </td><td> 0.9996472 </td><td> 0.9998236 </td><td> 0.9993865 </td><td> <a href="PE_GE_poly2.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 3</td><td> 0.5469588 </td><td> 0.9976384 </td><td> 0.9988185 </td><td> 1.043158 </td><td> <a href="PE_GE_poly3.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 4</td><td> 0.5467885 </td><td> 0.9975077 </td><td> 0.9987531 </td><td> 1.041541 </td><td> <a href="PE_GE_poly4.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 5</td><td> 0.5467813 </td><td> 0.9975076 </td><td> 0.998753 </td><td> 1.041209 </td><td> <a href="PE_GE_poly5.pdf" target="_blank">Link</a> </td></tr> <tr><td>Polynomial regression with degree 6</td><td> 0.5465911 </td><td> 0.996652 </td><td> 0.9983246 </td><td> 1.056632 </td><td> <a href="PE_GE_poly6.pdf" target="_blank">Link</a> </td></tr> <tr><td>Generalized additive models</td><td> 0.5463695 </td><td> 0.9976796 </td><td> 0.9988391 </td><td> 1.032766 </td><td> <a href="PE_GE_gam.pdf" target="_blank">Link</a> </td></tr> </table>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Jun 20 20:43:56 2018 -0400
@@ -0,0 +1,44 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="R" version="3.2.1">
+        <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+
+    <package name="r-protrnacorr" version="0.0.1">
+        <install version="1.0">
+            <actions>
+                <action type="setup_r_environment">
+                    <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu">
+                        <package name="R" version="3.2.1" />
+                    </repository>
+                    <package>https://cran.r-project.org/src/contrib/MASS_7.3-50.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/mgcv_1.8-24.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/zoo_1.8-2.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/xts_0.10-2.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/curl_3.2.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/TTR_0.23-3.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/quantmod_0.4-13.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/abind_1.4-5.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/gtools_3.5.0.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/gdata_2.18.0.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/bitops_1.0-6.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/caTools_1.17.1.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/gplots_3.0.1.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/ROCR_1.0-7.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/lattice_0.20-35.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/DMwR_0.4.1.tar.gz</package>
+                    <package>https://cran.r-project.org/src/contrib/data.table_1.11.4.tar.gz</package>
+                    <!--<package>https://cran.r-project.org/src/contrib/Archive/grid/grid_0.7-3.tar.gz</package>-->
+                    <package>https://cran.r-project.org/src/contrib/Archive/nlme/nlme_3.1-120.tar.gz</package>
+                    <!--<package>https://cran.r-project.org/src/contrib/nlme_3.1-137.tar.gz</package>-->
+                </action>
+            </actions>
+        </install>
+        <readme>
+        Contains a tool dependency definition that downloads and installs protein rna correlation tool package for R library.
+        </readme>
+    </package>
+    <package name="x11" version="1.5.0">
+        <repository changeset_revision="2ecb3f1f8fbb" name="package_libx11_1_5_0" owner="devteam" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>