changeset 8:386145573c19 draft

planemo upload commit bdd7e8a1f08c11db2a9f1b6db5535c6d32153b2b
author proteore
date Tue, 18 Dec 2018 09:54:57 -0500
parents 3e138d54c105
children 948fecb6a40b
files goprofiles.R goprofiles.xml
diffstat 2 files changed, 110 insertions(+), 188 deletions(-) [+]
line wrap: on
line diff
--- a/goprofiles.R	Fri Sep 21 10:08:02 2018 -0400
+++ b/goprofiles.R	Tue Dec 18 09:54:57 2018 -0500
@@ -4,23 +4,24 @@
 suppressMessages(library(goProfiles,quietly = TRUE))
 
 # Read file and return file content as data.frame
-readfile = function(filename, header) {
-  if (header == "true") {
-    # Read only first line of the file as header:
-    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
-    #Read the data of the files (skipping the first row)
-    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
-    # Remove empty rows
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
-    #And assign the header to the data
-    names(file) <- headers
+read_file <- function(path,header){
+  file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE)
+  if (inherits(file,"try-error")){
+    stop("File not found !")
+  }else{
+    return(file)
   }
-  else {
-    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
-    # Remove empty rows
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+}
+
+#convert a string to boolean
+str2bool <- function(x){
+  if (any(is.element(c("t","true"),tolower(x)))){
+    return (TRUE)
+  }else if (any(is.element(c("f","false"),tolower(x)))){
+    return (FALSE)
+  }else{
+    return(NULL)
   }
-  return(file)
 }
 
 check_ids <- function(vector,type) {
@@ -49,10 +50,10 @@
     package=org.Hs.eg.db
   } else if (species=="org.Mm.eg.db"){
     package=org.Mm.eg.db
+  } else if (species=="org.Rn.eg.db"){
+    package=org.Rn.eg.db
   }
   
-  
-  
   # Check if level is number
   if (! as.numeric(level) %% 1 == 0) {
     stop("Please enter an integer for level")
@@ -75,8 +76,8 @@
     genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
     # IDs that have NA ENTREZID
     NAs = id$UNIPROT[which(is.na(id$ENTREZID))]
-    print("IDs unable to convert to ENTREZID: ")
-    print(NAs)
+    #print("IDs unable to convert to ENTREZID: ")
+    #print(NAs)
   }
   
   # Create basic profiles
@@ -91,77 +92,20 @@
   return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
 }
 
-# Plot profiles to PNG
-plotPNG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
-  if (!is.null(profile.CC)) {
-    png("profile.CC.png")
-    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.BP)) {
-    png("profile.BP.png")
-    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.MF)) {
-    png("profile.MF.png")
-    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.ALL)) {
-    png("profile.ALL.png")
-    plotProfiles(profile.ALL, percentage=per, multiplePlots=T, aTitle=title)
-    dev.off()
-  }
-}
-
-# Plot profiles to JPEG
-plotJPEG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
-  if (!is.null(profile.CC)) {
-    jpeg("profile.CC.jpeg")
-    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.BP)) {
-    jpeg("profile.BP.jpeg")
-    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
+make_plot <- function(profile,percent,title,onto,plot_opt){
+  
+  if (plot_opt == "PDF") {
+    file_name=paste("profile_",onto,".pdf",collapse="",sep="")
+    pdf(file_name)
+  } else if (plot_opt == "JPEG"){
+    file_name=paste("profile_",onto,".jpeg",collapse="",sep="")
+    jpeg(file_name)
+  } else if (plot_opt == "PNG"){
+    file_name=paste("profile_",onto,".png",collapse="",sep="")
+    png(file_name)
   }
-  if (!is.null(profile.MF)) {
-    jpeg("profile.MF.jpeg")
-    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.ALL)) {
-    jpeg("profile.ALL.jpeg")
-    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-}
-
-# Plot profiles to PDF
-plotPDF = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
-  if (!is.null(profile.CC)) {
-    pdf("profile.CC.pdf")
-    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.BP)) {
-    pdf("profile.BP.pdf")
-    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.MF)) {
-    pdf("profile.MF.pdf")
-    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
-  if (!is.null(profile.ALL)) {
-    #print("all")
-    pdf("profile.ALL.pdf")
-    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
-    dev.off()
-  }
+  plotProfiles(profile, percentage=percent, multiplePlots=FALSE, aTitle=title)
+  dev.off()
 }
 
 goprofiles = function() {
@@ -212,9 +156,9 @@
     } else {
       ncol = as.numeric(gsub("c", "", ncol))
     }
-    header = args$header
+    header = str2bool(args$header)
     # Get file content
-    file = readfile(filename, header)
+    file = read_file(filename, header)
     # Extract Protein IDs list
     input = unlist(strsplit(as.character(file[,ncol]),";"))
     input = input [which(!is.na(input))]
@@ -225,8 +169,7 @@
   }
   
   ontoopt = strsplit(args$onto_opt, ",")[[1]]
-  #print(ontoopt)
-  #plotopt = strsplit(args[3], ",")
+  onto_pos = as.integer(gsub("BP",3,gsub("MF",2,gsub("CC",1,ontoopt))))
   plotopt = args$plot_opt
   level = args$level
   per = as.logical(args$per)
@@ -236,51 +179,15 @@
   species=args$species
 
   profiles = getprofile(input, id_type, level, duplicate,species)
-  profile.CC = profiles[1]
-  #print(profile.CC)
-  profile.MF = profiles[2]
-  #print(profile.MF)
-  profile.BP = profiles[3]
-  #print(profile.BP)
-  profile.ALL = profiles[-3:-1]
-  #print(profile.ALL)
-  #c(profile.ALL, profile.CC, profile.MF, profile.BP)
-    
-  if ("CC" %in% ontoopt) {
-    write.table(profile.CC, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
-    if (grepl("PNG", plotopt)) {
-      plotPNG(profile.CC=profile.CC, per=per, title=title)
-    }
-    if (grepl("JPEG", plotopt)) {
-      plotJPEG(profile.CC = profile.CC, per=per, title=title)
-    }
-    if (grepl("PDF", plotopt)) {
-      plotPDF(profile.CC = profile.CC, per=per, title=title)
-    }
-  }
-  if ("MF" %in% ontoopt) {
-    write.table(profile.MF, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
-    if (grepl("PNG", plotopt)) {
-      plotPNG(profile.MF = profile.MF, per=per, title=title)
-    }
-    if (grepl("JPEG", plotopt)) {
-      plotJPEG(profile.MF = profile.MF, per=per, title=title)
-    }
-    if (grepl("PDF", plotopt)) {
-      plotPDF(profile.MF = profile.MF, per=per, title=title)
-    }
-  }
-  if ("BP" %in% ontoopt) {
-    write.table(profile.BP, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
-    if (grepl("PNG", plotopt)) {
-      plotPNG(profile.BP = profile.BP, per=per, title=title)
-    }
-    if (grepl("JPEG", plotopt)) {
-      plotJPEG(profile.BP = profile.BP, per=per, title=title)
-    }
-    if (grepl("PDF", plotopt)) {
-      plotPDF(profile.BP = profile.BP, per=per, title=title)
-    }
+
+  for (index in onto_pos) {
+    onto = names(profiles[index])
+    profile=profiles[index]
+    make_plot(profile,per,title,onto,plotopt)
+    text_output=paste("goProfiles_",onto,"_",title,".tsv",sep="",collapse="")
+    profile = as.data.frame(profile)
+    profile <- as.data.frame(apply(profile, c(1,2), function(x) gsub("^$|^ $", NA, x)))  #convert "" and " " to NA
+    write.table(profile, text_output, sep="\t", row.names = FALSE, quote=FALSE, col.names = T)
   }
 }
 
--- a/goprofiles.xml	Fri Sep 21 10:08:02 2018 -0400
+++ b/goprofiles.xml	Tue Dec 18 09:54:57 2018 -0500
@@ -1,9 +1,10 @@
-<tool id="goProfiles" name="goProfiles" version="2018.09.21">
-    <description>Statistical analysis of functional profiles</description>
+<tool id="goProfiles" name="Statistical analysis of functional profiles" version="2018.12.12">
+    <description>(Human, Mouse) [goProfiles]</description>
     <requirements> 
         <requirement type="package" version="3.4.1">R</requirement>
         <requirement type="package" version="3.5.0">bioconductor-org.hs.eg.db</requirement>
         <requirement type="package" version="3.5.0">bioconductor-org.mm.eg.db</requirement>
+        <!--requirement type="package" version="3.5.0">bioconductor-org.rn.eg.db</requirement-->
         <requirement type="package" version="1.40.0">bioconductor-annotationdbi</requirement>
         <requirement type="package" version="2.38.0">bioconductor-biobase</requirement>
         <requirement type="package" version="1.38.0">goprofiles</requirement>
@@ -24,32 +25,24 @@
         #end if
         
         --id_type="$input.id_type"
-        
         --onto_opt="$onto_opt"
-        
-        --plot_opt="$opt.plot_opt"
-        
+        --plot_opt="$plot_opt"
         --level="$level"
-        
         --per="$per"
-        
         --title="$title"
-        
         --duplicate="$duplicate"
-
         --text_output="$text_output"
-
-        --species="$species"
+        --species="$species" > $log
 
     ]]></command>
     <inputs>
         <conditional name="input" >
-            <param name="ids" type="select" label="Enter your ID list (only Entrez Gene ID or UniProt accession number allowed" help="Copy/paste or ID list from a file (e.g. table)" >
-                <option value="text">Copy/paste your identifiers</option>
-                <option value="file" selected="true">Input file containing your identifiers</option>
+            <param name="ids" type="select" label="Enter your IDs (Entrez Gene ID or UniProt Accession number)" help="Copy/paste or from a file (e.g. table)" >
+                <option value="text">Copy/paste your IDs</option>
+                <option value="file" selected="true">Input file containing your IDs</option>
             </param>
             <when value="text" >
-                <param name="text" type="text" label="Copy/paste your identifiers" help='IDs must be separated by spaces into the form field, for example: P31946 P62258' >
+                <param name="text" type="text" label="Copy/paste your IDs" help='IDs must be separated by spaces into the form field, for example: P31946 P62258' >
                     <sanitizer>
                         <valid initial="string.printable">
                             <remove value="&apos;"/>
@@ -59,53 +52,56 @@
                         </mapping>
                     </sanitizer>
                 </param>
-                <param name="id_type" type="select" label="Please select the type of your IDs list" >
+                <param name="id_type" type="select" label="Select type of IDs list" >
                     <option value="Entrez">Entrez Gene ID</option>
-                    <option value="UniProt">UniProt protein acession number</option>
+                    <option value="UniProt">UniProt Accession number</option>
                 </param>
             </when>
             <when value="file" >
-                <param name="file" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="" />
-                <param name="ncol" type="text" value="c1" label="The column number of IDs to use" help='For example, fill in "c1" if it is the first column, "c2" if it is the second column and so on' />
-                <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
-                <param name="id_type" type="select" label="Please select the type of your IDs list" >
+                <param name="file" type="data" format="txt,tabular" label="Select your file" help="" />
+                <param name="ncol" type="text" value="c1" label="Column number of IDs" help='For example, fill in "c1" if it is the first column, "c2" if it is the second column and so on' />
+                <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" />
+                <param name="id_type" type="select" label="Select type of IDs list" >
                     <option value="Entrez">Entrez Gene ID</option>
-                    <option value="UniProt">UniProt protein ID</option>
+                    <option value="UniProt">Uniprot Accession number</option>
                 </param>
             </when>            
         </conditional>
         <param name="duplicate" type="boolean" label="Remove duplicated IDs" truevalue="TRUE" falsevalue="FALSE" />
-        <param name="species" type="select" label="Select your species">
+        <param name="species" type="select" label="Species">
             <option value="org.Hs.eg.db">Human (Homo sapiens)</option>
             <option value="org.Mm.eg.db">Mouse (Mus musculus)</option>
+            <!--option value="org.Rn.eg.db">Rat (Rattus norvegicus)</option-->
         </param>
-        <param type="select" name="onto_opt" label="Please select GO terms category" multiple="True" display="checkboxes" >
+        <param type="select" name="onto_opt" label="Select GO terms category" multiple="True" display="checkboxes" optional="false" >
             <option value="CC">Cellular Component (CC)</option>
             <option value="MF">Molecular Function (MF)</option>
             <option value="BP">Biological Process (BP)</option>
         </param>
-        <param type="select" name="level" label="Level of the ontology at which the profile has to be built (the higher this number, the deeper the GO level)" >
+        <param type="select" name="level" label="Ontology level (the higher this number, the deeper the GO level)" >
             <option value="1">1</option>
             <option value="2" selected="True">2</option>
             <option value="3">3</option>
         </param>
-        <param type="boolean" name="per" label="Plot absolute or relative frequencies (not summing to 100)" truevalue="TRUE" falsevalue="FALSE" />
+        <param type="boolean" name="per" label="Plot absolute frequencies?" truevalue="TRUE" falsevalue="FALSE" />
         <param type="text" name="title" label="Enter title of your figure" />
-        <section name="opt" title="Choose graphical output (bar plots) format: png, jpeg, pdf" expanded="False" help="By default, PDF is chosen as output format">
-            <param type="select" name="plot_opt" label="Choose plot output extension" multiple="True" display="checkboxes" >
-                <option value="PNG">PNG</option>
-                <option value="JPEG">JPEG</option>
-                <option value="PDF" selected="True">PDF</option>
-            </param>
-        </section>
+        <param type="select" name="plot_opt" label="Plot file format">
+            <option value="PNG">png</option>
+            <option value="JPEG">jpeg</option>
+            <option value="PDF" selected="True">pdf</option>
+        </param>
+
     </inputs>
     <outputs>
-	    <collection type="list" label="GO Profile diagram output" name="output" >
+        <data name="log" format="tsv" label="goProfiles log" />
+        <collection type="list" label="goProfiles text files" name="text_output">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+\.tsv)" ext="tsv"/>
+        </collection>
+	    <collection type="list" label="goProfiles diagram output" name="output" >
 	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.png)" ext="png" />
 	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.jpeg)" ext="jpg" />
 	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.pdf)" ext="pdf" />
 	    </collection>
-        <data name="text_output" format="tabular" label="GO Profile text output" />
     </outputs>
     <tests>
         <test>
@@ -121,31 +117,46 @@
             <param name="level" value="2" />
             <param name="per" value="true" />
             <param name="title" value="Test" />
-            <section name="opt" >
-                <param name="plot_opt" value="PDF" />
-            </section>
+            <param name="plot_opt" value="PDF" />
             <output_collection name="output" type="list" >
                 <element name="profile.BP.pdf" file="profile.BP.pdf" ftype="pdf" compare="sim_size"/>
                 <element name="profile.CC.pdf" file="profile.CC.pdf" ftype="pdf" compare="sim_size"/>
                 <element name="profile.MF.pdf" file="profile.MF.pdf" ftype="pdf" compare="sim_size"/>
             </output_collection>
-            <output name="text_output" file="GO_Profile_text_output.txt"/>
+            <output name="log" file="GO_Profile_text_output.tsv"/>
         </test>
     </tests>
     <help><![CDATA[
-This tool, based on the goProfiles R package, performs statistical analysis of functional profiles. It is based on GO ontology and considers either a gene set ('Entrez’ Identifiers) or a protein set (Uniprot accession number) as input. 
+
+**Description**
+
+This tool is based on the goProfiles R package; it performs statistical analysis of functional profiles based on Gene Ontology (GO). Functional profile at a given GO level is obtained by counting the
+number of identifiers having a hit in each category of this level.   
 
-You can choose one or more GO categories: 
+-----
+
+**Input** 
+
+Two modes are allowed: either by copy/pasting your IDs (separated by a space) or by supplying a tabular file (.csv, .tsv, .txt, .tab) including your IDs (identifiers).
+Only Entrez Gene IDs (e.g. 4151) or UniProt accession numbers (e.g. P31946) are allowed. If your list is not in this form, please use the ID_Converter tool of ProteoRE. 
+
+-----
 
-* Biological Process (BP) 
-* Cellular Component (CC) 
-* Molecular Function (MF) 
+**Parameters**
+
+"Species": select the species you are working on; Homo sapiens and Mus musculus are supported (Rattus norvegicus coming soon)
+
+"Select GO terms category": you can choose one or more GO categories which are Biological Process (BP), Cellular Component (CC) and Molecular Function (MF) 
+
+"Ontology level (the higher this number, the deeper the GO level)": corresponds to the level of the GO hierarchy (from 1 to 3). In general, the higher the level, the more semantically specific the term is.
 
-Functional profile at a given GO level is obtained by counting the number of identifiers having a hit in each category of this level (2 by default). Results are displayed as bar plots (with absolute or relative frequencies) and can be exported in pdf, png and jpeg formats; textual output with GO terms and their computed frequencies is also provided.  
+-----
+
+**Output**
 
-For more details about GoProfiles, please read: Salicrú et al. Comparison of lists of genes based on functional profiles. BMC Bioinformatics. 2011;12:401.(https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-401)  
+Diagram output: graphical output in the form of bar-plot or dot-plot (png, jpeg or pdf format), one figure for each GO category. 
 
-If your type of identifiers is not supported (i.e. different from Uniprot and Entrez), please use the **ID Converter** tool in the ProteoRE section to convert your list of IDs first.
+Text output: contains the following information: GO category description (e.g. BP.Description), GO term identifier (e.g. BP.GOID) and GO term frequency (e.g. BP.Frequency)
 
 -----
 
@@ -153,7 +164,9 @@
 
 **Authors** 
 
-Sanchez A, Ocana J and Salicru M (2016). goProfiles: goProfiles: an R package for the statistical analysis of functional profiles. R package version 1.38.0.
+Salicrú M, Ocaña J, Sánchez-Pla A. Comparison of lists of genes based on functional profiles. BMC Bioinformatics. 2011. 12:401. doi:10.1186/1471-2105-12-401. PubMed PMID: 21999355
+
+-----
 
 .. class:: infomark
 
@@ -161,7 +174,9 @@
 
 T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
 
-Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit,Migale Bioinformatics platform,
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit,Migale Bioinformatics platform, FR
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
 
 Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.