changeset 0:d89c09253c8d draft

planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author proteore
date Sun, 26 Nov 2017 19:19:39 -0500
parents
children 1236ee08ccb8
files README.rst goprofiles.R goprofiles.xml test-data/UnipIDs.txt test-data/profile.BP.pdf test-data/profile.CC.pdf test-data/profile.MF.pdf
diffstat 7 files changed, 485 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Sun Nov 26 19:19:39 2017 -0500
@@ -0,0 +1,30 @@
+Galaxy wrappers for goProfiles package
+======================================
+
+**Authors** 
+
+Sanchez A, Ocana J and Salicru M (2016). goProfiles: goProfiles: an R package for the statistical analysis of functional profiles. R package version 1.38.0.
+
+**Galaxy integration**
+
+T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit,Migale Bioinformatics platform,
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+======================================
+
+This tool, based on the goProfiles R package, performs statistical analysis of functional profiles. It is based on GO ontology and considers either a gene set ('Entrez’ Identifiers) or a protein set (Uniprot ID) as input. 
+
+You can choose one or more GO categories: 
+
+* Biological Process (BP) 
+* Cellular Component (CC) 
+* Molecular Function (MF) 
+
+Functional profile at a given GO level is obtained by counting the number of identifiers having a hit in each category of this level (2 by default). Results are displayed as bar plots (with absolute or relative frequencies) and can be exported in pdf, png and jpeg formats.  
+
+For more details about GoProfiles, please read: Salicrú et al. Comparison of lists of genes based on functional profiles. BMC Bioinformatics. 2011;12:401.(https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-401)  
+
+If your type of identifiers is not supported (i.e. different form Uniprot and Entrez), please use the **ID Converter** component in the ProteoRE section to convert your list of IDs first.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goprofiles.R	Sun Nov 26 19:19:39 2017 -0500
@@ -0,0 +1,274 @@
+# Load necessary libraries
+library("org.Hs.eg.db", quietly=TRUE)
+library("goProfiles", quietly=TRUE)
+
+# Read file and return file content as data.frame?
+readfile = function(filename, header) {
+  if (header == "true") {
+    # Read only the first two lines of the files as data (without headers):
+    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+    #print("header")
+    #print(headers)
+    # Create the headers names with the two (or more) first rows, sappy allows to make operations over the columns (in this case paste) - read more about sapply here :
+    #headers_names <- sapply(headers, paste, collapse = "_")
+    #print(headers_names)
+    #Read the data of the files (skipping the first 2 rows):
+    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+    #print(file[1,])
+    #And assign the headers of step two to the data:
+    names(file) <- headers
+  }
+  else {
+    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+  }
+  return(file)
+}
+
+#filename = "/Users/LinCun/Documents/ProteoRE/usecase1/Check/HPA.Selection.134.txt"
+#test = readfile(filename)
+#str(test)
+#str(test$Gene.names)
+getprofile = function(ids, id_type, level, duplicate) {
+  ####################################################################
+  # Arguments
+  #   - ids: list of input IDs
+  #   - id_type: type of input IDs (UniProt/ENTREZID)
+  #   - level
+  #   - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE)
+  ####################################################################
+  
+  # Check if level is number
+  if (! as.numeric(level) %% 1 == 0) {
+    stop("Please enter an integer for level")
+  }
+  else {
+    level = as.numeric(level)
+  }
+  #genes = as.vector(file[,ncol])
+  
+  # Extract Gene Entrez ID
+  if (id_type == "Entrez") {
+    id = select(org.Hs.eg.db, ids, "ENTREZID", multiVals = "first")
+    genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
+  }
+  else {
+    genes_ids = c()
+    id = select(org.Hs.eg.db, ids, "ENTREZID", "UNIPROT", multiVals = "first")
+    if (duplicate == "TRUE") {
+      id = unique(id)
+    }
+    print(id[[1]])
+    genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
+    # IDs that have NA ENTREZID
+    NAs = id$UNIPROT[which(is.na(id$ENTREZID))]
+    print("IDs unable to convert to ENTREZID: ")
+    print(NAs)
+  }
+  #print(genes_ids)
+  # Convert Protein IDs into entrez ids
+  
+  # for (i in 1:length(id$UNIPROT)) {
+  #   print(i)
+  #   if (is.na(id[[2]][i])) {
+  #     print(id[[2]][i])
+  #   }
+  # }
+  # a = id[which(id$ENTREZID == "NA"),]
+  # print(a)
+  # print(a$UNIPROT)
+  #print(id[[1]][which(is.na(id$ENTREZID))])
+  #print(genes_ids)
+  # for (gene in genes) {
+  #   #id = as.character(mget(gene, org.Hs.egALIAS2EG, ifnotfound = NA))
+  #   id = select(org.Hs.eg.db, genes, "ENTREZID", "UNIPROT")
+  #   print(id)
+  #   genes_ids = append(genes_ids, id$ENTREZID)
+  # }
+  #print(genes_ids)
+  
+  # Create basic profiles
+  profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  
+  # Print profile
+  # printProfiles(profile)
+  
+  return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
+}
+
+# Plot profiles to PNG
+plotPNG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    png("profile.CC.png")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    png("profile.BP.png")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    png("profile.MF.png")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    png("profile.ALL.png")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=T, aTitle=title)
+    dev.off()
+  }
+}
+
+# Plot profiles to JPEG
+plotJPEG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    jpeg("profile.CC.jpeg")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    jpeg("profile.BP.jpeg")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    jpeg("profile.MF.jpeg")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    jpeg("profile.ALL.jpeg")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+}
+
+# Plot profiles to PDF
+plotPDF = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    pdf("profile.CC.pdf")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    pdf("profile.BP.pdf")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    pdf("profile.MF.pdf")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    #print("all")
+    pdf("profile.ALL.pdf")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+}
+
+goprofiles = function() {
+  args = commandArgs(trailingOnly = TRUE)
+  #print(args)
+  # arguments: filename.R inputfile ncol "CC,MF,BP,ALL" "PNG,JPEG,PDF" level "TRUE"(percentage) "Title"
+  if (length(args) != 9) {
+    stop("Not enough/Too many arguments", call. = FALSE)
+  }
+  else {
+    input_type = args[2]
+    if (input_type == "text") {
+      input = strsplit(args[1], "\\s+")[[1]]
+    }
+    else if (input_type == "file") {
+      filename = strsplit(args[1], ",")[[1]][1]
+      ncol = strsplit(args[1], ",")[[1]][2]
+      # Check ncol
+      if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
+        stop("Please enter an integer for level")
+      }
+      else {
+        ncol = as.numeric(gsub("c", "", ncol))
+      }
+      header = strsplit(args[1], ",")[[1]][3]
+      # Get file content
+      file = readfile(filename, header)
+      # Extract Protein IDs list
+      input = c()
+      for (row in as.character(file[,ncol])) {
+        input = c(input, strsplit(row, ";")[[1]][1])
+      }
+    }
+    id_type = args[3]
+    ontoopt = strsplit(args[4], ",")[[1]]
+    #print(ontoopt)
+    #plotopt = strsplit(args[3], ",")
+    plotopt = args[5]
+    level = args[6]
+    per = as.logical(args[7])
+    title = args[8]
+    duplicate = args[9]
+
+    profiles = getprofile(input, id_type, level, duplicate)
+    profile.CC = profiles[1]
+    #print(profile.CC)
+    profile.MF = profiles[2]
+    #print(profile.MF)
+    profile.BP = profiles[3]
+    #print(profile.BP)
+    profile.ALL = profiles[-3:-1]
+    #print(profile.ALL)
+    #c(profile.ALL, profile.CC, profile.MF, profile.BP)
+    if ("CC" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.CC=profile.CC, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.CC = profile.CC, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.CC = profile.CC, per=per, title=title)
+      }
+    }
+    if ("MF" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.MF = profile.MF, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.MF = profile.MF, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.MF = profile.MF, per=per, title=title)
+      }
+    }
+    if ("BP" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.BP = profile.BP, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.BP = profile.BP, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.BP = profile.BP, per=per, title=title)
+      }
+    }
+    
+    #if (grepl("PNG", plotopt)) {
+    # plotPNG(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+    #if (grepl("JPEG", plotopt)) {
+    # plotJPEG(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+    #if (grepl("PDF", plotopt)) {
+    # plotPDF(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+  }
+  
+}
+
+goprofiles()
+
+#Rscript go.R ../proteinGroups_Maud.txt "1" "CC" "PDF" 2 "TRUE" "Title"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goprofiles.xml	Sun Nov 26 19:19:39 2017 -0500
@@ -0,0 +1,156 @@
+<tool id="goProfiles" name="goProfiles" version="0.1.0">
+    <description> Identify enriched biological themes, GO terms from your protein list.
+    </description>
+    <requirements>
+        <requirement type="package" version="3.4.1">R</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-org.hs.eg.db</requirement>
+        <requirement type="package" version="1.38.0">bioconductor-annotationdbi</requirement>
+        <requirement type="package" version="2.34.0">bioconductor-biobase</requirement>
+        <requirement type="package" version="1.38.0">goprofiles</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+        Rscript $__tool_directory__/goprofiles.R
+        #if $input.ids == "text"
+            "$input.text" "text"
+        #else 
+            "$input.file,$input.ncol,$input.header" "file"
+        #end if
+        
+        $input.id_type
+        
+        $onto_opt
+        
+        $opt.plot_opt
+        
+        $level
+        
+        $per
+        
+        "$title"
+        
+        $duplicate
+
+    ]]></command>
+    <inputs>
+        <conditional name="input" >
+            <param name="ids" type="select" label="Provide your Entrez Gene or UniProt identifiers" help="Copy/paste or ID list from a file (e.g. table)" >
+                <option value="text">Copy/paste your identifiers</option>
+                <option value="file">Input file containing your identifiers</option>
+            </param>
+            <when value="text" >
+                <param name="text" type="text" label="Copy/paste your identifiers" help='IDs must be separated by spaces into the form field, for example: P31946 P62258' >
+                    <sanitizer>
+                        <valid initial="string.printable">
+                            <remove value="&apos;"/>
+                        </valid>
+                        <mapping initial="none">
+                            <add source="&apos;" target="__sq__"/>
+                        </mapping>
+                    </sanitizer>
+                </param>
+                <param name="id_type" type="select" label="Please select the type of your IDs list" >
+                    <option value="Entrez">Entrez Gene ID</option>
+                    <option value="UniProt">UniProt protein ID</option>
+                </param>
+            </when>
+            <when value="file" >
+                <param name="file" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="" />
+                <param name="ncol" type="text" label="The column number of IDs to use" help='For example, fill in "c1" if it is the first column, "c2" if it is the second column and so on' />
+                <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" />
+                <param name="id_type" type="select" label="Please select the type of your IDs list" >
+                    <option value="Entrez">Entrez Gene ID</option>
+                    <option value="UniProt">UniProt protein ID</option>
+                </param>
+            </when>            
+        </conditional>
+        <param name="duplicate" type="boolean" label="Remove duplicated IDs" truevalue="TRUE" falsevalue="FALSE" />
+        <param type="select" name="onto_opt" label="Please select GO terms category" multiple="True" display="checkboxes" >
+            <option value="CC">Cellular Component (CC)</option>
+            <option value="MF">Molecular Function (MF)</option>
+            <option value="BP">Biological Process (BP)</option>
+        </param>
+        <param type="select" name="level" label="Level of the ontology at which the profile has to be built (the higher this number, the deeper the GO level)" >
+            <option value="1">1</option>
+            <option value="2" selected="True">2</option>
+            <option value="3">3</option>
+        </param>
+        <param type="boolean" name="per" label="Plot absolute or relative frequencies (not summing to 100)" truevalue="TRUE" falsevalue="FALSE" />
+        <param type="text" name="title" label="Enter title of your figure" />
+        <section name="opt" title="Choose graphical output (bar plots) format: png, jpeg, pdf" expanded="False" help="By default, PDF is chosen as output format">
+            <param type="select" name="plot_opt" label="Choose plot output extension" multiple="True" display="checkboxes" >
+                <option value="PNG">PNG</option>
+                <option value="JPEG">JPEG</option>
+                <option value="PDF" selected="True">PDF</option>
+            </param>
+        </section>
+    </inputs>
+    <outputs>
+	    <collection type="list" label="GO Profile diagram outputs" name="output" >
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.png)" ext="png" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.jpeg)" ext="jpg" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+\.pdf)" ext="pdf" />
+	    </collection>
+    </outputs>
+    <tests>
+        <test>
+            <conditional name="input">
+                <param name="ids" value="file" />
+                <param name="file" value="UnipIDs.txt" />
+                <param name="ncol" value="c1" />
+                <param name="header" value="false" /> 
+                <param name="id_type" value="UniProt" />              
+            </conditional>
+            <param name="onto_opt" value="CC,MF,BP" />
+            <param name="level" value="2" />
+            <param name="per" value="true" />
+            <param name="title" value="Test" />
+            <section name="opt" >
+                <param name="plot_opt" value="PDF" />
+            </section>
+            <output_collection name="output" type="list" >
+                <element name="profile.BP.pdf" file="profile.BP.pdf" ftype="pdf" />
+                <element name="profile.MF.pdf" file="profile.MF.pdf" ftype="pdf" />
+                <element name="profile.CC.pdf" file="profile.CC.pdf" ftype="pdf" />
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+This tool, based on the goProfiles R package, performs statistical analysis of functional profiles. It is based on GO ontology and considers either a gene set ('Entrez’ Identifiers) or a protein set (Uniprot ID) as input. 
+
+You can choose one or more GO categories: 
+
+* Biological Process (BP) 
+* Cellular Component (CC) 
+* Molecular Function (MF) 
+
+Functional profile at a given GO level is obtained by counting the number of identifiers having a hit in each category of this level (2 by default). Results are displayed as bar plots (with absolute or relative frequencies) and can be exported in pdf, png and jpeg formats.  
+
+For more details about GoProfiles, please read: Salicrú et al. Comparison of lists of genes based on functional profiles. BMC Bioinformatics. 2011;12:401.(https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-401)  
+
+If your type of identifiers is not supported (i.e. different form Uniprot and Entrez), please use the **ID Converter** component in the ProteoRE section to convert your list of IDs first.
+
+-----
+
+.. class:: infomark
+
+**Authors** 
+
+Sanchez A, Ocana J and Salicru M (2016). goProfiles: goProfiles: an R package for the statistical analysis of functional profiles. R package version 1.38.0.
+
+.. class:: infomark
+
+**Galaxy integration**
+
+T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit,Migale Bioinformatics platform,
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/UnipIDs.txt	Sun Nov 26 19:19:39 2017 -0500
@@ -0,0 +1,25 @@
+P04637
+P08246
+P63244
+P10275
+P00533
+Q14524
+P05067
+P35555
+P35222
+O95273
+P00451
+P38398
+Q05086
+Q12802
+P68871
+P04585
+Q96EB6
+Q9NYL2
+P31749
+P01137
+Q5S007
+Q08379
+P02649
+P35498
+P12931
Binary file test-data/profile.BP.pdf has changed
Binary file test-data/profile.CC.pdf has changed
Binary file test-data/profile.MF.pdf has changed