Mercurial > repos > proteomisc > geo_query
changeset 0:4339db844d35 draft
Uploaded
author | proteomisc |
---|---|
date | Sun, 26 Nov 2023 18:42:20 +0000 |
parents | |
children | 0e1a528e652b |
files | geo_query/GetDatasets.R geo_query/Get_Datasets.xml geo_query/Readme.txt geo_query/citations.xml geo_query/datatypetweek.sh geo_query/images/workflow.jpg geo_query/test-data/GSM103772_1.gpr.gz |
diffstat | 7 files changed, 216 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# geo_query/GetDatasets.R
#
# Download the supplementary files of a GEO accession with GEOquery and unpack
# the first downloaded file whose name contains "RAW" into the directory ./CEL.
#
# Usage:
#   Rscript GetDatasets.R GeoCode <accession>
#
# Exit status:
#   1  any uncaught R error (see the error handler installed below)
#   2  getGEOSuppFiles() raised an error
#   3  no supplementary "RAW" file is available for the accession

# On any uncaught error: print the message and leave with status 1.
options(show.error.messages = FALSE,
        error = function() {
          cat(geterrmessage(), file = stderr())
          q("no", 1, FALSE)
        })
sink(stdout(), type = "message")

suppressWarnings(suppressMessages(library("batch")))
suppressWarnings(suppressMessages(library(GEOquery)))

# Report a problem and terminate with the given exit status.
fail <- function(msg, status) {
  write(msg, stderr())
  quit(save = "no", status = status)
}

# Fetch the supplementary files of `code`.
# Returns whatever getGEOSuppFiles() returns, or the string "skip" when the
# call raised an error.
fetchSuppFiles <- function(code) {
  suppressMessages(suppressWarnings(
    tryCatch(getGEOSuppFiles(code), error = function(cond) "skip")
  ))
}

listArguments <- parseCommandArgs(evaluate = FALSE)
GeoCode <- toupper(listArguments[["GeoCode"]])

gethelp.df <- fetchSuppFiles(GeoCode)
if (is.null(gethelp.df)) {
  # One retry before concluding that nothing is available.
  gethelp.df <- fetchSuppFiles(GeoCode)
}

if (is.data.frame(gethelp.df)) {
  # The row names are used as the local paths of the downloaded files.
  rawFiles <- grep("RAW", rownames(gethelp.df), value = TRUE)
  if (length(rawFiles) == 0) {
    # Files were downloaded, but none of them is a RAW archive.
    fail("GeoCode with no raw data, choose another code or retry.", 3)
  }
  rawdata <- rawFiles[1]
  untar(rawdata, exdir = "CEL")
  if (file.exists(rawdata)) {
    file.remove(rawdata)
  }
} else if (identical(gethelp.df, "skip")) {
  fail("Network trouble, try again or Check your geoCode.", 2)
} else {
  # NULL (even after the retry) or any other unexpected result.
  fail("GeoCode with no raw data, choose another code or retry.", 3)
}
<tool id="Get_Datasets" name="Query GEO Database" version="20181028">
    <!-- geo_query/Get_Datasets.xml: Galaxy wrapper around GetDatasets.R and datatypetweek.sh -->
    <description> Query Gene Expression Omnibus (GEO) Database using an accession code. </description>
    <macros>
        <import>citations.xml</import>
    </macros>
    <requirements>
        <requirement type="package" >r-base</requirement>
        <requirement type="package" >r-batch</requirement>
        <requirement type="package" version="2.50.5">bioconductor-geoquery</requirement>
        <requirement type="package" version="1.14">libiconv</requirement>
    </requirements>
    <!-- Exit codes 1, 2 and 3 are the ones GetDatasets.R terminates with. -->
    <stdio>
        <exit_code range="1" level="fatal" description="Wrong accession code" />
        <exit_code range="2" level="fatal" description="Network trouble, try again or check your GeoCode" />
        <exit_code range="3" level="fatal" description="GeoCode with no raw data, choose another code or retry" />
    </stdio>
    <!--
        The two steps are chained with "&&" so that a failing Rscript ends the
        job with its own exit code; with ";" the exit code of the shell script
        would replace it and the rules in stdio above could never match.
        The accession code is single-quoted so the shell sees it as one word.
    -->
    <command>
<![CDATA[
        Rscript '$__tool_directory__/GetDatasets.R' GeoCode '$GeoCode' &&
        sh '$__tool_directory__/datatypetweek.sh'
]]>
    </command>
    <inputs>
        <!-- Only letters and digits are accepted: the value is placed on a shell command line. -->
        <param name="GeoCode" type="text" value="GSE4632" label="GEO accession code" help="Gene Expression Omnibus (GEO) accession code">
            <sanitizer>
                <valid initial="string.ascii_letters,string.digits">
                </valid>
            </sanitizer>
        </param>
    </inputs>
    <outputs>
        <!-- NOTE(review): format is empty in the original; confirm whether an explicit datatype is wanted here. -->
        <!-- Every file unpacked into CEL/ that matches one of the patterns becomes its own dataset. -->
        <data format="" name="Raw files" >
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.(gpr|GPR)(\.gz)?" directory="CEL" visible="true" ext="gpr" assign_primary_output="false"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.(cel|CEL)(\.gz)?" directory="CEL" visible="true" ext="cel" assign_primary_output="false" />
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.(gal|GAL)(\.gz)?" directory="CEL" visible="true" ext="gal" assign_primary_output="false" />
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.(txt|TXT)(\.gz)?" directory="CEL" visible="true" ext="txt" assign_primary_output="false" />
        </data>
    </outputs>
    <tests>
        <test>
            <param name="GeoCode" value="GSE4632" />
            <output name="Raw files">
                <discovered_dataset designation="GSM103772_1" ftype="gpr" file="GSM103772_1.gpr.gz" compare="sim_size"/>
            </output>
        </test>
    </tests>
    <help>

.. class:: infomark

**Authors** Bensellak Taoufik bensellak@ensat.ac.ma, Ahmed Moussa.

---------------------------------------------------

==================================================================
Query Gene Expression Omnibus (GEO) Database.
==================================================================

-----------
Description
-----------

This tool is used for querying Gene Expression Omnibus Database.

**GEO accession code**

GEO Accession code

-----------------
Workflow position
-----------------

**Upstream tools**

+------------------------+------------------+--------+-----------+
| Name                   | output file      |format  | parameter |
+========================+==================+========+===========+
| NA                     | NA               | NA     | NA        |
+------------------------+------------------+--------+-----------+


**Downstream tools**

+----------------------------------------------------------------+------------------------------------------------+--------------+
| Name                                                           | Output file                                    | Format       |
+================================================================+================================================+==============+
| Make design and read dataset                                   | Project information and design file            | RData,Tabular|
+----------------------------------------------------------------+------------------------------------------------+--------------+

-----------
Input files
-----------

+---------------------------+------------+
| Parameter : num + label   | Format     |
+===========================+============+
| GEO accession code        | String     |
+---------------------------+------------+

------------
Output files
------------

**Set of raw samples**

------------------------------
General schema of the workflow
------------------------------

.. image:: ./workflow.jpg
    :height: 800
    :width: 700

    </help>
    <expand macro="R_citation">
    </expand>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/geo_query/Readme.txt Sun Nov 26 18:42:20 2023 +0000 @@ -0,0 +1,1 @@ +Galaxy tool to query the Gene Expression Omnibus (GEO) database using an accession code.
<macros>
    <!-- geo_query/citations.xml: macros imported by Get_Datasets.xml -->
    <token name="@VERSION@">1.0</token>
    <!-- Citations for R itself and for the two R packages loaded by GetDatasets.R (GEOquery, batch). -->
    <xml name="R_citation">
        <citations>
            <citation type="bibtex">
                @Manual{r_core_team_2017,
                    title = {R: A Language and Environment for Statistical Computing},
                    author = {{R Core Team}},
                    organization = {R Foundation for Statistical Computing},
                    address = {Vienna, Austria},
                    year = {2017},
                    url = {https://www.R-project.org/},
                }
            </citation>
            <citation type="bibtex">
                @Article{davis_meltzer_2007,
                    author = {Sean Davis and Paul Meltzer},
                    title = {GEOquery: a bridge between the Gene Expression Omnibus (GEO) and BioConductor},
                    journal = {Bioinformatics},
                    year = {2007},
                    volume = {23},
                    number = {14},
                    pages = {1846--1847},
                }
            </citation>
            <citation type="bibtex">
                @Article{hoffmann_2011,
                    title = {Passing in Command Line Arguments and Parallel Cluster/Multicore Batching in {R} with {batch}},
                    author = {Thomas J. Hoffmann},
                    journal = {Journal of Statistical Software, Code Snippets},
                    year = {2011},
                    volume = {39},
                    number = {1},
                    pages = {1--11},
                    url = {http://www.jstatsoft.org/v39/c01/},
                }
            </citation>
        </citations>
    </xml>

</macros>
#! /bin/bash
# geo_query/datatypetweek.sh
#
# Re-encode every *.gpr / *.gpr.gz file (names matched case-insensitively)
# found under CEL/ from ISO-8859-1 to UTF-8, in place. Files whose content is
# gzip data are decompressed, converted and compressed again.
#
# Conversion is best effort: a file that cannot be processed is reported on
# stderr and left as it was, and the remaining files are still handled.
# Only POSIX shell constructs are used, so the script also runs under `sh`.

NON_UTF_FILE_DIR="CEL/"
FILE_LIST="utf8list"

# Convert one uncompressed file ($1) to UTF-8 in place.
# The original is replaced only when iconv succeeds, so a failed conversion
# never leaves a truncated file behind.
to_utf8_in_place() {
    src=$1
    tmp="${src}.utf8"
    if iconv -f ISO-8859-1 -t UTF-8 "$src" > "$tmp"; then
        mv "$tmp" "$src"
    else
        rm -f "$tmp"
        echo "Could not convert '$src' to UTF-8; file left unchanged." >&2
    fi
}

# The list is written to a file before any conversion starts, so files created
# while converting (e.g. the output of gunzip) are never picked up and
# converted a second time.
find "$NON_UTF_FILE_DIR" -type f \( -iname '*.gpr.gz' -o -iname '*.gpr' \) > "$FILE_LIST"

while IFS= read -r file; do
    if file --mime-type "$file" | grep -q 'gzip$'; then
        if gunzip "$file"; then
            # gunzip dropped the compression suffix from the name.
            plain="${file%.*}"
            to_utf8_in_place "$plain"
            gzip "$plain"
        else
            echo "Could not decompress '$file'; file left unchanged." >&2
        fi
    else
        to_utf8_in_place "$file"
    fi
done < "$FILE_LIST"

rm -f "$FILE_LIST"