# HG changeset patch
# User theo.collard
# Date 1507037151 14400
# Node ID 05977e96375bbf868d28ba6d3caa5f6a45a8abe4
# Parent 4290f0f3d908d9f8458ddce17a059a544e25a386
Uploaded
diff -r 4290f0f3d908 -r 05977e96375b ballgown.xml
--- a/ballgown.xml Tue Oct 03 09:25:35 2017 -0400
+++ b/ballgown.xml Tue Oct 03 09:25:51 2017 -0400
@@ -1,235 +1,331 @@
-
- Flexible, isoform-level differential expression analysis
-
- bioconductor-ballgown
- r-dplyr
- r-optparse
-
-
-
- ##------------------------------------------------------------------------------------
- ## This function reads the input file with the mapping between samples and files
- ## E.g. of result:
- ## mapping = {
- ## "e2t.ctab" : "sample1",
- ## "other.ctab" : "sample2",
- ## "i2t.ctab" : "sample1",
- ## "t_data.ctab": "sample1"
- ## ...
- ## }
- ##------------------------------------------------------------------------------------
- #def read_sample_mapping_file(sample_mapping_file):
- #try
- #set mapping = {}
- #set file = open($sample_mapping_file.dataset.dataset.get_file_name(),'r')
- #for $line in $file:
- #set content= $line.strip().split('\t')
- #for $map in $content:
- #set mapping[$map]= $content[0]
- #end for
- #end for
- #return $mapping
- #except
- #return None
- #end try
- #end def
-
- ##------------------------------------------------------------------------------------
- ## This function returns the name of the sample associated to a given file
- ##------------------------------------------------------------------------------------
- #def get_sample_name($dataset, $sample_mapping):
- ##If the file with samples mapping was provided
- #if $sample_mapping != None:
- #return $sample_mapping.get($dataset.name, None)
- ##Otherwise with extract the sample name from the filename
- #else:
- #return str($dataset.element_identifier)
- #end if
- #end def
-
- ##------------------------------------------------------------------------------------
- ## This function reads a dataset or list of datasets and sets the corresponding value
- ## in the $result variable
- ## e.g. of result
- ##'sample1' : {
- ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
- ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
- ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
- ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
- ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
- ## },
- ##------------------------------------------------------------------------------------
- #def read_input_files($param_name, $param_value, $result, $sample_mapping, $create_if_empty):
- ## If input is a data collection
- #if isinstance($param_value, list):
- ## For each dataset
- #for $dataset in $param_value:
- ## Get the sample name
- #set sample_name = $get_sample_name($dataset, $sample_mapping)
- ## Check if sample is already registered
- #if not($result.has_key($sample_name)):
- #if ($create_if_empty == True):
- #set result[$sample_name] = {}
- #else:
- #raise ValueError("Error in input. Please check that input contains all the required files for sample " + $sample_name)
- #end if
- #end if
- ## Register the file to the sample
- #set result[$sample_name][$param_name] = str($dataset.dataset.dataset.get_file_name())
- #end for
- #else:
- #if not($result.has_key("sample_1")):
- #set result["sample_1"] = {}
- #end if
- #set result["sample_1"][$param_name] = str($param_name.dataset.dataset.get_file_name())
- #end if
- #return $result
- #end def
-
- ##------------------------------------------------------------------------------------
- ## Main body of the tool
- ##------------------------------------------------------------------------------------
- ## Set the params for the next R script
- #set result={}
- #set sample_mapping=None
-
- ## If the samples mapping file was provided, parse the content
- #if $samples_names != None and not(isinstance($samples_names, list) and (None in $samples_names)):
- #set sample_mapping = $read_sample_mapping_file($samples_names)
- #end if
-
- ## READ THE CONTENT FOR e_data AND STORE THE FILES
- ## INDEXED BY THEIR SAMPLE NAME
- ## e.g. 'HBR_Rep1' : {
- ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
- ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
- ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
- ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
- ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
- ## },
- ## 'HBR_Rep2' : {...}
- #set $result = $read_input_files("e_data.ctab", $e_data, $result, $sample_mapping, True)
- #set $result = $read_input_files("i_data.ctab", $i_data, $result, $sample_mapping, False)
- #set $result = $read_input_files("t_data.ctab", $t_data, $result, $sample_mapping, False)
- #set $result = $read_input_files("e2t.ctab", $e2t, $result, $sample_mapping, False)
- #set $result = $read_input_files("i2t.ctab", $i2t, $result, $sample_mapping, False)
-
- ## For each input sample, create a directory and link the input files for ballgown
- #import os
- #set n_sample = 1
- #for $key, $value in $result.iteritems():
- #set dir_name = str($output.files_path) + "/" + $key + "/"
- $os.makedirs($dir_name)
- #for $file_name, $file_path in $value.iteritems():
- $os.symlink($file_path, $dir_name + $file_name)
- #end for
- #set n_sample = $n_sample + 1
- #end for
-
- ## Run the R script with the location of the linked files and the name for outpot file
- ballgown.R --directory $output.files_path --outputtranscript $output --outputgenes $outputgn --texpression $trexpression --phendat $phendata --bgout $bgo
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-=======================
-Ballgown
-=======================
------------------------
-**What it does**
------------------------
-
-Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-seq data.
-The Ballgown package provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly.
-
-----
-
------------------------
-**How to use**
------------------------
-The input for this tools consists on 5 files for each sample in your experiment:
-
-- **e_data**: exon-level expression measurements. Tab file or collection of tab files. One row per exon. Columns are e_id (numeric exon id), chr, strand, start, end (genomic location of the exon), and the following expression measurements for each sample:
- * rcount: reads overlapping the exon
- * ucount: uniquely mapped reads overlapping the exon
- * mrcount: multi-map-corrected number of reads overlapping the exon
- * cov average per-base read coverage
- * cov_sd: standard deviation of per-base read coverage
- * mcov: multi-map-corrected average per-base read coverage
- * mcov_sd: standard deviation of multi-map-corrected per-base coverage
-- **i_data**: intron- (i.e., junction-) level expression measurements. Tab file or collection of tab files. One row per intron. Columns are i_id (numeric intron id), chr, strand, start, end (genomic location of the intron), and the following expression measurements for each sample:
- * rcount: number of reads supporting the intron
- * ucount: number of uniquely mapped reads supporting the intron
- * mrcount: multi-map-corrected number of reads supporting the intron
-- **t_data**: transcript-level expression measurements. Tab file or collection of tab files. One row per transcript. Columns are:
- * t_id: numeric transcript id
- * chr, strand, start, end: genomic location of the transcript
- * t_name: Cufflinks-generated transcript id
- * num_exons: number of exons comprising the transcript
- * length: transcript length, including both exons and introns
- * gene_id: gene the transcript belongs to
- * gene_name: HUGO gene name for the transcript, if known
- * cov: per-base coverage for the transcript (available for each sample)
- * FPKM: Cufflinks-estimated FPKM for the transcript (available for each sample)
-- **e2t**: Tab file or collection of tab files. Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. These ids match the ids in the e_data and t_data tables.
-- **i2t**: Tab file or collection of tab files. Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. These ids match the ids in the i_data and t_data tables.
-- samples_names: (optional) Tab file. Table with five columns, one row per sample. Defines which files from the input belong to each sample in the experiment.
-
-.. class:: infomark
-
-'''TIP''' *Note* Here's an example of a good phenotype data file for your expirement.
-
-+--------------+-------------------------+-------------------------+---+
-|ids |experimental variable 1 |experimental variable 2 |...|
-+==============+=========================+=========================+===+
-|sample 1 |value 1 |value 2 |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 2 |value 2 |value 1 |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 3 |value 1 |value 2 |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 4 |value 2 |value 1 |...|
-+--------------+-------------------------+-------------------------+---+
-|... |value 1 |value 2 |...|
-+--------------+-------------------------+-------------------------+---+
-
-
-.. class:: infomark
-
-*Note* The minimal transcript expression is a number used to filter the transcripts that
-are less or not expressed in our samples when compared to the genome
-
------------------------
-**Outputs**
------------------------
-
-This tool has 3 outputs:
-
-- **transcripts expression** : this is a csv file containing all the transcripts that are expressed above the transcripts expression value
-- **genes expression** : this is a csv file containing all the genes that are expressed above the transcripts expression value
-- **Ballgown object** : this is the ballgown object created during the process. This file can be re-used later for further analysis in a R console.
-
-----
-
-**Authors**: Théo Collard [SLU Global Bioinformatics Centre], Rafael Hernández de Diego [SLU Global Bioinformatics Centre], and Tomas Klingström [SLU Global Bioinformatics Centre]
-
-Sources are available at https://github.com/CollardT/Ballgown-Wrapper
-
-
-
+
+ Flexible, isoform-level differential expression analysis
+
+ bioconductor-ballgown
+ r-dplyr
+ r-optparse
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ file_format['format']=="csv"
+
+
+ file_format['format']=="csv"
+
+
+ file_format['format']=="tsv"
+
+
+ file_format['format']=="tsv"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ doi:10.1038/nprot.2016.095
+
+