changeset 11:cec4b4fb30be draft default tip

DEXSeq version 1.6 added
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 08:22:45 -0400
parents 2fe512c7bfdf
children
files dexseq-hts_1.0/README dexseq-hts_1.0/bin/dexseq_config.sh dexseq-hts_1.0/bin/dexseq_config.sh.sample dexseq-hts_1.0/galaxy/DEXSeq.xml dexseq-hts_1.0/setup_dexseq-hts.sh dexseq-hts_1.0/src/DEXseq-hts.sh dexseq-hts_1.0/src/dexseq_count.py dexseq-hts_1.0/src/dexseq_prepare_annotation.py dexseq-hts_1.0/src/run_DEXseq.R
diffstat 9 files changed, 729 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/README	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,52 @@
+-----------------------------------------
+A Galaxy wrapper for DEXSeq version 1.6.0
+-----------------------------------------
+
+DEXSeq is focused on finding differential exon usage using RNA-seq exon counts between samples with different experimental designs. 
+DEXSeq: http://www.bioconductor.org/packages/release/bioc/html/DEXSeq.html
+
+Requirements:
+-------------
+    Python, HTSeq package :- Preprocessing of sequencing reads and GFF files
+    R, Bio-conductor package :- Required for DEXSeq 
+    SAMTOOLS :- Sequencing read processing 
+
+Contents:
+---------
+
+./bin 
+    Contains a config file for running DEXSeq in different settings, setup_dexseq-hts.sh
+    will help to create the config file according to your local work station. 
+
+./setup_dexseq-hts.sh
+    Setup script, which helps to set the right path to the config file present in the bin
+    folder. 
+    
+./README 
+    Basic information about the wrapper 
+
+./galaxy 
+    Galaxy tool configuration file can be found here. 
+
+./src
+    All relevant scripts for DEXSeq execution, Please follow the shell script to 
+    understand the details.
+
+Licenses:
+---------
+
+    If DEXSeq is used to obtain results for scientific publications it should be cited as [1].
+
+    This wrapper program is free software; you can redistribute it and/or modify it under the 
+    terms of the GNU General Public License as published by the Free Software Foundation; either 
+    version 3 of the License, or (at your option) any later version.
+
+    Copyright(C) 2013 cBio Memorial Sloan Kettering Cancer Center, New York City, USA.
+
+References
+----------
+    [1] Simon Anders and Alejandro Reyes and Wolfgang Huber (2012): Detecting differential usage of exons from RNA-seq data. 
+
+Contact:
+--------
+    support [at] oqtans.org
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/bin/dexseq_config.sh	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,6 @@
+#!/bin/bash 
+export DEXSEQ_VERSION=1.6.0
+export SAMTOOLS_DIR=
+export PYTHON_PATH=/usr/bin/python
+export PYTHONPATH=
+export R_PATH=
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/bin/dexseq_config.sh.sample	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,6 @@
+#!/bin/bash 
+export DEXSEQ_VERSION=1.6.0
+export SAMTOOLS_DIR=/home/vipin/samtools-0.18-beta
+export PYTHON_PATH=/usr/bin/python
+export PYTHONPATH=/home/vipin/lib/python2.7/site-packages/:$PYTHONPATH
+export R_PATH=/home/vipin/bin/R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/galaxy/DEXSeq.xml	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,117 @@
+<tool id="dexseq-hts" name="DEXSeq" version="1.6.0">
+    <description> Inference of differential exon usage in RNA-Seq</description>
+    <command interpreter="bash"> 
+    ./../src/DEXseq-hts.sh 
+    $anno_input
+    $sPaired
+    $library_type
+    $minQUAL
+    $dexseq_out $dexseq_out.extra_files_path
+#for $i in $replicate_groups
+#for $j in $i.replicates
+$j.bam_alignment:#slurp
+#end for
+
+#end for
+    >> $Log_File </command>
+    <inputs>
+    
+    <!-- Genome annotation in GFF/GTF/GFF3 file --> 
+	<param format="gff,gtf,gff3" name="anno_input" type="data" label="Reference genome annotation file" help="A tab delimited format for storing sequence features and annotations (GFF/GTF/GFF3)."/>
+
+    <!-- RNA-Seq read alignment files-->  
+    <repeat name="replicate_groups" title="Replicate group" min="2">
+        <repeat name="replicates" title="Replicate">
+        <param format="bam" name="bam_alignment" type="data" label="BAM file of aligned RNA-Seq reads" help="Can be generated from SAM file using the SAMTools."/> 
+        </repeat>
+    </repeat>
+
+    <!--Paired -->
+    <param name="sPaired" type="select" display="radio" label="Is this library mate-paired?" help="Indicates whether the data is paired-end (default: single-end)">
+        <option value="no" selected="true">Single-end</option>
+        <option value="yes">Paired-end</option>
+    </param>
+
+    <!-- RNA-Seq library type-->
+    <param name="library_type" type="select" display="radio" label="Strand-specific library type" help="Indicates RNA-Seq library preparation protocol (default: strand-specific assay)">
+        <option value="yes">Stranded</option>
+        <option value="no">Non stranded</option>
+        <option value="reverse">Reverse</option>
+    </param>
+
+    <!-- Read alignment quality-->
+    <param name="minQUAL" type="integer" min="0" value="10" label="Minimum alignment quality" help="Skip all reads with alignment quality lower than the given minimum value (default: 10)"/>
+  </inputs>
+
+  <outputs>
+    <data format="txt" name="dexseq_out" label="${tool.name} on ${on_string}: Differential Expression"/>
+    <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: Log"/>
+  </outputs>
+
+  <tests>
+    <test> 
+    </test>
+  </tests> 
+
+  <help>
+
+.. class:: infomark
+
+**What it does** 
+
+DEXSeq_ is focused on finding differential exon usage using RNA-seq exon counts between samples with different experimental designs. 
+
+.. _DEXSeq: http://www.bioconductor.org/packages/release/bioc/html/DEXSeq.html
+
+`DEXSeq` requires:
+
+Genome annotation in GFF file type, containing the necessary information about the transcripts that are to be quantified.
+
+The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments, The program will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefore not recommended!
+
+------
+
+**Licenses**
+
+If **DEXSeq** is used to obtain results for scientific publications it
+should be cited as [1]_.
+
+**References** 
+
+.. [1] Simon Anders and Alejandro Reyes and Wolfgang Huber (2012): `Detecting differential usage of exons from RNA-seq data`_. 
+
+.. _Detecting differential usage of exons from RNA-seq data: http://genome.cshlp.org/content/early/2012/06/21/gr.133744.111.full.pdf+html
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**GFF/GTF format** General Feature Format/Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and protein sequences. GFF3 lines have nine tab-separated fields:
+
+1. seqid - The name of a chromosome or scaffold.
+2. source - The program that generated this feature.
+3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+5. stop - The ending position of the feature (inclusive).
+6. score - A score between 0 and 1000. If there is no score value, enter ".".
+7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+9. attributes - All lines with the same group are linked together into a single item.
+
+For more information see http://www.sequenceontology.org/gff3.shtml
+
+**BAM format** The Sequence Alignment/Map (SAM) format is a
+tab-limited text format that stores large nucleotide sequence
+alignments. BAM is the binary version of a SAM file that allows for
+fast and intensive data processing. The format specification and the
+description of SAMtools can be found on
+http://samtools.sourceforge.net/.
+
+------
+
+DEXSeq-hts Wrapper Version 0.1 (Sept. 2013)
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/setup_dexseq-hts.sh	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -e 
+
+DIR=`dirname $0`
+. ${DIR}/./bin/dexseq_config.sh
+
+echo ==========================================
+echo  DEXSeq-hts setup script \(DEXSeq version $DEXSEQ_VERSION\) 
+echo ==========================================
+echo
+echo SAMTools directory \(currently set to \"$SAMTOOLS_DIR\", system version used if left empty\)
+read SAMTOOLS_DIR
+if [ "$SAMTOOLS_DIR" == "" ];
+then
+	if [ "$(which samtools)" != "" ] ;
+	then
+		SAMTOOLS_DIR=$(dirname $(which samtools)) 
+	else
+		echo samtools not found
+		exit -1 ;
+	fi
+fi
+echo '=>' Setting SAMTools directory to \"$SAMTOOLS_DIR\"
+echo
+
+echo Path to the python binary \(currently set to \"$PYTHON_PATH\", system version used, if left empty\)
+read PYTHON_PATH
+if [ "$PYTHON_PATH" == "" ];
+then
+    PYTHON_PATH=`which python`
+	if [ "$PYTHON_PATH" == "" ];
+	then
+		echo python not found
+		exit -1 
+	fi
+fi
+echo '=>' Setting Python path to \"$PYTHON_PATH\"
+echo
+
+echo Path to HTSeq library files \(currently set to \"$PYTHONPATH\", system version is used if left empty\)
+read PYTHONPATH
+echo '=>' Setting HTSeq path to \"$PYTHONPATH\"
+echo
+
+echo Path to the R binary \(currently set to \"$R_PATH\", system version used, if left empty\)
+read R_PATH
+if [ "$R_PATH" == "" ];
+then
+    R_PATH=`which R`
+	if [ "$R_PATH" == "" ];
+	then
+		echo R not found
+		exit -1 
+	fi
+fi
+echo '=>' Setting R path to \"$R_PATH\"
+echo
+
+cp -p bin/dexseq_config.sh bin/dexseq_config.sh.bk
+grep -v -e SAMTOOLS_DIR -e PYTHON_PATH -e PYTHONPATH -e R_PATH -e $DEXSEQ_VERSION bin/dexseq_config.sh.bk > bin/dexseq_config.sh
+echo
+echo
+echo generating config file
+
+echo export DEXSEQ_VERSION=$DEXSEQ_VERSION >> bin/dexseq_config.sh
+echo export SAMTOOLS_DIR=$SAMTOOLS_DIR >> bin/dexseq_config.sh
+echo export PYTHON_PATH=$PYTHON_PATH >> bin/dexseq_config.sh
+echo export PYTHONPATH=$PYTHONPATH >> bin/dexseq_config.sh
+echo export R_PATH=$R_PATH >> bin/dexseq_config.sh
+
+echo
+echo Done.
+echo 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/src/DEXseq-hts.sh	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,98 @@
+#/bin/bash
+##
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Copyright (C) 2013 Memorial Sloan-Kettering Cancer Center 
+##
+
+set -e 
+
+PROG=`basename $0`
+DIR=`dirname $0`
+
+. ${DIR}/../bin/dexseq_config.sh
+
+echo
+echo ${PROG}: Oqtans http://galaxy.cbio.mskcc.org Galaxy wrapper for the DEXSeq version ${DEXSEQ_VERSION}.
+echo
+echo DEXSeq: Detecting differential usage of exons from RNA-seq data.
+echo 
+
+## input arguments from the interface 
+GFF_IN=${1}
+shift
+MATE_PAIR=${1}
+shift
+LIBTP=${1}
+shift
+minQL=${1}
+shift
+RES_FILE=${1}
+shift
+RES_WD=${1}
+shift
+
+## associated array with sequencing type.
+declare -A SEQ_TYPE=( [no]=SE [yes]=PE )
+
+echo %%%%%%%%%%%%%%%%%%%%%%%
+echo % 1. Data preparation %
+echo %%%%%%%%%%%%%%%%%%%%%%%
+echo
+
+mkdir -p ${RES_WD}
+echo extra file path $RES_WD
+tmpGTF=`mktemp --tmpdir=/tmp`
+
+echo load the genome annotation in GFF file 
+
+${PYTHON_PATH} ${DIR}/dexseq_prepare_annotation.py ${GFF_IN} ${tmpGTF}
+echo genome annotation stored in ${tmpGTF} 
+echo
+
+echo %%%%%%%%%%%%%%%%%%%%
+echo % 2. Read counting %
+echo %%%%%%%%%%%%%%%%%%%%
+echo
+
+tmpFILE=`mktemp --tmpdir=/tmp`
+echo $tmpFILE
+echo -e '\t'condition'\t'libType > ${tmpFILE}_CONDITIONS.tab
+
+COND=0
+for REPLICATE_GROUP in $@
+do
+    IFS=':'
+    COND=$((COND+1))
+    for BAM_FILE in ${REPLICATE_GROUP}
+    do
+        ## different group information 
+        REPNAME=$(basename ${BAM_FILE%.dat})
+        echo -e ${REPNAME}"\t"$COND"\t"${SEQ_TYPE[$MATE_PAIR]} >> ${tmpFILE}_CONDITIONS.tab
+        
+        ## counting the reads 
+        ${SAMTOOLS_DIR}/samtools view -h $BAM_FILE | ${PYTHON_PATH} ${DIR}/dexseq_count.py -p ${MATE_PAIR} -s ${LIBTP} -a ${minQL} ${tmpGTF} - ${RES_WD}/${REPNAME}
+
+        echo 
+    done
+    echo conuted condition ${COND} 
+done
+echo counted reads map to each exon.
+echo 
+
+echo %%%%%%%%%%%%%%%%%%%%%%%%%%%
+echo % 3. Differential testing %
+echo %%%%%%%%%%%%%%%%%%%%%%%%%%%
+echo
+
+echo "cat ${DIR}/run_DEXseq.R | $R_PATH --slave --args $tmpFILE $RES_WD $tmpGTF ${RES_FILE} $#" 
+cat ${DIR}/run_DEXseq.R | $R_PATH --slave --args $tmpFILE $RES_WD $tmpGTF ${RES_FILE} 
+
+## clean up
+rm -fr ${RES_WD} ${tmpGTF} ${tmpFILE} 
+echo %%%%%%%%
+echo % Done %
+echo %%%%%%%%
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/src/dexseq_count.py	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,173 @@
+import sys, itertools, optparse
+
+optParser = optparse.OptionParser( 
+   
+   usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>",
+   
+   description=
+      "This script counts how many reads in <sam_file> fall onto each exonic " +
+      "part given in <flattened_gff_file> and outputs a list of counts in " +
+      "<output_file>, for further analysis with the DEXSeq Bioconductor package. " +
+      "(Notes: The <flattened_gff_file> should be produced with the script " +
+      "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.",
+      
+   epilog = 
+      "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " +
+      "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
+      "Public License v3. Part of the 'DEXSeq' package." )
+      
+optParser.add_option( "-p", "--paired", type="choice", dest="paired",
+   choices = ( "no", "yes" ), default = "no",
+   help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" )
+
+optParser.add_option( "-s", "--stranded", type="choice", dest="stranded",
+   choices = ( "yes", "no", "reverse" ), default = "yes",
+   help = "'yes', 'no', or 'reverse'. Indicates whether the data is " +
+      "from a strand-specific assay (default: yes ). " +
+      "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " +
+      "preparation protocol. 'reverse' inverts strands and is neede for certain " +
+      "protocols, e.g. paired-end with circularization."  )
+   
+optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual",
+   default = 10,
+   help = "skip all reads with alignment quality lower than the given " +
+      "minimum value (default: 10)" )
+   
+if len( sys.argv ) == 1:
+   optParser.print_help()
+   sys.exit(1)
+
+(opts, args) = optParser.parse_args()
+
+if len( args ) != 3:
+   sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" )
+   sys.stderr.write( "  Call with '-h' to get usage information.\n" )
+   sys.exit( 1 )
+
+try:
+   import HTSeq
+except ImportError:
+   sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )   
+   sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )   
+   sys.exit(1)
+
+gff_file = args[0]
+sam_file = args[1]
+out_file = args[2]
+stranded = opts.stranded == "yes" or opts.stranded == "reverse"
+reverse = opts.stranded == "reverse"
+is_PE = opts.paired == "yes"
+minaqual = opts.minaqual
+
+if sam_file == "-":
+   sam_file = sys.stdin
+
+# Step 1: Read in the GFF file as generated by aggregate_genes.py
+# and put everything into a GenomicArrayOfSets
+
+features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded )     
+for f in  HTSeq.GFF_Reader( gff_file ):
+   if f.type == "exonic_part":
+      f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number']
+      features[f.iv] += f
+
+# initialise counters
+num_reads = 0
+counts = {}
+counts[ '_empty' ] = 0
+counts[ '_ambiguous' ] = 0
+counts[ '_lowaqual' ] = 0
+counts[ '_notaligned' ] = 0
+
+# put a zero for each feature ID
+for iv, s in features.steps():
+   for f in s:
+      counts[ f.name ] = 0
+
+#We need this little helper below:
+def reverse_strand( s ):
+   if s == "+":
+      return "-"
+   elif s == "-":
+      return "+"
+   else:
+      raise SystemError, "illegal strand"
+
+# Now go through the aligned reads
+
+if not is_PE:
+
+   num_reads = 0
+   for a in HTSeq.SAM_Reader( sam_file ):
+      if not a.aligned:
+         counts[ '_notaligned' ] += 1
+         continue
+      if a.aQual < minaqual:
+         counts[ '_lowaqual' ] += 1
+         continue
+      rs = set()
+      for cigop in a.cigar:
+         if cigop.type != "M":
+            continue
+         if reverse:
+            cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
+         for iv, s in features[cigop.ref_iv].steps( ):
+            rs = rs.union( s )
+      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
+      if len( set_of_gene_names ) == 0:
+         counts[ '_empty' ] += 1
+      elif len( set_of_gene_names ) > 1:
+         counts[ '_ambiguous' ] +=1
+      else:
+         for f in rs:
+            counts[ f.name ] += 1
+      num_reads += 1
+      if num_reads % 100000 == 0:
+         sys.stdout.write( "%d reads processed.\n" % num_reads )
+
+else: # paired-end
+
+   num_reads = 0
+   for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ):
+      rs = set()
+      if af and ar and not af.aligned and not ar.aligned:
+         counts[ '_notaligned' ] += 1
+         continue
+      if af and ar and not af.aQual < minaqual and ar.aQual < minaqual:
+         counts[ '_lowaqual' ] += 1
+         continue
+      if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys():
+         for cigop in af.cigar:
+            if cigop.type != "M":
+               continue
+            if reverse:
+               cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
+            for iv, s in features[cigop.ref_iv].steps():
+               rs = rs.union( s )
+      if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys():
+         for cigop in ar.cigar:
+            if cigop.type != "M":
+               continue
+            if not reverse:
+               cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
+            for iv, s in features[cigop.ref_iv].steps():
+                  rs = rs.union( s )
+      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
+      if len( set_of_gene_names ) == 0:
+         counts[ '_empty' ] += 1
+      elif len( set_of_gene_names ) > 1:
+         counts[ '_ambiguous' ] = 0
+      else:
+         for f in rs:
+            counts[ f.name ] += 1
+      num_reads += 1
+      if num_reads % 100000 == 0:
+         sys.stderr.write( "%d reads processed.\n" % num_reads )
+
+ 
+# Step 3: Write out the results
+
+fout = open( out_file, "w" )
+for fn in sorted( counts.keys() ):
+   fout.write( "%s\t%d\n" % ( fn, counts[fn] ) )
+fout.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/src/dexseq_prepare_annotation.py	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,131 @@
+import sys, collections, itertools, os.path
+
+if len( sys.argv ) != 3:
+   sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" )
+   sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) )
+   sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" )
+   sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" )
+   sys.stderr.write( "with the count_in_exons.py script.\n" )
+   sys.exit(1)
+
+try:
+   import HTSeq
+except ImportError:
+   sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )   
+   sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )   
+   sys.exit(1)
+
+gtf_file = sys.argv[1]
+out_file = sys.argv[2]
+
+## make sure that it can handle GFF files.
+parent_child_map = dict()
+for feature in HTSeq.GFF_Reader( gtf_file ):
+   if feature.type in ['mRNA', 
+      'transcript', 
+      'ncRNA', 
+      'miRNA', 
+      'pseudogenic_transcript', 
+      'rRNA', 
+      'snoRNA', 
+      'snRNA', 
+      'tRNA', 
+      'scRNA']:
+      parent_child_map[feature.attr['ID']] = feature.attr['Parent']
+
+# Step 1: Store all exons with their gene and transcript ID 
+# in a GenomicArrayOfSets
+
+exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
+for f in HTSeq.GFF_Reader( gtf_file ):
+   if not f.type in ["exon", "pseudogenic_exon"]:
+      continue
+   if not f.attr.get('gene_id'):
+      f.attr['gene_id'] = parent_child_map[f.attr['Parent']]
+      f.attr['transcript_id'] = f.attr['Parent']
+   f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" )
+   exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] )
+
+# Step 2: Form sets of overlapping genes
+
+# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
+# contains IDs of genes that overlap, i.e., share bases (on the same strand).
+# The keys of 'gene_sets' are the IDs of all genes, and each key refers to
+# the set that contains the gene.
+# Each gene set forms an 'aggregate gene'.
+
+gene_sets = collections.defaultdict( lambda: set() )
+for iv, s in exons.steps():
+   # For each step, make a set, 'full_set' of all the gene IDs occuring
+   # in the present step, and also add all those gene IDs, whch have been
+   # seen earlier to co-occur with each of the currently present gene IDs.
+   full_set = set()
+   for gene_id, transcript_id in s:
+      full_set.add( gene_id )
+      full_set |= gene_sets[ gene_id ]
+   # Make sure that all genes that are now in full_set get associated
+   # with full_set, i.e., get to know about their new partners
+   for gene_id in full_set:
+      assert gene_sets[ gene_id ] <= full_set
+      gene_sets[ gene_id ] = full_set
+
+
+# Step 3: Go through the steps again to get the exonic sections. Each step
+# becomes an 'exonic part'. The exonic part is associated with an
+# aggregate gene, i.e., a gene set as determined in the previous step, 
+# and a transcript set, containing all transcripts that occur in the step.
+# The results are stored in the dict 'aggregates', which contains, for each
+# aggregate ID, a list of all its exonic_part features.
+
+aggregates = collections.defaultdict( lambda: list() )
+for iv, s in exons.steps( ):
+   # Skip empty steps
+   if len(s) == 0:
+      continue
+   # Take one of the gene IDs, find the others via gene sets, and
+   # form the aggregate ID from all of them   
+   gene_id = list(s)[0][0]
+   assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] 
+   aggregate_id = '+'.join( gene_sets[ gene_id ] )
+   # Make the feature and store it in 'aggregates'
+   f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv )   
+   f.source = os.path.basename( sys.argv[1] )
+   f.attr = {}
+   f.attr[ 'gene_id' ] = aggregate_id
+   transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) )
+   f.attr[ 'transcripts' ] = '+'.join( transcript_set )
+   aggregates[ aggregate_id ].append( f )
+
+
+# Step 4: For each aggregate, number the exonic parts
+
+aggregate_features = []
+for l in aggregates.values():
+   for i in xrange( len(l)-1 ):
+      assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name"
+      assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early"
+      if l[i].iv.chrom != l[i+1].iv.chrom:
+         raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )
+      if l[i].iv.strand != l[i+1].iv.strand:
+         raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )
+   aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", 
+      HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, 
+         l[-1].iv.end, l[0].iv.strand ) )
+   aggr_feat.source = os.path.basename( sys.argv[1] )
+   aggr_feat.attr = { 'gene_id': aggr_feat.name }
+   for i in xrange( len(l) ):
+      l[i].attr['exonic_part_number'] = "%03d" % ( i+1 )
+   aggregate_features.append( aggr_feat )
+      
+      
+# Step 5: Sort the aggregates, then write everything out
+
+aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) )
+
+fout = open( out_file, "w" ) 
+for aggr_feat in aggregate_features:
+   fout.write( aggr_feat.get_gff_line() )
+   for f in aggregates[ aggr_feat.name ]:
+      fout.write( f.get_gff_line() )
+
+fout.close()      
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dexseq-hts_1.0/src/run_DEXseq.R	Tue Oct 08 08:22:45 2013 -0400
@@ -0,0 +1,73 @@
+### load DEXSeq package
+suppressMessages(require("DEXSeq"))
+
+### get arguments 1: INFILE, 2: OUTFILE 3:SIZE
+args <- commandArgs()
+INFILE<-args[4]
+EXTRAPATH<-args[5]
+annodb<-args[6]
+OUTFILE<-args[7]
+
+INFILE_CONDS=c(paste(INFILE, "_CONDITIONS.tab", sep=""))
+
+### read count data from file
+condsTable <- read.delim( INFILE_CONDS, header=TRUE, stringsAsFactors=TRUE, row.names=1 )
+#head(condsTable)
+
+conditions<-factor( condsTable[ , 1] )
+#print(conditions)
+
+## unique condition to define the pair of tests 
+uniq_conds <- unique(conditions)
+#print(uniq_conds)
+
+## all possible pairs of conditions
+pw_tests <- list()
+for(i in 1:(length(uniq_conds)-1)) {
+    for(j in (i+1):length(uniq_conds)) {
+        pw_tests[[length(pw_tests)+1]] <- c(uniq_conds[i], uniq_conds[j])
+    }
+}
+#print(pw_tests)
+
+tab <- NULL
+## testing all possible pairs of conditions
+for(i in 1:length(pw_tests)) {
+    ## header name 
+    test_pair_name <- c(paste(pw_tests[[i]][1], "__vs__", pw_tests[[i]][2], sep=""))
+    print(test_pair_name)
+
+    sub.data <- subset(condsTable, (conditions %in% c(pw_tests[[i]][1],pw_tests[[i]][2])))
+    sub.data[[1]]<-as.factor(sub.data[[1]])
+
+    ecs = read.HTSeqCounts(countfiles=file.path(EXTRAPATH, row.names(sub.data)), design=sub.data, flattenedfile=annodb)
+    
+    ## Normalisation
+    ecs <- estimateSizeFactors(ecs)
+    print(sizeFactors(ecs))
+
+    suppressMessages(require("parallel"))
+    ## Dispersion estimation 
+    ecs <- estimateDispersions(ecs, nCores=2)
+    ecs <- fitDispersionFunction(ecs)
+
+    ## Testing for differential exon usage
+    ecs <- testForDEU(ecs, nCores=2)
+
+    ## estimate the fold change 
+    ecs <- estimatelog2FoldChanges(ecs, nCores=2)
+
+    ## Result table
+    resultTable <- DEUresultTable(ecs)
+    print(head(resultTable))
+
+    colnames(resultTable) <- paste(test_pair_name, colnames(resultTable), sep=":")
+    #print(colnames(resultTable))
+
+    if(is.null(tab)) {
+        tab<- resultTable
+    }
+    else tab<- cbind(tab, resultTable)
+}
+## printing the result to out file 
+write.table(tab, OUTFILE, quote=F, sep="\t", row.names=F)