Mercurial > repos > jjohnson > defuse

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Fri Sep 16 12:41:37 2011 -0500
@@ -0,0 +1,33 @@
+The DeFuse galaxy tool is based on DeFuse_Version_0.4.2
+  http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
+
+DeFuse is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
+
+
+Manual:
+  http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+
+
+The included defuse source code is from: http://sourceforge.net/projects/defuse/files/defuse/0.4/defuse-0.4.2.tar.gz/download
+(without the defuse-0.4.2 dir level)
+tar zxf defuse-0.4.2.tar.gz
+cd tools
+make
+cd ..
+
+To use with non human genome references:
+tar zxf modified_scripts.tgz
+Defuse source was modified to include 2 extra parameters for non human references: gene_id_pattern and transcript_id_pattern
+	scripts/alignjob.pl
+	scripts/annotate_fusions.pl
+	scripts/calculate_expression_simple.pl
+	scripts/filter_bulk_fusion_reads.pl
+	scripts/filter_sam_genes.pl
+	scripts/find_concordant_ensembl.pl
+	scripts/find_gene_clusters.pl
+
+
+The defuse.xml galaxy tool wrapper will generate a defuse config.txt using values from tool-data/defuse.loc
+and call scripts/defuse.pl
+
+
Binary file defuse-0.4.2.tar.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/defuse.xml	Fri Sep 16 12:41:37 2011 -0500
@@ -0,0 +1,637 @@
+<tool id="defuse" name="DeFuse" version="1.0">
+ <description>identify fusion transcripts</description>
+ <requirements>
+  <requirement type="binary"></requirement>
+ </requirements>
+ <command interpreter="perl">
+  scripts/defuse.pl
+  -c `cp $defuse_config $config_txt; echo $defuse_config`
+  -d `mkdir -p data_dir; ln -s $left_pairendreads data_dir/reads_1.fastq; ln -s $right_pairendreads data_dir/reads_2.fastq; echo data_dir`
+  -o  output_dir -p 8
+ </command>
+ <inputs>
+  <param name="left_pairendreads" type="data" format="fastq" label="left part of read pairs" help="The left and right reads pairs must be in the same order, and not have any unpaired reads.  (FASTQ interlacer will pair reads and remove the unpaired.   FASTQ de-interlacer will separate the result into left and right reads.)"/>
+  <param name="right_pairendreads" type="data" format="fastq" label="right part of read pairs" help="In the same order as the left reads"/>
+  <conditional name="refGenomeSource">
+      <param name="genomeSource" type="select" label="Will you select a built-in DeFuse Reference Dataset, or supply a configuration from your history" help="">
+        <option value="indexed">Use a built-in DeFuse Reference Dataset</option>
+        <option value="history">Use a configuration from your history that specifies the DeFuse Reference Dataset</option>
+      </param>
+      <when value="indexed">
+        <param name="index" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
+          <options from_file="defuse.loc">
+            <column name="name" index="1"/>
+            <column name="value" index="2"/>
+            <filter type="sort_by" column="0" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+        <conditional name="defuse_param">
+          <param name="settings" type="select" label="Defuse parameter settings" help="">
+            <option value="preSet">Default settings</option>
+            <option value="full">Full parameter list</option>
+          </param>
+          <when value="preSet" />
+          <when value="full">
+            <param name="max_insert_size" type="integer" value="500" optional="true" label="Bowtie max_insert_size" />
+            <param name="dna_concordant_length" type="integer" value="2000" optional="true" label="Minimum gene fusion range dna_concordant_length" />
+            <param name="discord_read_trim" type="integer" value="50" optional="true" label="Trim length for discordant reads discord_read_trim" help="(split reads are not trimmed)" />
+            <param name="clustering_precision" type="float" value=".95" optional="true" label="Filter clustering_precision">
+              <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
+            </param>
+            <param name="span_count_threshold" type="integer" value="5" optional="true" label="Filter span_count_threshold" />
+            <param name="split_count_threshold" type="integer" value="3" optional="true" label="Filter split_count_threshold" />
+            <param name="percent_identity_threshold" type="float" value=".90" optional="true" label="Filter percent_identity_threshold">
+              <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
+            </param>
+            <param name="max_dist_pos" type="integer" value="600" optional="true" label="Filter max_dist_pos" />
+            <param name="num_dist_genes" type="integer" value="500" optional="true" label="Filter num_dist_genes" />
+            <param name="split_min_anchor" type="integer" value="4" optional="true" label="Filter split_min_anchor" />
+            <param name="max_concordant_ratio" type="float" value="0.1" optional="true" label="Filter max_concordant_ratio">
+              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
+            </param>
+            <param name="splice_bias" type="integer" value="10" optional="true" label="Filter splice_bias" />
+            <param name="probability_threshold" type="float" value="0.50" optional="true" label="Filter probability_threshold">
+              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
+            </param>
+            <param name="covariance_sampling_density" type="float" value="0.01" optional="true" label="covariance_sampling_density">
+              <help>Position density when calculating covariance</help>
+              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
+            </param>
+            <param name="denovo_assembly" type="select" label="denovo_assembly" help="">
+              <option value="">Use Default</option>
+              <option value="no">no</option>
+              <option value="yes">yes</option>
+            </param>
+            <!--
+              <param name="positive_controls" type="data" format="txt" optional=true label="Defuse positive_controls" help=""/>
+            -->
+          </when> <!-- full -->
+        </conditional>  <!-- defuse_param -->
+      </when>
+      <when value="history">
+        <param name="config" type="data" format="txt" label="Defuse Config file" help=""/>
+      </when>  <!-- history -->
+  </conditional>  <!-- refGenomeSource -->
+ </inputs>
+ <configfiles>
+  <configfile name="defuse_config">
+#import ast
+#if $refGenomeSource.genomeSource == "history":
+#include raw $refGenomeSource.config.__str__
+#else
+#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value))
+#
+# Configuration file for defuse
+#
+# At a minimum, change all values enclosed by []
+#
+# Gene/Transcript id pattern
+gene_id_pattern = #slurp
+#try
+$ref_dict['gene_id_pattern']
+#except
+ENSG\d+
+#end try
+transcript_id_pattern = #slurp
+#try
+$ref_dict['transcript_id_pattern']
+#except
+ENST\d+
+#end try
+
+# Directory where the defuse code was unpacked
+## Default location in the tool/defuse directory
+# source_directory = ${__root_dir__}/tools/defuse
+source_directory = #slurp
+#try
+$ref_dict['source_directory']
+#except
+${__root_dir__}/tools/defuse
+#end try
+
+# Directory where you want your dataset
+dataset_directory = #slurp
+#try
+$ref_dict['dataset_directory']
+#except
+/project/db/genomes/Hsapiens/hg19/defuse
+#end try
+
+# Input genome and gene models
+gene_models = #slurp
+#try
+$ref_dict['gene_models']
+#except
+\$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf
+#end try
+genome_fasta = #slurp
+#try
+$ref_dict['genome_fasta']
+#except
+\$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa
+#end try
+
+# Repeat table from ucsc genome browser
+repeats_filename = #slurp
+#try
+$ref_dict['repeats_filename']
+#except
+\$(dataset_directory)/rmsk.txt
+#end try
+
+# EST info downloaded from ucsc genome browser
+est_fasta = #slurp
+#try
+$ref_dict['est_fasta']
+#except
+\$(dataset_directory)/est.fa
+#end try
+est_alignments = #slurp
+#try
+$ref_dict['est_alignments']
+#except
+\$(dataset_directory)/intronEst.txt
+#end try
+
+# Unigene clusters downloaded from ncbi
+unigene_fasta = #slurp
+#try
+$ref_dict['unigene_fasta']
+#except
+\$(dataset_directory)/Hs.seq.uniq
+#end try
+
+# Paths to external tools
+bowtie_bin = #slurp
+#try
+$ref_dict['bowtie_bin']
+#except
+/soft/bowtie/0.12.7/bowtie
+#end try
+bowtie_build_bin = #slurp
+#try
+$ref_dict['bowtie_build_bin']
+#except
+/soft/bowtie/0.12.7/bowtie-build
+#end try
+blat_bin = #slurp
+#try
+$ref_dict['blat_bin']
+#except
+/soft/blat/34/bin/blat
+#end try
+fatotwobit_bin = #slurp
+#try
+$ref_dict['fatotwobit_bin']
+#except
+/soft/blat/34/bin/faToTwoBit
+#end try
+r_bin = #slurp
+#try
+$ref_dict['r_bin']
+#except
+/project/sdml-sles11-weblocal/R-2.12.1/bin/R
+#end try
+rscript_bin = #slurp
+#try
+$ref_dict['rscript_bin']
+#except
+/project/sdml-sles11-weblocal/R-2.12.1/bin/Rscript
+#end try
+
+#raw
+# Dataset files
+dataset_prefix       = $(dataset_directory)/defuse
+chromosome_prefix    = $(dataset_prefix).dna.chromosomes
+exons_fasta          = $(dataset_prefix).exons.fa
+cds_fasta            = $(dataset_prefix).cds.fa
+cdna_regions         = $(dataset_prefix).cdna.regions
+cdna_fasta           = $(dataset_prefix).cdna.fa
+reference_fasta      = $(dataset_prefix).reference.fa
+rrna_fasta           = $(dataset_prefix).rrna.fa
+ig_gene_list         = $(dataset_prefix).ig.gene.list
+repeats_regions      = $(dataset_directory)/repeats.regions
+est_split_fasta1     = $(dataset_directory)/est.1.fa
+est_split_fasta2     = $(dataset_directory)/est.2.fa
+est_split_fasta3     = $(dataset_directory)/est.3.fa
+est_split_fasta4     = $(dataset_directory)/est.4.fa
+est_split_fasta5     = $(dataset_directory)/est.5.fa
+est_split_fasta6     = $(dataset_directory)/est.6.fa
+est_split_fasta7     = $(dataset_directory)/est.7.fa
+est_split_fasta8     = $(dataset_directory)/est.8.fa
+est_split_fasta9     = $(dataset_directory)/est.9.fa
+
+# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
+prefilter1           = $(unigene_fasta)
+
+# deFuse scripts and tools
+scripts_directory    = $(source_directory)/scripts
+tools_directory      = $(source_directory)/tools
+data_directory       = $(source_directory)/data
+#end raw
+
+# Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk
+samtools_bin = #slurp
+#try
+$ref_dict['samtools_bin']
+#except
+\$(source_directory)/external/samtools-0.1.8/samtools
+#end try
+
+# Bowtie parameters
+bowtie_threads = #slurp
+#try
+$ref_dict['bowtie_threads']
+#except
+1
+#end try
+bowtie_quals = #slurp
+#try
+$ref_dict['bowtie_quals']
+#except
+--phred33-quals
+#end try
+max_insert_size = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "":
+$refGenomeSource.defuse_param.max_insert_size
+#else
+#try
+$ref_dict['max_insert_size']
+#except
+500
+#end try
+#end if
+
+# Parameters for building the dataset
+chromosomes = #slurp
+#try
+$ref_dict['chromosomes']
+#except
+1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
+#end try
+mt_chromosome = #slurp
+#try
+$ref_dict['mt_chromosome']
+#except
+MT
+#end try
+gene_sources = #slurp
+#try
+$ref_dict['gene_sources']
+#except
+IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding
+#end try
+ig_gene_sources = #slurp
+#try
+$ref_dict['ig_gene_sources']
+#except
+IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene
+#end try
+rrna_gene_sources = #slurp
+#try
+$ref_dict['rrna_gene_sources']
+#except
+Mt_rRNA,rRNA,rRNA_pseudogene
+#end try
+
+# Blat sequences per job
+num_blat_sequences = #slurp
+#try
+$ref_dict['num_blat_sequences']
+#except
+10000
+#end try
+
+# Minimum gene fusion range
+dna_concordant_length = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "":
+$refGenomeSource.defuse_param.dna_concordant_length
+#else
+#try
+$ref_dict['dna_concordant_length']
+#except
+2000
+#end try
+#end if
+
+# Trim length for discordant reads (split reads are not trimmed)
+discord_read_trim = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "":
+$refGenomeSource.defuse_param.discord_read_trim
+#else
+#try
+$ref_dict['discord_read_trim']
+#except
+50
+#end try
+#end if
+
+# Filtering parameters
+clustering_precision = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != ""
+$refGenomeSource.defuse_param.clustering_precision
+#else
+#try
+$ref_dict['clustering_precision']
+#except
+0.95
+#end try
+#end if
+span_count_threshold = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != ""
+$refGenomeSource.defuse_param.span_count_threshold
+#else
+#try
+$ref_dict['span_count_threshold']
+#except
+5
+#end try
+#end if
+split_count_threshold = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != ""
+$refGenomeSource.defuse_param.split_count_threshold
+#else
+#try
+$ref_dict['split_count_threshold']
+#except
+3
+#end try
+#end if
+percent_identity_threshold = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != ""
+$refGenomeSource.defuse_param.percent_identity_threshold
+#else
+#try
+$ref_dict['percent_identity_threshold']
+#except
+0.90
+#end try
+#end if
+max_dist_pos = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != ""
+$refGenomeSource.defuse_param.max_dist_pos
+#else
+#try
+$ref_dict['max_dist_pos']
+#except
+600
+#end try
+#end if
+num_dist_genes = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != ""
+$refGenomeSource.defuse_param.num_dist_genes
+#else
+#try
+$ref_dict['num_dist_genes']
+#except
+500
+#end try
+#end if
+split_min_anchor = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != ""
+$refGenomeSource.defuse_param.split_min_anchor
+#else
+#try
+$ref_dict['split_min_anchor']
+#except
+4
+#end try
+#end if
+max_concordant_ratio = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != ""
+$refGenomeSource.defuse_param.max_concordant_ratio
+#else
+#try
+$ref_dict['max_concordant_ratio']
+#except
+0.1
+#end try
+#end if
+splice_bias = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != ""
+$refGenomeSource.defuse_param.splice_bias
+#else
+#try
+$ref_dict['splice_bias']
+#except
+10
+#end try
+#end if
+denovo_assembly = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != ""
+$refGenomeSource.defuse_param.denovo_assembly
+#else
+#try
+$ref_dict['denovo_assembly']
+#except
+no
+#end try
+#end if
+probability_threshold = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != ""
+$refGenomeSource.defuse_param.probability_threshold
+#else
+#try
+$ref_dict['probability_threshold']
+#except
+0.50
+#end try
+#end if
+positive_controls                           = \$(data_directory)/controls.txt
+
+# Position density when calculating covariance
+covariance_sampling_density = #slurp
+#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != ""
+$refGenomeSource.defuse_param.covariance_sampling_density
+#else
+#try
+$ref_dict['covariance_sampling_density']
+#except
+0.01
+#end try
+#end if
+
+
+# Number of reads for each job in split
+reads_per_job                               = 1000000
+
+# Number of regions for each breakpoint sequence job in split
+regions_per_job                             = 20
+
+#raw
+# If you have command line 'mail' and wish to be notified
+# mailto                                      = andrew.mcpherson@gmail.com
+
+# Remove temp files
+remove_job_files                            = yes
+remove_job_temp_files                       = yes
+
+# Converting to fastq
+# Fastq converter config format 1 for reads stored in separate files for each end
+#  data_lane_regex_N is a perl regex which stores the lane id in $1
+#  data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1
+#  data_compress_regex_N is a perl regex which stores the compression extension in $1
+#  data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout
+# Fastq converter config format 2 for reads stored in separate files for each end
+#  data_lane_regex_N is a perl regex which stores the lane id in $1
+#  data_compress_regex_N is a perl regex which stores the compression extension in $1
+#  data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout
+#  data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout
+
+data_lane_regex_1                           = ^(.+)_[12]_export\.txt.*$
+data_end_regex_1                            = ^.+_([12])_export\.txt.*$
+data_compress_regex_1                       = ^.+_[12]_export\.txt(.*)$
+data_converter_1                            = $(scripts_directory)/fq_all2std.pl export2std
+
+data_lane_regex_2                           = ^(.+)_[12]_concat_qseq\.txt.*$
+data_end_regex_2                            = ^.+_([12])_concat_qseq\.txt.*$
+data_compress_regex_2                       = ^.+_[12]_concat_qseq\.txt(.*)$
+data_converter_2                            = $(scripts_directory)/qseq2fastq.pl
+
+data_lane_regex_3                           = ^(.+)\.bam.*$
+data_compress_regex_3                       = ^.+\.bam(.*)$
+data_end1_converter_3                       = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl
+data_end2_converter_3                       = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl
+
+data_lane_regex_4                           = ^(.+).[12].fastq.*$
+data_end_regex_4                            = ^.+.([12]).fastq.*$
+data_compress_regex_4                       = ^.+.[12].fastq(.*)$
+data_converter_4                            = cat
+#end raw
+
+#end if
+
+  </configfile>
+ </configfiles>
+ <outputs>
+  <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
+  <data format="txt" name="defuse_log" label="${tool.name} on ${on_string}: defuse.log" from_work_dir="output_dir/log/defuse.log"/>
+  <data format="tabular" name="results_tsv" label="${tool.name} on ${on_string}: results.tsv" from_work_dir="output_dir/results.tsv"/>
+  <data format="tabular" name="results_filtered_tsv" label="${tool.name} on ${on_string}: results.filtered.tsv" from_work_dir="output_dir/results.filtered.tsv"/>
+  <data format="tabular" name="results_classify_tsv" label="${tool.name} on ${on_string}: results.classify.tsv" from_work_dir="output_dir/results.classify.tsv"/>
+ </outputs>
+ <tests>
+ </tests>
+ <help>
+**DeFuse**
+
+DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
+
+Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
+
+.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
+
+------
+
+**Inputs**
+
+DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
+
+If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
+
+DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
+    - genome_fasta from Ensembl
+    - gene_models from Ensembl
+    - repeats_filename from UCSC RepeatMasker rmsk.txt
+    - est_fasta from UCSC
+    - est_alignments from UCSC intronEst.txt
+    - unigene_fasta from NCBI
+
+.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+
+------
+
+**Outputs**
+
+The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters,  the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
+
+DeFuse generates 3 results files: results.tsv, results.filtered.tsv, and results.classify.tsv. All three files have the same format, though results.classify.tsv has a probability column from the application of the classifier to results.tsv, and results.filtered.tsv has been filtered according to the threshold probability as set in config.txt.
+
+The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
+
+ - **Identification**
+    - cluster_id : random identifier assigned to each prediction
+    - library_name : library name given on the command line of defuse
+    - gene1 : ensembl id of gene 1
+    - gene2 : ensembl id of gene 2
+    - gene_name1 : name of gene 1
+    - gene_name2 : name of gene 2
+ - **Evidence**
+    - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
+    - concordant_ratio : proportion of spanning reads considered concordant by blat
+    - denovo_min_count : minimum kmer count across denovo assembled sequence
+    - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
+    - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
+    - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
+    - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
+    - min_map_count : minimum of the number of genomic mappings for each spanning read
+    - max_map_count : maximum of the number of genomic mappings for each spanning read
+    - mean_map_count : average of the number of genomic mappings for each spanning read
+    - num_multi_map : number of spanning reads that map to more than one genomic location
+    - span_count : number of spanning reads supporting the fusion
+    - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
+    - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
+    - span_coverage_min : minimum of span_coverage1 and span_coverage2
+    - span_coverage_max : maximum of span_coverage1 and span_coverage2
+    - splitr_count : number of split reads supporting the prediction
+    - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
+    - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
+    - splitr_sequence : fusion sequence predicted by split reads
+    - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
+ - **Annotation**
+    - adjacent : fusion between adjacent genes
+    - altsplice : fusion likely the product of alternative splicing between adjacent genes
+    - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
+    - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
+    - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
+    - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
+    - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
+    - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
+    - deletion : fusion produced by a genomic deletion
+    - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
+    - eversion : fusion produced by a genomic eversion
+    - exonboundaries : fusion splice at exon boundaries
+    - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
+    - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
+    - gene_chromosome1 : chromosome of gene 1
+    - gene_chromosome2 : chromosome of gene 2
+    - gene_end1 : end position for gene 1
+    - gene_end2 : end position for gene 2
+    - gene_location1 : location of breakpoint in gene 1
+    - gene_location2 : location of breakpoint in gene 2
+    - gene_start1 : start of gene 1
+    - gene_start2 : start of gene 2
+    - gene_strand1 : strand of gene 1
+    - gene_strand2 : strand of gene 2
+    - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
+    - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
+    - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
+    - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
+    - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
+    - interchromosomal : fusion produced by an interchromosomal translocation
+    - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
+    - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
+    - inversion : fusion produced by genomic inversion
+    - orf : fusion combines genes in a way that preserves a reading frame
+    - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.tsv)
+    - read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
+    - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
+    - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
+    - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
+    - splice_score : number of nucleotides similar to GTAG at fusion splice
+    - num_splice_variants : number of potential splice variants for this gene pair
+    - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
+    - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
+
+
+**Example**
+
+results.tsv::
+
+  cluster_id	splitr_sequence	splitr_count	splitr_span_pvalue	splitr_pos_pvalue	splitr_min_pvalue	adjacent	altsplice	break_adj_entropy1	break_adj_entropy2	break_adj_entropy_min	break_predict	breakpoint_homology	breakseqs_estislands_percident	cdna_breakseqs_percident	concordant_ratio	deletion	est_breakseqs_percident	eversion	exonboundaries	expression1	expression2	gene1	gene2	gene_align_strand1	gene_align_strand2	gene_chromosome1	gene_chromosome2	gene_end1	gene_end2	gene_location1	gene_location2	gene_name1	gene_name2	gene_start1	gene_start2	gene_strand1	gene_strand2	genome_breakseqs_percident	genomic_break_pos1	genomic_break_pos2	genomic_strand1	genomic_strand2	interchromosomal	interrupted_index1	interrupted_index2	inversion	library_name	max_map_count	max_repeat_proportion	mean_map_count	min_map_count	num_multi_map	num_splice_variants	orf	read_through	repeat_proportion1	repeat_proportion2	span_count	span_coverage1	span_coverage2	span_coverage_max	span_coverage_min	splice_score	splicing_index1	splicing_index2
+  1169	GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT	2	0.000436307890680442	0.110748295953850	0.0880671602973091	N	Y	3.19872427442695	3.48337348351473	3.19872427442695	splitr	0	0	0	0	Y	0	N	N	0	0	ENSG00000105549	ENSG00000213753	+	-	19	19	376013	59111168	intron	upstream	THEG	AC016629.2	361750	59084870	-	+	0	375099	386594	+	-	N	8.34107429512245	-	N	output_dir	82	0.677852348993289	40.6666666666667	1	11	1	N	N	0.361271676300578	0.677852348993289	12	0.758602776578432	0.569678713445872	0.758602776578432	0.569678713445872	2	0.416666666666667	-
+  3596	TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG	250	7.00711162298275e-72	0.00912124762512338	0.00684237452309549	N	N	3.31745197152461	3.47233119514066	3.31745197152461	splitr	7	0.0157657657657656	0	0	N	0.0135135135135136	N	N	0	0	ENSG00000156860	ENSG00000212932	-	+	16	21	30682131	48111157	coding	upstream	FBRS	RPL23AP4	30670289	48110676	+	+	0.0157657657657656	30680678	9827473	-	+	Y	-	-	N	output_dir	2	1	1.11111111111111	1	1	1	N	N	0	1	9	0.325530693397641	0.296465452915709	0.325530693397641	0.296465452915709	2	-	-
+
+ </help>
+</tool>
--- a/defuse/README	Fri Sep 16 13:07:35 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-The DeFuse galaxy tool is based on DeFuse_Version_0.4.2
-  http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
-
-DeFuse is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
-
-
-Manual:
-  http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
-
-
-The included defuse source code is from: http://sourceforge.net/projects/defuse/files/defuse/0.4/defuse-0.4.2.tar.gz/download
-(without the defuse-0.4.2 dir level)
-tar zxf defuse-0.4.2.tar.gz
-cd tools
-make
-cd ..
-
-To use with non human genome references:
-tar zxf modified_scripts.tgz
-Defuse source was modified to include 2 extra parameters for non human references: gene_id_pattern and transcript_id_pattern
-	scripts/alignjob.pl
-	scripts/annotate_fusions.pl
-	scripts/calculate_expression_simple.pl
-	scripts/filter_bulk_fusion_reads.pl
-	scripts/filter_sam_genes.pl
-	scripts/find_concordant_ensembl.pl
-	scripts/find_gene_clusters.pl
-
-
-The defuse.xml galaxy tool wrapper will generate a defuse config.txt using values from tool-data/defuse.loc
-and call scripts/defuse.pl
-
-
Binary file defuse/defuse-0.4.2.tar.gz has changed
--- a/defuse/defuse.xml	Fri Sep 16 13:07:35 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,637 +0,0 @@
-<tool id="defuse" name="DeFuse" version="1.0">
- <description>identify fusion transcripts</description>
- <requirements>
-  <requirement type="binary"></requirement>
- </requirements>
- <command interpreter="perl">
-  scripts/defuse.pl
-  -c `cp $defuse_config $config_txt; echo $defuse_config`
-  -d `mkdir -p data_dir; ln -s $left_pairendreads data_dir/reads_1.fastq; ln -s $right_pairendreads data_dir/reads_2.fastq; echo data_dir`
-  -o  output_dir -p 8
- </command>
- <inputs>
-  <param name="left_pairendreads" type="data" format="fastq" label="left part of read pairs" help="The left and right reads pairs must be in the same order, and not have any unpaired reads.  (FASTQ interlacer will pair reads and remove the unpaired.   FASTQ de-interlacer will separate the result into left and right reads.)"/>
-  <param name="right_pairendreads" type="data" format="fastq" label="right part of read pairs" help="In the same order as the left reads"/>
-  <conditional name="refGenomeSource">
-      <param name="genomeSource" type="select" label="Will you select a built-in DeFuse Reference Dataset, or supply a configuration from your history" help="">
-        <option value="indexed">Use a built-in DeFuse Reference Dataset</option>
-        <option value="history">Use a configuration from your history that specifies the DeFuse Reference Dataset</option>
-      </param>
-      <when value="indexed">
-        <param name="index" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
-          <options from_file="defuse.loc">
-            <column name="name" index="1"/>
-            <column name="value" index="2"/>
-            <filter type="sort_by" column="0" />
-            <validator type="no_options" message="No indexes are available" />
-          </options>
-        </param>
-        <conditional name="defuse_param">
-          <param name="settings" type="select" label="Defuse parameter settings" help="">
-            <option value="preSet">Default settings</option>
-            <option value="full">Full parameter list</option>
-          </param>
-          <when value="preSet" />
-          <when value="full">
-            <param name="max_insert_size" type="integer" value="500" optional="true" label="Bowtie max_insert_size" />
-            <param name="dna_concordant_length" type="integer" value="2000" optional="true" label="Minimum gene fusion range dna_concordant_length" />
-            <param name="discord_read_trim" type="integer" value="50" optional="true" label="Trim length for discordant reads discord_read_trim" help="(split reads are not trimmed)" />
-            <param name="clustering_precision" type="float" value=".95" optional="true" label="Filter clustering_precision">
-              <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
-            </param>
-            <param name="span_count_threshold" type="integer" value="5" optional="true" label="Filter span_count_threshold" />
-            <param name="split_count_threshold" type="integer" value="3" optional="true" label="Filter split_count_threshold" />
-            <param name="percent_identity_threshold" type="float" value=".90" optional="true" label="Filter percent_identity_threshold">
-              <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
-            </param>
-            <param name="max_dist_pos" type="integer" value="600" optional="true" label="Filter max_dist_pos" />
-            <param name="num_dist_genes" type="integer" value="500" optional="true" label="Filter num_dist_genes" />
-            <param name="split_min_anchor" type="integer" value="4" optional="true" label="Filter split_min_anchor" />
-            <param name="max_concordant_ratio" type="float" value="0.1" optional="true" label="Filter max_concordant_ratio">
-              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
-            </param>
-            <param name="splice_bias" type="integer" value="10" optional="true" label="Filter splice_bias" />
-            <param name="probability_threshold" type="float" value="0.50" optional="true" label="Filter probability_threshold">
-              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
-            </param>
-            <param name="covariance_sampling_density" type="float" value="0.01" optional="true" label="covariance_sampling_density">
-              <help>Position density when calculating covariance</help>
-              <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
-            </param>
-            <param name="denovo_assembly" type="select" label="denovo_assembly" help="">
-              <option value="">Use Default</option>
-              <option value="no">no</option>
-              <option value="yes">yes</option>
-            </param>
-            <!--
-              <param name="positive_controls" type="data" format="txt" optional=true label="Defuse positive_controls" help=""/>
-            -->
-          </when> <!-- full -->
-        </conditional>  <!-- defuse_param -->
-      </when>
-      <when value="history">
-        <param name="config" type="data" format="txt" label="Defuse Config file" help=""/>
-      </when>  <!-- history -->
-  </conditional>  <!-- refGenomeSource -->
- </inputs>
- <configfiles>
-  <configfile name="defuse_config">
-#import ast
-#if $refGenomeSource.genomeSource == "history":
-#include raw $refGenomeSource.config.__str__
-#else
-#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value))
-#
-# Configuration file for defuse
-#
-# At a minimum, change all values enclosed by []
-#
-# Gene/Transcript id pattern
-## Each key is written on its own line, followed by the value from the selected
-## defuse.loc entry or, when that entry lacks the key, the human Ensembl pattern.
-## The "transcript_id_pattern = " key must be emitted outside the gene_id_pattern
-## try-block; otherwise it is skipped whenever the gene_id_pattern fallback is used.
-gene_id_pattern = #slurp
-#try
-$ref_dict['gene_id_pattern']
-#except
-ENSG\d+
-#end try
-transcript_id_pattern = #slurp
-#try
-$ref_dict['transcript_id_pattern']
-#except
-ENST\d+
-#end try
-
-# Directory where the defuse code was unpacked
-## Default location in the tool/defuse directory
-# source_directory = ${__root_dir__}/tools/defuse
-source_directory = #slurp
-#try
-$ref_dict['source_directory']
-#except
-${__root_dir__}/tools/defuse
-#end try
-
-# Directory where you want your dataset
-dataset_directory = #slurp
-#try
-$ref_dict['dataset_directory']
-#except
-/project/db/genomes/Hsapiens/hg19/defuse
-#end try
-
-# Input genome and gene models
-gene_models = #slurp
-#try
-$ref_dict['gene_models']
-#except
-\$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf
-#end try
-genome_fasta = #slurp
-#try
-$ref_dict['genome_fasta']
-#except
-\$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa
-#end try
-
-# Repeat table from ucsc genome browser
-repeats_filename = #slurp
-#try
-$ref_dict['repeats_filename']
-#except
-\$(dataset_directory)/rmsk.txt
-#end try
-
-# EST info downloaded from ucsc genome browser
-est_fasta = #slurp
-#try
-$ref_dict['est_fasta']
-#except
-\$(dataset_directory)/est.fa
-#end try
-est_alignments = #slurp
-#try
-$ref_dict['est_alignments']
-#except
-\$(dataset_directory)/intronEst.txt
-#end try
-
-# Unigene clusters downloaded from ncbi
-unigene_fasta = #slurp
-#try
-$ref_dict['unigene_fasta']
-#except
-\$(dataset_directory)/Hs.seq.uniq
-#end try
-
-# Paths to external tools
-bowtie_bin = #slurp
-#try
-$ref_dict['bowtie_bin']
-#except
-/soft/bowtie/0.12.7/bowtie
-#end try
-bowtie_build_bin = #slurp
-#try
-$ref_dict['bowtie_build_bin']
-#except
-/soft/bowtie/0.12.7/bowtie-build
-#end try
-blat_bin = #slurp
-#try
-$ref_dict['blat_bin']
-#except
-/soft/blat/34/bin/blat
-#end try
-fatotwobit_bin = #slurp
-#try
-$ref_dict['fatotwobit_bin']
-#except
-/soft/blat/34/bin/faToTwoBit
-#end try
-r_bin = #slurp
-#try
-$ref_dict['r_bin']
-#except
-/project/sdml-sles11-weblocal/R-2.12.1/bin/R
-#end try
-rscript_bin = #slurp
-#try
-$ref_dict['rscript_bin']
-#except
-/project/sdml-sles11-weblocal/R-2.12.1/bin/Rscript
-#end try
-
-#raw
-# Dataset files
-dataset_prefix       = $(dataset_directory)/defuse
-chromosome_prefix    = $(dataset_prefix).dna.chromosomes
-exons_fasta          = $(dataset_prefix).exons.fa
-cds_fasta            = $(dataset_prefix).cds.fa
-cdna_regions         = $(dataset_prefix).cdna.regions
-cdna_fasta           = $(dataset_prefix).cdna.fa
-reference_fasta      = $(dataset_prefix).reference.fa
-rrna_fasta           = $(dataset_prefix).rrna.fa
-ig_gene_list         = $(dataset_prefix).ig.gene.list
-repeats_regions      = $(dataset_directory)/repeats.regions
-est_split_fasta1     = $(dataset_directory)/est.1.fa
-est_split_fasta2     = $(dataset_directory)/est.2.fa
-est_split_fasta3     = $(dataset_directory)/est.3.fa
-est_split_fasta4     = $(dataset_directory)/est.4.fa
-est_split_fasta5     = $(dataset_directory)/est.5.fa
-est_split_fasta6     = $(dataset_directory)/est.6.fa
-est_split_fasta7     = $(dataset_directory)/est.7.fa
-est_split_fasta8     = $(dataset_directory)/est.8.fa
-est_split_fasta9     = $(dataset_directory)/est.9.fa
-
-# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
-prefilter1           = $(unigene_fasta)
-
-# deFuse scripts and tools
-scripts_directory    = $(source_directory)/scripts
-tools_directory      = $(source_directory)/tools
-data_directory       = $(source_directory)/data
-#end raw
-
-# Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk
-samtools_bin = #slurp
-#try
-$ref_dict['samtools_bin']
-#except
-\$(source_directory)/external/samtools-0.1.8/samtools
-#end try
-
-# Bowtie parameters
-bowtie_threads = #slurp
-#try
-$ref_dict['bowtie_threads']
-#except
-1
-#end try
-bowtie_quals = #slurp
-#try
-$ref_dict['bowtie_quals']
-#except
---phred33-quals
-#end try
-max_insert_size = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "":
-$refGenomeSource.defuse_param.max_insert_size
-#else
-#try
-$ref_dict['max_insert_size']
-#except
-500
-#end try
-#end if
-
-# Parameters for building the dataset
-## Comma-separated chromosome names; falls back to the human set below when the
-## selected defuse.loc entry has no 'chromosomes' key. Uses the same subscript
-## lookup as every other $ref_dict stanza in this template.
-chromosomes = #slurp
-#try
-$ref_dict['chromosomes']
-#except
-1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
-#end try
-mt_chromosome = #slurp
-#try
-$ref_dict['mt_chromosome']
-#except
-MT
-#end try
-gene_sources = #slurp
-#try
-$ref_dict['gene_sources']
-#except
-IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding
-#end try
-ig_gene_sources = #slurp
-#try
-$ref_dict['ig_gene_sources']
-#except
-IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene
-#end try
-rrna_gene_sources = #slurp
-#try
-$ref_dict['rrna_gene_sources']
-#except
-Mt_rRNA,rRNA,rRNA_pseudogene
-#end try
-
-# Blat sequences per job
-num_blat_sequences = #slurp
-#try
-$ref_dict['num_blat_sequences']
-#except
-10000
-#end try
-
-# Minimum gene fusion range
-dna_concordant_length = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "":
-$refGenomeSource.defuse_param.dna_concordant_length
-#else
-#try
-$ref_dict['dna_concordant_length']
-#except
-2000
-#end try
-#end if
-
-# Trim length for discordant reads (split reads are not trimmed)
-discord_read_trim = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "":
-$refGenomeSource.defuse_param.discord_read_trim
-#else
-#try
-$ref_dict['discord_read_trim']
-#except
-50
-#end try
-#end if
-
-# Filtering parameters
-clustering_precision = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != ""
-$refGenomeSource.defuse_param.clustering_precision
-#else
-#try
-$ref_dict['clustering_precision']
-#except
-0.95
-#end try
-#end if
-span_count_threshold = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != ""
-$refGenomeSource.defuse_param.span_count_threshold
-#else
-#try
-$ref_dict['span_count_threshold']
-#except
-5
-#end try
-#end if
-split_count_threshold = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != ""
-$refGenomeSource.defuse_param.split_count_threshold
-#else
-#try
-$ref_dict['split_count_threshold']
-#except
-3
-#end try
-#end if
-percent_identity_threshold = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != ""
-$refGenomeSource.defuse_param.percent_identity_threshold
-#else
-#try
-$ref_dict['percent_identity_threshold']
-#except
-0.90
-#end try
-#end if
-max_dist_pos = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != ""
-$refGenomeSource.defuse_param.max_dist_pos
-#else
-#try
-$ref_dict['max_dist_pos']
-#except
-600
-#end try
-#end if
-num_dist_genes = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != ""
-$refGenomeSource.defuse_param.num_dist_genes
-#else
-#try
-$ref_dict['num_dist_genes']
-#except
-500
-#end try
-#end if
-split_min_anchor = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != ""
-$refGenomeSource.defuse_param.split_min_anchor
-#else
-#try
-$ref_dict['split_min_anchor']
-#except
-4
-#end try
-#end if
-max_concordant_ratio = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != ""
-$refGenomeSource.defuse_param.max_concordant_ratio
-#else
-#try
-$ref_dict['max_concordant_ratio']
-#except
-0.1
-#end try
-#end if
-splice_bias = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != ""
-$refGenomeSource.defuse_param.splice_bias
-#else
-#try
-$ref_dict['splice_bias']
-#except
-10
-#end try
-#end if
-denovo_assembly = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != ""
-$refGenomeSource.defuse_param.denovo_assembly
-#else
-#try
-$ref_dict['denovo_assembly']
-#except
-no
-#end try
-#end if
-probability_threshold = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != ""
-$refGenomeSource.defuse_param.probability_threshold
-#else
-#try
-$ref_dict['probability_threshold']
-#except
-0.50
-#end try
-#end if
-positive_controls                           = \$(data_directory)/controls.txt
-
-# Position density when calculating covariance
-covariance_sampling_density = #slurp
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != ""
-$refGenomeSource.defuse_param.covariance_sampling_density
-#else
-#try
-$ref_dict['covariance_sampling_density']
-#except
-0.01
-#end try
-#end if
-
-
-# Number of reads for each job in split
-reads_per_job                               = 1000000
-
-# Number of regions for each breakpoint sequence job in split
-regions_per_job                             = 20
-
-#raw
-# If you have command line 'mail' and wish to be notified
-# mailto                                      = andrew.mcpherson@gmail.com
-
-# Remove temp files
-remove_job_files                            = yes
-remove_job_temp_files                       = yes
-
-# Converting to fastq
-# Fastq converter config format 1 for reads stored in separate files for each end
-#  data_lane_regex_N is a perl regex which stores the lane id in $1
-#  data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1
-#  data_compress_regex_N is a perl regex which stores the compression extension in $1
-#  data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout
-# Fastq converter config format 2 for reads from both ends stored in a single file
-#  data_lane_regex_N is a perl regex which stores the lane id in $1
-#  data_compress_regex_N is a perl regex which stores the compression extension in $1
-#  data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout
-#  data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout
-
-data_lane_regex_1                           = ^(.+)_[12]_export\.txt.*$
-data_end_regex_1                            = ^.+_([12])_export\.txt.*$
-data_compress_regex_1                       = ^.+_[12]_export\.txt(.*)$
-data_converter_1                            = $(scripts_directory)/fq_all2std.pl export2std
-
-data_lane_regex_2                           = ^(.+)_[12]_concat_qseq\.txt.*$
-data_end_regex_2                            = ^.+_([12])_concat_qseq\.txt.*$
-data_compress_regex_2                       = ^.+_[12]_concat_qseq\.txt(.*)$
-data_converter_2                            = $(scripts_directory)/qseq2fastq.pl
-
-data_lane_regex_3                           = ^(.+)\.bam.*$
-data_compress_regex_3                       = ^.+\.bam(.*)$
-data_end1_converter_3                       = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl
-data_end2_converter_3                       = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl
-
-data_lane_regex_4                           = ^(.+).[12].fastq.*$
-data_end_regex_4                            = ^.+.([12]).fastq.*$
-data_compress_regex_4                       = ^.+.[12].fastq(.*)$
-data_converter_4                            = cat
-#end raw
-
-#end if
-
-  </configfile>
- </configfiles>
- <outputs>
-  <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
-  <data format="txt" name="defuse_log" label="${tool.name} on ${on_string}: defuse.log" from_work_dir="output_dir/log/defuse.log"/>
-  <data format="tabular" name="results_tsv" label="${tool.name} on ${on_string}: results.tsv" from_work_dir="output_dir/results.tsv"/>
-  <data format="tabular" name="results_filtered_tsv" label="${tool.name} on ${on_string}: results.filtered.tsv" from_work_dir="output_dir/results.filtered.tsv"/>
-  <data format="tabular" name="results_classify_tsv" label="${tool.name} on ${on_string}: results.classify.tsv" from_work_dir="output_dir/results.classify.tsv"/>
- </outputs>
- <tests>
- </tests>
- <help>
-**DeFuse**
-
-DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
-
-Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
-
-.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
-
-------
-
-**Inputs**
-
-DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
-
-If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
-
-DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
-    - genome_fasta from Ensembl
-    - gene_models from Ensembl
-    - repeats_filename from UCSC RepeatMasker rmsk.txt
-    - est_fasta from UCSC
-    - est_alignments from UCSC intronEst.txt
-    - unigene_fasta from NCBI
-
-.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
-
-------
-
-**Outputs**
-
-The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters,  the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
-
-DeFuse generates 3 results files: results.tsv, results.filtered.tsv, and results.classify.tsv. All three files have the same format, though results.classify.tsv has a probability column from the application of the classifier to results.tsv, and results.filtered.tsv has been filtered according to the threshold probability as set in config.txt.
-
-The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
-
- - **Identification**
-    - cluster_id : random identifier assigned to each prediction
-    - library_name : library name given on the command line of defuse
-    - gene1 : ensembl id of gene 1
-    - gene2 : ensembl id of gene 2
-    - gene_name1 : name of gene 1
-    - gene_name2 : name of gene 2
- - **Evidence**
-    - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
-    - concordant_ratio : proportion of spanning reads considered concordant by blat
-    - denovo_min_count : minimum kmer count across denovo assembled sequence
-    - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
-    - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
-    - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
-    - min_map_count : minimum of the number of genomic mappings for each spanning read
-    - max_map_count : maximum of the number of genomic mappings for each spanning read
-    - mean_map_count : average of the number of genomic mappings for each spanning read
-    - num_multi_map : number of spanning reads that map to more than one genomic location
-    - span_count : number of spanning reads supporting the fusion
-    - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
-    - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
-    - span_coverage_min : minimum of span_coverage1 and span_coverage2
-    - span_coverage_max : maximum of span_coverage1 and span_coverage2
-    - splitr_count : number of split reads supporting the prediction
-    - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - splitr_sequence : fusion sequence predicted by split reads
-    - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
- - **Annotation**
-    - adjacent : fusion between adjacent genes
-    - altsplice : fusion likely the product of alternative splicing between adjacent genes
-    - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
-    - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
-    - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
-    - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
-    - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
-    - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
-    - deletion : fusion produced by a genomic deletion
-    - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
-    - eversion : fusion produced by a genomic eversion
-    - exonboundaries : fusion splice at exon boundaries
-    - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
-    - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
-    - gene_chromosome1 : chromosome of gene 1
-    - gene_chromosome2 : chromosome of gene 2
-    - gene_end1 : end position for gene 1
-    - gene_end2 : end position for gene 2
-    - gene_location1 : location of breakpoint in gene 1
-    - gene_location2 : location of breakpoint in gene 2
-    - gene_start1 : start of gene 1
-    - gene_start2 : start of gene 2
-    - gene_strand1 : strand of gene 1
-    - gene_strand2 : strand of gene 2
-    - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
-    - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
-    - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
-    - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
-    - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
-    - interchromosomal : fusion produced by an interchromosomal translocation
-    - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
-    - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
-    - inversion : fusion produced by genomic inversion
-    - orf : fusion combines genes in a way that preserves a reading frame
-    - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.tsv)
-    - read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
-    - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
-    - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
-    - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
-    - splice_score : number of nucleotides similar to GTAG at fusion splice
-    - num_splice_variants : number of potential splice variants for this gene pair
-    - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
-    - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
-
-
-**Example**
-
-results.tsv::
-
-  cluster_id	splitr_sequence	splitr_count	splitr_span_pvalue	splitr_pos_pvalue	splitr_min_pvalue	adjacent	altsplice	break_adj_entropy1	break_adj_entropy2	break_adj_entropy_min	break_predict	breakpoint_homology	breakseqs_estislands_percident	cdna_breakseqs_percident	concordant_ratio	deletion	est_breakseqs_percident	eversion	exonboundaries	expression1	expression2	gene1	gene2	gene_align_strand1	gene_align_strand2	gene_chromosome1	gene_chromosome2	gene_end1	gene_end2	gene_location1	gene_location2	gene_name1	gene_name2	gene_start1	gene_start2	gene_strand1	gene_strand2	genome_breakseqs_percident	genomic_break_pos1	genomic_break_pos2	genomic_strand1	genomic_strand2	interchromosomal	interrupted_index1	interrupted_index2	inversion	library_name	max_map_count	max_repeat_proportion	mean_map_count	min_map_count	num_multi_map	num_splice_variants	orf	read_through	repeat_proportion1	repeat_proportion2	span_count	span_coverage1	span_coverage2	span_coverage_max	span_coverage_min	splice_score	splicing_index1	splicing_index2
-  1169	GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT	2	0.000436307890680442	0.110748295953850	0.0880671602973091	N	Y	3.19872427442695	3.48337348351473	3.19872427442695	splitr	0	0	0	0	Y	0	N	N	0	0	ENSG00000105549	ENSG00000213753	+	-	19	19	376013	59111168	intron	upstream	THEG	AC016629.2	361750	59084870	-	+	0	375099	386594	+	-	N	8.34107429512245	-	N	output_dir	82	0.677852348993289	40.6666666666667	1	11	1	N	N	0.361271676300578	0.677852348993289	12	0.758602776578432	0.569678713445872	0.758602776578432	0.569678713445872	2	0.416666666666667	-
-  3596	TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG	250	7.00711162298275e-72	0.00912124762512338	0.00684237452309549	N	N	3.31745197152461	3.47233119514066	3.31745197152461	splitr	7	0.0157657657657656	0	0	N	0.0135135135135136	N	N	0	0	ENSG00000156860	ENSG00000212932	-	+	16	21	30682131	48111157	coding	upstream	FBRS	RPL23AP4	30670289	48110676	+	+	0.0157657657657656	30680678	9827473	-	+	Y	-	-	N	output_dir	2	1	1.11111111111111	1	1	1	N	N	0	1	9	0.325530693397641	0.296465452915709	0.325530693397641	0.296465452915709	2	-	-
-
- </help>
-</tool>
Binary file defuse/modified_scripts.tgz has changed
--- a/defuse/tool-data/defuse.loc.sample	Fri Sep 16 13:07:35 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-## Configurstion info for prepared data references for DeFuse
-## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
-## 3 columns separated by the TAB character
-## The 3rd column has dictionary values that will be substituted in the config file for defuse
-## It should likely contain keys:   dataset_directory gene_models genome_fasta repeats_filename est_fasta est_alignments unigene_fasta
-## If this is not a Homo_sapiens reference also need keys:  gene_id_pattern transcript_id_pattern chromosomes
-
-#db_key	name	{'config_key':'config_value'}
-hg19	GRCh37(hg19)	{'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSG\d+', 'transcript_id_pattern':'ENST\d+', 'dataset_directory':'/project/db/genomes/Hsapiens/hg19/defuse', 'gene_models':'$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf', 'genome_fasta':'$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Hs.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'}
-mm9	NCBIM37(mm9)	{'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+', 'dataset_directory':'/project/db/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM37.63.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM37.63.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'}
Binary file modified_scripts.tgz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/defuse.loc.sample	Fri Sep 16 12:41:37 2011 -0500
@@ -0,0 +1,10 @@
+## Configuration info for prepared data references for DeFuse
+## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+## 3 columns separated by the TAB character
+## The 3rd column has dictionary values that will be substituted in the config file for defuse
+## It should likely contain keys:   dataset_directory gene_models genome_fasta repeats_filename est_fasta est_alignments unigene_fasta
+## If this is not a Homo_sapiens reference, these keys are also needed:  gene_id_pattern transcript_id_pattern chromosomes
+
+#db_key	name	{'config_key':'config_value'}
+hg19	GRCh37(hg19)	{'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSG\d+', 'transcript_id_pattern':'ENST\d+', 'dataset_directory':'/project/db/genomes/Hsapiens/hg19/defuse', 'gene_models':'$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf', 'genome_fasta':'$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Hs.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'}
+mm9	NCBIM37(mm9)	{'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+', 'dataset_directory':'/project/db/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM37.63.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM37.63.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'}