changeset 0:58ad7b512590 draft default tip

Uploaded
author brenninc
date Thu, 12 May 2016 09:48:15 -0400
parents
children
files name_changer.py subread_featurecounts.xml tool-data/gene_transfer.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 5 files changed, 303 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/name_changer.py	Thu May 12 09:48:15 2016 -0400
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+import optparse
+import os.path
+
+
+def fix_header_line(start_header, header_line, new_names):
+    """Return header_line with its per-file column names replaced by new_names.
+
+    start_header is the list of fixed leading column names the line must open with.
+    Raises Exception if there are too few columns, the start differs or the counts differ.
+    """
+    header_parts = header_line.split("\t")
+    if len(header_parts) <= len(start_header):
+        raise Exception("Only found {0} columns in second (header) line expected at least {1}.".format(len(header_parts), (len(start_header) + 1)))
+    data_headers = header_parts[:len(start_header)]
+    if data_headers != start_header:
+        raise Exception("Unexpected start to second (header) line Found: {0}".format(data_headers))
+    file_headers = header_parts[len(start_header):]
+    if len(file_headers) != len(new_names):
+        raise Exception("Found {0} file columns in header line, but {1} new_name paramters provided.".format(len(file_headers), len(new_names)))
+    return "\t".join(start_header) + "".join("\t" + name for name in new_names) + "\n"
+
+
+def clean_names(prefix, old_names):
+    """Return old_names without the start and end text shared by every name.
+
+    Each name is first stripped of surrounding whitespace and each result has
+    prefix put in front of it. A single name has nothing to be compared with,
+    so it is only stripped and prefixed.
+
+    The shared end is looked for in what remains once the shared start has
+    been removed, so the two can not overlap and cut the same characters
+    twice: ["ab", "abab"] gives ["", "ab"] rather than two empty names.
+    """
+    names = [name.strip() for name in old_names]
+    if len(names) > 1:
+        # commonprefix compares character by character, so any strings will do.
+        start = len(os.path.commonprefix(names))
+        names = [name[start:] for name in names]
+        # The shared end is the shared start of the reversed remainders.
+        shared_end = len(os.path.commonprefix([name[::-1] for name in names]))
+        if shared_end > 0:
+            # A slice ending at -0 would be empty, hence the guard.
+            names = [name[:-shared_end] for name in names]
+    return [prefix + name for name in names]
+
+
+
+def main():
+    """Rewrite the featureCounts count and summary files with new column names.
+
+    Names come from --names_file (cleaned by clean_names) or from repeated
+    --new_name options; --names_prefix is put in front of every name either way.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option("--raw_count_file", action="store", type="string", default=None, help="path to file original with the counts")
+    parser.add_option("--fixed_count_file", action="store", type="string", default=None, help="new path for renamaned counts file")
+    parser.add_option("--raw_summary_file", action="store", type="string", default=None, help="path to file original with the summary")
+    parser.add_option("--fixed_summary_file", action="store", type="string", default=None, help="new path for renamaned summary file")
+    parser.add_option("--names_file", action="store", type="string", default=None, help="path to file which contains the names.")
+    parser.add_option("--new_name", action="append", type="string", default=None,
+                      help="Names to be used. Must be the same length as in the raw_count_file")
+    parser.add_option("--names_prefix", action="store", type="string", default="", help="Prefix to add in from of every name.")
+    (options, args) = parser.parse_args()
+
+    # All four paths are required; say so instead of failing later on a None path.
+    for key in ("raw_count_file", "fixed_count_file", "raw_summary_file", "fixed_summary_file"):
+        if not getattr(options, key):
+            parser.error("No {0} parameter provided.".format(key))
+    if not os.path.exists(options.raw_count_file):
+        parser.error("Unable to find raw_count_file {0}.".format(options.raw_count_file))
+    if options.names_file:
+        if options.new_name:
+            parser.error("names_file parameter clashes with new_names paramter(s)")
+        if not os.path.exists(options.names_file):
+            parser.error("Unable to find names_file {0}.".format(options.names_file))
+        with open(options.names_file, "r") as names_file:
+            # Blank lines (typically a trailing one) are not names.
+            new_names = [line.strip() for line in names_file if line.strip()]
+        new_names = clean_names(options.names_prefix, new_names)
+    elif options.new_name:
+        new_names = [options.names_prefix + name for name in options.new_name]
+    else:
+        parser.error("No names_file or new_name paraters provided.")
+
+    # The single-argument call form prints the same text under Python 2 and 3.
+    print("Changing column names to {0}".format(new_names))
+    # Only the counts file starts with a job line, which is left out of the copy.
+    for raw_path, fixed_path, skipped, start_header in (
+            (options.raw_count_file, options.fixed_count_file, 1, ["Geneid", "Chr", "Start", "End", "Strand", "Length"]),
+            (options.raw_summary_file, options.fixed_summary_file, 0, ["Status"])):
+        with open(raw_path, "r") as input_file, open(fixed_path, "w") as output_file:
+            for _ in range(skipped):
+                input_file.readline()
+            output_file.write(fix_header_line(start_header, input_file.readline(), new_names))
+            for line in input_file:
+                output_file.write(line)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/subread_featurecounts.xml	Thu May 12 09:48:15 2016 -0400
@@ -0,0 +1,174 @@
+<tool id="subread_featurecounts" name="FeatureCount from subread" version="1.5.0-p1">
+    <description>Runs FeatureCount from subread</description>
+    <requirements>
+        <requirement type="package" version="1.5.0-p1">subread</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1" level="fatal" description="Error code 1 occurred" />
+        <exit_code range="2:255" level="fatal" description="Unknown error occurred" />
+    </stdio>
+    <command>
+        ## The steps are chained with a logical and, so a failed step stops the job with its own exit code.
+        featureCounts -p -t exon -g gene_id
+            #if $reference_source.reference_source_selector=='history':
+                -a "${reference_source.ref_file}"
+            #else
+                -a "${reference_source.ref_path.fields.path}"
+            #end if
+            -o counts
+            #if $names_source.names_source_selector=='manual':
+                #for $s in $names_source.input_serie
+                    "${s.input_file}"
+                #end for
+            #else
+                #for $input in $names_source.inputs
+                    "${input}"
+                #end for
+            #end if
+            &amp;&amp;
+        #if $names_source.names_source_selector in ["file","manual"]:
+            python "${__tool_directory__}/name_changer.py"
+                --raw_count_file counts --fixed_count_file "${output}"
+                --raw_summary_file counts.summary --fixed_summary_file "${summary}"
+                #if $names_source.names_source_selector=='file':
+                    --names_file "${names_source.names_file}"
+                    #if $names_source.names_prefix:
+                        --names_prefix "${names_source.names_prefix}"
+                    #end if
+                #else:
+                    #for $s in $names_source.input_serie
+                        --new_name "${s.new_name}"
+                    #end for
+                #end if
+        #else
+            cp counts  "${output}" &amp;&amp;
+            cp counts.summary  "${summary}"
+        #end if
+   </command>
+    <inputs>
+        <conditional name="reference_source">
+            <param name="reference_source_selector" type="select" label="Choose the source for the gene_transfer (gtf) file.">
+               <option value="cached">Locally cached</option>
+                <option value="history">History</option>
+            </param>
+            <when value="cached">
+                <param name="ref_path" type="select" label="Using reference gene transfer">
+                    <options from_data_table="gene_transfer"/>
+                    <validator type="no_options" message="A built-in reference gene transfer is not available."/>
+                </param>
+            </when>
+            <when value="history"> 
+                <param name="ref_file" type="data" format="gtf" label="Using reference file" />
+            </when>
+        </conditional>
+        <conditional name="names_source">
+            <param name="names_source_selector" type="select" label="How are the inputs organized?">
+                <option value="file">Collection of /Multiple bam file plus File with list of names.</option>
+                <option value="galaxy_path">Collection of /Multiple bam files, but no file with list of names.</option>
+                <option value="manual">Manually enter each file and a name for that file.</option>
+            </param>
+            <when value="file">
+                <param name="inputs" format="bam" multiple="True" label="Bam file(s) to count Features of" type="data" />
+                <param name="names_file" format="txt" 
+                    label="File which has the names for the columns. Note start and end strings shared by every name will be removed" 
+                    type="data" />
+                <param name="names_prefix" size="30" type="text" value="" label="Prefix to add before every column name"/>
+            </when>
+            <when value="manual">
+                <repeat name="input_serie" title="Files and names to add">
+                    <param name="input_file" format="bam" label="Bam file to count Features of" type="data" />
+                    <param name="new_name" size="30" type="text" value="" label="Name for that column"/>
+                </repeat>
+            </when>
+            <when value="galaxy_path">
+                <param name="inputs" format="bam" multiple="True" label="Bam file(s) to count Features of" type="data" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Counted Features" />
+        <data format="tabular" name="summary" label="Feature Count Summary" />
+    </outputs>
+    <tests> 
+        <!-- Test data too large to be included but can be found at 
+             https://github.com/Christian-B/galaxy_shedtools/tree/master/subread_featurecounts -->
+        <test>
+            <param ftype="bam" name="inputs" value="C75_sorted.bam" />
+            <param name="reference_source|reference_source_selector" value="cached"/>
+            <param ftype="gtf" name="reference_source|ref_path" value="gencode.vM5" />
+            <param name="names_source|names_source_selector" value="galaxy_path"/>
+            <output name="output" file="C75_FeatureCounts.tsv"  ftype="tabular" compare="sim_size" delta="10000"/>
+            <output name="summary">
+                <assert_contents>              
+                    <has_text text="Unassigned_Ambiguity" />
+                </assert_contents>              
+            </output>
+        </test>
+        <test>
+            <param ftype="bam" name="inputs" value="C01_sorted.bam,C02_sorted.bam,C75_sorted.bam" />
+            <param name="reference_source|reference_source_selector" value="history"/>
+            <param ftype="gtf" name="reference_source|ref_file" value="/home/christian/Dropbox/Gene_data/gencode.vM5.annotation.gtf" />
+            <param name="names_source|names_source_selector" value="file"/>
+            <param ftype="txt" name="names_source|names_file" value="names.dat" />
+            <param name="names_source|names_prefix" value="prefix_"/>
+            <output name="output">
+                <assert_contents>              
+                    <has_text text="prefix__1" />
+                    <has_text text="prefix__2" />
+                    <has_text text="prefix_75" />
+                </assert_contents>              
+            </output>
+            <output name="summary">
+                <assert_contents>              
+                    <has_text text="Unassigned_Ambiguity" />
+                </assert_contents>              
+            </output>
+        </test>
+        <test>
+            <param name="reference_source|reference_source_selector" value="history"/>
+            <param ftype="gtf" name="reference_source|ref_file" value="/home/christian/Dropbox/Gene_data/gencode.vM5.annotation.gtf" />
+            <param name="names_source|names_source_selector" value="manual"/>
+            <param ftype="bam" name="names_source|input_serie_0|input_file" value="C01_sorted.bam" />
+            <param name="names_source|input_serie_0|new_name" value="ForC1"/>
+            <param ftype="bam" name="names_source|input_serie_1|input_file" value="C02_sorted.bam" />
+            <param name="names_source|input_serie_1|new_name" value="ForC2"/>
+            <param ftype="bam" name="names_source|input_serie_2|input_file" value="C75_sorted.bam" />
+            <param name="names_source|input_serie_2|new_name" value="ForC75"/>
+            <output name="output">
+                <assert_contents>              
+                    <has_text text="ForC1" />
+                    <has_text text="ForC2" />
+                    <has_text text="ForC75" />
+                </assert_contents>              
+            </output>
+            <output name="summary">
+                <assert_contents>              
+                    <has_text text="Unassigned_Ambiguity" />
+                </assert_contents>              
+            </output>
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+This tool runs subread's featureCounts function.
+
+featureCounts labels the data columns with the input file names, which will be the .../000/024.dat style names used by Galaxy.
+
+This tool therefore post-processes the result, replacing these column names with values either found in a file or entered manually.
+When the names come from a file, every name can be prefixed with the same value.
+]]>
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{
+                Subread,
+                author = {Liao Y, Smyth GK and Shi W},
+                title = {Subread (incl. featureCounts) on SourceForge},
+                url = {http://subread.sourceforge.net/}
+            }
+        </citation>
+        <citation type="doi">10.1093/bioinformatics/btt656</citation>
+        <citation type="doi">10.1093/nar/gkt214</citation>
+    </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gene_transfer.loc.sample	Thu May 12 09:48:15 2016 -0400
@@ -0,0 +1,14 @@
+#This file lists the locations and dbkeys of all the gene transfer files
+
+#This file has the format (white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, gene_transfer.loc could look something like this:
+#
+#vm5	vm5	vM5 annotation	/path/to/vM5.annotation.gtf
+#
+#Your gene_transfer.loc file should contain an entry for each individual
+#gtf file. 
+#
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu May 12 09:48:15 2016 -0400
@@ -0,0 +1,7 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <table name="gene_transfer" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/gene_transfer.loc" />
+    </table>
+ </tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu May 12 09:48:15 2016 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="subread" version="1.5.0-p1">
+        <repository changeset_revision="7f2795b29d61" name="package_subread_1_5_0_p1" owner="brenninc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>