changeset 0:b737d0ed42be draft default tip

Uploaded
author brenninc
date Tue, 21 Jun 2016 03:38:52 -0400
parents
children
files data_reader.xml directory_copier.py test-data/other.fastq test-data/other.fastq.gz test-data/sample1.fastq test-data/sample1.fastq.gz tool-data/directory_data.loc.sample tool_data_table_conf.xml.sample
diffstat 8 files changed, 312 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_reader.xml	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,157 @@
+<tool id="directory_table_reader" name="Directory Data Reader" version="0.2">
+    <description>Reads data from preconfigured directories table.</description>
+    <command interpreter="python">
+        <![CDATA[
+        ## Build the directory_copier.py call from the selected directory_data row.
+        ## The listing is always written; files are only placed in the "output"
+        ## directory when the user asked for data.
+        directory_copier.py
+            --ending ".${directory.fields.original_extension}"
+            --new_ending ".${directory.fields.galaxy_extension}"
+            #if $results.required=="data"
+                ## --decompress and --link are value-less flags in directory_copier.py.
+                ## Passing the table value ("Yes"/"No") after --decompress switched
+                ## decompression on for every row, so choose the flag here instead.
+                #if str($directory.fields.decompress).strip().lower() == "yes"
+                    --decompress
+                #else
+                    --link
+                #end if
+                #if $results.start
+                    --start "$results.start"
+                #end if
+                #if $results.last
+                    --last "$results.last"
+                #end if
+                ## NOTE(review): the regex parameter accepts every printable character
+                ## and is placed inside double quotes; confirm this is acceptable for
+                ## the shell on this server.
+                #if $results.regex
+                    --regex "$results.regex"
+                #end if
+            #end if
+            --path "${directory.fields.path}"
+            --list "${listing}"
+        ]]>
+    </command>
+    <inputs>
+        <!-- Row of the directory_data table to read from. -->
+        <param name="directory" type="select" label="Directory to import data from">
+            <options from_data_table="directory_data"/>
+            <validator type="no_options" message="No Data Directory Setup"/>
+        </param>
+        <!-- Free text that is inserted into the labels of the outputs. -->
+        <param name="list_name" type="text" size="25" label="output name" value="input data"/>
+        <!-- Choose between importing the files or only listing their names. -->
+        <conditional name="results">
+            <param name="required" type="select" label="Download data or just directory listing" help="Select type of action required.">
+                <option value="data" selected="true">Data and listing of selected type</option>
+                <option value="listing">Get listing of selected file types </option>
+            </param>
+            <when value="data">
+                <!-- Optional file name filters; an empty value means the filter is not applied. -->
+                <param name="start" type="text" value="" label="String which must be at the start of each file name" />
+                <param name="last" type="text" value="" label="String which must be at the end of the file name (excluding the file type)" />
+                <param name="regex" type="text" value="" label="Regex pattern which must appear somewhere in the file name (excluding the file type)" >
+                    <!-- Allow every printable character so regex syntax is not replaced by the default sanitizer. -->
+                    <sanitizer>
+                        <valid initial="string.printable"/>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="listing"/>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- Text file with the names of the detected files. -->
+        <data name="listing" format="txt" label="List of files in $list_name"/>
+        <!-- Collection discovered from the "output" directory; only present when data was requested.
+             Ideally Galaxy can get the type from the file extensions. If so just add the type here. -->
+        <collection name="data_collection" type="list" label="$list_name">
+            <filter>(results['required'] == 'data')</filter>
+            <discover_datasets pattern="__designation_and_ext__" directory="output" visible="true"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Listing only: names are reported with the Galaxy extension. -->
+        <test>
+            <param name="directory" value="fastq.gz_files_id" />
+            <param name="list_name" value="test_files" />
+            <param name="results|required" value="listing"/>
+            <output name="listing">
+                <assert_contents>
+                    <has_line line="sample1.fastqsanger" />
+                    <has_line line="other.fastqsanger" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Data from gz files. The output name must be "listing", the only data output the tool declares. -->
+        <test>
+            <param name="directory" value="fastq.gz_files_id" />
+            <output name="listing">
+                <assert_contents>
+                    <has_line line="sample1.fastqsanger" />
+                </assert_contents>
+            </output>
+            <output_collection name="data_collection" type="list">
+                <element name="sample1" ftype="fastqsanger" file="sample1.fastq" />
+                <element name="other" ftype="fastqsanger" file="other.fastq" />
+            </output_collection>
+        </test>
+        <!-- Start filter: only files whose name begins with "sam". -->
+        <test>
+            <param name="directory" value="fastq_files_id" />
+            <param name="results|start" value="sam" />
+            <output name="listing">
+                <assert_contents>
+                    <has_line line="sample1.fastq" />
+                    <not_has_text text="other.fastq" />
+                </assert_contents>
+            </output>
+            <output_collection name="data_collection" type="list">
+                <element name="sample1" ftype="fastq" file="sample1.fastq" />
+            </output_collection>
+        </test>
+        <!-- Regex filter: only files whose name contains a match for "le.?". -->
+        <test>
+            <param name="directory" value="fastq_files_id" />
+            <param name="results|regex" value="le.?" />
+            <output name="listing">
+                <assert_contents>
+                    <has_line line="sample1.fastq" />
+                    <not_has_text text="other.fastq" />
+                </assert_contents>
+            </output>
+            <output_collection name="data_collection" type="list">
+                <element name="sample1" ftype="fastq" file="sample1.fastq" />
+            </output_collection>
+        </test>
+    </tests>
+
+    <help>
+<![CDATA[
+This tool will lookup files on the Galaxy server machine, including mounted directories.
+
+Only directories and ending combinations set up by a Galaxy admin can be listed or loaded in this way.
+These endings are case sensitive.
+
+====
+
+The data options will look for all files that have a particular ending in the selected directory.
+
+The tool will return two things.
+
+1. A Dataset collection of all the detected files. (If data requested)
+
+2. A file with the names of all the detected files. These will be sorted in the same order as galaxy builds the dataset collection. 
+
+The files can be filtered by setting a specific start string for the file name. 
+Only files that start with this string (case sensitive) will be included.
+
+Files can also be filtered by the last part of the name before the file extension.
+
+Files can also be filtered by a regex pattern.  
+Only files that contain the regex string will be included.
+This uses the Python search function, so a file matches as long as the regex pattern is found somewhere in the file name (excluding the extension).
+
+The three filters (start, last and regex) work independently if supplied, so only files that pass all supplied tests will be included.
+
+Note: Before applying the last and regex tests, the ending (including the . just before the ending) is removed.
+
+Assuming the directory has:
+C01_R1_001.fasta   C01_R2_001.fasta   C02_R1_001.fasta   C02_R2_002.fasta
+
+Setting start C01 will return just the C01 files:   C01_R1_001.fasta   C01_R2_001.fasta
+
+Setting last R1_001 will return the read1 files:   C01_R1_001.fasta   C02_R1_001.fasta
+
+Setting regex R2_00.$ will return the R2 files:   C01_R2_001.fasta   C02_R2_002.fasta
+
+As Galaxy detects the file type based on the extension, this tool will change the extension as set up by the admin.
+
+This tool will unzip gz files if requested to by the admin.
+
+]]>
+    </help>
+    <citations>
+    </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/directory_copier.py	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,106 @@
+import gzip
+import optparse  # using optparse as hydra still python 2.6
+import os.path
+import re
+import shutil
+import sys
+
+def report_error(*args):
+    sys.stderr.write(' '.join(map(str,args)) + '\n')
+    sys.stderr.flush()
+    sys.exit(1)
+
+
+def check_pattern_get_new_name(a_file, ending, options):
+    if options.start:
+        if not(a_file.startswith(options.start)):
+            return None
+    name = a_file[:-len(ending)]
+    if name.endswith("."):
+        name = name[:-1]
+    if options.last:
+        if not(name.endswith(last)):
+            return None
+    if options.regex:
+        pattern = re.compile(options.regex)
+        if pattern.search(name) is None:
+            return None
+    if options.new_ending:
+        if options.new_ending[0] ==".":
+            return name + options.new_ending
+        else:
+            return name + "." + options.new_ending
+    if options.decompress:
+        if a_file.endswith(".gz"):
+            return a_file[:-3]
+    return a_file
+
+
+def check_and_get_new_name(a_file, options):
+    for ending in options.endings:
+        if a_file.endswith(ending):
+            return check_pattern_get_new_name (a_file, ending, options)
+    return None
+
+
+def link(a_file, new_name, path):
+    file_path = os.path.join(os.path.realpath(path), a_file)
+    sym_path = os.path.join(os.path.realpath("output"), new_name)
+    os.link(file_path, sym_path)
+
+
+def decompress(a_file, new_name, path):
+    file_path = os.path.join(os.path.realpath(path), a_file)
+    target_path = os.path.join(os.path.realpath("output"), new_name)
+    with gzip.open(file_path, 'rb') as f_in, open(target_path, 'wb') as f_out:
+        shutil.copyfileobj(f_in, f_out)
+
+
+def copy_and_link(path, options):
+    if options.decompress or options.link:
+        os.mkdir("output")
+    with open(options.list, 'w') as list_file:
+        files = os.listdir(path)
+        files.sort()
+        for a_file in files:
+            new_name = check_and_get_new_name(a_file, options)
+            if new_name:
+                list_file.write(new_name)
+                list_file.write("\n")
+                if options.decompress:
+                    if a_file.endswith(".gz"):
+                        decompress(a_file, new_name,path)
+                    else:
+                        link(a_file, new_name, path)
+                elif options.link:
+                    link(a_file, new_name, path)
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option("--path", action="store", type="string",
+                      help="Path of directory to check. ")
+    parser.add_option("--ending", action="append", type="string", dest="endings",
+                      help="Ending that can be listed and if requested linked or decompressed. ")
+    parser.add_option("--start", action="store", type="string",
+                      help="String that must be at the start of the file name ")
+    parser.add_option("--last", action="store", type="string",
+                      help="String that must be the last bit of the file name before the endings")
+    parser.add_option("--regex", action="store", type="string",
+                      help="Regex for file names not including the endings")
+    parser.add_option("--new_ending", action="store", type="string", 
+                      help="New ending to replace any previous ending in list and if required links or decompressions. Note: If not set decompression will auto remove the compressioned part of the ending")
+    parser.add_option("--list", action="store", type="string",
+                      help="Path to where all files should be listed. ")
+    parser.add_option("--link", action="store_true", default=False,
+                      help="If set will cause links to be added in output directory. ")
+    parser.add_option("--decompress", action="store_true", default=False,
+                      help="If set will cause gz files to be decompressed or if not a supported decompression ending linked.")
+    (options, args) = parser.parse_args()
+
+
+    path = options.path.strip()
+    if path[-1] != '/':
+        path = path + "/"
+    copy_and_link(path, options)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/other.fastq	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,12 @@
+@SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50
+GATTTGTATGAAAGTATACAACTAAAACTGCAGGTGGATCAGAGTAAGTC
++SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50
+hhhhgfhhcghghggfcffdhfehhhhcehdchhdhahehffffde`bVd
+@SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50
+TTGCCTGCCTATCATTTTAGTGCCTGTGAGGTGGAGATGTGAGGATCAGT
++SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50
+hhhhhhhhhhghhghhhhhfhhhhhfffffe`ee[`X]b[d[ed`[Y[^Y
+@SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50
+TGCATGATCTTCAGTGCCAGGACCTTATCAAGCGGTTTGGTCCCTTTGTT
++SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50
+dhhhgchhhghhhfhhhhhdhhhhehhghfhhhchfddffcffafhfghe
Binary file test-data/other.fastq.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample1.fastq	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,12 @@
+@SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50
+TTGCCTGCCTATCATTTTAGTGCCTGTGAGGTGGAGATGTGAGGATCAGT
++SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50
+hhhhhhhhhhghhghhhhhfhhhhhfffffe`ee[`X]b[d[ed`[Y[^Y
+@SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50
+GATTTGTATGAAAGTATACAACTAAAACTGCAGGTGGATCAGAGTAAGTC
++SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50
+hhhhgfhhcghghggfcffdhfehhhhcehdchhdhahehffffde`bVd
+@SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50
+TGCATGATCTTCAGTGCCAGGACCTTATCAAGCGGTTTGGTCCCTTTGTT
++SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50
+dhhhgchhhghhhfhhhhhdhhhhehhghfhhhchfddffcffafhfghe
Binary file test-data/sample1.fastq.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/directory_data.loc.sample	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,19 @@
+#This file lists the directories that can be read in
+
+#This file has the format (white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<original_extension>	<galaxy_extension>	<decompress>	<path>
+#
+#original_extension should not include the starting .
+#
+#galaxy_extension should be one listed in galaxy/config/datatypes_conf.xml (or xml.sample)
+#
+#decompress should be No or Yes
+#
+#So, directory_data.loc could look something like this: (whitespace is tabs)
+#
+#john_12	john_12	John's fastq files batch 12	fastq.gz	fastqsanger	Yes	/data/john/batch12
+#
+#Your directory_data.loc file should contain an entry for each path and extension pair
+#
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Jun 21 03:38:52 2016 -0400
@@ -0,0 +1,6 @@
+<tables>
+    <!-- Declares the "directory_data" table: one row per directory and extension pair,
+         loaded from tool-data/directory_data.loc; lines starting with # are comments.
+         The column names below are the field names available for each row. -->
+    <table name="directory_data" comment_char="#">
+        <columns>value, dbkey, name, original_extension, galaxy_extension, decompress, path</columns>
+        <file path="tool-data/directory_data.loc" />
+    </table>
+</tables>