Mercurial > repos > brenninc > xdirectory_reader_limited_by_data_table
changeset 8:288a172e95aa draft default tip
Uploaded
author | brenninc |
---|---|
date | Mon, 09 May 2016 02:25:23 -0400 |
parents | 1d1b8eb0e6b7 |
children | |
files | data_reader.xml directory_copier.py test-data/other.fastq test-data/other.fastq.gz test-data/sample1.fastq test-data/sample1.fastq.gz test-data/sample2.txt tool-data/directory_data.loc.sample tool-data/white-list.ini tool_data_table_conf.xml.sample |
diffstat | 10 files changed, 281 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_reader.xml Mon May 09 02:25:23 2016 -0400 @@ -0,0 +1,126 @@ +<tool id="directory_table_reader" name="Directory Data Reader" version="0.2"> + <description>Reads data from preconfigured directories table.</description> + <command interpreter="python"> + <![CDATA[ + directory_copier.py + --ending .${directory.fields.original_extension} + --new_ending .${directory.fields.galaxy_extension} + #if $results.required=="data" + --new_ending .${directory.fields.galaxy_extension} + --decompress ${directory.fields.decompress} + #if $results.start + --start $results.start + #end if + #if $results.last + --last $results.last + #end if + #end if + --path ${directory.fields.path} + --list ${listing} + ]]> + </command> + <inputs> + <param name="directory" type="select" label="Directory to import data from"> + <options from_data_table="directory_data"/> + <validator type="no_options" message="No Data Directory Setup"/> + </param> + <param name="list_name" type="text" size="25" label="output name" value="input data"/> + <conditional name="results"> + <param name="required" type="select" label="Download data or just directory listing" help="Select type of action required."> + <option value="data" selected="true">Data and listing of selected type</option> + <option value="listing">Get listing of selected file types </option> + </param> + <when value="data"> + <param name="start" type="text" value="" label="String which must be at the start of each file name" /> + <param name="last" type="text" value="" label="String which must be at the end of the file name (excluding the file type)" /> + </when> + <when value="listing"/> + </conditional> + </inputs> + <outputs> + <data format="txt" name="listing" label="List of files in $list_name"> + </data> + <!-- Ideally galaxy can get the type based on the file extensions. 
If so just add the type here --> + <collection type="list" label="$list_name" name="data_collection"> + <filter>(results['required'] == 'data')</filter> + <discover_datasets pattern="__designation_and_ext__" directory="output" visible="true" /> + </collection> + </outputs> + <tests> + <test> + <param name="directory" value="fastq.gz_files_id" /> + <param name="list_name" value="test_files" /> + <param name="results|required" value="listing"/> + <output name="listing"> + <assert_contents> + <has_line line="sample1.fastqsanger" /> + <has_line line="other.fastqsanger" /> + </assert_contents> + </output> + </test> + <test> + <param name="directory" value="fastq.gz_files_id" /> + <output name="listing_fastq_gz"> + <assert_contents> + <has_line line="sample1.fastqsanger" /> + </assert_contents> + </output> + <output_collection name="data_collection" type="list"> + <element name="sample1" ftype="fastqsanger" file="sample1.fastq" /> + <element name="other" ftype="fastqsanger" file="other.fastq" /> + </output_collection> + </test> + <test> + <param name="directory" value="fastq_files_id" /> + <param name="results|start" value="sam" /> + <output name="listing_fastq"> + <assert_contents> + <has_line line="sample1.fastq" /> + <not_has_text text="other.fasta" /> + </assert_contents> + </output> + <output_collection name="data_collection" type="list"> + <element name="sample1" ftype="fastq" file="sample1.fastq" /> + </output_collection> + </test> + </tests> + + <help> +<![CDATA[ +This tool will lookup files on the Galaxy server machine, including mounted directories. + +Only directories and ending combinations set up by a Galaxy admin can be listed or loaded in this way. +These endings are case senitive. + +==== + +The data options will look for all files that have a particular ending in the selected directory. + +The tool will return two things. + +1. A Dataset collection of all the detected files. (If data requested) + +2. A file with the names of all the detected files. 
These will be sorted in the same order as Galaxy builds the dataset collection. + +The files can be filtered by setting a specific start string for the file name. +Only files that start with this string (case sensitive) will be included. + +Files can also be filtered by the last part of the name before the file extension. + +Assuming the directory has: +C01_R1_001.fasta C01_R2_001.fasta C02_R1_001.fasta C02_R2_001.fasta + +Setting start C01 will return just the C01 files: C01_R1_001.fasta C01_R2_001.fasta + +Setting last R1_001 will return the read1 files: C01_R1_001.fasta C02_R1_001.fasta + +As Galaxy detects the file type based on the extension, this tool will change the extension as set up by the admin. + +This tool will unzip gz files if requested to by the admin. + +]]> + </help> + <citations> + </citations> + +</tool>
"""List files in a configured directory and optionally link or gunzip them.

Command-line helper behind the "Directory Data Reader" Galaxy tool
(directory_copier.py).  It scans ``--path`` for files whose names end with
one of the ``--ending`` values, writes the (possibly renamed) file names to
``--list`` and, when asked to, places the files in an ``output`` directory
created in the current working directory, either as hard links or as
decompressed copies of ``.gz`` files.
"""
import errno
import gzip
import optparse  # using optparse as hydra still python 2.6
import os.path
import shutil
import sys

# Values that switch decompression off when they follow --decompress on the
# command line (the tool wrapper passes the "decompress" column, Yes or No).
_NEGATIVE_VALUES = ("no", "n", "false", "0")


def report_error(*args):
    """Write the space-joined arguments to stderr and exit with status 1."""
    sys.stderr.write(' '.join(map(str, args)) + '\n')
    sys.stderr.flush()
    sys.exit(1)


def check_pattern_get_new_name(a_file, ending, options):
    """Apply the start/last filters to a file and work out its output name.

    a_file  -- file name (no directory part) that already ends with ending.
    ending  -- the configured ending that matched a_file.
    options -- parsed options; reads start, last, new_ending and decompress.

    Returns None when the file is rejected by the --start or --last filter,
    otherwise the name the file should have in the listing / output directory.
    """
    if options.start and not a_file.startswith(options.start):
        return None

    if options.last:
        # Join "last" and the ending with exactly one dot between them,
        # whichever side (if any) already supplies it.  startswith/endswith
        # are used instead of indexing so an empty ending cannot raise.
        if not ending or ending.startswith(".") or options.last.endswith("."):
            last = options.last + ending
        else:
            last = options.last + "." + ending
        if not a_file.endswith(last):
            return None

    if options.new_ending:
        # Explicit length arithmetic: a_file[:-len(ending)] would yield ""
        # for an empty ending because -0 == 0.
        name = a_file[:len(a_file) - len(ending)]
        # Avoid a doubled dot when the ending had no leading dot of its own.
        # endswith() is safe when the file name is nothing but the ending.
        if options.new_ending.startswith(".") and name.endswith("."):
            name = name[:-1]
        return name + options.new_ending

    if options.decompress and a_file.endswith(".gz"):
        return a_file[:-3]
    return a_file


def check_and_get_new_name(a_file, options):
    """Return the output name for a_file, or None if it should be skipped.

    Only the first entry of options.endings that a_file ends with is
    considered; the start/last filters are then applied for that ending.
    """
    for ending in options.endings:
        if a_file.endswith(ending):
            return check_pattern_get_new_name(a_file, ending, options)
    return None


def link(a_file, new_name, path):
    """Hard-link path/a_file into the output directory as new_name.

    Hard links cannot span filesystems, so when the source lives on another
    device (errno EXDEV, e.g. a mounted data directory) the file is copied
    instead.  Any other OSError is re-raised unchanged.
    """
    file_path = os.path.join(os.path.realpath(path), a_file)
    link_path = os.path.join(os.path.realpath("output"), new_name)
    try:
        os.link(file_path, link_path)
    except OSError as err:
        if err.errno != errno.EXDEV:
            raise
        shutil.copyfile(file_path, link_path)


def decompress(a_file, new_name, path):
    """Gunzip path/a_file into the output directory as new_name."""
    file_path = os.path.join(os.path.realpath(path), a_file)
    target_path = os.path.join(os.path.realpath("output"), new_name)
    with gzip.open(file_path, 'rb') as f_in, open(target_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


def copy_and_link(path, options):
    """List the matching files of path and link/decompress them if requested.

    Creates the "output" directory in the current working directory, writes
    one output name per line to options.list (in sorted source-name order)
    and, depending on options.decompress / options.link, populates "output".
    """
    os.mkdir("output")
    with open(options.list, 'w') as list_file:
        for a_file in sorted(os.listdir(path)):
            new_name = check_and_get_new_name(a_file, options)
            if not new_name:
                continue
            list_file.write(new_name)
            list_file.write("\n")
            if options.decompress:
                # Files without a supported compression ending are linked.
                if a_file.endswith(".gz"):
                    decompress(a_file, new_name, path)
                else:
                    link(a_file, new_name, path)
            elif options.link:
                link(a_file, new_name, path)


def build_parser():
    """Create the option parser describing the command-line interface."""
    parser = optparse.OptionParser()
    parser.add_option("--path", action="store", type="string",
                      help="Path of directory to check. ")
    parser.add_option("--ending", action="append", type="string", dest="endings",
                      help="Ending that can be listed and if requested linked or decompressed. ")
    parser.add_option("--start", action="store", type="string",
                      help="String that must be at the start of the file name ")
    parser.add_option("--last", action="store", type="string",
                      help="String that must be the last bit of the file name before the endings")
    parser.add_option("--new_ending", action="store", type="string",
                      help="New ending to replace any previous ending in list and if required links or decompressions. Note: If not set decompression will auto remove the compressioned part of the ending")
    parser.add_option("--list", action="store", type="string",
                      help="Path to where all files should be listed. ")
    parser.add_option("--link", action="store_true", default=False,
                      help="If set will cause links to be added in output directory. ")
    parser.add_option("--decompress", action="store_true", default=False,
                      help="If set will cause gz files to be decompressed or if not a supported decompression ending linked.")
    return parser


def parse_options(argv=None):
    """Parse and validate the command line, returning the options object.

    argv -- argument list without the program name; None means sys.argv[1:].

    Exits through parser.error() (status 2) when --path, --ending or --list
    is missing, instead of failing later with an unrelated traceback.
    """
    parser = build_parser()
    (options, args) = parser.parse_args(argv)

    if not options.path or not options.path.strip():
        parser.error("--path is required")
    if not options.endings:
        parser.error("at least one --ending is required")
    if not options.list:
        parser.error("--list is required")

    # data_reader.xml calls "--decompress Yes" or "--decompress No".  As the
    # option is a plain flag, the value arrives as a positional argument and
    # used to be ignored, so "No" still decompressed.  Honour a negative
    # value, but keep delivering the files: the wrapper never passes --link
    # and relied on the decompress branch to link them.
    if options.decompress and any(
            value.strip().lower() in _NEGATIVE_VALUES for value in args):
        options.decompress = False
        options.link = True
    return options


def main(argv=None):
    """Entry point: parse the command line and process the directory."""
    options = parse_options(argv)
    path = options.path.strip()
    if not path.endswith('/'):
        path = path + "/"
    if not os.path.isdir(path):
        report_error("Directory not found:", path)
    copy_and_link(path, options)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/other.fastq Mon May 09 02:25:23 2016 -0400 @@ -0,0 +1,12 @@ +@SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50 +GATTTGTATGAAAGTATACAACTAAAACTGCAGGTGGATCAGAGTAAGTC ++SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50 +hhhhgfhhcghghggfcffdhfehhhhcehdchhdhahehffffde`bVd +@SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50 +TTGCCTGCCTATCATTTTAGTGCCTGTGAGGTGGAGATGTGAGGATCAGT ++SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50 +hhhhhhhhhhghhghhhhhfhhhhhfffffe`ee[`X]b[d[ed`[Y[^Y +@SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50 +TGCATGATCTTCAGTGCCAGGACCTTATCAAGCGGTTTGGTCCCTTTGTT ++SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50 +dhhhgchhhghhhfhhhhhdhhhhehhghfhhhchfddffcffafhfghe
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample1.fastq Mon May 09 02:25:23 2016 -0400 @@ -0,0 +1,12 @@ +@SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50 +TTGCCTGCCTATCATTTTAGTGCCTGTGAGGTGGAGATGTGAGGATCAGT ++SRR566546.970 HWUSI-EAS1673_11067_FC7070M:4:1:2299:1109 length=50 +hhhhhhhhhhghhghhhhhfhhhhhfffffe`ee[`X]b[d[ed`[Y[^Y +@SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50 +GATTTGTATGAAAGTATACAACTAAAACTGCAGGTGGATCAGAGTAAGTC ++SRR566546.971 HWUSI-EAS1673_11067_FC7070M:4:1:2374:1108 length=50 +hhhhgfhhcghghggfcffdhfehhhhcehdchhdhahehffffde`bVd +@SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50 +TGCATGATCTTCAGTGCCAGGACCTTATCAAGCGGTTTGGTCCCTTTGTT ++SRR566546.972 HWUSI-EAS1673_11067_FC7070M:4:1:2438:1109 length=50 +dhhhgchhhghhhfhhhhhdhhhhehhghfhhhchfddffcffafhfghe
--- a/test-data/sample2.txt Mon May 09 02:06:24 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/directory_data.loc.sample Mon May 09 02:25:23 2016 -0400 @@ -0,0 +1,19 @@ +#This file lists the directories that can be read in + +#This file has the format (white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <original_extension> <galaxy_extension> <decompress> <path> +# +#original_extension should not include the starting . +# +#galaxy_extension should be one listed in galaxy/config/datatypes_conf.xml (or xml.sample) +# +#decompress should be No or Yes +# +#So, directory_data.loc could look something like this: (whitespace is tabs) +# +#john_12 john_12 John's fastq files batch 12 fastq.gz fastqsanger Yes /data/john/batch12 +# +#Your directory_data.loc file should contain an entry for each path and extension pair +# +
--- a/tool-data/white-list.ini Mon May 09 02:06:24 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -# This file only works if saved as {tool}/tool-data/white_list.ini - -# Start of paths that will be accepted by the directory reader -# No jokers including * currently supported. -# Even files listed here will be checked against the black list - -# To accept all paths just keep line with a single slash -/ - -# Add directories absolulute for example -/home/joe_blog/galaxy_data - -# relative test_data as it only make sense for planemo tests -test-data/ -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon May 09 02:25:23 2016 -0400 @@ -0,0 +1,6 @@ +<tables> + <table name="directory_data" comment_char="#"> + <columns>value, dbkey, name, original_extension, galaxy_extension, decompress, path</columns> + <file path="tool-data/directory_data.loc" /> + </table> +</tables>