# HG changeset patch # User artbio # Date 1560873546 14400 # Node ID 7afc0515a30765262cc565a1ee774bf0172fcb7c # Parent 62aebaf6cfa0aefcbe3b4151095caa965bd937ef planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/concat_multi_datasets commit 4b572d4605dfc1d5cfe2b46c9f0061d041e63df9 diff -r 62aebaf6cfa0 -r 7afc0515a307 catWrapper.xml --- a/catWrapper.xml Fri May 10 10:15:02 2019 -0400 +++ b/catWrapper.xml Tue Jun 18 11:59:06 2019 -0400 @@ -1,4 +1,4 @@ - + tail-to-head by specifying how > '$out_file1' && + #else: + $concat_command '$file' >> '$out_file1' && + #end if #end for - > '$out_file1' + sleep 1 #else: #for $file in $global_condition.inputs #if $file.ext[-2:] == "gz": @@ -26,15 +29,19 @@ sleep 1 #end if #else if $global_condition.input_type == "simple_collections": - #if $global_condition.collections_condition.collection_cat_type == "two_collections": mkdir concatenated && #if $dataset_names == "No": - #for $x, $y in zip($global_condition.collections_condition.input_1, $global_condition.collections_condition.input_2) - $concat_command '$x' '$y' > concatenated/'${x.element_identifier}.listed.${x.ext}.listed' && + #for $x, $y in zip($global_condition.input_1, $global_condition.input_2): + #if $x.ext[-2:] == "gz": + gzip -dc '$x' | $concat_command | gzip -c > concatenated/'${x.element_identifier}.listed.${x.ext}.listed' && + gzip -dc '$y' | $concat_command | gzip -c >> concatenated/'${x.element_identifier}.listed.${x.ext}.listed' && + #else: + $concat_command '$x' '$y' > concatenated/'${x.element_identifier}.listed.${x.ext}.listed' && + #end if #end for sleep 1 #else: - #for $x, $y in zip($global_condition.collections_condition.input_1, $global_condition.collections_condition.input_2) + #for $x, $y in zip($global_condition.input_1, $global_condition.input_2) #if $x.ext[-2:] == "gz": printf "# ${x.element_identifier}\n" | gzip -c > concatenated/'${x.element_identifier}_${y.element_identifier}.listed.${x.ext}.listed' && gzip -dc 
'$x' | $concat_command | gzip -c >> concatenated/'${x.element_identifier}_${y.element_identifier}.listed.${x.ext}.listed' && @@ -49,82 +56,142 @@ #end for sleep 1 #end if - #end if #else if $global_condition.input_type == "paired_collection": #if $global_condition.paired_cat_type == "by_strand": + mkdir concatenated && #if $dataset_names == "No": #for $file in $global_condition.inputs - $concat_command - $file['forward'] - >> '$forward' && - $concat_command - $file['reverse'] - >> '$reverse' && + #if $file['forward'].ext[-2:] == "gz": + gzip -dc $file['forward'] | $concat_command | gzip -c >> concatenated/forward.listed.${file['forward'].ext}.listed && + gzip -dc $file['reverse'] | $concat_command | gzip -c >> concatenated/reverse.listed.${file['reverse'].ext}.listed && + #else: + $concat_command $file['forward'] >> concatenated/forward.listed.${file['forward'].ext}.listed && + $concat_command $file['reverse'] >> concatenated/reverse.listed.${file['reverse'].ext}.listed && + #end if #end for sleep 1 #else: - #for $file in $global_condition.inputs.keys() - printf "# ${file}_forward\n" >> '$forward' && - $concat_command - $global_condition.inputs[$file]['forward'] - >> '$forward' && - printf "# ${file}_reverse\n" >> '$reverse' && - $concat_command - $global_condition.inputs[$file]['reverse'] - >> '$reverse' && + #for $file in $global_condition.inputs.keys(): + #if $global_condition.inputs[$file]['forward'].ext[-2:] == "gz": + printf "# ${file}_forward\n" | gzip -c >> concatenated/forward.listed.${global_condition.inputs[$file]['forward'].ext}.listed && + gzip -dc $global_condition.inputs[$file]['forward'] | $concat_command | gzip -c >> concatenated/forward.listed.${global_condition.inputs[$file]['forward'].ext}.listed && + printf "# ${file}_reverse\n" | gzip -c >> concatenated/reverse.listed.${global_condition.inputs[$file]['reverse'].ext}.listed && + gzip -dc $global_condition.inputs[$file]['reverse'] | $concat_command | gzip -c >> 
concatenated/reverse.listed.${global_condition.inputs[$file]['reverse'].ext}.listed && + #else: + printf "# ${file}_forward\n" >> concatenated/forward.listed.${global_condition.inputs[$file]['forward'].ext}.listed && + $concat_command $global_condition.inputs[$file]['forward'] >> concatenated/forward.listed.${global_condition.inputs[$file]['forward'].ext}.listed && + printf "# ${file}_reverse\n" >> concatenated/reverse.listed.${global_condition.inputs[$file]['reverse'].ext}.listed && + $concat_command $global_condition.inputs[$file]['reverse'] >> concatenated/reverse.listed.${global_condition.inputs[$file]['reverse'].ext}.listed && + #end if #end for sleep 1 #end if #else if $global_condition.paired_cat_type == "by_pair": mkdir concatenated && #if $dataset_names == "No": - #for $file in $global_condition.inputs.keys() - $concat_command - $global_condition.inputs[$file]['forward'] - > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && - $concat_command - $global_condition.inputs[$file]['reverse'] - >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #for $file in $global_condition.inputs.keys(): + #if $global_condition.inputs[$file]['forward'].ext[-2:] == "gz": + gzip -dc $global_condition.inputs[$file]['forward'] | $concat_command | gzip -c + > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + gzip -dc $global_condition.inputs[$file]['reverse'] | $concat_command | gzip -c + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #else: + $concat_command $global_condition.inputs[$file]['forward'] + > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + $concat_command $global_condition.inputs[$file]['reverse'] + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #end if #end for sleep 1 #else: - #for $file in 
$global_condition.inputs.keys() - printf "# ${file}_forward\n" > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && - $concat_command - $global_condition.inputs[$file]['forward'] - >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && - printf "# ${file}_reverse\n" >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && - $concat_command - $global_condition.inputs[$file]['reverse'] - >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #for $file in $global_condition.inputs.keys(): + #if $global_condition.inputs[$file]['reverse'].ext[-2:] == "gz": + printf "# ${file}_forward\n" | gzip -c > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + gzip -dc $global_condition.inputs[$file]['forward'] | $concat_command | gzip -c + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + printf "# ${file}_reverse\n" | gzip -c >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + gzip -dc $global_condition.inputs[$file]['reverse'] | $concat_command | gzip -c + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #else: + printf "# ${file}_forward\n" > concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + $concat_command $global_condition.inputs[$file]['forward'] + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + printf "# ${file}_reverse\n" >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + $concat_command $global_condition.inputs[$file]['reverse'] + >> concatenated/'${file}.listed.${global_condition.inputs[$file]['reverse'].ext}.listed' && + #end if #end for sleep 1 #end if #else if $global_condition.paired_cat_type == "all": + mkdir concatenated && 
+ #set $base_name=$global_condition.inputs.element_identifier + #set $extention=$global_condition.inputs[$global_condition.inputs.keys()[0]]['forward'].ext #if $dataset_names == "No": - #for $file in $global_condition.inputs.keys() - $concat_command - $global_condition.inputs[$file]['forward'] - >> $out_file1 && - $concat_command - $global_condition.inputs[$file]['reverse'] - >> $out_file1 && + #for $file in $global_condition.inputs.keys(): + #if $global_condition.inputs[$file]['forward'].ext[-2:] == "gz": + gzip -dc $global_condition.inputs[$file]['forward'] | $concat_command | gzip -c >> '$paired_out_file' && + gzip -dc $global_condition.inputs[$file]['reverse'] | $concat_command | gzip -c >> '$paired_out_file' && + #else: + $concat_command + $global_condition.inputs[$file]['forward'] + >> '$paired_out_file' && + $concat_command + $global_condition.inputs[$file]['reverse'] + >> '$paired_out_file' && + #end if #end for sleep 1 #else: - #for $file in $global_condition.inputs.keys() - printf "# ${file}_forward\n" > $out_file1 && - $concat_command - $global_condition.inputs[$file]['forward'] - >> $out_file1 && - printf "# ${file}_reverse\n" >> $out_file1 && - $concat_command - $global_condition.inputs[$file]['reverse'] - >> $out_file1 && + #for $file in $global_condition.inputs.keys(): + #if $global_condition.inputs[$file]['forward'].ext[-2:] == "gz": + printf "# ${file}_forward\n" | gzip -c >> '$paired_out_file' && + gzip -dc $global_condition.inputs[$file]['forward'] | $concat_command | gzip -c >> '$paired_out_file' && + printf "# ${file}_reverse\n" | gzip -c >> '$paired_out_file' && + gzip -dc $global_condition.inputs[$file]['reverse'] | $concat_command | gzip -c >> '$paired_out_file' && + #else: + printf "# ${file}_forward\n" >> '$paired_out_file' && + $concat_command + $global_condition.inputs[$file]['forward'] + >> '$paired_out_file' && + printf "# ${file}_reverse\n" >> '$paired_out_file' && + $concat_command + $global_condition.inputs[$file]['reverse'] + >> 
'$paired_out_file' && + #end if #end for sleep 1 #end if #end if + #else if $global_condition.input_type == "nested_collection": + mkdir concatenated && + #if $dataset_names == "No": + #for $sub_list in $global_condition.inputs: + #set $file_base_name=$sub_list.element_identifier + #for $sub_list_element in $sub_list: + #if $sub_list_element.ext[-2:] == "gz": + gzip -dc ${sub_list_element} | $concat_command | gzip -c >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + #else: + $concat_command ${sub_list_element} >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + #end if + #end for + #end for + sleep 1 + #else: + #for $sub_list in $global_condition.inputs: + #set $file_base_name=$sub_list.element_identifier + #for $sub_list_element in $sub_list: + #if $sub_list_element.ext[-2:] == "gz": + printf "# ${sub_list_element.element_identifier}\n" | gzip -c >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + gzip -dc ${sub_list_element} | $concat_command | gzip -c >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + #else: + printf "# ${sub_list_element.element_identifier}\n" >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + $concat_command ${sub_list_element} >> concatenated/'${file_base_name}.listed.${sub_list_element.ext}.listed' && + #end if + #end for + #end for + sleep 1 + #end if #end if ]]> @@ -132,8 +199,9 @@ - + + @@ -147,15 +215,11 @@ - - - - - - - - - + + + + + @@ -163,16 +227,18 @@ - global_condition['input_type'] == 'singles' or (global_condition['input_type'] == 'paired_collection' and global_condition['paired_cat_type'] == 'all') + global_condition['input_type'] == 'singles' + + + global_condition['input_type'] == 'paired_collection' and global_condition['paired_cat_type'] == 'all' - - + global_condition['input_type'] == 'paired_collection' and global_condition['paired_cat_type'] == 'by_strand' - - 
(global_condition['input_type'] == 'paired_collection' and global_condition['paired_cat_type'] == 'by_pair') or (global_condition['input_type'] == 'simple_collections' and global_condition['collections_condition']['collection_cat_type'] == 'two_collections') + + (global_condition['input_type'] == 'paired_collection' and global_condition['paired_cat_type'] == 'by_pair') or (global_condition['input_type'] == 'simple_collections') or (global_condition['input_type'] == 'nested_collection') @@ -322,6 +388,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -329,9 +437,9 @@ **WARNING:** This tool does not check if the datasets being concatenated are in the same format. -**WARNING:** The paired collection operations do not handle gziped files. +**WARNING:** When concatenating 2 collections make sure the first collection is the one with the most items. -**WARNING:** When concatenating 2 collections make sure the first collection is the one with the most items. +**WARNING:** This tool can't handle nested collection deeper than list:list. 
----- @@ -349,6 +457,8 @@ - When the inputs are 2 collections: datasets are concatenated in a pairwise combination and a single dataset collection is returned + - When the input is a nested collection: datasets in each sub-collection are concatenated and a simple dataset collection is returned + - Skipping lines before concatenation to avoid headers - Add the name of the concatenated files as separator @@ -456,6 +566,32 @@ ----- +**Nested collection concatenation example** + +Nested collection: + + - Experiment + + - Sample_1 + + - Sample_1_file_1 + - Sample_1_file_2 + + - Sample_2 + + - Sample_2_file_1 + - Sample_2_file_2 + - Sample_2_file_3 + +Concatenation result:: + + A single collection containing: + + - Sample_1: (Sample_1_file_1 + Sample_1_file_2) + - Sample_2: (Sample_2_file_1 + Sample_2_file_2 + Sample_2_file_3) + +----- + **When selecting "Include dataset names" when concatenating files**: 1rst file name="first_tabular"::