diff hubArchiveCreator.py @ 1:fb5e60d4d18a draft

planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
author rmarenco
date Wed, 13 Jul 2016 13:36:37 -0400
parents
children fcff8e9146e7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hubArchiveCreator.py	Wed Jul 13 13:36:37 2016 -0400
@@ -0,0 +1,193 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This Galaxy tool permits to prepare your files to be ready for
+Assembly Hub visualization.
+Program test arguments:
+hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
+"""
+
+import argparse
+import collections
+import json
+import sys
+
+# Internal dependencies
+from TrackHub import TrackHub
+from Gff3 import Gff3
+from Bam import Bam
+from BedSimpleRepeats import BedSimpleRepeats
+from Bed import Bed
+from BigWig import BigWig
+from Gtf import Gtf
+
+
+# TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort
+
+
def main(argv):
    """
    Entry point: parse the command line, build one Datatype object per input
    file, register every track in a TrackHub, then emit the zipped archive and
    the HTML summary.

    :param argv: list[string] -- full argv vector (argv[0] is the program name)
    """
    # Command Line parsing init
    parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')

    # Reference genome mandatory
    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')

    # One repeatable option per supported datatype (tool form order)
    parser.add_argument('--gff3', action='append', help='GFF3 format')
    parser.add_argument('--gtf', action='append', help='GTF format')
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')
    parser.add_argument('--bed', action='append', help='Bed generic format')
    parser.add_argument('--bigwig', action='append', help='BigWig format')
    parser.add_argument('--bam', action='append', help='Bam format')

    # TODO: Check if the running directory can have issues if we run the tool outside
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')
    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    ucsc_tools_path = ''
    toolDirectory = '.'
    extra_files_path = '.'

    # Parse the argv handed to main() instead of silently re-reading sys.argv
    # (argv[0] is the program name, so it is stripped).
    args = parser.parse_args(argv[1:])

    input_fasta_file = args.fasta

    array_inputs_gff3 = args.gff3
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bed_generic = args.bed
    array_inputs_gtf = args.gtf
    array_inputs_bam = args.bam
    array_inputs_bigwig = args.bigwig

    outputFile = args.output

    # Parse the inputs metadata JSON once (the original code duplicated this
    # whole parse + sanitize sequence).
    inputs_data = json.loads(args.data_json)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    if args.directory:
        toolDirectory = args.directory
    if args.extra_files_path:
        extra_files_path = args.extra_files_path
    if args.ucsc_tools_path:
        ucsc_tools_path = args.ucsc_tools_path

    # TODO: Check here all the binaries / tools we need. Exception is missing

    # Create the Track Hub folder
    trackHub = TrackHub(input_fasta_file, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    # (Datatype class, inputs) pairs, processed in tool-form order.
    datatype_inputs = [
        (Gff3, array_inputs_gff3),                           # Augustus
        (BedSimpleRepeats, array_inputs_bed_simple_repeats), # Tandem Repeats Finder / TrfBig
        (Bed, array_inputs_bed_generic),                     # tBlastN or TopHat
        (Gtf, array_inputs_gtf),                             # TopHat
        (Bam, array_inputs_bam),                             # TopHat
        (BigWig, array_inputs_bigwig),                       # from Bam
    ]
    for extension_class, array_inputs in datatype_inputs:
        if array_inputs:
            create_ordered_datatype_objects(extension_class, array_inputs, inputs_data,
                                            input_fasta_file, extra_files_path,
                                            all_datatype_dictionary, toolDirectory)

    # Sort by "order_index" so the tracks are added in the tool form order.
    # NOTE: building an OrderedDict straight from an unordered dict (as the
    # original did) preserves the dict's arbitrary order, not the key order.
    all_datatype_ordered_dictionary = collections.OrderedDict(
        sorted(all_datatype_dictionary.items()))

    # .items() (instead of Py2-only .iteritems()) keeps this loop working on
    # both Python 2 and Python 3.
    for index, datatypeObject in all_datatype_ordered_dictionary.items():
        trackHub.addTrack(datatypeObject.track.trackDb)

    # We process all the modifications to create the zip file
    trackHub.createZip()

    # We terminate the process and so create a HTML file summarizing all the files
    trackHub.terminate()

    sys.exit(0)
+
+
def sanitize_name_inputs(inputs_data):
    """
    Replace every space with an underscore in the "name" field of each entry,
    mutating inputs_data in place. Outputs from Galaxy, or plain user file
    names, sometimes contain spaces.

    :param inputs_data: dict[string, dict[string, string]]
    :return: None (in-place mutation)
    """
    for entry in inputs_data.values():
        entry["name"] = entry["name"].replace(" ", "_")
+
+
def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data, input_fasta_file,
                                    extra_files_path, all_datatype_dictionary, tool_directory):
    """
    Function which executes the creation of all the necessary files / folders
    for a special Datatype, for TrackHub, and updates the dictionary of
    datatypes keyed by the input's "order_index".

    :param ExtensionClass: T <= Datatype -- class instantiated once per matched input
    :param array_inputs: list[string] -- Galaxy "false paths" of the inputs
    :param inputs_data: dict[string, dict] -- metadata keyed by false path
    :param input_fasta_file: string
    :param extra_files_path: string
    :param all_datatype_dictionary: dict -- updated in place with the new objects
    :param tool_directory: string
    """

    datatype_dictionary = {}

    # inputs_data is keyed by the false path, so a direct membership test and
    # lookup replaces the original O(len(array_inputs) * len(inputs_data))
    # double loop that compared every key against every input.
    for input_false_path in array_inputs:
        if input_false_path in inputs_data:
            data_value = inputs_data[input_false_path]
            extensionObject = ExtensionClass(input_false_path, data_value,
                                             input_fasta_file, extra_files_path, tool_directory)
            datatype_dictionary[data_value["order_index"]] = extensionObject
    all_datatype_dictionary.update(datatype_dictionary)
+
+if __name__ == "__main__":
+    main(sys.argv)