Mercurial > repos > yating-l > jbrowsearchivecreator
changeset 0:804a93e87cc8 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author | yating-l |
---|---|
date | Wed, 12 Apr 2017 17:41:55 -0400 |
parents | |
children | ac83821b0e06 |
files | TrackHub.py bedToGff3.py blastxmlToGff3.py jbrowse_hub.py jbrowse_hub.xml tool_dependencies.xml trackObject.py utils.py |
diffstat | 8 files changed, 1231 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TrackHub.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,192 @@ +#!/usr/bin/env python + +import os +import subprocess +import shutil +import json +import utils + + +class TrackHub: + def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path, metaData, jbrowse_host): + self.input_files = inputFiles.tracks + self.outfile = outputDirect + self.outfolder = extra_files_path + self.out_path = os.path.join(extra_files_path, genome) + self.reference = reference + self.tool_dir = tool_dir + self.metaData = metaData + self.raw = os.path.join(self.out_path, 'raw') + self.json = os.path.join(self.out_path, 'json') + self.jbrowse_host = jbrowse_host + try: + if os.path.exists(self.json): + shutil.rmtree(self.json) + os.makedirs(self.json) + except OSError as e: + print "Cannot create json folder error({0}): {1}".format(e.errno, e.strerror) + else: + print "Create jbrowse folder {}".format(self.out_path) + + def createHub(self): + self.prepareRefseq() + for input_file in self.input_files: + self.addTrack(input_file) + self.indexName() + slink = self.makeArchive() + self.outHtml(slink) + print "Success!\n" + + def prepareRefseq(self): + try: + #print os.path.join(self.tool_dir, 'prepare-refseqs.pl') + ", '--fasta', " + self.reference +", '--out', self.json])" + subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference, '--out', self.json]) + except OSError as e: + print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror) + #TODO: hard coded the bam and bigwig tracks. Need to allow users to customize the settings + def addTrack(self, track): + #print "false_path" , track['false_path'] + if track['false_path'] in self.metaData.keys(): + metadata = self.metaData[track['false_path']] + else: + metadata = {} + self.SetMetadata(track, metadata) + if track['dataType'] == 'bam': + self.Bam(track, metadata) + # print "add bam track\n" + elif track['dataType'] == 'bigwig': + self.BigWig(track, metadata) + else: + flat_file = os.path.join(self.raw, track['fileName']) + if track['dataType'] == 'bed': + subprocess.call(['flatfile-to-json.pl', '--bed', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json]) + elif track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf' or track['dataType'] == 'blastxml': + subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json]) + elif track['dataType'] == 'gff3_transcript': + subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"transcriptType": "transcript", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json]) + else: + subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json]) + + def indexName(self): + subprocess.call(['generate-names.pl', '-v', '--out', self.json]) + print "finished name index \n" + + def makeArchive(self): + shutil.make_archive(self.out_path, 'zip', self.out_path) + file_dir = os.path.abspath(self.outfile) + source_dir = os.path.dirname(file_dir) + folder_name = os.path.basename(self.outfolder) + source_name = os.path.basename(self.out_path) + source = os.path.join(source_dir, folder_name, source_name) + slink = source.replace('/', '_') + slink = os.path.join('/var/www/html/JBrowse-1.12.1/data', slink) + try: + if os.path.islink(slink): + os.unlink(slink) + except OSError as oserror: + print "Cannot create symlink to the data({0}): {1}".format(oserror.errno, oserror.strerror) + os.symlink(source, slink) + return slink + ''' + data_folder = '/gonramp/static/JBrowse-1.12.1/jbrowse_hub' + try: + if os.path.exists(data_folder): + if os.path.isdir(data_folder): + shutil.rmtree(data_folder) + else: + os.remove(data_folder) + except OSError as oserror: + print "Cannot create data folder({0}): {1}".format(oserror.errno, oserror.strerror) + shutil.copytree(self.out_path, data_folder) + subprocess.call(['chmod', '-R', 'o+rx', '/var/www/html/JBrowse-1.12.1/jbrowse_hub']) + shutil.rmtree(self.out_path) + ''' + + #TODO: this will list all zip files in the filedir and sub-dirs. worked in Galaxy but all list zip files in test-data when + #run it locally. May need modify + def outHtml(self, slink): + with open(self.outfile, 'w') as htmlfile: + htmlstr = 'The JBrowse Hub is created: <br>' + zipfiles = '<li><a href = "%s">Download</a></li>' + url = self.jbrowse_host + "/JBrowse-1.12.1/index.html?data=%s" + jbrowse_hub = '<li><a href = "%s" target="_blank">View JBrowse Hub</a></li>' % url + filedir_abs = os.path.abspath(self.outfile) + filedir = os.path.dirname(filedir_abs) + filedir = os.path.join(filedir, self.outfolder) + for root, dirs, files in os.walk(filedir): + for file in files: + if file.endswith('.zip'): + relative_directory = os.path.relpath(root, filedir) + relative_file_path = os.path.join(relative_directory, file) + htmlstr += zipfiles % relative_file_path + link_name = os.path.basename(slink) + relative_path = os.path.join('data', link_name + '/json') + htmlstr += jbrowse_hub % relative_path + htmlfile.write(htmlstr) + + def createTrackList(self): + trackList = os.path.join(self.json, "trackList.json") + if not os.path.exists(trackList): + os.mknod(trackList) + + def Bam(self, track, metadata): + #create trackList.json if not exist + self.createTrackList() + json_file = os.path.join(self.json, "trackList.json") + bam_track = dict() + bam_track['type'] = 'JBrowse/View/Track/Alignments2' + bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM' + bam_track['urlTemplate'] = os.path.join('../raw', track['fileName']) + bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index']) + bam_track['label'] = metadata['label'] + bam_track['category'] = metadata['category'] + bam_track = json.dumps(bam_track) + #Use add-track-json.pl to add bam track to json file + new_track = subprocess.Popen(['echo', bam_track], stdout=subprocess.PIPE) + subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout) + + def BigWig(self, track, metadata): + #create trackList.json if not exist + self.createTrackList() + json_file = os.path.join(self.json, "trackList.json") + bigwig_track = dict() + bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName']) + bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot' + bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig' + bigwig_track['label'] = metadata['label'] + bigwig_track['style'] = metadata['style'] + bigwig_track['category'] = metadata['category'] + bigwig_track = json.dumps(bigwig_track) + #Use add-track-json.pl to add bigwig track to json file + new_track = subprocess.Popen(['echo', bigwig_track], stdout=subprocess.PIPE) + #output = new_track.communicate()[0] + subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout) + + #If the metadata is not set, use the default value + def SetMetadata(self, track, metadata): + if 'label' not in metadata.keys() or metadata['label'] == '': + metadata['label'] = track['fileName'] + if 'color' not in metadata.keys() or metadata['color'] == '': + metadata['color'] = "#daa520" + if track['dataType'] == 'bigwig': + if 'style' not in metadata.keys(): + metadata['style'] = {} + if 'pos_color' not in metadata['style'] or metadata['style']['pos_color'] == '': + metadata['style']['pos_color'] = "#FFA600" + if 'neg_color' not in metadata['style'] or metadata['style']['neg_color'] == '': + metadata['style']['neg_color'] = "#005EFF" + if 'category' not in metadata.keys() or metadata['category'] == '': + metadata['category'] = "Default group" + if track['dataType'] == 'blastxml': + metadata['type'] = "G-OnRamp_plugin/BlastAlignment" + elif track['dataType'] == 'gff3_transcript' or track['dataType'] == 'gff3_mrna': + metadata['type'] = "G-OnRamp_plugin/GenePred" + else: + metadata['type'] = "CanvasFeatures" + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bedToGff3.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +''' +Convert BED format to gff3 +''' +import os +from collections import OrderedDict +import utils + +class bedToGff3(): + def __init__(self, inputBedFile, chrom_sizes, bed_type, output): + self.input = inputBedFile + #file_dir = os.path.basename(inputBedFile) + #print file_dir + "\n\n" + self.output = output + self.chrom_sizes = chrom_sizes + self.type = bed_type + if self.type == "trfbig": + self.trfbig_to_gff3() + if self.type == "regtools": + self.splicejunctions_to_gff3() + + def trfbig_to_gff3(self): + gff3 = open(self.output, 'w') + gff3.write("##gff-version 3\n") + sizes_dict = utils.sequence_region(self.chrom_sizes) + seq_regions = dict() + with open(self.input, 'r') as bed: + for line in bed: + field = OrderedDict() + attribute = OrderedDict() + li = line.rstrip().split("\t") + field['seqid'] = li[0] + if field['seqid'] not in seq_regions: + end_region = sizes_dict[field['seqid']] + gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') + seq_regions[field['seqid']] = end_region + field['source'] = li[3] + field['type'] = 'tandem_repeat' + # The first base in a chromosome is numbered 0 in BED format + field['start'] = str(int(li[1]) + 1) + field['end'] = li[2] + field['score'] = li[9] + field['strand'] = '+' + field['phase'] = '.' + attribute['length of repeat unit'] = li[4] + attribute['mean number of copies of repeat'] = li[5] + attribute['length of consensus sequence'] = li[6] + attribute['percentage match'] = li[7] + attribute['percentage indel'] = li[8] + attribute['percent of a\'s in repeat unit'] = li[10] + attribute['percent of c\'s in repeat unit'] = li[11] + attribute['percent of g\'s in repeat unit'] = li[12] + attribute['percent of t\'s in repeat unit'] = li[13] + attribute['entropy'] = li[14] + attribute['sequence of repeat unit element'] = li[15] + utils.write_features(field, attribute, gff3) + gff3.close() + + + def splicejunctions_to_gff3(self): + gff3 = open(self.output, 'w') + gff3.write("##gff-version 3\n") + sizes_dict = utils.sequence_region(self.chrom_sizes) + seq_regions = dict() + with open(self.input, 'r') as bed: + for line in bed: + field = OrderedDict() + attribute = OrderedDict() + li = line.rstrip().split("\t") + field['seqid'] = li[0] + if field['seqid'] not in seq_regions: + end_region = sizes_dict[field['seqid']] + gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') + seq_regions[field['seqid']] = end_region + field['source'] = li[3] + field['type'] = 'junction' + # The first base in a chromosome is numbered 0 in BED format + field['start'] = int(li[1]) + 1 + field['end'] = li[2] + field['score'] = li[12] + field['strand'] = li[5] + field['phase'] = '.' + attribute['ID'] = li[3] + attribute['Name'] = li[3] + attribute['blockcount'] = li[9] + attribute['blocksizes'] = li[10] + attribute['chromstarts'] = li[11] + utils.write_features(field, attribute, gff3) + utils.child_blocks(field, attribute, gff3) + gff3.close() + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blastxmlToGff3.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,159 @@ +#!/usr/bin/env python + + +from Bio.Blast import NCBIXML +from collections import OrderedDict +import utils + + +def align2cigar(hsp_query, hsp_reference): + """ + Build CIGAR representation from an hsp_query + input: + hsp_query + hsp_sbjct + output: + CIGAR string + """ + query = hsp_query + ref = hsp_reference + # preType, curType: + # 'M' represents match, + # 'I' represents insert a gap into the reference sequence, + # 'D' represents insert a gap into the target (delete from reference) + # some ideas of this algin2cigar function are coming from + # https://gist.github.com/ozagordi/099bdb796507da8d9426 + prevType = 'M' + curType = 'M' + count = 0 + cigar = [] + num = len(query) + for i in range(num): + if query[i] == '-': + curType = 'D' + elif ref[i] == '-': + curType = 'I' + else: + curType = 'M' + if curType == prevType: + count += 1 + else: + cigar.append('%s%d' % (prevType, count)) + prevType = curType + count = 1 + cigar.append('%s%d' % (curType, count)) + return ' '.join(cigar) + +def gff3_writer(blast_records, gff3_file): + gff3 = open(gff3_file, 'a') + gff3.write("##gff-version 3\n") + seq_regions = dict() + for blast_record in blast_records: + query_name = blast_record.query.split(" ")[0] + source = blast_record.application + method = blast_record.matrix + for alignment in blast_record.alignments: + group = { + "parent_field" : OrderedDict(), + "parent_attribute" : OrderedDict(), + "alignments" : [] + } + title = alignment.title.split(" ") + contig_name = title[len(title) - 1] + length = alignment.length + group['parent_field']['seqid'] = contig_name + group['parent_field']['source'] = source + group['parent_field']['type'] = 'match' + group['parent_attribute']['ID'] = contig_name + '_' + query_name + group['parent_attribute']['method'] = method + group['parent_attribute']['length'] = length + if contig_name not in seq_regions: + gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n') + seq_regions[contig_name] = length + match_num = 0 + coords = [length, 0] + for hsp in alignment.hsps: + hsp_align = {} + field = OrderedDict() + attribute = OrderedDict() + ref = hsp.sbjct + query = hsp.query + field['seqid'] = contig_name + field['source'] = source + field['type'] = 'match_part' + + field['start'] = hsp.sbjct_start + if field['start'] < coords[0]: + coords[0] = field['start'] + ref_length = len(ref.replace('-', '')) + # if run tblastn, the actual length of reference should be multiplied by 3 + if source.lower() == "tblastn": + ref_length *= 3 + field['end'] = field['start'] + ref_length - 1 + if field['end'] > coords[1]: + coords[1] = field['end'] + field['score'] = hsp.score + #decide if the alignment in the same strand or reverse strand + #reading frame + # (+, +), (0, 0), (-, -) => + + # (+, -), (-, +) => - + if hsp.frame[1] * hsp.frame[0] > 0: + field['strand'] = '+' + elif hsp.frame[1] * hsp.frame[0] < 0: + field['strand'] = '-' + else: + if hsp.frame[0] + hsp.frame[1] >= 0: + field['strand'] = '+' + else: + field['strand'] = '-' + field['phase'] = '.' + + target_start = hsp.query_start + target_len = len(query.replace('-', '')) + # if run blastx, the actual length of query should be multiplied by 3 + if source.lower() == "blastx": + target_len *= 3 + target_end = target_start + target_len -1 + attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num) + attribute['Parent'] = group['parent_attribute']['ID'] + attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) + attribute['Gap'] = align2cigar(query, ref) + #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin + attribute['subject'] = hsp.sbjct + attribute['query'] = hsp.query + attribute['match'] = hsp.match + attribute['gaps'] = attribute['match'].count(' ') + similar = attribute['match'].count('+') + attribute['identities'] = len(attribute['match']) - similar - attribute['gaps'] + attribute['positives'] = attribute['identities'] + similar + attribute['expect'] = hsp.expect + # show reading frame attribute only if the frame is not (0, 0) + attribute['frame'] = hsp.frame[1] + match_num += 1 + hsp_align['field'] = field + hsp_align['attribute'] = attribute + group['alignments'].append(hsp_align) + group['parent_field']['start'] = coords[0] + group['parent_field']['end'] = coords[1] + group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.' + group['parent_attribute']['match_num'] = match_num + group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end'])) + utils.write_features(group['parent_field'], group['parent_attribute'], gff3) + prev_end = -1 + for align in group['alignments']: + overlap = '' + if align['field']['start'] <= prev_end: + overlap += str(align['field']['start']) + ',' + str(prev_end) + prev_end = align['field']['end'] + align['attribute']['overlap'] = overlap + utils.write_features(align['field'], align['attribute'], gff3) + gff3.close() + +def blastxml2gff3(xml_file, gff3_file): + result_handle = open(xml_file) + blast_records = NCBIXML.parse(result_handle) + gff3_writer(blast_records, gff3_file) + +if __name__ == "__main__": + blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt") +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jbrowse_hub.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,171 @@ +#!/usr/bin/env python + +import sys +import argparse +import json +import utils +import trackObject +import TrackHub + + + +def main(argv): + parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.') + + # Reference genome mandatory + parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)') + + # Genome name + parser.add_argument('-g', '--genome_name', help='Name of reference genome') + + # Output folder + parser.add_argument('-o', '--out', help='output html') + + # Output folder + parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder') + + #Tool Directory + parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools') + + #GFF3 + parser.add_argument('--gff3', action='append', help='GFF3 format') + + # GFF3 structure: gene->transcription->CDS + parser.add_argument('--gff3_transcript', action='append', help='GFF3 format for gene prediction, structure: gene->transcription->CDS') + + # GFF3 structure: gene->mRNA->CDS + parser.add_argument('--gff3_mrna', action='append', help='GFF3 format for gene prediction, structure: gene->mRNA->CDS') + + # generic BED + parser.add_argument('--bed', action='append', help='BED format') + + # trfBig simple repeats (BED 4+12) + parser.add_argument('--bedSimpleRepeats', action='append', help='BED 4+12 format, using simpleRepeats.as') + + # regtools (BED 12+1) + parser.add_argument('--bedSpliceJunctions', action='append', help='BED 12+1 format, using spliceJunctions.as') + + # tblastn alignment (blastxml) + parser.add_argument('--blastxml', action='append', help='blastxml format from tblastn') + + # BAM format + parser.add_argument('--bam', action='append', help='BAM format from HISAT') + + # BIGWIG format + parser.add_argument('--bigwig', action='append', help='BIGWIG format to show rnaseq coverage') + + # GTF format + parser.add_argument('--gtf', action='append', help='GTF format from StringTie') + + # Metadata json format + parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs') + + #JBrowse host + parser.add_argument('--jbrowse_host', help="JBrowse Host") + + args = parser.parse_args() + all_datatype_dictionary = dict() + + + if not args.fasta: + parser.print_help() + raise RuntimeError("No reference genome\n") + reference = args.fasta + genome = 'unknown' + out_path = 'unknown.html' + extra_files_path = '.' + tool_directory = '.' + jbrowse_host = '' + if args.jbrowse_host: + jbrowse_host = args.jbrowse_host + if args.genome_name: + genome = args.genome_name + if args.out: + out_path = args.out + if args.extra_files_path: + extra_files_path = args.extra_files_path + + #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies + if args.tool_directory: + tool_directory = args.tool_directory + + #Calculate chromsome sizes using genome reference and uscs tools + chrom_size = utils.getChromSizes(reference, tool_directory) + + #get metadata from json file + json_inputs_data = args.data_json + if json_inputs_data: + inputs_data = json.loads(json_inputs_data) + else: + inputs_data = {} + + #print inputs_data + + #Initate trackObject + all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path) + + array_inputs_bam = args.bam + array_inputs_bed = args.bed + array_inputs_bed_simple_repeats = args.bedSimpleRepeats + array_inputs_bed_splice_junctions = args.bedSpliceJunctions + array_inputs_bigwig = args.bigwig + array_inputs_gff3 = args.gff3 + array_inputs_gff3_transcript = args.gff3_transcript + array_inputs_gff3_mrna = args.gff3_mrna + array_inputs_gtf = args.gtf + array_inputs_blastxml = args.blastxml + + if array_inputs_bam: + all_datatype_dictionary['bam'] = array_inputs_bam + if array_inputs_bed: + all_datatype_dictionary['bed'] = array_inputs_bed + if array_inputs_bed_simple_repeats: + all_datatype_dictionary['bedSimpleRepeats'] = array_inputs_bed_simple_repeats + if array_inputs_bed_splice_junctions: + all_datatype_dictionary['bedSpliceJunctions'] = array_inputs_bed_splice_junctions + if array_inputs_bigwig: + all_datatype_dictionary['bigwig'] = array_inputs_bigwig + if array_inputs_gff3: + all_datatype_dictionary['gff3'] = array_inputs_gff3 + if array_inputs_gff3_transcript: + all_datatype_dictionary['gff3_transcript'] = array_inputs_gff3_transcript + if array_inputs_gff3_mrna: + all_datatype_dictionary['gff3_mrna'] = array_inputs_gff3_mrna + if array_inputs_gtf: + all_datatype_dictionary['gtf'] = array_inputs_gtf + if array_inputs_blastxml: + all_datatype_dictionary['blastxml'] = array_inputs_blastxml + + print "input tracks: \n", all_datatype_dictionary + + for datatype, inputfiles in all_datatype_dictionary.items(): + try: + if not inputfiles: + raise ValueError('empty input, must provide track files!\n') + except IOError: + print 'Cannot open', datatype + else: + for f in inputfiles: + #metadata = {} + #print f + #if f in inputs_data.keys(): + # metadata = inputs_data[f] + #print metadata + #Convert tracks into gff3 format + all_tracks.addToRaw(f, datatype) + + jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path, inputs_data, jbrowse_host) + jbrowseHub.createHub() + +""" +def extractMetadata(array_inputs, inputs_data): + metadata_dict = {} + for input_false_path in array_inputs: + for key, data_value in inputs_data.items(): + if key == input_false_path: + metadata_dict[input_false_path] +""" + +if __name__ == "__main__": + main(sys.argv) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jbrowse_hub.xml Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,293 @@ +<tool id="jbrowse_hub" name="JBrowse Hub Creator" version="1.0"> + <description> + This Galaxy tool is used to prepare your files to be ready for displaying on JBrowse + </description> + + <requirements> + <requirement type="package" version="1.2">samtools</requirement> + <requirement type="package" version="1.9">numpy</requirement> + <requirement type="package" version="1.68">biopython</requirement> + <requirement type="package" version="1.0">ucsc_tools_340</requirement> + <requirement type="package" version="1.12.1">jbrowse_tools</requirement> + </requirements> + + <stdio> + </stdio> + + <command detect_errors="exit_code"><![CDATA[ + python $__tool_directory__/jbrowse_hub.py + --fasta '$reference' + --genome_name '$genome_name' + + #set galaxy_url = str($GALAXY_URL) + #set $jbrowse_url = galaxy_url.replace("8080", "80") + --jbrowse_host '$jbrowse_url' + + ## json metadata recording from Remi's hub-archive-creator.xml + #import json + #set global data_parameter_dict = {} + + ## Function to retrieve the data of the inputs + #def prepare_json($input_to_prepare, $extra_data_dict={}) + #set false_path = str($input_to_prepare) + #set name = $input_to_prepare.name + + #set data_dict = {"name": $name} + #silent data_dict.update($extra_data_dict) + + #silent $data_parameter_dict.update({$false_path: $data_dict}) + + #end def + + #for $g in $group + #for $f in $g.format + #set track_label = str($f.formatChoice.label) + #set group_name = str($g.group_name) + #set extra_data_dict = {"label" : $track_label, "category" : $group_name} + #if $f.formatChoice.format_select == 'bed' + #set track_color = str($f.formatChoice.track_color) + #silent extra_data_dict.update({"color" : $track_color}) + #if $f.formatChoice.bedChoice.bed_select == 'bed_generic_option' + --bed $f.formatChoice.bedChoice.BED_generic + #silent $prepare_json($f.formatChoice.bedChoice.BED_generic, extra_data_dict) + #elif $f.formatChoice.bedChoice.bed_select == 'bed_simple_repeats_option' + --bedSimpleRepeats $f.formatChoice.bedChoice.BED_simple_repeats + #silent $prepare_json($f.formatChoice.bedChoice.BED_simple_repeats, extra_data_dict) + #elif $f.formatChoice.bedChoice.bed_select == 'bed_splice_junctions_option' + --bedSpliceJunctions $f.formatChoice.bedChoice.BED_splice_junctions + #silent $prepare_json($f.formatChoice.bedChoice.BED_splice_junctions, extra_data_dict) + #end if + #end if + #if $f.formatChoice.format_select == 'bam' + --bam $f.formatChoice.BAM + #silent $prepare_json($f.formatChoice.BAM, extra_data_dict) + #end if + #if $f.formatChoice.format_select == 'gff3' + #set track_color = str($f.formatChoice.track_color) + #silent extra_data_dict.update({"color" : $track_color}) + #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_generic' + --gff3 $f.formatChoice.gff3Choice.GFF3_generic + #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_generic, extra_data_dict) + #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript' + --gff3_transcript $f.formatChoice.gff3Choice.GFF3_transcript + #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_transcript, extra_data_dict) + #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna' + --gff3_mrna $f.formatChoice.gff3Choice.GFF3_mrna + #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_mrna, extra_data_dict) + #end if + #end if + #if $f.formatChoice.format_select == 'blastxml' + --blastxml $f.formatChoice.BlastXML + #silent $prepare_json($f.formatChoice.BlastXML, extra_data_dict) + #end if + #if $f.formatChoice.format_select == 'gtf' + --gtf $f.formatChoice.GTF + #set track_color = str($f.formatChoice.track_color) + #silent extra_data_dict.update({"color" : $track_color}) + #silent $prepare_json($f.formatChoice.GTF, extra_data_dict) + #end if + #if $f.formatChoice.format_select == 'bigwig' + --bigwig $f.formatChoice.BIGWIG + #set pos_color = str($f.formatChoice.pos_color) + #set neg_color = str($f.formatChoice.neg_color) + #silent $extra_data_dict.update({"style" : {"pos_color" : $pos_color, "neg_color" : $neg_color}}) + #silent $prepare_json($f.formatChoice.BIGWIG, extra_data_dict) + #end if + #end for + #end for + + #set all_data_json = json.dumps($data_parameter_dict) + -j '$all_data_json' + -e '$output.extra_files_path' + -o '$output' + + ]]></command> + + <inputs> + <param name="GALAXY_URL" type="baseurl" value="" /> + <param name="reference" type="data" format="fasta" label="Reference Genome" /> + <param name="genome_name" type="text" size="30" value="unknown" label="Genome name" /> + <repeat name="group" title="New group"> + <param type="text" name="group_name" label="Group name" value="Default group"/> + <repeat name="format" title="New track"> + <conditional name="formatChoice"> + <param name="format_select" type="select" label="Format"> + <option value="bam" selected="true">BAM</option> + <option value="bed">BED</option> + <option value="blastxml">BLASTXML</option> + <option value="bigwig">BIGWIG</option> + <option value="gff3">GFF3</option> + <option value="gtf">GTF</option> + </param> + + <when value="bam"> + <param + format="bam" + name="BAM" + type="data" + label="BAM File" + /> + <param name="label" type="text" size="30" value = "Sequence Alignment" label="Track name" /> + </when> + <when value="bed"> + <conditional name="bedChoice"> + <param name="bed_select" type="select" label="Bed Choice"> + <option value="bed_generic_option">BED format</option> + <option value="bed_simple_repeats_option">BED Simple repeat (bed4+12 / simpleRepeat.as)</option> + <option value="bed_splice_junctions_option">BED Splice junctions (bed12+1 / spliceJunctions.as)</option> + </param> + <when value="bed_generic_option"> + <param + format="bed" + name="BED_generic" + type="data" + label="Bed File" + /> + </when> + <when value="bed_simple_repeats_option"> + <param + format="bed" + name="BED_simple_repeats" + type="data" + label="Bed Simple Repeats (Bed4+12) File" + /> + </when> + <when value="bed_splice_junctions_option"> + <param + format="bed" + name="BED_splice_junctions" + type="data" + label="Bed Splice Junctions (Bed12+1) File" + /> + </when> + </conditional> + <param name="label" type="text" size="30" value="BED file" label="Track name" /> + <param name="track_color" type="color" label="Track color" value="#daa520"> + <sanitizer> + <valid initial="string.letters,string.digits"> + <add value="#"/> + </valid> + </sanitizer> + </param> + </when> + <when value="blastxml"> + <param + format="blastxml" + name="BlastXML" + type="data" + label="Blast Alignments File" + /> + <param name="label" type="text" size="30" value="Blast Alignment" label="Track name" /> + </when> + <when value="bigwig"> + <param + format="bigwig" + name="BIGWIG" + type="data" + label="BIGWIG File" + /> + <param name="label" type="text" size="30" value="Sequence Coverage" label="Track name" /> + <param name="pos_color" type="color" label="Positive Coverage Color" value="#FFA600"> + <sanitizer> + <valid initial="string.letters,string.digits"> + <add value="#"/> + </valid> + </sanitizer> + </param> + <param name="neg_color" type="color" label="Negative Coverage Color" value="#005EFF"> + <sanitizer> + <valid initial="string.letters,string.digits"> + <add value="#"/> + </valid> + </sanitizer> + </param> + </when> + <when value="gff3"> + <conditional name="gff3Choice"> + <param name="gff3_select" type="select" label="gff3 type"> + <option value="gff3_generic">GFF3 format</option> + <option value="gff3_transcript">GFF3 format output from gene prediction tools (e.g. Augustus), structure: gene->transcription->CDS</option> + <option value="gff3_mrna">GFF3 format output from gene prediction tools (e.g. SNAP), structure: gene->mRNA->CDS</option> + </param> + <when value="gff3_generic"> + <param + format="gff3" + name="GFF3_generic" + type="data" + label="GFF3 File" + /> + </when> + <when value="gff3_transcript"> + <param + format="gff3" + name="GFF3_transcript" + type="data" + label="GFF3 File from gene prediction" + /> + </when> + <when value="gff3_mrna"> + <param + format="gff3" + name="GFF3_mrna" + type="data" + label="GFF3 File from gene prediction" + /> + </when> + </conditional> + <param name="label" type="text" size="30" value="Gene Prediction" label="Track name" /> + <param name="track_color" type="color" label="Track color" value="#daa520"> + <sanitizer> + <valid initial="string.letters,string.digits"> + <add value="#"/> + </valid> + </sanitizer> + </param> + </when> + <when value="gtf"> + <param + format="gtf" + name="GTF" + type="data" + label="GTF File" + /> + <param name="label" type="text" size="30" value="Assembled Transcripts" label="Track name" /> + <param name="track_color" type="color" label="Track color" value="#daa520"> + <sanitizer> + <valid initial="string.letters,string.digits"> + <add value="#"/> + </valid> + </sanitizer> + </param> + </when> + </conditional> + </repeat> + </repeat> + </inputs> + + <outputs> + <data format="html" name="output" label="${tool.name}" /> + </outputs> + <tests> + <test> + <param name="reference" value="dbia3/raw/dbia3.fa" /> + <param name="output" value="hubtest" /> + <repeat name="group"> + <param name="group_name" value="Default group"/> + <repeat name="format"> + <conditional name="formatChoice"> + <param name="format_select" value="gff3_transcript"/> + <param name="GFF3_transcript" value="dbia3/raw/Augustus_on_data_3__GTF_GFF.gff3" /> + </conditional> + </repeat> + </repeat> + <output name="output" file="hubtest" > + </output> + </test> + </tests> + <help> + This Galaxy tool will create a tar file which including raw datasets and json datasets that can be used for + JBrowse visualization. + </help> + <citations> + </citations> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,94 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="samtools" version="1.2"> + <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="numpy" version="1.9"> + <repository changeset_revision="83d12e13dbbd" name="package_numpy_1_9" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="biopython" version="1.68"> + <readme> +This package is based on package_biopython_1_67 owned by biopython. +https://toolshed.g2.bx.psu.edu/repository?user_id=fd5c6d0f82f315d8 + +This Galaxy Tool Shed package installs Biopython from source, having +first installed NumPy which is a build time depencency. This requires +and assumes a standard C compiler is already installed, along with +the Python header files. + +Development of this dependency definition is being done here on GitHub: +https://github.com/biopython/galaxy_packages + +The PYTHONPATH for biopython can be accessed through PYTHONPATH_BIOPYTHON. + </readme> + <install version="1.0"> + <actions> + <action type="download_by_url">http://biopython.org/DIST/biopython-1.68.tar.gz</action> + <action type="set_environment_for_install"> + <repository changeset_revision="83d12e13dbbd" name="package_numpy_1_9" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="numpy" version="1.9" /> + </repository> + </action> + <action type="make_directory">$INSTALL_DIR/lib/python</action> + <action type="shell_command"> + export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && + export PATH=$PATH:$PATH_NUMPY && + export PYTHONPATH=$PYTHONPATH:$PYTHONPATH_NUMPY && + python setup.py install --install-lib $INSTALL_DIR/lib/python + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable> + <environment_variable action="append_to" name="PYTHONPATH">$ENV[PYTHONPATH_NUMPY]</environment_variable> + <environment_variable action="prepend_to" name="PATH">$ENV[PATH_NUMPY]</environment_variable> + <environment_variable action="set_to" name="PYTHONPATH_BIOPYTHON">$INSTALL_DIR/lib/python</environment_variable> + </action> + </actions> + </install> + </package> + + <package name="ucsc_tools_340" version="1.0"> + <install version="1.0"> + <actions_group> + <actions architecture="x86_64" os="linux"> + <action type="download_by_url">http://old-gep.wustl.edu/~galaxy/ucsc_tools_340.tar.gz</action> + <action type="move_directory_files"> + <source_directory>.</source_directory> + <destination_directory>$INSTALL_DIR/bin</destination_directory> + </action> + </actions> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> + </action> + </actions_group> + </install> + <readme>The well known UCSC tools from Jim Kent.</readme> + </package> + <package name="jbrowse_tools" version="1.12.1"> + <install version="1.0"> + <actions_group> + <actions architecture="x86_64" os="linux"> + <action type="download_by_url">http://jbrowse.org/wordpress/wp-content/plugins/download-monitor/download.php?id=105</action> + <action type="make_directory">$INSTALL_DIR/jbrowse</action> + <action type="shell_command"> + ./setup.sh + </action> + <action type="move_directory_files"> + <source_directory>.</source_directory> + <destination_directory>$INSTALL_DIR/jbrowse</destination_directory> + </action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/jbrowse</environment_variable> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/jbrowse/bin</environment_variable> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/jbrowse/src</environment_variable> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/jbrowse/extlib</environment_variable> + </action> + </actions> + </actions_group> + </install> + <readme>The perl scripts for converting flat files to json.</readme> + </package> + + +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trackObject.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +import os +import shutil +import utils +import bedToGff3 +import blastxmlToGff3 + + +class trackObject: + def __init__(self, chrom_size, genome, extra_files_path): + self.chrom_size = chrom_size + outputDirect = os.path.join(extra_files_path, genome) + self.raw_folder = os.path.join(outputDirect, 'raw') + #Store metadata of the tracks + self.tracks = [] + try: + if os.path.exists(self.raw_folder): + if os.path.isdir(self.raw_folder): + shutil.rmtree(self.raw_folder) + else: + os.remove(self.raw_folder) + os.makedirs(self.raw_folder) + except OSError as oserror: + print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror) + + def addToRaw(self, dataFile, dataType): + """ + Convert gff3, BED, blastxml and gtf files into gff3 files + and store converted files in folder 'raw' + """ + false_path = os.path.abspath(dataFile) + fileName = os.path.basename(dataFile) + des_path = os.path.join(self.raw_folder, fileName) + track = {} + if dataType == 'bed' or dataType == 'gff3' or dataType == 'gff3_mrna' or dataType == 'gff3_transcript' or dataType == 'fasta' or dataType == 'bam' or dataType == 'bigwig': + if dataType == 'bam': + # JBrowse will raise error: not a BAM file if the filename hasn't .bam extension + extension = os.path.splitext(fileName)[1] + if extension != '.bam': + fileName = fileName + '.bam' + des_path = os.path.join(self.raw_folder, fileName) + bam_index = utils.createBamIndex(dataFile) + indexname = os.path.basename(bam_index) + des_path_for_index = os.path.join(self.raw_folder, indexname) + shutil.copyfile(bam_index, des_path_for_index) + track['index'] = indexname + + try: + shutil.copyfile(dataFile, des_path) + except shutil.Error as err1: + print "Cannot move file, error({0}: {1})".format(err1.errno, err1.strerror) + except IOError as err2: + print "Cannot move file, error({0}: {1})".format(err2.errno, err2.strerror) + elif dataType == 'bedSimpleRepeats': + bedToGff3.bedToGff3(dataFile, self.chrom_size, 'trfbig', des_path) + elif dataType == 'bedSpliceJunctions': + bedToGff3.bedToGff3(dataFile, self.chrom_size, 'regtools', des_path) + elif dataType == 'blastxml': + blastxmlToGff3.blastxml2gff3(dataFile, des_path) + elif dataType == 'gtf': + utils.gtfToGff3(dataFile, des_path, self.chrom_size) + track['fileName'] = fileName + track['dataType'] = dataType + track['false_path'] = false_path + #self.SetMetadata(track, metaData) + self.tracks.append(track) + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils.py Wed Apr 12 17:41:55 2017 -0400 @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +""" +This file include common used functions for converting file format to gff3 +""" +from collections import OrderedDict +import json +import subprocess +import os +import tempfile +import string + +def write_features(field, attribute, gff3): + """ + The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) + field, attribute are ordered dictionary + gff3 is the file handler + """ + attr = [] + for v in field.values(): + gff3.write(str(v) + '\t') + for k, v in attribute.items(): + s = str(k) + '=' + str(v) + attr.append(s) + gff3.write(';'.join(attr)) + gff3.write('\n') + +def getChromSizes(reference, tool_dir): + #TODO: find a better way instead of shipping the two exec files with the tool + faToTwoBit = os.path.join(tool_dir, 'faToTwoBit') + twoBitInfo = os.path.join(tool_dir, 'twoBitInfo') + try: + twoBitFile = tempfile.NamedTemporaryFile(bufsize=0) + chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False) + except IOError as err: + print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror) + try: + subprocess.call(['faToTwoBit', reference, twoBitFile.name]) + except OSError as err: + print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror) + try: + subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name]) + except OSError as err: + print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror) + return chrom_sizes + +def sequence_region(chrom_sizes): + """ + This function read from a chromatin size file generated by twoBitInfo and write the information to dict + return a dict + """ + f = open(chrom_sizes, 'r') + sizes = f.readlines() + sizes_dict = {} + for line in sizes: + chrom_info = line.rstrip().split('\t') + sizes_dict[chrom_info[0]] = chrom_info[1] + return sizes_dict + +def child_blocks(parent_field, parent_attr, gff3): + num = 0 + blockcount = int(parent_attr['blockcount']) + chromstart = parent_attr['chromstarts'].split(',') + blocksize = parent_attr['blocksizes'].split(',') + while num < blockcount: + child_attr = OrderedDict() + child_field = parent_field + child_field['type'] = 'exon_junction' + child_field['start'] = int(chromstart[num]) + int(parent_field['start']) + child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1 + child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1) + child_attr['Parent'] = parent_attr['ID'] + write_features(child_field, child_attr, gff3) + num = num + 1 + +def add_tracks_to_json(trackList_json, new_tracks, modify_type): + """ + Add to track configuration (trackList.json) + # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() + # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) + """ + with open(trackList_json, 'r+') as f: + data = json.load(f) + if modify_type == 'add_tracks': + data['tracks'].append(new_tracks) + elif modify_type == 'add_attr': + for k in new_tracks: + for track in data['tracks']: + if k.lower() in track['urlTemplate'].lower(): + attr = new_tracks[k] + for k, v in attr.items(): + track[k] = v + f.seek(0, 0) + f.write(json.dumps(data, separators=(',' , ':'), indent=4)) + f.truncate() + f.close() + +def gtfToGff3(gtf_file, gff3_file, chrom_sizes): + """ + Covert gtf file output from StringTie to gff3 format + """ + gff3 = open(gff3_file, 'w') + gff3.write("##gff-version 3\n") + sizes_dict = sequence_region(chrom_sizes) + seq_regions = dict() + parents = dict() + with open(gtf_file, 'r') as gtf: + for line in gtf: + if line.startswith('#'): + continue + field = OrderedDict() + attribute = OrderedDict() + li = line.rstrip().split("\t") + #print li + field['seqid'] = li[0] + #print field['seqid'] + if field['seqid'] not in seq_regions: + end_region = sizes_dict[field['seqid']] + gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') + seq_regions[field['seqid']] = end_region + field['source'] = li[1] + field['type'] = li[2] + # The first base in a chromosome is numbered 0 in BED format + field['start'] = li[3] + field['end'] = li[4] + field['score'] = li[5] + field['strand'] = li[6] + field['phase'] = li[7] + attr_li = li[8].split(';') + gene_id = attr_li[0].split()[1].strip('"') + attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end']) + if field['type'] == 'transcript': + parents[gene_id] = attribute['ID'] + attribute['transcript_id'] = attr_li[1].split()[1].strip('"') + attribute['coverage'] = attr_li[2].split()[1].strip('"') + attribute['fpkm'] = attr_li[3].split()[1].strip('"') + attribute['tpm'] = attr_li[4].split()[1].strip('"') + elif field['type'] == 'exon': + attribute['Parent'] = parents[gene_id] + attribute['transcript_id'] = attr_li[1].split()[1].strip('"') + attribute['coverage'] = attr_li[3].split()[1].strip('"') + write_features(field, attribute, gff3) + gff3.close() + + +def sanitize_name(input_name): + """ + Galaxy will name all the files and dirs as *.dat, + the function can replace '.' to '_' for the dirs + """ + validChars = "_-%s%s" % (string.ascii_letters, string.digits) + sanitized_name = ''.join([c if c in validChars else '_' for c in input_name]) + return "gonramp_" + sanitized_name + +def createBamIndex(bamfile): + subprocess.call(['samtools', 'index', bamfile]) + filename = bamfile + '.bai' + if os.path.exists(filename): + return filename + else: + raise ValueError('Did not find bai file')