Mercurial > repos > yating-l > jbrowsearchivecreator
view utils.py @ 3:eda851e52060 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 479fc6490e24ca0e5034ae6c3579882e97e095e6-dirty
author | yating-l |
---|---|
date | Wed, 31 May 2017 15:45:47 -0400 |
parents | 804a93e87cc8 |
children | 7e471cdd9e71 |
line wrap: on
line source
#!/usr/bin/env python """ This file include common used functions for converting file format to gff3 """ from collections import OrderedDict import json import subprocess import os import tempfile import string def write_features(field, attribute, gff3): """ The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) field, attribute are ordered dictionary gff3 is the file handler """ attr = [] for v in field.values(): gff3.write(str(v) + '\t') for k, v in attribute.items(): s = str(k) + '=' + str(v) attr.append(s) gff3.write(';'.join(attr)) gff3.write('\n') def getChromSizes(reference, tool_dir): #TODO: find a better way instead of shipping the two exec files with the tool faToTwoBit = os.path.join(tool_dir, 'faToTwoBit') twoBitInfo = os.path.join(tool_dir, 'twoBitInfo') try: twoBitFile = tempfile.NamedTemporaryFile(bufsize=0) chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False) except IOError as err: print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror) try: subprocess.call(['faToTwoBit', reference, twoBitFile.name]) except OSError as err: print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror) try: subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name]) except OSError as err: print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror) return chrom_sizes def sequence_region(chrom_sizes): """ This function read from a chromatin size file generated by twoBitInfo and write the information to dict return a dict """ f = open(chrom_sizes, 'r') sizes = f.readlines() sizes_dict = {} for line in sizes: chrom_info = line.rstrip().split('\t') sizes_dict[chrom_info[0]] = chrom_info[1] return sizes_dict def child_blocks(parent_field, parent_attr, gff3): num = 0 blockcount = int(parent_attr['blockcount']) chromstart = parent_attr['chromstarts'].split(',') blocksize = parent_attr['blocksizes'].split(',') while num < blockcount: child_attr = OrderedDict() child_field = parent_field child_field['type'] = 'exon_junction' child_field['start'] = int(chromstart[num]) + int(parent_field['start']) child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1 child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1) child_attr['Parent'] = parent_attr['ID'] write_features(child_field, child_attr, gff3) num = num + 1 def add_tracks_to_json(trackList_json, new_tracks, modify_type): """ Add to track configuration (trackList.json) # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) """ with open(trackList_json, 'r+') as f: data = json.load(f) if modify_type == 'add_tracks': data['tracks'].append(new_tracks) elif modify_type == 'add_attr': for k in new_tracks: for track in data['tracks']: if k.lower() in track['urlTemplate'].lower(): attr = new_tracks[k] for k, v in attr.items(): track[k] = v f.seek(0, 0) f.write(json.dumps(data, separators=(',' , ':'), indent=4)) f.truncate() f.close() def gtfToGff3(gtf_file, gff3_file, chrom_sizes): """ Covert gtf file output from StringTie to gff3 format """ gff3 = open(gff3_file, 'w') gff3.write("##gff-version 3\n") sizes_dict = sequence_region(chrom_sizes) seq_regions = dict() parents = dict() with open(gtf_file, 'r') as gtf: for line in gtf: if line.startswith('#'): continue field = OrderedDict() attribute = OrderedDict() li = line.rstrip().split("\t") #print li field['seqid'] = li[0] #print field['seqid'] if field['seqid'] not in seq_regions: end_region = sizes_dict[field['seqid']] gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') seq_regions[field['seqid']] = end_region field['source'] = li[1] field['type'] = li[2] # The first base in a chromosome is numbered 0 in BED format field['start'] = li[3] field['end'] = li[4] field['score'] = li[5] field['strand'] = li[6] field['phase'] = li[7] attr_li = li[8].split(';') gene_id = attr_li[0].split()[1].strip('"') attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end']) if field['type'] == 'transcript': parents[gene_id] = attribute['ID'] attribute['transcript_id'] = attr_li[1].split()[1].strip('"') attribute['coverage'] = attr_li[2].split()[1].strip('"') attribute['fpkm'] = attr_li[3].split()[1].strip('"') attribute['tpm'] = attr_li[4].split()[1].strip('"') elif field['type'] == 'exon': attribute['Parent'] = parents[gene_id] attribute['transcript_id'] = attr_li[1].split()[1].strip('"') attribute['coverage'] = attr_li[3].split()[1].strip('"') write_features(field, attribute, gff3) gff3.close() def sanitize_name(input_name): """ Galaxy will name all the files and dirs as *.dat, the function can replace '.' to '_' for the dirs """ validChars = "_-%s%s" % (string.ascii_letters, string.digits) sanitized_name = ''.join([c if c in validChars else '_' for c in input_name]) return "gonramp_" + sanitized_name def createBamIndex(bamfile): subprocess.call(['samtools', 'index', bamfile]) filename = bamfile + '.bai' if os.path.exists(filename): return filename else: raise ValueError('Did not find bai file')