diff utils.py @ 0:804a93e87cc8 draft

planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author yating-l
date Wed, 12 Apr 2017 17:41:55 -0400
parents
children 7e471cdd9e71
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+"""
+This file include common used functions for converting file format to gff3
+"""
+from collections import OrderedDict
+import json
+import subprocess
+import os
+import tempfile
+import string
+
+def write_features(field, attribute, gff3):
+    """
+    The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
+    field, attribute are ordered dictionary 
+    gff3 is the file handler
+    """
+    attr = []
+    # Columns 1-8: each field value followed by a tab separator
+    # (the last tab doubles as the separator before column 9).
+    for v in field.values():
+        gff3.write(str(v) + '\t')
+    # Column 9: semicolon-joined "key=value" attribute pairs.
+    for k, v in attribute.items():
+        s = str(k) + '=' + str(v)
+        attr.append(s)
+    gff3.write(';'.join(attr))
+    gff3.write('\n')
+
+def getChromSizes(reference, tool_dir):
+    """
+    Build a chrom.sizes file for the given reference FASTA by running the
+    UCSC faToTwoBit and twoBitInfo executables.
+    Returns the NamedTemporaryFile object (created with delete=False) that
+    holds the sizes; callers presumably read it via its .name attribute --
+    sequence_region() expects a path string, so confirm call sites.
+    """
+    #TODO: find a better way instead of shipping the two exec files with the tool
+    # NOTE(review): these joined paths are never used below -- subprocess is
+    # invoked with the bare command names, so the tools are resolved from
+    # $PATH, not from tool_dir. Confirm which behavior is intended.
+    faToTwoBit = os.path.join(tool_dir, 'faToTwoBit')
+    twoBitInfo = os.path.join(tool_dir, 'twoBitInfo')
+    try:
+        # bufsize=0: unbuffered, so the subprocesses see writes immediately.
+        # delete=False lets the chrom.sizes file outlive this function.
+        twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+        chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
+    except IOError as err:
+        print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        subprocess.call(['faToTwoBit', reference, twoBitFile.name])
+    except OSError as err:
+        print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name])
+    except OSError as err:
+        print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
+    return chrom_sizes
+
+def sequence_region(chrom_sizes):
+    """
+    This function read from a chromatin size file generated by twoBitInfo and write the information to dict
+    return a dict mapping chromosome name -> size (size kept as a string)
+    chrom_sizes is a path string, not a file object.
+    """
+    # NOTE(review): the handle is never closed; relies on interpreter cleanup.
+    f = open(chrom_sizes, 'r')
+    sizes = f.readlines()
+    sizes_dict = {}
+    for line in sizes:
+        # Expected twoBitInfo format: "<chrom>\t<size>" per line.
+        chrom_info = line.rstrip().split('\t')
+        sizes_dict[chrom_info[0]] = chrom_info[1]
+    return sizes_dict
+
+def child_blocks(parent_field, parent_attr, gff3):
+    """
+    Write one child 'exon_junction' feature per block of a BED12-style
+    parent record (using its blockcount/chromstarts/blocksizes attributes)
+    to the gff3 file handle, parented via the parent's ID.
+    """
+    num = 0
+    blockcount = int(parent_attr['blockcount'])
+    chromstart = parent_attr['chromstarts'].split(',')
+    blocksize = parent_attr['blocksizes'].split(',')
+    while num < blockcount:
+        child_attr = OrderedDict()
+        # NOTE(review): this aliases (does not copy) parent_field, so the
+        # assignments below mutate the caller's dict, and from the second
+        # block onward 'start' is computed from the previous child's start
+        # rather than the parent's original start -- verify this is intended.
+        child_field = parent_field
+        child_field['type'] = 'exon_junction'
+        child_field['start'] = int(chromstart[num]) + int(parent_field['start'])
+        child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
+        child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1)
+        child_attr['Parent'] = parent_attr['ID']
+        write_features(child_field, child_attr, gff3)
+        num = num + 1
+
+def add_tracks_to_json(trackList_json, new_tracks, modify_type):
+    """
+    Add to track configuration (trackList.json)
+    # modify_type =  'add_tracks': add a new track like bam or bigwig, new_track = dict()
+    # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict())
+    Rewrites trackList_json in place; any other modify_type is a no-op
+    rewrite of the unchanged data.
+    """
+    with open(trackList_json, 'r+') as f:
+        data = json.load(f)
+        if modify_type == 'add_tracks':
+            data['tracks'].append(new_tracks)
+        elif modify_type == 'add_attr':
+            # Match existing tracks whose urlTemplate contains the track name
+            # (case-insensitive substring match).
+            for k in new_tracks:
+                for track in data['tracks']:
+                    if k.lower() in track['urlTemplate'].lower():
+                        attr = new_tracks[k]
+                        # NOTE(review): this inner 'k' shadows the outer loop
+                        # variable -- harmless because the outer for rebinds k
+                        # each iteration, but confusing; consider renaming.
+                        for k, v in attr.items():
+                            track[k] = v
+        # Rewind and rewrite the whole file with the updated JSON, then
+        # truncate any leftover bytes from the old (longer) content.
+        f.seek(0, 0)
+        f.write(json.dumps(data, separators=(',' , ':'), indent=4))
+        f.truncate()
+        # Redundant: the 'with' block already closes the file.
+        f.close()
+
+def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
+    """
+    Covert gtf file output from StringTie to gff3 format
+    chrom_sizes is the path to a twoBitInfo chrom.sizes file (read via
+    sequence_region). Attribute parsing below is positional: it assumes
+    StringTie's exact attribute order (gene_id first, then transcript_id,
+    then cov/FPKM/TPM for transcripts, exon_number/cov for exons).
+    Raises KeyError if a seqid is missing from chrom_sizes, or if an exon
+    appears before its gene's transcript line.
+    """
+    gff3 = open(gff3_file, 'w')
+    gff3.write("##gff-version 3\n")
+    sizes_dict = sequence_region(chrom_sizes)
+    # seqids already announced with a ##sequence-region pragma
+    seq_regions = dict()
+    # gene_id -> ID of the most recently seen transcript for that gene
+    parents = dict()
+    with open(gtf_file, 'r') as gtf:
+        for line in gtf:
+            if line.startswith('#'):
+                continue
+            field = OrderedDict()
+            attribute = OrderedDict()
+            li = line.rstrip().split("\t")
+            #print li
+            field['seqid'] = li[0]
+            #print field['seqid']
+            # Emit a ##sequence-region pragma the first time a seqid is seen.
+            if field['seqid'] not in seq_regions:
+                end_region = sizes_dict[field['seqid']]
+                gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                seq_regions[field['seqid']] = end_region
+            field['source'] = li[1]
+            field['type'] = li[2]
+                # NOTE(review): start/end are copied through with no coordinate
+                # shift; the earlier "BED is 0-based" remark did not apply here.
+            field['start'] = li[3]
+            field['end'] = li[4]
+            field['score'] = li[5]
+            field['strand'] = li[6]
+            field['phase'] = li[7]
+            attr_li = li[8].split(';')
+            # First attribute is assumed to be gene_id "<value>".
+            gene_id = attr_li[0].split()[1].strip('"')
+            attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
+            if field['type'] == 'transcript':
+                parents[gene_id] = attribute['ID']
+                attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+                attribute['coverage'] = attr_li[2].split()[1].strip('"')
+                attribute['fpkm'] = attr_li[3].split()[1].strip('"')
+                attribute['tpm'] = attr_li[4].split()[1].strip('"')
+            elif field['type'] == 'exon':
+                # Exons are parented to the transcript most recently recorded
+                # for this gene_id.
+                attribute['Parent'] = parents[gene_id]
+                attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+                # attr_li[2] is exon_number (skipped); attr_li[3] is cov.
+                attribute['coverage'] = attr_li[3].split()[1].strip('"')
+            write_features(field, attribute, gff3)
+    gff3.close()
+
+
+def sanitize_name(input_name):
+    """
+    Galaxy will name all the files and dirs as *.dat, 
+    the function can replace '.' to '_' for the dirs
+    Returns the sanitized name prefixed with 'gonramp_'.
+    """
+    # Whitelist: ASCII letters, digits, '_' and '-'; every other character
+    # (including '.') becomes '_'.
+    validChars = "_-%s%s" % (string.ascii_letters, string.digits)
+    sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
+    return "gonramp_" + sanitized_name
+
+def createBamIndex(bamfile):
+    """
+    Index a BAM file with 'samtools index' (samtools resolved from $PATH)
+    and return the path of the resulting .bai file.
+    Raises ValueError if the expected index file was not created.
+    """
+    subprocess.call(['samtools', 'index', bamfile])
+    # samtools writes the index alongside the input as <bamfile>.bai
+    filename = bamfile + '.bai'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find bai file')