# HG changeset patch
# User devteam
# Date 1415121319 18000
# Node ID 6e589f267c1421ecb5b6760c947009ed159bb280
# Parent  619e0fcd9126df9ca8f7c1b1a8e3abc097abfee9
Uploaded

diff -r 619e0fcd9126 -r 6e589f267c14 GFFParser.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GFFParser.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,495 @@
+#!/usr/bin/env python
+"""
+Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
+
+Requirements: 
+    Numpy :- http://numpy.org/ 
+    Scipy :- http://scipy.org/ 
+
+Copyright (C)	
+
+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. 
+2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.
+"""
+
+import re
+import os
+import sys
+import urllib
+import numpy as np
+import scipy.io as sio
+from collections import defaultdict
+import helper as utils 
+
+def attribute_tags(col9):
+    """ 
+    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file 
+
+    @args col9: attribute column from GFF file 
+    @type col9: str
+    """
+    info = defaultdict(list)
+    is_gff = False
+    
+    if not col9:
+        return is_gff, info
+        
+    # trim the line ending semi-colon  ucsc may have some white-space  
+    col9 = col9.rstrip(';| ')
+    # attributes from 9th column 
+    atbs = col9.split(" ; ")
+    if len(atbs) == 1:
+        atbs = col9.split("; ")
+        if len(atbs) == 1:
+            atbs = col9.split(";")
+    # check the GFF3 pattern which has key value pairs like:
+    gff3_pat = re.compile("\w+=")
+    # sometime GTF have: gene_id uc002zkg.1;
+    gtf_pat = re.compile("\s?\w+\s")
+
+    key_vals = []
+
+    if gff3_pat.match(atbs[0]): # gff3 pattern 
+        is_gff = True
+        key_vals = [at.split('=') for at in atbs]
+    elif gtf_pat.match(atbs[0]): # gtf pattern
+        for at in atbs:
+            key_vals.append(at.strip().split(" ",1))
+    else:
+        # to handle attribute column has only single value 
+        key_vals.append(['ID', atbs[0]])
+    # get key, val items 
+    for item in key_vals:
+        key, val = item
+        # replace the double qoutes from feature identifier 
+        val = re.sub('"', '', val)
+        # replace the web formating place holders to plain text format 
+        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
+
+    return is_gff, info
+                
+def spec_features_keywd(gff_parts):
+    """
+    Specify the feature key word according to the GFF specifications
+
+    @args gff_parts: attribute field key 
+    @type gff_parts: str 
+    """
+    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
+        try:
+            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
+            break
+        except KeyError:
+            pass
+    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
+        try:
+            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
+            break
+        except KeyError:
+            pass
+    ## TODO key words
+    for flat_name in ["Transcript", "CDS"]:
+        if gff_parts["info"].has_key(flat_name):
+            # parents
+            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
+                if not gff_parts['id']:
+                    gff_parts['id'] = gff_parts['info'][flat_name][0]
+                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
+            # children 
+            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
+            break
+    return gff_parts
+
+def Parse(ga_file):
+    """
+    Parsing GFF/GTF file based on feature relationship, it takes the input file.
+
+    @args ga_file: input file name 
+    @type ga_file: str 
+    """
+    child_map = defaultdict(list)
+    parent_map = dict()
+
+    ga_handle = utils.open_file(ga_file)
+
+    for rec in ga_handle:
+        rec = rec.strip('\n\r')
+        
+        # skip empty line fasta identifier and commented line
+        if not rec or rec[0] in  ['#', '>']:
+            continue
+        # skip the genome sequence 
+        if not re.search('\t', rec):
+            continue
+
+        parts = rec.split('\t')
+        assert len(parts) >= 8, rec
+
+        # process the attribute column (9th column)
+        ftype, tags = attribute_tags(parts[-1])
+        if not tags: # skip the line if no attribute column.
+	        continue 
+        
+        # extract fields  
+        if parts[1]:
+            tags["source"] = parts[1]
+        if parts[7]:
+            tags["phase"] = parts[7]
+
+        gff_info = dict()
+        gff_info['info'] = dict(tags)
+        gff_info["is_gff3"] = ftype
+        gff_info['chr'] = parts[0]
+        gff_info['score'] = parts[5]
+
+        if parts[3] and parts[4]:
+            gff_info['location'] = [int(parts[3]) ,
+                        int(parts[4])]
+            gff_info['type'] = parts[2]
+            gff_info['id'] = tags.get('ID', [''])[0]
+            if parts[6] in ['?', '.']:
+                parts[6] = None 
+            gff_info['strand'] = parts[6]
+
+            # key word according to the GFF spec.
+            # is_gff3 flag is false check this condition and get the attribute fields 
+            if not ftype:
+                gff_info = spec_features_keywd(gff_info)
+            
+            # link the feature relationships
+            if gff_info['info'].has_key('Parent'): 
+                for p in gff_info['info']['Parent']:
+                    if p == gff_info['id']:
+                        gff_info['id'] = ''
+                        break
+                rec_category = 'child'
+            elif gff_info['id']:
+                rec_category = 'parent'
+            else:
+                rec_category = 'record'
+
+            # depends on the record category organize the features
+            if rec_category == 'child':
+                for p in gff_info['info']['Parent']:
+                    # create the data structure based on source and feature id 
+                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
+                                            dict( type = gff_info['type'], 
+                                            location =  gff_info['location'], 
+                                            strand = gff_info['strand'], 
+                                            score = gff_info['score'], 
+                                            ID = gff_info['id'],
+                                            gene_id = gff_info['info'].get('GParent', '') 
+                                            ))
+            elif rec_category == 'parent':
+                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict( 
+                                            type = gff_info['type'], 
+                                            location = gff_info['location'],
+                                            strand = gff_info['strand'],
+                                            score = gff_info['score'], 
+                                            name = tags.get('Name', [''])[0])
+            elif rec_category == 'record':
+                #TODO how to handle plain records?
+                c = 1 
+    ga_handle.close()
+
+    # depends on file type create parent feature  
+    if not ftype:
+        parent_map, child_map = create_missing_feature_type(parent_map, child_map)    
+    
+    # connecting parent child relations  
+    # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
+    gene_mat = format_gene_models(parent_map, child_map) 
+
+    return gene_mat 
+    
+def format_gene_models(parent_nf_map, child_nf_map): 
+    """
+    Genarate GeneObject based on the parsed file contents
+
+    @args parent_nf_map: parent features with source and chromosome information 
+    @type parent_nf_map: collections defaultdict
+    @args child_nf_map: transctipt and exon information are encoded 
+    @type child_nf_map: collections defaultdict
+    """
+
+    g_cnt = 0 
+    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene())
+
+    for pkey, pdet in parent_nf_map.items():
+        # considering only gene features 
+        #if not re.search(r'gene', pdet.get('type', '')):
+        #    continue 
+
+        # infer the gene start and stop if not there in the 
+        if not pdet.get('location', []):
+            GNS, GNE = [], []
+            # multiple number of transcripts 
+            for L1 in child_nf_map[pkey]:
+                GNS.append(L1.get('location', [])[0]) 
+                GNE.append(L1.get('location', [])[1]) 
+            GNS.sort()
+            GNE.sort()
+            pdet['location'] = [GNS[0], GNE[-1]]
+
+        orient = pdet.get('strand', '')
+        gene[g_cnt]['id'] = g_cnt +1 
+        gene[g_cnt]['chr'] = pkey[0]
+        gene[g_cnt]['source'] = pkey[1]
+        gene[g_cnt]['name'] = pkey[-1]
+        gene[g_cnt]['start'] = pdet.get('location', [])[0]
+        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
+        gene[g_cnt]['strand'] = orient  
+        gene[g_cnt]['score'] = pdet.get('score','')
+
+        # default value 
+        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
+        if len(child_nf_map[pkey]) > 1:
+            gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1
+
+        # complete sub-feature for all transcripts 
+        dim = len(child_nf_map[pkey])
+        TRS = np.zeros((dim,), dtype=np.object)
+        TR_TYP = np.zeros((dim,), dtype=np.object)
+        EXON = np.zeros((dim,), dtype=np.object)
+        UTR5 = np.zeros((dim,), dtype=np.object)
+        UTR3 = np.zeros((dim,), dtype=np.object)
+        CDS = np.zeros((dim,), dtype=np.object)
+        TISc = np.zeros((dim,), dtype=np.object)
+        TSSc = np.zeros((dim,), dtype=np.object)
+        CLV = np.zeros((dim,), dtype=np.object)
+        CSTOP = np.zeros((dim,), dtype=np.object)
+        TSTAT = np.zeros((dim,), dtype=np.object)
+
+        # fetching corresponding transcripts 
+        for xq, Lv1 in enumerate(child_nf_map[pkey]):
+
+            TID = Lv1.get('ID', '')
+            TRS[xq]= np.array([TID])
+
+            TYPE = Lv1.get('type', '')
+            TR_TYP[xq] = np.array('')
+            TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]
+
+            orient = Lv1.get('strand', '')
+
+            # fetching different sub-features 
+            child_feat = defaultdict(list)
+            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
+                E_TYP = Lv2.get('type', '')
+                child_feat[E_TYP].append(Lv2.get('location'))
+            
+            # make general ascending order of coordinates 
+            if orient == '-':
+                for etype, excod in child_feat.items():
+                    if len(excod) > 1:
+                        if excod[0][0] > excod[-1][0]:
+                            excod.reverse()
+                            child_feat[etype] = excod
+
+            # make exon coordinate from cds and utr regions 
+            if not child_feat.get('exon'):  
+                if child_feat.get('CDS'):
+                    exon_cod = utils.make_Exon_cod( orient, 
+                                NonetoemptyList(child_feat.get('five_prime_UTR')), 
+                                NonetoemptyList(child_feat.get('CDS')),
+                                NonetoemptyList(child_feat.get('three_prime_UTR')))
+                    child_feat['exon'] = exon_cod 
+                else: 
+                    # TODO only UTR's
+                    # searching through keys to find a pattern describing exon feature 
+                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
+                    if ex_key_pattern:
+                        child_feat['exon'] = child_feat[ex_key_pattern[0]]
+
+            # stop_codon are seperated from CDS, add the coordinates based on strand
+            if child_feat.get('stop_codon'):
+                if orient == '+':
+                    if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
+                        child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
+                    else:
+                        child_feat['CDS'].append(child_feat.get('stop_codon')[0])
+                elif orient == '-':
+                    if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
+                        child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
+                    else:
+                        child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])
+
+            # transcript signal sites 
+            TIS, cdsStop, TSS, cleave = [], [], [], []
+            cds_status, exon_status, utr_status = 0, 0, 0
+
+            if child_feat.get('exon'):
+                TSS = [child_feat.get('exon')[-1][1]]
+                TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS 
+                cleave = [child_feat.get('exon')[0][0]]
+                cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
+                exon_status = 1
+
+            if child_feat.get('CDS'):
+                if orient == '+': 
+                    TIS = [child_feat.get('CDS')[0][0]]
+                    cdsStop = [child_feat.get('CDS')[-1][1]-3]
+                else:
+                    TIS = [child_feat.get('CDS')[-1][1]]
+                    cdsStop = [child_feat.get('CDS')[0][0]+3]
+                cds_status = 1 
+                # cds phase calculation 
+                child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))
+            
+            # sub-feature status 
+            if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
+                utr_status =1 
+            
+            if utr_status == cds_status == exon_status == 1: 
+                t_status = 1
+            else:
+                t_status = 0
+            
+            # add sub-feature # make array for export to different out
+            TSTAT[xq] = t_status
+            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
+            UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
+            UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
+            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
+            TISc[xq] = np.array(TIS)
+            CSTOP[xq] = np.array(cdsStop)
+            TSSc[xq] = np.array(TSS)
+            CLV[xq] = np.array(cleave)
+            
+        # add sub-features to the parent gene feature
+        gene[g_cnt]['transcript_status'] = TSTAT
+        gene[g_cnt]['transcripts'] = TRS 
+        gene[g_cnt]['exons'] = EXON
+        gene[g_cnt]['utr5_exons'] = UTR5 
+        gene[g_cnt]['cds_exons'] = CDS 
+        gene[g_cnt]['utr3_exons'] = UTR3 
+        gene[g_cnt]['transcript_type'] = TR_TYP
+        gene[g_cnt]['tis'] = TISc
+        gene[g_cnt]['cdsStop'] = CSTOP
+        gene[g_cnt]['tss'] = TSSc
+        gene[g_cnt]['cleave'] = CLV
+
+        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1], 
+                                Name = pdet.get('name'), 
+                                Source = pkey[1]) 
+        # few empty fields // TODO fill this:
+        gene[g_cnt]['anno_id'] = []
+        gene[g_cnt]['confgenes_id'] = []
+        gene[g_cnt]['alias'] = ''
+        gene[g_cnt]['name2'] = []
+        gene[g_cnt]['chr_num'] = []
+        gene[g_cnt]['paralogs'] = []
+        gene[g_cnt]['transcript_info'] = []
+        gene[g_cnt]['transcript_valid'] = []
+        gene[g_cnt]['exons_confirmed'] = []
+        gene[g_cnt]['tis_conf'] = []
+        gene[g_cnt]['tis_info'] = []
+        gene[g_cnt]['cdsStop_conf'] = []
+        gene[g_cnt]['cdsStop_info'] = []
+        gene[g_cnt]['tss_info'] = []
+        gene[g_cnt]['tss_conf'] = []
+        gene[g_cnt]['cleave_info'] = []
+        gene[g_cnt]['cleave_conf'] = []
+        gene[g_cnt]['polya_info'] = []
+        gene[g_cnt]['polya_conf'] = []
+        gene[g_cnt]['is_valid'] = []
+        gene[g_cnt]['transcript_complete'] = []
+        gene[g_cnt]['is_complete'] = []
+        gene[g_cnt]['is_correctly_gff3_referenced'] = ''
+        gene[g_cnt]['splicegraph'] = []
+        g_cnt += 1 
+
+    ## deleting empty gene records from the main array
+    XPFLG=0
+    for XP, ens in enumerate(gene):
+        if ens[0]==0:
+            XPFLG=1
+            break
+    
+    if XPFLG==1:
+        XQC = range(XP, len(gene)+1)
+        gene = np.delete(gene, XQC)
+
+    return gene 
+
+def NonetoemptyList(XS):
+    """
+    Convert a None type to empty list 
+
+    @args XS: None type 
+    @type XS: str 
+    """
+    return [] if XS is None else XS 
+
+def create_missing_feature_type(p_feat, c_feat):
+    """
+    GFF/GTF file defines only child features. This function tries to create 
+    the parent feature from the information provided in the attribute column. 
+
+    example: 
+    chr21   hg19_knownGene  exon    9690071 9690100 0.000000        +       .       gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; 
+    chr21   hg19_knownGene  exon    9692178 9692207 0.000000        +       .       gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; 
+    chr21   hg19_knownGene  exon    9711935 9712038 0.000000        +       .       gene_id "uc011abu.2"; transcript_id "uc011abu.2"; 
+
+    This function gets the parsed feature annotations. 
+    
+    @args p_feat: Parent feature map  
+    @type p_feat: collections defaultdict
+    @args c_feat: Child feature map  
+    @type c_feat: collections defaultdict
+    """
+
+    child_n_map = defaultdict(list)
+    for fid, det in c_feat.items():
+        # get the details from grand child  
+        GID = STRD = SCR = None
+        SPOS, EPOS = [], [] 
+        TYP = dict()
+        for gchild in det:
+            GID = gchild.get('gene_id', [''])[0] 
+            SPOS.append(gchild.get('location', [])[0]) 
+            EPOS.append(gchild.get('location', [])[1]) 
+            STRD = gchild.get('strand', '')
+            SCR = gchild.get('score', '')
+            if gchild.get('type', '') == "gene": ## gencode GTF file has this problem 
+                continue
+            TYP[gchild.get('type', '')] = 1
+        SPOS.sort() 
+        EPOS.sort()
+        
+        # infer transcript type
+        transcript_type = 'transcript'
+        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type
+        
+        # gene id and transcript id are same
+        transcript_id = fid[-1]
+        if GID == transcript_id:
+            transcript_id = 'Transcript:' + str(GID)
+        
+        # level -1 feature type 
+        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
+                                            location = [], ## infer location based on multiple transcripts  
+                                            strand = STRD,
+                                            name = GID )
+        # level -2 feature type 
+        child_n_map[(fid[0], fid[1], GID)].append(
+                                            dict( type = transcript_type,
+                                            location =  [SPOS[0], EPOS[-1]], 
+                                            strand = STRD, 
+                                            score = SCR, 
+                                            ID = transcript_id,
+                                            gene_id = '' ))
+        # reorganizing the grand child
+        for gchild in det:
+            child_n_map[(fid[0], fid[1], transcript_id)].append(
+                                            dict( type = gchild.get('type', ''),
+                                            location =  gchild.get('location'),
+                                            strand = gchild.get('strand'), 
+                                            ID = gchild.get('ID'),
+                                            score = gchild.get('score'),
+                                            gene_id = '' ))
+    return p_feat, child_n_map 
+
diff -r 619e0fcd9126 -r 6e589f267c14 README
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,61 @@
+A collection of tools for converting genome annotation between GTF (Gene Transfer Format), 
+BED (Browser Extensible Data) and GFF (Generic Feature Format).
+
+INTRODUCTION
+
+Several genome annotation centers provide their data in GTF, BED, GFF3 etc. I have few programs 
+they mainly deals with converting between GTF, BED and GFF3 formats. They are extensively tested 
+with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the 
+instructions below to clone these tools into your galaxy instance.
+
+CONTENTS
+
+Tool configuration files in *.xml format. 
+
+    gtf_to_gff.xml
+    gff_to_gtf.xml
+    bed_to_gff.xml
+    gff_to_bed.xml
+    gbk_to_gff.xml
+    gff_fmap.xml
+
+Python based scripts. 
+
+    gtf_to_gff.py: convert data from GTF to valid GFF3.
+    gff_to_gtf.py: convert data from GFF3 to GTF.
+    bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3.
+    gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format.
+    gbk_to_gff.py: convert data from genbank format to GFF. 
+    gff_fmap.py: find the relation between different features described in a GFF file.  
+    GFFParser.py: Parse GFF/GTF files.  
+    helper.py: Utility functions.
+
+test-data: Test data set. (move to your galaxy_root_folder/test-data/)
+    
+    You may need to move the test files into your test-data directory so galaxy can find them. 
+    If you want to run the functional tests eg as: 
+
+    exmaple: 
+    sh run_functional_tests.sh -id fml_gtf2gff
+
+REQUIREMENTS
+
+    python 
+
+COMMENTS/QUESTIONS 
+
+I can be reached at vipin [at] cbio.mskcc.org 
+
+LICENSE
+
+Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society
+              2013-2014 Memorial Sloan Kettering Cancer Center
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+COURTESY
+
+To the Galaxy Team.
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_gff.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+Convert genome annotation data in a 12 column BED format to GFF3. 
+
+Usage: python bed_to_gff.py in.bed > out.gff
+
+Requirement:
+    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import helper 
+
+def __main__():
+    """
+    main function 
+    """
+
+    try:
+        bed_fname = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    bed_fh = helper.open_file(bed_fname)
+
+    for line in bed_fh: 
+        line = line.strip( '\n\r' )
+
+        if not line or line[0] in  ['#']:
+            continue 
+
+        parts = line.split('\t') 
+        assert len(parts) >= 12, line
+
+        rstarts = parts[-1].split(',')
+        rstarts.pop() if rstarts[-1] == '' else rstarts
+
+        exon_lens = parts[-2].split(',')
+        exon_lens.pop() if exon_lens[-1] == '' else exon_lens
+        
+        if len(rstarts) != len(exon_lens):
+            continue # checking the consistency col 11 and col 12 
+
+        if len(rstarts) != int(parts[-3]): 
+            continue # checking the number of exons and block count are same
+        
+        if not parts[5] in ['+', '-']:
+            parts[5] = '.' # replace the unknown strand with '.' 
+
+        # bed2gff result line 
+        print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])
+        print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])
+
+        st = int(parts[1])
+        for ex_cnt in range(int(parts[-3])):
+            start = st + int(rstarts[ex_cnt]) + 1
+            stop = start + int(exon_lens[ex_cnt]) - 1
+            print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3])
+
+    bed_fh.close()
+
+
+if __name__ == "__main__": 
+    __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_gff.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,89 @@
+<tool id="fml_bed2gff" name="BED-to-GFF" version="2.0.0">
+	<description>converter</description>
+	<command interpreter="python">bed_to_gff.py $inf_bed  &gt; $gff_format 
+	</command> 
+	<inputs>
+  		<param format="bed" name="inf_bed" type="data" label="Convert this query" help="Provide genome annotation in 12 column BED format."/>
+    </inputs>
+  	<outputs>
+  		<data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted" /> 
+  	</outputs>
+	<tests>
+        <test>
+                <param name="inf_bed" value="ccds_genes.bed" />
+                <output name="gff_format" file="ccds_genes.gff3" />
+        </test>
+        <test>
+                <param name="inf_bed" value="hs_2009.bed" />
+                <output name="gff_format" file="hs_2009.gff3" />
+        </test>
+        </tests>
+  	<help>
+
+**What it does**
+
+This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in UCSC Wiggle BED format::
+
+	chr1    11873   14409   uc001aaa.3      0       +       11873   11873   0       3       354,109,1189,   0,739,1347,
+
+- Will be converted to GFF3::
+
+	##gff-version 3
+	chr1    bed2gff gene    11874   14409   0       +       .       ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+	chr1    bed2gff transcript      11874   14409   0       +       .       ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+	chr1    bed2gff exon    11874   12227   0       +       .       Parent=uc001aaa.3
+	chr1    bed2gff exon    12613   12721   0       +       .       Parent=uc001aaa.3
+	chr1    bed2gff exon    13221   14409   0       +       .       Parent=uc001aaa.3
+
+--------
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold or contig.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+"""
+Convert data from Genbank format to GFF. 
+
+Usage: 
+python gbk_to_gff.py in.gbk > out.gff 
+
+Requirements:
+    BioPython:- http://biopython.org/
+    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import os
+import re
+import sys
+import collections
+from Bio import SeqIO
+import helper 
+
+def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
+    """
+    Write the feature information
+    """
+
+    for gname, ginfo in genes.items():
+        line = [str(chr_id), 
+                'gbk_to_gff',
+                ginfo[3],
+                str(ginfo[0]),
+                str(ginfo[1]),
+                '.',
+                ginfo[2],
+                '.',
+                'ID=%s;Name=%s' % (str(gname), str(gname))]
+        print '\t'.join(line) 
+        ## construct the transcript line is not defined in the original file 
+        t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.'] 
+
+        if not transcripts:
+            t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
+
+            if exons: ## get the entire transcript region  from the defined feature
+                t_line[3] = str(exons[gname][0][0])
+                t_line[4] = str(exons[gname][0][-1])
+            elif cds:
+                t_line[3] = str(cds[gname][0][0])
+                t_line[4] = str(cds[gname][0][-1])
+            print '\t'.join(t_line) 
+
+            if exons:
+                exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
+
+            if cds:
+                exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
+                if not exons:
+                    exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
+
+        else: ## transcript is defined 
+            for idx in transcripts[gname]: 
+                t_line[2] = idx[3]
+                t_line[3] = str(idx[0])
+                t_line[4] = str(idx[1])
+                t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
+                print '\t'.join(t_line) 
+                
+                ## feature line print call 
+                if exons:
+                    exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
+                if cds:
+                    exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
+                    if not exons:
+                        exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
+
+    if len(genes) == 0: ## feature entry with fragment information 
+        
+        line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.'] 
+        fStart = fStop = None 
+
+        for eid, ex in cds.items(): 
+            fStart = ex[0][0] 
+            fStop = ex[0][-1]
+
+        for eid, ex in exons.items(): 
+            fStart = ex[0][0] 
+            fStop = ex[0][-1]
+
+        if fStart or fStart:
+
+            line[2] = 'gene'
+            line[3] = str(fStart)
+            line[4] = str(fStop)
+            line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
+            print "\t".join(line)
+
+            if not cds:
+                line[2] = 'transcript'
+            else:
+                line[2] = 'mRNA'
+
+            line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
+            print "\t".join(line)
+           
+            if exons:
+                exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+                
+            if cds:
+                exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
+                if not exons:
+                    exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+                
+            unk +=1 
+
+    return unk
+
+def exon_line_print(temp_line, trx_exons, parent, ftype):
+    """
+    Print the EXON feature line 
+    """
+
+    for ex in trx_exons:
+        temp_line[2] = ftype
+        temp_line[3] = str(ex[0])
+        temp_line[4] = str(ex[1])
+        temp_line[8] = 'Parent=%s' % parent
+        print '\t'.join(temp_line)
+
+def gbk_parse(fname):
+    """
+    Extract genome annotation recods from genbank format 
+
+    @args fname: gbk file name 
+    @type fname: str
+    """
+
+    fhand = helper.open_file(gbkfname)
+    unk = 1 
+
+    for record in SeqIO.parse(fhand, "genbank"):
+
+        gene_tags = dict()
+        tx_tags = collections.defaultdict(list) 
+        exon = collections.defaultdict(list) 
+        cds = collections.defaultdict(list) 
+        mol_type, chr_id = None, None 
+
+        for rec in record.features:
+
+            if rec.type == 'source':
+                try:
+                    mol_type = rec.qualifiers['mol_type'][0]
+                except:
+                    mol_type = '.'
+                    pass 
+                try:
+                    chr_id = rec.qualifiers['chromosome'][0]
+                except:
+                    chr_id = record.name 
+                continue 
+
+            strand='-'
+            strand='+' if rec.strand>0 else strand
+            
+            fid = None 
+            try:
+                fid = rec.qualifiers['gene'][0]
+            except:
+                pass
+
+            transcript_id = None
+            try:
+                transcript_id = rec.qualifiers['transcript_id'][0]
+            except:
+                pass 
+
+            if re.search(r'gene', rec.type):
+                gene_tags[fid] = (rec.location._start.position+1, 
+                                    rec.location._end.position, 
+                                    strand,
+                                    rec.type
+                                    )
+            elif rec.type == 'exon':
+                exon[fid].append((rec.location._start.position+1, 
+                                    rec.location._end.position))
+            elif rec.type=='CDS':
+                cds[fid].append((rec.location._start.position+1, 
+                                    rec.location._end.position))
+            else: 
+                # get all transcripts 
+                if transcript_id: 
+                    tx_tags[fid].append((rec.location._start.position+1,
+                                    rec.location._end.position, 
+                                    transcript_id,
+                                    rec.type))
+        # record extracted, generate feature table
+        unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
+        
+    fhand.close()
+
+
+if __name__=='__main__': 
+
+    try:
+        gbkfname = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    ## extract gbk records  
+    gbk_parse(gbkfname) 
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,91 @@
+<tool id="fml_gbk2gff" name="GBK-to-GFF" version="2.0.0">
+  <description>converter</description>
+  	<command interpreter="python">gbk_to_gff.py $inf_gbk &gt; $gff_format
+  	</command>
+  	<inputs>
+		<param format="gb,gbk,genbank,txt" name="inf_gbk" type="data" label="Convert this query" help="GenBank flat file format consists of an annotation section and a sequence section."/>
+  	</inputs>
+	<outputs>
+		<data format="gff3" name="gff_format" label="${tool.name} on ${on_string}: Converted"/>
+  	</outputs>
+	<tests>
+        <test>
+                <param name="inf_gbk" value="s_cerevisiae_SCU49845.gbk" />
+                <output name="gff_format" file="s_cerevisiae_SCU49845.gff3" />
+        </test>
+	</tests>
+  	<help>
+    
+**What it does**
+
+This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description).
+
+.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ 
+
+------
+
+**Example**
+
+- The following data in GenBank format::
+	
+    LOCUS       NM_001202705            2406 bp    mRNA    linear   PLN 28-MAY-2011
+    DEFINITION  Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
+                mRNA, complete cds.
+    ACCESSION   NM_001202705
+    VERSION     NM_001202705.1  GI:334184566.........
+    FEATURES             Location/Qualifiers
+         source          1..2406
+                         /organism="Arabidopsis thaliana"
+                         /mol_type="mRNA"
+                         /db_xref="taxon:3702"........
+         gene            1..2406
+                         /gene="THIC"
+                         /locus_tag="AT2G29630"
+                         /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
+    ORIGIN
+        1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
+        61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
+        121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
+        181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
+    //
+
+
+- Will be converted to GFF3::
+
+    ##gff-version 3
+    NM_001202705    gbk_to_gff chromosome      1       2406    .       +       1       ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705
+    NM_001202705    gbk_to_gff gene    1       2406    .       +       1       ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC
+    NM_001202705    gbk_to_gff mRNA    192     2126    .       +       1       ID=AT2G29630.t01;Parent=AT2G29630
+    NM_001202705    gbk_to_gff CDS     192     2126    .       +       1       ID=AT2G29630.p01;Parent=AT2G29630.t01
+    NM_001202705    gbk_to_gff exon    192     2126    .       +       1       Parent=AT2G29630.t01
+
+------
+
+**About formats** 
+
+**GenBank format** An example of a GenBank record may be viewed here_
+
+.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html 
+
+**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold or contig.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_fmap.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,203 @@
+#!/usr/bin/env python
+"""
+GFF feature mapping program, to find the relation between different features described in a given GFF file. 
+
+Usage: 
+python gff_fmap.py in.gff > out.txt 
+
+Courtesy: Brad Chapman 
+    Few functions are inherited from bcbio-GFFParser program. 
+"""
+
+import re
+import sys 
+import urllib
+import collections
+from helper import open_file
+
+def _gff_line_map(line):
+    """Parses a line of GFF into a dictionary.
+    Given an input line from a GFF file, this:
+        - breaks it into component elements
+        - determines the type of attribute (flat, parent, child or annotation)
+        - generates a dictionary of GFF info 
+    """
+    gff3_kw_pat = re.compile("\w+=")
+    def _split_keyvals(keyval_str):
+        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+        GFF3 has key value pairs like:
+          count=9;gene=amx-2;sequence=SAGE:aacggagccg
+        GFF2 and GTF have:           
+          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+        """
+        quals = collections.defaultdict(list)
+        if keyval_str is None:
+            return quals
+        # ensembl GTF has a stray semi-colon at the end
+        if keyval_str[-1] == ';':
+            keyval_str = keyval_str[:-1]
+        # GFF2/GTF has a semi-colon with at least one space after it.
+        # It can have spaces on both sides; wormbase does this.
+        # GFF3 works with no spaces.
+        # Split at the first one we can recognize as working
+        parts = keyval_str.split(" ; ")
+        if len(parts) == 1:
+            parts = keyval_str.split("; ")
+            if len(parts) == 1:
+                parts = keyval_str.split(";")
+        # check if we have GFF3 style key-vals (with =)
+        is_gff2 = True
+        if gff3_kw_pat.match(parts[0]):
+            is_gff2 = False
+            key_vals = [p.split('=') for p in parts]
+        # otherwise, we are separated by a space with a key as the first item
+        else:
+            pieces = []
+            for p in parts:
+                # fix misplaced semi-colons in keys in some GFF2 files
+                if p and p[0] == ';':
+                    p = p[1:]
+                pieces.append(p.strip().split(" "))
+            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+        for key, val in key_vals:
+            # remove quotes in GFF2 files
+            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+                val = val[1:-1] 
+            if val:
+                quals[key].extend(val.split(','))
+            # if we don't have a value, make this a key=True/False style
+            # attribute
+            else:
+                quals[key].append('true')
+        for key, vals in quals.items():
+            quals[key] = [urllib.unquote(v) for v in vals]
+        return quals, is_gff2
+
+    def _nest_gff2_features(gff_parts):
+        """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+        exons and coding sequences are mapped to a parent with a transcript_id
+        in GFF2. This is implemented differently at different genome centers
+        and this function attempts to resolve that and map things to the GFF3
+        way of doing them.
+        """
+        # map protein or transcript ids to a parent
+        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+            try:
+                gff_parts["quals"]["Parent"] = \
+                        gff_parts["quals"][transcript_id]
+                break
+            except KeyError:
+                pass
+        # case for WormBase GFF -- everything labelled as Transcript or CDS
+        for flat_name in ["Transcript", "CDS"]:
+            if gff_parts["quals"].has_key(flat_name):
+                # parent types
+                if gff_parts["type"] in [flat_name]:
+                    if not gff_parts["id"]:
+                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
+                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
+                # children types
+                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+                break
+
+        return gff_parts
+
+    line = line.strip()
+    if line == '':return [('directive', line)] # sometimes the blank lines will be there 
+    if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header
+    if line[0] == "#":
+        return [('directive', line[2:])]
+    elif line:
+        parts = line.split('\t')
+        if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together 
+        assert len(parts) == 9, line
+        gff_parts = [(None if p == '.' else p) for p in parts]
+        gff_info = dict()
+            
+        # collect all of the base qualifiers for this item
+        quals, is_gff2 = _split_keyvals(gff_parts[8])
+
+        gff_info["is_gff2"] = is_gff2
+
+        if gff_parts[1]:quals["source"].append(gff_parts[1])
+        gff_info['quals'] = dict(quals)
+
+        # if we are describing a location, then we are a feature
+        if gff_parts[3] and gff_parts[4]:
+            gff_info['type'] = gff_parts[2]
+            gff_info['id'] = quals.get('ID', [''])[0]
+            
+            if is_gff2:gff_info = _nest_gff2_features(gff_info)
+            # features that have parents need to link so we can pick up
+            # the relationship
+            if gff_info['quals'].has_key('Parent'):
+                final_key = 'child'
+            elif gff_info['id']:
+                final_key = 'parent'
+            # Handle flat features
+            else:
+                final_key = 'feature'
+        # otherwise, associate these annotations with the full record
+        else:
+            final_key = 'annotation'
+        return [(final_key, gff_info)]
+    
+def parent_child_id_map(gff_handle):
+    """
+    Provide a mapping of parent to child relationships in the file.
+    Gives a dictionary of parent child relationships:
+
+    keys -- tuple of (source, type) for each parent
+    values -- tuple of (source, type) as children of that parent
+    """
+    # collect all of the parent and child types mapped to IDs
+    parent_sts = dict()
+    child_sts = collections.defaultdict(list)
+    for line in gff_handle:
+        line_type, line_info = _gff_line_map(line)[0]
+        if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
+            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
+        if line_type == 'child':
+            for parent_id in line_info['quals']['Parent']:
+                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
+    gff_handle.close()
+    # generate a dictionary of the unique final type relationships
+    pc_map = collections.defaultdict(list)
+    for parent_id, parent_type in parent_sts.items():
+        for child_type in child_sts[parent_id]:
+            pc_map[parent_type].append(child_type)
+    pc_final_map = dict()
+    for ptype, ctypes in pc_map.items():
+        unique_ctypes = list(set(ctypes))
+        unique_ctypes.sort()
+        pc_final_map[ptype] = unique_ctypes
+    # some cases the GFF file represents a single feature type 
+    if not pc_final_map:
+        for fid, stypes in parent_sts.items():
+            pc_final_map[stypes] = dict()
+    # generate a report on feature id mapping in the file 
+    print '+---------------------+---------------------------------+'
+    print '| Parent feature type | Associated child feature type(s)|'
+    print '+---------------------+---------------------------------+'
+    for key, value in pc_final_map.items():
+        print key[0], key[1]
+        for child_to in value:
+            print '\t\t\t|-',child_to[0], child_to[1]
+        print '+---------------------+---------------------------------+'
+
+
+if __name__=='__main__':
+
+    try:
+        gff_file = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+    
+    gff_handle = open_file(gff_file)
+    parent_child_id_map(gff_handle)
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_fmap.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,83 @@
+<tool id="fml_gfffmap" name="GFF-map" version="2.0.0">
+  <description>features</description>
+  <command interpreter="python"> 
+		gff_fmap.py $gff_input &gt; $idmapping
+    </command>
+  <inputs>
+    <param format="gff3,gff" name="gff_input" type="data" label="Query file" help="Provide genome annotation file in GFF."/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="idmapping" label="${tool.name} on ${on_string}: parent child id map"/>
+  </outputs>
+		<tests>
+    		<test>
+            	<param name="gff_input" value="Feature_ID_mapping_W.gff3" />
+            	<output name="idmapping" file="Feature_ID_mapping_W.txt" />
+            </test>
+    		<test>
+            	<param name="gff_input" value="Aly_JGI.gff3" />
+            	<output name="idmapping" file="Feature_ID_mapping_R.txt" />
+            </test>
+        </tests>
+  <help>
+
+**What it does** 
+
+GFF-map provides the features (gene, mRNA, UTR's, exon, CDS etc) relationship based on their identifier mapping in a given GFF file.
+
+--------
+
+**Example**
+
+- The features ID mapping in the following data in GFF3::
+
+	##gff-version 3
+	17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
+	17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+	17      protein_coding  protein     7256262 7256960 .       +       0       ID=ENSP00000328352;Name=ENSP00000328352
+	17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
+	17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+	17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
+	17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
+
+- Will be displayed as::
+    
+    +-----------------------+---------------------------------+
+    | Parent feature type   | Associated child feature type(s)|
+    +-----------------------+---------------------------------+
+    | protein_coding gene   | protein_coding mRNA             |
+    +-----------------------+---------------------------------+
+    | protein_coding protein| protein_coding CDS              |
+    +-----------------------+---------------------------------+
+    | protein_coding mRNA   | protein_coding CDS              |
+    |                       | protein_coding exon             |
+    |                       | protein_coding five_prime_UTR   |
+    |                       | protein_coding three_prime_UTR  |
+    +-----------------------+---------------------------------+
+
+--------
+
+**About formats**
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""
+Convert genome annotation data in GFF/GTF to a 12 column BED format. 
+BED format typically represents the transcript models. 
+
+Usage: python gff_to_bed.py in.gff > out.bed  
+
+Requirement:
+    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+
+def writeBED(tinfo):
+    """
+    writing result files in bed format 
+
+    @args tinfo: list of genes 
+    @args tinfo: numpy object  
+    """
+
+    for ent1 in tinfo:
+        child_flag = False  
+
+        for idx, tid in enumerate(ent1['transcripts']):
+            child_flag = True 
+            exon_cnt = len(ent1['exons'][idx])
+            exon_len = ''
+            exon_cod = '' 
+            rel_start = None 
+            rel_stop = None 
+            for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript  
+                exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1)
+                if idz == 0: #calculate the relative start position 
+                    exon_cod += '0,'
+                    rel_start = int(ex_cod[0])
+                    rel_stop = ex_cod[1]
+                else:
+                    exon_cod += '%d,' % (ex_cod[0]-rel_start)
+                    rel_stop = int(ex_cod[1])
+            
+            if exon_len:
+                score = '0' 
+                score = ent1['score'][0] if ent1['score'] else score
+                out_print = [ent1['chr'],
+                            str(rel_start),
+                            str(rel_stop),
+                            tid[0],
+                            score, 
+                            ent1['strand'], 
+                            str(rel_start),
+                            str(rel_stop),
+                            '0',
+                            str(exon_cnt),
+                            exon_len,
+                            exon_cod]
+                print '\t'.join(out_print)  
+        
+        if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type 
+            score = '0' 
+            score = ent1['score'][0] if ent1['score'] else score
+
+            out_print = [ent1['chr'], 
+                        '%d' % int(ent1['start']), 
+                        '%d' % int(ent1['stop']),
+                        ent1['name'], 
+                        score, 
+                        ent1['strand'],
+                        '%d' % int(ent1['start']), 
+                        '%d' % int(ent1['stop']),
+                        '0',
+                        '1',
+                        '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), 
+                        '0,']
+
+            print '\t'.join(out_print)  
+
+    
+def __main__():
+    try:
+        query_file = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    Transcriptdb = GFFParser.Parse(query_file)  
+    writeBED(Transcriptdb)
+
+if __name__ == "__main__": 
+    __main__() 
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,90 @@
+<tool id="fml_gff2bed" name="GFF-to-BED" version="2.0.0">
+	<description>converter</description> 
+	<command interpreter="python">gff_to_bed.py $inf_gff &gt; $bed_format 
+	</command> 
+	<inputs>
+  		<param format="gtf,gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Provide genome annotation file in GFF, GTF, GFF3."/>
+    </inputs>
+  	<outputs>
+  		<data format="bed" name="bed_format" label="${tool.name} on ${on_string}: Converted" /> 
+  	</outputs>
+	<tests>
+        <test>
+                <param name="inf_gff" value="Aly_JGI.gff3" />
+                <output name="bed_format" file="Aly_JGI.bed" />
+        </test>
+        <test>
+                <param name="inf_gff" value="MB7_3R.gff3" />
+                <output name="bed_format" file="MB7_3R.bed" />
+        </test>
+        </tests>
+  	<help>
+
+**What it does**
+
+This tool converts gene transcript annotation from GTF or GFF or GFF3 to UCSC wiggle 12 column BED format.
+
+--------
+
+**Example**
+
+- The following data in GFF3::
+
+	##gff-version 3
+	chr1    protein_coding  gene    11874   14409   0       +       .       ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+	chr1    protein_coding  transcript      11874   14409   0       +       .       ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+	chr1    protein_coding  exon    11874   12227   0       +       .       Parent=uc001aaa.3
+	chr1    protein_coding  exon    12613   12721   0       +       .       Parent=uc001aaa.3
+	chr1    protein_coding  exon    13221   14409   0       +       .       Parent=uc001aaa.3
+
+- Will be converted to UCSC Wiggle BED format::
+
+	chr1    11874   14409   uc001aaa.3      0       +       11874   14409   0       3       354,109,1189,   0,739,1347,
+
+--------
+
+**About formats**
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+
+    1. seqid - Must be a chromosome or scaffold or contig.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gbk.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/env python 
+"""
+Convert data from GFF and associated genome sequence in fasta file into GenBank.
+
+Usage: 
+python gff_to_gbk.py in.gff in.fasta out.gbk 
+
+Requirements:
+    BioPython:- http://biopython.org/
+    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C) 
+    2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import sys
+import helper
+import gffparser_bcbio
+
+from Bio import SeqIO
+from Bio.Alphabet import generic_dna
+
+def __main__():
+    """
+    main wrapper
+    """
+
+    try:
+        gff_fname = sys.argv[1]
+        fasta_fname = sys.argv[2]
+        gb_fname = sys.argv[3]
+    except: 
+        print __doc__
+        sys.exit(-1)
+
+    fasta_fh = helper.open_file(fasta_fname) 
+
+    fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna))
+    fasta_fh.close()
+
+    gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec)
+    
+    try:
+        gb_fh = open(gb_fname, "w")
+    except:
+        print 'file not ready for writing %s' % gb_fname
+        sys.exit(-1)
+
+    SeqIO.write(gff_rec, gb_fh, "genbank")
+    gb_fh.close()
+
+if __name__=="__main__":
+    __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gbk.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,98 @@
+<tool id="fml_gff2gbk" name="GFF-to-GBK" version="2.0.0">
+    <description>converter</description>
+  	<command interpreter="python">gff_to_gbk.py $inf_gff $inf_fas $gbk_format
+  	</command>
+  	<inputs>
+		<param format="gff,gff3" name="inf_gff" type="data" label="Convert this query" help="Genome annotation in GFF file format."/>
+		<param format="fa,fasta" name="inf_fas" type="data" label="Genome Sequence" help="Genome sequence in FASTA format."/>
+  	</inputs>
+	<outputs>
+		<data format="genbank" name="gbk_format" label="${tool.name} on ${on_string}: Converted"/>
+  	</outputs>
+    <tests>
+        <test>
+            <param name="inf_gff" value="s_cerevisiae_SCU49845.gff3" />
+            <param name="inf_fas" value="s_cerevisiae_SCU49845.fasta" />
+            <output name="gbk_format" file="s_cerevisiae_SCU49845.gbk" />
+        </test>
+    </tests>
+  	<help>
+
+**What it does**
+
+This tool converts annotations in GFF to GenBank_ format (scroll down for format description).
+
+.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ 
+
+------
+
+**Example**
+
+- The following data in GFF::
+	
+    ##gff-version 3
+    # sequence-region NM_001202705 1 2406
+    NM_001202705    GenBank chromosome      1       2406    .       +       1       ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ;
+    NM_001202705    GenBank gene    1       2406    .       +       1       ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630
+    NM_001202705    GenBank mRNA    192     2126    .       +       1       ID=AT2G29630.t01;Parent=AT2G29630
+    NM_001202705    GenBank CDS     192     2126    .       +       1       ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;rotein_id=NP_001189634.1;
+    NM_001202705    GenBank exon    192     2126    .       +       1       Parent=AT2G29630.t01
+    ##FASTA
+    >NM_001202705
+    AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG
+    TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT
+
+- Will be converted to GenBank format::
+
+    LOCUS       NM_001202705            2406 bp    mRNA    linear   PLN 28-MAY-2011
+    DEFINITION  Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
+                mRNA, complete cds.
+    ACCESSION   NM_001202705
+    VERSION     NM_001202705.1  GI:334184566.........
+    FEATURES             Location/Qualifiers
+         source          1..2406
+                         /organism="Arabidopsis thaliana"
+                         /mol_type="mRNA"
+                         /db_xref="taxon:3702"........
+         gene            1..2406
+                         /gene="THIC"
+                         /locus_tag="AT2G29630"
+                         /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
+    ORIGIN
+        1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
+        61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
+        121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
+        181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
+    //
+
+------
+
+**About formats** 
+
+**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold or contig.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+**GenBank format** Consists of an annotation section and a sequence section. Sample record_  
+
+.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html 
+
+
+--------
+
+**Copyright**
+
+2010-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gtf.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,76 @@
+#!/usr/bin/env python 
+"""
+Program to convert data from GFF to GTF 
+
+Usage: python gff_to_gtf.py in.gff > out.gtf 
+
+Requirement:
+    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+
+def printGTF(tinfo):
+    """
+    writing result file in GTF format
+
+    @args tinfo: parsed object from gff file
+    @type tinfo: numpy array 
+    """
+
+    for ent1 in tinfo:
+        for idx, tid in enumerate(ent1['transcripts']):
+            
+            exons = ent1['exons'][idx]
+            cds_exons = ent1['cds_exons'][idx]
+
+            stop_codon = start_codon = ()
+
+            if ent1['strand'] == '+':
+                if cds_exons.any():
+                    start_codon = (cds_exons[0][0], cds_exons[0][0]+2) 
+                    stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) 
+            elif ent1['strand'] == '-':
+                if cds_exons.any():
+                    start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1])
+                    stop_codon = (cds_exons[0][0], cds_exons[0][0]+2)
+            else:
+                print 'STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0]) 
+                pass 
+                
+            last_cds_cod = 0 
+            for idz, ex_cod in enumerate(exons):
+
+                print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+                if cds_exons.any():
+                    try:
+                        print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+                        last_cds_cod = idz 
+                    except:
+                        pass 
+
+                    if idz == 0:
+                        print '%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+            if stop_codon:
+                print '%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+    
+if __name__ == "__main__": 
+
+    try:
+        gff_fname = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    Transcriptdb = GFFParser.Parse(gff_fname)  
+
+    printGTF(Transcriptdb) 
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gtf.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,88 @@
+<tool id="fml_gff2gtf" name="GFF-to-GTF" version="2.0.0">
+	<description>converter</description> 
+	<command interpreter="python">gff_to_gtf.py $inf_gff3 &gt; $gtf_format
+	</command> 
+	<inputs>
+  		<param format="gff3,gff" name="inf_gff3" type="data" label="Convert this query" help="Provide genome annotation file in GFF or GFF3."/>
+    </inputs>
+  	<outputs>
+  		<data format="gtf" name="gtf_format" label="${tool.name} on ${on_string}: Converted" /> 
+  	</outputs>
+	<tests>
+        <test>
+                <param name="inf_gff3" value="AceView_ncbi_37.gff3" />
+                <output name="gtf_format" file="AceView_gff3_to_gtf.gtf" />
+        </test>
+        <test>
+                <param name="inf_gff3" value="ENSEMBL_mm9.gff3" />
+                <output name="gtf_format" file="ENSEMBL_mm9_gff3_to_gtf.gtf" />
+        </test>
+    </tests>
+  	<help>
+
+**What it does**
+
+This tool converts data from GFF3 to GTF file format (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in GFF3 format::
+
+	##gff-version 3
+	17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
+	17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+	17      protein_coding  protein 7256262 7256960 .       +       .       ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+	17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
+	17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+	17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
+	17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
+
+- Will be converted to GTF format::
+
+	17      protein_coding  exon    7255208 7258258 .       +       .        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+	17      protein_coding  CDS     7256262 7256957 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+	17      protein_coding  start_codon     7256262 7256264 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+	17      protein_coding  stop_codon      7256958 7256960 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+--------
+
+**About formats**
+
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+
+**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields::
+
+    1. seqname - The name of the sequence.
+    2. source - This indicating where the annotation came from.
+    3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+    7. strand - Valid entries include '+', '-', or '.'
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+    9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 gffparser_bcbio.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gffparser_bcbio.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,828 @@
+"""Parse GFF files into features attached to Biopython SeqRecord objects.
+
+This deals with GFF3 formatted files, a tab delimited format for storing
+sequence features and annotations:
+
+http://www.sequenceontology.org/gff3.shtml
+
+It will also deal with older GFF versions (GTF/GFF2):
+
+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml
+http://mblab.wustl.edu/GTF22.html
+
+The implementation utilizes map/reduce parsing of GFF using Disco. Disco
+(http://discoproject.org) is a Map-Reduce framework for Python utilizing
+Erlang for parallelization. The code works on a single processor without
+Disco using the same architecture.
+"""
+import os
+import copy
+import re
+import collections
+import urllib
+import itertools
+
+# Make defaultdict compatible with versions of python older than 2.4
+try:
+    collections.defaultdict
+except AttributeError:
+    import _utils
+    collections.defaultdict = _utils.defaultdict
+
+from Bio.Seq import Seq, UnknownSeq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio import SeqIO
+
+def _gff_line_map(line, params):
+    """Map part of Map-Reduce; parses a line of GFF into a dictionary.
+
+    Given an input line from a GFF file, this:
+    - decides if the file passes our filtering limits
+    - if so:
+        - breaks it into component elements
+        - determines the type of attribute (flat, parent, child or annotation)
+        - generates a dictionary of GFF info which can be serialized as JSON
+    """
+    gff3_kw_pat = re.compile("\w+=")
+    def _split_keyvals(keyval_str):
+        """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+
+        GFF3 has key value pairs like:
+          count=9;gene=amx-2;sequence=SAGE:aacggagccg
+        GFF2 and GTF have:           
+          Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+          name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+        """
+        quals = collections.defaultdict(list)
+        if keyval_str is None:
+            return quals
+        # ensembl GTF has a stray semi-colon at the end
+        if keyval_str[-1] == ';':
+            keyval_str = keyval_str[:-1]
+        # GFF2/GTF has a semi-colon with at least one space after it.
+        # It can have spaces on both sides; wormbase does this.
+        # GFF3 works with no spaces.
+        # Split at the first one we can recognize as working
+        parts = keyval_str.split(" ; ")
+        if len(parts) == 1:
+            parts = keyval_str.split("; ")
+            if len(parts) == 1:
+                parts = keyval_str.split(";")
+        # check if we have GFF3 style key-vals (with =)
+        is_gff2 = True
+        if gff3_kw_pat.match(parts[0]):
+            is_gff2 = False
+            key_vals = [p.split('=') for p in parts]
+        # otherwise, we are separated by a space with a key as the first item
+        else:
+            pieces = []
+            for p in parts:
+                # fix misplaced semi-colons in keys in some GFF2 files
+                if p and p[0] == ';':
+                    p = p[1:]
+                pieces.append(p.strip().split(" "))
+            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+        for item in key_vals:
+            # standard in-spec items are key=value
+            if len(item) == 2:
+                key, val = item
+            # out-of-spec files can have just key values. We set an empty value
+            # which will be changed to true later to standardize.
+            else:
+                assert len(item) == 1, item
+                key = item[0]
+                val = ''
+            # remove quotes in GFF2 files
+            if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+                val = val[1:-1] 
+            if val:
+                quals[key].extend([v for v in val.split(',') if v])
+            # if we don't have a value, make this a key=True/False style
+            # attribute
+            else:
+                quals[key].append('true')
+        for key, vals in quals.items():
+            quals[key] = [urllib.unquote(v) for v in vals]
+        return quals, is_gff2
+
+    def _nest_gff2_features(gff_parts):
+        """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+        exons and coding sequences are mapped to a parent with a transcript_id
+        in GFF2. This is implemented differently at different genome centers
+        and this function attempts to resolve that and map things to the GFF3
+        way of doing them.
+        """
+        # map protein or transcript ids to a parent
+        for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+            try:
+                gff_parts["quals"]["Parent"] = \
+                        gff_parts["quals"][transcript_id]
+                break
+            except KeyError:
+                pass
+        # case for WormBase GFF -- everything labelled as Transcript or CDS
+        for flat_name in ["Transcript", "CDS"]:
+            if gff_parts["quals"].has_key(flat_name):
+                # parent types
+                if gff_parts["type"] in [flat_name]:
+                    if not gff_parts["id"]:
+                        gff_parts["id"] = gff_parts["quals"][flat_name][0]
+                        gff_parts["quals"]["ID"] = [gff_parts["id"]]
+                # children types
+                elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                    gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+                break
+
+        return gff_parts
+
+    strand_map = {'+' : 1, '-' : -1, '?' : None, None: None}
+    line = line.strip()
+    if line[:2] == "##":
+        return [('directive', line[2:])]
+    elif line and line[0] != "#":
+        parts = line.split('\t')
+        should_do = True
+        if params.limit_info:
+            for limit_name, limit_values in params.limit_info.items():
+                cur_id = tuple([parts[i] for i in 
+                    params.filter_info[limit_name]])
+                if cur_id not in limit_values:
+                    should_do = False
+                    break
+        if should_do:
+            assert len(parts) >= 8, line
+            # not python2.4 compatible but easier to understand
+            #gff_parts = [(None if p == '.' else p) for p in parts]
+            gff_parts = []
+            for p in parts:
+                if p == ".":
+                    gff_parts.append(None)
+                else:
+                    gff_parts.append(p)
+            gff_info = dict()
+            # collect all of the base qualifiers for this item
+            if len(parts) > 8:
+                quals, is_gff2 = _split_keyvals(gff_parts[8])
+            else:
+                quals, is_gff2 = dict(), False
+            gff_info["is_gff2"] = is_gff2
+            if gff_parts[1]:
+                quals["source"].append(gff_parts[1])
+            if gff_parts[5]:
+                quals["score"].append(gff_parts[5])
+            if gff_parts[7]:
+                quals["phase"].append(gff_parts[7])
+            gff_info['quals'] = dict(quals)
+            gff_info['rec_id'] = gff_parts[0]
+            # if we are describing a location, then we are a feature
+            if gff_parts[3] and gff_parts[4]:
+                gff_info['location'] = [int(gff_parts[3]) - 1,
+                        int(gff_parts[4])]
+                gff_info['type'] = gff_parts[2]
+                gff_info['id'] = quals.get('ID', [''])[0]
+                gff_info['strand'] = strand_map.get(gff_parts[6], None)
+                if is_gff2:
+                    gff_info = _nest_gff2_features(gff_info)
+                # features that have parents need to link so we can pick up
+                # the relationship
+                if gff_info['quals'].has_key('Parent'):
+                    # check for self referential parent/child relationships
+                    # remove the ID, which is not useful
+                    for p in gff_info['quals']['Parent']:
+                        if p == gff_info['id']:
+                            gff_info['id'] = ''
+                            del gff_info['quals']['ID']
+                            break
+                    final_key = 'child'
+                elif gff_info['id']:
+                    final_key = 'parent'
+                # Handle flat features
+                else:
+                    final_key = 'feature'
+            # otherwise, associate these annotations with the full record
+            else:
+                final_key = 'annotation'
+            if params.jsonify:
+                return [(final_key, simplejson.dumps(gff_info))]
+            else:
+                return [(final_key, gff_info)]
+    return []
+
+def _gff_line_reduce(map_results, out, params):
+    """Reduce part of Map-Reduce; combines results of parsed features.
+    """
+    final_items = dict()
+    for gff_type, final_val in map_results:
+        if params.jsonify and gff_type not in ['directive']:
+            final_val = simplejson.loads(final_val)
+        try:
+            final_items[gff_type].append(final_val)
+        except KeyError:
+            final_items[gff_type] = [final_val]
+    for key, vals in final_items.items():
+        if params.jsonify:
+            vals = simplejson.dumps(vals)
+        out.add(key, vals)
+
+class _MultiIDRemapper:
+    """Provide an ID remapping for cases where a parent has a non-unique ID.
+
+    Real life GFF3 cases have non-unique ID attributes, which we fix here
+    by using the unique sequence region to assign children to the right
+    parent.
+    """
+    def __init__(self, base_id, all_parents):
+        self._base_id = base_id
+        self._parents = all_parents
+
+    def remap_id(self, feature_dict):
+        rstart, rend = feature_dict['location']
+        for index, parent in enumerate(self._parents):
+            pstart, pend = parent['location']
+            if rstart >= pstart and rend <= pend:
+                if index > 0:
+                    return ("%s_%s" % (self._base_id, index + 1))
+                else:
+                    return self._base_id
+        raise ValueError("Did not find remapped ID location: %s, %s, %s" % (
+                self._base_id, [p['location'] for p in self._parents],
+                feature_dict['location']))
+
+class _AbstractMapReduceGFF:
+    """Base class providing general GFF parsing for local and remote classes.
+
+    This class should be subclassed to provide a concrete class to parse
+    GFF under specific conditions. These classes need to implement
+    the _gff_process function, which returns a dictionary of SeqRecord
+    information.
+    """
+    def __init__(self, create_missing=True):
+        """Initialize GFF parser 
+
+        create_missing - If True, create blank records for GFF ids not in
+        the base_dict. If False, an error will be raised.
+        """
+        self._create_missing = create_missing
+        self._map_fn = _gff_line_map
+        self._reduce_fn = _gff_line_reduce
+        self._examiner = GFFExaminer()
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        raise NotImplementedError("Derived class must define")
+
+    def parse(self, gff_files, base_dict=None, limit_info=None):
+        """Parse a GFF file, returning an iterator of SeqRecords.
+
+        limit_info - A dictionary specifying the regions of the GFF file
+        which should be extracted. This allows only relevant portions of a file
+        to be parsed.
+        
+        base_dict - A base dictionary of SeqRecord objects which may be
+        pre-populated with sequences and other features. The new features from
+        the GFF file will be added to this dictionary.
+        """
+        for rec in self.parse_in_parts(gff_files, base_dict, limit_info):
+            yield rec
+
+    def parse_in_parts(self, gff_files, base_dict=None, limit_info=None,
+            target_lines=None):
+        """Parse a region of a GFF file specified, returning info as generated.
+
+        target_lines -- The number of lines in the file which should be used
+        for each partial parse. This should be determined based on available
+        memory.
+        """
+        for results in self.parse_simple(gff_files, limit_info, target_lines):
+            if base_dict is None:
+                cur_dict = dict()
+            else:
+                cur_dict = copy.deepcopy(base_dict)
+            cur_dict = self._results_to_features(cur_dict, results)
+            all_ids = cur_dict.keys()
+            all_ids.sort()
+            for cur_id in all_ids:
+                yield cur_dict[cur_id]
+
+    def parse_simple(self, gff_files, limit_info=None, target_lines=1):
+        """Simple parse which does not build or nest features.
+
+        This returns a simple dictionary representation of each line in the
+        GFF file.
+        """
+        # gracefully handle a single file passed
+        if not isinstance(gff_files, (list, tuple)):
+            gff_files = [gff_files]
+        limit_info = self._normalize_limit_info(limit_info)
+        for results in self._gff_process(gff_files, limit_info, target_lines):
+            yield results
+       
+    def _normalize_limit_info(self, limit_info):
+        """Turn all limit information into tuples for identical comparisons.
+        """
+        final_limit_info = {}
+        if limit_info:
+            for key, values in limit_info.items():
+                final_limit_info[key] = []
+                for v in values:
+                    if isinstance(v, str):
+                        final_limit_info[key].append((v,))
+                    else:
+                        final_limit_info[key].append(tuple(v))
+        return final_limit_info
+    
+    def _results_to_features(self, base, results):
+        """Add parsed dictionaries of results to Biopython SeqFeatures.
+        """
+        base = self._add_annotations(base, results.get('annotation', []))
+        for feature in results.get('feature', []):
+            (_, base) = self._add_toplevel_feature(base, feature)
+        base = self._add_parent_child_features(base, results.get('parent', []),
+                results.get('child', []))
+        base = self._add_seqs(base, results.get('fasta', []))
+        base = self._add_directives(base, results.get('directive', []))
+        return base
+
+    def _add_directives(self, base, directives):
+        """Handle any directives or meta-data in the GFF file.
+
+        Relevant items are added as annotation meta-data to each record.
+        """
+        dir_keyvals = collections.defaultdict(list)
+        for directive in directives:
+            parts = directive.split()
+            if len(parts) > 1:
+                key = parts[0]
+                if len(parts) == 2:
+                    val = parts[1]
+                else:
+                    val = tuple(parts[1:])
+                dir_keyvals[key].append(val)
+        for key, vals in dir_keyvals.items():
+            for rec in base.values():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _add_seqs(self, base, recs):
+        """Add sequence information contained in the GFF3 to records.
+        """
+        for rec in recs:
+            if base.has_key(rec.id):
+                base[rec.id].seq = rec.seq
+            else:
+                base[rec.id] = rec
+        return base
+    
+    def _add_parent_child_features(self, base, parents, children):
+        """Add nested features with parent child relationships.
+        """
+        multi_remap = self._identify_dup_ids(parents)
+        # add children features
+        children_prep = collections.defaultdict(list)
+        for child_dict in children:
+            child_feature = self._get_feature(child_dict)
+            for pindex, pid in enumerate(child_feature.qualifiers['Parent']):
+                if multi_remap.has_key(pid):
+                    pid = multi_remap[pid].remap_id(child_dict)
+                    child_feature.qualifiers['Parent'][pindex] = pid
+                children_prep[pid].append((child_dict['rec_id'],
+                    child_feature))
+        children = dict(children_prep)
+        # add children to parents that exist
+        for cur_parent_dict in parents:
+            cur_id = cur_parent_dict['id']
+            if multi_remap.has_key(cur_id):
+                cur_parent_dict['id'] = multi_remap[cur_id].remap_id(
+                        cur_parent_dict)
+            cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict)
+            cur_parent, children = self._add_children_to_parent(cur_parent,
+                    children)
+        # create parents for children without them (GFF2 or split/bad files)
+        while len(children) > 0:
+            parent_id, cur_children = itertools.islice(children.items(),
+                    1).next()
+            # one child, do not nest it
+            if len(cur_children) == 1:
+                rec_id, child = cur_children[0]
+                loc = (child.location.nofuzzy_start, child.location.nofuzzy_end)
+                rec, base = self._get_rec(base,
+                        dict(rec_id=rec_id, location=loc))
+                rec.features.append(child)
+                del children[parent_id]
+            else:
+                cur_parent, base = self._add_missing_parent(base, parent_id,
+                        cur_children)
+                cur_parent, children = self._add_children_to_parent(cur_parent,
+                        children)
+        return base
+
+    def _identify_dup_ids(self, parents):
+        """Identify duplicated ID attributes in potential nested parents.
+
+        According to the GFF3 spec ID attributes are supposed to be unique
+        for a file, but this is not always true in practice. This looks
+        for duplicates, and provides unique IDs sorted by locations.
+        """
+        multi_ids = collections.defaultdict(list)
+        for parent in parents:
+            multi_ids[parent['id']].append(parent)
+        multi_ids = [(mid, parents) for (mid, parents) in multi_ids.items()
+                if len(parents) > 1]
+        multi_remap = dict()
+        for mid, parents in multi_ids:
+            multi_remap[mid] = _MultiIDRemapper(mid, parents)
+        return multi_remap
+
+    def _add_children_to_parent(self, cur_parent, children):
+        """Recursively add children to parent features.
+        """
+        if children.has_key(cur_parent.id):
+            cur_children = children[cur_parent.id]
+            for rec_id, cur_child in cur_children:
+                cur_child, children = self._add_children_to_parent(cur_child,
+                        children)
+                cur_parent.location_operator = "join"
+                cur_parent.sub_features.append(cur_child)
+            del children[cur_parent.id]
+        return cur_parent, children
+
+    def _add_annotations(self, base, anns):
+        """Add annotation data from the GFF file to records.
+        """
+        # add these as a list of annotations, checking not to overwrite
+        # current values
+        for ann in anns:
+            rec, base = self._get_rec(base, ann)
+            for key, vals in ann['quals'].items():
+                self._add_ann_to_rec(rec, key, vals)
+        return base
+
+    def _add_ann_to_rec(self, rec, key, vals):
+        """Add a key/value annotation to the given SeqRecord.
+        """
+        if rec.annotations.has_key(key):
+            try:
+                rec.annotations[key].extend(vals)
+            except AttributeError:
+                rec.annotations[key] = [rec.annotations[key]] + vals
+        else:
+            rec.annotations[key] = vals
+
+    def _get_rec(self, base, info_dict):
+        """Retrieve a record to add features to.
+        """
+        max_loc = info_dict.get('location', (0, 1))[1]
+        try:
+            cur_rec = base[info_dict['rec_id']]
+            # update generated unknown sequences with the expected maximum length
+            if isinstance(cur_rec.seq, UnknownSeq):
+                cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
+            return cur_rec, base
+        except KeyError:
+            if self._create_missing:
+                new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id'])
+                base[info_dict['rec_id']] = new_rec
+                return new_rec, base
+            else:
+                raise
+
+    def _add_missing_parent(self, base, parent_id, cur_children):
+        """Add a new feature that is missing from the GFF file.
+        """
+        base_rec_id = list(set(c[0] for c in cur_children))
+        assert len(base_rec_id) == 1
+        feature_dict = dict(id=parent_id, strand=None,
+                type="inferred_parent", quals=dict(ID=[parent_id]),
+                rec_id=base_rec_id[0])
+        coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) 
+                for r, c in cur_children]
+        feature_dict["location"] = (min([c[0] for c in coords]),
+                max([c[1] for c in coords]))
+        return self._add_toplevel_feature(base, feature_dict)
+
+    def _add_toplevel_feature(self, base, feature_dict):
+        """Add a toplevel non-nested feature to the appropriate record.
+        """
+        new_feature = self._get_feature(feature_dict)
+        rec, base = self._get_rec(base, feature_dict)
+        rec.features.append(new_feature)
+        return new_feature, base
+
+    def _get_feature(self, feature_dict):
+        """Retrieve a Biopython feature from our dictionary representation.
+        """
+        location = FeatureLocation(*feature_dict['location'])
+        new_feature = SeqFeature(location, feature_dict['type'],
+                id=feature_dict['id'], strand=feature_dict['strand'])
+        new_feature.qualifiers = feature_dict['quals']
+        return new_feature
+
+    def _parse_fasta(self, in_handle):
+        """Parse FASTA sequence information contained in the GFF3 file.
+        """
+        return list(SeqIO.parse(in_handle, "fasta"))
+
+class _GFFParserLocalOut:
+    """Provide a collector for local GFF MapReduce file parsing.
+    """
+    def __init__(self, smart_breaks=False):
+        self._items = dict()
+        self._smart_breaks = smart_breaks
+        self._missing_keys = collections.defaultdict(int)
+        self._last_parent = None
+        self.can_break = True
+        self.num_lines = 0
+
+    def add(self, key, vals):
+        if self._smart_breaks:
+            # if we are not GFF2 we expect parents and break
+            # based on not having missing ones
+            if key == 'directive':
+                if vals[0] == '#':
+                    self.can_break = True
+                self._last_parent = None
+            elif not vals[0].get("is_gff2", False):
+                self._update_missing_parents(key, vals)
+                self.can_break = (len(self._missing_keys) == 0)
+            # break when we are done with stretches of child features
+            elif key != 'child':
+                self.can_break = True
+                self._last_parent = None
+            # break when we have lots of child features in a row
+            # and change between parents
+            else:
+                cur_parent = vals[0]["quals"]["Parent"][0]
+                if (self._last_parent):
+                    self.can_break = (cur_parent != self._last_parent)
+                self._last_parent = cur_parent
+        self.num_lines += 1
+        try:
+            self._items[key].extend(vals)
+        except KeyError:
+            self._items[key] = vals
+
+    def _update_missing_parents(self, key, vals):
+        # smart way of deciding if we can break this.
+        # if this is too much, can go back to not breaking in the
+        # middle of children
+        if key in ["child"]:
+            for val in vals:
+                for p_id in val["quals"]["Parent"]:
+                    self._missing_keys[p_id] += 1
+        for val in vals:
+            try:
+                del self._missing_keys[val["quals"]["ID"][0]]
+            except KeyError:
+                pass
+
+    def has_items(self):
+        return len(self._items) > 0
+
+    def get_results(self):
+        self._last_parent = None
+        return self._items
+
+class GFFParser(_AbstractMapReduceGFF):
+    """Local GFF parser providing standardized parsing of GFF3 and GFF2 files.
+    """
+    def __init__(self, line_adjust_fn=None, create_missing=True):
+        _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+        self._line_adjust_fn = line_adjust_fn
+    
+    def _gff_process(self, gff_files, limit_info, target_lines):
+        """Process GFF addition without any parallelization.
+
+        In addition to limit filtering, this accepts a target_lines attribute
+        which provides a number of lines to parse before returning results.
+        This allows partial parsing of a file to prevent memory issues.
+        """
+        line_gen = self._file_line_generator(gff_files)
+        for out in self._lines_to_out_info(line_gen, limit_info, target_lines):
+            yield out
+
+    def _file_line_generator(self, gff_files):
+        """Generate single lines from a set of GFF files.
+        """
+        for gff_file in gff_files:
+            if hasattr(gff_file, "read"):
+                need_close = False
+                in_handle = gff_file
+            else:
+                need_close = True
+                in_handle = open(gff_file)
+            found_seqs = False
+            while 1:
+                line = in_handle.readline()
+                if not line:
+                    break
+                yield line
+            if need_close:
+                in_handle.close()
+
+    def _lines_to_out_info(self, line_iter, limit_info=None,
+            target_lines=None):
+        """Generate SeqRecord and SeqFeatures from GFF file lines.
+        """
+        params = self._examiner._get_local_params(limit_info)
+        out_info = _GFFParserLocalOut((target_lines is not None and
+                target_lines > 1))
+        found_seqs = False
+        for line in line_iter:
+            results = self._map_fn(line, params)
+            if self._line_adjust_fn and results:
+                if results[0][0] not in ['directive']:
+                    results = [(results[0][0],
+                        self._line_adjust_fn(results[0][1]))]
+            self._reduce_fn(results, out_info, params)
+            if (target_lines and out_info.num_lines >= target_lines and
+                    out_info.can_break):
+                yield out_info.get_results()
+                out_info = _GFFParserLocalOut((target_lines is not None and
+                        target_lines > 1))
+            if (results and results[0][0] == 'directive' and 
+                    results[0][1] == 'FASTA'):
+                found_seqs = True
+                break
+
+        class FakeHandle:
+            def __init__(self, line_iter):
+                self._iter = line_iter
+            def read(self):
+                return "".join(l for l in self._iter)
+            def readline(self):
+                try:
+                    return self._iter.next()
+                except StopIteration:
+                    return ""
+
+        if found_seqs:
+            fasta_recs = self._parse_fasta(FakeHandle(line_iter))
+            out_info.add('fasta', fasta_recs)
+        if out_info.has_items():
+            yield out_info.get_results()
+
+class DiscoGFFParser(_AbstractMapReduceGFF):
+    """GFF Parser with parallelization through Disco (http://discoproject.org.
+    """
+    def __init__(self, disco_host, create_missing=True):
+        """Initialize parser.
+        
+        disco_host - Web reference to a Disco host which will be used for
+        parallelizing the GFF reading job.
+        """
+        _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+        self._disco_host = disco_host
+
+    def _gff_process(self, gff_files, limit_info, target_lines=None):
+        """Process GFF addition, using Disco to parallelize the process.
+        """
+        assert target_lines is None, "Cannot split parallelized jobs"
+        # make these imports local; only need them when using disco
+        import simplejson
+        import disco
+        # absolute path names unless they are special disco files 
+        full_files = []
+        for f in gff_files:
+            if f.split(":")[0] != "disco":
+                full_files.append(os.path.abspath(f))
+            else:
+                full_files.append(f)
+        results = disco.job(self._disco_host, name="gff_reader",
+                input=full_files,
+                params=disco.Params(limit_info=limit_info, jsonify=True,
+                    filter_info=self._examiner._filter_info),
+                required_modules=["simplejson", "collections", "re"],
+                map=self._map_fn, reduce=self._reduce_fn)
+        processed = dict()
+        for out_key, out_val in disco.result_iterator(results):
+            processed[out_key] = simplejson.loads(out_val)
+        yield processed
+
+def parse(gff_files, base_dict=None, limit_info=None, target_lines=None):
+    """High level interface to parse GFF files into SeqRecords and SeqFeatures.
+    """
+    parser = GFFParser()
+    for rec in parser.parse_in_parts(gff_files, base_dict, limit_info,
+            target_lines):
+        yield rec
+
+def _file_or_handle(fn):
+    """Decorator to handle either an input handle or a file.
+    """
+    def _file_or_handle_inside(*args, **kwargs):
+        in_file = args[1]
+        if hasattr(in_file, "read"):
+            need_close = False
+            in_handle = in_file
+        else:
+            need_close = True
+            in_handle = open(in_file)
+        args = (args[0], in_handle) + args[2:]
+        out = fn(*args, **kwargs)
+        if need_close:
+            in_handle.close()
+        return out
+    return _file_or_handle_inside
+
+class GFFExaminer:
+    """Provide high level details about a GFF file to refine parsing.
+
+    GFF is a spec and is provided by many different centers. Real life files
+    will present the same information in slightly different ways. Becoming
+    familiar with the file you are dealing with is the best way to extract the
+    information you need. This class provides high level summary details to
+    help in learning.
+    """
+    def __init__(self):
+        self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
+                gff_source = [1], gff_type = [2])
+    
+    def _get_local_params(self, limit_info=None):
+        class _LocalParams:
+            def __init__(self):
+                self.jsonify = False
+        params = _LocalParams()
+        params.limit_info = limit_info
+        params.filter_info = self._filter_info
+        return params
+    
+    @_file_or_handle
+    def available_limits(self, gff_handle):
+        """Return dictionary information on possible limits for this file.
+
+        This returns a nested dictionary with the following structure:
+        
+        keys -- names of items to filter by
+        values -- dictionary with:
+            keys -- filter choice
+            value -- counts of that filter in this file
+
+        Not a parallelized map-reduce implementation.
+        """
+        cur_limits = dict()
+        for filter_key in self._filter_info.keys():
+            cur_limits[filter_key] = collections.defaultdict(int)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            # ignore empty and comment lines
+            if line.strip() and line.strip()[0] != "#":
+                parts = [p.strip() for p in line.split('\t')]
+                assert len(parts) == 9, line
+                for filter_key, cur_indexes in self._filter_info.items():
+                    cur_id = tuple([parts[i] for i in cur_indexes])
+                    cur_limits[filter_key][cur_id] += 1
+        # get rid of the default dicts
+        final_dict = dict()
+        for key, value_dict in cur_limits.items():
+            if len(key) == 1:
+                key = key[0]
+            final_dict[key] = dict(value_dict)
+        gff_handle.close()
+        return final_dict
+
+    @_file_or_handle
+    def parent_child_map(self, gff_handle):
+        """Provide a mapping of parent to child relationships in the file.
+
+        Returns a dictionary of parent child relationships:
+
+        keys -- tuple of (source, type) for each parent
+        values -- tuple of (source, type) as children of that parent
+        
+        Not a parallelized map-reduce implementation.
+        """
+        # collect all of the parent and child types mapped to IDs
+        parent_sts = dict()
+        child_sts = collections.defaultdict(list)
+        for line in gff_handle:
+            # when we hit FASTA sequences, we are done with annotations
+            if line.startswith("##FASTA"):
+                break
+            if line.strip():
+                line_type, line_info = _gff_line_map(line,
+                        self._get_local_params())[0]
+                if (line_type == 'parent' or (line_type == 'child' and
+                        line_info['id'])):
+                    parent_sts[line_info['id']] = (
+                            line_info['quals']['source'][0], line_info['type'])
+                if line_type == 'child':
+                    for parent_id in line_info['quals']['Parent']:
+                        child_sts[parent_id].append((
+                            line_info['quals']['source'][0], line_info['type']))
+        #print parent_sts, child_sts
+        # generate a dictionary of the unique final type relationships
+        pc_map = collections.defaultdict(list)
+        for parent_id, parent_type in parent_sts.items():
+            for child_type in child_sts[parent_id]:
+                pc_map[parent_type].append(child_type)
+        pc_final_map = dict()
+        for ptype, ctypes in pc_map.items():
+            unique_ctypes = list(set(ctypes))
+            unique_ctypes.sort()
+            pc_final_map[ptype] = unique_ctypes
+        return pc_final_map
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gtf_to_gff.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""
+Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].
+
+Usage: python gtf_to_gff.py in.gtf > out.gff3  
+    
+Requirement:
+    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
+    helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+    
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+import helper
+
+def GFFWriter(gtf_content):
+    """
+    write the feature information to GFF format
+
+    @args gtf_content: Parsed object from gtf file 
+    @type gtf_content: numpy array
+    """
+
+    print '##gff-version 3'
+
+    for ent1 in gtf_content:
+
+        chr_name = ent1['chr']
+        strand = ent1['strand']
+        start = ent1['start']
+        stop = ent1['stop']
+        source = ent1['source']
+        ID = ent1['name']
+        Name = ent1['gene_info']['Name']
+
+        Name = ID if not Name else Name 
+
+        print '%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name) 
+
+        for idx, tid in enumerate(ent1['transcripts']):
+            print idx 
+            print tid 
+
+            t_start = ent1['exons'][idx][0][0]
+            t_stop = ent1['exons'][idx][-1][-1]
+            t_type = ent1['transcript_type'][idx]
+
+            utr5_exons, utr3_exons = [], [] 
+            if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any():
+                utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand)
+
+            print '%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID) 
+
+            for ex_cod in utr5_exons:
+                print '%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
+
+            for ex_cod in ent1['cds_exons'][idx]:
+                print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0]) 
+
+            for ex_cod in utr3_exons:
+                print '%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
+
+            for ex_cod in ent1['exons'][idx]:
+                print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) 
+            
+
+def __main__():
+
+    try:
+        gtf_fname = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    gtf_file_content = GFFParser.Parse(gtf_fname)  
+
+    GFFWriter(gtf_file_content)
+
+if __name__ == "__main__": 
+    __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gtf_to_gff.xml	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,94 @@
+<tool id="fml_gtf2gff" name="GTF-to-GFF" version="2.0.0">
+	<description>converter</description> 
+	<command interpreter="python">gtf_to_gff.py $inf_gtf &gt; $gff3_format 
+	</command> 
+	<inputs>
+  		<param format="gtf" name="inf_gtf" type="data" label="Convert this query" help="Provide genome annotation file in GTF."/>
+        </inputs>
+  	<outputs>
+  		<data format="gff3" name="gff3_format" label="${tool.name} on ${on_string}: Converted" /> 
+  	</outputs>
+	    <tests>
+        	<test>
+                <param name="inf_gtf" value="UCSC_transcripts.gtf" />
+                <output name="gff3_format" file="UCSC_transcripts.gff3" />
+        	</test>
+        	<test>
+                <param name="inf_gtf" value="JGI_genes.gtf" />
+                <output name="gff3_format" file="JGI_genes.gff3" />
+        	</test>
+        	<test>
+                <param name="inf_gtf" value="ENSEMBL_mm9.gtf" />
+                <output name="gff3_format" file="ENSEMBL_mm9.gff3" />
+        	</test>
+        	<test>
+                <param name="inf_gtf" value="AceView_ncbi_37.gtf" />
+                <output name="gff3_format" file="AceView_ncbi_37.gff3" />
+        	</test>
+        </tests>
+  	<help>
+
+**What it does**
+
+This tool converts data from GTF to a valid GFF3 file (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in GTF format::
+
+	17      protein_coding  exon    7255208 7258258 .       +       .        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+	17      protein_coding  CDS     7256262 7256957 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+	17      protein_coding  start_codon     7256262 7256264 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+	17      protein_coding  stop_codon      7256958 7256960 .       +       0        gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+- Will be converted to GFF3 format::
+
+	##gff-version 3
+	17      protein_coding  gene    7255208 7258258 .       +       .       ID=ENSG00000213859;Name=KCTD11
+	17      protein_coding  mRNA    7255208 7258258 .       +       .       ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+	17      protein_coding  protein 7256262 7256960 .       +       .       ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+	17      protein_coding  five_prime_UTR  7255208 7256261 .       +       .       Parent=ENST00000333751
+	17      protein_coding  CDS     7256262 7256960 .       +       0       Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+	17      protein_coding  three_prime_UTR 7256961 7258258 .       +       .       Parent=ENST00000333751
+	17      protein_coding  exon    7255208 7258258 .       +       .       Parent=ENST00000333751
+
+--------
+
+**About formats**
+
+**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields::
+
+    1. seqname - The name of the sequence.
+    2. source - This indicating where the annotation came from.
+    3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+    7. strand - Valid entries include '+', '-', or '.'
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+    9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+    1. seqid - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". 
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. stop - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen &amp; Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+	</help>
+</tool>
diff -r 619e0fcd9126 -r 6e589f267c14 helper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/helper.py	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,332 @@
+#!/usr/bin/env python
+"""
+Common utility functions
+"""
+
+import os 
+import re
+import sys 
+import gzip 
+import bz2
+import numpy 
+
+def init_gene():
+    """
+    Initializing the gene structure 
+    """
+
+    gene_det = [('id', 'f8'), 
+            ('anno_id', numpy.dtype), 
+            ('confgenes_id', numpy.dtype),
+            ('name', 'S25'),
+            ('source', 'S25'),
+            ('gene_info', numpy.dtype),
+            ('alias', 'S15'),
+            ('name2', numpy.dtype),
+            ('strand', 'S2'), 
+            ('score', 'S15'), 
+            ('chr', 'S15'), 
+            ('chr_num', numpy.dtype),
+            ('paralogs', numpy.dtype),
+            ('start', 'f8'),
+            ('stop', 'f8'), 
+            ('transcripts', numpy.dtype),
+            ('transcript_type', numpy.dtype),
+            ('transcript_info', numpy.dtype),
+            ('transcript_status', numpy.dtype),
+            ('transcript_valid', numpy.dtype),
+            ('exons', numpy.dtype),
+            ('exons_confirmed', numpy.dtype),
+            ('cds_exons', numpy.dtype),
+            ('utr5_exons', numpy.dtype),
+            ('utr3_exons', numpy.dtype),
+            ('tis', numpy.dtype),
+            ('tis_conf', numpy.dtype),
+            ('tis_info', numpy.dtype),
+            ('cdsStop', numpy.dtype),
+            ('cdsStop_conf', numpy.dtype),
+            ('cdsStop_info', numpy.dtype),
+            ('tss', numpy.dtype),
+            ('tss_info', numpy.dtype),
+            ('tss_conf', numpy.dtype),
+            ('cleave', numpy.dtype),
+            ('cleave_info', numpy.dtype),
+            ('cleave_conf', numpy.dtype),
+            ('polya', numpy.dtype),
+            ('polya_info', numpy.dtype),
+            ('polya_conf', numpy.dtype),
+            ('is_alt', 'f8'), 
+            ('is_alt_spliced', 'f8'), 
+            ('is_valid',  numpy.dtype),
+            ('transcript_complete', numpy.dtype),
+            ('is_complete', numpy.dtype),
+            ('is_correctly_gff3_referenced', 'S5'),
+            ('splicegraph', numpy.dtype) ]
+
+    return gene_det
+
+def open_file(fname):
+    """
+    Open the file (supports .gz .bz2) and returns the handler
+
+    @args fname: input file name for reading 
+    @type fname: str
+    """
+
+    try:
+        if os.path.splitext(fname)[1] == ".gz":
+            FH = gzip.open(fname, 'rb')
+        elif os.path.splitext(fname)[1] == ".bz2":
+            FH = bz2.BZ2File(fname, 'rb')
+        else:
+            FH = open(fname, 'rU')
+    except Exception as error:
+        sys.exit(error)
+
+    return FH
+
+def add_CDS_phase(strand, cds):
+    """
+    Calculate CDS phase and add to the CDS exons
+
+    @args strand: feature strand information 
+    @type strand: +/- 
+    @args cds: coding exon coordinates 
+    @type cds: numpy array [[int, int, int]]
+    """
+
+    cds_region, cds_flag = [], 0 
+    if strand == '+':
+        for cdspos in cds:
+            if cds_flag == 0:
+                cdspos = (cdspos[0], cdspos[1], 0)
+                diff = (cdspos[1]-(cdspos[0]-1))%3
+            else:
+                xy = 0
+                if diff == 0: 
+                    cdspos = (cdspos[0], cdspos[1], 0)
+                elif diff == 1: 
+                    cdspos = (cdspos[0], cdspos[1], 2)
+                    xy = 2
+                elif diff == 2: 
+                    cdspos = (cdspos[0], cdspos[1], 1)
+                    xy = 1
+                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
+            cds_region.append(cdspos)
+            cds_flag = 1 
+    elif strand == '-':
+        cds.reverse()
+        for cdspos in cds: 
+            if cds_flag == 0:
+                cdspos = (cdspos[0], cdspos[1], 0)
+                diff = (cdspos[1]-(cdspos[0]-1))%3
+            else:  
+                xy = 0 
+                if diff == 0: 
+                    cdspos = (cdspos[0], cdspos[1], 0)
+                elif diff == 1:
+                    cdspos = (cdspos[0], cdspos[1], 2)
+                    xy = 2
+                elif diff == 2: 
+                    cdspos = (cdspos[0], cdspos[1], 1)
+                    xy = 1
+                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
+            cds_region.append(cdspos)
+            cds_flag = 1
+        cds_region.reverse()
+    return cds_region
+
+def buildUTR(cc, ec, strand):
+    """
+    Build UTR regions from a given set of CDS and exon coordiantes of a gene
+
+    @args cc: coding exon coordinates 
+    @type cc: numpy array [[int, int, int]]
+    @args ec: exon coordinates 
+    @type ec: numpy array [[int, int]]
+    @args strand: feature strand information 
+    @type strand: +/- 
+    """
+
+    utr5 = []
+    utr3 = []
+    if strand == '+':
+        cds_s = cc[0][0]
+        for ex in ec:
+            if ex[0] <= cds_s and cds_s <= ex[1]:
+                if ex[0] != cds_s:utr5.append((ex[0], cds_s-1))
+                break
+            else:
+                utr5.append(ex)
+        cds_e = cc[-1][1]
+        for i in range(len(ec)):
+            i += 1
+            if ec[-i][0] <= cds_e and cds_e <= ec[-i][1]:
+                if ec[-i][1] != cds_e:utr3.append((cds_e +1, ec[-i][1]))
+                break
+            else:
+                utr3.append(ec[-i]) 
+        utr3.reverse()
+    elif strand == '-':
+        cds_s = cc[-1][1]
+        for i in range(len(ec)):
+            i += 1
+            if ec[-i][0] <= cds_s and cds_s <= ec[-i][1]:
+                if ec[-i][1] != cds_s:utr5.append((cds_s+1, ec[-i][1]))
+                break
+            else:
+                utr5.append(ec[-i])
+        utr5.reverse()
+        cds_e = cc[0][0] 
+        for ex in ec:
+            if ex[0] <= cds_e and cds_e <= ex[1]:
+                if ex[0] != cds_e:utr3.append((ex[0], cds_e-1))
+                break
+            else:
+                utr3.append(ex)
+    return utr5, utr3
+
+def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr):
+    """
+    Create exon cordinates from UTR's and CDS region
+
+    @args strand_p: feature strand information 
+    @type strand_p: +/- 
+    @args five_p_utr: five prime utr exon coordinates 
+    @type five_p_utr: numpy array [[int, int]]
+    @args cds_cod: coding exon coordinates 
+    @type cds_cod: numpy array [[int, int, int]]
+    @args three_p_utr: three prime utr exon coordinates 
+    @type three_p_utr: numpy array [[int, int]]
+    """
+
+    exon_pos = []
+    if strand_p == '+':        
+        utr5_start, utr5_end = 0, 0
+        if five_p_utr != []:
+            utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] 
+        cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
+        jun_exon = []
+        if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
+            jun_exon = [utr5_start, cds_5end]    
+        if len(cds_cod) == 1:
+            five_prime_flag = 0
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                five_prime_flag = 1
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []: 
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr3_end]
+            three_prime_flag = 0
+            if jun_exon != []: 
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+                three_prime_flag = 1
+            if five_prime_flag == 1 and three_prime_flag == 1:
+                exon_pos.append([utr5_start, utr3_end])
+            if five_prime_flag == 1 and three_prime_flag == 0:
+                exon_pos.append([utr5_start, cds_5end])
+                cds_cod = cds_cod[:-1]
+            if five_prime_flag == 0 and three_prime_flag == 1:
+                exon_pos.append([cds_5start, utr3_end])
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+        else:    
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []:
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            cds_3start = cds_cod[-1][0]
+            cds_3end = cds_cod[-1][1]
+            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:       
+                jun_exon = [cds_3start, utr3_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+    elif strand_p == '-':
+        utr3_start, utr3_end = 0, 0        
+        if three_p_utr != []:
+            utr3_start = three_p_utr[-1][0]
+            utr3_end = three_p_utr[-1][1]
+        cds_3start = cds_cod[0][0]
+        cds_3end = cds_cod[0][1]
+        jun_exon = []
+        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
+            jun_exon = [utr3_start, cds_3end]  
+        if len(cds_cod) == 1:    
+            three_prime_flag = 0
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                three_prime_flag = 1
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]
+            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
+                jun_exon = [cds_3start, utr5_end]
+            five_prime_flag = 0
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+                five_prime_flag = 1
+            if three_prime_flag == 1 and five_prime_flag == 1:
+                exon_pos.append([utr3_start, utr5_end])
+            if three_prime_flag == 1 and five_prime_flag == 0:
+                exon_pos.append([utr3_start, cds_3end])
+                cds_cod = cds_cod[:-1]
+            if three_prime_flag == 0 and five_prime_flag == 1:
+                exon_pos.append([cds_3start, utr5_end])        
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+        else:
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)   
+            if jun_exon != []:
+                exon_pos.append(jun_exon)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]    
+            cds_5start = cds_cod[-1][0]
+            cds_5end = cds_cod[-1][1]
+            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr5_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            if jun_exon != []:
+                exon_pos.append(jun_exon)    
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+    return exon_pos
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gbk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/s_cerevisiae_SCU49845.gbk	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,165 @@
+LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
+DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
+            (AXL2) and Rev7p (REV7) genes, complete cds.
+ACCESSION   U49845
+VERSION     U49845.1  GI:1293613
+KEYWORDS    .
+SOURCE      Saccharomyces cerevisiae (baker's yeast)
+  ORGANISM  Saccharomyces cerevisiae
+            Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
+            Saccharomycetales; Saccharomycetaceae; Saccharomyces.
+REFERENCE   1  (bases 1 to 5028)
+  AUTHORS   Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
+  TITLE     Cloning and sequence of REV7, a gene whose function is required for
+            DNA damage-induced mutagenesis in Saccharomyces cerevisiae
+  JOURNAL   Yeast 10 (11), 1503-1509 (1994)
+  PUBMED    7871890
+REFERENCE   2  (bases 1 to 5028)
+  AUTHORS   Roemer,T., Madden,K., Chang,J. and Snyder,M.
+  TITLE     Selection of axial growth sites in yeast requires Axl2p, a novel
+            plasma membrane glycoprotein
+  JOURNAL   Genes Dev. 10 (7), 777-793 (1996)
+  PUBMED    8846915
+REFERENCE   3  (bases 1 to 5028)
+  AUTHORS   Roemer,T.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
+            Haven, CT, USA
+FEATURES             Location/Qualifiers
+     source          1..5028
+                     /organism="Saccharomyces cerevisiae"
+                     /db_xref="taxon:4932"
+                     /chromosome="IX"
+                     /map="9"
+     CDS             <1..206
+                     /codon_start=3
+                     /product="TCP1-beta"
+                     /protein_id="AAA98665.1"
+                     /db_xref="GI:1293614"
+                     /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
+                     AEVLLRVDNIIRARPRTANRQHM"
+     gene            687..3158
+                     /gene="AXL2"
+     CDS             687..3158
+                     /gene="AXL2"
+                     /note="plasma membrane glycoprotein"
+                     /codon_start=1
+                     /function="required for axial budding pattern of S.
+                     cerevisiae"
+                     /product="Axl2p"
+                     /protein_id="AAA98666.1"
+                     /db_xref="GI:1293615"
+                     /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
+                     TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
+                     VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
+                     VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
+                     TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
+                     YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
+                     DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
+                     DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
+                     NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
+                     CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
+                     NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
+                     SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
+                     YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
+                     HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
+                     VDFSNKSNVNVGQVKDIHGRIPEML"
+     gene            complement(3300..4037)
+                     /gene="REV7"
+     CDS             complement(3300..4037)
+                     /gene="REV7"
+                     /codon_start=1
+                     /product="Rev7p"
+                     /protein_id="AAA98667.1"
+                     /db_xref="GI:1293616"
+                     /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
+                     FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
+                     KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
+                     RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
+                     LISGDDKILNGVYSQYEEGESIFGSLF"
+ORIGIN
+        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
+       61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
+      121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
+      181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
+      241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
+      301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
+      361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
+      421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
+      481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
+      541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
+      601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
+      661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
+      721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
+      781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
+      841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
+      901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
+      961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
+     1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
+     1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
+     1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
+     1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
+     1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
+     1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
+     1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
+     1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
+     1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
+     1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
+     1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
+     1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
+     1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
+     1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
+     1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
+     1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
+     1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
+     2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
+     2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
+     2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
+     2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
+     2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
+     2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
+     2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
+     2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
+     2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
+     2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
+     2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
+     2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
+     2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
+     2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
+     2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
+     2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
+     3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
+     3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
+     3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
+     3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
+     3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
+     3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
+     3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
+     3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
+     3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
+     3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
+     3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
+     3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
+     3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
+     3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
+     3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
+     3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
+     3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
+     4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
+     4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
+     4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
+     4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
+     4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
+     4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
+     4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
+     4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
+     4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
+     4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
+     4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
+     4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
+     4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
+     4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
+     4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
+     4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
+     4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
+//
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gff3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/s_cerevisiae_SCU49845.gff3	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,8 @@
+IX	gbk_to_gff	gene	687	3158	.	+	.	ID=AXL2;Name=AXL2
+IX	gbk_to_gff	.	687	3158	.	+	.	ID=Transcript:AXL2;Parent=AXL2
+IX	gbk_to_gff	CDS	687	3158	.	+	.	Parent=Transcript:AXL2
+IX	gbk_to_gff	exon	687	3158	.	+	.	Parent=Transcript:AXL2
+IX	gbk_to_gff	gene	3300	4037	.	-	.	ID=REV7;Name=REV7
+IX	gbk_to_gff	.	3300	4037	.	-	.	ID=Transcript:REV7;Parent=REV7
+IX	gbk_to_gff	CDS	3300	4037	.	-	.	Parent=Transcript:REV7
+IX	gbk_to_gff	exon	3300	4037	.	-	.	Parent=Transcript:REV7
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/single_parent_feature_record.gff3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/single_parent_feature_record.gff3	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,10 @@
+chr1	.	miRNA_primary_transcript	1380242	1380467	.	-	.	ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754
+chr1	.	miRNA	1380249	1380270	.	-	.	ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047
+chr1	.	miRNA_primary_transcript	2410094	2410318	.	+	.	ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367
+chr1	.	miRNA	2410242	2410263	.	+	.	ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507
+chr1	.	miRNA_primary_transcript	4792375	4792487	.	-	.	ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h
+chr1	.	miRNA	4792388	4792408	.	-	.	ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714
+chr1	.	miRNA_primary_transcript	4797903	4798018	.	-	.	ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i
+chr1	.	miRNA	4797916	4797936	.	-	.	ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715
+chr1	.	miRNA_primary_transcript	4810817	4810942	.	-	.	ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j
+chr1	.	miRNA	4810830	4810850	.	-	.	ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716
diff -r 619e0fcd9126 -r 6e589f267c14 tool_conf.xml.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml.sample	Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,9 @@
+<section name="GFFtools" id="gfftools.web">
+    <tool file="GFFtools-GX/gff_to_bed.xml"/>
+    <tool file="GFFtools-GX/bed_to_gff.xml"/>
+    <tool file="GFFtools-GX/gbk_to_gff.xml"/>
+    <tool file="GFFtools-GX/gff_to_gbk.xml"/>
+    <tool file="GFFtools-GX/gff_to_gtf.xml"/>
+    <tool file="GFFtools-GX/gtf_to_gff.xml"/>
+    <tool file="GFFtools-GX/gff_fmap.xml"/>
+</section>