# HG changeset patch
# User devteam
# Date 1415121319 18000
# Node ID 6e589f267c1421ecb5b6760c947009ed159bb280
# Parent 619e0fcd9126df9ca8f7c1b1a8e3abc097abfee9
Uploaded
diff -r 619e0fcd9126 -r 6e589f267c14 GFFParser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GFFParser.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,495 @@
+#!/usr/bin/env python
+"""
+Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
+
+Requirements:
+ Numpy :- http://numpy.org/
+ Scipy :- http://scipy.org/
+
+Copyright (C)
+
+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA.
+"""
+
+import re
+import os
+import sys
+import urllib
+import numpy as np
+import scipy.io as sio
+from collections import defaultdict
+import helper as utils
+
+def attribute_tags(col9):
+ """
+ Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file
+
+ @args col9: attribute column from GFF file
+ @type col9: str
+ """
+ info = defaultdict(list)
+ is_gff = False
+
+ if not col9:
+ return is_gff, info
+
+ # trim the line ending semi-colon ucsc may have some white-space
+ col9 = col9.rstrip(';| ')
+ # attributes from 9th column
+ atbs = col9.split(" ; ")
+ if len(atbs) == 1:
+ atbs = col9.split("; ")
+ if len(atbs) == 1:
+ atbs = col9.split(";")
+ # check the GFF3 pattern which has key value pairs like:
+ gff3_pat = re.compile("\w+=")
+ # sometime GTF have: gene_id uc002zkg.1;
+ gtf_pat = re.compile("\s?\w+\s")
+
+ key_vals = []
+
+ if gff3_pat.match(atbs[0]): # gff3 pattern
+ is_gff = True
+ key_vals = [at.split('=') for at in atbs]
+ elif gtf_pat.match(atbs[0]): # gtf pattern
+ for at in atbs:
+ key_vals.append(at.strip().split(" ",1))
+ else:
+ # to handle attribute column has only single value
+ key_vals.append(['ID', atbs[0]])
+ # get key, val items
+ for item in key_vals:
+ key, val = item
+ # replace the double qoutes from feature identifier
+ val = re.sub('"', '', val)
+ # replace the web formating place holders to plain text format
+ info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
+
+ return is_gff, info
+
+def spec_features_keywd(gff_parts):
+ """
+ Specify the feature key word according to the GFF specifications
+
+ @args gff_parts: attribute field key
+ @type gff_parts: str
+ """
+ for t_id in ["transcript_id", "transcriptId", "proteinId"]:
+ try:
+ gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
+ break
+ except KeyError:
+ pass
+ for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
+ try:
+ gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
+ break
+ except KeyError:
+ pass
+ ## TODO key words
+ for flat_name in ["Transcript", "CDS"]:
+ if gff_parts["info"].has_key(flat_name):
+ # parents
+ if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
+ if not gff_parts['id']:
+ gff_parts['id'] = gff_parts['info'][flat_name][0]
+ #gff_parts["info"]["ID"] = [gff_parts["id"]]
+ # children
+ elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+ "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+ "start_codon"]:
+ gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
+ break
+ return gff_parts
+
+def Parse(ga_file):
+ """
+ Parsing GFF/GTF file based on feature relationship, it takes the input file.
+
+ @args ga_file: input file name
+ @type ga_file: str
+ """
+ child_map = defaultdict(list)
+ parent_map = dict()
+
+ ga_handle = utils.open_file(ga_file)
+
+ for rec in ga_handle:
+ rec = rec.strip('\n\r')
+
+ # skip empty line fasta identifier and commented line
+ if not rec or rec[0] in ['#', '>']:
+ continue
+ # skip the genome sequence
+ if not re.search('\t', rec):
+ continue
+
+ parts = rec.split('\t')
+ assert len(parts) >= 8, rec
+
+ # process the attribute column (9th column)
+ ftype, tags = attribute_tags(parts[-1])
+ if not tags: # skip the line if no attribute column.
+ continue
+
+ # extract fields
+ if parts[1]:
+ tags["source"] = parts[1]
+ if parts[7]:
+ tags["phase"] = parts[7]
+
+ gff_info = dict()
+ gff_info['info'] = dict(tags)
+ gff_info["is_gff3"] = ftype
+ gff_info['chr'] = parts[0]
+ gff_info['score'] = parts[5]
+
+ if parts[3] and parts[4]:
+ gff_info['location'] = [int(parts[3]) ,
+ int(parts[4])]
+ gff_info['type'] = parts[2]
+ gff_info['id'] = tags.get('ID', [''])[0]
+ if parts[6] in ['?', '.']:
+ parts[6] = None
+ gff_info['strand'] = parts[6]
+
+ # key word according to the GFF spec.
+ # is_gff3 flag is false check this condition and get the attribute fields
+ if not ftype:
+ gff_info = spec_features_keywd(gff_info)
+
+ # link the feature relationships
+ if gff_info['info'].has_key('Parent'):
+ for p in gff_info['info']['Parent']:
+ if p == gff_info['id']:
+ gff_info['id'] = ''
+ break
+ rec_category = 'child'
+ elif gff_info['id']:
+ rec_category = 'parent'
+ else:
+ rec_category = 'record'
+
+ # depends on the record category organize the features
+ if rec_category == 'child':
+ for p in gff_info['info']['Parent']:
+ # create the data structure based on source and feature id
+ child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
+ dict( type = gff_info['type'],
+ location = gff_info['location'],
+ strand = gff_info['strand'],
+ score = gff_info['score'],
+ ID = gff_info['id'],
+ gene_id = gff_info['info'].get('GParent', '')
+ ))
+ elif rec_category == 'parent':
+ parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
+ type = gff_info['type'],
+ location = gff_info['location'],
+ strand = gff_info['strand'],
+ score = gff_info['score'],
+ name = tags.get('Name', [''])[0])
+ elif rec_category == 'record':
+ #TODO how to handle plain records?
+ c = 1
+ ga_handle.close()
+
+ # depends on file type create parent feature
+ if not ftype:
+ parent_map, child_map = create_missing_feature_type(parent_map, child_map)
+
+ # connecting parent child relations
+ # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
+ gene_mat = format_gene_models(parent_map, child_map)
+
+ return gene_mat
+
+def format_gene_models(parent_nf_map, child_nf_map):
+ """
+ Genarate GeneObject based on the parsed file contents
+
+ @args parent_nf_map: parent features with source and chromosome information
+ @type parent_nf_map: collections defaultdict
+ @args child_nf_map: transctipt and exon information are encoded
+ @type child_nf_map: collections defaultdict
+ """
+
+ g_cnt = 0
+ gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene())
+
+ for pkey, pdet in parent_nf_map.items():
+ # considering only gene features
+ #if not re.search(r'gene', pdet.get('type', '')):
+ # continue
+
+ # infer the gene start and stop if not there in the
+ if not pdet.get('location', []):
+ GNS, GNE = [], []
+ # multiple number of transcripts
+ for L1 in child_nf_map[pkey]:
+ GNS.append(L1.get('location', [])[0])
+ GNE.append(L1.get('location', [])[1])
+ GNS.sort()
+ GNE.sort()
+ pdet['location'] = [GNS[0], GNE[-1]]
+
+ orient = pdet.get('strand', '')
+ gene[g_cnt]['id'] = g_cnt +1
+ gene[g_cnt]['chr'] = pkey[0]
+ gene[g_cnt]['source'] = pkey[1]
+ gene[g_cnt]['name'] = pkey[-1]
+ gene[g_cnt]['start'] = pdet.get('location', [])[0]
+ gene[g_cnt]['stop'] = pdet.get('location', [])[1]
+ gene[g_cnt]['strand'] = orient
+ gene[g_cnt]['score'] = pdet.get('score','')
+
+ # default value
+ gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
+ if len(child_nf_map[pkey]) > 1:
+ gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1
+
+ # complete sub-feature for all transcripts
+ dim = len(child_nf_map[pkey])
+ TRS = np.zeros((dim,), dtype=np.object)
+ TR_TYP = np.zeros((dim,), dtype=np.object)
+ EXON = np.zeros((dim,), dtype=np.object)
+ UTR5 = np.zeros((dim,), dtype=np.object)
+ UTR3 = np.zeros((dim,), dtype=np.object)
+ CDS = np.zeros((dim,), dtype=np.object)
+ TISc = np.zeros((dim,), dtype=np.object)
+ TSSc = np.zeros((dim,), dtype=np.object)
+ CLV = np.zeros((dim,), dtype=np.object)
+ CSTOP = np.zeros((dim,), dtype=np.object)
+ TSTAT = np.zeros((dim,), dtype=np.object)
+
+ # fetching corresponding transcripts
+ for xq, Lv1 in enumerate(child_nf_map[pkey]):
+
+ TID = Lv1.get('ID', '')
+ TRS[xq]= np.array([TID])
+
+ TYPE = Lv1.get('type', '')
+ TR_TYP[xq] = np.array('')
+ TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]
+
+ orient = Lv1.get('strand', '')
+
+ # fetching different sub-features
+ child_feat = defaultdict(list)
+ for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
+ E_TYP = Lv2.get('type', '')
+ child_feat[E_TYP].append(Lv2.get('location'))
+
+ # make general ascending order of coordinates
+ if orient == '-':
+ for etype, excod in child_feat.items():
+ if len(excod) > 1:
+ if excod[0][0] > excod[-1][0]:
+ excod.reverse()
+ child_feat[etype] = excod
+
+ # make exon coordinate from cds and utr regions
+ if not child_feat.get('exon'):
+ if child_feat.get('CDS'):
+ exon_cod = utils.make_Exon_cod( orient,
+ NonetoemptyList(child_feat.get('five_prime_UTR')),
+ NonetoemptyList(child_feat.get('CDS')),
+ NonetoemptyList(child_feat.get('three_prime_UTR')))
+ child_feat['exon'] = exon_cod
+ else:
+ # TODO only UTR's
+ # searching through keys to find a pattern describing exon feature
+ ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
+ if ex_key_pattern:
+ child_feat['exon'] = child_feat[ex_key_pattern[0]]
+
+ # stop_codon are seperated from CDS, add the coordinates based on strand
+ if child_feat.get('stop_codon'):
+ if orient == '+':
+ if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
+ child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
+ else:
+ child_feat['CDS'].append(child_feat.get('stop_codon')[0])
+ elif orient == '-':
+ if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
+ child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
+ else:
+ child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])
+
+ # transcript signal sites
+ TIS, cdsStop, TSS, cleave = [], [], [], []
+ cds_status, exon_status, utr_status = 0, 0, 0
+
+ if child_feat.get('exon'):
+ TSS = [child_feat.get('exon')[-1][1]]
+ TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
+ cleave = [child_feat.get('exon')[0][0]]
+ cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
+ exon_status = 1
+
+ if child_feat.get('CDS'):
+ if orient == '+':
+ TIS = [child_feat.get('CDS')[0][0]]
+ cdsStop = [child_feat.get('CDS')[-1][1]-3]
+ else:
+ TIS = [child_feat.get('CDS')[-1][1]]
+ cdsStop = [child_feat.get('CDS')[0][0]+3]
+ cds_status = 1
+ # cds phase calculation
+ child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))
+
+ # sub-feature status
+ if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
+ utr_status =1
+
+ if utr_status == cds_status == exon_status == 1:
+ t_status = 1
+ else:
+ t_status = 0
+
+ # add sub-feature # make array for export to different out
+ TSTAT[xq] = t_status
+ EXON[xq] = np.array(child_feat.get('exon'), np.float64)
+ UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
+ UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
+ CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
+ TISc[xq] = np.array(TIS)
+ CSTOP[xq] = np.array(cdsStop)
+ TSSc[xq] = np.array(TSS)
+ CLV[xq] = np.array(cleave)
+
+ # add sub-features to the parent gene feature
+ gene[g_cnt]['transcript_status'] = TSTAT
+ gene[g_cnt]['transcripts'] = TRS
+ gene[g_cnt]['exons'] = EXON
+ gene[g_cnt]['utr5_exons'] = UTR5
+ gene[g_cnt]['cds_exons'] = CDS
+ gene[g_cnt]['utr3_exons'] = UTR3
+ gene[g_cnt]['transcript_type'] = TR_TYP
+ gene[g_cnt]['tis'] = TISc
+ gene[g_cnt]['cdsStop'] = CSTOP
+ gene[g_cnt]['tss'] = TSSc
+ gene[g_cnt]['cleave'] = CLV
+
+ gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
+ Name = pdet.get('name'),
+ Source = pkey[1])
+ # few empty fields // TODO fill this:
+ gene[g_cnt]['anno_id'] = []
+ gene[g_cnt]['confgenes_id'] = []
+ gene[g_cnt]['alias'] = ''
+ gene[g_cnt]['name2'] = []
+ gene[g_cnt]['chr_num'] = []
+ gene[g_cnt]['paralogs'] = []
+ gene[g_cnt]['transcript_info'] = []
+ gene[g_cnt]['transcript_valid'] = []
+ gene[g_cnt]['exons_confirmed'] = []
+ gene[g_cnt]['tis_conf'] = []
+ gene[g_cnt]['tis_info'] = []
+ gene[g_cnt]['cdsStop_conf'] = []
+ gene[g_cnt]['cdsStop_info'] = []
+ gene[g_cnt]['tss_info'] = []
+ gene[g_cnt]['tss_conf'] = []
+ gene[g_cnt]['cleave_info'] = []
+ gene[g_cnt]['cleave_conf'] = []
+ gene[g_cnt]['polya_info'] = []
+ gene[g_cnt]['polya_conf'] = []
+ gene[g_cnt]['is_valid'] = []
+ gene[g_cnt]['transcript_complete'] = []
+ gene[g_cnt]['is_complete'] = []
+ gene[g_cnt]['is_correctly_gff3_referenced'] = ''
+ gene[g_cnt]['splicegraph'] = []
+ g_cnt += 1
+
+ ## deleting empty gene records from the main array
+ XPFLG=0
+ for XP, ens in enumerate(gene):
+ if ens[0]==0:
+ XPFLG=1
+ break
+
+ if XPFLG==1:
+ XQC = range(XP, len(gene)+1)
+ gene = np.delete(gene, XQC)
+
+ return gene
+
+def NonetoemptyList(XS):
+ """
+ Convert a None type to empty list
+
+ @args XS: None type
+ @type XS: str
+ """
+ return [] if XS is None else XS
+
+def create_missing_feature_type(p_feat, c_feat):
+ """
+ GFF/GTF file defines only child features. This function tries to create
+ the parent feature from the information provided in the attribute column.
+
+ example:
+ chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
+ chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
+ chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";
+
+ This function gets the parsed feature annotations.
+
+ @args p_feat: Parent feature map
+ @type p_feat: collections defaultdict
+ @args c_feat: Child feature map
+ @type c_feat: collections defaultdict
+ """
+
+ child_n_map = defaultdict(list)
+ for fid, det in c_feat.items():
+ # get the details from grand child
+ GID = STRD = SCR = None
+ SPOS, EPOS = [], []
+ TYP = dict()
+ for gchild in det:
+ GID = gchild.get('gene_id', [''])[0]
+ SPOS.append(gchild.get('location', [])[0])
+ EPOS.append(gchild.get('location', [])[1])
+ STRD = gchild.get('strand', '')
+ SCR = gchild.get('score', '')
+ if gchild.get('type', '') == "gene": ## gencode GTF file has this problem
+ continue
+ TYP[gchild.get('type', '')] = 1
+ SPOS.sort()
+ EPOS.sort()
+
+ # infer transcript type
+ transcript_type = 'transcript'
+ transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type
+
+ # gene id and transcript id are same
+ transcript_id = fid[-1]
+ if GID == transcript_id:
+ transcript_id = 'Transcript:' + str(GID)
+
+ # level -1 feature type
+ p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
+ location = [], ## infer location based on multiple transcripts
+ strand = STRD,
+ name = GID )
+ # level -2 feature type
+ child_n_map[(fid[0], fid[1], GID)].append(
+ dict( type = transcript_type,
+ location = [SPOS[0], EPOS[-1]],
+ strand = STRD,
+ score = SCR,
+ ID = transcript_id,
+ gene_id = '' ))
+ # reorganizing the grand child
+ for gchild in det:
+ child_n_map[(fid[0], fid[1], transcript_id)].append(
+ dict( type = gchild.get('type', ''),
+ location = gchild.get('location'),
+ strand = gchild.get('strand'),
+ ID = gchild.get('ID'),
+ score = gchild.get('score'),
+ gene_id = '' ))
+ return p_feat, child_n_map
+
diff -r 619e0fcd9126 -r 6e589f267c14 README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,61 @@
+A collection of tools for converting genome annotation between GTF (Gene Transfer Format),
+BED (Browser Extensible Data) and GFF (Generic Feature Format).
+
+INTRODUCTION
+
+Several genome annotation centers provide their data in GTF, BED, GFF3 etc. I have few programs
+they mainly deals with converting between GTF, BED and GFF3 formats. They are extensively tested
+with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the
+instructions below to clone these tools into your galaxy instance.
+
+CONTENTS
+
+Tool configuration files in *.xml format.
+
+ gtf_to_gff.xml
+ gff_to_gtf.xml
+ bed_to_gff.xml
+ gff_to_bed.xml
+ gbk_to_gff.xml
+ gff_fmap.xml
+
+Python based scripts.
+
+ gtf_to_gff.py: convert data from GTF to valid GFF3.
+ gff_to_gtf.py: convert data from GFF3 to GTF.
+ bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3.
+ gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format.
+ gbk_to_gff.py: convert data from genbank format to GFF.
+ gff_fmap.py: find the relation between different features described in a GFF file.
+ GFFParser.py: Parse GFF/GTF files.
+ helper.py: Utility functions.
+
+test-data: Test data set. (move to your galaxy_root_folder/test-data/)
+
+ You may need to move the test files into your test-data directory so galaxy can find them.
+ If you want to run the functional tests eg as:
+
+ exmaple:
+ sh run_functional_tests.sh -id fml_gtf2gff
+
+REQUIREMENTS
+
+ python
+
+COMMENTS/QUESTIONS
+
+I can be reached at vipin [at] cbio.mskcc.org
+
+LICENSE
+
+Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society
+ 2013-2014 Memorial Sloan Kettering Cancer Center
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+COURTESY
+
+To the Galaxy Team.
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_gff.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+Convert genome annotation data in a 12 column BED format to GFF3.
+
+Usage: python bed_to_gff.py in.bed > out.gff
+
+Requirement:
+ helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C)
+ 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import helper
+
+def __main__():
+ """
+ main function
+ """
+
+ try:
+ bed_fname = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ bed_fh = helper.open_file(bed_fname)
+
+ for line in bed_fh:
+ line = line.strip( '\n\r' )
+
+ if not line or line[0] in ['#']:
+ continue
+
+ parts = line.split('\t')
+ assert len(parts) >= 12, line
+
+ rstarts = parts[-1].split(',')
+ rstarts.pop() if rstarts[-1] == '' else rstarts
+
+ exon_lens = parts[-2].split(',')
+ exon_lens.pop() if exon_lens[-1] == '' else exon_lens
+
+ if len(rstarts) != len(exon_lens):
+ continue # checking the consistency col 11 and col 12
+
+ if len(rstarts) != int(parts[-3]):
+ continue # checking the number of exons and block count are same
+
+ if not parts[5] in ['+', '-']:
+ parts[5] = '.' # replace the unknown strand with '.'
+
+ # bed2gff result line
+ print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])
+ print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])
+
+ st = int(parts[1])
+ for ex_cnt in range(int(parts[-3])):
+ start = st + int(rstarts[ex_cnt]) + 1
+ stop = start + int(exon_lens[ex_cnt]) - 1
+ print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3])
+
+ bed_fh.close()
+
+
+if __name__ == "__main__":
+ __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_gff.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,89 @@
+
+ converter
+ bed_to_gff.py $inf_bed > $gff_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in UCSC Wiggle BED format::
+
+ chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347,
+
+- Will be converted to GFF3::
+
+ ##gff-version 3
+ chr1 bed2gff gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+ chr1 bed2gff transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+ chr1 bed2gff exon 11874 12227 0 + . Parent=uc001aaa.3
+ chr1 bed2gff exon 12613 12721 0 + . Parent=uc001aaa.3
+ chr1 bed2gff exon 13221 14409 0 + . Parent=uc001aaa.3
+
+--------
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+ 1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+ 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+ 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+ 4. name - The name of the BED line.
+ 5. score - A score between 0 and 1000.
+ 6. strand - Defines the strand - either '+' or '-'.
+ 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+ 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+ 9. reserved - This should always be set to zero.
+ 10. blockCount - The number of blocks (exons) in the BED line.
+ 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+ 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+"""
+Convert data from Genbank format to GFF.
+
+Usage:
+python gbk_to_gff.py in.gbk > out.gff
+
+Requirements:
+ BioPython:- http://biopython.org/
+ helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C)
+ 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import os
+import re
+import sys
+import collections
+from Bio import SeqIO
+import helper
+
+def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
+ """
+ Write the feature information
+ """
+
+ for gname, ginfo in genes.items():
+ line = [str(chr_id),
+ 'gbk_to_gff',
+ ginfo[3],
+ str(ginfo[0]),
+ str(ginfo[1]),
+ '.',
+ ginfo[2],
+ '.',
+ 'ID=%s;Name=%s' % (str(gname), str(gname))]
+ print '\t'.join(line)
+ ## construct the transcript line is not defined in the original file
+ t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.']
+
+ if not transcripts:
+ t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
+
+ if exons: ## get the entire transcript region from the defined feature
+ t_line[3] = str(exons[gname][0][0])
+ t_line[4] = str(exons[gname][0][-1])
+ elif cds:
+ t_line[3] = str(cds[gname][0][0])
+ t_line[4] = str(cds[gname][0][-1])
+ print '\t'.join(t_line)
+
+ if exons:
+ exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
+
+ if cds:
+ exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
+ if not exons:
+ exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
+
+ else: ## transcript is defined
+ for idx in transcripts[gname]:
+ t_line[2] = idx[3]
+ t_line[3] = str(idx[0])
+ t_line[4] = str(idx[1])
+ t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
+ print '\t'.join(t_line)
+
+ ## feature line print call
+ if exons:
+ exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
+ if cds:
+ exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
+ if not exons:
+ exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
+
+ if len(genes) == 0: ## feature entry with fragment information
+
+ line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.']
+ fStart = fStop = None
+
+ for eid, ex in cds.items():
+ fStart = ex[0][0]
+ fStop = ex[0][-1]
+
+ for eid, ex in exons.items():
+ fStart = ex[0][0]
+ fStop = ex[0][-1]
+
+ if fStart or fStart:
+
+ line[2] = 'gene'
+ line[3] = str(fStart)
+ line[4] = str(fStop)
+ line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
+ print "\t".join(line)
+
+ if not cds:
+ line[2] = 'transcript'
+ else:
+ line[2] = 'mRNA'
+
+ line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
+ print "\t".join(line)
+
+ if exons:
+ exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+
+ if cds:
+ exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
+ if not exons:
+ exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
+
+ unk +=1
+
+ return unk
+
+def exon_line_print(temp_line, trx_exons, parent, ftype):
+ """
+ Print the EXON feature line
+ """
+
+ for ex in trx_exons:
+ temp_line[2] = ftype
+ temp_line[3] = str(ex[0])
+ temp_line[4] = str(ex[1])
+ temp_line[8] = 'Parent=%s' % parent
+ print '\t'.join(temp_line)
+
+def gbk_parse(fname):
+ """
+ Extract genome annotation recods from genbank format
+
+ @args fname: gbk file name
+ @type fname: str
+ """
+
+ fhand = helper.open_file(gbkfname)
+ unk = 1
+
+ for record in SeqIO.parse(fhand, "genbank"):
+
+ gene_tags = dict()
+ tx_tags = collections.defaultdict(list)
+ exon = collections.defaultdict(list)
+ cds = collections.defaultdict(list)
+ mol_type, chr_id = None, None
+
+ for rec in record.features:
+
+ if rec.type == 'source':
+ try:
+ mol_type = rec.qualifiers['mol_type'][0]
+ except:
+ mol_type = '.'
+ pass
+ try:
+ chr_id = rec.qualifiers['chromosome'][0]
+ except:
+ chr_id = record.name
+ continue
+
+ strand='-'
+ strand='+' if rec.strand>0 else strand
+
+ fid = None
+ try:
+ fid = rec.qualifiers['gene'][0]
+ except:
+ pass
+
+ transcript_id = None
+ try:
+ transcript_id = rec.qualifiers['transcript_id'][0]
+ except:
+ pass
+
+ if re.search(r'gene', rec.type):
+ gene_tags[fid] = (rec.location._start.position+1,
+ rec.location._end.position,
+ strand,
+ rec.type
+ )
+ elif rec.type == 'exon':
+ exon[fid].append((rec.location._start.position+1,
+ rec.location._end.position))
+ elif rec.type=='CDS':
+ cds[fid].append((rec.location._start.position+1,
+ rec.location._end.position))
+ else:
+ # get all transcripts
+ if transcript_id:
+ tx_tags[fid].append((rec.location._start.position+1,
+ rec.location._end.position,
+ transcript_id,
+ rec.type))
+ # record extracted, generate feature table
+ unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
+
+ fhand.close()
+
+
+if __name__=='__main__':
+
+ try:
+ gbkfname = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ ## extract gbk records
+ gbk_parse(gbkfname)
diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,91 @@
+
+ converter
+ gbk_to_gff.py $inf_gbk > $gff_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description).
+
+.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/
+
+------
+
+**Example**
+
+- The following data in GenBank format::
+
+ LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011
+ DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
+ mRNA, complete cds.
+ ACCESSION NM_001202705
+ VERSION NM_001202705.1 GI:334184566.........
+ FEATURES Location/Qualifiers
+ source 1..2406
+ /organism="Arabidopsis thaliana"
+ /mol_type="mRNA"
+ /db_xref="taxon:3702"........
+ gene 1..2406
+ /gene="THIC"
+ /locus_tag="AT2G29630"
+ /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
+ ORIGIN
+ 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
+ 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
+ 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
+ 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
+ //
+
+
+- Will be converted to GFF3::
+
+ ##gff-version 3
+ NM_001202705 gbk_to_gff chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705
+ NM_001202705 gbk_to_gff gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC
+ NM_001202705 gbk_to_gff mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630
+ NM_001202705 gbk_to_gff CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01
+ NM_001202705 gbk_to_gff exon 192 2126 . + 1 Parent=AT2G29630.t01
+
+------
+
+**About formats**
+
+**GenBank format** An example of a GenBank record may be viewed here_
+
+.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
+
+**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_fmap.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,203 @@
+#!/usr/bin/env python
+"""
+GFF feature mapping program, to find the relation between different features described in a given GFF file.
+
+Usage:
+python gff_fmap.py in.gff > out.txt
+
+Courtesy: Brad Chapman
+ Few functions are inherited from bcbio-GFFParser program.
+"""
+
+import re
+import sys
+import urllib
+import collections
+from helper import open_file
+
+def _gff_line_map(line):
+ """Parses a line of GFF into a dictionary.
+ Given an input line from a GFF file, this:
+ - breaks it into component elements
+ - determines the type of attribute (flat, parent, child or annotation)
+ - generates a dictionary of GFF info
+ """
+ gff3_kw_pat = re.compile("\w+=")
+ def _split_keyvals(keyval_str):
+ """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+ GFF3 has key value pairs like:
+ count=9;gene=amx-2;sequence=SAGE:aacggagccg
+ GFF2 and GTF have:
+ Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+ name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+ """
+ quals = collections.defaultdict(list)
+ if keyval_str is None:
+ return quals
+ # ensembl GTF has a stray semi-colon at the end
+ if keyval_str[-1] == ';':
+ keyval_str = keyval_str[:-1]
+ # GFF2/GTF has a semi-colon with at least one space after it.
+ # It can have spaces on both sides; wormbase does this.
+ # GFF3 works with no spaces.
+ # Split at the first one we can recognize as working
+ parts = keyval_str.split(" ; ")
+ if len(parts) == 1:
+ parts = keyval_str.split("; ")
+ if len(parts) == 1:
+ parts = keyval_str.split(";")
+ # check if we have GFF3 style key-vals (with =)
+ is_gff2 = True
+ if gff3_kw_pat.match(parts[0]):
+ is_gff2 = False
+ key_vals = [p.split('=') for p in parts]
+ # otherwise, we are separated by a space with a key as the first item
+ else:
+ pieces = []
+ for p in parts:
+ # fix misplaced semi-colons in keys in some GFF2 files
+ if p and p[0] == ';':
+ p = p[1:]
+ pieces.append(p.strip().split(" "))
+ key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+ for key, val in key_vals:
+ # remove quotes in GFF2 files
+ if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+ val = val[1:-1]
+ if val:
+ quals[key].extend(val.split(','))
+ # if we don't have a value, make this a key=True/False style
+ # attribute
+ else:
+ quals[key].append('true')
+ for key, vals in quals.items():
+ quals[key] = [urllib.unquote(v) for v in vals]
+ return quals, is_gff2
+
+ def _nest_gff2_features(gff_parts):
+ """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+ exons and coding sequences are mapped to a parent with a transcript_id
+ in GFF2. This is implemented differently at different genome centers
+ and this function attempts to resolve that and map things to the GFF3
+ way of doing them.
+ """
+ # map protein or transcript ids to a parent
+ for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+ try:
+ gff_parts["quals"]["Parent"] = \
+ gff_parts["quals"][transcript_id]
+ break
+ except KeyError:
+ pass
+ # case for WormBase GFF -- everything labelled as Transcript or CDS
+ for flat_name in ["Transcript", "CDS"]:
+ if gff_parts["quals"].has_key(flat_name):
+ # parent types
+ if gff_parts["type"] in [flat_name]:
+ if not gff_parts["id"]:
+ gff_parts["id"] = gff_parts["quals"][flat_name][0]
+ gff_parts["quals"]["ID"] = [gff_parts["id"]]
+ # children types
+ elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+ "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+ "start_codon"]:
+ gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+ break
+
+ return gff_parts
+
+ line = line.strip()
+ if line == '':return [('directive', line)] # sometimes the blank lines will be there
+ if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header
+ if line[0] == "#":
+ return [('directive', line[2:])]
+ elif line:
+ parts = line.split('\t')
+ if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together
+ assert len(parts) == 9, line
+ gff_parts = [(None if p == '.' else p) for p in parts]
+ gff_info = dict()
+
+ # collect all of the base qualifiers for this item
+ quals, is_gff2 = _split_keyvals(gff_parts[8])
+
+ gff_info["is_gff2"] = is_gff2
+
+ if gff_parts[1]:quals["source"].append(gff_parts[1])
+ gff_info['quals'] = dict(quals)
+
+ # if we are describing a location, then we are a feature
+ if gff_parts[3] and gff_parts[4]:
+ gff_info['type'] = gff_parts[2]
+ gff_info['id'] = quals.get('ID', [''])[0]
+
+ if is_gff2:gff_info = _nest_gff2_features(gff_info)
+ # features that have parents need to link so we can pick up
+ # the relationship
+ if gff_info['quals'].has_key('Parent'):
+ final_key = 'child'
+ elif gff_info['id']:
+ final_key = 'parent'
+ # Handle flat features
+ else:
+ final_key = 'feature'
+ # otherwise, associate these annotations with the full record
+ else:
+ final_key = 'annotation'
+ return [(final_key, gff_info)]
+
+def parent_child_id_map(gff_handle):
+ """
+ Provide a mapping of parent to child relationships in the file.
+ Gives a dictionary of parent child relationships:
+
+ keys -- tuple of (source, type) for each parent
+ values -- tuple of (source, type) as children of that parent
+ """
+ # collect all of the parent and child types mapped to IDs
+ parent_sts = dict()
+ child_sts = collections.defaultdict(list)
+ for line in gff_handle:
+ line_type, line_info = _gff_line_map(line)[0]
+ if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
+ parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
+ if line_type == 'child':
+ for parent_id in line_info['quals']['Parent']:
+ child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
+ gff_handle.close()
+ # generate a dictionary of the unique final type relationships
+ pc_map = collections.defaultdict(list)
+ for parent_id, parent_type in parent_sts.items():
+ for child_type in child_sts[parent_id]:
+ pc_map[parent_type].append(child_type)
+ pc_final_map = dict()
+ for ptype, ctypes in pc_map.items():
+ unique_ctypes = list(set(ctypes))
+ unique_ctypes.sort()
+ pc_final_map[ptype] = unique_ctypes
+ # some cases the GFF file represents a single feature type
+ if not pc_final_map:
+ for fid, stypes in parent_sts.items():
+ pc_final_map[stypes] = dict()
+ # generate a report on feature id mapping in the file
+ print '+---------------------+---------------------------------+'
+ print '| Parent feature type | Associated child feature type(s)|'
+ print '+---------------------+---------------------------------+'
+ for key, value in pc_final_map.items():
+ print key[0], key[1]
+ for child_to in value:
+ print '\t\t\t|-',child_to[0], child_to[1]
+ print '+---------------------+---------------------------------+'
+
+
+if __name__=='__main__':
+
+ try:
+ gff_file = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ gff_handle = open_file(gff_file)
+ parent_child_id_map(gff_handle)
diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_fmap.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,83 @@
+
+ features
+
+ gff_fmap.py $gff_input > $idmapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+GFF-map provides the features (gene, mRNA, UTR's, exon, CDS etc) relationship based on their identifier mapping in a given GFF file.
+
+--------
+
+**Example**
+
+- The features ID mapping in the following data in GFF3::
+
+ ##gff-version 3
+ 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11
+ 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+ 17 protein_coding protein 7256262 7256960 . + 0 ID=ENSP00000328352;Name=ENSP00000328352
+ 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751
+ 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+ 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751
+ 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751
+
+- Will be displayed as::
+
+ +-----------------------+---------------------------------+
+ | Parent feature type | Associated child feature type(s)|
+ +-----------------------+---------------------------------+
+ | protein_coding gene | protein_coding mRNA |
+ +-----------------------+---------------------------------+
+ | protein_coding protein| protein_coding CDS |
+ +-----------------------+---------------------------------+
+ | protein_coding mRNA | protein_coding CDS |
+ | | protein_coding exon |
+ | | protein_coding five_prime_UTR |
+ | | protein_coding three_prime_UTR |
+ +-----------------------+---------------------------------+
+
+--------
+
+**About formats**
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""
+Convert genome annotation data in GFF/GTF to a 12 column BED format.
+BED format typically represents the transcript models.
+
+Usage: python gff_to_bed.py in.gff > out.bed
+
+Requirement:
+ GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
+
+Copyright (C)
+ 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+
+def writeBED(tinfo):
+ """
+ writing result files in bed format
+
+ @args tinfo: list of genes
+ @args tinfo: numpy object
+ """
+
+ for ent1 in tinfo:
+ child_flag = False
+
+ for idx, tid in enumerate(ent1['transcripts']):
+ child_flag = True
+ exon_cnt = len(ent1['exons'][idx])
+ exon_len = ''
+ exon_cod = ''
+ rel_start = None
+ rel_stop = None
+ for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript
+ exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1)
+ if idz == 0: #calculate the relative start position
+ exon_cod += '0,'
+ rel_start = int(ex_cod[0])
+ rel_stop = ex_cod[1]
+ else:
+ exon_cod += '%d,' % (ex_cod[0]-rel_start)
+ rel_stop = int(ex_cod[1])
+
+ if exon_len:
+ score = '0'
+ score = ent1['score'][0] if ent1['score'] else score
+ out_print = [ent1['chr'],
+ str(rel_start),
+ str(rel_stop),
+ tid[0],
+ score,
+ ent1['strand'],
+ str(rel_start),
+ str(rel_stop),
+ '0',
+ str(exon_cnt),
+ exon_len,
+ exon_cod]
+ print '\t'.join(out_print)
+
+ if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type
+ score = '0'
+ score = ent1['score'][0] if ent1['score'] else score
+
+ out_print = [ent1['chr'],
+ '%d' % int(ent1['start']),
+ '%d' % int(ent1['stop']),
+ ent1['name'],
+ score,
+ ent1['strand'],
+ '%d' % int(ent1['start']),
+ '%d' % int(ent1['stop']),
+ '0',
+ '1',
+ '%d,' % (int(ent1['stop'])-int(ent1['start'])+1),
+ '0,']
+
+ print '\t'.join(out_print)
+
+
+def __main__():
+ try:
+ query_file = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ Transcriptdb = GFFParser.Parse(query_file)
+ writeBED(Transcriptdb)
+
+if __name__ == "__main__":
+ __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,90 @@
+
+ converter
+ gff_to_bed.py $inf_gff > $bed_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts gene transcript annotation from GTF or GFF or GFF3 to UCSC wiggle 12 column BED format.
+
+--------
+
+**Example**
+
+- The following data in GFF3::
+
+ ##gff-version 3
+ chr1 protein_coding gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+ chr1 protein_coding transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+ chr1 protein_coding exon 11874 12227 0 + . Parent=uc001aaa.3
+ chr1 protein_coding exon 12613 12721 0 + . Parent=uc001aaa.3
+ chr1 protein_coding exon 13221 14409 0 + . Parent=uc001aaa.3
+
+- Will be converted to UCSC Wiggle BED format::
+
+ chr1 11874 14409 uc001aaa.3 0 + 11874 14409 0 3 354,109,1189, 0,739,1347,
+
+--------
+
+**About formats**
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+ 1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+ 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+ 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+ 4. name - The name of the BED line.
+ 5. score - A score between 0 and 1000.
+ 6. strand - Defines the strand - either '+' or '-'.
+ 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+ 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+ 9. reserved - This should always be set to zero.
+ 10. blockCount - The number of blocks (exons) in the BED line.
+ 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+ 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gbk.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+"""
+Convert data from GFF and associated genome sequence in fasta file into GenBank.
+
+Usage:
+python gff_to_gbk.py in.gff in.fasta out.gbk
+
+Requirements:
+ BioPython:- http://biopython.org/
+ helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C)
+ 2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import sys
+import helper
+import gffparser_bcbio
+
+from Bio import SeqIO
+from Bio.Alphabet import generic_dna
+
+def __main__():
+ """
+ main wrapper
+ """
+
+ try:
+ gff_fname = sys.argv[1]
+ fasta_fname = sys.argv[2]
+ gb_fname = sys.argv[3]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ fasta_fh = helper.open_file(fasta_fname)
+
+ fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna))
+ fasta_fh.close()
+
+ gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec)
+
+ try:
+ gb_fh = open(gb_fname, "w")
+ except:
+ print 'file not ready for writing %s' % gb_fname
+ sys.exit(-1)
+
+ SeqIO.write(gff_rec, gb_fh, "genbank")
+ gb_fh.close()
+
+if __name__=="__main__":
+ __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gbk.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,98 @@
+
+ converter
+ gff_to_gbk.py $inf_gff $inf_fas $gbk_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts annotations in GFF to GenBank_ format (scroll down for format description).
+
+.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/
+
+------
+
+**Example**
+
+- The following data in GFF::
+
+ ##gff-version 3
+ # sequence-region NM_001202705 1 2406
+ NM_001202705 GenBank chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ;
+ NM_001202705 GenBank gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630
+ NM_001202705 GenBank mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630
+ NM_001202705 GenBank CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;rotein_id=NP_001189634.1;
+ NM_001202705 GenBank exon 192 2126 . + 1 Parent=AT2G29630.t01
+ ##FASTA
+ >NM_001202705
+ AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG
+ TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT
+
+- Will be converted to GenBank format::
+
+ LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011
+ DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC)
+ mRNA, complete cds.
+ ACCESSION NM_001202705
+ VERSION NM_001202705.1 GI:334184566.........
+ FEATURES Location/Qualifiers
+ source 1..2406
+ /organism="Arabidopsis thaliana"
+ /mol_type="mRNA"
+ /db_xref="taxon:3702"........
+ gene 1..2406
+ /gene="THIC"
+ /locus_tag="AT2G29630"
+ /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........
+ ORIGIN
+ 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg
+ 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat
+ 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc
+ 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca
+ //
+
+------
+
+**About formats**
+
+**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+**GenBank format** Consists of an annotation section and a sequence section. Sample record_
+
+.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
+
+
+--------
+
+**Copyright**
+
+2010-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gtf.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+"""
+Program to convert data from GFF to GTF
+
+Usage: python gff_to_gtf.py in.gff > out.gtf
+
+Requirement:
+ GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
+
+Copyright (C)
+ 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+
+def printGTF(tinfo):
+ """
+ writing result file in GTF format
+
+ @args tinfo: parsed object from gff file
+ @type tinfo: numpy array
+ """
+
+ for ent1 in tinfo:
+ for idx, tid in enumerate(ent1['transcripts']):
+
+ exons = ent1['exons'][idx]
+ cds_exons = ent1['cds_exons'][idx]
+
+ stop_codon = start_codon = ()
+
+ if ent1['strand'] == '+':
+ if cds_exons.any():
+ start_codon = (cds_exons[0][0], cds_exons[0][0]+2)
+ stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1])
+ elif ent1['strand'] == '-':
+ if cds_exons.any():
+ start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1])
+ stop_codon = (cds_exons[0][0], cds_exons[0][0]+2)
+ else:
+ print 'STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0])
+ pass
+
+ last_cds_cod = 0
+ for idz, ex_cod in enumerate(exons):
+
+ print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+ if cds_exons.any():
+ try:
+ print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+ last_cds_cod = idz
+ except:
+ pass
+
+ if idz == 0:
+ print '%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+ if stop_codon:
+ print '%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name'])
+
+
+if __name__ == "__main__":
+
+ try:
+ gff_fname = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ Transcriptdb = GFFParser.Parse(gff_fname)
+
+ printGTF(Transcriptdb)
diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_gtf.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,88 @@
+
+ converter
+ gff_to_gtf.py $inf_gff3 > $gtf_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from GFF3 to GTF file format (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in GFF3 format::
+
+ ##gff-version 3
+ 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11
+ 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+ 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+ 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751
+ 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+ 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751
+ 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751
+
+- Will be converted to GTF format::
+
+ 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+ 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+--------
+
+**About formats**
+
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+
+**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields::
+
+ 1. seqname - The name of the sequence.
+ 2. source - This indicating where the annotation came from.
+ 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. end - The ending position of the feature (inclusive).
+ 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+ 7. strand - Valid entries include '+', '-', or '.'
+ 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+ 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 gffparser_bcbio.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gffparser_bcbio.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,828 @@
+"""Parse GFF files into features attached to Biopython SeqRecord objects.
+
+This deals with GFF3 formatted files, a tab delimited format for storing
+sequence features and annotations:
+
+http://www.sequenceontology.org/gff3.shtml
+
+It will also deal with older GFF versions (GTF/GFF2):
+
+http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml
+http://mblab.wustl.edu/GTF22.html
+
+The implementation utilizes map/reduce parsing of GFF using Disco. Disco
+(http://discoproject.org) is a Map-Reduce framework for Python utilizing
+Erlang for parallelization. The code works on a single processor without
+Disco using the same architecture.
+"""
+import os
+import copy
+import re
+import collections
+import urllib
+import itertools
+
+# Make defaultdict compatible with versions of python older than 2.4
+try:
+ collections.defaultdict
+except AttributeError:
+ import _utils
+ collections.defaultdict = _utils.defaultdict
+
+from Bio.Seq import Seq, UnknownSeq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio import SeqIO
+
+def _gff_line_map(line, params):
+ """Map part of Map-Reduce; parses a line of GFF into a dictionary.
+
+ Given an input line from a GFF file, this:
+ - decides if the file passes our filtering limits
+ - if so:
+ - breaks it into component elements
+ - determines the type of attribute (flat, parent, child or annotation)
+ - generates a dictionary of GFF info which can be serialized as JSON
+ """
+ gff3_kw_pat = re.compile("\w+=")
+ def _split_keyvals(keyval_str):
+ """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
+
+ GFF3 has key value pairs like:
+ count=9;gene=amx-2;sequence=SAGE:aacggagccg
+ GFF2 and GTF have:
+ Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
+ name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
+ """
+ quals = collections.defaultdict(list)
+ if keyval_str is None:
+ return quals
+ # ensembl GTF has a stray semi-colon at the end
+ if keyval_str[-1] == ';':
+ keyval_str = keyval_str[:-1]
+ # GFF2/GTF has a semi-colon with at least one space after it.
+ # It can have spaces on both sides; wormbase does this.
+ # GFF3 works with no spaces.
+ # Split at the first one we can recognize as working
+ parts = keyval_str.split(" ; ")
+ if len(parts) == 1:
+ parts = keyval_str.split("; ")
+ if len(parts) == 1:
+ parts = keyval_str.split(";")
+ # check if we have GFF3 style key-vals (with =)
+ is_gff2 = True
+ if gff3_kw_pat.match(parts[0]):
+ is_gff2 = False
+ key_vals = [p.split('=') for p in parts]
+ # otherwise, we are separated by a space with a key as the first item
+ else:
+ pieces = []
+ for p in parts:
+ # fix misplaced semi-colons in keys in some GFF2 files
+ if p and p[0] == ';':
+ p = p[1:]
+ pieces.append(p.strip().split(" "))
+ key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
+ for item in key_vals:
+ # standard in-spec items are key=value
+ if len(item) == 2:
+ key, val = item
+ # out-of-spec files can have just key values. We set an empty value
+ # which will be changed to true later to standardize.
+ else:
+ assert len(item) == 1, item
+ key = item[0]
+ val = ''
+ # remove quotes in GFF2 files
+ if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
+ val = val[1:-1]
+ if val:
+ quals[key].extend([v for v in val.split(',') if v])
+ # if we don't have a value, make this a key=True/False style
+ # attribute
+ else:
+ quals[key].append('true')
+ for key, vals in quals.items():
+ quals[key] = [urllib.unquote(v) for v in vals]
+ return quals, is_gff2
+
+ def _nest_gff2_features(gff_parts):
+ """Provide nesting of GFF2 transcript parts with transcript IDs.
+
+ exons and coding sequences are mapped to a parent with a transcript_id
+ in GFF2. This is implemented differently at different genome centers
+ and this function attempts to resolve that and map things to the GFF3
+ way of doing them.
+ """
+ # map protein or transcript ids to a parent
+ for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
+ try:
+ gff_parts["quals"]["Parent"] = \
+ gff_parts["quals"][transcript_id]
+ break
+ except KeyError:
+ pass
+ # case for WormBase GFF -- everything labelled as Transcript or CDS
+ for flat_name in ["Transcript", "CDS"]:
+ if gff_parts["quals"].has_key(flat_name):
+ # parent types
+ if gff_parts["type"] in [flat_name]:
+ if not gff_parts["id"]:
+ gff_parts["id"] = gff_parts["quals"][flat_name][0]
+ gff_parts["quals"]["ID"] = [gff_parts["id"]]
+ # children types
+ elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
+ "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+ "start_codon"]:
+ gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
+ break
+
+ return gff_parts
+
+ strand_map = {'+' : 1, '-' : -1, '?' : None, None: None}
+ line = line.strip()
+ if line[:2] == "##":
+ return [('directive', line[2:])]
+ elif line and line[0] != "#":
+ parts = line.split('\t')
+ should_do = True
+ if params.limit_info:
+ for limit_name, limit_values in params.limit_info.items():
+ cur_id = tuple([parts[i] for i in
+ params.filter_info[limit_name]])
+ if cur_id not in limit_values:
+ should_do = False
+ break
+ if should_do:
+ assert len(parts) >= 8, line
+ # not python2.4 compatible but easier to understand
+ #gff_parts = [(None if p == '.' else p) for p in parts]
+ gff_parts = []
+ for p in parts:
+ if p == ".":
+ gff_parts.append(None)
+ else:
+ gff_parts.append(p)
+ gff_info = dict()
+ # collect all of the base qualifiers for this item
+ if len(parts) > 8:
+ quals, is_gff2 = _split_keyvals(gff_parts[8])
+ else:
+ quals, is_gff2 = dict(), False
+ gff_info["is_gff2"] = is_gff2
+ if gff_parts[1]:
+ quals["source"].append(gff_parts[1])
+ if gff_parts[5]:
+ quals["score"].append(gff_parts[5])
+ if gff_parts[7]:
+ quals["phase"].append(gff_parts[7])
+ gff_info['quals'] = dict(quals)
+ gff_info['rec_id'] = gff_parts[0]
+ # if we are describing a location, then we are a feature
+ if gff_parts[3] and gff_parts[4]:
+ gff_info['location'] = [int(gff_parts[3]) - 1,
+ int(gff_parts[4])]
+ gff_info['type'] = gff_parts[2]
+ gff_info['id'] = quals.get('ID', [''])[0]
+ gff_info['strand'] = strand_map.get(gff_parts[6], None)
+ if is_gff2:
+ gff_info = _nest_gff2_features(gff_info)
+ # features that have parents need to link so we can pick up
+ # the relationship
+ if gff_info['quals'].has_key('Parent'):
+ # check for self referential parent/child relationships
+ # remove the ID, which is not useful
+ for p in gff_info['quals']['Parent']:
+ if p == gff_info['id']:
+ gff_info['id'] = ''
+ del gff_info['quals']['ID']
+ break
+ final_key = 'child'
+ elif gff_info['id']:
+ final_key = 'parent'
+ # Handle flat features
+ else:
+ final_key = 'feature'
+ # otherwise, associate these annotations with the full record
+ else:
+ final_key = 'annotation'
+ if params.jsonify:
+ return [(final_key, simplejson.dumps(gff_info))]
+ else:
+ return [(final_key, gff_info)]
+ return []
+
+def _gff_line_reduce(map_results, out, params):
+ """Reduce part of Map-Reduce; combines results of parsed features.
+ """
+ final_items = dict()
+ for gff_type, final_val in map_results:
+ if params.jsonify and gff_type not in ['directive']:
+ final_val = simplejson.loads(final_val)
+ try:
+ final_items[gff_type].append(final_val)
+ except KeyError:
+ final_items[gff_type] = [final_val]
+ for key, vals in final_items.items():
+ if params.jsonify:
+ vals = simplejson.dumps(vals)
+ out.add(key, vals)
+
+class _MultiIDRemapper:
+ """Provide an ID remapping for cases where a parent has a non-unique ID.
+
+ Real life GFF3 cases have non-unique ID attributes, which we fix here
+ by using the unique sequence region to assign children to the right
+ parent.
+ """
+ def __init__(self, base_id, all_parents):
+ self._base_id = base_id
+ self._parents = all_parents
+
+ def remap_id(self, feature_dict):
+ rstart, rend = feature_dict['location']
+ for index, parent in enumerate(self._parents):
+ pstart, pend = parent['location']
+ if rstart >= pstart and rend <= pend:
+ if index > 0:
+ return ("%s_%s" % (self._base_id, index + 1))
+ else:
+ return self._base_id
+ raise ValueError("Did not find remapped ID location: %s, %s, %s" % (
+ self._base_id, [p['location'] for p in self._parents],
+ feature_dict['location']))
+
+class _AbstractMapReduceGFF:
+ """Base class providing general GFF parsing for local and remote classes.
+
+ This class should be subclassed to provide a concrete class to parse
+ GFF under specific conditions. These classes need to implement
+ the _gff_process function, which returns a dictionary of SeqRecord
+ information.
+ """
+ def __init__(self, create_missing=True):
+ """Initialize GFF parser
+
+ create_missing - If True, create blank records for GFF ids not in
+ the base_dict. If False, an error will be raised.
+ """
+ self._create_missing = create_missing
+ self._map_fn = _gff_line_map
+ self._reduce_fn = _gff_line_reduce
+ self._examiner = GFFExaminer()
+
+ def _gff_process(self, gff_files, limit_info, target_lines=None):
+ raise NotImplementedError("Derived class must define")
+
+ def parse(self, gff_files, base_dict=None, limit_info=None):
+ """Parse a GFF file, returning an iterator of SeqRecords.
+
+ limit_info - A dictionary specifying the regions of the GFF file
+ which should be extracted. This allows only relevant portions of a file
+ to be parsed.
+
+ base_dict - A base dictionary of SeqRecord objects which may be
+ pre-populated with sequences and other features. The new features from
+ the GFF file will be added to this dictionary.
+ """
+ for rec in self.parse_in_parts(gff_files, base_dict, limit_info):
+ yield rec
+
+ def parse_in_parts(self, gff_files, base_dict=None, limit_info=None,
+ target_lines=None):
+ """Parse a region of a GFF file specified, returning info as generated.
+
+ target_lines -- The number of lines in the file which should be used
+ for each partial parse. This should be determined based on available
+ memory.
+ """
+ for results in self.parse_simple(gff_files, limit_info, target_lines):
+ if base_dict is None:
+ cur_dict = dict()
+ else:
+ cur_dict = copy.deepcopy(base_dict)
+ cur_dict = self._results_to_features(cur_dict, results)
+ all_ids = cur_dict.keys()
+ all_ids.sort()
+ for cur_id in all_ids:
+ yield cur_dict[cur_id]
+
+ def parse_simple(self, gff_files, limit_info=None, target_lines=1):
+ """Simple parse which does not build or nest features.
+
+ This returns a simple dictionary representation of each line in the
+ GFF file.
+ """
+ # gracefully handle a single file passed
+ if not isinstance(gff_files, (list, tuple)):
+ gff_files = [gff_files]
+ limit_info = self._normalize_limit_info(limit_info)
+ for results in self._gff_process(gff_files, limit_info, target_lines):
+ yield results
+
+ def _normalize_limit_info(self, limit_info):
+ """Turn all limit information into tuples for identical comparisons.
+ """
+ final_limit_info = {}
+ if limit_info:
+ for key, values in limit_info.items():
+ final_limit_info[key] = []
+ for v in values:
+ if isinstance(v, str):
+ final_limit_info[key].append((v,))
+ else:
+ final_limit_info[key].append(tuple(v))
+ return final_limit_info
+
+ def _results_to_features(self, base, results):
+ """Add parsed dictionaries of results to Biopython SeqFeatures.
+ """
+ base = self._add_annotations(base, results.get('annotation', []))
+ for feature in results.get('feature', []):
+ (_, base) = self._add_toplevel_feature(base, feature)
+ base = self._add_parent_child_features(base, results.get('parent', []),
+ results.get('child', []))
+ base = self._add_seqs(base, results.get('fasta', []))
+ base = self._add_directives(base, results.get('directive', []))
+ return base
+
+ def _add_directives(self, base, directives):
+ """Handle any directives or meta-data in the GFF file.
+
+ Relevant items are added as annotation meta-data to each record.
+ """
+ dir_keyvals = collections.defaultdict(list)
+ for directive in directives:
+ parts = directive.split()
+ if len(parts) > 1:
+ key = parts[0]
+ if len(parts) == 2:
+ val = parts[1]
+ else:
+ val = tuple(parts[1:])
+ dir_keyvals[key].append(val)
+ for key, vals in dir_keyvals.items():
+ for rec in base.values():
+ self._add_ann_to_rec(rec, key, vals)
+ return base
+
+ def _add_seqs(self, base, recs):
+ """Add sequence information contained in the GFF3 to records.
+ """
+ for rec in recs:
+ if base.has_key(rec.id):
+ base[rec.id].seq = rec.seq
+ else:
+ base[rec.id] = rec
+ return base
+
+ def _add_parent_child_features(self, base, parents, children):
+ """Add nested features with parent child relationships.
+ """
+ multi_remap = self._identify_dup_ids(parents)
+ # add children features
+ children_prep = collections.defaultdict(list)
+ for child_dict in children:
+ child_feature = self._get_feature(child_dict)
+ for pindex, pid in enumerate(child_feature.qualifiers['Parent']):
+ if multi_remap.has_key(pid):
+ pid = multi_remap[pid].remap_id(child_dict)
+ child_feature.qualifiers['Parent'][pindex] = pid
+ children_prep[pid].append((child_dict['rec_id'],
+ child_feature))
+ children = dict(children_prep)
+ # add children to parents that exist
+ for cur_parent_dict in parents:
+ cur_id = cur_parent_dict['id']
+ if multi_remap.has_key(cur_id):
+ cur_parent_dict['id'] = multi_remap[cur_id].remap_id(
+ cur_parent_dict)
+ cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict)
+ cur_parent, children = self._add_children_to_parent(cur_parent,
+ children)
+ # create parents for children without them (GFF2 or split/bad files)
+ while len(children) > 0:
+ parent_id, cur_children = itertools.islice(children.items(),
+ 1).next()
+ # one child, do not nest it
+ if len(cur_children) == 1:
+ rec_id, child = cur_children[0]
+ loc = (child.location.nofuzzy_start, child.location.nofuzzy_end)
+ rec, base = self._get_rec(base,
+ dict(rec_id=rec_id, location=loc))
+ rec.features.append(child)
+ del children[parent_id]
+ else:
+ cur_parent, base = self._add_missing_parent(base, parent_id,
+ cur_children)
+ cur_parent, children = self._add_children_to_parent(cur_parent,
+ children)
+ return base
+
+ def _identify_dup_ids(self, parents):
+ """Identify duplicated ID attributes in potential nested parents.
+
+ According to the GFF3 spec ID attributes are supposed to be unique
+ for a file, but this is not always true in practice. This looks
+ for duplicates, and provides unique IDs sorted by locations.
+ """
+ multi_ids = collections.defaultdict(list)
+ for parent in parents:
+ multi_ids[parent['id']].append(parent)
+ multi_ids = [(mid, parents) for (mid, parents) in multi_ids.items()
+ if len(parents) > 1]
+ multi_remap = dict()
+ for mid, parents in multi_ids:
+ multi_remap[mid] = _MultiIDRemapper(mid, parents)
+ return multi_remap
+
+ def _add_children_to_parent(self, cur_parent, children):
+ """Recursively add children to parent features.
+ """
+ if children.has_key(cur_parent.id):
+ cur_children = children[cur_parent.id]
+ for rec_id, cur_child in cur_children:
+ cur_child, children = self._add_children_to_parent(cur_child,
+ children)
+ cur_parent.location_operator = "join"
+ cur_parent.sub_features.append(cur_child)
+ del children[cur_parent.id]
+ return cur_parent, children
+
+ def _add_annotations(self, base, anns):
+ """Add annotation data from the GFF file to records.
+ """
+ # add these as a list of annotations, checking not to overwrite
+ # current values
+ for ann in anns:
+ rec, base = self._get_rec(base, ann)
+ for key, vals in ann['quals'].items():
+ self._add_ann_to_rec(rec, key, vals)
+ return base
+
+ def _add_ann_to_rec(self, rec, key, vals):
+ """Add a key/value annotation to the given SeqRecord.
+ """
+ if rec.annotations.has_key(key):
+ try:
+ rec.annotations[key].extend(vals)
+ except AttributeError:
+ rec.annotations[key] = [rec.annotations[key]] + vals
+ else:
+ rec.annotations[key] = vals
+
+ def _get_rec(self, base, info_dict):
+ """Retrieve a record to add features to.
+ """
+ max_loc = info_dict.get('location', (0, 1))[1]
+ try:
+ cur_rec = base[info_dict['rec_id']]
+ # update generated unknown sequences with the expected maximum length
+ if isinstance(cur_rec.seq, UnknownSeq):
+ cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
+ return cur_rec, base
+ except KeyError:
+ if self._create_missing:
+ new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id'])
+ base[info_dict['rec_id']] = new_rec
+ return new_rec, base
+ else:
+ raise
+
+ def _add_missing_parent(self, base, parent_id, cur_children):
+ """Add a new feature that is missing from the GFF file.
+ """
+ base_rec_id = list(set(c[0] for c in cur_children))
+ assert len(base_rec_id) == 1
+ feature_dict = dict(id=parent_id, strand=None,
+ type="inferred_parent", quals=dict(ID=[parent_id]),
+ rec_id=base_rec_id[0])
+ coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end)
+ for r, c in cur_children]
+ feature_dict["location"] = (min([c[0] for c in coords]),
+ max([c[1] for c in coords]))
+ return self._add_toplevel_feature(base, feature_dict)
+
+ def _add_toplevel_feature(self, base, feature_dict):
+ """Add a toplevel non-nested feature to the appropriate record.
+ """
+ new_feature = self._get_feature(feature_dict)
+ rec, base = self._get_rec(base, feature_dict)
+ rec.features.append(new_feature)
+ return new_feature, base
+
+ def _get_feature(self, feature_dict):
+ """Retrieve a Biopython feature from our dictionary representation.
+ """
+ location = FeatureLocation(*feature_dict['location'])
+ new_feature = SeqFeature(location, feature_dict['type'],
+ id=feature_dict['id'], strand=feature_dict['strand'])
+ new_feature.qualifiers = feature_dict['quals']
+ return new_feature
+
+ def _parse_fasta(self, in_handle):
+ """Parse FASTA sequence information contained in the GFF3 file.
+ """
+ return list(SeqIO.parse(in_handle, "fasta"))
+
+class _GFFParserLocalOut:
+ """Provide a collector for local GFF MapReduce file parsing.
+ """
+ def __init__(self, smart_breaks=False):
+ self._items = dict()
+ self._smart_breaks = smart_breaks
+ self._missing_keys = collections.defaultdict(int)
+ self._last_parent = None
+ self.can_break = True
+ self.num_lines = 0
+
+ def add(self, key, vals):
+ if self._smart_breaks:
+ # if we are not GFF2 we expect parents and break
+ # based on not having missing ones
+ if key == 'directive':
+ if vals[0] == '#':
+ self.can_break = True
+ self._last_parent = None
+ elif not vals[0].get("is_gff2", False):
+ self._update_missing_parents(key, vals)
+ self.can_break = (len(self._missing_keys) == 0)
+ # break when we are done with stretches of child features
+ elif key != 'child':
+ self.can_break = True
+ self._last_parent = None
+ # break when we have lots of child features in a row
+ # and change between parents
+ else:
+ cur_parent = vals[0]["quals"]["Parent"][0]
+ if (self._last_parent):
+ self.can_break = (cur_parent != self._last_parent)
+ self._last_parent = cur_parent
+ self.num_lines += 1
+ try:
+ self._items[key].extend(vals)
+ except KeyError:
+ self._items[key] = vals
+
+ def _update_missing_parents(self, key, vals):
+ # smart way of deciding if we can break this.
+ # if this is too much, can go back to not breaking in the
+ # middle of children
+ if key in ["child"]:
+ for val in vals:
+ for p_id in val["quals"]["Parent"]:
+ self._missing_keys[p_id] += 1
+ for val in vals:
+ try:
+ del self._missing_keys[val["quals"]["ID"][0]]
+ except KeyError:
+ pass
+
+ def has_items(self):
+ return len(self._items) > 0
+
+ def get_results(self):
+ self._last_parent = None
+ return self._items
+
+class GFFParser(_AbstractMapReduceGFF):
+ """Local GFF parser providing standardized parsing of GFF3 and GFF2 files.
+ """
+ def __init__(self, line_adjust_fn=None, create_missing=True):
+ _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+ self._line_adjust_fn = line_adjust_fn
+
+ def _gff_process(self, gff_files, limit_info, target_lines):
+ """Process GFF addition without any parallelization.
+
+ In addition to limit filtering, this accepts a target_lines attribute
+ which provides a number of lines to parse before returning results.
+ This allows partial parsing of a file to prevent memory issues.
+ """
+ line_gen = self._file_line_generator(gff_files)
+ for out in self._lines_to_out_info(line_gen, limit_info, target_lines):
+ yield out
+
+ def _file_line_generator(self, gff_files):
+ """Generate single lines from a set of GFF files.
+ """
+ for gff_file in gff_files:
+ if hasattr(gff_file, "read"):
+ need_close = False
+ in_handle = gff_file
+ else:
+ need_close = True
+ in_handle = open(gff_file)
+ found_seqs = False
+ while 1:
+ line = in_handle.readline()
+ if not line:
+ break
+ yield line
+ if need_close:
+ in_handle.close()
+
+ def _lines_to_out_info(self, line_iter, limit_info=None,
+ target_lines=None):
+ """Generate SeqRecord and SeqFeatures from GFF file lines.
+ """
+ params = self._examiner._get_local_params(limit_info)
+ out_info = _GFFParserLocalOut((target_lines is not None and
+ target_lines > 1))
+ found_seqs = False
+ for line in line_iter:
+ results = self._map_fn(line, params)
+ if self._line_adjust_fn and results:
+ if results[0][0] not in ['directive']:
+ results = [(results[0][0],
+ self._line_adjust_fn(results[0][1]))]
+ self._reduce_fn(results, out_info, params)
+ if (target_lines and out_info.num_lines >= target_lines and
+ out_info.can_break):
+ yield out_info.get_results()
+ out_info = _GFFParserLocalOut((target_lines is not None and
+ target_lines > 1))
+ if (results and results[0][0] == 'directive' and
+ results[0][1] == 'FASTA'):
+ found_seqs = True
+ break
+
+ class FakeHandle:
+ def __init__(self, line_iter):
+ self._iter = line_iter
+ def read(self):
+ return "".join(l for l in self._iter)
+ def readline(self):
+ try:
+ return self._iter.next()
+ except StopIteration:
+ return ""
+
+ if found_seqs:
+ fasta_recs = self._parse_fasta(FakeHandle(line_iter))
+ out_info.add('fasta', fasta_recs)
+ if out_info.has_items():
+ yield out_info.get_results()
+
+class DiscoGFFParser(_AbstractMapReduceGFF):
+ """GFF Parser with parallelization through Disco (http://discoproject.org.
+ """
+ def __init__(self, disco_host, create_missing=True):
+ """Initialize parser.
+
+ disco_host - Web reference to a Disco host which will be used for
+ parallelizing the GFF reading job.
+ """
+ _AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
+ self._disco_host = disco_host
+
+ def _gff_process(self, gff_files, limit_info, target_lines=None):
+ """Process GFF addition, using Disco to parallelize the process.
+ """
+ assert target_lines is None, "Cannot split parallelized jobs"
+ # make these imports local; only need them when using disco
+ import simplejson
+ import disco
+ # absolute path names unless they are special disco files
+ full_files = []
+ for f in gff_files:
+ if f.split(":")[0] != "disco":
+ full_files.append(os.path.abspath(f))
+ else:
+ full_files.append(f)
+ results = disco.job(self._disco_host, name="gff_reader",
+ input=full_files,
+ params=disco.Params(limit_info=limit_info, jsonify=True,
+ filter_info=self._examiner._filter_info),
+ required_modules=["simplejson", "collections", "re"],
+ map=self._map_fn, reduce=self._reduce_fn)
+ processed = dict()
+ for out_key, out_val in disco.result_iterator(results):
+ processed[out_key] = simplejson.loads(out_val)
+ yield processed
+
+def parse(gff_files, base_dict=None, limit_info=None, target_lines=None):
+ """High level interface to parse GFF files into SeqRecords and SeqFeatures.
+ """
+ parser = GFFParser()
+ for rec in parser.parse_in_parts(gff_files, base_dict, limit_info,
+ target_lines):
+ yield rec
+
+def _file_or_handle(fn):
+ """Decorator to handle either an input handle or a file.
+ """
+ def _file_or_handle_inside(*args, **kwargs):
+ in_file = args[1]
+ if hasattr(in_file, "read"):
+ need_close = False
+ in_handle = in_file
+ else:
+ need_close = True
+ in_handle = open(in_file)
+ args = (args[0], in_handle) + args[2:]
+ out = fn(*args, **kwargs)
+ if need_close:
+ in_handle.close()
+ return out
+ return _file_or_handle_inside
+
+class GFFExaminer:
+ """Provide high level details about a GFF file to refine parsing.
+
+ GFF is a spec and is provided by many different centers. Real life files
+ will present the same information in slightly different ways. Becoming
+ familiar with the file you are dealing with is the best way to extract the
+ information you need. This class provides high level summary details to
+ help in learning.
+ """
+ def __init__(self):
+ self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
+ gff_source = [1], gff_type = [2])
+
+ def _get_local_params(self, limit_info=None):
+ class _LocalParams:
+ def __init__(self):
+ self.jsonify = False
+ params = _LocalParams()
+ params.limit_info = limit_info
+ params.filter_info = self._filter_info
+ return params
+
+ @_file_or_handle
+ def available_limits(self, gff_handle):
+ """Return dictionary information on possible limits for this file.
+
+ This returns a nested dictionary with the following structure:
+
+ keys -- names of items to filter by
+ values -- dictionary with:
+ keys -- filter choice
+ value -- counts of that filter in this file
+
+ Not a parallelized map-reduce implementation.
+ """
+ cur_limits = dict()
+ for filter_key in self._filter_info.keys():
+ cur_limits[filter_key] = collections.defaultdict(int)
+ for line in gff_handle:
+ # when we hit FASTA sequences, we are done with annotations
+ if line.startswith("##FASTA"):
+ break
+ # ignore empty and comment lines
+ if line.strip() and line.strip()[0] != "#":
+ parts = [p.strip() for p in line.split('\t')]
+ assert len(parts) == 9, line
+ for filter_key, cur_indexes in self._filter_info.items():
+ cur_id = tuple([parts[i] for i in cur_indexes])
+ cur_limits[filter_key][cur_id] += 1
+ # get rid of the default dicts
+ final_dict = dict()
+ for key, value_dict in cur_limits.items():
+ if len(key) == 1:
+ key = key[0]
+ final_dict[key] = dict(value_dict)
+ gff_handle.close()
+ return final_dict
+
+ @_file_or_handle
+ def parent_child_map(self, gff_handle):
+ """Provide a mapping of parent to child relationships in the file.
+
+ Returns a dictionary of parent child relationships:
+
+ keys -- tuple of (source, type) for each parent
+ values -- tuple of (source, type) as children of that parent
+
+ Not a parallelized map-reduce implementation.
+ """
+ # collect all of the parent and child types mapped to IDs
+ parent_sts = dict()
+ child_sts = collections.defaultdict(list)
+ for line in gff_handle:
+ # when we hit FASTA sequences, we are done with annotations
+ if line.startswith("##FASTA"):
+ break
+ if line.strip():
+ line_type, line_info = _gff_line_map(line,
+ self._get_local_params())[0]
+ if (line_type == 'parent' or (line_type == 'child' and
+ line_info['id'])):
+ parent_sts[line_info['id']] = (
+ line_info['quals']['source'][0], line_info['type'])
+ if line_type == 'child':
+ for parent_id in line_info['quals']['Parent']:
+ child_sts[parent_id].append((
+ line_info['quals']['source'][0], line_info['type']))
+ #print parent_sts, child_sts
+ # generate a dictionary of the unique final type relationships
+ pc_map = collections.defaultdict(list)
+ for parent_id, parent_type in parent_sts.items():
+ for child_type in child_sts[parent_id]:
+ pc_map[parent_type].append(child_type)
+ pc_final_map = dict()
+ for ptype, ctypes in pc_map.items():
+ unique_ctypes = list(set(ctypes))
+ unique_ctypes.sort()
+ pc_final_map[ptype] = unique_ctypes
+ return pc_final_map
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gtf_to_gff.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""
+Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].
+
+Usage: python gtf_to_gff.py in.gtf > out.gff3
+
+Requirement:
+ GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
+ helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
+
+Copyright (C)
+ 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+ 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+import helper
+
+def GFFWriter(gtf_content):
+ """
+ write the feature information to GFF format
+
+ @args gtf_content: Parsed object from gtf file
+ @type gtf_content: numpy array
+ """
+
+ print '##gff-version 3'
+
+ for ent1 in gtf_content:
+
+ chr_name = ent1['chr']
+ strand = ent1['strand']
+ start = ent1['start']
+ stop = ent1['stop']
+ source = ent1['source']
+ ID = ent1['name']
+ Name = ent1['gene_info']['Name']
+
+ Name = ID if not Name else Name
+
+ print '%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name)
+
+ for idx, tid in enumerate(ent1['transcripts']):
+ print idx
+ print tid
+
+ t_start = ent1['exons'][idx][0][0]
+ t_stop = ent1['exons'][idx][-1][-1]
+ t_type = ent1['transcript_type'][idx]
+
+ utr5_exons, utr3_exons = [], []
+ if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any():
+ utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand)
+
+ print '%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID)
+
+ for ex_cod in utr5_exons:
+ print '%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])
+
+ for ex_cod in ent1['cds_exons'][idx]:
+ print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0])
+
+ for ex_cod in utr3_exons:
+ print '%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])
+
+ for ex_cod in ent1['exons'][idx]:
+ print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0])
+
+
+def __main__():
+
+ try:
+ gtf_fname = sys.argv[1]
+ except:
+ print __doc__
+ sys.exit(-1)
+
+ gtf_file_content = GFFParser.Parse(gtf_fname)
+
+ GFFWriter(gtf_file_content)
+
+if __name__ == "__main__":
+ __main__()
diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gtf_to_gff.xml Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,94 @@
+
+ converter
+ gtf_to_gff.py $inf_gtf > $gff3_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from GTF to a valid GFF3 file (scroll down for format description).
+
+--------
+
+**Example**
+
+- The following data in GTF format::
+
+ 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+ 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+- Will be converted to GFF3 format::
+
+ ##gff-version 3
+ 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11
+ 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+ 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+ 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751
+ 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+ 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751
+ 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751
+
+--------
+
+**About formats**
+
+**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields::
+
+ 1. seqname - The name of the sequence.
+ 2. source - This indicating where the annotation came from.
+ 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. end - The ending position of the feature (inclusive).
+ 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+ 7. strand - Valid entries include '+', '-', or '.'
+ 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+ 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields::
+
+ 1. seqid - Must be a chromosome or scaffold.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+**Copyright**
+
+2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center
+
+Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014)
+
+
+
diff -r 619e0fcd9126 -r 6e589f267c14 helper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/helper.py Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,332 @@
+#!/usr/bin/env python
+"""
+Common utility functions
+"""
+
+import os
+import re
+import sys
+import gzip
+import bz2
+import numpy
+
+def init_gene():
+ """
+ Initializing the gene structure
+ """
+
+ gene_det = [('id', 'f8'),
+ ('anno_id', numpy.dtype),
+ ('confgenes_id', numpy.dtype),
+ ('name', 'S25'),
+ ('source', 'S25'),
+ ('gene_info', numpy.dtype),
+ ('alias', 'S15'),
+ ('name2', numpy.dtype),
+ ('strand', 'S2'),
+ ('score', 'S15'),
+ ('chr', 'S15'),
+ ('chr_num', numpy.dtype),
+ ('paralogs', numpy.dtype),
+ ('start', 'f8'),
+ ('stop', 'f8'),
+ ('transcripts', numpy.dtype),
+ ('transcript_type', numpy.dtype),
+ ('transcript_info', numpy.dtype),
+ ('transcript_status', numpy.dtype),
+ ('transcript_valid', numpy.dtype),
+ ('exons', numpy.dtype),
+ ('exons_confirmed', numpy.dtype),
+ ('cds_exons', numpy.dtype),
+ ('utr5_exons', numpy.dtype),
+ ('utr3_exons', numpy.dtype),
+ ('tis', numpy.dtype),
+ ('tis_conf', numpy.dtype),
+ ('tis_info', numpy.dtype),
+ ('cdsStop', numpy.dtype),
+ ('cdsStop_conf', numpy.dtype),
+ ('cdsStop_info', numpy.dtype),
+ ('tss', numpy.dtype),
+ ('tss_info', numpy.dtype),
+ ('tss_conf', numpy.dtype),
+ ('cleave', numpy.dtype),
+ ('cleave_info', numpy.dtype),
+ ('cleave_conf', numpy.dtype),
+ ('polya', numpy.dtype),
+ ('polya_info', numpy.dtype),
+ ('polya_conf', numpy.dtype),
+ ('is_alt', 'f8'),
+ ('is_alt_spliced', 'f8'),
+ ('is_valid', numpy.dtype),
+ ('transcript_complete', numpy.dtype),
+ ('is_complete', numpy.dtype),
+ ('is_correctly_gff3_referenced', 'S5'),
+ ('splicegraph', numpy.dtype) ]
+
+ return gene_det
+
+def open_file(fname):
+ """
+ Open the file (supports .gz .bz2) and returns the handler
+
+ @args fname: input file name for reading
+ @type fname: str
+ """
+
+ try:
+ if os.path.splitext(fname)[1] == ".gz":
+ FH = gzip.open(fname, 'rb')
+ elif os.path.splitext(fname)[1] == ".bz2":
+ FH = bz2.BZ2File(fname, 'rb')
+ else:
+ FH = open(fname, 'rU')
+ except Exception as error:
+ sys.exit(error)
+
+ return FH
+
+def add_CDS_phase(strand, cds):
+ """
+ Calculate CDS phase and add to the CDS exons
+
+ @args strand: feature strand information
+ @type strand: +/-
+ @args cds: coding exon coordinates
+ @type cds: numpy array [[int, int, int]]
+ """
+
+ cds_region, cds_flag = [], 0
+ if strand == '+':
+ for cdspos in cds:
+ if cds_flag == 0:
+ cdspos = (cdspos[0], cdspos[1], 0)
+ diff = (cdspos[1]-(cdspos[0]-1))%3
+ else:
+ xy = 0
+ if diff == 0:
+ cdspos = (cdspos[0], cdspos[1], 0)
+ elif diff == 1:
+ cdspos = (cdspos[0], cdspos[1], 2)
+ xy = 2
+ elif diff == 2:
+ cdspos = (cdspos[0], cdspos[1], 1)
+ xy = 1
+ diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
+ cds_region.append(cdspos)
+ cds_flag = 1
+ elif strand == '-':
+ cds.reverse()
+ for cdspos in cds:
+ if cds_flag == 0:
+ cdspos = (cdspos[0], cdspos[1], 0)
+ diff = (cdspos[1]-(cdspos[0]-1))%3
+ else:
+ xy = 0
+ if diff == 0:
+ cdspos = (cdspos[0], cdspos[1], 0)
+ elif diff == 1:
+ cdspos = (cdspos[0], cdspos[1], 2)
+ xy = 2
+ elif diff == 2:
+ cdspos = (cdspos[0], cdspos[1], 1)
+ xy = 1
+ diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
+ cds_region.append(cdspos)
+ cds_flag = 1
+ cds_region.reverse()
+ return cds_region
+
+def buildUTR(cc, ec, strand):
+ """
+ Build UTR regions from a given set of CDS and exon coordiantes of a gene
+
+ @args cc: coding exon coordinates
+ @type cc: numpy array [[int, int, int]]
+ @args ec: exon coordinates
+ @type ec: numpy array [[int, int]]
+ @args strand: feature strand information
+ @type strand: +/-
+ """
+
+ utr5 = []
+ utr3 = []
+ if strand == '+':
+ cds_s = cc[0][0]
+ for ex in ec:
+ if ex[0] <= cds_s and cds_s <= ex[1]:
+ if ex[0] != cds_s:utr5.append((ex[0], cds_s-1))
+ break
+ else:
+ utr5.append(ex)
+ cds_e = cc[-1][1]
+ for i in range(len(ec)):
+ i += 1
+ if ec[-i][0] <= cds_e and cds_e <= ec[-i][1]:
+ if ec[-i][1] != cds_e:utr3.append((cds_e +1, ec[-i][1]))
+ break
+ else:
+ utr3.append(ec[-i])
+ utr3.reverse()
+ elif strand == '-':
+ cds_s = cc[-1][1]
+ for i in range(len(ec)):
+ i += 1
+ if ec[-i][0] <= cds_s and cds_s <= ec[-i][1]:
+ if ec[-i][1] != cds_s:utr5.append((cds_s+1, ec[-i][1]))
+ break
+ else:
+ utr5.append(ec[-i])
+ utr5.reverse()
+ cds_e = cc[0][0]
+ for ex in ec:
+ if ex[0] <= cds_e and cds_e <= ex[1]:
+ if ex[0] != cds_e:utr3.append((ex[0], cds_e-1))
+ break
+ else:
+ utr3.append(ex)
+ return utr5, utr3
+
+def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr):
+ """
+ Create exon cordinates from UTR's and CDS region
+
+ @args strand_p: feature strand information
+ @type strand_p: +/-
+ @args five_p_utr: five prime utr exon coordinates
+ @type five_p_utr: numpy array [[int, int]]
+ @args cds_cod: coding exon coordinates
+ @type cds_cod: numpy array [[int, int, int]]
+ @args three_p_utr: three prime utr exon coordinates
+ @type three_p_utr: numpy array [[int, int]]
+ """
+
+ exon_pos = []
+ if strand_p == '+':
+ utr5_start, utr5_end = 0, 0
+ if five_p_utr != []:
+ utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
+ cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
+ jun_exon = []
+ if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
+ jun_exon = [utr5_start, cds_5end]
+ if len(cds_cod) == 1:
+ five_prime_flag = 0
+ if jun_exon != []:
+ five_p_utr = five_p_utr[:-1]
+ five_prime_flag = 1
+ for utr5 in five_p_utr:
+ exon_pos.append(utr5)
+ jun_exon = []
+ utr3_start, utr3_end = 0, 0
+ if three_p_utr != []:
+ utr3_start = three_p_utr[0][0]
+ utr3_end = three_p_utr[0][1]
+ if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
+ jun_exon = [cds_5start, utr3_end]
+ three_prime_flag = 0
+ if jun_exon != []:
+ cds_cod = cds_cod[:-1]
+ three_p_utr = three_p_utr[1:]
+ three_prime_flag = 1
+ if five_prime_flag == 1 and three_prime_flag == 1:
+ exon_pos.append([utr5_start, utr3_end])
+ if five_prime_flag == 1 and three_prime_flag == 0:
+ exon_pos.append([utr5_start, cds_5end])
+ cds_cod = cds_cod[:-1]
+ if five_prime_flag == 0 and three_prime_flag == 1:
+ exon_pos.append([cds_5start, utr3_end])
+ for cds in cds_cod:
+ exon_pos.append(cds)
+ for utr3 in three_p_utr:
+ exon_pos.append(utr3)
+ else:
+ if jun_exon != []:
+ five_p_utr = five_p_utr[:-1]
+ cds_cod = cds_cod[1:]
+ for utr5 in five_p_utr:
+ exon_pos.append(utr5)
+ exon_pos.append(jun_exon) if jun_exon != [] else ''
+ jun_exon = []
+ utr3_start, utr3_end = 0, 0
+ if three_p_utr != []:
+ utr3_start = three_p_utr[0][0]
+ utr3_end = three_p_utr[0][1]
+ cds_3start = cds_cod[-1][0]
+ cds_3end = cds_cod[-1][1]
+ if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
+ jun_exon = [cds_3start, utr3_end]
+ if jun_exon != []:
+ cds_cod = cds_cod[:-1]
+ three_p_utr = three_p_utr[1:]
+ for cds in cds_cod:
+ exon_pos.append(cds)
+ exon_pos.append(jun_exon) if jun_exon != [] else ''
+ for utr3 in three_p_utr:
+ exon_pos.append(utr3)
+ elif strand_p == '-':
+ utr3_start, utr3_end = 0, 0
+ if three_p_utr != []:
+ utr3_start = three_p_utr[-1][0]
+ utr3_end = three_p_utr[-1][1]
+ cds_3start = cds_cod[0][0]
+ cds_3end = cds_cod[0][1]
+ jun_exon = []
+ if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
+ jun_exon = [utr3_start, cds_3end]
+ if len(cds_cod) == 1:
+ three_prime_flag = 0
+ if jun_exon != []:
+ three_p_utr = three_p_utr[:-1]
+ three_prime_flag = 1
+ for utr3 in three_p_utr:
+ exon_pos.append(utr3)
+ jun_exon = []
+ (utr5_start, utr5_end) = (0, 0)
+ if five_p_utr != []:
+ utr5_start = five_p_utr[0][0]
+ utr5_end = five_p_utr[0][1]
+ if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
+ jun_exon = [cds_3start, utr5_end]
+ five_prime_flag = 0
+ if jun_exon != []:
+ cds_cod = cds_cod[:-1]
+ five_p_utr = five_p_utr[1:]
+ five_prime_flag = 1
+ if three_prime_flag == 1 and five_prime_flag == 1:
+ exon_pos.append([utr3_start, utr5_end])
+ if three_prime_flag == 1 and five_prime_flag == 0:
+ exon_pos.append([utr3_start, cds_3end])
+ cds_cod = cds_cod[:-1]
+ if three_prime_flag == 0 and five_prime_flag == 1:
+ exon_pos.append([cds_3start, utr5_end])
+ for cds in cds_cod:
+ exon_pos.append(cds)
+ for utr5 in five_p_utr:
+ exon_pos.append(utr5)
+ else:
+ if jun_exon != []:
+ three_p_utr = three_p_utr[:-1]
+ cds_cod = cds_cod[1:]
+ for utr3 in three_p_utr:
+ exon_pos.append(utr3)
+ if jun_exon != []:
+ exon_pos.append(jun_exon)
+ jun_exon = []
+ (utr5_start, utr5_end) = (0, 0)
+ if five_p_utr != []:
+ utr5_start = five_p_utr[0][0]
+ utr5_end = five_p_utr[0][1]
+ cds_5start = cds_cod[-1][0]
+ cds_5end = cds_cod[-1][1]
+ if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
+ jun_exon = [cds_5start, utr5_end]
+ if jun_exon != []:
+ cds_cod = cds_cod[:-1]
+ five_p_utr = five_p_utr[1:]
+ for cds in cds_cod:
+ exon_pos.append(cds)
+ if jun_exon != []:
+ exon_pos.append(jun_exon)
+ for utr5 in five_p_utr:
+ exon_pos.append(utr5)
+ return exon_pos
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gbk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/s_cerevisiae_SCU49845.gbk Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,165 @@
+LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
+DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
+ (AXL2) and Rev7p (REV7) genes, complete cds.
+ACCESSION U49845
+VERSION U49845.1 GI:1293613
+KEYWORDS .
+SOURCE Saccharomyces cerevisiae (baker's yeast)
+ ORGANISM Saccharomyces cerevisiae
+ Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
+ Saccharomycetales; Saccharomycetaceae; Saccharomyces.
+REFERENCE 1 (bases 1 to 5028)
+ AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
+ TITLE Cloning and sequence of REV7, a gene whose function is required for
+ DNA damage-induced mutagenesis in Saccharomyces cerevisiae
+ JOURNAL Yeast 10 (11), 1503-1509 (1994)
+ PUBMED 7871890
+REFERENCE 2 (bases 1 to 5028)
+ AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
+ TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
+ plasma membrane glycoprotein
+ JOURNAL Genes Dev. 10 (7), 777-793 (1996)
+ PUBMED 8846915
+REFERENCE 3 (bases 1 to 5028)
+ AUTHORS Roemer,T.
+ TITLE Direct Submission
+ JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
+ Haven, CT, USA
+FEATURES Location/Qualifiers
+ source 1..5028
+ /organism="Saccharomyces cerevisiae"
+ /db_xref="taxon:4932"
+ /chromosome="IX"
+ /map="9"
+ CDS <1..206
+ /codon_start=3
+ /product="TCP1-beta"
+ /protein_id="AAA98665.1"
+ /db_xref="GI:1293614"
+ /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
+ AEVLLRVDNIIRARPRTANRQHM"
+ gene 687..3158
+ /gene="AXL2"
+ CDS 687..3158
+ /gene="AXL2"
+ /note="plasma membrane glycoprotein"
+ /codon_start=1
+ /function="required for axial budding pattern of S.
+ cerevisiae"
+ /product="Axl2p"
+ /protein_id="AAA98666.1"
+ /db_xref="GI:1293615"
+ /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
+ TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
+ VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
+ VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
+ TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
+ YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
+ DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
+ DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
+ NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
+ CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
+ NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
+ SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
+ YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
+ HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
+ VDFSNKSNVNVGQVKDIHGRIPEML"
+ gene complement(3300..4037)
+ /gene="REV7"
+ CDS complement(3300..4037)
+ /gene="REV7"
+ /codon_start=1
+ /product="Rev7p"
+ /protein_id="AAA98667.1"
+ /db_xref="GI:1293616"
+ /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
+ FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
+ KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
+ RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
+ LISGDDKILNGVYSQYEEGESIFGSLF"
+ORIGIN
+ 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
+ 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
+ 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
+ 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
+ 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
+ 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
+ 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
+ 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
+ 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
+ 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
+ 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
+ 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
+ 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
+ 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
+ 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
+ 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
+ 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
+ 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
+ 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
+ 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
+ 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
+ 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
+ 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
+ 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
+ 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
+ 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
+ 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
+ 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
+ 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
+ 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
+ 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
+ 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
+ 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
+ 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
+ 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
+ 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
+ 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
+ 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
+ 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
+ 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
+ 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
+ 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
+ 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
+ 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
+ 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
+ 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
+ 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
+ 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
+ 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
+ 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
+ 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
+ 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
+ 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
+ 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
+ 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
+ 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
+ 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
+ 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
+ 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
+ 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
+ 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
+ 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
+ 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
+ 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
+ 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
+ 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
+ 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
+ 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
+ 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
+ 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
+ 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
+ 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
+ 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
+ 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
+ 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
+ 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
+ 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
+ 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
+ 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
+ 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
+ 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
+ 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
+ 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
+ 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
+//
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/s_cerevisiae_SCU49845.gff3 Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,8 @@
+IX gbk_to_gff gene 687 3158 . + . ID=AXL2;Name=AXL2
+IX gbk_to_gff . 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2
+IX gbk_to_gff CDS 687 3158 . + . Parent=Transcript:AXL2
+IX gbk_to_gff exon 687 3158 . + . Parent=Transcript:AXL2
+IX gbk_to_gff gene 3300 4037 . - . ID=REV7;Name=REV7
+IX gbk_to_gff . 3300 4037 . - . ID=Transcript:REV7;Parent=REV7
+IX gbk_to_gff CDS 3300 4037 . - . Parent=Transcript:REV7
+IX gbk_to_gff exon 3300 4037 . - . Parent=Transcript:REV7
diff -r 619e0fcd9126 -r 6e589f267c14 test-data/single_parent_feature_record.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/single_parent_feature_record.gff3 Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,10 @@
+chr1 . miRNA_primary_transcript 1380242 1380467 . - . ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754
+chr1 . miRNA 1380249 1380270 . - . ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047
+chr1 . miRNA_primary_transcript 2410094 2410318 . + . ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367
+chr1 . miRNA 2410242 2410263 . + . ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507
+chr1 . miRNA_primary_transcript 4792375 4792487 . - . ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h
+chr1 . miRNA 4792388 4792408 . - . ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714
+chr1 . miRNA_primary_transcript 4797903 4798018 . - . ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i
+chr1 . miRNA 4797916 4797936 . - . ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715
+chr1 . miRNA_primary_transcript 4810817 4810942 . - . ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j
+chr1 . miRNA 4810830 4810850 . - . ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716
diff -r 619e0fcd9126 -r 6e589f267c14 tool_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_conf.xml.sample Tue Nov 04 12:15:19 2014 -0500
@@ -0,0 +1,9 @@
+