# HG changeset patch # User devteam # Date 1415121319 18000 # Node ID 6e589f267c1421ecb5b6760c947009ed159bb280 # Parent 619e0fcd9126df9ca8f7c1b1a8e3abc097abfee9 Uploaded diff -r 619e0fcd9126 -r 6e589f267c14 GFFParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GFFParser.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,495 @@ +#!/usr/bin/env python +""" +Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file. + +Requirements: + Numpy :- http://numpy.org/ + Scipy :- http://scipy.org/ + +Copyright (C) + +2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. +2012-2014 Memorial Sloan Kettering Cancer Center, New York City, USA. +""" + +import re +import os +import sys +import urllib +import numpy as np +import scipy.io as sio +from collections import defaultdict +import helper as utils + +def attribute_tags(col9): + """ + Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file + + @args col9: attribute column from GFF file + @type col9: str + """ + info = defaultdict(list) + is_gff = False + + if not col9: + return is_gff, info + + # trim the line ending semi-colon ucsc may have some white-space + col9 = col9.rstrip(';| ') + # attributes from 9th column + atbs = col9.split(" ; ") + if len(atbs) == 1: + atbs = col9.split("; ") + if len(atbs) == 1: + atbs = col9.split(";") + # check the GFF3 pattern which has key value pairs like: + gff3_pat = re.compile("\w+=") + # sometime GTF have: gene_id uc002zkg.1; + gtf_pat = re.compile("\s?\w+\s") + + key_vals = [] + + if gff3_pat.match(atbs[0]): # gff3 pattern + is_gff = True + key_vals = [at.split('=') for at in atbs] + elif gtf_pat.match(atbs[0]): # gtf pattern + for at in atbs: + key_vals.append(at.strip().split(" ",1)) + else: + # to handle attribute column has only single value + key_vals.append(['ID', atbs[0]]) + # get key, val items + for item in key_vals: + key, val = item + # replace the double qoutes from feature identifier + val = re.sub('"', '', val) + # replace the web formating place holders to plain text format + info[key].extend([urllib.unquote(v) for v in val.split(',') if v]) + + return is_gff, info + +def spec_features_keywd(gff_parts): + """ + Specify the feature key word according to the GFF specifications + + @args gff_parts: attribute field key + @type gff_parts: str + """ + for t_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["info"]["Parent"] = gff_parts["info"][t_id] + break + except KeyError: + pass + for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]: + try: + gff_parts["info"]["GParent"] = gff_parts["info"][g_id] + break + except KeyError: + pass + ## TODO key words + for flat_name in ["Transcript", "CDS"]: + if gff_parts["info"].has_key(flat_name): + # parents + if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE): + if not gff_parts['id']: + gff_parts['id'] = gff_parts['info'][flat_name][0] + #gff_parts["info"]["ID"] = [gff_parts["id"]] + # children + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["info"]["Parent"] = gff_parts["info"][flat_name] + break + return gff_parts + +def Parse(ga_file): + """ + Parsing GFF/GTF file based on feature relationship, it takes the input file. + + @args ga_file: input file name + @type ga_file: str + """ + child_map = defaultdict(list) + parent_map = dict() + + ga_handle = utils.open_file(ga_file) + + for rec in ga_handle: + rec = rec.strip('\n\r') + + # skip empty line fasta identifier and commented line + if not rec or rec[0] in ['#', '>']: + continue + # skip the genome sequence + if not re.search('\t', rec): + continue + + parts = rec.split('\t') + assert len(parts) >= 8, rec + + # process the attribute column (9th column) + ftype, tags = attribute_tags(parts[-1]) + if not tags: # skip the line if no attribute column. + continue + + # extract fields + if parts[1]: + tags["source"] = parts[1] + if parts[7]: + tags["phase"] = parts[7] + + gff_info = dict() + gff_info['info'] = dict(tags) + gff_info["is_gff3"] = ftype + gff_info['chr'] = parts[0] + gff_info['score'] = parts[5] + + if parts[3] and parts[4]: + gff_info['location'] = [int(parts[3]) , + int(parts[4])] + gff_info['type'] = parts[2] + gff_info['id'] = tags.get('ID', [''])[0] + if parts[6] in ['?', '.']: + parts[6] = None + gff_info['strand'] = parts[6] + + # key word according to the GFF spec. + # is_gff3 flag is false check this condition and get the attribute fields + if not ftype: + gff_info = spec_features_keywd(gff_info) + + # link the feature relationships + if gff_info['info'].has_key('Parent'): + for p in gff_info['info']['Parent']: + if p == gff_info['id']: + gff_info['id'] = '' + break + rec_category = 'child' + elif gff_info['id']: + rec_category = 'parent' + else: + rec_category = 'record' + + # depends on the record category organize the features + if rec_category == 'child': + for p in gff_info['info']['Parent']: + # create the data structure based on source and feature id + child_map[(gff_info['chr'], gff_info['info']['source'], p)].append( + dict( type = gff_info['type'], + location = gff_info['location'], + strand = gff_info['strand'], + score = gff_info['score'], + ID = gff_info['id'], + gene_id = gff_info['info'].get('GParent', '') + )) + elif rec_category == 'parent': + parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict( + type = gff_info['type'], + location = gff_info['location'], + strand = gff_info['strand'], + score = gff_info['score'], + name = tags.get('Name', [''])[0]) + elif rec_category == 'record': + #TODO how to handle plain records? + c = 1 + ga_handle.close() + + # depends on file type create parent feature + if not ftype: + parent_map, child_map = create_missing_feature_type(parent_map, child_map) + + # connecting parent child relations + # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file + gene_mat = format_gene_models(parent_map, child_map) + + return gene_mat + +def format_gene_models(parent_nf_map, child_nf_map): + """ + Genarate GeneObject based on the parsed file contents + + @args parent_nf_map: parent features with source and chromosome information + @type parent_nf_map: collections defaultdict + @args child_nf_map: transctipt and exon information are encoded + @type child_nf_map: collections defaultdict + """ + + g_cnt = 0 + gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene()) + + for pkey, pdet in parent_nf_map.items(): + # considering only gene features + #if not re.search(r'gene', pdet.get('type', '')): + # continue + + # infer the gene start and stop if not there in the + if not pdet.get('location', []): + GNS, GNE = [], [] + # multiple number of transcripts + for L1 in child_nf_map[pkey]: + GNS.append(L1.get('location', [])[0]) + GNE.append(L1.get('location', [])[1]) + GNS.sort() + GNE.sort() + pdet['location'] = [GNS[0], GNE[-1]] + + orient = pdet.get('strand', '') + gene[g_cnt]['id'] = g_cnt +1 + gene[g_cnt]['chr'] = pkey[0] + gene[g_cnt]['source'] = pkey[1] + gene[g_cnt]['name'] = pkey[-1] + gene[g_cnt]['start'] = pdet.get('location', [])[0] + gene[g_cnt]['stop'] = pdet.get('location', [])[1] + gene[g_cnt]['strand'] = orient + gene[g_cnt]['score'] = pdet.get('score','') + + # default value + gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0 + if len(child_nf_map[pkey]) > 1: + gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1 + + # complete sub-feature for all transcripts + dim = len(child_nf_map[pkey]) + TRS = np.zeros((dim,), dtype=np.object) + TR_TYP = np.zeros((dim,), dtype=np.object) + EXON = np.zeros((dim,), dtype=np.object) + UTR5 = np.zeros((dim,), dtype=np.object) + UTR3 = np.zeros((dim,), dtype=np.object) + CDS = np.zeros((dim,), dtype=np.object) + TISc = np.zeros((dim,), dtype=np.object) + TSSc = np.zeros((dim,), dtype=np.object) + CLV = np.zeros((dim,), dtype=np.object) + CSTOP = np.zeros((dim,), dtype=np.object) + TSTAT = np.zeros((dim,), dtype=np.object) + + # fetching corresponding transcripts + for xq, Lv1 in enumerate(child_nf_map[pkey]): + + TID = Lv1.get('ID', '') + TRS[xq]= np.array([TID]) + + TYPE = Lv1.get('type', '') + TR_TYP[xq] = np.array('') + TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq] + + orient = Lv1.get('strand', '') + + # fetching different sub-features + child_feat = defaultdict(list) + for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]: + E_TYP = Lv2.get('type', '') + child_feat[E_TYP].append(Lv2.get('location')) + + # make general ascending order of coordinates + if orient == '-': + for etype, excod in child_feat.items(): + if len(excod) > 1: + if excod[0][0] > excod[-1][0]: + excod.reverse() + child_feat[etype] = excod + + # make exon coordinate from cds and utr regions + if not child_feat.get('exon'): + if child_feat.get('CDS'): + exon_cod = utils.make_Exon_cod( orient, + NonetoemptyList(child_feat.get('five_prime_UTR')), + NonetoemptyList(child_feat.get('CDS')), + NonetoemptyList(child_feat.get('three_prime_UTR'))) + child_feat['exon'] = exon_cod + else: + # TODO only UTR's + # searching through keys to find a pattern describing exon feature + ex_key_pattern = [k for k in child_feat if k.endswith("exon")] + if ex_key_pattern: + child_feat['exon'] = child_feat[ex_key_pattern[0]] + + # stop_codon are seperated from CDS, add the coordinates based on strand + if child_feat.get('stop_codon'): + if orient == '+': + if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1: + child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]] + else: + child_feat['CDS'].append(child_feat.get('stop_codon')[0]) + elif orient == '-': + if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1: + child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]] + else: + child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0]) + + # transcript signal sites + TIS, cdsStop, TSS, cleave = [], [], [], [] + cds_status, exon_status, utr_status = 0, 0, 0 + + if child_feat.get('exon'): + TSS = [child_feat.get('exon')[-1][1]] + TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS + cleave = [child_feat.get('exon')[0][0]] + cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave + exon_status = 1 + + if child_feat.get('CDS'): + if orient == '+': + TIS = [child_feat.get('CDS')[0][0]] + cdsStop = [child_feat.get('CDS')[-1][1]-3] + else: + TIS = [child_feat.get('CDS')[-1][1]] + cdsStop = [child_feat.get('CDS')[0][0]+3] + cds_status = 1 + # cds phase calculation + child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS')) + + # sub-feature status + if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'): + utr_status =1 + + if utr_status == cds_status == exon_status == 1: + t_status = 1 + else: + t_status = 0 + + # add sub-feature # make array for export to different out + TSTAT[xq] = t_status + EXON[xq] = np.array(child_feat.get('exon'), np.float64) + UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR'))) + UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR'))) + CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS'))) + TISc[xq] = np.array(TIS) + CSTOP[xq] = np.array(cdsStop) + TSSc[xq] = np.array(TSS) + CLV[xq] = np.array(cleave) + + # add sub-features to the parent gene feature + gene[g_cnt]['transcript_status'] = TSTAT + gene[g_cnt]['transcripts'] = TRS + gene[g_cnt]['exons'] = EXON + gene[g_cnt]['utr5_exons'] = UTR5 + gene[g_cnt]['cds_exons'] = CDS + gene[g_cnt]['utr3_exons'] = UTR3 + gene[g_cnt]['transcript_type'] = TR_TYP + gene[g_cnt]['tis'] = TISc + gene[g_cnt]['cdsStop'] = CSTOP + gene[g_cnt]['tss'] = TSSc + gene[g_cnt]['cleave'] = CLV + + gene[g_cnt]['gene_info'] = dict( ID = pkey[-1], + Name = pdet.get('name'), + Source = pkey[1]) + # few empty fields // TODO fill this: + gene[g_cnt]['anno_id'] = [] + gene[g_cnt]['confgenes_id'] = [] + gene[g_cnt]['alias'] = '' + gene[g_cnt]['name2'] = [] + gene[g_cnt]['chr_num'] = [] + gene[g_cnt]['paralogs'] = [] + gene[g_cnt]['transcript_info'] = [] + gene[g_cnt]['transcript_valid'] = [] + gene[g_cnt]['exons_confirmed'] = [] + gene[g_cnt]['tis_conf'] = [] + gene[g_cnt]['tis_info'] = [] + gene[g_cnt]['cdsStop_conf'] = [] + gene[g_cnt]['cdsStop_info'] = [] + gene[g_cnt]['tss_info'] = [] + gene[g_cnt]['tss_conf'] = [] + gene[g_cnt]['cleave_info'] = [] + gene[g_cnt]['cleave_conf'] = [] + gene[g_cnt]['polya_info'] = [] + gene[g_cnt]['polya_conf'] = [] + gene[g_cnt]['is_valid'] = [] + gene[g_cnt]['transcript_complete'] = [] + gene[g_cnt]['is_complete'] = [] + gene[g_cnt]['is_correctly_gff3_referenced'] = '' + gene[g_cnt]['splicegraph'] = [] + g_cnt += 1 + + ## deleting empty gene records from the main array + XPFLG=0 + for XP, ens in enumerate(gene): + if ens[0]==0: + XPFLG=1 + break + + if XPFLG==1: + XQC = range(XP, len(gene)+1) + gene = np.delete(gene, XQC) + + return gene + +def NonetoemptyList(XS): + """ + Convert a None type to empty list + + @args XS: None type + @type XS: str + """ + return [] if XS is None else XS + +def create_missing_feature_type(p_feat, c_feat): + """ + GFF/GTF file defines only child features. This function tries to create + the parent feature from the information provided in the attribute column. + + example: + chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1"; + chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1"; + chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2"; + + This function gets the parsed feature annotations. + + @args p_feat: Parent feature map + @type p_feat: collections defaultdict + @args c_feat: Child feature map + @type c_feat: collections defaultdict + """ + + child_n_map = defaultdict(list) + for fid, det in c_feat.items(): + # get the details from grand child + GID = STRD = SCR = None + SPOS, EPOS = [], [] + TYP = dict() + for gchild in det: + GID = gchild.get('gene_id', [''])[0] + SPOS.append(gchild.get('location', [])[0]) + EPOS.append(gchild.get('location', [])[1]) + STRD = gchild.get('strand', '') + SCR = gchild.get('score', '') + if gchild.get('type', '') == "gene": ## gencode GTF file has this problem + continue + TYP[gchild.get('type', '')] = 1 + SPOS.sort() + EPOS.sort() + + # infer transcript type + transcript_type = 'transcript' + transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type + + # gene id and transcript id are same + transcript_id = fid[-1] + if GID == transcript_id: + transcript_id = 'Transcript:' + str(GID) + + # level -1 feature type + p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene', + location = [], ## infer location based on multiple transcripts + strand = STRD, + name = GID ) + # level -2 feature type + child_n_map[(fid[0], fid[1], GID)].append( + dict( type = transcript_type, + location = [SPOS[0], EPOS[-1]], + strand = STRD, + score = SCR, + ID = transcript_id, + gene_id = '' )) + # reorganizing the grand child + for gchild in det: + child_n_map[(fid[0], fid[1], transcript_id)].append( + dict( type = gchild.get('type', ''), + location = gchild.get('location'), + strand = gchild.get('strand'), + ID = gchild.get('ID'), + score = gchild.get('score'), + gene_id = '' )) + return p_feat, child_n_map + diff -r 619e0fcd9126 -r 6e589f267c14 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,61 @@ +A collection of tools for converting genome annotation between GTF (Gene Transfer Format), +BED (Browser Extensible Data) and GFF (Generic Feature Format). + +INTRODUCTION + +Several genome annotation centers provide their data in GTF, BED, GFF3 etc. I have few programs +they mainly deals with converting between GTF, BED and GFF3 formats. They are extensively tested +with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the +instructions below to clone these tools into your galaxy instance. + +CONTENTS + +Tool configuration files in *.xml format. + + gtf_to_gff.xml + gff_to_gtf.xml + bed_to_gff.xml + gff_to_bed.xml + gbk_to_gff.xml + gff_fmap.xml + +Python based scripts. + + gtf_to_gff.py: convert data from GTF to valid GFF3. + gff_to_gtf.py: convert data from GFF3 to GTF. + bed_to_gff.py: convert data from a 12 column UCSC wiggle BED format to GFF3. + gff_to_bed.py: convert gene transcript annotation from GFF3 to UCSC wiggle 12 column BED format. + gbk_to_gff.py: convert data from genbank format to GFF. + gff_fmap.py: find the relation between different features described in a GFF file. + GFFParser.py: Parse GFF/GTF files. + helper.py: Utility functions. + +test-data: Test data set. (move to your galaxy_root_folder/test-data/) + + You may need to move the test files into your test-data directory so galaxy can find them. + If you want to run the functional tests eg as: + + exmaple: + sh run_functional_tests.sh -id fml_gtf2gff + +REQUIREMENTS + + python + +COMMENTS/QUESTIONS + +I can be reached at vipin [at] cbio.mskcc.org + +LICENSE + +Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society + 2013-2014 Memorial Sloan Kettering Cancer Center + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +COURTESY + +To the Galaxy Team. diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_to_gff.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,70 @@ +#!/usr/bin/env python +""" +Convert genome annotation data in a 12 column BED format to GFF3. + +Usage: python bed_to_gff.py in.bed > out.gff + +Requirement: + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import helper + +def __main__(): + """ + main function + """ + + try: + bed_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + bed_fh = helper.open_file(bed_fname) + + for line in bed_fh: + line = line.strip( '\n\r' ) + + if not line or line[0] in ['#']: + continue + + parts = line.split('\t') + assert len(parts) >= 12, line + + rstarts = parts[-1].split(',') + rstarts.pop() if rstarts[-1] == '' else rstarts + + exon_lens = parts[-2].split(',') + exon_lens.pop() if exon_lens[-1] == '' else exon_lens + + if len(rstarts) != len(exon_lens): + continue # checking the consistency col 11 and col 12 + + if len(rstarts) != int(parts[-3]): + continue # checking the number of exons and block count are same + + if not parts[5] in ['+', '-']: + parts[5] = '.' # replace the unknown strand with '.' + + # bed2gff result line + print '%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3]) + print '%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3]) + + st = int(parts[1]) + for ex_cnt in range(int(parts[-3])): + start = st + int(rstarts[ex_cnt]) + 1 + stop = start + int(exon_lens[ex_cnt]) - 1 + print '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s' % (parts[0], start, stop, parts[4], parts[5], parts[3]) + + bed_fh.close() + + +if __name__ == "__main__": + __main__() diff -r 619e0fcd9126 -r 6e589f267c14 bed_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,89 @@ + + converter + bed_to_gff.py $inf_bed > $gff_format + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool converts data from a 12 column UCSC wiggle BED format to GFF3 (scroll down for format description). + +-------- + +**Example** + +- The following data in UCSC Wiggle BED format:: + + chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347, + +- Will be converted to GFF3:: + + ##gff-version 3 + chr1 bed2gff gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 + chr1 bed2gff transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 + chr1 bed2gff exon 11874 12227 0 + . Parent=uc001aaa.3 + chr1 bed2gff exon 12613 12721 0 + . Parent=uc001aaa.3 + chr1 bed2gff exon 13221 14409 0 + . Parent=uc001aaa.3 + +-------- + +**About formats** + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: + +The first three BED fields (required) are:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + +The additional BED fields (optional) are:: + + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,213 @@ +#!/usr/bin/env python +""" +Convert data from Genbank format to GFF. + +Usage: +python gbk_to_gff.py in.gbk > out.gff + +Requirements: + BioPython:- http://biopython.org/ + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import os +import re +import sys +import collections +from Bio import SeqIO +import helper + +def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): + """ + Write the feature information + """ + + for gname, ginfo in genes.items(): + line = [str(chr_id), + 'gbk_to_gff', + ginfo[3], + str(ginfo[0]), + str(ginfo[1]), + '.', + ginfo[2], + '.', + 'ID=%s;Name=%s' % (str(gname), str(gname))] + print '\t'.join(line) + ## construct the transcript line is not defined in the original file + t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.'] + + if not transcripts: + t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) + + if exons: ## get the entire transcript region from the defined feature + t_line[3] = str(exons[gname][0][0]) + t_line[4] = str(exons[gname][0][-1]) + elif cds: + t_line[3] = str(cds[gname][0][0]) + t_line[4] = str(cds[gname][0][-1]) + print '\t'.join(t_line) + + if exons: + exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') + + if cds: + exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') + if not exons: + exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') + + else: ## transcript is defined + for idx in transcripts[gname]: + t_line[2] = idx[3] + t_line[3] = str(idx[0]) + t_line[4] = str(idx[1]) + t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) + print '\t'.join(t_line) + + ## feature line print call + if exons: + exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') + if cds: + exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') + if not exons: + exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') + + if len(genes) == 0: ## feature entry with fragment information + + line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.'] + fStart = fStop = None + + for eid, ex in cds.items(): + fStart = ex[0][0] + fStop = ex[0][-1] + + for eid, ex in exons.items(): + fStart = ex[0][0] + fStop = ex[0][-1] + + if fStart or fStart: + + line[2] = 'gene' + line[3] = str(fStart) + line[4] = str(fStop) + line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) + print "\t".join(line) + + if not cds: + line[2] = 'transcript' + else: + line[2] = 'mRNA' + + line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) + print "\t".join(line) + + if exons: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') + + if cds: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') + if not exons: + exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') + + unk +=1 + + return unk + +def exon_line_print(temp_line, trx_exons, parent, ftype): + """ + Print the EXON feature line + """ + + for ex in trx_exons: + temp_line[2] = ftype + temp_line[3] = str(ex[0]) + temp_line[4] = str(ex[1]) + temp_line[8] = 'Parent=%s' % parent + print '\t'.join(temp_line) + +def gbk_parse(fname): + """ + Extract genome annotation recods from genbank format + + @args fname: gbk file name + @type fname: str + """ + + fhand = helper.open_file(gbkfname) + unk = 1 + + for record in SeqIO.parse(fhand, "genbank"): + + gene_tags = dict() + tx_tags = collections.defaultdict(list) + exon = collections.defaultdict(list) + cds = collections.defaultdict(list) + mol_type, chr_id = None, None + + for rec in record.features: + + if rec.type == 'source': + try: + mol_type = rec.qualifiers['mol_type'][0] + except: + mol_type = '.' + pass + try: + chr_id = rec.qualifiers['chromosome'][0] + except: + chr_id = record.name + continue + + strand='-' + strand='+' if rec.strand>0 else strand + + fid = None + try: + fid = rec.qualifiers['gene'][0] + except: + pass + + transcript_id = None + try: + transcript_id = rec.qualifiers['transcript_id'][0] + except: + pass + + if re.search(r'gene', rec.type): + gene_tags[fid] = (rec.location._start.position+1, + rec.location._end.position, + strand, + rec.type + ) + elif rec.type == 'exon': + exon[fid].append((rec.location._start.position+1, + rec.location._end.position)) + elif rec.type=='CDS': + cds[fid].append((rec.location._start.position+1, + rec.location._end.position)) + else: + # get all transcripts + if transcript_id: + tx_tags[fid].append((rec.location._start.position+1, + rec.location._end.position, + transcript_id, + rec.type)) + # record extracted, generate feature table + unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) + + fhand.close() + + +if __name__=='__main__': + + try: + gbkfname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + ## extract gbk records + gbk_parse(gbkfname) diff -r 619e0fcd9126 -r 6e589f267c14 gbk_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,91 @@ + + converter + gbk_to_gff.py $inf_gbk > $gff_format + + + + + + + + + + + + + + + +**What it does** + +This tool converts data from a GenBank_ flat file format to GFF (scroll down for format description). + +.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ + +------ + +**Example** + +- The following data in GenBank format:: + + LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 + DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) + mRNA, complete cds. + ACCESSION NM_001202705 + VERSION NM_001202705.1 GI:334184566......... + FEATURES Location/Qualifiers + source 1..2406 + /organism="Arabidopsis thaliana" + /mol_type="mRNA" + /db_xref="taxon:3702"........ + gene 1..2406 + /gene="THIC" + /locus_tag="AT2G29630" + /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ + ORIGIN + 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg + 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat + 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc + 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca + // + + +- Will be converted to GFF3:: + + ##gff-version 3 + NM_001202705 gbk_to_gff chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705 + NM_001202705 gbk_to_gff gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC + NM_001202705 gbk_to_gff mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 + NM_001202705 gbk_to_gff CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01 + NM_001202705 gbk_to_gff exon 192 2126 . + 1 Parent=AT2G29630.t01 + +------ + +**About formats** + +**GenBank format** An example of a GenBank record may be viewed here_ + +.. _here: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + +**GFF3** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_fmap.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,203 @@ +#!/usr/bin/env python +""" +GFF feature mapping program, to find the relation between different features described in a given GFF file. + +Usage: +python gff_fmap.py in.gff > out.txt + +Courtesy: Brad Chapman + Few functions are inherited from bcbio-GFFParser program. +""" + +import re +import sys +import urllib +import collections +from helper import open_file + +def _gff_line_map(line): + """Parses a line of GFF into a dictionary. + Given an input line from a GFF file, this: + - breaks it into component elements + - determines the type of attribute (flat, parent, child or annotation) + - generates a dictionary of GFF info + """ + gff3_kw_pat = re.compile("\w+=") + def _split_keyvals(keyval_str): + """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. + GFF3 has key value pairs like: + count=9;gene=amx-2;sequence=SAGE:aacggagccg + GFF2 and GTF have: + Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" + name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 + """ + quals = collections.defaultdict(list) + if keyval_str is None: + return quals + # ensembl GTF has a stray semi-colon at the end + if keyval_str[-1] == ';': + keyval_str = keyval_str[:-1] + # GFF2/GTF has a semi-colon with at least one space after it. + # It can have spaces on both sides; wormbase does this. + # GFF3 works with no spaces. + # Split at the first one we can recognize as working + parts = keyval_str.split(" ; ") + if len(parts) == 1: + parts = keyval_str.split("; ") + if len(parts) == 1: + parts = keyval_str.split(";") + # check if we have GFF3 style key-vals (with =) + is_gff2 = True + if gff3_kw_pat.match(parts[0]): + is_gff2 = False + key_vals = [p.split('=') for p in parts] + # otherwise, we are separated by a space with a key as the first item + else: + pieces = [] + for p in parts: + # fix misplaced semi-colons in keys in some GFF2 files + if p and p[0] == ';': + p = p[1:] + pieces.append(p.strip().split(" ")) + key_vals = [(p[0], " ".join(p[1:])) for p in pieces] + for key, val in key_vals: + # remove quotes in GFF2 files + if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): + val = val[1:-1] + if val: + quals[key].extend(val.split(',')) + # if we don't have a value, make this a key=True/False style + # attribute + else: + quals[key].append('true') + for key, vals in quals.items(): + quals[key] = [urllib.unquote(v) for v in vals] + return quals, is_gff2 + + def _nest_gff2_features(gff_parts): + """Provide nesting of GFF2 transcript parts with transcript IDs. + + exons and coding sequences are mapped to a parent with a transcript_id + in GFF2. This is implemented differently at different genome centers + and this function attempts to resolve that and map things to the GFF3 + way of doing them. + """ + # map protein or transcript ids to a parent + for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["quals"]["Parent"] = \ + gff_parts["quals"][transcript_id] + break + except KeyError: + pass + # case for WormBase GFF -- everything labelled as Transcript or CDS + for flat_name in ["Transcript", "CDS"]: + if gff_parts["quals"].has_key(flat_name): + # parent types + if gff_parts["type"] in [flat_name]: + if not gff_parts["id"]: + gff_parts["id"] = gff_parts["quals"][flat_name][0] + gff_parts["quals"]["ID"] = [gff_parts["id"]] + # children types + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] + break + + return gff_parts + + line = line.strip() + if line == '':return [('directive', line)] # sometimes the blank lines will be there + if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header + if line[0] == "#": + return [('directive', line[2:])] + elif line: + parts = line.split('\t') + if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together + assert len(parts) == 9, line + gff_parts = [(None if p == '.' else p) for p in parts] + gff_info = dict() + + # collect all of the base qualifiers for this item + quals, is_gff2 = _split_keyvals(gff_parts[8]) + + gff_info["is_gff2"] = is_gff2 + + if gff_parts[1]:quals["source"].append(gff_parts[1]) + gff_info['quals'] = dict(quals) + + # if we are describing a location, then we are a feature + if gff_parts[3] and gff_parts[4]: + gff_info['type'] = gff_parts[2] + gff_info['id'] = quals.get('ID', [''])[0] + + if is_gff2:gff_info = _nest_gff2_features(gff_info) + # features that have parents need to link so we can pick up + # the relationship + if gff_info['quals'].has_key('Parent'): + final_key = 'child' + elif gff_info['id']: + final_key = 'parent' + # Handle flat features + else: + final_key = 'feature' + # otherwise, associate these annotations with the full record + else: + final_key = 'annotation' + return [(final_key, gff_info)] + +def parent_child_id_map(gff_handle): + """ + Provide a mapping of parent to child relationships in the file. + Gives a dictionary of parent child relationships: + + keys -- tuple of (source, type) for each parent + values -- tuple of (source, type) as children of that parent + """ + # collect all of the parent and child types mapped to IDs + parent_sts = dict() + child_sts = collections.defaultdict(list) + for line in gff_handle: + line_type, line_info = _gff_line_map(line)[0] + if (line_type == 'parent' or (line_type == 'child' and line_info['id'])): + parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type']) + if line_type == 'child': + for parent_id in line_info['quals']['Parent']: + child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type'])) + gff_handle.close() + # generate a dictionary of the unique final type relationships + pc_map = collections.defaultdict(list) + for parent_id, parent_type in parent_sts.items(): + for child_type in child_sts[parent_id]: + pc_map[parent_type].append(child_type) + pc_final_map = dict() + for ptype, ctypes in pc_map.items(): + unique_ctypes = list(set(ctypes)) + unique_ctypes.sort() + pc_final_map[ptype] = unique_ctypes + # some cases the GFF file represents a single feature type + if not pc_final_map: + for fid, stypes in parent_sts.items(): + pc_final_map[stypes] = dict() + # generate a report on feature id mapping in the file + print '+---------------------+---------------------------------+' + print '| Parent feature type | Associated child feature type(s)|' + print '+---------------------+---------------------------------+' + for key, value in pc_final_map.items(): + print key[0], key[1] + for child_to in value: + print '\t\t\t|-',child_to[0], child_to[1] + print '+---------------------+---------------------------------+' + + +if __name__=='__main__': + + try: + gff_file = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + gff_handle = open_file(gff_file) + parent_child_id_map(gff_handle) diff -r 619e0fcd9126 -r 6e589f267c14 gff_fmap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_fmap.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,83 @@ + + features + + gff_fmap.py $gff_input > $idmapping + + + + + + + + + + + + + + + + + + + +**What it does** + +GFF-map provides the features (gene, mRNA, UTR's, exon, CDS etc) relationship based on their identifier mapping in a given GFF file. + +-------- + +**Example** + +- The features ID mapping in the following data in GFF3:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + 0 ID=ENSP00000328352;Name=ENSP00000328352 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 + +- Will be displayed as:: + + +-----------------------+---------------------------------+ + | Parent feature type | Associated child feature type(s)| + +-----------------------+---------------------------------+ + | protein_coding gene | protein_coding mRNA | + +-----------------------+---------------------------------+ + | protein_coding protein| protein_coding CDS | + +-----------------------+---------------------------------+ + | protein_coding mRNA | protein_coding CDS | + | | protein_coding exon | + | | protein_coding five_prime_UTR | + | | protein_coding three_prime_UTR | + +-----------------------+---------------------------------+ + +-------- + +**About formats** + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_bed.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,96 @@ +#!/usr/bin/env python +""" +Convert genome annotation data in GFF/GTF to a 12 column BED format. +BED format typically represents the transcript models. + +Usage: python gff_to_bed.py in.gff > out.bed + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import GFFParser + +def writeBED(tinfo): + """ + writing result files in bed format + + @args tinfo: list of genes + @args tinfo: numpy object + """ + + for ent1 in tinfo: + child_flag = False + + for idx, tid in enumerate(ent1['transcripts']): + child_flag = True + exon_cnt = len(ent1['exons'][idx]) + exon_len = '' + exon_cod = '' + rel_start = None + rel_stop = None + for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript + exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1) + if idz == 0: #calculate the relative start position + exon_cod += '0,' + rel_start = int(ex_cod[0]) + rel_stop = ex_cod[1] + else: + exon_cod += '%d,' % (ex_cod[0]-rel_start) + rel_stop = int(ex_cod[1]) + + if exon_len: + score = '0' + score = ent1['score'][0] if ent1['score'] else score + out_print = [ent1['chr'], + str(rel_start), + str(rel_stop), + tid[0], + score, + ent1['strand'], + str(rel_start), + str(rel_stop), + '0', + str(exon_cnt), + exon_len, + exon_cod] + print '\t'.join(out_print) + + if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type + score = '0' + score = ent1['score'][0] if ent1['score'] else score + + out_print = [ent1['chr'], + '%d' % int(ent1['start']), + '%d' % int(ent1['stop']), + ent1['name'], + score, + ent1['strand'], + '%d' % int(ent1['start']), + '%d' % int(ent1['stop']), + '0', + '1', + '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), + '0,'] + + print '\t'.join(out_print) + + +def __main__(): + try: + query_file = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + Transcriptdb = GFFParser.Parse(query_file) + writeBED(Transcriptdb) + +if __name__ == "__main__": + __main__() diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_bed.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,90 @@ + + converter + gff_to_bed.py $inf_gff > $bed_format + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool converts gene transcript annotation from GTF or GFF or GFF3 to UCSC wiggle 12 column BED format. + +-------- + +**Example** + +- The following data in GFF3:: + + ##gff-version 3 + chr1 protein_coding gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3 + chr1 protein_coding transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3 + chr1 protein_coding exon 11874 12227 0 + . Parent=uc001aaa.3 + chr1 protein_coding exon 12613 12721 0 + . Parent=uc001aaa.3 + chr1 protein_coding exon 13221 14409 0 + . Parent=uc001aaa.3 + +- Will be converted to UCSC Wiggle BED format:: + + chr1 11874 14409 uc001aaa.3 0 + 11874 14409 0 3 354,109,1189, 0,739,1347, + +-------- + +**About formats** + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: + +The first three BED fields (required) are:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + +The additional BED fields (optional) are:: + + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gbk.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,54 @@ +#!/usr/bin/env python +""" +Convert data from GFF and associated genome sequence in fasta file into GenBank. + +Usage: +python gff_to_gbk.py in.gff in.fasta out.gbk + +Requirements: + BioPython:- http://biopython.org/ + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import sys +import helper +import gffparser_bcbio + +from Bio import SeqIO +from Bio.Alphabet import generic_dna + +def __main__(): + """ + main wrapper + """ + + try: + gff_fname = sys.argv[1] + fasta_fname = sys.argv[2] + gb_fname = sys.argv[3] + except: + print __doc__ + sys.exit(-1) + + fasta_fh = helper.open_file(fasta_fname) + + fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fh, "fasta", generic_dna)) + fasta_fh.close() + + gff_rec = gffparser_bcbio.parse(gff_fname, fasta_rec) + + try: + gb_fh = open(gb_fname, "w") + except: + print 'file not ready for writing %s' % gb_fname + sys.exit(-1) + + SeqIO.write(gff_rec, gb_fh, "genbank") + gb_fh.close() + +if __name__=="__main__": + __main__() diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gbk.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gbk.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,98 @@ + + converter + gff_to_gbk.py $inf_gff $inf_fas $gbk_format + + + + + + + + + + + + + + + + + +**What it does** + +This tool converts annotations in GFF to GenBank_ format (scroll down for format description). + +.. _GenBank: http://www.ncbi.nlm.nih.gov/genbank/ + +------ + +**Example** + +- The following data in GFF:: + + ##gff-version 3 + # sequence-region NM_001202705 1 2406 + NM_001202705 GenBank chromosome 1 2406 . + 1 ID=NM_001202705;Alias=2;Dbxref=taxon:3702;Name=NM_001202705;Note=Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) mRNA%2C complete cds.,REVIEWED REFSEQ; + NM_001202705 GenBank gene 1 2406 . + 1 ID=AT2G29630;Dbxref=GeneID:817513,TAIR:AT2G29630;Name=THIC;locus_tag=AT2G29630 + NM_001202705 GenBank mRNA 192 2126 . + 1 ID=AT2G29630.t01;Parent=AT2G29630 + NM_001202705 GenBank CDS 192 2126 . + 1 ID=AT2G29630.p01;Parent=AT2G29630.t01;Dbxref=GI:334184567,GeneID:817513,TAIR:AT2G29630;Name=THIC;Note=thiaminC (THIC)%3B CONTAINS InterPro DOMAIN;rotein_id=NP_001189634.1; + NM_001202705 GenBank exon 192 2126 . + 1 Parent=AT2G29630.t01 + ##FASTA + >NM_001202705 + AAGCCTTTCGCTTTAGGCTGCATTGGGCCGTGACAATATTCAGACGATTCAGGAGGTTCG + TTCCTTTTTTAAAGGACCCTAATCACTCTGAGTACCACTGACTCACTCAGTGTGCGCGAT + +- Will be converted to GenBank format:: + + LOCUS NM_001202705 2406 bp mRNA linear PLN 28-MAY-2011 + DEFINITION Arabidopsis thaliana thiamine biosynthesis protein ThiC (THIC) + mRNA, complete cds. + ACCESSION NM_001202705 + VERSION NM_001202705.1 GI:334184566......... + FEATURES Location/Qualifiers + source 1..2406 + /organism="Arabidopsis thaliana" + /mol_type="mRNA" + /db_xref="taxon:3702"........ + gene 1..2406 + /gene="THIC" + /locus_tag="AT2G29630" + /gene_synonym="PY; PYRIMIDINE REQUIRING; T27A16.27;........ + ORIGIN + 1 aagcctttcg ctttaggctg cattgggccg tgacaatatt cagacgattc aggaggttcg + 61 ttcctttttt aaaggaccct aatcactctg agtaccactg actcactcag tgtgcgcgat + 121 tcatttcaaa aacgagccag cctcttcttc cttcgtctac tagatcagat ccaaagcttc + 181 ctcttccagc tatggctgct tcagtacact gtaccttgat gtccgtcgta tgcaacaaca + // + +------ + +**About formats** + +**GFF** Generic Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold or contig. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +**GenBank format** Consists of an annotation section and a sequence section. Sample record_ + +.. _record: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html + + +-------- + +**Copyright** + +2010-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gtf.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,76 @@ +#!/usr/bin/env python +""" +Program to convert data from GFF to GTF + +Usage: python gff_to_gtf.py in.gff > out.gtf + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import GFFParser + +def printGTF(tinfo): + """ + writing result file in GTF format + + @args tinfo: parsed object from gff file + @type tinfo: numpy array + """ + + for ent1 in tinfo: + for idx, tid in enumerate(ent1['transcripts']): + + exons = ent1['exons'][idx] + cds_exons = ent1['cds_exons'][idx] + + stop_codon = start_codon = () + + if ent1['strand'] == '+': + if cds_exons.any(): + start_codon = (cds_exons[0][0], cds_exons[0][0]+2) + stop_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) + elif ent1['strand'] == '-': + if cds_exons.any(): + start_codon = (cds_exons[-1][1]-2, cds_exons[-1][1]) + stop_codon = (cds_exons[0][0], cds_exons[0][0]+2) + else: + print 'STRAND information missing - %s, skip the transcript - %s' % (ent1['strand'], tid[0]) + pass + + last_cds_cod = 0 + for idz, ex_cod in enumerate(exons): + + print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], ex_cod[0], ex_cod[1], ent1['strand'], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) + + if cds_exons.any(): + try: + print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], cds_exons[idz][0], cds_exons[idz][1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) + last_cds_cod = idz + except: + pass + + if idz == 0: + print '%s\t%s\tstart_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], start_codon[0], start_codon[1], ent1['strand'], cds_exons[idz][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) + + if stop_codon: + print '%s\t%s\tstop_codon\t%d\t%d\t.\t%s\t%d\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (ent1['chr'], ent1['source'], stop_codon[0], stop_codon[1], ent1['strand'], cds_exons[last_cds_cod][2], ent1['name'], tid[0], idz+1, ent1['gene_info']['Name']) + + +if __name__ == "__main__": + + try: + gff_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + Transcriptdb = GFFParser.Parse(gff_fname) + + printGTF(Transcriptdb) diff -r 619e0fcd9126 -r 6e589f267c14 gff_to_gtf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff_to_gtf.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,88 @@ + + converter + gff_to_gtf.py $inf_gff3 > $gtf_format + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool converts data from GFF3 to GTF file format (scroll down for format description). + +-------- + +**Example** + +- The following data in GFF3 format:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 + +- Will be converted to GTF format:: + + 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; + 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + +-------- + +**About formats** + + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + + +**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields:: + + 1. seqname - The name of the sequence. + 2. source - This indicating where the annotation came from. + 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. + 7. strand - Valid entries include '+', '-', or '.' + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. + 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 gffparser_bcbio.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gffparser_bcbio.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,828 @@ +"""Parse GFF files into features attached to Biopython SeqRecord objects. + +This deals with GFF3 formatted files, a tab delimited format for storing +sequence features and annotations: + +http://www.sequenceontology.org/gff3.shtml + +It will also deal with older GFF versions (GTF/GFF2): + +http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml +http://mblab.wustl.edu/GTF22.html + +The implementation utilizes map/reduce parsing of GFF using Disco. Disco +(http://discoproject.org) is a Map-Reduce framework for Python utilizing +Erlang for parallelization. The code works on a single processor without +Disco using the same architecture. +""" +import os +import copy +import re +import collections +import urllib +import itertools + +# Make defaultdict compatible with versions of python older than 2.4 +try: + collections.defaultdict +except AttributeError: + import _utils + collections.defaultdict = _utils.defaultdict + +from Bio.Seq import Seq, UnknownSeq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation +from Bio import SeqIO + +def _gff_line_map(line, params): + """Map part of Map-Reduce; parses a line of GFF into a dictionary. + + Given an input line from a GFF file, this: + - decides if the file passes our filtering limits + - if so: + - breaks it into component elements + - determines the type of attribute (flat, parent, child or annotation) + - generates a dictionary of GFF info which can be serialized as JSON + """ + gff3_kw_pat = re.compile("\w+=") + def _split_keyvals(keyval_str): + """Split key-value pairs in a GFF2, GTF and GFF3 compatible way. + + GFF3 has key value pairs like: + count=9;gene=amx-2;sequence=SAGE:aacggagccg + GFF2 and GTF have: + Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206" + name "fgenesh1_pg.C_chr_1000003"; transcriptId 869 + """ + quals = collections.defaultdict(list) + if keyval_str is None: + return quals + # ensembl GTF has a stray semi-colon at the end + if keyval_str[-1] == ';': + keyval_str = keyval_str[:-1] + # GFF2/GTF has a semi-colon with at least one space after it. + # It can have spaces on both sides; wormbase does this. + # GFF3 works with no spaces. + # Split at the first one we can recognize as working + parts = keyval_str.split(" ; ") + if len(parts) == 1: + parts = keyval_str.split("; ") + if len(parts) == 1: + parts = keyval_str.split(";") + # check if we have GFF3 style key-vals (with =) + is_gff2 = True + if gff3_kw_pat.match(parts[0]): + is_gff2 = False + key_vals = [p.split('=') for p in parts] + # otherwise, we are separated by a space with a key as the first item + else: + pieces = [] + for p in parts: + # fix misplaced semi-colons in keys in some GFF2 files + if p and p[0] == ';': + p = p[1:] + pieces.append(p.strip().split(" ")) + key_vals = [(p[0], " ".join(p[1:])) for p in pieces] + for item in key_vals: + # standard in-spec items are key=value + if len(item) == 2: + key, val = item + # out-of-spec files can have just key values. We set an empty value + # which will be changed to true later to standardize. + else: + assert len(item) == 1, item + key = item[0] + val = '' + # remove quotes in GFF2 files + if (len(val) > 0 and val[0] == '"' and val[-1] == '"'): + val = val[1:-1] + if val: + quals[key].extend([v for v in val.split(',') if v]) + # if we don't have a value, make this a key=True/False style + # attribute + else: + quals[key].append('true') + for key, vals in quals.items(): + quals[key] = [urllib.unquote(v) for v in vals] + return quals, is_gff2 + + def _nest_gff2_features(gff_parts): + """Provide nesting of GFF2 transcript parts with transcript IDs. + + exons and coding sequences are mapped to a parent with a transcript_id + in GFF2. This is implemented differently at different genome centers + and this function attempts to resolve that and map things to the GFF3 + way of doing them. + """ + # map protein or transcript ids to a parent + for transcript_id in ["transcript_id", "transcriptId", "proteinId"]: + try: + gff_parts["quals"]["Parent"] = \ + gff_parts["quals"][transcript_id] + break + except KeyError: + pass + # case for WormBase GFF -- everything labelled as Transcript or CDS + for flat_name in ["Transcript", "CDS"]: + if gff_parts["quals"].has_key(flat_name): + # parent types + if gff_parts["type"] in [flat_name]: + if not gff_parts["id"]: + gff_parts["id"] = gff_parts["quals"][flat_name][0] + gff_parts["quals"]["ID"] = [gff_parts["id"]] + # children types + elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", + "coding_exon", "five_prime_UTR", "CDS", "stop_codon", + "start_codon"]: + gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name] + break + + return gff_parts + + strand_map = {'+' : 1, '-' : -1, '?' : None, None: None} + line = line.strip() + if line[:2] == "##": + return [('directive', line[2:])] + elif line and line[0] != "#": + parts = line.split('\t') + should_do = True + if params.limit_info: + for limit_name, limit_values in params.limit_info.items(): + cur_id = tuple([parts[i] for i in + params.filter_info[limit_name]]) + if cur_id not in limit_values: + should_do = False + break + if should_do: + assert len(parts) >= 8, line + # not python2.4 compatible but easier to understand + #gff_parts = [(None if p == '.' else p) for p in parts] + gff_parts = [] + for p in parts: + if p == ".": + gff_parts.append(None) + else: + gff_parts.append(p) + gff_info = dict() + # collect all of the base qualifiers for this item + if len(parts) > 8: + quals, is_gff2 = _split_keyvals(gff_parts[8]) + else: + quals, is_gff2 = dict(), False + gff_info["is_gff2"] = is_gff2 + if gff_parts[1]: + quals["source"].append(gff_parts[1]) + if gff_parts[5]: + quals["score"].append(gff_parts[5]) + if gff_parts[7]: + quals["phase"].append(gff_parts[7]) + gff_info['quals'] = dict(quals) + gff_info['rec_id'] = gff_parts[0] + # if we are describing a location, then we are a feature + if gff_parts[3] and gff_parts[4]: + gff_info['location'] = [int(gff_parts[3]) - 1, + int(gff_parts[4])] + gff_info['type'] = gff_parts[2] + gff_info['id'] = quals.get('ID', [''])[0] + gff_info['strand'] = strand_map.get(gff_parts[6], None) + if is_gff2: + gff_info = _nest_gff2_features(gff_info) + # features that have parents need to link so we can pick up + # the relationship + if gff_info['quals'].has_key('Parent'): + # check for self referential parent/child relationships + # remove the ID, which is not useful + for p in gff_info['quals']['Parent']: + if p == gff_info['id']: + gff_info['id'] = '' + del gff_info['quals']['ID'] + break + final_key = 'child' + elif gff_info['id']: + final_key = 'parent' + # Handle flat features + else: + final_key = 'feature' + # otherwise, associate these annotations with the full record + else: + final_key = 'annotation' + if params.jsonify: + return [(final_key, simplejson.dumps(gff_info))] + else: + return [(final_key, gff_info)] + return [] + +def _gff_line_reduce(map_results, out, params): + """Reduce part of Map-Reduce; combines results of parsed features. + """ + final_items = dict() + for gff_type, final_val in map_results: + if params.jsonify and gff_type not in ['directive']: + final_val = simplejson.loads(final_val) + try: + final_items[gff_type].append(final_val) + except KeyError: + final_items[gff_type] = [final_val] + for key, vals in final_items.items(): + if params.jsonify: + vals = simplejson.dumps(vals) + out.add(key, vals) + +class _MultiIDRemapper: + """Provide an ID remapping for cases where a parent has a non-unique ID. + + Real life GFF3 cases have non-unique ID attributes, which we fix here + by using the unique sequence region to assign children to the right + parent. + """ + def __init__(self, base_id, all_parents): + self._base_id = base_id + self._parents = all_parents + + def remap_id(self, feature_dict): + rstart, rend = feature_dict['location'] + for index, parent in enumerate(self._parents): + pstart, pend = parent['location'] + if rstart >= pstart and rend <= pend: + if index > 0: + return ("%s_%s" % (self._base_id, index + 1)) + else: + return self._base_id + raise ValueError("Did not find remapped ID location: %s, %s, %s" % ( + self._base_id, [p['location'] for p in self._parents], + feature_dict['location'])) + +class _AbstractMapReduceGFF: + """Base class providing general GFF parsing for local and remote classes. + + This class should be subclassed to provide a concrete class to parse + GFF under specific conditions. These classes need to implement + the _gff_process function, which returns a dictionary of SeqRecord + information. + """ + def __init__(self, create_missing=True): + """Initialize GFF parser + + create_missing - If True, create blank records for GFF ids not in + the base_dict. If False, an error will be raised. + """ + self._create_missing = create_missing + self._map_fn = _gff_line_map + self._reduce_fn = _gff_line_reduce + self._examiner = GFFExaminer() + + def _gff_process(self, gff_files, limit_info, target_lines=None): + raise NotImplementedError("Derived class must define") + + def parse(self, gff_files, base_dict=None, limit_info=None): + """Parse a GFF file, returning an iterator of SeqRecords. + + limit_info - A dictionary specifying the regions of the GFF file + which should be extracted. This allows only relevant portions of a file + to be parsed. + + base_dict - A base dictionary of SeqRecord objects which may be + pre-populated with sequences and other features. The new features from + the GFF file will be added to this dictionary. + """ + for rec in self.parse_in_parts(gff_files, base_dict, limit_info): + yield rec + + def parse_in_parts(self, gff_files, base_dict=None, limit_info=None, + target_lines=None): + """Parse a region of a GFF file specified, returning info as generated. + + target_lines -- The number of lines in the file which should be used + for each partial parse. This should be determined based on available + memory. + """ + for results in self.parse_simple(gff_files, limit_info, target_lines): + if base_dict is None: + cur_dict = dict() + else: + cur_dict = copy.deepcopy(base_dict) + cur_dict = self._results_to_features(cur_dict, results) + all_ids = cur_dict.keys() + all_ids.sort() + for cur_id in all_ids: + yield cur_dict[cur_id] + + def parse_simple(self, gff_files, limit_info=None, target_lines=1): + """Simple parse which does not build or nest features. + + This returns a simple dictionary representation of each line in the + GFF file. + """ + # gracefully handle a single file passed + if not isinstance(gff_files, (list, tuple)): + gff_files = [gff_files] + limit_info = self._normalize_limit_info(limit_info) + for results in self._gff_process(gff_files, limit_info, target_lines): + yield results + + def _normalize_limit_info(self, limit_info): + """Turn all limit information into tuples for identical comparisons. + """ + final_limit_info = {} + if limit_info: + for key, values in limit_info.items(): + final_limit_info[key] = [] + for v in values: + if isinstance(v, str): + final_limit_info[key].append((v,)) + else: + final_limit_info[key].append(tuple(v)) + return final_limit_info + + def _results_to_features(self, base, results): + """Add parsed dictionaries of results to Biopython SeqFeatures. + """ + base = self._add_annotations(base, results.get('annotation', [])) + for feature in results.get('feature', []): + (_, base) = self._add_toplevel_feature(base, feature) + base = self._add_parent_child_features(base, results.get('parent', []), + results.get('child', [])) + base = self._add_seqs(base, results.get('fasta', [])) + base = self._add_directives(base, results.get('directive', [])) + return base + + def _add_directives(self, base, directives): + """Handle any directives or meta-data in the GFF file. + + Relevant items are added as annotation meta-data to each record. + """ + dir_keyvals = collections.defaultdict(list) + for directive in directives: + parts = directive.split() + if len(parts) > 1: + key = parts[0] + if len(parts) == 2: + val = parts[1] + else: + val = tuple(parts[1:]) + dir_keyvals[key].append(val) + for key, vals in dir_keyvals.items(): + for rec in base.values(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _add_seqs(self, base, recs): + """Add sequence information contained in the GFF3 to records. + """ + for rec in recs: + if base.has_key(rec.id): + base[rec.id].seq = rec.seq + else: + base[rec.id] = rec + return base + + def _add_parent_child_features(self, base, parents, children): + """Add nested features with parent child relationships. + """ + multi_remap = self._identify_dup_ids(parents) + # add children features + children_prep = collections.defaultdict(list) + for child_dict in children: + child_feature = self._get_feature(child_dict) + for pindex, pid in enumerate(child_feature.qualifiers['Parent']): + if multi_remap.has_key(pid): + pid = multi_remap[pid].remap_id(child_dict) + child_feature.qualifiers['Parent'][pindex] = pid + children_prep[pid].append((child_dict['rec_id'], + child_feature)) + children = dict(children_prep) + # add children to parents that exist + for cur_parent_dict in parents: + cur_id = cur_parent_dict['id'] + if multi_remap.has_key(cur_id): + cur_parent_dict['id'] = multi_remap[cur_id].remap_id( + cur_parent_dict) + cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict) + cur_parent, children = self._add_children_to_parent(cur_parent, + children) + # create parents for children without them (GFF2 or split/bad files) + while len(children) > 0: + parent_id, cur_children = itertools.islice(children.items(), + 1).next() + # one child, do not nest it + if len(cur_children) == 1: + rec_id, child = cur_children[0] + loc = (child.location.nofuzzy_start, child.location.nofuzzy_end) + rec, base = self._get_rec(base, + dict(rec_id=rec_id, location=loc)) + rec.features.append(child) + del children[parent_id] + else: + cur_parent, base = self._add_missing_parent(base, parent_id, + cur_children) + cur_parent, children = self._add_children_to_parent(cur_parent, + children) + return base + + def _identify_dup_ids(self, parents): + """Identify duplicated ID attributes in potential nested parents. + + According to the GFF3 spec ID attributes are supposed to be unique + for a file, but this is not always true in practice. This looks + for duplicates, and provides unique IDs sorted by locations. + """ + multi_ids = collections.defaultdict(list) + for parent in parents: + multi_ids[parent['id']].append(parent) + multi_ids = [(mid, parents) for (mid, parents) in multi_ids.items() + if len(parents) > 1] + multi_remap = dict() + for mid, parents in multi_ids: + multi_remap[mid] = _MultiIDRemapper(mid, parents) + return multi_remap + + def _add_children_to_parent(self, cur_parent, children): + """Recursively add children to parent features. + """ + if children.has_key(cur_parent.id): + cur_children = children[cur_parent.id] + for rec_id, cur_child in cur_children: + cur_child, children = self._add_children_to_parent(cur_child, + children) + cur_parent.location_operator = "join" + cur_parent.sub_features.append(cur_child) + del children[cur_parent.id] + return cur_parent, children + + def _add_annotations(self, base, anns): + """Add annotation data from the GFF file to records. + """ + # add these as a list of annotations, checking not to overwrite + # current values + for ann in anns: + rec, base = self._get_rec(base, ann) + for key, vals in ann['quals'].items(): + self._add_ann_to_rec(rec, key, vals) + return base + + def _add_ann_to_rec(self, rec, key, vals): + """Add a key/value annotation to the given SeqRecord. + """ + if rec.annotations.has_key(key): + try: + rec.annotations[key].extend(vals) + except AttributeError: + rec.annotations[key] = [rec.annotations[key]] + vals + else: + rec.annotations[key] = vals + + def _get_rec(self, base, info_dict): + """Retrieve a record to add features to. + """ + max_loc = info_dict.get('location', (0, 1))[1] + try: + cur_rec = base[info_dict['rec_id']] + # update generated unknown sequences with the expected maximum length + if isinstance(cur_rec.seq, UnknownSeq): + cur_rec.seq._length = max([max_loc, cur_rec.seq._length]) + return cur_rec, base + except KeyError: + if self._create_missing: + new_rec = SeqRecord(UnknownSeq(max_loc), info_dict['rec_id']) + base[info_dict['rec_id']] = new_rec + return new_rec, base + else: + raise + + def _add_missing_parent(self, base, parent_id, cur_children): + """Add a new feature that is missing from the GFF file. + """ + base_rec_id = list(set(c[0] for c in cur_children)) + assert len(base_rec_id) == 1 + feature_dict = dict(id=parent_id, strand=None, + type="inferred_parent", quals=dict(ID=[parent_id]), + rec_id=base_rec_id[0]) + coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) + for r, c in cur_children] + feature_dict["location"] = (min([c[0] for c in coords]), + max([c[1] for c in coords])) + return self._add_toplevel_feature(base, feature_dict) + + def _add_toplevel_feature(self, base, feature_dict): + """Add a toplevel non-nested feature to the appropriate record. + """ + new_feature = self._get_feature(feature_dict) + rec, base = self._get_rec(base, feature_dict) + rec.features.append(new_feature) + return new_feature, base + + def _get_feature(self, feature_dict): + """Retrieve a Biopython feature from our dictionary representation. + """ + location = FeatureLocation(*feature_dict['location']) + new_feature = SeqFeature(location, feature_dict['type'], + id=feature_dict['id'], strand=feature_dict['strand']) + new_feature.qualifiers = feature_dict['quals'] + return new_feature + + def _parse_fasta(self, in_handle): + """Parse FASTA sequence information contained in the GFF3 file. + """ + return list(SeqIO.parse(in_handle, "fasta")) + +class _GFFParserLocalOut: + """Provide a collector for local GFF MapReduce file parsing. + """ + def __init__(self, smart_breaks=False): + self._items = dict() + self._smart_breaks = smart_breaks + self._missing_keys = collections.defaultdict(int) + self._last_parent = None + self.can_break = True + self.num_lines = 0 + + def add(self, key, vals): + if self._smart_breaks: + # if we are not GFF2 we expect parents and break + # based on not having missing ones + if key == 'directive': + if vals[0] == '#': + self.can_break = True + self._last_parent = None + elif not vals[0].get("is_gff2", False): + self._update_missing_parents(key, vals) + self.can_break = (len(self._missing_keys) == 0) + # break when we are done with stretches of child features + elif key != 'child': + self.can_break = True + self._last_parent = None + # break when we have lots of child features in a row + # and change between parents + else: + cur_parent = vals[0]["quals"]["Parent"][0] + if (self._last_parent): + self.can_break = (cur_parent != self._last_parent) + self._last_parent = cur_parent + self.num_lines += 1 + try: + self._items[key].extend(vals) + except KeyError: + self._items[key] = vals + + def _update_missing_parents(self, key, vals): + # smart way of deciding if we can break this. + # if this is too much, can go back to not breaking in the + # middle of children + if key in ["child"]: + for val in vals: + for p_id in val["quals"]["Parent"]: + self._missing_keys[p_id] += 1 + for val in vals: + try: + del self._missing_keys[val["quals"]["ID"][0]] + except KeyError: + pass + + def has_items(self): + return len(self._items) > 0 + + def get_results(self): + self._last_parent = None + return self._items + +class GFFParser(_AbstractMapReduceGFF): + """Local GFF parser providing standardized parsing of GFF3 and GFF2 files. + """ + def __init__(self, line_adjust_fn=None, create_missing=True): + _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) + self._line_adjust_fn = line_adjust_fn + + def _gff_process(self, gff_files, limit_info, target_lines): + """Process GFF addition without any parallelization. + + In addition to limit filtering, this accepts a target_lines attribute + which provides a number of lines to parse before returning results. + This allows partial parsing of a file to prevent memory issues. + """ + line_gen = self._file_line_generator(gff_files) + for out in self._lines_to_out_info(line_gen, limit_info, target_lines): + yield out + + def _file_line_generator(self, gff_files): + """Generate single lines from a set of GFF files. + """ + for gff_file in gff_files: + if hasattr(gff_file, "read"): + need_close = False + in_handle = gff_file + else: + need_close = True + in_handle = open(gff_file) + found_seqs = False + while 1: + line = in_handle.readline() + if not line: + break + yield line + if need_close: + in_handle.close() + + def _lines_to_out_info(self, line_iter, limit_info=None, + target_lines=None): + """Generate SeqRecord and SeqFeatures from GFF file lines. + """ + params = self._examiner._get_local_params(limit_info) + out_info = _GFFParserLocalOut((target_lines is not None and + target_lines > 1)) + found_seqs = False + for line in line_iter: + results = self._map_fn(line, params) + if self._line_adjust_fn and results: + if results[0][0] not in ['directive']: + results = [(results[0][0], + self._line_adjust_fn(results[0][1]))] + self._reduce_fn(results, out_info, params) + if (target_lines and out_info.num_lines >= target_lines and + out_info.can_break): + yield out_info.get_results() + out_info = _GFFParserLocalOut((target_lines is not None and + target_lines > 1)) + if (results and results[0][0] == 'directive' and + results[0][1] == 'FASTA'): + found_seqs = True + break + + class FakeHandle: + def __init__(self, line_iter): + self._iter = line_iter + def read(self): + return "".join(l for l in self._iter) + def readline(self): + try: + return self._iter.next() + except StopIteration: + return "" + + if found_seqs: + fasta_recs = self._parse_fasta(FakeHandle(line_iter)) + out_info.add('fasta', fasta_recs) + if out_info.has_items(): + yield out_info.get_results() + +class DiscoGFFParser(_AbstractMapReduceGFF): + """GFF Parser with parallelization through Disco (http://discoproject.org. + """ + def __init__(self, disco_host, create_missing=True): + """Initialize parser. + + disco_host - Web reference to a Disco host which will be used for + parallelizing the GFF reading job. + """ + _AbstractMapReduceGFF.__init__(self, create_missing=create_missing) + self._disco_host = disco_host + + def _gff_process(self, gff_files, limit_info, target_lines=None): + """Process GFF addition, using Disco to parallelize the process. + """ + assert target_lines is None, "Cannot split parallelized jobs" + # make these imports local; only need them when using disco + import simplejson + import disco + # absolute path names unless they are special disco files + full_files = [] + for f in gff_files: + if f.split(":")[0] != "disco": + full_files.append(os.path.abspath(f)) + else: + full_files.append(f) + results = disco.job(self._disco_host, name="gff_reader", + input=full_files, + params=disco.Params(limit_info=limit_info, jsonify=True, + filter_info=self._examiner._filter_info), + required_modules=["simplejson", "collections", "re"], + map=self._map_fn, reduce=self._reduce_fn) + processed = dict() + for out_key, out_val in disco.result_iterator(results): + processed[out_key] = simplejson.loads(out_val) + yield processed + +def parse(gff_files, base_dict=None, limit_info=None, target_lines=None): + """High level interface to parse GFF files into SeqRecords and SeqFeatures. + """ + parser = GFFParser() + for rec in parser.parse_in_parts(gff_files, base_dict, limit_info, + target_lines): + yield rec + +def _file_or_handle(fn): + """Decorator to handle either an input handle or a file. + """ + def _file_or_handle_inside(*args, **kwargs): + in_file = args[1] + if hasattr(in_file, "read"): + need_close = False + in_handle = in_file + else: + need_close = True + in_handle = open(in_file) + args = (args[0], in_handle) + args[2:] + out = fn(*args, **kwargs) + if need_close: + in_handle.close() + return out + return _file_or_handle_inside + +class GFFExaminer: + """Provide high level details about a GFF file to refine parsing. + + GFF is a spec and is provided by many different centers. Real life files + will present the same information in slightly different ways. Becoming + familiar with the file you are dealing with is the best way to extract the + information you need. This class provides high level summary details to + help in learning. + """ + def __init__(self): + self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2], + gff_source = [1], gff_type = [2]) + + def _get_local_params(self, limit_info=None): + class _LocalParams: + def __init__(self): + self.jsonify = False + params = _LocalParams() + params.limit_info = limit_info + params.filter_info = self._filter_info + return params + + @_file_or_handle + def available_limits(self, gff_handle): + """Return dictionary information on possible limits for this file. + + This returns a nested dictionary with the following structure: + + keys -- names of items to filter by + values -- dictionary with: + keys -- filter choice + value -- counts of that filter in this file + + Not a parallelized map-reduce implementation. + """ + cur_limits = dict() + for filter_key in self._filter_info.keys(): + cur_limits[filter_key] = collections.defaultdict(int) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + # ignore empty and comment lines + if line.strip() and line.strip()[0] != "#": + parts = [p.strip() for p in line.split('\t')] + assert len(parts) == 9, line + for filter_key, cur_indexes in self._filter_info.items(): + cur_id = tuple([parts[i] for i in cur_indexes]) + cur_limits[filter_key][cur_id] += 1 + # get rid of the default dicts + final_dict = dict() + for key, value_dict in cur_limits.items(): + if len(key) == 1: + key = key[0] + final_dict[key] = dict(value_dict) + gff_handle.close() + return final_dict + + @_file_or_handle + def parent_child_map(self, gff_handle): + """Provide a mapping of parent to child relationships in the file. + + Returns a dictionary of parent child relationships: + + keys -- tuple of (source, type) for each parent + values -- tuple of (source, type) as children of that parent + + Not a parallelized map-reduce implementation. + """ + # collect all of the parent and child types mapped to IDs + parent_sts = dict() + child_sts = collections.defaultdict(list) + for line in gff_handle: + # when we hit FASTA sequences, we are done with annotations + if line.startswith("##FASTA"): + break + if line.strip(): + line_type, line_info = _gff_line_map(line, + self._get_local_params())[0] + if (line_type == 'parent' or (line_type == 'child' and + line_info['id'])): + parent_sts[line_info['id']] = ( + line_info['quals']['source'][0], line_info['type']) + if line_type == 'child': + for parent_id in line_info['quals']['Parent']: + child_sts[parent_id].append(( + line_info['quals']['source'][0], line_info['type'])) + #print parent_sts, child_sts + # generate a dictionary of the unique final type relationships + pc_map = collections.defaultdict(list) + for parent_id, parent_type in parent_sts.items(): + for child_type in child_sts[parent_id]: + pc_map[parent_type].append(child_type) + pc_final_map = dict() + for ptype, ctypes in pc_map.items(): + unique_ctypes = list(set(ctypes)) + unique_ctypes.sort() + pc_final_map[ptype] = unique_ctypes + return pc_final_map diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtf_to_gff.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,85 @@ +#!/usr/bin/env python +""" +Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3]. + +Usage: python gtf_to_gff.py in.gtf > out.gff3 + +Requirement: + GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import GFFParser +import helper + +def GFFWriter(gtf_content): + """ + write the feature information to GFF format + + @args gtf_content: Parsed object from gtf file + @type gtf_content: numpy array + """ + + print '##gff-version 3' + + for ent1 in gtf_content: + + chr_name = ent1['chr'] + strand = ent1['strand'] + start = ent1['start'] + stop = ent1['stop'] + source = ent1['source'] + ID = ent1['name'] + Name = ent1['gene_info']['Name'] + + Name = ID if not Name else Name + + print '%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name) + + for idx, tid in enumerate(ent1['transcripts']): + print idx + print tid + + t_start = ent1['exons'][idx][0][0] + t_stop = ent1['exons'][idx][-1][-1] + t_type = ent1['transcript_type'][idx] + + utr5_exons, utr3_exons = [], [] + if ent1['exons'][idx].any() and ent1['cds_exons'][idx].any(): + utr5_exons, utr3_exons = helper.buildUTR(ent1['cds_exons'][idx], ent1['exons'][idx], strand) + + print '%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, tid[0], ID) + + for ex_cod in utr5_exons: + print '%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) + + for ex_cod in ent1['cds_exons'][idx]: + print '%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], tid[0]) + + for ex_cod in utr3_exons: + print '%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) + + for ex_cod in ent1['exons'][idx]: + print '%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, tid[0]) + + +def __main__(): + + try: + gtf_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + gtf_file_content = GFFParser.Parse(gtf_fname) + + GFFWriter(gtf_file_content) + +if __name__ == "__main__": + __main__() diff -r 619e0fcd9126 -r 6e589f267c14 gtf_to_gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtf_to_gff.xml Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,94 @@ + + converter + gtf_to_gff.py $inf_gtf > $gff3_format + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool converts data from GTF to a valid GFF3 file (scroll down for format description). + +-------- + +**Example** + +- The following data in GTF format:: + + 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352"; + 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; + +- Will be converted to GFF3 format:: + + ##gff-version 3 + 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11 + 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859 + 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751 + 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751 + 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352 + 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751 + 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751 + +-------- + +**About formats** + +**GTF format** Gene Transfer Format, it borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-seaparated fields:: + + 1. seqname - The name of the sequence. + 2. source - This indicating where the annotation came from. + 3. feature - The name of the feature types. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon' + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates. + 7. strand - Valid entries include '+', '-', or '.' + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. + 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region. + +**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:: + + 1. seqid - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. stop - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. attributes - All lines with the same group are linked together into a single item. + +-------- + +**Copyright** + +2009-2014 Max Planck Society, University of Tübingen & Memorial Sloan Kettering Cancer Center + +Sreedharan VT, Schultheiss SJ, Jean G, Kahles A, Bohnert R, Drewe P, Mudrakarta P, Görnitz N, Zeller G, Rätsch G. Oqtans: the RNA-seq workbench in the cloud for complete and reproducible quantitative transcriptome analysis. Bioinformatics 10.1093/bioinformatics/btt731 (2014) + + + diff -r 619e0fcd9126 -r 6e589f267c14 helper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/helper.py Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,332 @@ +#!/usr/bin/env python +""" +Common utility functions +""" + +import os +import re +import sys +import gzip +import bz2 +import numpy + +def init_gene(): + """ + Initializing the gene structure + """ + + gene_det = [('id', 'f8'), + ('anno_id', numpy.dtype), + ('confgenes_id', numpy.dtype), + ('name', 'S25'), + ('source', 'S25'), + ('gene_info', numpy.dtype), + ('alias', 'S15'), + ('name2', numpy.dtype), + ('strand', 'S2'), + ('score', 'S15'), + ('chr', 'S15'), + ('chr_num', numpy.dtype), + ('paralogs', numpy.dtype), + ('start', 'f8'), + ('stop', 'f8'), + ('transcripts', numpy.dtype), + ('transcript_type', numpy.dtype), + ('transcript_info', numpy.dtype), + ('transcript_status', numpy.dtype), + ('transcript_valid', numpy.dtype), + ('exons', numpy.dtype), + ('exons_confirmed', numpy.dtype), + ('cds_exons', numpy.dtype), + ('utr5_exons', numpy.dtype), + ('utr3_exons', numpy.dtype), + ('tis', numpy.dtype), + ('tis_conf', numpy.dtype), + ('tis_info', numpy.dtype), + ('cdsStop', numpy.dtype), + ('cdsStop_conf', numpy.dtype), + ('cdsStop_info', numpy.dtype), + ('tss', numpy.dtype), + ('tss_info', numpy.dtype), + ('tss_conf', numpy.dtype), + ('cleave', numpy.dtype), + ('cleave_info', numpy.dtype), + ('cleave_conf', numpy.dtype), + ('polya', numpy.dtype), + ('polya_info', numpy.dtype), + ('polya_conf', numpy.dtype), + ('is_alt', 'f8'), + ('is_alt_spliced', 'f8'), + ('is_valid', numpy.dtype), + ('transcript_complete', numpy.dtype), + ('is_complete', numpy.dtype), + ('is_correctly_gff3_referenced', 'S5'), + ('splicegraph', numpy.dtype) ] + + return gene_det + +def open_file(fname): + """ + Open the file (supports .gz .bz2) and returns the handler + + @args fname: input file name for reading + @type fname: str + """ + + try: + if os.path.splitext(fname)[1] == ".gz": + FH = gzip.open(fname, 'rb') + elif os.path.splitext(fname)[1] == ".bz2": + FH = bz2.BZ2File(fname, 'rb') + else: + FH = open(fname, 'rU') + except Exception as error: + sys.exit(error) + + return FH + +def add_CDS_phase(strand, cds): + """ + Calculate CDS phase and add to the CDS exons + + @args strand: feature strand information + @type strand: +/- + @args cds: coding exon coordinates + @type cds: numpy array [[int, int, int]] + """ + + cds_region, cds_flag = [], 0 + if strand == '+': + for cdspos in cds: + if cds_flag == 0: + cdspos = (cdspos[0], cdspos[1], 0) + diff = (cdspos[1]-(cdspos[0]-1))%3 + else: + xy = 0 + if diff == 0: + cdspos = (cdspos[0], cdspos[1], 0) + elif diff == 1: + cdspos = (cdspos[0], cdspos[1], 2) + xy = 2 + elif diff == 2: + cdspos = (cdspos[0], cdspos[1], 1) + xy = 1 + diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 + cds_region.append(cdspos) + cds_flag = 1 + elif strand == '-': + cds.reverse() + for cdspos in cds: + if cds_flag == 0: + cdspos = (cdspos[0], cdspos[1], 0) + diff = (cdspos[1]-(cdspos[0]-1))%3 + else: + xy = 0 + if diff == 0: + cdspos = (cdspos[0], cdspos[1], 0) + elif diff == 1: + cdspos = (cdspos[0], cdspos[1], 2) + xy = 2 + elif diff == 2: + cdspos = (cdspos[0], cdspos[1], 1) + xy = 1 + diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 + cds_region.append(cdspos) + cds_flag = 1 + cds_region.reverse() + return cds_region + +def buildUTR(cc, ec, strand): + """ + Build UTR regions from a given set of CDS and exon coordiantes of a gene + + @args cc: coding exon coordinates + @type cc: numpy array [[int, int, int]] + @args ec: exon coordinates + @type ec: numpy array [[int, int]] + @args strand: feature strand information + @type strand: +/- + """ + + utr5 = [] + utr3 = [] + if strand == '+': + cds_s = cc[0][0] + for ex in ec: + if ex[0] <= cds_s and cds_s <= ex[1]: + if ex[0] != cds_s:utr5.append((ex[0], cds_s-1)) + break + else: + utr5.append(ex) + cds_e = cc[-1][1] + for i in range(len(ec)): + i += 1 + if ec[-i][0] <= cds_e and cds_e <= ec[-i][1]: + if ec[-i][1] != cds_e:utr3.append((cds_e +1, ec[-i][1])) + break + else: + utr3.append(ec[-i]) + utr3.reverse() + elif strand == '-': + cds_s = cc[-1][1] + for i in range(len(ec)): + i += 1 + if ec[-i][0] <= cds_s and cds_s <= ec[-i][1]: + if ec[-i][1] != cds_s:utr5.append((cds_s+1, ec[-i][1])) + break + else: + utr5.append(ec[-i]) + utr5.reverse() + cds_e = cc[0][0] + for ex in ec: + if ex[0] <= cds_e and cds_e <= ex[1]: + if ex[0] != cds_e:utr3.append((ex[0], cds_e-1)) + break + else: + utr3.append(ex) + return utr5, utr3 + +def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr): + """ + Create exon cordinates from UTR's and CDS region + + @args strand_p: feature strand information + @type strand_p: +/- + @args five_p_utr: five prime utr exon coordinates + @type five_p_utr: numpy array [[int, int]] + @args cds_cod: coding exon coordinates + @type cds_cod: numpy array [[int, int, int]] + @args three_p_utr: three prime utr exon coordinates + @type three_p_utr: numpy array [[int, int]] + """ + + exon_pos = [] + if strand_p == '+': + utr5_start, utr5_end = 0, 0 + if five_p_utr != []: + utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] + cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1] + jun_exon = [] + if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1: + jun_exon = [utr5_start, cds_5end] + if len(cds_cod) == 1: + five_prime_flag = 0 + if jun_exon != []: + five_p_utr = five_p_utr[:-1] + five_prime_flag = 1 + for utr5 in five_p_utr: + exon_pos.append(utr5) + jun_exon = [] + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[0][0] + utr3_end = three_p_utr[0][1] + if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1: + jun_exon = [cds_5start, utr3_end] + three_prime_flag = 0 + if jun_exon != []: + cds_cod = cds_cod[:-1] + three_p_utr = three_p_utr[1:] + three_prime_flag = 1 + if five_prime_flag == 1 and three_prime_flag == 1: + exon_pos.append([utr5_start, utr3_end]) + if five_prime_flag == 1 and three_prime_flag == 0: + exon_pos.append([utr5_start, cds_5end]) + cds_cod = cds_cod[:-1] + if five_prime_flag == 0 and three_prime_flag == 1: + exon_pos.append([cds_5start, utr3_end]) + for cds in cds_cod: + exon_pos.append(cds) + for utr3 in three_p_utr: + exon_pos.append(utr3) + else: + if jun_exon != []: + five_p_utr = five_p_utr[:-1] + cds_cod = cds_cod[1:] + for utr5 in five_p_utr: + exon_pos.append(utr5) + exon_pos.append(jun_exon) if jun_exon != [] else '' + jun_exon = [] + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[0][0] + utr3_end = three_p_utr[0][1] + cds_3start = cds_cod[-1][0] + cds_3end = cds_cod[-1][1] + if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: + jun_exon = [cds_3start, utr3_end] + if jun_exon != []: + cds_cod = cds_cod[:-1] + three_p_utr = three_p_utr[1:] + for cds in cds_cod: + exon_pos.append(cds) + exon_pos.append(jun_exon) if jun_exon != [] else '' + for utr3 in three_p_utr: + exon_pos.append(utr3) + elif strand_p == '-': + utr3_start, utr3_end = 0, 0 + if three_p_utr != []: + utr3_start = three_p_utr[-1][0] + utr3_end = three_p_utr[-1][1] + cds_3start = cds_cod[0][0] + cds_3end = cds_cod[0][1] + jun_exon = [] + if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1: + jun_exon = [utr3_start, cds_3end] + if len(cds_cod) == 1: + three_prime_flag = 0 + if jun_exon != []: + three_p_utr = three_p_utr[:-1] + three_prime_flag = 1 + for utr3 in three_p_utr: + exon_pos.append(utr3) + jun_exon = [] + (utr5_start, utr5_end) = (0, 0) + if five_p_utr != []: + utr5_start = five_p_utr[0][0] + utr5_end = five_p_utr[0][1] + if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1: + jun_exon = [cds_3start, utr5_end] + five_prime_flag = 0 + if jun_exon != []: + cds_cod = cds_cod[:-1] + five_p_utr = five_p_utr[1:] + five_prime_flag = 1 + if three_prime_flag == 1 and five_prime_flag == 1: + exon_pos.append([utr3_start, utr5_end]) + if three_prime_flag == 1 and five_prime_flag == 0: + exon_pos.append([utr3_start, cds_3end]) + cds_cod = cds_cod[:-1] + if three_prime_flag == 0 and five_prime_flag == 1: + exon_pos.append([cds_3start, utr5_end]) + for cds in cds_cod: + exon_pos.append(cds) + for utr5 in five_p_utr: + exon_pos.append(utr5) + else: + if jun_exon != []: + three_p_utr = three_p_utr[:-1] + cds_cod = cds_cod[1:] + for utr3 in three_p_utr: + exon_pos.append(utr3) + if jun_exon != []: + exon_pos.append(jun_exon) + jun_exon = [] + (utr5_start, utr5_end) = (0, 0) + if five_p_utr != []: + utr5_start = five_p_utr[0][0] + utr5_end = five_p_utr[0][1] + cds_5start = cds_cod[-1][0] + cds_5end = cds_cod[-1][1] + if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1: + jun_exon = [cds_5start, utr5_end] + if jun_exon != []: + cds_cod = cds_cod[:-1] + five_p_utr = five_p_utr[1:] + for cds in cds_cod: + exon_pos.append(cds) + if jun_exon != []: + exon_pos.append(jun_exon) + for utr5 in five_p_utr: + exon_pos.append(utr5) + return exon_pos diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gbk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/s_cerevisiae_SCU49845.gbk Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,165 @@ +LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 +DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p + (AXL2) and Rev7p (REV7) genes, complete cds. +ACCESSION U49845 +VERSION U49845.1 GI:1293613 +KEYWORDS . +SOURCE Saccharomyces cerevisiae (baker's yeast) + ORGANISM Saccharomyces cerevisiae + Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; + Saccharomycetales; Saccharomycetaceae; Saccharomyces. +REFERENCE 1 (bases 1 to 5028) + AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. + TITLE Cloning and sequence of REV7, a gene whose function is required for + DNA damage-induced mutagenesis in Saccharomyces cerevisiae + JOURNAL Yeast 10 (11), 1503-1509 (1994) + PUBMED 7871890 +REFERENCE 2 (bases 1 to 5028) + AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. + TITLE Selection of axial growth sites in yeast requires Axl2p, a novel + plasma membrane glycoprotein + JOURNAL Genes Dev. 10 (7), 777-793 (1996) + PUBMED 8846915 +REFERENCE 3 (bases 1 to 5028) + AUTHORS Roemer,T. + TITLE Direct Submission + JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New + Haven, CT, USA +FEATURES Location/Qualifiers + source 1..5028 + /organism="Saccharomyces cerevisiae" + /db_xref="taxon:4932" + /chromosome="IX" + /map="9" + CDS <1..206 + /codon_start=3 + /product="TCP1-beta" + /protein_id="AAA98665.1" + /db_xref="GI:1293614" + /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA + AEVLLRVDNIIRARPRTANRQHM" + gene 687..3158 + /gene="AXL2" + CDS 687..3158 + /gene="AXL2" + /note="plasma membrane glycoprotein" + /codon_start=1 + /function="required for axial budding pattern of S. + cerevisiae" + /product="Axl2p" + /protein_id="AAA98666.1" + /db_xref="GI:1293615" + /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF + TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN + VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE + VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE + TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV + YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG + DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ + DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA + NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA + CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN + NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ + SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS + YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK + HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL + VDFSNKSNVNVGQVKDIHGRIPEML" + gene complement(3300..4037) + /gene="REV7" + CDS complement(3300..4037) + /gene="REV7" + /codon_start=1 + /product="Rev7p" + /protein_id="AAA98667.1" + /db_xref="GI:1293616" + /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ + FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD + KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR + RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK + LISGDDKILNGVYSQYEEGESIFGSLF" +ORIGIN + 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg + 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct + 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa + 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg + 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa + 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa + 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat + 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga + 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc + 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga + 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta + 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag + 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa + 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata + 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga + 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac + 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg + 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc + 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa + 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca + 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac + 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa + 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag + 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct + 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac + 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa + 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc + 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata + 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca + 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc + 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc + 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca + 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc + 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg + 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt + 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc + 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg + 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca + 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata + 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg + 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga + 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt + 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat + 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt + 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc + 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag + 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta + 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa + 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact + 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt + 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa + 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag + 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct + 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt + 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact + 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa + 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg + 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt + 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc + 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca + 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc + 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc + 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat + 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa + 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga + 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat + 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc + 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc + 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa + 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg + 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc + 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt + 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg + 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg + 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt + 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt + 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat + 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc + 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct + 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta + 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac + 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct + 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct + 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc +// diff -r 619e0fcd9126 -r 6e589f267c14 test-data/s_cerevisiae_SCU49845.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/s_cerevisiae_SCU49845.gff3 Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,8 @@ +IX gbk_to_gff gene 687 3158 . + . ID=AXL2;Name=AXL2 +IX gbk_to_gff . 687 3158 . + . ID=Transcript:AXL2;Parent=AXL2 +IX gbk_to_gff CDS 687 3158 . + . Parent=Transcript:AXL2 +IX gbk_to_gff exon 687 3158 . + . Parent=Transcript:AXL2 +IX gbk_to_gff gene 3300 4037 . - . ID=REV7;Name=REV7 +IX gbk_to_gff . 3300 4037 . - . ID=Transcript:REV7;Parent=REV7 +IX gbk_to_gff CDS 3300 4037 . - . Parent=Transcript:REV7 +IX gbk_to_gff exon 3300 4037 . - . Parent=Transcript:REV7 diff -r 619e0fcd9126 -r 6e589f267c14 test-data/single_parent_feature_record.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/single_parent_feature_record.gff3 Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,10 @@ +chr1 . miRNA_primary_transcript 1380242 1380467 . - . ID=MI0031047;Alias=MI0031047;Name=gma-MIR9754 +chr1 . miRNA 1380249 1380270 . - . ID=MIMAT0036385;Alias=MIMAT0036385;Name=gma-miR9754;Derives_from=MI0031047 +chr1 . miRNA_primary_transcript 2410094 2410318 . + . ID=MI0016507;Alias=MI0016507;Name=gma-MIR4367 +chr1 . miRNA 2410242 2410263 . + . ID=MIMAT0018266;Alias=MIMAT0018266;Name=gma-miR4367;Derives_from=MI0016507 +chr1 . miRNA_primary_transcript 4792375 4792487 . - . ID=MI0021714;Alias=MI0021714;Name=gma-MIR395h +chr1 . miRNA 4792388 4792408 . - . ID=MIMAT0024920;Alias=MIMAT0024920;Name=gma-miR395h;Derives_from=MI0021714 +chr1 . miRNA_primary_transcript 4797903 4798018 . - . ID=MI0021715;Alias=MI0021715;Name=gma-MIR395i +chr1 . miRNA 4797916 4797936 . - . ID=MIMAT0024921;Alias=MIMAT0024921;Name=gma-miR395i;Derives_from=MI0021715 +chr1 . miRNA_primary_transcript 4810817 4810942 . - . ID=MI0021716;Alias=MI0021716;Name=gma-MIR395j +chr1 . miRNA 4810830 4810850 . - . ID=MIMAT0024922;Alias=MIMAT0024922;Name=gma-miR395j;Derives_from=MI0021716 diff -r 619e0fcd9126 -r 6e589f267c14 tool_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_conf.xml.sample Tue Nov 04 12:15:19 2014 -0500 @@ -0,0 +1,9 @@ +
+ + + + + + + +