Mercurial repository: vipints/rdiff
changeset 2:233c30f91d66
Updated the Python-based GFF parsing module to handle GTF/GFF/GFF3 file types
| author | vipints <vipin@cbio.mskcc.org> |
|---|---|
| date | Tue, 08 Oct 2013 07:15:44 -0400 |
| parents | 0f80a5141704 |
| children | 29a698dc5c7e |
| files | rDiff/bin/genes_cell2struct rDiff/bin/rdiff rDiff/galaxy/rdiff.xml rDiff/src/genes_cell2struct.m rDiff/tools/GFFParser.py rDiff/tools/ParseGFF.py rDiff/tools/helper.py |
| diffstat | 7 files changed, 588 insertions(+), 600 deletions(-) |
--- a/rDiff/bin/genes_cell2struct	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-#/bin/bash
-# rDiff wrapper script to start the interpreter with the correct list of arguments
-
-set -e
-
-PROG=`basename $0`
-DIR=`dirname $0`
-
-exec ${DIR}/start_interpreter.sh ${PROG} "`${DIR}/genarglist.sh $@`"
--- a/rDiff/bin/rdiff	Thu Feb 14 23:38:36 2013 -0500
+++ b/rDiff/bin/rdiff	Tue Oct 08 07:15:44 2013 -0400
@@ -161,19 +161,19 @@
 echo
 #Check wether results directory exists
-#if [ -d $RDIFF_RES_DIR ]
-#then
-#	echo "Writting results to: $RDIFF_RES_DIR"
-#else
-#	mkdir $RDIFF_RES_DIR
-#fi
+if [ -d $RDIFF_RES_DIR ]
+then
+	echo "Writting results to: $RDIFF_RES_DIR"
+else
+	mkdir $RDIFF_RES_DIR
+fi
 
 echo 1a. load the genome annotation in GFF3 format, create an annotation object
 #\[Log file in ${RDIFF_RES_DIR}}/elegans-gff2anno.log\]
 
-export PYTHONPATH=$PYTHONPATH:$RDIFF_PYTHON_PATH:${SCIPY_PATH}
-${RDIFF_PYTHON_PATH} -W ignore::RuntimeWarning ${DIR}/../tools/ParseGFF.py ${GFF_INPUT} ${RDIFF_RES_DIR}/genes.mat #> ${RDIFF_RES_DIR}}/elegans-gff2anno.log
-${DIR}/../bin/genes_cell2struct ${RDIFF_RES_DIR}/genes.mat
+export PYTHONPATH=$PYTHONPATH:$RDIFF_PYTHON_PATH
+touch ${RDIFF_RES_DIR}/genes.mat
+${RDIFF_PYTHON_PATH} ${DIR}/../tools/GFFParser.py ${GFF_INPUT} ${RDIFF_RES_DIR}/genes.mat #> ${RDIFF_RES_DIR}}/elegans-gff2anno.log
 
 PARAM_VECT="$PARAM_VECT""GFF_INPUT:${RDIFF_RES_DIR}/genes.mat;"
 
@@ -183,4 +183,3 @@
 echo
 
 exec ${DIR}/start_interpreter.sh ${PROG} "$PARAM_VECT"
-
--- a/rDiff/galaxy/rdiff.xml	Thu Feb 14 23:38:36 2013 -0500
+++ b/rDiff/galaxy/rdiff.xml	Tue Oct 08 07:15:44 2013 -0400
@@ -1,19 +1,27 @@
 <tool id="rdiff-web" name="rDiff" version="0.3">
   <description>Determines differentially expressed transcripts from read alignments</description>
   <command interpreter="bash">
-    rdiff_run.sh $test_meth $anno_input $read_length $rdiff_out $rdiff_out.extra_files_path
-    #for $i in $replicate_groups
-    #for $j in $i.replicates
-    $j.bam_alignment?
-    #end for
-    :
-    #end for
+    rdiff_run.sh
+    $test_meth
+    $anno_input
+    $read_length
+    $rdiff_out $rdiff_out.extra_files_path
+##
+## replicate groups
+##
+#for $i in $replicate_groups
+#for $j in $i.replicates
+$j.bam_alignment?
+#end for
+:
+#end for
+    >> $Log_File
   </command>
 
   <inputs>
     <!-- genome annotation in GFF format -->
-    <param format="gff3" name="anno_input" type="data" label="Genome annotation in GFF3 file" help="A tab delimited format for storing sequence features and annotations"/>
+    <param format="gff,gtf,gff3" name="anno_input" type="data" label="Genome annotation file" help="A tab delimited format for storing sequence features and annotations(GFF/GTF/GFF3)"/>
 
     <!-- RNA-seq samples and corresponding replicates -->
     <repeat name="replicate_groups" title="Sample" min="2" max="2">
@@ -22,19 +30,21 @@
       </repeat>
     </repeat>
-    
+
     <!-- rDiff test method -->
     <param name="test_meth" type="select" label="rDiff testing method" help="Statistical test on differential transcript expression data." >
       <option value="poisson">Poisson</option>
       <option value="param">Parametric</option>
       <option value="nonparam">Nonparametric</option>
     </param>
+
+    <!-- read length uesd in the experiment-->
     <param name="read_length" type="text" value="75" help="default 75 nucleotide." label="Read length from sequencing experiment"/>
   </inputs>
 
   <outputs>
-    <data format="txt" name="rdiff_out" label="${tool.name} on ${on_string}: Differential Testing Result" />
-    <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: log"/>
+    <data format="txt" name="rdiff_out" label="${tool.name} on ${on_string}: Differential Expression" />
+    <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: Log"/>
   </outputs>
 
   <help>
--- a/rDiff/src/genes_cell2struct.m	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-function genes_cell2struct(anno_fname)
-% GENES_CELL2STRUCT Converts genes stored as a cell to struct.
-%
-% genes_cell2struct(anno_fname)
-%
-% -- input --
-% anno_fname: name of file where genes as cell are stored
-%
-% -- output --
-% genes as a struct
-%
-%
-% This program is free software; you can redistribute it and/or modify
-% it under the terms of the GNU General Public License as published by
-% the Free Software Foundation; either version 3 of the License, or
-% (at your option) any later version.
-%
-% Written (W) 2009-2011 Regina Bohnert, Gunnar Raetsch
-% Copyright (C) 2009-2011 Max Planck Society
-%
-
-load(anno_fname, 'genes');
-if iscell(genes)
-  genes_cell = genes;
-  clear genes;
-  for g = 1:length(genes_cell),
-    gene = genes_cell{g};
-    for e = 1:length(gene.exons)
-      gene.exons{e} = double(gene.exons{e});
-    end
-    gene.exons = reshape(gene.exons, 1, length(gene.exons));
-    gene.id = double(gene.id);
-    gene.start = double(gene.start);
-    gene.stop = double(gene.stop);
-    genes(g) = gene;
-  end
-  save(anno_fname, 'genes');
-end
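Note on the two deletions above: the old pipeline wrote a cell array from ParseGFF.py and then ran this Octave/MATLAB step to convert it to a struct array, while the new GFFParser.py (next file) saves a numpy structured array directly through scipy.io.savemat, and the corresponding call is also dropped from rDiff/bin/rdiff. A minimal sketch of checking the resulting genes.mat from Python; the output path is a placeholder, the field names follow init_gene_DE() in rDiff/tools/helper.py:

```python
# Sketch (placeholder path): inspect the genes.mat written by GFFParser.py.
import scipy.io as sio

mat = sio.loadmat("results/genes.mat", struct_as_record=False, squeeze_me=True)
for gene in mat["genes"]:
    # field names come from init_gene_DE() in rDiff/tools/helper.py
    print gene.name, gene.chr, gene.strand, int(gene.start), int(gene.stop)
```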
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/tools/GFFParser.py	Tue Oct 08 07:15:44 2013 -0400
@@ -0,0 +1,379 @@
+#!/usr/bin/env python
+"""
+Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file.
+
+Requirements:
+    Numpy :- http://numpy.org/
+    Scipy :- http://scipy.org/
+
+Copyright (C)
+
+2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+2012-2013 Memorial Sloan-Kettering Cancer Center, New York City, USA.
+"""
+
+import re
+import os
+import sys
+import urllib
+import numpy as np
+import scipy.io as sio
+from collections import defaultdict
+import helper as utils
+
+def _attribute_tags(col9):
+    """
+    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file
+    """
+    info = defaultdict(list)
+    is_gff = False
+
+    if not col9:
+        return is_gff, info
+
+    # trim the line ending semi-colon ucsc may have some white-space
+    col9 = col9.rstrip(';| ')
+    # attributes from 9th column
+    atbs = col9.split(" ; ")
+    if len(atbs) == 1:
+        atbs = col9.split("; ")
+    if len(atbs) == 1:
+        atbs = col9.split(";")
+    # check the GFF3 pattern which has key value pairs like:
+    gff3_pat = re.compile("\w+=")
+    # sometime GTF have: gene_id uc002zkg.1;
+    gtf_pat = re.compile("\s?\w+\s")
+
+    key_vals = []
+
+    if gff3_pat.match(atbs[0]): # gff3 pattern
+        is_gff = True
+        key_vals = [at.split('=') for at in atbs]
+    elif gtf_pat.match(atbs[0]): # gtf pattern
+        for at in atbs:
+            key_vals.append(at.strip().split(" ",1))
+    else:
+        # to handle attribute column has only single value
+        key_vals.append(['ID', atbs[0]])
+    # get key, val items
+    for item in key_vals:
+        key, val = item
+        # replace the double qoutes from feature identifier
+        val = re.sub('"', '', val)
+        # replace the web formating place holders to plain text format
+        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])
+
+    return is_gff, info
+
+def _spec_features_keywd(gff_parts):
+    """
+    Specify the feature key word according to the GFF specifications
+    """
+    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
+        try:
+            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
+            break
+        except KeyError:
+            pass
+    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
+        try:
+            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
+            break
+        except KeyError:
+            pass
+    ## TODO key words
+    for flat_name in ["Transcript", "CDS"]:
+        if gff_parts["info"].has_key(flat_name):
+            # parents
+            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
+                if not gff_parts['id']:
+                    gff_parts['id'] = gff_parts['info'][flat_name][0]
+                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
+            # children
+            elif gff_parts["type"] in ["intron", "exon", "pseudogenic_exon", "three_prime_UTR",
+                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
+                        "start_codon"]:
+                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
+            break
+    return gff_parts
+
+def Parse(ga_file):
+    """
+    Parsing GFF/GTF file based on feature relationship, it takes the input file.
+    """
+    child_map = defaultdict(list)
+    parent_map = dict()
+
+    ga_handle = utils._open_file(ga_file)
+
+    for rec in ga_handle:
+        rec = rec.strip('\n\r')
+
+        # skip empty line fasta identifier and commented line
+        if not rec or rec[0] in ['#', '>']:
+            continue
+        # skip the genome sequence
+        if not re.search('\t', rec):
+            continue
+
+        parts = rec.split('\t')
+        assert len(parts) >= 8, rec
+
+        # process the attribute column (9th column)
+        ftype, tags = _attribute_tags(parts[-1])
+        if not tags: # skip the line if no attribute column.
+            continue
+
+        # extract fields
+        if parts[1]:
+            tags["source"] = parts[1]
+        if parts[7]:
+            tags["phase"] = parts[7]
+
+        gff_info = dict()
+        gff_info['info'] = dict(tags)
+        #gff_info["is_gff3"] = ftype
+        gff_info['chr'] = parts[0]
+
+        if parts[3] and parts[4]:
+            gff_info['location'] = [int(parts[3]) ,
+                    int(parts[4])]
+            gff_info['type'] = parts[2]
+            gff_info['id'] = tags.get('ID', [''])[0]
+            if parts[6] in ['?', '.']:
+                parts[6] = None
+            gff_info['strand'] = parts[6]
+
+            # key word according to the GFF spec.
+            if not ftype:
+                gff_info = _spec_features_keywd(gff_info)
+
+            # link the feature relationships
+            if gff_info['info'].has_key('Parent'):
+                for p in gff_info['info']['Parent']:
+                    if p == gff_info['id']:
+                        gff_info['id'] = ''
+                        break
+                rec_category = 'child'
+            elif gff_info['id']:
+                rec_category = 'parent'
+            else:
+                rec_category = 'record'
+
+            # depends on the record category organize the features
+            if rec_category == 'child':
+                for p in gff_info['info']['Parent']:
+                    # create the data structure based on source and feature id
+                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
+                                    dict( type = gff_info['type'],
+                                    location = gff_info['location'],
+                                    strand = gff_info['strand'],
+                                    ID = gff_info['id'],
+                                    gene_id = gff_info['info'].get('GParent', '')
+                                    ))
+            elif rec_category == 'parent':
+                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
+                                    type = gff_info['type'],
+                                    location = gff_info['location'],
+                                    strand = gff_info['strand'],
+                                    name = tags.get('Name', [''])[0])
+            elif rec_category == 'record':
+                #TODO how to handle plain records?
+                c = 1
+    ga_handle.close()
+
+    # depends on file type create parent feature
+    if not ftype:
+        parent_map, child_map = _create_missing_feature_type(parent_map, child_map)
+
+    # connecting parent child relations
+    # // essentially the parent child features are here from any type of GTF/GFF2/GFF3 file
+    gene_mat = _format_gene_models(parent_map, child_map)
+
+    return gene_mat
+
+def _format_gene_models(parent_nf_map, child_nf_map):
+    """
+    Genarate GeneObject based on the parsed file contents
+
+    parent_map: parent features with source and chromosome information
+    child_map: transctipt and exon information are encoded
+    """
+    g_cnt = 0
+    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene_DE())
+
+    for pkey, pdet in parent_nf_map.items():
+        # considering only gene features
+        if not re.search(r'gene', pdet.get('type', '')):
+            continue
+        # infer the gene start and stop if not there in the
+        if not pdet.get('location', []):
+            GNS, GNE = [], []
+            # multiple number of transcripts
+            for L1 in child_nf_map[pkey]:
+                GNS.append(L1.get('location', [])[0])
+                GNE.append(L1.get('location', [])[1])
+            GNS.sort()
+            GNE.sort()
+            pdet['location'] = [GNS[0], GNE[-1]]
+        orient = pdet.get('strand', '')
+
+        gene[g_cnt]['id'] = g_cnt +1
+        gene[g_cnt]['chr'] = pkey[0]
+        gene[g_cnt]['source'] = pkey[1]
+        gene[g_cnt]['name'] = pkey[-1]
+        gene[g_cnt]['start'] = pdet.get('location', [])[0]
+        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
+        gene[g_cnt]['strand'] = orient
+
+        # default value
+        gene[g_cnt]['is_alt_spliced'] = 0
+        if len(child_nf_map[pkey]) > 1:
+            gene[g_cnt]['is_alt_spliced'] = 1
+
+        # complete sub-feature for all transcripts
+        dim = len(child_nf_map[pkey])
+        TRS = np.zeros((dim,), dtype=np.object)
+        EXON = np.zeros((dim,), dtype=np.object)
+
+        # fetching corresponding transcripts
+        for xq, Lv1 in enumerate(child_nf_map[pkey]):
+
+            TID = Lv1.get('ID', '')
+            TRS[xq]= np.array([TID])
+
+            orient = Lv1.get('strand', '')
+
+            # fetching different sub-features
+            child_feat = defaultdict(list)
+            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
+                E_TYP = Lv2.get('type', '')
+                child_feat[E_TYP].append(Lv2.get('location'))
+
+            # make exon coordinate from cds and utr regions
+            if not child_feat.get('exon'):
+                if child_feat.get('CDS'):
+                    exon_cod = utils.make_Exon_cod( orient,
+                                NonetoemptyList(child_feat.get('five_prime_UTR')),
+                                NonetoemptyList(child_feat.get('CDS')),
+                                NonetoemptyList(child_feat.get('three_prime_UTR')))
+                    child_feat['exon'] = exon_cod
+                else:
+                    # searching through keys to find a pattern describing exon feature
+                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
+                    child_feat['exon'] = child_feat[ex_key_pattern[0]]
+                    # TODO only UTR's
+
+            # make general ascending order of coordinates
+            if orient == '-':
+                for etype, excod in child_feat.items():
+                    if len(excod) > 1:
+                        if excod[0][0] > excod[-1][0]:
+                            excod.reverse()
+                            child_feat[etype] = excod
+
+            # add sub-feature # make array for export to different out
+            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
+
+        # add sub-features to the parent gene feature
+        gene[g_cnt]['transcripts'] = TRS
+        gene[g_cnt]['exons'] = EXON
+
+        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
+                                Name = pdet.get('name'),
+                                Source = pkey[1])
+        g_cnt += 1
+
+    ## deleting empty gene records from the main array
+    for XP, ens in enumerate(gene):
+        if ens[0]==0:
+            break
+
+    XQC = range(XP, len(gene)+1)
+    gene = np.delete(gene, XQC)
+
+    return gene
+
+def NonetoemptyList(XS):
+    """
+    Convert a None type to empty list
+    """
+    return [] if XS is None else XS
+
+def _create_missing_feature_type(p_feat, c_feat):
+    """
+    GFF/GTF file defines only child features. This function tries to create
+    the parent feature from the information provided in the attribute column.
+
+    example:
+    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
+    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
+    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";
+
+    This function gets the parsed feature annotations.
+    """
+    child_n_map = defaultdict(list)
+    for fid, det in c_feat.items():
+        # get the details from grand child
+        GID = STRD = None
+        SPOS, EPOS = [], []
+        TYP = dict()
+        for gchild in det:
+            GID = gchild.get('gene_id', [''])[0]
+            SPOS.append(gchild.get('location', [])[0])
+            EPOS.append(gchild.get('location', [])[1])
+            STRD = gchild.get('strand', '')
+            TYP[gchild.get('type', '')] = 1
+        SPOS.sort()
+        EPOS.sort()
+
+        # infer transcript type
+        transcript_type = 'transcript'
+        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type
+
+        # gene id and transcript id are same
+        transcript_id = fid[-1]
+        if GID == transcript_id:
+            transcript_id = 'Transcript:' + str(GID)
+
+        # level -1 feature type
+        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
+                            location = [], ## infer location based on multiple transcripts
+                            strand = STRD,
+                            name = GID )
+        # level -2 feature type
+        child_n_map[(fid[0], fid[1], GID)].append(
+                            dict( type = transcript_type,
+                            location = [SPOS[0], EPOS[-1]],
+                            strand = STRD,
+                            ID = transcript_id,
+                            gene_id = '' ))
+        # reorganizing the grand child
+        for gchild in det:
+            child_n_map[(fid[0], fid[1], transcript_id)].append(
+                            dict( type = gchild.get('type', ''),
+                            location = gchild.get('location'),
+                            strand = gchild.get('strand'),
+                            ID = gchild.get('ID'),
+                            gene_id = '' ))
+    return p_feat, child_n_map
+
+
+## General instruction to use the above functions:
+## Usage: GFFParser.py in.gff3 out.mat
+
+try:
+    gff_file = sys.argv[1]
+    out_mat = sys.argv[2]
+except:
+    print __doc__
+    sys.exit(-1)
+
+## Parse the file accoring to the type and returns the genes informations --
+gene_struct = Parse(gff_file)
+
+## Write the gene annotations to a matlab struct array format --
+sio.savemat(out_mat,
+            mdict = dict(genes = gene_struct),
+            format = '5',
+            oned_as = 'row')
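Because the argument handling at the bottom of GFFParser.py runs at module level, the file behaves as a stand-alone script; the updated rDiff/bin/rdiff wrapper drives it exactly that way in step 1a. A minimal sketch of the same call from Python, where the interpreter name and both paths are placeholders standing in for ${RDIFF_PYTHON_PATH}, ${GFF_INPUT} and ${RDIFF_RES_DIR}:

```python
# Sketch: invoke GFFParser.py as step 1a of bin/rdiff does (placeholder paths).
import subprocess

subprocess.check_call(["python", "rDiff/tools/GFFParser.py",
                       "annotation.gtf",        # GFF3, GTF or GFF2-style input
                       "results/genes.mat"])    # MATLAB/Octave struct array output
```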
--- a/rDiff/tools/ParseGFF.py	Thu Feb 14 23:38:36 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,532 +0,0 @@
-#!/usr/bin/env python
-"""
-Extract genome annotation from a GFF3 (a tab delimited
-format for storing sequence features and annotations:
-http://www.sequenceontology.org/gff3.shtml) file.
-
-Usage: ParseGFF.py in.gff3 out.mat
-
-Requirements:
-   Scipy :- http://scipy.org/
-   Numpy :- http://numpy.org/
-
-Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany
-              2012- Memorial Sloan-Kettering Cancer Center, New York City, USA
-"""
-
-import re, sys
-import scipy.io as sio
-import numpy as np
-
-def addCDSphase(strand, cds):
-    """Add CDS phase to the CDS exons
-    """
-    cds_region, cds_flag = [], 0
-    if strand == '+':
-        for cdspos in cds:
-            if cds_flag == 0:
-                cdspos = (cdspos[0], cdspos[1], 0)
-                diff = (cdspos[1]-(cdspos[0]-1))%3
-            else:
-                xy = 0
-                if diff == 0:
-                    cdspos = (cdspos[0], cdspos[1], 0)
-                elif diff == 1:
-                    cdspos = (cdspos[0], cdspos[1], 2)
-                    xy = 2
-                elif diff == 2:
-                    cdspos = (cdspos[0], cdspos[1], 1)
-                    xy = 1
-                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
-            cds_region.append(cdspos)
-            cds_flag = 1
-    elif strand == '-':
-        cds.reverse()
-        for cdspos in cds:
-            if cds_flag == 0:
-                cdspos = (cdspos[0], cdspos[1], 0)
-                diff = (cdspos[1]-(cdspos[0]-1))%3
-            else:
-                xy = 0
-                if diff == 0:
-                    cdspos = (cdspos[0], cdspos[1], 0)
-                elif diff == 1:
-                    cdspos = (cdspos[0], cdspos[1], 2)
-                    xy = 2
-                elif diff == 2:
-                    cdspos = (cdspos[0], cdspos[1], 1)
-                    xy = 1
-                diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
-            cds_region.append(cdspos)
-            cds_flag = 1
-        cds_region.reverse()
-    return cds_region
-
-def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
-    """Create exon cordinates from UTR's and CDS region
-    """
-    exon_pos = []
-    if strand_p == '+':
-        utr5_start, utr5_end = 0, 0
-        if five_p_utr != []:
-            utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
-        cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
-        jun_exon = []
-        if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
-            jun_exon = [utr5_start, cds_5end]
-        if len(cds_cod) == 1:
-            five_prime_flag = 0
-            if jun_exon != []:
-                five_p_utr = five_p_utr[:-1]
-                five_prime_flag = 1
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-            jun_exon = []
-            utr3_start, utr3_end = 0, 0
-            if three_p_utr != []:
-                utr3_start = three_p_utr[0][0]
-                utr3_end = three_p_utr[0][1]
-            if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
-                jun_exon = [cds_5start, utr3_end]
-            three_prime_flag = 0
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                three_p_utr = three_p_utr[1:]
-                three_prime_flag = 1
-            if five_prime_flag == 1 and three_prime_flag == 1:
-                exon_pos.append([utr5_start, utr3_end])
-            if five_prime_flag == 1 and three_prime_flag == 0:
-                exon_pos.append([utr5_start, cds_5end])
-                cds_cod = cds_cod[:-1]
-            if five_prime_flag == 0 and three_prime_flag == 1:
-                exon_pos.append([cds_5start, utr3_end])
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-        else:
-            if jun_exon != []:
-                five_p_utr = five_p_utr[:-1]
-                cds_cod = cds_cod[1:]
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-            exon_pos.append(jun_exon) if jun_exon != [] else ''
-            jun_exon = []
-            utr3_start, utr3_end = 0, 0
-            if three_p_utr != []:
-                utr3_start = three_p_utr[0][0]
-                utr3_end = three_p_utr[0][1]
-            cds_3start = cds_cod[-1][0]
-            cds_3end = cds_cod[-1][1]
-            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
-                jun_exon = [cds_3start, utr3_end]
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                three_p_utr = three_p_utr[1:]
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            exon_pos.append(jun_exon) if jun_exon != [] else ''
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-    elif strand_p == '-':
-        utr3_start, utr3_end = 0, 0
-        if three_p_utr != []:
-            utr3_start = three_p_utr[-1][0]
-            utr3_end = three_p_utr[-1][1]
-        cds_3start = cds_cod[0][0]
-        cds_3end = cds_cod[0][1]
-        jun_exon = []
-        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
-            jun_exon = [utr3_start, cds_3end]
-        if len(cds_cod) == 1:
-            three_prime_flag = 0
-            if jun_exon != []:
-                three_p_utr = three_p_utr[:-1]
-                three_prime_flag = 1
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-            jun_exon = []
-            (utr5_start, utr5_end) = (0, 0)
-            if five_p_utr != []:
-                utr5_start = five_p_utr[0][0]
-                utr5_end = five_p_utr[0][1]
-            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
-                jun_exon = [cds_3start, utr5_end]
-            five_prime_flag = 0
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                five_p_utr = five_p_utr[1:]
-                five_prime_flag = 1
-            if three_prime_flag == 1 and five_prime_flag == 1:
-                exon_pos.append([utr3_start, utr5_end])
-            if three_prime_flag == 1 and five_prime_flag == 0:
-                exon_pos.append([utr3_start, cds_3end])
-                cds_cod = cds_cod[:-1]
-            if three_prime_flag == 0 and five_prime_flag == 1:
-                exon_pos.append([cds_3start, utr5_end])
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-        else:
-            if jun_exon != []:
-                three_p_utr = three_p_utr[:-1]
-                cds_cod = cds_cod[1:]
-            for utr3 in three_p_utr:
-                exon_pos.append(utr3)
-            if jun_exon != []:
-                exon_pos.append(jun_exon)
-            jun_exon = []
-            (utr5_start, utr5_end) = (0, 0)
-            if five_p_utr != []:
-                utr5_start = five_p_utr[0][0]
-                utr5_end = five_p_utr[0][1]
-            cds_5start = cds_cod[-1][0]
-            cds_5end = cds_cod[-1][1]
-            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
-                jun_exon = [cds_5start, utr5_end]
-            if jun_exon != []:
-                cds_cod = cds_cod[:-1]
-                five_p_utr = five_p_utr[1:]
-            for cds in cds_cod:
-                exon_pos.append(cds)
-            if jun_exon != []:
-                exon_pos.append(jun_exon)
-            for utr5 in five_p_utr:
-                exon_pos.append(utr5)
-    return exon_pos
-
-def init_gene():
-    """Initializing the gene structure
-    """
-    gene_details=dict(
-                    id = '',
-                    anno_id = [],
-                    confgenes_id = [],
-                    name = '',
-                    source = '',
-                    gene_info = {},
-                    alias = '',
-                    name2 = [],
-                    strand = '',
-                    chr = '',
-                    chr_num = [],
-                    paralogs = [],
-                    start = '',
-                    stop = '',
-                    transcripts = [],
-                    transcript_info = [],
-                    transcript_status = [],
-                    transcript_valid = [],
-                    exons = [],
-                    exons_confirmed = [],
-                    cds_exons = [],
-                    utr5_exons = [],
-                    utr3_exons = [],
-                    tis = [],
-                    tis_conf = [],
-                    tis_info = [],
-                    cdsStop = [],
-                    cdsStop_conf = [],
-                    cdsStop_info = [],
-                    tss = [],
-                    tss_info = [],
-                    tss_conf = [],
-                    cleave = [],
-                    cleave_info = [],
-                    cleave_conf = [],
-                    polya = [],
-                    polya_info = [],
-                    polya_conf = [],
-                    is_alt = [],
-                    is_alt_spliced = 0,
-                    is_valid = [],
-                    transcript_complete = [],
-                    is_complete = [],
-                    is_correctly_gff3_referenced = '',
-                    splicegraph = []
-                    )
-    return gene_details
-
-def FeatureValueFormat(singlegene):
-    """Make feature value compactable to write in a .mat format
-    """
-    comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
-    for i in range(len(singlegene['exons'])):
-        comp_exon[i]= np.array(singlegene['exons'][i])
-    singlegene['exons'] = comp_exon
-    comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object)
-    for i in range(len(singlegene['cds_exons'])):
-        comp_cds[i]= np.array(singlegene['cds_exons'][i])
-    singlegene['cds_exons'] = comp_cds
-    comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object)
-    for i in range(len(singlegene['utr3_exons'])):
-        comp_utr3[i]= np.array(singlegene['utr3_exons'][i])
-    singlegene['utr3_exons'] = comp_utr3
-    comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object)
-    for i in range(len(singlegene['utr5_exons'])):
-        comp_utr5[i]= np.array(singlegene['utr5_exons'][i])
-    singlegene['utr5_exons'] = comp_utr5
-    comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
-    for i in range(len(singlegene['transcripts'])):
-        comp_transcripts[i]= np.array(singlegene['transcripts'][i])
-    singlegene['transcripts'] = comp_transcripts
-    comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object)
-    for i in range(len(singlegene['tss'])):
-        comp_tss[i]= np.array(singlegene['tss'][i])
-    singlegene['tss'] = comp_tss
-    comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object)
-    for i in range(len(singlegene['tis'])):
-        comp_tis[i]= np.array(singlegene['tis'][i])
-    singlegene['tis'] = comp_tis
-    comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object)
-    for i in range(len(singlegene['cleave'])):
-        comp_cleave[i]= np.array(singlegene['cleave'][i])
-    singlegene['cleave'] = comp_cleave
-    comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object)
-    for i in range(len(singlegene['cdsStop'])):
-        comp_cdsStop[i]= np.array(singlegene['cdsStop'][i])
-    singlegene['cdsStop'] = comp_cdsStop
-
-    return singlegene
-
-def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
-    """Creating Coding/Non-coding gene models from parsed GFF objects.
-    """
-    gene_counter, gene_models = 1, []
-    for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature
-        if gene_entry in transcripts_cmpt:
-            gene=init_gene()
-            gene['id']=gene_counter
-            gene['name']=gene_entry[1]
-            gene['chr']=genes_cmpt[gene_entry]['chr']
-            gene['source']=genes_cmpt[gene_entry]['source']
-            gene['start']=genes_cmpt[gene_entry]['start']
-            gene['stop']=genes_cmpt[gene_entry]['stop']
-            gene['strand']=genes_cmpt[gene_entry]['strand']
-            if not gene['strand'] in ['+', '-']:
-                gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
-            if len(transcripts_cmpt[gene_entry])>1:
-                gene['is_alt_spliced'] = 1
-                gene['is_alt'] = 1
-            gtype=[]
-            for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags
-                gene['transcripts'].append(tids['ID'])
-                gtype.append(tids['type'])
-                exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], []
-                if (gene['chr'], tids['ID']) in exons_cmpt:
-                    exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in utr5_cmpt:
-                    utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in utr3_cmpt:
-                    utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
-                if (gene['chr'], tids['ID']) in cds_cmpt:
-                    cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
-                if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS
-                    if cds_cod != []:
-                        exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod)
-
-                if gene['strand']=='-': ## general order to coordinates
-                    if len(exon_cod) >1:
-                        if exon_cod[0][0] > exon_cod[-1][0]:
-                            exon_cod.reverse()
-                    if len(cds_cod) >1:
-                        if cds_cod[0][0] > cds_cod[-1][0]:
-                            cds_cod.reverse()
-                    if len(utr3_cod) >1:
-                        if utr3_cod[0][0] > utr3_cod[-1][0]:
-                            utr3_cod.reverse()
-                    if len(utr5_cod) >1:
-                        if utr5_cod[0][0] > utr5_cod[-1][0]:
-                            utr5_cod.reverse()
-
-                tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region
-                if cds_cod != []:
-                    if gene['strand'] == '+':
-                        tis = [cds_cod[0][0]]
-                        cdsStop = [cds_cod[-1][1]-3]
-                    elif gene['strand'] == '-':
-                        tis = [cds_cod[-1][1]]
-                        cdsStop = [cds_cod[0][0]+3]
-                if utr5_cod != []:
-                    if gene['strand'] == '+':
-                        tss = [utr5_cod[0][0]]
-                    elif gene['strand'] == '-':
-                        tss = [utr5_cod[-1][1]]
-                if utr3_cod != []:
-                    if gene['strand'] == '+':
-                        cleave = [utr3_cod[-1][1]]
-                    elif gene['strand'] == '-':
-                        cleave = [utr3_cod[0][0]]
-
-                cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene
-                if cds_cod != []: ## adding phase to the CDS region
-                    cds_cod_phase = addCDSphase(gene['strand'], cds_cod)
-                    cds_status = 1
-                    gene['cds_exons'].append(cds_cod_phase)
-
-                if exon_cod != []:
-                    exon_status = 1
-                if utr5_cod != [] or utr3_cod != []:
-                    utr_status = 1
-                if cds_status != 0 and exon_status != 0 and utr_status != 0:
-                    gene['transcript_status'].append(1)
-                else:
-                    gene['transcript_status'].append(0)
-
-                if exon_cod: ## final check point for a valid gene model
-                    gene['exons'].append(exon_cod)
-                    gene['utr3_exons'].append(utr3_cod)
-                    gene['utr5_exons'].append(utr5_cod)
-                    gene['tis'].append(tis)
-                    gene['cdsStop'].append(cdsStop)
-                    gene['tss'].append(tss)
-                    gene['cleave'].append(cleave)
-
-            gtype=list(set(gtype)) ## different types
-            gene['gene_info']=dict(ID=gene_entry[1],
-                        Source=genes_cmpt[gene_entry]['source'],
-                        Type=gtype)
-            gene=FeatureValueFormat(gene) ## get prepare for MAT writing
-            gene_counter+=1
-            gene_models.append(gene)
-    return gene_models
-
-def GFFParse(gff_file):
-    """Parsing GFF file based on feature relationship.
-    """
-    genes, utr5, exons=dict(), dict(), dict()
-    transcripts, utr3, cds=dict(), dict(), dict()
-    # TODO Include growing key words of different non-coding/coding transcripts
-    features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna']
-    gff_handle=open(gff_file, "rU")
-    for gff_line in gff_handle:
-        gff_line=gff_line.strip('\n\r').split('\t')
-        if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line
-            continue
-        if len(gff_line)==1: # skip fasta sequence/empty line if present
-            continue
-        assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line
-        if '' in gff_line: # skip this line if there any field with an empty value
-            print 'Skipping..', '\t'.join(gff_line)
-            continue
-        if gff_line[-1][-1]==';': # trim the last ';' character
-            gff_line[-1]=gff_line[-1].strip(';')
-        if gff_line[2].lower() in ['gene', 'pseudogene']:
-            gid, gene_info=None, dict()
-            gene_info['start']=int(gff_line[3])
-            gene_info['stop']=int(gff_line[4])
-            gene_info['chr']=gff_line[0]
-            gene_info['source']=gff_line[1]
-            gene_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=') # gff attributes are separated by key=value pair
-                if attb[0]=='ID':
-                    gid=attb[1]
-                    break
-            genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
-        elif gff_line[2].lower() in features:
-            gid, mrna_info=None, dict()
-            mrna_info['start']=int(gff_line[3])
-            mrna_info['stop']=int(gff_line[4])
-            mrna_info['chr']=gff_line[0]
-            mrna_info['strand']=gff_line[6]
-            mrna_info['type'] = gff_line[2]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    gid=attb[1]
-                elif attb[0]=='ID':
-                    mrna_info[attb[0]]=attb[1]
-            for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein
-                if (gff_line[0], fid) in transcripts:
-                    transcripts[(gff_line[0], fid)].append(mrna_info)
-                else:
-                    transcripts[(gff_line[0], fid)]=[mrna_info]
-        elif gff_line[2].lower() in ['exon']:
-            tids, exon_info=None, dict()
-            exon_info['start']=int(gff_line[3])
-            exon_info['stop']=int(gff_line[4])
-            exon_info['chr']=gff_line[0]
-            exon_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in exons:
-                    exons[(gff_line[0], tid)].append(exon_info)
-                else:
-                    exons[(gff_line[0], tid)]=[exon_info]
-        elif gff_line[2].lower() in ['five_prime_utr']:
-            utr5_info, tids=dict(), None
-            utr5_info['start']=int(gff_line[3])
-            utr5_info['stop']=int(gff_line[4])
-            utr5_info['chr']=gff_line[0]
-            utr5_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in utr5:
-                    utr5[(gff_line[0], tid)].append(utr5_info)
-                else:
-                    utr5[(gff_line[0], tid)]=[utr5_info]
-        elif gff_line[2].lower() in ['cds']:
-            cds_info, tids=dict(), None
-            cds_info['start']=int(gff_line[3])
-            cds_info['stop']=int(gff_line[4])
-            cds_info['chr']=gff_line[0]
-            cds_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in cds:
-                    cds[(gff_line[0], tid)].append(cds_info)
-                else:
-                    cds[(gff_line[0], tid)]=[cds_info]
-        elif gff_line[2].lower() in ['three_prime_utr']:
-            utr3_info, tids=dict(), None
-            utr3_info['start']=int(gff_line[3])
-            utr3_info['stop']=int(gff_line[4])
-            utr3_info['chr']=gff_line[0]
-            utr3_info['strand']=gff_line[6]
-            for attb in gff_line[-1].split(';'):
-                attb=attb.split('=')
-                if attb[0]=='Parent':
-                    tids=attb[1]
-                    break
-            for tid in tids.split(','):
-                if (gff_line[0], tid) in utr3:
-                    utr3[(gff_line[0], tid)].append(utr3_info)
-                else:
-                    utr3[(gff_line[0], tid)]=[utr3_info]
-    gff_handle.close()
-    return genes, transcripts, exons, utr3, utr5, cds
-
-def __main__():
-    """extract genome feature information
-    """
-    try:
-        gff_file = sys.argv[1]
-        mat_file = sys.argv[2]
-    except:
-        print __doc__
-        sys.exit(-1)
-
-    genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file)
-    gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
-    # TODO Write to matlab/octave struct instead of cell arrays.
-    sio.savemat(mat_file,
-                    mdict=dict(genes=gene_models),
-                    format='5',
-                    oned_as='row')
-
-if __name__=='__main__':
-    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/tools/helper.py	Tue Oct 08 07:15:44 2013 -0400
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+"""
+Common utility functions
+"""
+
+import os
+import re
+import sys
+import gzip
+import bz2
+import numpy
+
+def init_gene_DE():
+    """
+    Initializing the gene structure for DE
+    """
+    gene_det = [('id', 'f8'),
+            ('chr', 'S15'),
+            ('exons', numpy.dtype),
+            ('gene_info', numpy.dtype),
+            ('is_alt_spliced', 'f8'),
+            ('name', 'S25'),
+            ('source', 'S25'),
+            ('start', 'f8'),
+            ('stop', 'f8'),
+            ('strand', 'S2'),
+            ('transcripts', numpy.dtype)]
+
+    return gene_det
+
+def _open_file(fname):
+    """
+    Open the file (supports .gz .bz2) and returns the handler
+    """
+    try:
+        if os.path.splitext(fname)[1] == ".gz":
+            FH = gzip.open(fname, 'rb')
+        elif os.path.splitext(fname)[1] == ".bz2":
+            FH = bz2.BZ2File(fname, 'rb')
+        else:
+            FH = open(fname, 'rU')
+    except Exception as error:
+        sys.exit(error)
+    return FH
+
+def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr):
+    """
+    Create exon cordinates from UTR's and CDS region
+    """
+    exon_pos = []
+    if strand_p == '+':
+        utr5_start, utr5_end = 0, 0
+        if five_p_utr != []:
+            utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
+        cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
+        jun_exon = []
+        if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
+            jun_exon = [utr5_start, cds_5end]
+        if len(cds_cod) == 1:
+            five_prime_flag = 0
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                five_prime_flag = 1
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []:
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr3_end]
+            three_prime_flag = 0
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+                three_prime_flag = 1
+            if five_prime_flag == 1 and three_prime_flag == 1:
+                exon_pos.append([utr5_start, utr3_end])
+            if five_prime_flag == 1 and three_prime_flag == 0:
+                exon_pos.append([utr5_start, cds_5end])
+                cds_cod = cds_cod[:-1]
+            if five_prime_flag == 0 and three_prime_flag == 1:
+                exon_pos.append([cds_5start, utr3_end])
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+        else:
+            if jun_exon != []:
+                five_p_utr = five_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            jun_exon = []
+            utr3_start, utr3_end = 0, 0
+            if three_p_utr != []:
+                utr3_start = three_p_utr[0][0]
+                utr3_end = three_p_utr[0][1]
+            cds_3start = cds_cod[-1][0]
+            cds_3end = cds_cod[-1][1]
+            if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
+                jun_exon = [cds_3start, utr3_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                three_p_utr = three_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            exon_pos.append(jun_exon) if jun_exon != [] else ''
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+    elif strand_p == '-':
+        utr3_start, utr3_end = 0, 0
+        if three_p_utr != []:
+            utr3_start = three_p_utr[-1][0]
+            utr3_end = three_p_utr[-1][1]
+        cds_3start = cds_cod[0][0]
+        cds_3end = cds_cod[0][1]
+        jun_exon = []
+        if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
+            jun_exon = [utr3_start, cds_3end]
+        if len(cds_cod) == 1:
+            three_prime_flag = 0
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                three_prime_flag = 1
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]
+            if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
+                jun_exon = [cds_3start, utr5_end]
+            five_prime_flag = 0
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+                five_prime_flag = 1
+            if three_prime_flag == 1 and five_prime_flag == 1:
+                exon_pos.append([utr3_start, utr5_end])
+            if three_prime_flag == 1 and five_prime_flag == 0:
+                exon_pos.append([utr3_start, cds_3end])
+                cds_cod = cds_cod[:-1]
+            if three_prime_flag == 0 and five_prime_flag == 1:
+                exon_pos.append([cds_3start, utr5_end])
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+        else:
+            if jun_exon != []:
+                three_p_utr = three_p_utr[:-1]
+                cds_cod = cds_cod[1:]
+            for utr3 in three_p_utr:
+                exon_pos.append(utr3)
+            if jun_exon != []:
+                exon_pos.append(jun_exon)
+            jun_exon = []
+            (utr5_start, utr5_end) = (0, 0)
+            if five_p_utr != []:
+                utr5_start = five_p_utr[0][0]
+                utr5_end = five_p_utr[0][1]
+            cds_5start = cds_cod[-1][0]
+            cds_5end = cds_cod[-1][1]
+            if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
+                jun_exon = [cds_5start, utr5_end]
+            if jun_exon != []:
+                cds_cod = cds_cod[:-1]
+                five_p_utr = five_p_utr[1:]
+            for cds in cds_cod:
+                exon_pos.append(cds)
+            if jun_exon != []:
+                exon_pos.append(jun_exon)
+            for utr5 in five_p_utr:
+                exon_pos.append(utr5)
+    return exon_pos
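A small, self-contained sketch of the two helpers GFFParser.py relies on; the coordinates and the .gz file name are made up purely to show how exon blocks are stitched together from UTR and CDS pieces:

```python
# Sketch with made-up coordinates: exons assembled from 5'UTR, CDS and 3'UTR blocks.
import helper

# one forward-strand transcript; blocks that touch are merged into single exons
five_utr  = [[100, 150]]
cds       = [[151, 300], [400, 500]]
three_utr = [[501, 550]]
print helper.make_Exon_cod('+', five_utr, cds, three_utr)
# expected along the lines of: [[100, 300], [400, 550]]

# _open_file transparently handles plain, .gz and .bz2 annotation files
fh = helper._open_file("annotation.gff3.gz")   # placeholder file name
print fh.readline()
fh.close()
```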