# HG changeset patch # User brigidar # Date 1444405957 14400 # Node ID 0c769b205d1837d2be4a4ffce167b84674ba361b # Parent 53f1f83e80a1a7fdc7c32344232227c76edb4dd4 Deleted selected files diff -r 53f1f83e80a1 -r 0c769b205d18 helper.py --- a/helper.py Fri Oct 09 11:52:31 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,333 +0,0 @@ -#!/usr/bin/env python -""" -Common utility functions -""" - -import os -import re -import sys -import bz2 -import gzip -import numpy - -def init_gene(): - """ - Initializing the gene structure - """ - - gene_det = [('id', 'f8'), - ('anno_id', numpy.dtype), - ('confgenes_id', numpy.dtype), - ('name', 'S25'), - ('source', 'S25'), - ('gene_info', numpy.dtype), - ('alias', 'S15'), - ('name2', numpy.dtype), - ('strand', 'S2'), - ('score', 'S15'), - ('chr', 'S15'), - ('chr_num', numpy.dtype), - ('paralogs', numpy.dtype), - ('start', 'f8'), - ('stop', 'f8'), - ('transcripts', numpy.dtype), - ('transcript_type', numpy.dtype), - ('transcript_info', numpy.dtype), - ('transcript_score', numpy.dtype), - ('transcript_status', numpy.dtype), - ('transcript_valid', numpy.dtype), - ('exons', numpy.dtype), - ('exons_confirmed', numpy.dtype), - ('cds_exons', numpy.dtype), - ('utr5_exons', numpy.dtype), - ('utr3_exons', numpy.dtype), - ('tis', numpy.dtype), - ('tis_conf', numpy.dtype), - ('tis_info', numpy.dtype), - ('cdsStop', numpy.dtype), - ('cdsStop_conf', numpy.dtype), - ('cdsStop_info', numpy.dtype), - ('tss', numpy.dtype), - ('tss_info', numpy.dtype), - ('tss_conf', numpy.dtype), - ('cleave', numpy.dtype), - ('cleave_info', numpy.dtype), - ('cleave_conf', numpy.dtype), - ('polya', numpy.dtype), - ('polya_info', numpy.dtype), - ('polya_conf', numpy.dtype), - ('is_alt', 'f8'), - ('is_alt_spliced', 'f8'), - ('is_valid', numpy.dtype), - ('transcript_complete', numpy.dtype), - ('is_complete', numpy.dtype), - ('is_correctly_gff3_referenced', 'S5'), - ('splicegraph', numpy.dtype) ] - - return gene_det - -def open_file(fname): - """ - Open the file (supports .gz .bz2) and returns the handler - - @args fname: input file name for reading - @type fname: str - """ - - try: - if os.path.splitext(fname)[1] == ".gz": - FH = gzip.open(fname, 'rb') - elif os.path.splitext(fname)[1] == ".bz2": - FH = bz2.BZ2File(fname, 'rb') - else: - FH = open(fname, 'rU') - except Exception as error: - sys.exit(error) - - return FH - -def add_CDS_phase(strand, cds): - """ - Calculate CDS phase and add to the CDS exons - - @args strand: feature strand information - @type strand: +/- - @args cds: coding exon coordinates - @type cds: numpy array [[int, int, int]] - """ - - cds_region, cds_flag = [], 0 - if strand == '+': - for cdspos in cds: - if cds_flag == 0: - cdspos = (cdspos[0], cdspos[1], 0) - diff = (cdspos[1]-(cdspos[0]-1))%3 - else: - xy = 0 - if diff == 0: - cdspos = (cdspos[0], cdspos[1], 0) - elif diff == 1: - cdspos = (cdspos[0], cdspos[1], 2) - xy = 2 - elif diff == 2: - cdspos = (cdspos[0], cdspos[1], 1) - xy = 1 - diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 - cds_region.append(cdspos) - cds_flag = 1 - elif strand == '-': - cds.reverse() - for cdspos in cds: - if cds_flag == 0: - cdspos = (cdspos[0], cdspos[1], 0) - diff = (cdspos[1]-(cdspos[0]-1))%3 - else: - xy = 0 - if diff == 0: - cdspos = (cdspos[0], cdspos[1], 0) - elif diff == 1: - cdspos = (cdspos[0], cdspos[1], 2) - xy = 2 - elif diff == 2: - cdspos = (cdspos[0], cdspos[1], 1) - xy = 1 - diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3 - cds_region.append(cdspos) - cds_flag = 1 - cds_region.reverse() - return cds_region - -def buildUTR(cc, ec, strand): - """ - Build UTR regions from a given set of CDS and exon coordiantes of a gene - - @args cc: coding exon coordinates - @type cc: numpy array [[int, int, int]] - @args ec: exon coordinates - @type ec: numpy array [[int, int]] - @args strand: feature strand information - @type strand: +/- - """ - - utr5 = [] - utr3 = [] - if strand == '+': - cds_s = cc[0][0] - for ex in ec: - if ex[0] <= cds_s and cds_s <= ex[1]: - if ex[0] != cds_s:utr5.append((ex[0], cds_s-1)) - break - else: - utr5.append(ex) - cds_e = cc[-1][1] - for i in range(len(ec)): - i += 1 - if ec[-i][0] <= cds_e and cds_e <= ec[-i][1]: - if ec[-i][1] != cds_e:utr3.append((cds_e +1, ec[-i][1])) - break - else: - utr3.append(ec[-i]) - utr3.reverse() - elif strand == '-': - cds_s = cc[-1][1] - for i in range(len(ec)): - i += 1 - if ec[-i][0] <= cds_s and cds_s <= ec[-i][1]: - if ec[-i][1] != cds_s:utr5.append((cds_s+1, ec[-i][1])) - break - else: - utr5.append(ec[-i]) - utr5.reverse() - cds_e = cc[0][0] - for ex in ec: - if ex[0] <= cds_e and cds_e <= ex[1]: - if ex[0] != cds_e:utr3.append((ex[0], cds_e-1)) - break - else: - utr3.append(ex) - return utr5, utr3 - -def make_Exon_cod(strand_p, five_p_utr, cds_cod, three_p_utr): - """ - Create exon cordinates from UTR's and CDS region - - @args strand_p: feature strand information - @type strand_p: +/- - @args five_p_utr: five prime utr exon coordinates - @type five_p_utr: numpy array [[int, int]] - @args cds_cod: coding exon coordinates - @type cds_cod: numpy array [[int, int, int]] - @args three_p_utr: three prime utr exon coordinates - @type three_p_utr: numpy array [[int, int]] - """ - - exon_pos = [] - if strand_p == '+': - utr5_start, utr5_end = 0, 0 - if five_p_utr != []: - utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] - cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1] - jun_exon = [] - if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1: - jun_exon = [utr5_start, cds_5end] - if len(cds_cod) == 1: - five_prime_flag = 0 - if jun_exon != []: - five_p_utr = five_p_utr[:-1] - five_prime_flag = 1 - for utr5 in five_p_utr: - exon_pos.append(utr5) - jun_exon = [] - utr3_start, utr3_end = 0, 0 - if three_p_utr != []: - utr3_start = three_p_utr[0][0] - utr3_end = three_p_utr[0][1] - if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1: - jun_exon = [cds_5start, utr3_end] - three_prime_flag = 0 - if jun_exon != []: - cds_cod = cds_cod[:-1] - three_p_utr = three_p_utr[1:] - three_prime_flag = 1 - if five_prime_flag == 1 and three_prime_flag == 1: - exon_pos.append([utr5_start, utr3_end]) - if five_prime_flag == 1 and three_prime_flag == 0: - exon_pos.append([utr5_start, cds_5end]) - cds_cod = cds_cod[:-1] - if five_prime_flag == 0 and three_prime_flag == 1: - exon_pos.append([cds_5start, utr3_end]) - for cds in cds_cod: - exon_pos.append(cds) - for utr3 in three_p_utr: - exon_pos.append(utr3) - else: - if jun_exon != []: - five_p_utr = five_p_utr[:-1] - cds_cod = cds_cod[1:] - for utr5 in five_p_utr: - exon_pos.append(utr5) - exon_pos.append(jun_exon) if jun_exon != [] else '' - jun_exon = [] - utr3_start, utr3_end = 0, 0 - if three_p_utr != []: - utr3_start = three_p_utr[0][0] - utr3_end = three_p_utr[0][1] - cds_3start = cds_cod[-1][0] - cds_3end = cds_cod[-1][1] - if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: - jun_exon = [cds_3start, utr3_end] - if jun_exon != []: - cds_cod = cds_cod[:-1] - three_p_utr = three_p_utr[1:] - for cds in cds_cod: - exon_pos.append(cds) - exon_pos.append(jun_exon) if jun_exon != [] else '' - for utr3 in three_p_utr: - exon_pos.append(utr3) - elif strand_p == '-': - utr3_start, utr3_end = 0, 0 - if three_p_utr != []: - utr3_start = three_p_utr[-1][0] - utr3_end = three_p_utr[-1][1] - cds_3start = cds_cod[0][0] - cds_3end = cds_cod[0][1] - jun_exon = [] - if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1: - jun_exon = [utr3_start, cds_3end] - if len(cds_cod) == 1: - three_prime_flag = 0 - if jun_exon != []: - three_p_utr = three_p_utr[:-1] - three_prime_flag = 1 - for utr3 in three_p_utr: - exon_pos.append(utr3) - jun_exon = [] - (utr5_start, utr5_end) = (0, 0) - if five_p_utr != []: - utr5_start = five_p_utr[0][0] - utr5_end = five_p_utr[0][1] - if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1: - jun_exon = [cds_3start, utr5_end] - five_prime_flag = 0 - if jun_exon != []: - cds_cod = cds_cod[:-1] - five_p_utr = five_p_utr[1:] - five_prime_flag = 1 - if three_prime_flag == 1 and five_prime_flag == 1: - exon_pos.append([utr3_start, utr5_end]) - if three_prime_flag == 1 and five_prime_flag == 0: - exon_pos.append([utr3_start, cds_3end]) - cds_cod = cds_cod[:-1] - if three_prime_flag == 0 and five_prime_flag == 1: - exon_pos.append([cds_3start, utr5_end]) - for cds in cds_cod: - exon_pos.append(cds) - for utr5 in five_p_utr: - exon_pos.append(utr5) - else: - if jun_exon != []: - three_p_utr = three_p_utr[:-1] - cds_cod = cds_cod[1:] - for utr3 in three_p_utr: - exon_pos.append(utr3) - if jun_exon != []: - exon_pos.append(jun_exon) - jun_exon = [] - (utr5_start, utr5_end) = (0, 0) - if five_p_utr != []: - utr5_start = five_p_utr[0][0] - utr5_end = five_p_utr[0][1] - cds_5start = cds_cod[-1][0] - cds_5end = cds_cod[-1][1] - if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1: - jun_exon = [cds_5start, utr5_end] - if jun_exon != []: - cds_cod = cds_cod[:-1] - five_p_utr = five_p_utr[1:] - for cds in cds_cod: - exon_pos.append(cds) - if jun_exon != []: - exon_pos.append(jun_exon) - for utr5 in five_p_utr: - exon_pos.append(utr5) - return exon_pos