annotate deseq-hts_1.0/tools/ParseGFF.py @ 11:cec4b4fb30be draft default tip

DEXSeq version 1.6 added
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 08:22:45 -0400
parents 8ab01cc29c4b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
1 #!/usr/bin/env python
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
2 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
3 Extract genome annotation from a GFF3 (a tab delimited format
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
4 for storing sequence features and annotations:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
5 http://www.sequenceontology.org/gff3.shtml) file.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
6
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
7 Usage: ParseGFF.py in.gff3 out.mat
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
8
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
9 Requirements:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
10 Scipy :- http://scipy.org/
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
11 Numpy :- http://numpy.org/
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
12
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
13 Copyright (C) 2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
14 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
15 import re, sys, os
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
16 import scipy.io as sio
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
17 import numpy as np
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
18
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
19 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
20 """Create exon cordinates from UTR's and CDS region
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
21 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
22 exon_pos = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
23 if strand_p == '+':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
24 utr5_start, utr5_end = 0, 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
25 if five_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
26 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
27 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
28 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
29 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
30 jun_exon = [utr5_start, cds_5end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
31 if len(cds_cod) == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
32 five_prime_flag = 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
33 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
34 five_p_utr = five_p_utr[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
35 five_prime_flag = 1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
36 for utr5 in five_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
37 exon_pos.append(utr5)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
38 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
39 utr3_start, utr3_end = 0, 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
40 if three_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
41 utr3_start = three_p_utr[0][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
42 utr3_end = three_p_utr[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
43 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
44 jun_exon = [cds_5start, utr3_end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
45 three_prime_flag = 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
46 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
47 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
48 three_p_utr = three_p_utr[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
49 three_prime_flag = 1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
50 if five_prime_flag == 1 and three_prime_flag == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
51 exon_pos.append([utr5_start, utr3_end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
52 if five_prime_flag == 1 and three_prime_flag == 0:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
53 exon_pos.append([utr5_start, cds_5end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
54 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
55 if five_prime_flag == 0 and three_prime_flag == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
56 exon_pos.append([cds_5start, utr3_end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
57 for cds in cds_cod:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
58 exon_pos.append(cds)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
59 for utr3 in three_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
60 exon_pos.append(utr3)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
61 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
62 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
63 five_p_utr = five_p_utr[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
64 cds_cod = cds_cod[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
65 for utr5 in five_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
66 exon_pos.append(utr5)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
67 exon_pos.append(jun_exon) if jun_exon != [] else ''
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
68 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
69 utr3_start, utr3_end = 0, 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
70 if three_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
71 utr3_start = three_p_utr[0][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
72 utr3_end = three_p_utr[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
73 cds_3start = cds_cod[-1][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
74 cds_3end = cds_cod[-1][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
75 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
76 jun_exon = [cds_3start, utr3_end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
77 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
78 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
79 three_p_utr = three_p_utr[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
80 for cds in cds_cod:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
81 exon_pos.append(cds)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
82 exon_pos.append(jun_exon) if jun_exon != [] else ''
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
83 for utr3 in three_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
84 exon_pos.append(utr3)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
85 elif strand_p == '-':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
86 utr3_start, utr3_end = 0, 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
87 if three_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
88 utr3_start = three_p_utr[-1][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
89 utr3_end = three_p_utr[-1][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
90 cds_3start = cds_cod[0][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
91 cds_3end = cds_cod[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
92 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
93 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
94 jun_exon = [utr3_start, cds_3end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
95 if len(cds_cod) == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
96 three_prime_flag = 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
97 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
98 three_p_utr = three_p_utr[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
99 three_prime_flag = 1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
100 for utr3 in three_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
101 exon_pos.append(utr3)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
102 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
103 (utr5_start, utr5_end) = (0, 0)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
104 if five_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
105 utr5_start = five_p_utr[0][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
106 utr5_end = five_p_utr[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
107 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
108 jun_exon = [cds_3start, utr5_end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
109 five_prime_flag = 0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
110 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
111 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
112 five_p_utr = five_p_utr[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
113 five_prime_flag = 1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
114 if three_prime_flag == 1 and five_prime_flag == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
115 exon_pos.append([utr3_start, utr5_end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
116 if three_prime_flag == 1 and five_prime_flag == 0:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
117 exon_pos.append([utr3_start, cds_3end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
118 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
119 if three_prime_flag == 0 and five_prime_flag == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
120 exon_pos.append([cds_3start, utr5_end])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
121 for cds in cds_cod:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
122 exon_pos.append(cds)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
123 for utr5 in five_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
124 exon_pos.append(utr5)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
125 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
126 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
127 three_p_utr = three_p_utr[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
128 cds_cod = cds_cod[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
129 for utr3 in three_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
130 exon_pos.append(utr3)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
131 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
132 exon_pos.append(jun_exon)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
133 jun_exon = []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
134 (utr5_start, utr5_end) = (0, 0)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
135 if five_p_utr != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
136 utr5_start = five_p_utr[0][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
137 utr5_end = five_p_utr[0][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
138 cds_5start = cds_cod[-1][0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
139 cds_5end = cds_cod[-1][1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
140 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
141 jun_exon = [cds_5start, utr5_end]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
142 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
143 cds_cod = cds_cod[:-1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
144 five_p_utr = five_p_utr[1:]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
145 for cds in cds_cod:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
146 exon_pos.append(cds)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
147 if jun_exon != []:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
148 exon_pos.append(jun_exon)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
149 for utr5 in five_p_utr:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
150 exon_pos.append(utr5)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
151 return exon_pos
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
152
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
153 def init_gene():
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
154 """Initializing the gene structure
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
155 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
156 gene_details=dict(chr='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
157 exons=[],
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
158 gene_info={},
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
159 id='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
160 is_alt_spliced=0,
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
161 name='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
162 source='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
163 start='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
164 stop='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
165 strand='',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
166 transcripts=[])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
167 return gene_details
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
168
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
169 def FeatureValueFormat(singlegene):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
170 """Make feature value compactable to write in a .mat format
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
171 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
172 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
173 for i in range(len(singlegene['exons'])):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
174 comp_exon[i]= np.array(singlegene['exons'][i])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
175 singlegene['exons'] = comp_exon
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
176 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
177 for i in range(len(singlegene['transcripts'])):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
178 comp_transcripts[i] = np.array(singlegene['transcripts'][i])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
179 singlegene['transcripts'] = comp_transcripts
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
180 return singlegene
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
181
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
182 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
183 """Creating Coding/Non-coding gene models from parsed GFF objects.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
184 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
185 gene_counter, gene_models=1, []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
186 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
187 if gene_entry in transcripts_cmpt:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
188 gene=init_gene()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
189 gene['id']=gene_counter
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
190 gene['name']=gene_entry[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
191 gene['chr']=genes_cmpt[gene_entry]['chr']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
192 gene['source']=genes_cmpt[gene_entry]['source']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
193 gene['start']=genes_cmpt[gene_entry]['start']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
194 gene['stop']=genes_cmpt[gene_entry]['stop']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
195 gene['strand']=genes_cmpt[gene_entry]['strand']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
196 if not gene['strand'] in ['+', '-']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
197 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
198 gene['gene_info']=dict(ID=gene_entry[1])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
199 if len(transcripts_cmpt[gene_entry])>1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
200 gene['is_alt_spliced'] = 1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
201 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
202 gene['transcripts'].append(tids['ID'])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
203 if len(exons_cmpt) != 0:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
204 if (gene['chr'], tids['ID']) in exons_cmpt:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
205 exon_cod=[[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
206 else: ## build exon coordinates from UTR3, UTR5 and CDS
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
207 utr5_pos, cds_pos, utr3_pos = [], [], []
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
208 if (gene['chr'], tids['ID']) in utr5_cmpt:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
209 utr5_pos=[[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
210 if (gene['chr'], tids['ID']) in cds_cmpt:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
211 cds_pos=[[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
212 if (gene['chr'], tids['ID']) in utr3_cmpt:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
213 utr3_pos=[[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
214 exon_cod=createExon(gene['strand'], utr5_pos, cds_pos, utr3_pos)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
215 if gene['strand']=='-':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
216 if len(exon_cod) >1:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
217 if exon_cod[0][0] > exon_cod[-1][0]:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
218 exon_cod.reverse()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
219 if exon_cod:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
220 gene['exons'].append(exon_cod)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
221 gene=FeatureValueFormat(gene) # get prepare for MAT writing
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
222 gene_counter+=1
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
223 gene_models.append(gene)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
224 return gene_models
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
225
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
226 def GFFParse(gff_file):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
227 """Parsing GFF file based on feature relationship.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
228 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
229 genes, utr5, exons=dict(), dict(), dict()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
230 transcripts, utr3, cds=dict(), dict(), dict()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
231 # TODO Include growing key words of different non-coding/coding transcripts
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
232 features=['mRNA', 'transcript', 'ncRNA', 'miRNA', 'pseudogenic_transcript', 'rRNA', 'snoRNA', 'snRNA', 'tRNA', 'scRNA']
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
233 gff_handle=open(gff_file, "rU")
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
234 for gff_line in gff_handle:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
235 gff_line=gff_line.strip('\n\r').split('\t')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
236 if re.match(r'#|>', gff_line[0]): # skip commented line and fasta identifier line
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
237 continue
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
238 if len(gff_line)==1: # skip fasta sequence/empty line if present
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
239 continue
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
240 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
241 if '' in gff_line: # skip this line if there any field with an empty value
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
242 print 'Skipping..', '\t'.join(gff_line)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
243 continue
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
244 if gff_line[-1][-1]==';': # trim the last ';' character
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
245 gff_line[-1]=gff_line[-1].strip(';')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
246 if gff_line[2] in ['gene', 'pseudogene']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
247 gid, gene_info=None, dict()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
248 gene_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
249 gene_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
250 gene_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
251 gene_info['source']=gff_line[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
252 gene_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
253 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
254 attb=attb.split('=') # gff attributes are separated by key=value pair
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
255 if attb[0]=='ID':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
256 gid=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
257 break
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
258 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
259 elif gff_line[2] in features:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
260 gid, mrna_info=None, dict()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
261 mrna_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
262 mrna_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
263 mrna_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
264 mrna_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
265 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
266 attb=attb.split('=')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
267 if attb[0]=='Parent':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
268 gid=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
269 elif attb[0]=='ID':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
270 mrna_info[attb[0]]=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
271 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
272 if (gff_line[0], fid) in transcripts:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
273 transcripts[(gff_line[0], fid)].append(mrna_info)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
274 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
275 transcripts[(gff_line[0], fid)]=[mrna_info]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
276 elif gff_line[2] in ['exon']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
277 tids, exon_info=None, dict()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
278 exon_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
279 exon_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
280 exon_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
281 exon_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
282 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
283 attb=attb.split('=')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
284 if attb[0]=='Parent':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
285 tids=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
286 break
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
287 for tid in tids.split(','):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
288 if (gff_line[0], tid) in exons:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
289 exons[(gff_line[0], tid)].append(exon_info)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
290 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
291 exons[(gff_line[0], tid)]=[exon_info]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
292 elif gff_line[2] in ['five_prime_UTR']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
293 utr5_info, tids=dict(), None
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
294 utr5_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
295 utr5_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
296 utr5_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
297 utr5_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
298 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
299 attb=attb.split('=')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
300 if attb[0]=='Parent':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
301 tids=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
302 break
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
303 for tid in tids.split(','):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
304 if (gff_line[0], tid) in utr5:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
305 utr5[(gff_line[0], tid)].append(utr5_info)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
306 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
307 utr5[(gff_line[0], tid)]=[utr5_info]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
308 elif gff_line[2] in ['CDS']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
309 cds_info, tids=dict(), None
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
310 cds_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
311 cds_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
312 cds_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
313 cds_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
314 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
315 attb=attb.split('=')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
316 if attb[0]=='Parent':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
317 tids=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
318 break
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
319 for tid in tids.split(','):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
320 if (gff_line[0], tid) in cds:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
321 cds[(gff_line[0], tid)].append(cds_info)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
322 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
323 cds[(gff_line[0], tid)]=[cds_info]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
324 elif gff_line[2] in ['three_prime_UTR']:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
325 utr3_info, tids=dict(), None
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
326 utr3_info['start']=int(gff_line[3])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
327 utr3_info['stop']=int(gff_line[4])
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
328 utr3_info['chr']=gff_line[0]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
329 utr3_info['strand']=gff_line[6]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
330 for attb in gff_line[-1].split(';'):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
331 attb=attb.split('=')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
332 if attb[0]=='Parent':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
333 tids=attb[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
334 break
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
335 for tid in tids.split(','):
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
336 if (gff_line[0], tid) in utr3:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
337 utr3[(gff_line[0], tid)].append(utr3_info)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
338 else:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
339 utr3[(gff_line[0], tid)]=[utr3_info]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
340 gff_handle.close()
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
341 return genes, transcripts, exons, utr3, utr5, cds
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
342
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
343 def __main__():
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
344 """This function provides a best way to extract genome feature
1
8ab01cc29c4b Uploaded
vipints
parents: 0
diff changeset
345 information from a GFF3 file for the downstream processing.
0
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
346 """
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
347 try:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
348 gff_file = sys.argv[1]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
349 mat_file = sys.argv[2]
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
350 except:
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
351 print __doc__
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
352 sys.exit(-1)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
353 genes, transcripts, exons, utr3, utr5, cds=GFFParse(gff_file)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
354 gene_models=CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
355 # TODO Write to matlab/octave struct instead of cell arrays.
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
356 sio.savemat(mat_file,
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
357 mdict=dict(genes=gene_models),
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
358 format='5',
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
359 oned_as='row')
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
360
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
361 if __name__=='__main__':
94a108763d9e deseq-hts version 1.0 wraps the DESeq 1.6.0
vipints
parents:
diff changeset
362 __main__()