0
|
1 import os
|
|
2 import argparse
|
|
3 import sys
|
|
4 import string
|
|
5
|
|
6 from galaxy.model.orm import *
|
|
7 import logging
|
|
8 from galaxy import eggs
|
|
9 eggs.require('SQLAlchemy')
|
|
10 import sqlalchemy
|
|
11
|
|
12
|
|
class CummerbundParser(object):
    """Export tables of a Cummerbund SQLite database as tab-separated files.

    Every public method other than ``generate_file`` writes one report to
    ``self.fh``.  ``generate_file`` opens ``<table>.tabular`` in the current
    directory, binds it to ``self.fh`` and calls the method named ``table``.
    """

    # Leading columns shared by all *_fpkm reports.
    _TRACKING_HEADER = ('tracking_id', 'class_code', 'nearest_ref_id', 'gene_id',
                        'gene_short_name', 'tss_id', 'locus', 'length', 'coverage')
    # Per-sample header suffixes and the data columns they are filled from.
    # The two tuples must stay in the same order.
    _FPKM_SUFFIXES = ('FPKM', 'conf_lo', 'conf_hi', 'status')
    _FPKM_VALUE_COLUMNS = ('fpkm', 'conf_lo', 'conf_hi', 'quant_status')
    _COUNT_SUFFIXES = ('count', 'count_variance', 'count_uncertainty_var',
                       'count_dispersion_var', 'status')
    _COUNT_VALUE_COLUMNS = ('count', 'variance', 'uncertainty', 'dispersion', 'status')

    def __init__(self, opts):
        """Store the database path from ``opts.filename`` and connect to it."""
        self.cummerbund_db = opts.filename
        self.__connect_database()

    def generate_file(self, table):
        """Write the report called ``table`` to ``<table>.tabular``.

        Only public methods defined on the class are accepted as report
        names.  Instance attributes (``session``, ``fh``, ...), private
        helpers and ``generate_file`` itself are rejected, so an unexpected
        name prints a message instead of creating a file and crashing.
        """
        handler = None
        if not table.startswith('_') and table != 'generate_file':
            # Look the name up on the class so instance attributes never match.
            if callable(getattr(type(self), table, None)):
                handler = getattr(self, table)
        if handler is None:
            print('Table %s is not supported or does not exist.' % table)
            return
        with open('%s.tabular' % table, 'w') as self.fh:
            handler()

    def __connect_database(self):
        """Open a session on the SQLite file and keep it as ``self.session``."""
        database_connection = 'sqlite:///%s' % os.path.abspath(self.cummerbund_db)
        # Initialize the database connection.
        engine = create_engine(database_connection)
        self.session = scoped_session(sessionmaker(bind=engine, autoflush=False, autocommit=True))

    def __write_line(self, line):
        """Write one tab-separated line to ``self.fh``.

        ``None`` becomes ``-``; floats holding a whole number are written
        without a fractional part; inf, -inf and NaN are written as ``str``
        renders them (``int()`` cannot convert them).
        """
        columns = []
        for col in line:
            if col is None:
                columns.append('-')
            elif isinstance(col, float):
                # NaN is the only float that differs from itself.
                non_finite = col != col or col in (float('inf'), float('-inf'))
                if not non_finite and col == int(col):
                    columns.append(str(int(col)))
                else:
                    columns.append(str(col))
            else:
                columns.append(str(col))
        self.fh.write('\t'.join(columns) + '\n')

    def __get_sample_names(self, table, column, identifier):
        """Return the sample names stored in ``table`` for ``identifier``, sorted ascending."""
        query = 'SELECT sample_name FROM %s WHERE %s = :identifier ORDER BY sample_name ASC' % (table, column)
        # The identifier is passed as a bound parameter so quotes in it cannot break the SQL.
        return [row[0] for row in self.session.execute(query, {'identifier': identifier})]

    def __get_per_sample_headers(self, identifiers, table, column, suffixes):
        """Return ``<sample>_<suffix>`` labels for every sample of every identifier row."""
        columns = []
        for identifier in identifiers:
            for sample_name in self.__get_sample_names(table, column, identifier[0]):
                columns.extend(['%s_%s' % (sample_name, suffix) for suffix in suffixes])
        return columns

    def __get_per_sample_values(self, value_columns, table, column, identifier):
        """Return ``value_columns`` of ``table`` for ``identifier``, flattened in sample-name order."""
        query = 'SELECT %s FROM %s WHERE %s = :identifier ORDER BY sample_name ASC' % (
            ', '.join(value_columns), table, column)
        values = []
        for row in self.session.execute(query, {'identifier': identifier}):
            values.extend(list(row))
        return values

    def __write_tracking_rows(self, query, data_table, column):
        """Write each row of ``query`` followed by its per-sample FPKM values.

        The first selected column is used as the identifier to look up in
        ``data_table``.
        """
        for row in self.session.execute(query):
            output_data = list(row)
            output_data.extend(self.__get_per_sample_values(self._FPKM_VALUE_COLUMNS, data_table, column, row[0]))
            self.__write_line(output_data)

    def __get_diff_from_table(self, table, identifier):
        """Write the rows of a *DiffData ``table`` joined with ``genes`` on gene_id."""
        columns = ['${table}.${identifier}', '${table}.gene_id', 'genes.gene_short_name', 'genes.locus',
                   '${table}.sample_1', '${table}.sample_2', '${table}.status',
                   '${table}.value_1', '${table}.value_2', '${table}.JS_dist',
                   '${table}.test_stat', '${table}.p_value', '${table}.q_value',
                   '${table}.significant']
        query = string.Template('SELECT %s FROM ${table} JOIN genes on ${table}.gene_id = genes.gene_id' % ', '.join(columns))
        result = self.session.execute(query.safe_substitute(table=table, identifier=identifier))
        self.__write_line(['test_id', 'gene_id', 'gene', 'locus', 'sample_1',
                           'sample_2', 'status', 'value_1', 'value_2', 'sqrt(JS)',
                           'test_stat', 'p_value', 'q_value', 'significant'])
        for row in result:
            self.__write_line(row)

    def __get_read_group_data(self, table, identifier):
        """Write the per-replicate rows of ``table``, keyed by the ``identifier`` column."""
        header = ['tracking_id', 'condition', 'replicate', 'raw_frags',
                  'internal_scaled_frags', 'external_scaled_frags', 'FPKM',
                  'effective_length', 'status']
        columns = [identifier, 'sample_name', 'replicate', 'raw_frags',
                   'internal_scaled_frags', 'external_scaled_frags', 'fpkm',
                   'effective_length', 'status']
        self.__write_line(header)
        for row in self.session.execute('SELECT %s FROM %s' % (', '.join(columns), table)):
            self.__write_line(row)

    def __get_exp_diff(self, table, data_table, data_table_as, column):
        """Write ``data_table`` (aliased ``data_table_as``) joined with ``table`` on ``column``."""
        header = ['test_id', 'gene_id', 'gene', 'locus', 'sample_1', 'sample_2',
                  'status', 'value_1', 'value_2', 'log2(fold_change)', 'test_stat',
                  'p_value', 'q_value', 'significant']
        columns = ['${dtas}.${column}', '${table}.gene_id', '${table}.gene_short_name', '${table}.locus',
                   '${dtas}.sample_1', '${dtas}.sample_2', '${dtas}.status',
                   '${dtas}.value_1', '${dtas}.value_2', '${dtas}.log2_fold_change',
                   '${dtas}.test_stat', '${dtas}.p_value', '${dtas}.q_value',
                   '${dtas}.significant']
        query = string.Template('SELECT %s FROM ${dtab} as ${dtas} JOIN ${table} on ${dtas}.${column} = ${table}.${column}' % ', '.join(columns))
        self.__write_line(header)
        for row in self.session.execute(query.safe_substitute(dtas=data_table_as, dtab=data_table, table=table, column=column)):
            self.__write_line(row)

    def __get_per_sample_fpkm(self, identifiers, table, column):
        """Return the per-sample FPKM header labels for the given identifier rows."""
        return self.__get_per_sample_headers(identifiers, table, column, self._FPKM_SUFFIXES)

    def __get_fpkms(self, table, data_table, column):
        """Write the FPKM tracking report for ``table`` with values from ``data_table``.

        The per-sample values are selected as fpkm, conf_lo, conf_hi,
        quant_status so that they line up with the header labels.
        """
        # NOTE(review): `column` is selected twice, so the 'tss_id' output
        # column repeats the tracking id.  For isoforms this was presumably
        # meant to be the TSS group id -- confirm against the schema.
        tss_columns = [column, 'class_code', 'nearest_ref_id', 'gene_id',
                       'gene_short_name', column, 'locus', 'length', 'coverage']
        select_list = ', '.join(tss_columns)
        # The sample names of the first row define the per-sample header.
        first_group = self.session.execute('SELECT %s FROM %s LIMIT 1' % (select_list, table))
        output_cols = list(self._TRACKING_HEADER)
        output_cols.extend(self.__get_per_sample_fpkm(identifiers=first_group, column=column, table=data_table))
        self.__write_line(output_cols)
        self.__write_tracking_rows('SELECT %s FROM %s' % (select_list, table), data_table, column)

    def __get_count_data(self, table, column):
        """Write the count tracking report stored in ``table``, keyed by ``column``."""
        output_cols = ['tracking_id']
        first_group = self.session.execute('SELECT %s FROM %s LIMIT 1' % (column, table))
        output_cols.extend(self.__get_per_sample_count_cols(identifiers=first_group, table=table, column=column))
        self.__write_line(output_cols)
        self.__get_per_sample_count_data(table=table, column=column)

    def __get_per_sample_count_data(self, table, column):
        """Write one line per distinct ``column`` value with its per-sample count values."""
        result = self.session.execute('SELECT DISTINCT(%s) FROM %s' % (column, table))
        for row in result:
            tracking_id = row[0]
            output_data = [tracking_id]
            output_data.extend(self.__get_per_sample_values(self._COUNT_VALUE_COLUMNS, table, column, tracking_id))
            self.__write_line(output_data)

    def __get_per_sample_count_cols(self, identifiers, table, column):
        """Return the per-sample count header labels for the given identifier rows."""
        return self.__get_per_sample_headers(identifiers, table, column, self._COUNT_SUFFIXES)

    def splicing_diff(self):
        """Write the splicingDiffData report."""
        self.__get_diff_from_table('splicingDiffData', 'TSS_group_id')

    def promoters_diff(self):
        """Write the promoterDiffData report."""
        self.__get_diff_from_table('promoterDiffData', 'gene_id')

    def cds_diff(self):
        """Write the CDSDiffData report."""
        self.__get_diff_from_table('CDSDiffData', 'gene_id')

    def tss_fpkm(self):
        """Write the FPKM tracking report for the TSS / TSSData tables."""
        self.__get_fpkms(data_table='TSSData', table='TSS', column='TSS_group_id')

    def isoform_fpkm(self):
        """Write the FPKM tracking report for the isoforms / isoformData tables."""
        self.__get_fpkms(data_table='isoformData', table='isoforms', column='isoform_id')

    def genes_fpkm(self):
        """Write the FPKM tracking report for genes, listing each gene's TSS group ids."""
        first_gene = self.session.execute('SELECT gene_id FROM genes LIMIT 1')
        output_cols = list(self._TRACKING_HEADER)
        output_cols.extend(self.__get_per_sample_fpkm(identifiers=first_gene, column='gene_id', table='geneData'))
        self.__write_line(output_cols)
        data_columns = ['genes.gene_id', 'genes.class_code', 'genes.nearest_ref_id', 'genes.gene_id', 'genes.gene_short_name',
                        'GROUP_CONCAT(TSS.TSS_group_id)', 'genes.locus', 'genes.length', 'genes.coverage']
        query = 'SELECT %s FROM genes JOIN TSS on TSS.gene_id = genes.gene_id GROUP BY genes.gene_id' % ', '.join(data_columns)
        self.__write_tracking_rows(query, 'geneData', 'gene_id')

    def cds_fpkm(self):
        """Write the FPKM tracking report for the CDS / CDSData tables."""
        first_cds = self.session.execute('SELECT CDS_id FROM CDS LIMIT 1')
        output_cols = list(self._TRACKING_HEADER)
        output_cols.extend(self.__get_per_sample_fpkm(identifiers=first_cds, column='CDS_id', table='CDSData'))
        self.__write_line(output_cols)
        data_columns = ['CDS_id', 'class_code', 'nearest_ref_id', 'gene_id', 'gene_short_name',
                        'GROUP_CONCAT(TSS_group_id)', 'locus', 'length', 'coverage']
        query = 'SELECT %s FROM CDS GROUP BY CDS_id' % ', '.join(data_columns)
        self.__write_tracking_rows(query, 'CDSData', 'CDS_id')

    def tss_count_tracking(self):
        """Write the count tracking report from TSSCount."""
        self.__get_count_data(table='TSSCount', column='TSS_group_id')

    def isoform_count(self):
        """Write the count tracking report from isoformCount."""
        self.__get_count_data(table='isoformCount', column='isoform_id')

    def genes_count(self):
        """Write the count tracking report from geneCount."""
        self.__get_count_data(table='geneCount', column='gene_id')

    def cds_count(self):
        """Write the count tracking report from CDSCount."""
        self.__get_count_data(table='CDSCount', column='CDS_id')

    def tss_group_exp(self):
        """Write the expression differences from TSSExpDiffData joined with TSS."""
        # Same columns, join and header as the other *_exp_diff reports.
        self.__get_exp_diff(table='TSS', data_table='TSSExpDiffData', data_table_as='TEDD', column='TSS_group_id')

    def run_info(self):
        """Write the param/value pairs of the runInfo table."""
        self.__write_line(['param', 'value'])
        for row in self.session.execute('SELECT param, value FROM runInfo'):
            self.__write_line(row)

    def read_groups(self):
        """Write the rows of the replicates table."""
        self.__write_line(['file', 'condition', 'replicate_num', 'total_mass', 'norm_mass', 'internal_scale', 'external_scale'])
        for row in self.session.execute('SELECT file, sample_name, replicate, total_mass, norm_mass, internal_scale, external_scale FROM replicates'):
            self.__write_line(row)

    def isoform_exp_diff(self):
        """Write the expression differences from isoformExpDiffData joined with isoforms."""
        self.__get_exp_diff(table='isoforms', data_table='isoformExpDiffData', data_table_as='iED', column='isoform_id')

    def gene_exp_diff(self):
        """Write the expression differences from geneExpDiffData joined with genes."""
        self.__get_exp_diff(table='genes', data_table='geneExpDiffData', data_table_as='gEDD', column='gene_id')

    def cds_exp_diff(self):
        """Write the expression differences from CDSExpDiffData joined with CDS."""
        self.__get_exp_diff(table='CDS', data_table='CDSExpDiffData', data_table_as='CED', column='CDS_id')

    def tss_rg(self):
        """Write the per-replicate report from TSSReplicateData."""
        self.__get_read_group_data(table='TSSReplicateData', identifier='TSS_group_id')

    def isoform_rg(self):
        """Write the per-replicate report from isoformReplicateData."""
        self.__get_read_group_data(table='isoformReplicateData', identifier='isoform_id')

    def gene_rg(self):
        """Write the per-replicate report from geneReplicateData."""
        self.__get_read_group_data(table='geneReplicateData', identifier='gene_id')

    def cds_rg(self):
        """Write the per-replicate report from CDSReplicateData."""
        self.__get_read_group_data(table='CDSReplicateData', identifier='CDS_id')

    def var_model(self):
        """Write the rows of the varModel table; header names double as column names."""
        header = ['condition', 'locus', 'compatible_count_mean', 'compatible_count_var', 'total_count_mean', 'total_count_var', 'fitted_var']
        self.__write_line(header)
        for row in self.session.execute('SELECT %s FROM varModel' % ', '.join(header)):
            self.__write_line(row)
|
|
267
|
|
if __name__ == '__main__':
    # Command-line entry point: export every requested table of one database.
    parser = argparse.ArgumentParser()
    # Without a database path there is nothing to connect to, so let argparse
    # report the missing option instead of failing later on a None path.
    parser.add_argument('--file', dest='filename', required=True)
    # --tables may be repeated; default to an empty list so that omitting it
    # is a no-op rather than an attempt to iterate over None.
    parser.add_argument('--tables', dest='tables', action='append', default=[])
    opts = parser.parse_args()
    cb = CummerbundParser(opts)
    for table in opts.tables:
        cb.generate_file(table)
|