comparison gmap/lib/galaxy/datatypes/gmap.py @ 2:52da588232b0

Add datatypes for maps and snpindex, add iit_store and snpindex tools, update GMAP and GSNAP to use these.
author Jim Johnson <jj@umn.edu>
date Fri, 21 Oct 2011 11:38:55 -0500
parents d58d272914e7
children f4b4c1712e39
comparison
equal deleted inserted replaced
1:30d42bb409b8 2:52da588232b0
1 """ 1 """
2 GMAP indexes 2 GMAP indexes
3 """ 3 """
4 import logging 4 import logging
5 import os,os.path,re 5 import os,os.path,re
6 import data
6 from data import Text 7 from data import Text
8 from galaxy import util
7 from metadata import MetadataElement 9 from metadata import MetadataElement
8 10
9 log = logging.getLogger(__name__) 11 log = logging.getLogger(__name__)
10 12
11 class GmapDB( Text ): 13 class GmapDB( Text ):
59 def display_peek( self, dataset ): 61 def display_peek( self, dataset ):
60 try: 62 try:
61 return dataset.peek 63 return dataset.peek
62 except: 64 except:
63 return "GMAP index file" 65 return "GMAP index file"
66
64 def sniff( self, filename ): 67 def sniff( self, filename ):
65 return False 68 return False
66 def set_meta( self, dataset, overwrite = True, **kwd ): 69 def set_meta( self, dataset, overwrite = True, **kwd ):
67 """ 70 """
68 Expecting: 71 Expecting:
111 dataset.metadata.cmet = True 114 dataset.metadata.cmet = True
112 elif m.groups()[4] == 'a2i': 115 elif m.groups()[4] == 'a2i':
113 dataset.metadata.atoi = True 116 dataset.metadata.atoi = True
114 dataset.metadata.kmers = kmers.keys() 117 dataset.metadata.kmers = kmers.keys()
115 118
116 ## class IntervalIndexTree( Text ): 119 class GmapSnpIndex( Text ):
117 ## """ 120 """
118 ## A GMAP Interval Index Tree Map 121 A GMAP SNP index created by snpindex
119 ## created by iit_store 122 """
120 ## (/path/to/map)/(mapname).iit 123 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
121 ## """ 124 MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True )
122 ## MetadataElement( name="map_name", desc="The map name for this index set", default='unknown', set_in_upload=True, readonly=False ) 125
123 ## file_ext = 'iit' 126 file_ext = 'gmapsnpindex'
124 ## is_binary = True 127 is_binary = True
125 ## composite_type = 'auto_primary_file' 128 composite_type = 'auto_primary_file'
126 ## allow_datatype_change = False 129 allow_datatype_change = False
127 ## 130
128 ## class IntervalAnnotation(data.Text): 131 def generate_primary_file( self, dataset = None ):
129 ## """ 132 """
130 ## Class describing a GMAP Interval format: 133 This is called only at upload to write the html file
131 ## >label coords optional_tag 134 cannot rename the datasets here - they come with the default unfortunately
132 ## optional_annotation (which may be zero, one, or multiple lines) 135 """
133 ## The coords should be of the form: 136 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
134 ## chr:position 137
135 ## chr:startposition..endposition 138 def regenerate_primary_file(self,dataset):
136 ## """ 139 """
137 ## file_ext = 'gmapannotation' 140 cannot do this until we are setting metadata
138 ## 141 """
139 ## class SpliceSiteAnnotation(IntervalAnnotation): 142 bn = dataset.metadata.db_name
140 ## file_ext = 'gmapsplicesites' 143 log.info( "GmapDB regenerate_primary_file %s" % (bn))
141 ## """ 144 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
142 ## Example: 145 for i,name in enumerate(dataset.metadata.maps):
143 ## >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678 146 rval.append( '<li>%s' % name)
144 ## >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678 147 rval.append( '</ul></html>' )
145 ## >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179 148 f = file(dataset.file_name,'w')
146 ## >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179 149 f.write("\n".join( rval ))
147 ## >NM_004449.ERG.exon1 21:38955452..38955451 donor 783 150 f.write('\n')
148 ## >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783 151 f.close()
149 ## >NM_004449.ERG.exon2 21:38878638..38878637 donor 360 152 def set_peek( self, dataset, is_multi_byte=False ):
150 ## >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360 153 log.info( "GmapSnpIndex set_peek %s" % (dataset))
151 ## """ 154 if not dataset.dataset.purged:
152 ## 155 dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
153 ## class IntronAnnotation(IntervalAnnotation): 156 dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
154 ## file_ext = 'gmapintrons' 157 else:
155 ## """ 158 dataset.peek = 'file does not exist'
156 ## Example: 159 dataset.blurb = 'file purged from disk'
157 ## >NM_004448.ERBB2.intron1 17:35110090..35116769 160 def display_peek( self, dataset ):
158 ## >NM_004448.ERBB2.intron2 17:35116920..35118100 161 try:
159 ## >NM_004449.ERG.intron1 21:38955452..38878739 162 return dataset.peek
160 ## >NM_004449.ERG.intron2 21:38878638..38869541 163 except:
161 ## """ 164 return "GMAP SNP index"
162 ## 165
163 ## class SNPAnnotation(IntervalAnnotation): 166 def sniff( self, filename ):
164 ## file_ext = 'gmapsnps' 167 return False
165 ## """ 168 def set_meta( self, dataset, overwrite = True, **kwd ):
166 ## Example: 169 """
167 ## >rs62211261 21:14379270 CG 170 Expecting:
168 ## >rs62211262 21:14379281 CG 171 extra_files_path/snp_name.iit
169 ## """ 172 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp.snp_name
173 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name
174 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name
175 """
176 log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path))
177 pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?'
178 efp = dataset.extra_files_path
179 flist = os.listdir(efp)
180 for i,fname in enumerate(flist):
181 m = re.match(pat,fname)
182 if m:
183 assert len(m.groups()) == 6
184 dataset.metadata.db_name = m.groups()[0]
185 dataset.metadata.snps_name = m.groups()[-1]
186
187
188
189
190 class IntervalIndexTree( Text ):
191 """
192 A GMAP Interval Index Tree Map
193 created by iit_store
194 (/path/to/map)/(mapname).iit
195 """
196 file_ext = 'iit'
197 is_binary = True
198
199 class SpliceSitesIntervalIndexTree( IntervalIndexTree ):
200 """
201 A GMAP Interval Index Tree Map
202 created by iit_store
203 """
204 file_ext = 'splicesites.iit'
205
206 class IntronsIntervalIndexTree( IntervalIndexTree ):
207 """
208 A GMAP Interval Index Tree Map
209 created by iit_store
210 """
211 file_ext = 'introns.iit'
212
213 class SNPsIntervalIndexTree( IntervalIndexTree ):
214 """
215 A GMAP Interval Index Tree Map
216 created by iit_store
217 """
218 file_ext = 'snps.iit'
219
220 class IntervalAnnotation( Text ):
221 """
222 Class describing a GMAP Interval format:
223 >label coords optional_tag
224 optional_annotation (which may be zero, one, or multiple lines)
225 The coords should be of the form:
226 chr:position
227 chr:startposition..endposition
228 """
229 file_ext = 'gmap_annotation'
230 """Add metadata elements"""
231 MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 )
232
233 def set_meta( self, dataset, **kwd ):
234 """
235 Set the number of annotations and the number of data lines in dataset.
236 """
237 data_lines = 0
238 annotations = 0
239 for line in file( dataset.file_name ):
240 line = line.strip()
241 if line and line.startswith( '>' ):
242 annotations += 1
243 data_lines +=1
244 else:
245 data_lines += 1
246 dataset.metadata.data_lines = data_lines
247 dataset.metadata.annotations = annotations
248 def set_peek( self, dataset, is_multi_byte=False ):
249 if not dataset.dataset.purged:
250 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
251 if dataset.metadata.annotations:
252 dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) )
253 else:
254 dataset.blurb = data.nice_size( dataset.get_size() )
255 else:
256 dataset.peek = 'file does not exist'
257 dataset.blurb = 'file purged from disk'
258
259 def sniff( self, filename ):
260 """
261 Determines whether the file is a gmap annotation file
262 Format:
263 >label coords optional_tag
264 optional_annotation (which may be zero, one, or multiple lines)
265 For example, the label may be an EST accession, with the coords
266 representing its genomic position. Labels may be duplicated if
267 necessary.
268 The coords should be of the form
269 chr:position
270 chr:startposition..endposition
271 The term "chr:position" is equivalent to "chr:position..position". If
272 you want to indicate that the interval is on the minus strand or
273 reverse direction, then <endposition> may be less than <startposition>.
274 """
275 try:
276 pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag]
277 fh = open( filename )
278 count = 0
279 while True and count < 10:
280 line = fh.readline()
281 if not line:
282 break #EOF
283 line = line.strip()
284 if line: #first non-empty line
285 if line.startswith( '>' ):
286 count += 1
287 if re.match(pat,line) == None: # Failed to match
288 return False
289 finally:
290 fh.close()
291 return False
292
293 class SpliceSiteAnnotation(IntervalAnnotation):
294 file_ext = 'gmap_splicesites'
295 """
296 Example:
297 >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678
298 >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678
299 >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179
300 >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179
301 >NM_004449.ERG.exon1 21:38955452..38955451 donor 783
302 >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783
303 >NM_004449.ERG.exon2 21:38878638..38878637 donor 360
304 >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360
305 Each line must start with a ">" character, then be followed by an
306 identifier, which may have duplicates and can have any format, with
307 the gene name or exon number shown here only as a suggestion. Then
308 there should be the chromosomal coordinates which straddle the
309 exon-intron boundary, so one coordinate is on the exon and one is on
310 the intron. (Coordinates are all 1-based, so the first character of a
311 chromosome is number 1.) Finally, there should be the splice type:
312 "donor" or "acceptor". You may optionally store the intron distance
313 at the end. GSNAP can use this intron distance, if it is longer than
314 its value for --localsplicedist, to look for long introns at that
315 splice site. The same splice site may have different intron distances
316 in the database; GSNAP will use the longest intron distance reported
317 in searching for long introns.
318 """
319 def sniff( self, filename ): # TODO
320 """
321 Determines whether the file is a gmap splice site annotation file
322 """
323 try:
324 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist]
325 fh = open( filename )
326 count = 0
327 while True and count < 10:
328 line = fh.readline()
329 if not line:
330 break #EOF
331 line = line.strip()
332 if line: #first non-empty line
333 count += 1
334 if re.match(pat,line) == None: # Failed to match
335 return False
336 finally:
337 fh.close()
338 return False
339
340 class IntronAnnotation(IntervalAnnotation):
341 file_ext = 'gmap_introns'
342 """
343 Example:
344 >NM_004448.ERBB2.intron1 17:35110090..35116769
345 >NM_004448.ERBB2.intron2 17:35116920..35118100
346 >NM_004449.ERG.intron1 21:38955452..38878739
347 >NM_004449.ERG.intron2 21:38878638..38869541
348 The coordinates are 1-based, and specify the exon coordinates
349 surrounding the intron, with the first coordinate being from the donor
350 exon and the second one being from the acceptor exon.
351 """
352 def sniff( self, filename ): # TODO
353 """
354 Determines whether the file is a gmap Intron annotation file
355 """
356 try:
357 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position
358 fh = open( filename )
359 count = 0
360 while True and count < 10:
361 line = fh.readline()
362 if not line:
363 break #EOF
364 line = line.strip()
365 if line: #first non-empty line
366 count += 1
367 if re.match(pat,line) == None: # Failed to match
368 return False
369 finally:
370 fh.close()
371 return False
372
373 class SNPAnnotation(IntervalAnnotation):
374 file_ext = 'gmap_snps'
375 """
376 Example:
377 >rs62211261 21:14379270 CG
378 >rs62211262 21:14379281 AT
379 >rs62211263 21:14379298 WN
380 Each line must start with a ">" character, then be followed by an
381 identifier (which may have duplicates). Then there should be the
382 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so
383 the first character of a chromosome is number 1.) Finally, there
384 should be the two possible alleles. (Previous versions required that
385 these be in alphabetical order: "AC", "AG", "AT", "CG", "CT", or "GT",
386 but that is no longer a requirement.) These alleles must correspond
387 to the possible nucleotides on the plus strand of the genome. If the
388 one of these two letters does not match the allele in the reference
389 sequence, that SNP will be ignored in subsequent processing as a
390 probable error.
391
392 GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows
393 all nucleotides to match at that position, not just a given reference
394 and alternate allele. It is essentially as if an "N" were recorded at
395 that genomic location, although the index files still keep track of
396 the reference allele. To indicate that a position has a wildcard SNP,
397 you can indicate the genotype as "WN", where "W" is the reference
398 allele. Another indication of a wildcard SNP is to provide two
399 separate lines at that position with the genotypes "WX" and "WY",
400 where "W" is the reference allele and "X" and "Y" are two different
401 alternate alleles.
402 """
403 def sniff( self, filename ):
404 """
405 Determines whether the file is a gmap SNP annotation file
406 """
407 try:
408 pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG
409 fh = open( filename )
410 count = 0
411 while True and count < 10:
412 line = fh.readline()
413 if not line:
414 break #EOF
415 line = line.strip()
416 if line: #first non-empty line
417 count += 1
418 if re.match(pat,line) == None: # Failed to match
419 return False
420 finally:
421 fh.close()
422 return False
423
424