Mercurial > repos > jjohnson > gmap
comparison gmap/lib/galaxy/datatypes/gmap.py @ 2:52da588232b0
Add datatypes for maps and snpindex, add iit_store and snpindex tools, update GMAP and GSNAP to use these.
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Fri, 21 Oct 2011 11:38:55 -0500 |
parents | d58d272914e7 |
children | f4b4c1712e39 |
comparison
equal
deleted
inserted
replaced
1:30d42bb409b8 | 2:52da588232b0 |
---|---|
1 """ | 1 """ |
2 GMAP indexes | 2 GMAP indexes |
3 """ | 3 """ |
4 import logging | 4 import logging |
5 import os,os.path,re | 5 import os,os.path,re |
6 import data | |
6 from data import Text | 7 from data import Text |
8 from galaxy import util | |
7 from metadata import MetadataElement | 9 from metadata import MetadataElement |
8 | 10 |
9 log = logging.getLogger(__name__) | 11 log = logging.getLogger(__name__) |
10 | 12 |
11 class GmapDB( Text ): | 13 class GmapDB( Text ): |
59 def display_peek( self, dataset ): | 61 def display_peek( self, dataset ): |
60 try: | 62 try: |
61 return dataset.peek | 63 return dataset.peek |
62 except: | 64 except: |
63 return "GMAP index file" | 65 return "GMAP index file" |
66 | |
64 def sniff( self, filename ): | 67 def sniff( self, filename ): |
65 return False | 68 return False |
66 def set_meta( self, dataset, overwrite = True, **kwd ): | 69 def set_meta( self, dataset, overwrite = True, **kwd ): |
67 """ | 70 """ |
68 Expecting: | 71 Expecting: |
111 dataset.metadata.cmet = True | 114 dataset.metadata.cmet = True |
112 elif m.groups()[4] == 'a2i': | 115 elif m.groups()[4] == 'a2i': |
113 dataset.metadata.atoi = True | 116 dataset.metadata.atoi = True |
114 dataset.metadata.kmers = kmers.keys() | 117 dataset.metadata.kmers = kmers.keys() |
115 | 118 |
116 ## class IntervalIndexTree( Text ): | 119 class GmapSnpIndex( Text ): |
117 ## """ | 120 """ |
118 ## A GMAP Interval Index Tree Map | 121 A GMAP SNP index created by snpindex |
119 ## created by iit_store | 122 """ |
120 ## (/path/to/map)/(mapname).iit | 123 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) |
121 ## """ | 124 MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True ) |
122 ## MetadataElement( name="map_name", desc="The map name for this index set", default='unknown', set_in_upload=True, readonly=False ) | 125 |
123 ## file_ext = 'iit' | 126 file_ext = 'gmapsnpindex' |
124 ## is_binary = True | 127 is_binary = True |
125 ## composite_type = 'auto_primary_file' | 128 composite_type = 'auto_primary_file' |
126 ## allow_datatype_change = False | 129 allow_datatype_change = False |
127 ## | 130 |
128 ## class IntervalAnnotation(data.Text): | 131 def generate_primary_file( self, dataset = None ): |
129 ## """ | 132 """ |
130 ## Class describing a GMAP Interval format: | 133 This is called only at upload to write the html file |
131 ## >label coords optional_tag | 134 cannot rename the datasets here - they come with the default unfortunately |
132 ## optional_annotation (which may be zero, one, or multiple lines) | 135 """ |
133 ## The coords should be of the form: | 136 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>' |
134 ## chr:position | 137 |
135 ## chr:startposition..endposition | 138 def regenerate_primary_file(self,dataset): |
136 ## """ | 139 """ |
137 ## file_ext = 'gmapannotation' | 140 cannot do this until we are setting metadata |
138 ## | 141 """ |
139 ## class SpliceSiteAnnotation(IntervalAnnotation): | 142 bn = dataset.metadata.db_name |
140 ## file_ext = 'gmapsplicesites' | 143 log.info( "GmapDB regenerate_primary_file %s" % (bn)) |
141 ## """ | 144 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] |
142 ## Example: | 145 for i,name in enumerate(dataset.metadata.maps): |
143 ## >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678 | 146 rval.append( '<li>%s' % name) |
144 ## >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678 | 147 rval.append( '</ul></html>' ) |
145 ## >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179 | 148 f = file(dataset.file_name,'w') |
146 ## >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179 | 149 f.write("\n".join( rval )) |
147 ## >NM_004449.ERG.exon1 21:38955452..38955451 donor 783 | 150 f.write('\n') |
148 ## >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783 | 151 f.close() |
149 ## >NM_004449.ERG.exon2 21:38878638..38878637 donor 360 | 152 def set_peek( self, dataset, is_multi_byte=False ): |
150 ## >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360 | 153 log.info( "GmapSnpIndex set_peek %s" % (dataset)) |
151 ## """ | 154 if not dataset.dataset.purged: |
152 ## | 155 dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) |
153 ## class IntronAnnotation(IntervalAnnotation): | 156 dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) |
154 ## file_ext = 'gmapintrons' | 157 else: |
155 ## """ | 158 dataset.peek = 'file does not exist' |
156 ## Example: | 159 dataset.blurb = 'file purged from disk' |
157 ## >NM_004448.ERBB2.intron1 17:35110090..35116769 | 160 def display_peek( self, dataset ): |
158 ## >NM_004448.ERBB2.intron2 17:35116920..35118100 | 161 try: |
159 ## >NM_004449.ERG.intron1 21:38955452..38878739 | 162 return dataset.peek |
160 ## >NM_004449.ERG.intron2 21:38878638..38869541 | 163 except: |
161 ## """ | 164 return "GMAP SNP index" |
162 ## | 165 |
163 ## class SNPAnnotation(IntervalAnnotation): | 166 def sniff( self, filename ): |
164 ## file_ext = 'gmapsnps' | 167 return False |
165 ## """ | 168 def set_meta( self, dataset, overwrite = True, **kwd ): |
166 ## Example: | 169 """ |
167 ## >rs62211261 21:14379270 CG | 170 Expecting: |
168 ## >rs62211262 21:14379281 CG | 171 extra_files_path/snp_name.iit |
169 ## """ | 172 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp.snp_name |
173 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name | |
174 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name | |
175 """ | |
176 log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path)) | |
177 pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?' | |
178 efp = dataset.extra_files_path | |
179 flist = os.listdir(efp) | |
180 for i,fname in enumerate(flist): | |
181 m = re.match(pat,fname) | |
182 if m: | |
183 assert len(m.groups()) == 6 | |
184 dataset.metadata.db_name = m.groups()[0] | |
185 dataset.metadata.snps_name = m.groups()[-1] | |
186 | |
187 | |
188 | |
189 | |
190 class IntervalIndexTree( Text ): | |
191 """ | |
192 A GMAP Interval Index Tree Map | |
193 created by iit_store | |
194 (/path/to/map)/(mapname).iit | |
195 """ | |
196 file_ext = 'iit' | |
197 is_binary = True | |
198 | |
199 class SpliceSitesIntervalIndexTree( IntervalIndexTree ): | |
200 """ | |
201 A GMAP Interval Index Tree Map | |
202 created by iit_store | |
203 """ | |
204 file_ext = 'splicesites.iit' | |
205 | |
206 class IntronsIntervalIndexTree( IntervalIndexTree ): | |
207 """ | |
208 A GMAP Interval Index Tree Map | |
209 created by iit_store | |
210 """ | |
211 file_ext = 'introns.iit' | |
212 | |
213 class SNPsIntervalIndexTree( IntervalIndexTree ): | |
214 """ | |
215 A GMAP Interval Index Tree Map | |
216 created by iit_store | |
217 """ | |
218 file_ext = 'snps.iit' | |
219 | |
220 class IntervalAnnotation( Text ): | |
221 """ | |
222 Class describing a GMAP Interval format: | |
223 >label coords optional_tag | |
224 optional_annotation (which may be zero, one, or multiple lines) | |
225 The coords should be of the form: | |
226 chr:position | |
227 chr:startposition..endposition | |
228 """ | |
229 file_ext = 'gmap_annotation' | |
230 """Add metadata elements""" | |
231 MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 ) | |
232 | |
233 def set_meta( self, dataset, **kwd ): | |
234 """ | |
235 Set the number of annotations and the number of data lines in dataset. | |
236 """ | |
237 data_lines = 0 | |
238 annotations = 0 | |
239 for line in file( dataset.file_name ): | |
240 line = line.strip() | |
241 if line and line.startswith( '>' ): | |
242 annotations += 1 | |
243 data_lines +=1 | |
244 else: | |
245 data_lines += 1 | |
246 dataset.metadata.data_lines = data_lines | |
247 dataset.metadata.annotations = annotations | |
248 def set_peek( self, dataset, is_multi_byte=False ): | |
249 if not dataset.dataset.purged: | |
250 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) | |
251 if dataset.metadata.annotations: | |
252 dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) ) | |
253 else: | |
254 dataset.blurb = data.nice_size( dataset.get_size() ) | |
255 else: | |
256 dataset.peek = 'file does not exist' | |
257 dataset.blurb = 'file purged from disk' | |
258 | |
259 def sniff( self, filename ): | |
260 """ | |
261 Determines whether the file is a gmap annotation file | |
262 Format: | |
263 >label coords optional_tag | |
264 optional_annotation (which may be zero, one, or multiple lines) | |
265 For example, the label may be an EST accession, with the coords | |
266 representing its genomic position. Labels may be duplicated if | |
267 necessary. | |
268 The coords should be of the form | |
269 chr:position | |
270 chr:startposition..endposition | |
271 The term "chr:position" is equivalent to "chr:position..position". If | |
272 you want to indicate that the interval is on the minus strand or | |
273 reverse direction, then <endposition> may be less than <startposition>. | |
274 """ | |
275 try: | |
276 pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag] | |
277 fh = open( filename ) | |
278 count = 0 | |
279 while True and count < 10: | |
280 line = fh.readline() | |
281 if not line: | |
282 break #EOF | |
283 line = line.strip() | |
284 if line: #first non-empty line | |
285 if line.startswith( '>' ): | |
286 count += 1 | |
287 if re.match(pat,line) == None: # Failed to match | |
288 return False | |
289 finally: | |
290 fh.close() | |
291 return False | |
292 | |
293 class SpliceSiteAnnotation(IntervalAnnotation): | |
294 file_ext = 'gmap_splicesites' | |
295 """ | |
296 Example: | |
297 >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678 | |
298 >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678 | |
299 >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179 | |
300 >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179 | |
301 >NM_004449.ERG.exon1 21:38955452..38955451 donor 783 | |
302 >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783 | |
303 >NM_004449.ERG.exon2 21:38878638..38878637 donor 360 | |
304 >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360 | |
305 Each line must start with a ">" character, then be followed by an | |
306 identifier, which may have duplicates and can have any format, with | |
307 the gene name or exon number shown here only as a suggestion. Then | |
308 there should be the chromosomal coordinates which straddle the | |
309 exon-intron boundary, so one coordinate is on the exon and one is on | |
310 the intron. (Coordinates are all 1-based, so the first character of a | |
311 chromosome is number 1.) Finally, there should be the splice type: | |
312 "donor" or "acceptor". You may optionally store the intron distance | |
313 at the end. GSNAP can use this intron distance, if it is longer than | |
314 its value for --localsplicedist, to look for long introns at that | |
315 splice site. The same splice site may have different intron distances | |
316 in the database; GSNAP will use the longest intron distance reported | |
317 in searching for long introns. | |
318 """ | |
319 def sniff( self, filename ): # TODO | |
320 """ | |
321 Determines whether the file is a gmap splice site annotation file | |
322 """ | |
323 try: | |
324 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist] | |
325 fh = open( filename ) | |
326 count = 0 | |
327 while True and count < 10: | |
328 line = fh.readline() | |
329 if not line: | |
330 break #EOF | |
331 line = line.strip() | |
332 if line: #first non-empty line | |
333 count += 1 | |
334 if re.match(pat,line) == None: # Failed to match | |
335 return False | |
336 finally: | |
337 fh.close() | |
338 return False | |
339 | |
340 class IntronAnnotation(IntervalAnnotation): | |
341 file_ext = 'gmap_introns' | |
342 """ | |
343 Example: | |
344 >NM_004448.ERBB2.intron1 17:35110090..35116769 | |
345 >NM_004448.ERBB2.intron2 17:35116920..35118100 | |
346 >NM_004449.ERG.intron1 21:38955452..38878739 | |
347 >NM_004449.ERG.intron2 21:38878638..38869541 | |
348 The coordinates are 1-based, and specify the exon coordinates | |
349 surrounding the intron, with the first coordinate being from the donor | |
350 exon and the second one being from the acceptor exon. | |
351 """ | |
352 def sniff( self, filename ): # TODO | |
353 """ | |
354 Determines whether the file is a gmap Intron annotation file | |
355 """ | |
356 try: | |
357 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position | |
358 fh = open( filename ) | |
359 count = 0 | |
360 while True and count < 10: | |
361 line = fh.readline() | |
362 if not line: | |
363 break #EOF | |
364 line = line.strip() | |
365 if line: #first non-empty line | |
366 count += 1 | |
367 if re.match(pat,line) == None: # Failed to match | |
368 return False | |
369 finally: | |
370 fh.close() | |
371 return False | |
372 | |
373 class SNPAnnotation(IntervalAnnotation): | |
374 file_ext = 'gmap_snps' | |
375 """ | |
376 Example: | |
377 >rs62211261 21:14379270 CG | |
378 >rs62211262 21:14379281 AT | |
379 >rs62211263 21:14379298 WN | |
380 Each line must start with a ">" character, then be followed by an | |
381 identifier (which may have duplicates). Then there should be the | |
382 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so | |
383 the first character of a chromosome is number 1.) Finally, there | |
384 should be the two possible alleles. (Previous versions required that | |
385 these be in alphabetical order: "AC", "AG", "AT", "CG", "CT", or "GT", | |
386 but that is no longer a requirement.) These alleles must correspond | |
387 to the possible nucleotides on the plus strand of the genome. If the | |
388 one of these two letters does not match the allele in the reference | |
389 sequence, that SNP will be ignored in subsequent processing as a | |
390 probable error. | |
391 | |
392 GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows | |
393 all nucleotides to match at that position, not just a given reference | |
394 and alternate allele. It is essentially as if an "N" were recorded at | |
395 that genomic location, although the index files still keep track of | |
396 the reference allele. To indicate that a position has a wildcard SNP, | |
397 you can indicate the genotype as "WN", where "W" is the reference | |
398 allele. Another indication of a wildcard SNP is to provide two | |
399 separate lines at that position with the genotypes "WX" and "WY", | |
400 where "W" is the reference allele and "X" and "Y" are two different | |
401 alternate alleles. | |
402 """ | |
403 def sniff( self, filename ): | |
404 """ | |
405 Determines whether the file is a gmap SNP annotation file | |
406 """ | |
407 try: | |
408 pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG | |
409 fh = open( filename ) | |
410 count = 0 | |
411 while True and count < 10: | |
412 line = fh.readline() | |
413 if not line: | |
414 break #EOF | |
415 line = line.strip() | |
416 if line: #first non-empty line | |
417 count += 1 | |
418 if re.match(pat,line) == None: # Failed to match | |
419 return False | |
420 finally: | |
421 fh.close() | |
422 return False | |
423 | |
424 |