annotate sra.py @ 3:956e892b299d

Add missing SRA datatype
author matt-shirley
date Mon, 07 Oct 2013 10:07:25 -0400
parents
children e4c21444a3ba
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
1 """
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
2 NCBI sra class
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
3 """
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
4 import logging
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
5 import binascii
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
6 from galaxy.datatypes.data import *
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
7 from galaxy.datatypes.sniff import *
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
8 from galaxy.datatypes.binary import *
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
9 from galaxy.datatypes.metadata import *
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
10
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
# Module-level logger named after this module.
log = logging.getLogger(__name__)
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
12
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
class sra( Binary ):
    """ Sequence Read Archive (SRA) """
    file_ext = 'sra'

    def __init__( self, **kwd ):
        """ Forward all keyword arguments unchanged to Binary.__init__. """
        Binary.__init__( self, **kwd )

    def sniff( self, filename ):
        """ Return True if the file at `filename` starts with the SRA magic bytes.

        The first 8 bytes of any NCBI sra file is 'NCBI.sra', and the file is binary. EBI and DDBJ files may differ, though EBI and DDBJ
        submissions through NCBI (ERR and DRR accessions) read 'NCBI.sra'.
        For details about the format, see http://www.ncbi.nlm.nih.gov/books/n/helpsra/SRA_Overview_BK/#SRA_Overview_BK.4_SRA_Data_Structure

        Any error while opening or reading the file yields False instead of
        propagating.
        """
        try:
            # Read in binary mode so the header bytes are compared verbatim
            # (no newline translation or text decoding), and close the handle
            # deterministically via the context manager.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 8 )
        except Exception:
            # Sniffing is best-effort: an unreadable or missing file is simply
            # "not sra". Unlike a bare except, this does not swallow
            # KeyboardInterrupt or SystemExit.
            return False
        return header == b'NCBI.sra'

    def set_peek( self, dataset, is_multi_byte=False ):
        """ Set `dataset.peek` and `dataset.blurb`.

        When the underlying dataset is not purged, the peek is a fixed label
        and the blurb is the formatted dataset size; otherwise both are set to
        messages saying the file is gone. `is_multi_byte` is accepted but not
        used by this method.
        """
        if not dataset.dataset.purged:
            dataset.peek = 'Binary sra file'
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """ Return `dataset.peek`, or a generic label with the dataset size if
        reading the peek raises.
        """
        try:
            return dataset.peek
        except Exception:
            return 'Binary sra file (%s)' % ( data.nice_size( dataset.get_size() ) )
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
44
956e892b299d Add missing SRA datatype
matt-shirley
parents:
diff changeset
# Register the sra sniffer only when this Binary class provides
# register_sniffable_binary_format; otherwise do nothing.
try:
    Binary.register_sniffable_binary_format
except AttributeError:
    pass
else:
    Binary.register_sniffable_binary_format('sra', 'sra', sra)