annotate snpsift_dbnsfp.py @ 0:0c4372b93e85 draft default tip

Uploaded
author iuc
date Thu, 22 Jan 2015 08:04:59 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
1 """
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
2 SnpSift dbNSFP datatypes
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
3 """
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
4 import os,os.path,re,sys,gzip,logging
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
5 import traceback
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
6 import galaxy.datatypes.data
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
7 from galaxy.datatypes.data import Text
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
8 from galaxy.datatypes.metadata import MetadataElement
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
9
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
# Module-level logger, named after this module so records can be filtered by source.
log = logging.getLogger(__name__)
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
11
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp.

    The dbNSFP file is a tabular file with 1 header line.
    The first 4 columns are required to be: chrom pos ref alt
    These match columns 1,2,4,5 of the VCF file.
    SnpSift requires the file to be block-gzipped and then indexed with
    samtools tabix.

    Example::

        ## Compress using block-gzip algorithm
        bgzip dbNSFP2.3.txt
        ## Create tabix index
        tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """
    # NOTE(review): 'dbSNFP' looks like a transposition of 'dbNSFP'. It is left
    # unchanged because the value is substituted into the composite file names
    # registered in __init__ -- confirm before renaming.
    MetadataElement(name='reference_name', default='dbSNFP', desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP')
    # Name of the block-gzipped database file found in the extra files directory.
    MetadataElement(name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None)
    # Name of the tabix index file found in the extra files directory.
    MetadataElement(name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None)
    # Header column names after the first four columns of the database file.
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__(self, **kwd):
        """Initialise the datatype and register its composite files."""
        Text.__init__(self, **kwd)
        # NOTE(review): set_meta() looks for '*.gz' and '*.tbi' files, while the
        # patterns registered here are '%s.grp' and '%s.ti' -- confirm these
        # are the intended composite file names.
        self.add_composite_file('%s.grp', description='Group File', substitute_name_with_metadata='reference_name', is_binary=False)
        self.add_composite_file('%s.ti', description='', substitute_name_with_metadata='reference_name', is_binary=False)

    def init_meta(self, dataset, copy_from=None):
        """Delegate metadata initialisation to the ``Text`` base class."""
        Text.init_meta(self, dataset, copy_from=copy_from)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        # Must be invoked as a bound method; the bare name
        # ``regenerate_primary_file`` is not defined at module level.
        self.regenerate_primary_file(dataset)

    def regenerate_primary_file(self, dataset):
        """
        Append the bgzip file name (when set) and the annotation names to the
        primary file.

        cannot do this until we are setting metadata
        """
        # Tolerate unset annotation metadata instead of failing in join().
        annotation_names = dataset.metadata.annotation or []
        annotations = "dbNSFP Annotations: %s\n" % ','.join(annotation_names)
        # The context manager closes the file even if a write raises.
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                f.write(dataset.metadata.bgzip)
                f.write('\n')
            f.write(annotations)

    @staticmethod
    def _read_annotation_names(path):
        """Return the header column names after the first four columns of the gzipped file at ``path``."""
        fh = gzip.open(path, 'r')
        try:
            # Read the whole header line; a fixed-size read would truncate
            # headers longer than the buffer and lose annotation names.
            header = fh.readline()
        finally:
            # Only reached once the file was opened successfully, so ``fh``
            # is always bound here.
            fh.close()
        if not isinstance(header, str):
            # On Python 3 gzip.open(..., 'r') yields bytes; on Python 2 the
            # data is already ``str`` and this branch is skipped.
            header = header.decode('utf-8', 'replace')
        return header.rstrip('\r\n').split('\t')[4:]

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Populate the ``bgzip``, ``index`` and ``annotation`` metadata from the
        files in the dataset's extra files directory, then append a summary to
        the primary file.

        Failures are logged and swallowed so that a problem reading the
        database does not abort metadata setting. ``overwrite`` and ``kwd``
        are accepted but not used here.
        """
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                for fname in os.listdir(efp):
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            dataset.metadata.annotation = self._read_annotation_names(os.path.join(efp, fname))
                        except Exception as e:
                            log.warning("set_meta fname: %s %s", fname, e)
                            # print_exc reports the caught exception's traceback;
                            # print_stack would only show the current call stack.
                            traceback.print_exc(file=sys.stderr)
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            log.warning("set_meta fname: %s %s", dataset.file_name if dataset and dataset.file_name else 'Unkwown', e)
            traceback.print_exc(file=sys.stderr)
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
81
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    import doctest
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
0c4372b93e85 Uploaded
iuc
parents:
diff changeset
85