annotate snpeff.py @ 0:d78b2b2a3388 draft

Uploaded
author iuc
date Thu, 22 Jan 2015 07:58:16 -0500
parents
children 2b53f59de80c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
1 """
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
2 SnpEff datatypes
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
3 """
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
4 import os,os.path,re,sys,gzip,logging
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
5 import galaxy.datatypes.data
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
6 from galaxy.datatypes.data import Text
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
7 from galaxy.datatypes.metadata import MetadataElement
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
8
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
9 log = logging.getLogger(__name__)
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
10
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
class SnpEffDb( Text ):
    """Galaxy datatype describing a SnpEff genome build.

    The primary dataset file is a short plain-text summary written by
    ``set_meta``; the SnpEff database files themselves are searched for
    under the dataset's ``extra_files_path``.
    """
    file_ext = "snpeffdb"
    MetadataElement( name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None )
    MetadataElement( name="snpeff_version", default="SnpEff4.0", desc="SnpEff Version", readonly=True, visible=True, no_value=None )
    MetadataElement( name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True)
    MetadataElement( name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True)

    def __init__( self, **kwd ):
        """Initialise the datatype; all keyword arguments go to ``Text``."""
        Text.__init__( self, **kwd )

    def getSnpeffVersionFromFile(self, path):
        """Return the SnpEff version recorded in a gzipped predictor file.

        The SnpEff version line was added in SnpEff version 4.1, so files
        written by older releases yield ``None``.

        Only the first 100 bytes of the decompressed stream are inspected.
        The first line has to look like ``SnpEff <major>.<minor>...``.

        :param path: path to a gzip-compressed file
        :returns: ``"SnpEff"`` immediately followed by the ``major.minor``
                  version (e.g. ``"SnpEff4.1"``), or ``None`` when the file
                  cannot be read or its first line does not match.
        """
        try:
            with gzip.open(path, 'rb') as fh:
                head = fh.read(100)
        except Exception as e:
            # Deliberately best-effort: a missing, unreadable or non-gzip
            # file just means the version is unknown.
            log.debug("Could not read SnpEff version from %s: %s", path, e)
            return None
        # gzip yields bytes; decode so the text regex works on Python 3 too.
        # 'replace' tolerates a multi-byte character cut at the 100-byte mark.
        lines = head.decode('utf-8', 'replace').splitlines()
        if not lines:
            return None
        m = re.match(r'^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip())
        if m is None:
            return None
        return m.group(1) + m.group(2)

    def set_meta( self, dataset, **kwd ):
        """Populate metadata from the files under ``dataset.extra_files_path``.

        Walks the extra-files directory and:

        * on a file whose name starts with ``snpEffectPredictor``, records
          the name of the containing directory as ``genome_version`` and
          tries to read the SnpEff version from that file;
        * collects the ``<name>`` part of ``regulation_<name>.bin`` files
          into the ``regulation`` metadata list;
        * collects the annotation names for the known optional annotation
          files (``nextProt.bin``, ``motif.bin``) into ``annotation``.

        A plain-text summary is then written to ``dataset.file_name``.
        Nothing beyond ``Text.set_meta`` happens when the extra-files
        directory does not exist.

        :param dataset: dataset whose metadata and primary file are updated
        :param kwd: passed through to ``Text.set_meta``
        """
        Text.set_meta(self, dataset, **kwd )
        data_dir = dataset.extra_files_path
        if not (data_dir and os.path.isdir(data_dir)):
            return

        # search data_dir/genome_version for files
        regulation_pattern = re.compile(r'regulation_(.+)\.bin')
        # annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'}
        regulations = []
        annotations = []
        genome_version = None
        snpeff_version = None

        for root, dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith('snpEffectPredictor'):
                    # if snpEffectPredictor.bin download succeeded
                    genome_version = os.path.basename(root)
                    dataset.metadata.genome_version = genome_version
                    # The first line of the gzipped predictor file carries
                    # the SnpEff version (SnpEff >= 4.1).
                    snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                    if snpeff_version:
                        dataset.metadata.snpeff_version = snpeff_version
                    continue
                m = regulation_pattern.match(fname)
                if m:
                    regulations.append(m.group(1))
                elif fname in annotations_dict:
                    # metadata stores the flag name without its leading '-'
                    annotations.append(annotations_dict[fname].lstrip('-'))

        dataset.metadata.regulation = regulations
        dataset.metadata.annotation = annotations

        # Build the summary first so every line, including the "unknown"
        # fallbacks, ends with a newline.
        summary = [
            "%s\n" % (genome_version or 'Genome unknown'),
            "%s\n" % (snpeff_version or 'SnpEff version unknown'),
        ]
        if annotations:
            summary.append("annotations: %s\n" % ','.join(annotations))
        if regulations:
            summary.append("regulations: %s\n" % ','.join(regulations))
        try:
            with open(dataset.file_name, 'w') as fh:
                fh.write(''.join(summary))
        except Exception as e:
            # Writing the summary is best-effort and must not fail metadata
            # setting, but the failure should at least be visible in the log.
            log.warning("Could not write SnpEff summary to %s: %s", dataset.file_name, e)
d78b2b2a3388 Uploaded
iuc
parents:
diff changeset
77