annotate reference_bins.py @ 0:7db7ecc78ad6 draft

Uploaded
author damion
date Mon, 02 Mar 2015 20:46:00 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
1 import os.path
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
2 import common
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
3
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
class ReferenceBins:
    """
    Manager for a list of ReferenceBin lookups that records are checked against.

    self.reference_bins holds the bins in the order they were requested in
    build_bins(); self.fieldSpec is the common.FieldSpec built from the fasta
    databases specification file and is handed to each ReferenceBin so it can
    resolve its display name and folder path.
    """

    def __init__(self, db_spec_path = None):
        """
        @param db_spec_path string path to fasta databases specification file. This file has format:
            #value id type active name path
            16S_euzby 16S 1 Euzby /usr/local/galaxy/shared/ngs_data/
            ...
            When None, 'fasta_reference_dbs.tab' in this module's own folder is used.
        """
        self.reference_bins = []

        if db_spec_path is None:  # Default to the command-line lookup table in code's folder:
            db_spec_path = os.path.join(os.path.dirname(__file__), 'fasta_reference_dbs.tab')
        self.fieldSpec = common.FieldSpec(db_spec_path)

    def __main__(self): pass

    # Could double check to see if it exists?
    def build_bins(self, bins, columns):
        """
        Parse a bin specification string, load every bin it names and append the
        matching output field definition(s) to columns.

        @param bins string semicolon-separated list of "name[:grouping[:filter[:description]]]" items, where
            grouping is 'table', 'column' (default) or 'hidden',
            filter is '' (default), 'include' or 'exclude',
            description is 'true' to also add a "<name>_desc" text column.
            Empty items are skipped. None clears self.reference_bins.
        @param columns list modified in place; one dict (keys 'field', 'group', 'sort', 'label', 'type') is appended per field.
        @return False when bins is None; otherwise None.

        Invalid names, groupings or filters are reported through common.stop_err().
        Bins are appended to self.reference_bins; an existing list is not reset
        unless bins is None.
        """
        if bins is None:
            self.reference_bins = []
            return False

        for myfield in bins.strip().strip(';').split(';'):
            field_spec = myfield.strip().split(':')
            field_name = field_spec[0].strip()

            if field_name == '':
                continue

            # Bin names may only contain letters, digits and underscores.
            if not field_name.replace('_', '').isalnum():
                common.stop_err("Invalid bin name: " + field_name + ':' + myfield)

            if len(field_spec) < 2: field_spec.append('column')  # default grouping = column
            if len(field_spec) < 3: field_spec.append('')  # default filtering = none
            if len(field_spec) < 4: field_spec.append('')  # default no description

            grouping = field_spec[1].strip()
            if grouping not in ['table', 'column', 'hidden']:
                common.stop_err("Invalid bin layout: " + grouping)

            bin_filter = field_spec[2].strip()
            if bin_filter not in ['', 'include', 'exclude']:
                common.stop_err("Invalid bin filter: " + bin_filter)

            newbin = self.buildBin(field_name, bin_filter)
            self.reference_bins.append(newbin)

            columns.append({  # any time we have a bin we want sort descending
                'field': field_name,
                'group': grouping,
                'sort': 'desc',
                'label': newbin.name,
                'type': 'bin'
            })

            if field_spec[3] == 'true':  # description field requested
                columns.append({
                    'field': field_name + '_desc',
                    'group': 'column',
                    'sort': '',  # Allow other sorts????
                    'label': newbin.name + ' Description',
                    'type': 'text'
                })

    def buildBin(self, bin_folder_name, bin_filter):
        """
        Create a lookup table consisting of a dictionary entry for each accession id held in dictionary's file.

        Each line of the bin's file is read as "accession[<tab>description]". Only the
        part of the accession before the first '.' is used as the key; a missing
        description is stored as ''. Lines without an accession are ignored.

        @param bin_folder_name string name of requested db, e.g 16S_ncbi
        @param bin_filter string '' or 'include' or 'exclude'
        @return ReferenceBin with its lookup dictionary filled in.

        If the file cannot be opened or read, the failure is reported through common.stop_err().
        """
        ref_bin = ReferenceBin(self.fieldSpec, bin_folder_name, bin_filter)

        try:
            with open(ref_bin.file_path) as file_in:
                for line in file_in:
                    #FUTURE: Preprocess so accession ID ready to go.
                    keyValue = line.rstrip().split("\t", 1)
                    # keep only first term minus integer portion of id
                    accGeneralId = keyValue[0].split('.')[0]
                    if not accGeneralId:
                        # Blank line or line with no key: would otherwise add a ''
                        # key that matches empty accession fragments.
                        continue
                    description = keyValue[1] if len(keyValue) > 1 else ''
                    ref_bin.lookup[accGeneralId] = description

        except IOError:
            common.stop_err("Reference bin could not be found or opened: " + ref_bin.file_path)

        return ref_bin

    def setStatus(self, record):
        """
        Mark on a record which reference bins its accession ids belong to.

        For every bin, record.<bin.field> and record.<bin.field>_desc are first set
        to ''. On the first accession found in the bin's lookup, <bin.field> becomes
        the bin's 1-based position (as a string) and <bin.field>_desc the
        description stored for that accession.

        @param record object with string attributes sseqid and sallseqid
        @return None normally (including when there are no bins); False as soon as
            an accession matches a bin whose exclude attribute is truthy. In that
            case fields of later bins are left untouched.
        """
        if not self.reference_bins: return  #no bins

        # Use of "extended slices" http://docs.python.org/2.3/whatsnew/section-slices.html
        # Example sallseqid is 'gi|194753780|ref|XR_046072.1|;gi|195119578|ref|XR_047594.1|;gi|195154052|ref|XR_047967.1|'
        # Example accs is ['XR_046072.1', 'XR_047594.1', 'XR_047967.1']
        # Original code was "[1::2][1::2]" (select every 2nd item, then every 2nd item of that)
        accs = record.sallseqid.split('|')

        if common.re_default_ncbi_id.match(record.sseqid):
            accs = accs[3::4]  # Select every 4th item, starting at index 3

        elif common.re_default_ref_id.match(record.sseqid):
            accs = accs[1::2]  # Select every 2nd item, starting at index 1

        # Check each accession # against each bin.
        for ptr, ref_bin in enumerate(self.reference_bins):
            setattr(record, ref_bin.field, '')  #Using '','1' not FALSE/TRUE because of tab delim output
            setattr(record, ref_bin.field + '_desc', '')
            for acc in accs:
                accGeneralId = acc.split('.')[0]
                if accGeneralId in ref_bin.lookup:
                    # NOTE(review): exclude holds the raw filter string set in
                    # ReferenceBin.__init__, so 'include' is truthy here as well
                    # as 'exclude' - confirm whether that is intended.
                    if ref_bin.exclude: return False
                    setattr(record, ref_bin.field, str(ptr + 1))
                    # Include any bin notes for this item
                    setattr(record, ref_bin.field + '_desc', ref_bin.lookup[accGeneralId])
                    break  # This result has been binned to this bin so break.

    def __str__(self):
        """
        @return string one "name: ... dict: ..." line per loaded bin ('' when no bins are loaded).
        """
        return "\n".join(
            "name: %s dict: %s" % (ref_bin.name, str(ref_bin.lookup))
            for ref_bin in self.reference_bins
        )
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
132
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
class ReferenceBin:
    """
    A single reference bin: a named set of accession ids kept in
    <path>/<bin_folder_name>/accession_ids.tab.
    """

    def __init__(self, fieldSpec, bin_folder_name, bin_filter):
        """
        @param fieldSpec object whose getAttribute(bin_folder_name, attribute) supplies the bin's 'name' and 'path'
        @param bin_folder_name string name of requested db, e.g 16S_ncbi; also used as the record field name
        @param bin_filter string filter setting for this bin, stored unchanged in self.exclude
        """
        self.lookup = {}  # accession id -> description; starts empty
        self.folder = bin_folder_name
        self.name = fieldSpec.getAttribute(bin_folder_name, 'name')
        self.field = bin_folder_name
        self.path = fieldSpec.getAttribute(bin_folder_name, 'path')
        self.exclude = bin_filter
        #absolute path to reference bins folder: /usr/local/galaxy/shared/ngs_data/
        # Join the components so the result is the same whether or not the
        # configured path ends with a separator.
        self.file_path = os.path.join(self.path, self.folder, 'accession_ids.tab')
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
143
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
if __name__ == '__main__':
    # Run as a script: build a manager from the default specification file
    # and invoke its (currently empty) __main__ hook.
    ReferenceBins().__main__()
7db7ecc78ad6 Uploaded
damion
parents:
diff changeset
148