Mercurial > repos > miller-lab > genome_diversity
annotate Population.py @ 32:03c22b722882
remove BeautifulSoup dependency
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:54:23 -0400 |
parents | 8997f2ca8c7a |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
3 import OrderedDict |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
4 import base64 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
5 import json |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
6 import zlib |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
7 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
8 import sys |
0 | 9 |
class Individual(object):
    """One individual, identified by a column number and a name.

    An optional alias may be attached.  When an alias is set, ``name``
    reports the alias while ``real_name`` keeps reporting the original
    name.  Equality and hashing use only the column and the real name;
    the alias is ignored.
    """

    __slots__ = ['_column', '_name', '_alias']

    def __init__(self, column, name, alias=None):
        """Create an individual.

        Args:
            column: column number; anything accepted by ``int()``.
            name: the individual's real name.
            alias: optional display name used in place of ``name``.

        Raises:
            ValueError / TypeError: if ``column`` cannot be converted by
                ``int()``.
        """
        # Normalise to int so that '3' and 3 refer to the same column.
        self._column = int(column)
        self._name = name
        self._alias = alias

    @property
    def column(self):
        """The column number as an int."""
        return self._column

    @property
    def name(self):
        """The alias if one is set, otherwise the real name."""
        return self._name if self._alias is None else self._alias

    @property
    def alias(self):
        """The alias, or None when no alias is set."""
        return self._alias

    @alias.setter
    def alias(self, alias):
        self._alias = alias

    @property
    def real_name(self):
        """The name given at construction, regardless of any alias."""
        return self._name

    def __eq__(self, other):
        """Two individuals are equal when column and real name both match."""
        if not isinstance(other, Individual):
            # Let Python fall back to its default handling instead of
            # failing with AttributeError on foreign objects such as None.
            return NotImplemented
        return self._column == other._column and self._name == other._name

    def __ne__(self, other):
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            # Never negate NotImplemented; hand the decision back to Python.
            return outcome
        return not outcome

    def __hash__(self):
        # Hash the same fields that __eq__ compares so equal individuals
        # hash alike.  Neither field is reassigned after construction.
        return hash((self._column, self._name))

    def __repr__(self):
        return 'Individual: column={0} name={1} alias={2}'.format(self._column, self._name, self._alias)
46 | |
47 | |
class Population(object):
    """A named collection of individuals keyed by column number.

    Individuals are stored in ``self._columns``, an ``OrderedDict`` (from
    the ``OrderedDict`` module imported by this file) that maps each
    column number to the individual occupying it.
    """

    def __init__(self, name=None):
        """Create an empty population with an optional name."""
        self._columns = OrderedDict.OrderedDict()
        self._name = name

    @property
    def name(self):
        """The population's name (None when unset)."""
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @staticmethod
    def _first_token(text):
        """Return the first whitespace-separated token of *text*, or ''."""
        tokens = text.split()
        return tokens[0] if tokens else ''

    def add_individual(self, individual, alias=None):
        """Register *individual* under its column.

        Adding an individual that equals the one already stored for the
        same column is a silent no-op.

        Args:
            individual: object exposing a ``column`` attribute.
            alias: accepted for backward compatibility; currently unused.

        Raises:
            ValueError: if the column is already held by a different
                individual.
        """
        column = individual.column
        if column not in self._columns:
            self._columns[column] = individual
            return
        if self._columns[column] == individual:
            # should this be an error?
            # should we replace the alias using this entry?
            return
        # Raising a plain string is itself a TypeError on Python 2.6+,
        # so raise a real exception carrying the same message.
        raise ValueError('Duplicate column: {0}'.format(individual))

    def is_superset(self, other):
        """Return True if every individual of *other* is also in this population."""
        for column, other_individual in other._columns.items():
            our_individual = self._columns.get(column)
            if our_individual is None or our_individual != other_individual:
                return False
        return True

    def is_disjoint(self, other):
        """Return True if no individual is shared with *other*."""
        for column, our_individual in self._columns.items():
            other_individual = other._columns.get(column)
            if other_individual is not None and other_individual == our_individual:
                return False
        return True

    def column_list(self):
        """Return the column numbers as a list, in stored order."""
        return list(self._columns.keys())

    def individual_with_column(self, column):
        """Return the individual stored for *column*, or None."""
        return self._columns.get(column)

    def tag_list(self, delimiter=':'):
        """Return one '<column><delimiter><first token of name>' string per individual."""
        return [
            '{0}{1}{2}'.format(column, delimiter, self._first_token(individual.name))
            for column, individual in self._columns.items()
        ]

    def to_string(self, delimiter=':', separator=' ', replace_names_with=None):
        """Render the population as '<column><delimiter><name>' entries.

        Args:
            delimiter: text placed between a column and its name.
            separator: text placed between entries.
            replace_names_with: when not None, used instead of every
                individual's name.
        """
        entries = []
        for column, individual in self._columns.items():
            value = individual.name if replace_names_with is None else replace_names_with
            entries.append('{0}{1}{2}'.format(column, delimiter, value))
        return separator.join(entries)

    def __str__(self):
        return self.to_string()

    def from_population_file(self, filename):
        """Add individuals read from a tab-separated file.

        Each non-blank line holds ``column<TAB>name`` optionally followed
        by ``<TAB>alias``.  An alias that is empty after stripping
        whitespace is treated as absent.  Blank lines are skipped.

        Raises:
            ValueError: if a line does not have two or three fields, if
                the column is not an integer, or on a duplicate column.
        """
        with open(filename) as fh:
            for line_number, line in enumerate(fh, 1):
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                fields = line.split('\t')
                if len(fields) not in (2, 3):
                    raise ValueError(
                        '{0}:{1}: expected 2 or 3 tab-separated fields, found {2}'.format(
                            filename, line_number, len(fields)))
                column, name = fields[0], fields[1]
                alias = fields[2].strip() if len(fields) == 3 else ''
                individual = Individual(column, name)
                if alias:
                    individual.alias = alias
                self.add_individual(individual)

    def from_tag_list(self, tag_list, delimiter=':'):
        """Add individuals from '<column><delimiter><name>' tags.

        Only the first delimiter splits a tag, so a name may itself
        contain the delimiter.

        Raises:
            ValueError: if a tag contains no delimiter, if the column is
                not an integer, or on a duplicate column.
        """
        for tag in tag_list:
            column, name = tag.split(delimiter, 1)
            self.add_individual(Individual(column, name))

    def from_wrapped_dict(self, wrapped_dict):
        """Add individuals from a wrapped (see ``unwrap_dict``) name -> column mapping."""
        unwrapped_dict = self.unwrap_dict(wrapped_dict)
        for name, column in unwrapped_dict.items():
            self.add_individual(Individual(column, name))

    def unwrap_dict(self, wrapped_dict):
        """Decode a base64-encoded, zlib-compressed JSON document.

        On Python 2 every unicode string inside decoded JSON objects is
        re-encoded as a UTF-8 byte string.  On Python 3 the parsed value
        is returned unchanged.
        Decoding and decompression failures exit the process (see
        ``decode_value`` and ``decompress_value``).
        """
        decoded_value = self.decode_value(wrapped_dict)
        decompressed_value = self.decompress_value(decoded_value)

        try:
            text_type = unicode
        except NameError:
            # Python 3: there is no separate unicode type to convert from.
            text_type = None

        if text_type is None:
            if isinstance(decompressed_value, bytes):
                decompressed_value = decompressed_value.decode('utf-8')
            return json.loads(decompressed_value)

        def _encode(value):
            # Recursively turn unicode into UTF-8 byte strings.
            if isinstance(value, text_type):
                return value.encode('utf-8')
            if isinstance(value, list):
                return [_encode(item) for item in value]
            if isinstance(value, dict):
                return dict((_encode(key), _encode(item)) for key, item in value.items())
            return value

        # object_hook is invoked for JSON objects only, matching the
        # original behaviour for non-object top-level documents.
        return json.loads(decompressed_value, object_hook=_encode)

    def decode_value(self, value):
        """Base64-decode *value*; on failure report to stderr and exit with status 1."""
        try:
            return base64.b64decode(value)
        except (TypeError, ValueError) as message:
            # Python 2 reports bad input as TypeError; Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            sys.stderr.write('base64.b64decode: {0}: {1}\n'.format(message, value))
            sys.exit(1)

    def decompress_value(self, value):
        """zlib-decompress *value*; on failure report to stderr and exit with status 1."""
        try:
            return zlib.decompress(value)
        except zlib.error as message:
            sys.stderr.write('zlib.decompress: {0}\n'.format(message))
            sys.exit(1)

    def individual_names(self):
        """Yield the first whitespace-separated token of each individual's name."""
        for individual in self._columns.values():
            yield self._first_token(individual.name)
0 | 185 |