Mercurial > repos > miller-lab > genome_diversity
annotate Population.py @ 28:184d14e4270d
Update to Miller Lab devshed revision 4ede22dd5500
| author | Richard Burhans <burhans@bx.psu.edu> |
|---|---|
| date | Wed, 17 Jul 2013 12:46:46 -0400 |
| parents | 8997f2ca8c7a |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 | |
|
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
3 import OrderedDict |
|
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
4 import base64 |
|
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
5 import json |
|
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
6 import zlib |
|
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
7 |
|
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
8 import sys |
| 0 | 9 |
class Individual(object):
    """One sample, identified by a column number and a name.

    An optional alias can be attached; when present it is what ``name``
    reports, while ``real_name`` always reports the original name.

    Equality and hashing are based on ``(column, real_name)`` only; the
    alias does not take part in comparisons.
    """

    __slots__ = ['_column', '_name', '_alias']

    def __init__(self, column, name, alias=None):
        """Create an individual.

        column -- column number; anything accepted by ``int()``
                  (raises ValueError/TypeError otherwise).
        name   -- the individual's real name.
        alias  -- optional display name that overrides ``name``.
        """
        self._column = int(column)
        self._name = name
        self._alias = alias

    @property
    def column(self):
        """Column number as an int."""
        return self._column

    @property
    def name(self):
        """The alias when one is set, otherwise the real name."""
        return self._name if self._alias is None else self._alias

    @property
    def alias(self):
        """The alias, or None when no alias has been set."""
        return self._alias

    @alias.setter
    def alias(self, alias):
        self._alias = alias

    @property
    def real_name(self):
        """The name given at construction time, ignoring any alias."""
        return self._name

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of failing
        # with AttributeError on the missing private attributes.
        if not isinstance(other, Individual):
            return NotImplemented
        return self._column == other._column and self._name == other._name

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__: the alias is deliberately excluded.
        return hash((self._column, self._name))

    def __repr__(self):
        return 'Individual: column={0} name={1} alias={2}'.format(self._column, self._name, self._alias)
| 46 | |
| 47 | |
class Population(object):
    """A named collection of individuals keyed by column number.

    Individuals are stored in ``self._columns``, an ``OrderedDict`` from the
    ``OrderedDict`` module imported at the top of this file (presumably an
    insertion-ordered mapping -- confirm against that module).
    """

    def __init__(self, name=None):
        """Create an empty population with an optional name."""
        self._columns = OrderedDict.OrderedDict()
        self._name = name

    @property
    def name(self):
        """The population's name (may be None)."""
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    def add_individual(self, individual, alias=None):
        """Register an individual under its column number.

        Adding an individual equal to the one already stored for that column
        is a silent no-op.  Adding a different individual for an occupied
        column raises ValueError.

        The ``alias`` argument is accepted for backward compatibility but is
        currently not applied.
        """
        column = individual.column
        if column not in self._columns:
            self._columns[column] = individual
        elif self._columns[column] == individual:
            # TODO: should this be an error?
            # TODO: should we replace the alias using this entry?
            pass
        else:
            # A bare string cannot be raised; use a real exception type.
            raise ValueError('Duplicate column: {0}'.format(individual))

    def is_superset(self, other):
        """Return True if every individual of ``other`` is also in self."""
        for column, other_individual in other._columns.items():
            our_individual = self._columns.get(column)
            if our_individual is None or our_individual != other_individual:
                return False
        return True

    def is_disjoint(self, other):
        """Return True if self and ``other`` share no individual."""
        for column, our_individual in self._columns.items():
            other_individual = other._columns.get(column)
            if other_individual is not None and other_individual == our_individual:
                return False
        return True

    def column_list(self):
        """Return the column numbers as a list, in storage order."""
        # list() guarantees a real list even where keys() returns a view.
        return list(self._columns.keys())

    def individual_with_column(self, column):
        """Return the individual stored for ``column``, or None."""
        if column in self._columns:
            return self._columns[column]
        return None

    def tag_list(self, delimiter=':'):
        """Return ``'<column><delimiter><first word of name>'`` per individual."""
        entries = []
        for column, individual in self._columns.items():
            first_token = individual.name.split()[0]
            entries.append('{0}{1}{2}'.format(column, delimiter, first_token))
        return entries

    def to_string(self, delimiter=':', separator=' ', replace_names_with=None):
        """Render the population as ``separator``-joined ``column:name`` entries.

        When ``replace_names_with`` is not None it is used in place of every
        individual's name.
        """
        entries = []
        for column, individual in self._columns.items():
            value = individual.name
            if replace_names_with is not None:
                value = replace_names_with
            entries.append('{0}{1}{2}'.format(column, delimiter, value))
        return separator.join(entries)

    def __str__(self):
        return self.to_string()

    def from_population_file(self, filename):
        """Add individuals read from a tab-separated file.

        Each non-blank line holds ``column<TAB>name<TAB>alias``; the alias
        field may be empty or omitted.  Blank lines are skipped.  Any other
        field count raises ValueError.
        """
        with open(filename) as fh:
            for line in fh:
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                fields = line.split('\t')
                if len(fields) == 2:
                    # Tolerate a missing alias field.
                    fields.append('')
                if len(fields) != 3:
                    raise ValueError(
                        'Malformed population line (expected column, name and optional alias): {0!r}'.format(line))
                column, name, alias = fields
                alias = alias.strip()
                individual = Individual(column, name)
                if alias:
                    individual.alias = alias
                self.add_individual(individual)

    def from_tag_list(self, tag_list, delimiter=':'):
        """Add individuals from ``'<column><delimiter><name>'`` tags.

        Only the first delimiter splits the tag, so the name itself may
        contain the delimiter.  A tag without a delimiter raises ValueError.
        """
        for tag in tag_list:
            column, name = tag.split(delimiter, 1)
            individual = Individual(column, name)
            self.add_individual(individual)

    def from_wrapped_dict(self, wrapped_dict):
        """Add individuals from a wrapped ``{name: column}`` mapping.

        ``wrapped_dict`` is decoded by ``unwrap_dict``.
        """
        unwrapped_dict = self.unwrap_dict(wrapped_dict)
        for name, column in unwrapped_dict.items():
            individual = Individual(column, name)
            self.add_individual(individual)

    def unwrap_dict(self, wrapped_dict):
        """Decode a base64-encoded, zlib-compressed JSON document.

        On Python 2 every unicode string inside decoded JSON objects is
        converted to a UTF-8 byte string; on Python 3 strings are left as
        ``str``.
        """
        decoded_value = self.decode_value(wrapped_dict)
        decompressed_value = self.decompress_value(decoded_value)

        if str is bytes:
            # Python 2 only: this branch is never evaluated on Python 3,
            # where the ``unicode`` builtin does not exist.
            text_type = unicode  # noqa: F821
        else:
            text_type = None

        def _decode_item(item):
            if text_type is not None and isinstance(item, text_type):
                return item.encode('utf-8')
            if isinstance(item, list):
                return [_decode_item(element) for element in item]
            if isinstance(item, dict):
                return _decode_dict(item)
            return item

        def _decode_dict(data):
            return dict((_decode_item(key), _decode_item(value))
                        for key, value in data.items())

        return json.loads(decompressed_value, object_hook=_decode_dict)

    def decode_value(self, value):
        """Base64-decode ``value``.

        On a decoding error a message is written to stderr and the process
        exits with status 1.
        """
        try:
            return base64.b64decode(value)
        except (TypeError, ValueError) as message:
            # Python 2 raises TypeError; Python 3 raises binascii.Error,
            # which is a ValueError subclass.
            sys.stderr.write('base64.b64decode: {0}: {1}\n'.format(message, value))
            sys.exit(1)

    def decompress_value(self, value):
        """zlib-decompress ``value``.

        On a decompression error a message is written to stderr and the
        process exits with status 1.
        """
        try:
            return zlib.decompress(value)
        except zlib.error as message:
            sys.stderr.write('zlib.decompress: {0}\n'.format(message))
            sys.exit(1)

    def individual_names(self):
        """Yield the first whitespace-separated word of each individual's name."""
        for column, individual in self._columns.items():
            first_token = individual.name.split()[0]
            yield first_token
| 0 | 185 |
