Mercurial > repos > chrisb > gap_all_glycan_tools
comparison manipulate/extend_linearcode/lc_extend.py @ 0:89592faa2875 draft
Uploaded
author | chrisb |
---|---|
date | Wed, 23 Mar 2016 14:35:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:89592faa2875 |
---|---|
1 __license__ = "MIT" | |
2 __version = "0.3" | |
3 | |
4 def splitkeepsep(s, sep): # seems great but adds element to an awkward spot! | |
5 """items and keep the separator i.e. a,b,c becomes ['a,', 'b,', 'c']""" | |
6 import re | |
7 | |
8 return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep else acc + [elem], | |
9 re.split("(%s)" % re.escape(sep), s), []) | |
10 | |
11 | |
12 def create_logger(logfile): | |
13 import logging as logging | |
14 # logging.basicConfig(filename=logfile, level=logging.DEBUG) | |
15 logger = logging.getLogger() | |
16 hdlr = logging.FileHandler(logfile) | |
17 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') | |
18 hdlr.setFormatter(formatter) | |
19 logger.addHandler(hdlr) # technically calling this multiple times will not add additional loggers | |
20 logger.setLevel(logging.DEBUG) | |
21 | |
22 | |
23 def create_permutations(num_branches, num_additions): | |
24 """ | |
25 # create all permutations of placing num_additions monosaccharides on the end on a glycan with num_branches branch points | |
26 """ | |
27 permute = [] | |
28 import itertools | |
29 | |
30 a = range(0, num_branches) | |
31 if num_additions > 12 or num_additions == 1: # unsupported return None | |
32 return [] | |
33 elif num_additions == 2: | |
34 for r in itertools.product(a, a): | |
35 if len(set(r)) == num_additions: | |
36 permute.append((r[0], r[1])) | |
37 elif num_additions == 3: | |
38 for r in itertools.product(a, a, a): | |
39 if len(set(r)) == num_additions: | |
40 permute.append((r[0], r[1], r[2])) | |
41 elif num_additions == 4: | |
42 for r in itertools.product(a, a, a, a): | |
43 if len(set(r)) == num_additions: | |
44 permute.append((r[0], r[1], r[2], r[3])) | |
45 elif num_additions == 5: | |
46 for r in itertools.product(a, a, a, a, a): | |
47 if len(set(r)) == num_additions: | |
48 permute.append((r[0], r[1], r[2], r[3], r[4])) | |
49 elif num_additions == 6: | |
50 for r in itertools.product(a, a, a, a, a, a): | |
51 if len(set(r)) == num_additions: | |
52 permute.append((r[0], r[1], r[2], r[3], r[4], r[5])) | |
53 elif num_additions == 7: | |
54 for r in itertools.product(a, a, a, a, a, a, a): | |
55 if len(set(r)) == num_additions: | |
56 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6])) | |
57 elif num_additions == 8: | |
58 for r in itertools.product(a, a, a, a, a, a, a, a): | |
59 if len(set(r)) == num_additions: | |
60 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7])) | |
61 elif num_additions == 9: | |
62 for r in itertools.product(a, a, a, a, a, a, a, a, a): | |
63 if len(set(r)) == num_additions: | |
64 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8])) | |
65 elif num_additions == 10: | |
66 for r in itertools.product(a, a, a, a, a, a, a, a, a, a): | |
67 if len(set(r)) == num_additions: | |
68 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9])) | |
69 elif num_additions == 11: | |
70 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a): | |
71 if len(set(r)) == num_additions: | |
72 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10])) | |
73 elif num_additions == 12: | |
74 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a, a): | |
75 if len(set(r)) == num_additions: | |
76 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11])) | |
77 return permute | |
78 | |
79 | |
80 def read_linearcode(inputstream, logfile="logging.log", extend=False): | |
81 """ | |
82 read linear code and either append or expand to provide all forms | |
83 append mode, appends xylose and comma'd glycans to the root | |
84 extend mode create all permutations if the comma'd glycan | |
85 more on append: # . read linear code and append comma separated data as a new rooted branch with a xylose tag | |
86 # example (A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4(Fa6)GN,F,F,NN,A??GN) | |
87 # will become | |
88 # (((A??GN??)(F??F??)(NN??)X??)(A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4)(Fa6)GN) | |
89 # as all the additional glycans are attached to a xylose at the root position | |
90 # but why? analysis tools do not support these unattached glycans and creating all the permutations | |
91 # is a large task. Additionally, extend mode produces too many glycans of unknown 'veracity'. | |
92 :param inputstream: | |
93 :param logfile: | |
94 :param extend: boolean flag. default is append (false). True extend and creates all permutes | |
95 :return: | |
96 """ | |
97 import logging as logging | |
98 | |
99 create_logger(logfile) | |
100 linearcode_list = [] | |
101 if inputstream is None or inputstream == [] or inputstream == "": | |
102 raise IOError("empty input stream") | |
103 for line in inputstream: | |
104 line = line.strip() | |
105 # does this linear code have unassigned glycans on leaves | |
106 if "," in line: | |
107 # remove brackets over entire linear code | |
108 | |
109 if line[0] == "(" and line[-1] == ")": | |
110 lin = line[1:-1] | |
111 else: | |
112 lin = line | |
113 # split by , to get additional glycans | |
114 glycan = lin.split(",") | |
115 additional_glycans = glycan[1:] | |
116 logging.debug('These are the %s additional glycans', additional_glycans) | |
117 logging.debug('There are %s additional glycans', len(additional_glycans)) | |
118 | |
119 # assume additional glycans of size one need a ?? appended to indicate unknown linkage | |
120 for i in range(0, len(additional_glycans)): | |
121 additional_glycans[i] += "??" | |
122 | |
123 if extend: # permute | |
124 # How many branches | |
125 gly = splitkeepsep(glycan[0], "(") | |
126 # . Intervention to prevent core fucose glycosylation | |
127 if len(gly[-1]) < 7 and gly[-1][0].upper() == "F": | |
128 logging.debug('Merging Core Fucose with adjacent branch %s %s %s ', gly[-1], len(gly[-1]), | |
129 gly[-1][0].upper()) # initial part + splits | |
130 last = gly.pop() | |
131 last2 = gly.pop() | |
132 gly.append(last2 + last) | |
133 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits | |
134 logging.debug('These are the %s additional leaves', gly) | |
135 | |
136 # Permutations... | |
137 # . Only compute permutations for certain size as not sure how to generate for all sizes | |
138 if len(additional_glycans) > len(gly): | |
139 # ignoring for now, although this is possible | |
140 logging.warning('Glycans> Leaves - Ignored adding permutations') | |
141 linearcode_list.append(line) | |
142 elif len(additional_glycans) > 12: | |
143 logging.warning('Ignored adding permutations, too many additional glys %s', additional_glycans) | |
144 linearcode_list.append(line) | |
145 elif len(additional_glycans) == 1: | |
146 logging.debug('There are %s permutations', len(gly)) | |
147 for i in range(0, len(gly)): | |
148 tempgly = list(gly) # make sure I make a new copy every time | |
149 tempgly[i] = additional_glycans[0] + tempgly[i] | |
150 linearcode_list.append("".join(tempgly)) | |
151 logging.debug('This, %s, is one of the permutations', "".join(tempgly)) | |
152 else: | |
153 perms = create_permutations(len(gly), len(additional_glycans)) | |
154 logging.debug('There are %s permutations', len(perms)) | |
155 for itm in perms: | |
156 tempgly = list(gly) # make sure I make a new copy every time | |
157 # .refactor | |
158 for idx in range(0, len(additional_glycans)): | |
159 tempgly[int(itm[idx])] = additional_glycans[idx] + tempgly[int(itm[idx])] | |
160 logging.debug('This, %s, is one of the permutations', "".join(tempgly)) | |
161 linearcode_list.append("".join(tempgly)) | |
162 | |
163 else: # append xylose tag | |
164 # create the additional branch and keep things sorted | |
165 additional_branch = "(" | |
166 for subtype in sorted(set(additional_glycans)): | |
167 base = "(" | |
168 for i in range(0, additional_glycans.count(subtype)): | |
169 base += subtype | |
170 base += ")" | |
171 additional_branch += base | |
172 additional_branch += "X??)" | |
173 logging.debug('These are the %s additional glycans after munging', additional_branch) | |
174 # How many branches | |
175 gly = splitkeepsep(glycan[0], "(") | |
176 | |
177 # . check if the branch needs brackets | |
178 if ")(" not in gly[-2]: # then need to bracket this | |
179 gly[-2] = gly[-2][:-1] + ")(" # bracket this | |
180 gly[0] = "(" + gly[0] | |
181 logging.debug('The glycan with brackets enclosing previous main branch%s', gly) | |
182 else: | |
183 # .. todo check all others until fails and bracket. | |
184 pass | |
185 | |
186 # prepend the additional branch | |
187 gly.insert(0, additional_branch) | |
188 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits | |
189 logging.debug('These are the %s additional leaves', gly) | |
190 linearcode_list.append("".join(gly)) | |
191 else: | |
192 if line != "": | |
193 linearcode_list.append(line) | |
194 return linearcode_list | |
195 | |
196 | |
197 if __name__ == "__main__": | |
198 from optparse import OptionParser | |
199 | |
200 usage = "usage: python %prog [options]\n" | |
201 parser = OptionParser(usage=usage) | |
202 parser.add_option("-i", action="store", type="string", dest="i", default="input", | |
203 help="input glycan linear code ") | |
204 parser.add_option("-l", action="store", type="string", dest="l", default="linearcode.log", | |
205 help="log file name ") | |
206 parser.add_option("-o", action="store", type="string", dest="o", default="output", | |
207 help="output file name (linear code) ") | |
208 parser.add_option("-e", action="store_true", dest="e", default=False, | |
209 help="turn on extender, which creates all possible permutations . Append by default") | |
210 (options, args) = parser.parse_args() | |
211 | |
212 try: | |
213 instream = file(options.i, 'r') | |
214 except Exception as e: | |
215 raise IOError(e, "the input file specified does not exist. Use -h flag for help") | |
216 print options.e | |
217 allpermutes = read_linearcode(instream, options.l, options.e) | |
218 try: | |
219 out = file(options.o, "w") | |
220 except Exception as e: | |
221 raise IOError(e, "cannot write to output file. Use -h flag for help") | |
222 try: | |
223 for item in allpermutes: | |
224 out.write(item + "\r\n") | |
225 except Exception as e: | |
226 raise e | |
227 finally: | |
228 out.close() |