comparison manipulate/extend_linearcode/lc_extend.py @ 0:89592faa2875 draft

Uploaded
author chrisb
date Wed, 23 Mar 2016 14:35:56 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:89592faa2875
1 __license__ = "MIT"
2 __version = "0.3"
3
4 def splitkeepsep(s, sep): # seems great but adds element to an awkward spot!
5 """items and keep the separator i.e. a,b,c becomes ['a,', 'b,', 'c']"""
6 import re
7
8 return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep else acc + [elem],
9 re.split("(%s)" % re.escape(sep), s), [])
10
11
12 def create_logger(logfile):
13 import logging as logging
14 # logging.basicConfig(filename=logfile, level=logging.DEBUG)
15 logger = logging.getLogger()
16 hdlr = logging.FileHandler(logfile)
17 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
18 hdlr.setFormatter(formatter)
19 logger.addHandler(hdlr) # technically calling this multiple times will not add additional loggers
20 logger.setLevel(logging.DEBUG)
21
22
23 def create_permutations(num_branches, num_additions):
24 """
25 # create all permutations of placing num_additions monosaccharides on the end on a glycan with num_branches branch points
26 """
27 permute = []
28 import itertools
29
30 a = range(0, num_branches)
31 if num_additions > 12 or num_additions == 1: # unsupported return None
32 return []
33 elif num_additions == 2:
34 for r in itertools.product(a, a):
35 if len(set(r)) == num_additions:
36 permute.append((r[0], r[1]))
37 elif num_additions == 3:
38 for r in itertools.product(a, a, a):
39 if len(set(r)) == num_additions:
40 permute.append((r[0], r[1], r[2]))
41 elif num_additions == 4:
42 for r in itertools.product(a, a, a, a):
43 if len(set(r)) == num_additions:
44 permute.append((r[0], r[1], r[2], r[3]))
45 elif num_additions == 5:
46 for r in itertools.product(a, a, a, a, a):
47 if len(set(r)) == num_additions:
48 permute.append((r[0], r[1], r[2], r[3], r[4]))
49 elif num_additions == 6:
50 for r in itertools.product(a, a, a, a, a, a):
51 if len(set(r)) == num_additions:
52 permute.append((r[0], r[1], r[2], r[3], r[4], r[5]))
53 elif num_additions == 7:
54 for r in itertools.product(a, a, a, a, a, a, a):
55 if len(set(r)) == num_additions:
56 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6]))
57 elif num_additions == 8:
58 for r in itertools.product(a, a, a, a, a, a, a, a):
59 if len(set(r)) == num_additions:
60 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]))
61 elif num_additions == 9:
62 for r in itertools.product(a, a, a, a, a, a, a, a, a):
63 if len(set(r)) == num_additions:
64 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8]))
65 elif num_additions == 10:
66 for r in itertools.product(a, a, a, a, a, a, a, a, a, a):
67 if len(set(r)) == num_additions:
68 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9]))
69 elif num_additions == 11:
70 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a):
71 if len(set(r)) == num_additions:
72 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10]))
73 elif num_additions == 12:
74 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a, a):
75 if len(set(r)) == num_additions:
76 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11]))
77 return permute
78
79
80 def read_linearcode(inputstream, logfile="logging.log", extend=False):
81 """
82 read linear code and either append or expand to provide all forms
83 append mode, appends xylose and comma'd glycans to the root
84 extend mode create all permutations if the comma'd glycan
85 more on append: # . read linear code and append comma separated data as a new rooted branch with a xylose tag
86 # example (A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4(Fa6)GN,F,F,NN,A??GN)
87 # will become
88 # (((A??GN??)(F??F??)(NN??)X??)(A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4)(Fa6)GN)
89 # as all the additional glycans are attached to a xylose at the root position
90 # but why? analysis tools do not support these unattached glycans and creating all the permutations
91 # is a large task. Additionally, extend mode produces too many glycans of unknown 'veracity'.
92 :param inputstream:
93 :param logfile:
94 :param extend: boolean flag. default is append (false). True extend and creates all permutes
95 :return:
96 """
97 import logging as logging
98
99 create_logger(logfile)
100 linearcode_list = []
101 if inputstream is None or inputstream == [] or inputstream == "":
102 raise IOError("empty input stream")
103 for line in inputstream:
104 line = line.strip()
105 # does this linear code have unassigned glycans on leaves
106 if "," in line:
107 # remove brackets over entire linear code
108
109 if line[0] == "(" and line[-1] == ")":
110 lin = line[1:-1]
111 else:
112 lin = line
113 # split by , to get additional glycans
114 glycan = lin.split(",")
115 additional_glycans = glycan[1:]
116 logging.debug('These are the %s additional glycans', additional_glycans)
117 logging.debug('There are %s additional glycans', len(additional_glycans))
118
119 # assume additional glycans of size one need a ?? appended to indicate unknown linkage
120 for i in range(0, len(additional_glycans)):
121 additional_glycans[i] += "??"
122
123 if extend: # permute
124 # How many branches
125 gly = splitkeepsep(glycan[0], "(")
126 # . Intervention to prevent core fucose glycosylation
127 if len(gly[-1]) < 7 and gly[-1][0].upper() == "F":
128 logging.debug('Merging Core Fucose with adjacent branch %s %s %s ', gly[-1], len(gly[-1]),
129 gly[-1][0].upper()) # initial part + splits
130 last = gly.pop()
131 last2 = gly.pop()
132 gly.append(last2 + last)
133 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits
134 logging.debug('These are the %s additional leaves', gly)
135
136 # Permutations...
137 # . Only compute permutations for certain size as not sure how to generate for all sizes
138 if len(additional_glycans) > len(gly):
139 # ignoring for now, although this is possible
140 logging.warning('Glycans> Leaves - Ignored adding permutations')
141 linearcode_list.append(line)
142 elif len(additional_glycans) > 12:
143 logging.warning('Ignored adding permutations, too many additional glys %s', additional_glycans)
144 linearcode_list.append(line)
145 elif len(additional_glycans) == 1:
146 logging.debug('There are %s permutations', len(gly))
147 for i in range(0, len(gly)):
148 tempgly = list(gly) # make sure I make a new copy every time
149 tempgly[i] = additional_glycans[0] + tempgly[i]
150 linearcode_list.append("".join(tempgly))
151 logging.debug('This, %s, is one of the permutations', "".join(tempgly))
152 else:
153 perms = create_permutations(len(gly), len(additional_glycans))
154 logging.debug('There are %s permutations', len(perms))
155 for itm in perms:
156 tempgly = list(gly) # make sure I make a new copy every time
157 # .refactor
158 for idx in range(0, len(additional_glycans)):
159 tempgly[int(itm[idx])] = additional_glycans[idx] + tempgly[int(itm[idx])]
160 logging.debug('This, %s, is one of the permutations', "".join(tempgly))
161 linearcode_list.append("".join(tempgly))
162
163 else: # append xylose tag
164 # create the additional branch and keep things sorted
165 additional_branch = "("
166 for subtype in sorted(set(additional_glycans)):
167 base = "("
168 for i in range(0, additional_glycans.count(subtype)):
169 base += subtype
170 base += ")"
171 additional_branch += base
172 additional_branch += "X??)"
173 logging.debug('These are the %s additional glycans after munging', additional_branch)
174 # How many branches
175 gly = splitkeepsep(glycan[0], "(")
176
177 # . check if the branch needs brackets
178 if ")(" not in gly[-2]: # then need to bracket this
179 gly[-2] = gly[-2][:-1] + ")(" # bracket this
180 gly[0] = "(" + gly[0]
181 logging.debug('The glycan with brackets enclosing previous main branch%s', gly)
182 else:
183 # .. todo check all others until fails and bracket.
184 pass
185
186 # prepend the additional branch
187 gly.insert(0, additional_branch)
188 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits
189 logging.debug('These are the %s additional leaves', gly)
190 linearcode_list.append("".join(gly))
191 else:
192 if line != "":
193 linearcode_list.append(line)
194 return linearcode_list
195
196
197 if __name__ == "__main__":
198 from optparse import OptionParser
199
200 usage = "usage: python %prog [options]\n"
201 parser = OptionParser(usage=usage)
202 parser.add_option("-i", action="store", type="string", dest="i", default="input",
203 help="input glycan linear code ")
204 parser.add_option("-l", action="store", type="string", dest="l", default="linearcode.log",
205 help="log file name ")
206 parser.add_option("-o", action="store", type="string", dest="o", default="output",
207 help="output file name (linear code) ")
208 parser.add_option("-e", action="store_true", dest="e", default=False,
209 help="turn on extender, which creates all possible permutations . Append by default")
210 (options, args) = parser.parse_args()
211
212 try:
213 instream = file(options.i, 'r')
214 except Exception as e:
215 raise IOError(e, "the input file specified does not exist. Use -h flag for help")
216 print options.e
217 allpermutes = read_linearcode(instream, options.l, options.e)
218 try:
219 out = file(options.o, "w")
220 except Exception as e:
221 raise IOError(e, "cannot write to output file. Use -h flag for help")
222 try:
223 for item in allpermutes:
224 out.write(item + "\r\n")
225 except Exception as e:
226 raise e
227 finally:
228 out.close()