Mercurial > repos > chrisb > gap_all_glycan_tools
comparison manipulate/extend_linearcode/lc_extend.py @ 0:89592faa2875 draft
Uploaded
| author | chrisb |
|---|---|
| date | Wed, 23 Mar 2016 14:35:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:89592faa2875 |
|---|---|
| 1 __license__ = "MIT" | |
| 2 __version = "0.3" | |
| 3 | |
| 4 def splitkeepsep(s, sep): # seems great but adds element to an awkward spot! | |
| 5 """items and keep the separator i.e. a,b,c becomes ['a,', 'b,', 'c']""" | |
| 6 import re | |
| 7 | |
| 8 return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep else acc + [elem], | |
| 9 re.split("(%s)" % re.escape(sep), s), []) | |
| 10 | |
| 11 | |
| 12 def create_logger(logfile): | |
| 13 import logging as logging | |
| 14 # logging.basicConfig(filename=logfile, level=logging.DEBUG) | |
| 15 logger = logging.getLogger() | |
| 16 hdlr = logging.FileHandler(logfile) | |
| 17 formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') | |
| 18 hdlr.setFormatter(formatter) | |
| 19 logger.addHandler(hdlr) # technically calling this multiple times will not add additional loggers | |
| 20 logger.setLevel(logging.DEBUG) | |
| 21 | |
| 22 | |
| 23 def create_permutations(num_branches, num_additions): | |
| 24 """ | |
| 25 # create all permutations of placing num_additions monosaccharides on the end on a glycan with num_branches branch points | |
| 26 """ | |
| 27 permute = [] | |
| 28 import itertools | |
| 29 | |
| 30 a = range(0, num_branches) | |
| 31 if num_additions > 12 or num_additions == 1: # unsupported return None | |
| 32 return [] | |
| 33 elif num_additions == 2: | |
| 34 for r in itertools.product(a, a): | |
| 35 if len(set(r)) == num_additions: | |
| 36 permute.append((r[0], r[1])) | |
| 37 elif num_additions == 3: | |
| 38 for r in itertools.product(a, a, a): | |
| 39 if len(set(r)) == num_additions: | |
| 40 permute.append((r[0], r[1], r[2])) | |
| 41 elif num_additions == 4: | |
| 42 for r in itertools.product(a, a, a, a): | |
| 43 if len(set(r)) == num_additions: | |
| 44 permute.append((r[0], r[1], r[2], r[3])) | |
| 45 elif num_additions == 5: | |
| 46 for r in itertools.product(a, a, a, a, a): | |
| 47 if len(set(r)) == num_additions: | |
| 48 permute.append((r[0], r[1], r[2], r[3], r[4])) | |
| 49 elif num_additions == 6: | |
| 50 for r in itertools.product(a, a, a, a, a, a): | |
| 51 if len(set(r)) == num_additions: | |
| 52 permute.append((r[0], r[1], r[2], r[3], r[4], r[5])) | |
| 53 elif num_additions == 7: | |
| 54 for r in itertools.product(a, a, a, a, a, a, a): | |
| 55 if len(set(r)) == num_additions: | |
| 56 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6])) | |
| 57 elif num_additions == 8: | |
| 58 for r in itertools.product(a, a, a, a, a, a, a, a): | |
| 59 if len(set(r)) == num_additions: | |
| 60 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7])) | |
| 61 elif num_additions == 9: | |
| 62 for r in itertools.product(a, a, a, a, a, a, a, a, a): | |
| 63 if len(set(r)) == num_additions: | |
| 64 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8])) | |
| 65 elif num_additions == 10: | |
| 66 for r in itertools.product(a, a, a, a, a, a, a, a, a, a): | |
| 67 if len(set(r)) == num_additions: | |
| 68 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9])) | |
| 69 elif num_additions == 11: | |
| 70 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a): | |
| 71 if len(set(r)) == num_additions: | |
| 72 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10])) | |
| 73 elif num_additions == 12: | |
| 74 for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a, a): | |
| 75 if len(set(r)) == num_additions: | |
| 76 permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11])) | |
| 77 return permute | |
| 78 | |
| 79 | |
| 80 def read_linearcode(inputstream, logfile="logging.log", extend=False): | |
| 81 """ | |
| 82 read linear code and either append or expand to provide all forms | |
| 83 append mode, appends xylose and comma'd glycans to the root | |
| 84 extend mode create all permutations if the comma'd glycan | |
| 85 more on append: # . read linear code and append comma separated data as a new rooted branch with a xylose tag | |
| 86 # example (A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4(Fa6)GN,F,F,NN,A??GN) | |
| 87 # will become | |
| 88 # (((A??GN??)(F??F??)(NN??)X??)(A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4)(Fa6)GN) | |
| 89 # as all the additional glycans are attached to a xylose at the root position | |
| 90 # but why? analysis tools do not support these unattached glycans and creating all the permutations | |
| 91 # is a large task. Additionally, extend mode produces too many glycans of unknown 'veracity'. | |
| 92 :param inputstream: | |
| 93 :param logfile: | |
| 94 :param extend: boolean flag. default is append (false). True extend and creates all permutes | |
| 95 :return: | |
| 96 """ | |
| 97 import logging as logging | |
| 98 | |
| 99 create_logger(logfile) | |
| 100 linearcode_list = [] | |
| 101 if inputstream is None or inputstream == [] or inputstream == "": | |
| 102 raise IOError("empty input stream") | |
| 103 for line in inputstream: | |
| 104 line = line.strip() | |
| 105 # does this linear code have unassigned glycans on leaves | |
| 106 if "," in line: | |
| 107 # remove brackets over entire linear code | |
| 108 | |
| 109 if line[0] == "(" and line[-1] == ")": | |
| 110 lin = line[1:-1] | |
| 111 else: | |
| 112 lin = line | |
| 113 # split by , to get additional glycans | |
| 114 glycan = lin.split(",") | |
| 115 additional_glycans = glycan[1:] | |
| 116 logging.debug('These are the %s additional glycans', additional_glycans) | |
| 117 logging.debug('There are %s additional glycans', len(additional_glycans)) | |
| 118 | |
| 119 # assume additional glycans of size one need a ?? appended to indicate unknown linkage | |
| 120 for i in range(0, len(additional_glycans)): | |
| 121 additional_glycans[i] += "??" | |
| 122 | |
| 123 if extend: # permute | |
| 124 # How many branches | |
| 125 gly = splitkeepsep(glycan[0], "(") | |
| 126 # . Intervention to prevent core fucose glycosylation | |
| 127 if len(gly[-1]) < 7 and gly[-1][0].upper() == "F": | |
| 128 logging.debug('Merging Core Fucose with adjacent branch %s %s %s ', gly[-1], len(gly[-1]), | |
| 129 gly[-1][0].upper()) # initial part + splits | |
| 130 last = gly.pop() | |
| 131 last2 = gly.pop() | |
| 132 gly.append(last2 + last) | |
| 133 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits | |
| 134 logging.debug('These are the %s additional leaves', gly) | |
| 135 | |
| 136 # Permutations... | |
| 137 # . Only compute permutations for certain size as not sure how to generate for all sizes | |
| 138 if len(additional_glycans) > len(gly): | |
| 139 # ignoring for now, although this is possible | |
| 140 logging.warning('Glycans> Leaves - Ignored adding permutations') | |
| 141 linearcode_list.append(line) | |
| 142 elif len(additional_glycans) > 12: | |
| 143 logging.warning('Ignored adding permutations, too many additional glys %s', additional_glycans) | |
| 144 linearcode_list.append(line) | |
| 145 elif len(additional_glycans) == 1: | |
| 146 logging.debug('There are %s permutations', len(gly)) | |
| 147 for i in range(0, len(gly)): | |
| 148 tempgly = list(gly) # make sure I make a new copy every time | |
| 149 tempgly[i] = additional_glycans[0] + tempgly[i] | |
| 150 linearcode_list.append("".join(tempgly)) | |
| 151 logging.debug('This, %s, is one of the permutations', "".join(tempgly)) | |
| 152 else: | |
| 153 perms = create_permutations(len(gly), len(additional_glycans)) | |
| 154 logging.debug('There are %s permutations', len(perms)) | |
| 155 for itm in perms: | |
| 156 tempgly = list(gly) # make sure I make a new copy every time | |
| 157 # .refactor | |
| 158 for idx in range(0, len(additional_glycans)): | |
| 159 tempgly[int(itm[idx])] = additional_glycans[idx] + tempgly[int(itm[idx])] | |
| 160 logging.debug('This, %s, is one of the permutations', "".join(tempgly)) | |
| 161 linearcode_list.append("".join(tempgly)) | |
| 162 | |
| 163 else: # append xylose tag | |
| 164 # create the additional branch and keep things sorted | |
| 165 additional_branch = "(" | |
| 166 for subtype in sorted(set(additional_glycans)): | |
| 167 base = "(" | |
| 168 for i in range(0, additional_glycans.count(subtype)): | |
| 169 base += subtype | |
| 170 base += ")" | |
| 171 additional_branch += base | |
| 172 additional_branch += "X??)" | |
| 173 logging.debug('These are the %s additional glycans after munging', additional_branch) | |
| 174 # How many branches | |
| 175 gly = splitkeepsep(glycan[0], "(") | |
| 176 | |
| 177 # . check if the branch needs brackets | |
| 178 if ")(" not in gly[-2]: # then need to bracket this | |
| 179 gly[-2] = gly[-2][:-1] + ")(" # bracket this | |
| 180 gly[0] = "(" + gly[0] | |
| 181 logging.debug('The glycan with brackets enclosing previous main branch%s', gly) | |
| 182 else: | |
| 183 # .. todo check all others until fails and bracket. | |
| 184 pass | |
| 185 | |
| 186 # prepend the additional branch | |
| 187 gly.insert(0, additional_branch) | |
| 188 logging.debug('There are %s additional leaves', len(gly)) # initial part + splits | |
| 189 logging.debug('These are the %s additional leaves', gly) | |
| 190 linearcode_list.append("".join(gly)) | |
| 191 else: | |
| 192 if line != "": | |
| 193 linearcode_list.append(line) | |
| 194 return linearcode_list | |
| 195 | |
| 196 | |
| 197 if __name__ == "__main__": | |
| 198 from optparse import OptionParser | |
| 199 | |
| 200 usage = "usage: python %prog [options]\n" | |
| 201 parser = OptionParser(usage=usage) | |
| 202 parser.add_option("-i", action="store", type="string", dest="i", default="input", | |
| 203 help="input glycan linear code ") | |
| 204 parser.add_option("-l", action="store", type="string", dest="l", default="linearcode.log", | |
| 205 help="log file name ") | |
| 206 parser.add_option("-o", action="store", type="string", dest="o", default="output", | |
| 207 help="output file name (linear code) ") | |
| 208 parser.add_option("-e", action="store_true", dest="e", default=False, | |
| 209 help="turn on extender, which creates all possible permutations . Append by default") | |
| 210 (options, args) = parser.parse_args() | |
| 211 | |
| 212 try: | |
| 213 instream = file(options.i, 'r') | |
| 214 except Exception as e: | |
| 215 raise IOError(e, "the input file specified does not exist. Use -h flag for help") | |
| 216 print options.e | |
| 217 allpermutes = read_linearcode(instream, options.l, options.e) | |
| 218 try: | |
| 219 out = file(options.o, "w") | |
| 220 except Exception as e: | |
| 221 raise IOError(e, "cannot write to output file. Use -h flag for help") | |
| 222 try: | |
| 223 for item in allpermutes: | |
| 224 out.write(item + "\r\n") | |
| 225 except Exception as e: | |
| 226 raise e | |
| 227 finally: | |
| 228 out.close() |
