Mercurial > repos > chrisb > gap_all_glycan_tools
view manipulate/extend_linearcode/lc_extend.py @ 1:0a5e0df17054 draft default tip
Uploaded
author | chrisb |
---|---|
date | Fri, 06 May 2016 08:05:48 -0400 |
parents | 89592faa2875 |
children |
line wrap: on
line source
__license__ = "MIT" __version = "0.3" def splitkeepsep(s, sep): # seems great but adds element to an awkward spot! """items and keep the separator i.e. a,b,c becomes ['a,', 'b,', 'c']""" import re return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep else acc + [elem], re.split("(%s)" % re.escape(sep), s), []) def create_logger(logfile): import logging as logging # logging.basicConfig(filename=logfile, level=logging.DEBUG) logger = logging.getLogger() hdlr = logging.FileHandler(logfile) formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') hdlr.setFormatter(formatter) logger.addHandler(hdlr) # technically calling this multiple times will not add additional loggers logger.setLevel(logging.DEBUG) def create_permutations(num_branches, num_additions): """ # create all permutations of placing num_additions monosaccharides on the end on a glycan with num_branches branch points """ permute = [] import itertools a = range(0, num_branches) if num_additions > 12 or num_additions == 1: # unsupported return None return [] elif num_additions == 2: for r in itertools.product(a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1])) elif num_additions == 3: for r in itertools.product(a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2])) elif num_additions == 4: for r in itertools.product(a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3])) elif num_additions == 5: for r in itertools.product(a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4])) elif num_additions == 6: for r in itertools.product(a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5])) elif num_additions == 7: for r in itertools.product(a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6])) elif num_additions == 8: for r in itertools.product(a, a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7])) elif num_additions == 9: for r in itertools.product(a, a, a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8])) elif num_additions == 10: for r in itertools.product(a, a, a, a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9])) elif num_additions == 11: for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10])) elif num_additions == 12: for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a, a): if len(set(r)) == num_additions: permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11])) return permute def read_linearcode(inputstream, logfile="logging.log", extend=False): """ read linear code and either append or expand to provide all forms append mode, appends xylose and comma'd glycans to the root extend mode create all permutations if the comma'd glycan more on append: # . read linear code and append comma separated data as a new rooted branch with a xylose tag # example (A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4(Fa6)GN,F,F,NN,A??GN) # will become # (((A??GN??)(F??F??)(NN??)X??)(A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4)(Fa6)GN) # as all the additional glycans are attached to a xylose at the root position # but why? analysis tools do not support these unattached glycans and creating all the permutations # is a large task. Additionally, extend mode produces too many glycans of unknown 'veracity'. :param inputstream: :param logfile: :param extend: boolean flag. default is append (false). True extend and creates all permutes :return: """ import logging as logging create_logger(logfile) linearcode_list = [] if inputstream is None or inputstream == [] or inputstream == "": raise IOError("empty input stream") for line in inputstream: line = line.strip() # does this linear code have unassigned glycans on leaves if "," in line: # remove brackets over entire linear code if line[0] == "(" and line[-1] == ")": lin = line[1:-1] else: lin = line # split by , to get additional glycans glycan = lin.split(",") additional_glycans = glycan[1:] logging.debug('These are the %s additional glycans', additional_glycans) logging.debug('There are %s additional glycans', len(additional_glycans)) # assume additional glycans of size one need a ?? appended to indicate unknown linkage for i in range(0, len(additional_glycans)): additional_glycans[i] += "??" if extend: # permute # How many branches gly = splitkeepsep(glycan[0], "(") # . Intervention to prevent core fucose glycosylation if len(gly[-1]) < 7 and gly[-1][0].upper() == "F": logging.debug('Merging Core Fucose with adjacent branch %s %s %s ', gly[-1], len(gly[-1]), gly[-1][0].upper()) # initial part + splits last = gly.pop() last2 = gly.pop() gly.append(last2 + last) logging.debug('There are %s additional leaves', len(gly)) # initial part + splits logging.debug('These are the %s additional leaves', gly) # Permutations... # . Only compute permutations for certain size as not sure how to generate for all sizes if len(additional_glycans) > len(gly): # ignoring for now, although this is possible logging.warning('Glycans> Leaves - Ignored adding permutations') linearcode_list.append(line) elif len(additional_glycans) > 12: logging.warning('Ignored adding permutations, too many additional glys %s', additional_glycans) linearcode_list.append(line) elif len(additional_glycans) == 1: logging.debug('There are %s permutations', len(gly)) for i in range(0, len(gly)): tempgly = list(gly) # make sure I make a new copy every time tempgly[i] = additional_glycans[0] + tempgly[i] linearcode_list.append("".join(tempgly)) logging.debug('This, %s, is one of the permutations', "".join(tempgly)) else: perms = create_permutations(len(gly), len(additional_glycans)) logging.debug('There are %s permutations', len(perms)) for itm in perms: tempgly = list(gly) # make sure I make a new copy every time # .refactor for idx in range(0, len(additional_glycans)): tempgly[int(itm[idx])] = additional_glycans[idx] + tempgly[int(itm[idx])] logging.debug('This, %s, is one of the permutations', "".join(tempgly)) linearcode_list.append("".join(tempgly)) else: # append xylose tag # create the additional branch and keep things sorted additional_branch = "(" for subtype in sorted(set(additional_glycans)): base = "(" for i in range(0, additional_glycans.count(subtype)): base += subtype base += ")" additional_branch += base additional_branch += "X??)" logging.debug('These are the %s additional glycans after munging', additional_branch) # How many branches gly = splitkeepsep(glycan[0], "(") # . check if the branch needs brackets if ")(" not in gly[-2]: # then need to bracket this gly[-2] = gly[-2][:-1] + ")(" # bracket this gly[0] = "(" + gly[0] logging.debug('The glycan with brackets enclosing previous main branch%s', gly) else: # .. todo check all others until fails and bracket. pass # prepend the additional branch gly.insert(0, additional_branch) logging.debug('There are %s additional leaves', len(gly)) # initial part + splits logging.debug('These are the %s additional leaves', gly) linearcode_list.append("".join(gly)) else: if line != "": linearcode_list.append(line) return linearcode_list if __name__ == "__main__": from optparse import OptionParser usage = "usage: python %prog [options]\n" parser = OptionParser(usage=usage) parser.add_option("-i", action="store", type="string", dest="i", default="input", help="input glycan linear code ") parser.add_option("-l", action="store", type="string", dest="l", default="linearcode.log", help="log file name ") parser.add_option("-o", action="store", type="string", dest="o", default="output", help="output file name (linear code) ") parser.add_option("-e", action="store_true", dest="e", default=False, help="turn on extender, which creates all possible permutations . Append by default") (options, args) = parser.parse_args() try: instream = file(options.i, 'r') except Exception as e: raise IOError(e, "the input file specified does not exist. Use -h flag for help") print options.e allpermutes = read_linearcode(instream, options.l, options.e) try: out = file(options.o, "w") except Exception as e: raise IOError(e, "cannot write to output file. Use -h flag for help") try: for item in allpermutes: out.write(item + "\r\n") except Exception as e: raise e finally: out.close()