view manipulate/extend_linearcode/lc_extend.py @ 1:0a5e0df17054 draft default tip

Uploaded
author chrisb
date Fri, 06 May 2016 08:05:48 -0400
parents 89592faa2875
children
line wrap: on
line source

__license__ = "MIT"
__version = "0.3"

def splitkeepsep(s, sep):  # seems great but adds element to an awkward spot!
    """items and keep the separator i.e. a,b,c becomes ['a,', 'b,', 'c']"""
    import re

    return reduce(lambda acc, elem: acc[:-1] + [acc[-1] + elem] if elem == sep else acc + [elem],
                  re.split("(%s)" % re.escape(sep), s), [])


def create_logger(logfile):
    import logging as logging
    # logging.basicConfig(filename=logfile, level=logging.DEBUG)
    logger = logging.getLogger()
    hdlr = logging.FileHandler(logfile)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)  # technically calling this multiple times will not add additional loggers
    logger.setLevel(logging.DEBUG)


def create_permutations(num_branches, num_additions):
    """
    # create all permutations of placing num_additions monosaccharides on the end on a glycan with num_branches branch points
    """
    permute = []
    import itertools

    a = range(0, num_branches)
    if num_additions > 12 or num_additions == 1:  # unsupported return None
        return []
    elif num_additions == 2:
        for r in itertools.product(a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1]))
    elif num_additions == 3:
        for r in itertools.product(a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2]))
    elif num_additions == 4:
        for r in itertools.product(a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3]))
    elif num_additions == 5:
        for r in itertools.product(a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4]))
    elif num_additions == 6:
        for r in itertools.product(a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5]))
    elif num_additions == 7:
        for r in itertools.product(a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6]))
    elif num_additions == 8:
        for r in itertools.product(a, a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]))
    elif num_additions == 9:
        for r in itertools.product(a, a, a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8]))
    elif num_additions == 10:
        for r in itertools.product(a, a, a, a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9]))
    elif num_additions == 11:
        for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10]))
    elif num_additions == 12:
        for r in itertools.product(a, a, a, a, a, a, a, a, a, a, a, a):
            if len(set(r)) == num_additions:
                permute.append((r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11]))
    return permute


def read_linearcode(inputstream, logfile="logging.log", extend=False):
    """
     read linear code and either append or expand to provide all forms
     append mode, appends xylose and comma'd glycans to the root
     extend mode create all permutations if the comma'd glycan
     more on append: # . read linear code and append comma separated data as a new rooted branch with a xylose tag
     # example (A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4(Fa6)GN,F,F,NN,A??GN)
     # will become
     #  (((A??GN??)(F??F??)(NN??)X??)(A??GN??(A??GN??)Ma3(A??GN??(A??GN??)Ma6)Mb4GNb4)(Fa6)GN)
     # as all the additional glycans are attached to a xylose at the root position
     # but why? analysis tools do not support these unattached glycans and creating all the permutations
     # is a large task. Additionally, extend mode produces too many glycans of unknown 'veracity'.
    :param inputstream:
    :param logfile:
    :param extend: boolean flag. default is append (false). True extend and creates all permutes
    :return:
    """
    import logging as logging

    create_logger(logfile)
    linearcode_list = []
    if inputstream is None or inputstream == [] or inputstream == "":
        raise IOError("empty input stream")
    for line in inputstream:
        line = line.strip()
        # does this linear code have unassigned glycans on leaves
        if "," in line:
            # remove brackets over entire linear code

            if line[0] == "(" and line[-1] == ")":
                lin = line[1:-1]
            else:
                lin = line
            # split by , to get additional glycans
            glycan = lin.split(",")
            additional_glycans = glycan[1:]
            logging.debug('These are the %s additional glycans', additional_glycans)
            logging.debug('There are %s additional glycans', len(additional_glycans))

            # assume additional glycans of size one need a ?? appended to indicate unknown linkage
            for i in range(0, len(additional_glycans)):
                additional_glycans[i] += "??"

            if extend:  # permute
                # How many branches
                gly = splitkeepsep(glycan[0], "(")
                # . Intervention to prevent core fucose glycosylation
                if len(gly[-1]) < 7 and gly[-1][0].upper() == "F":
                    logging.debug('Merging Core Fucose with adjacent branch %s %s %s ', gly[-1], len(gly[-1]),
                                  gly[-1][0].upper())  # initial part + splits
                    last = gly.pop()
                    last2 = gly.pop()
                    gly.append(last2 + last)
                logging.debug('There are %s additional leaves', len(gly))  # initial part + splits
                logging.debug('These are the %s additional leaves', gly)

                # Permutations...
                # . Only compute permutations for certain size as not sure how to generate for all sizes
                if len(additional_glycans) > len(gly):
                    # ignoring for now, although this is possible
                    logging.warning('Glycans> Leaves - Ignored adding permutations')
                    linearcode_list.append(line)
                elif len(additional_glycans) > 12:
                    logging.warning('Ignored adding permutations, too many additional glys %s', additional_glycans)
                    linearcode_list.append(line)
                elif len(additional_glycans) == 1:
                    logging.debug('There are %s permutations', len(gly))
                    for i in range(0, len(gly)):
                        tempgly = list(gly)  # make sure I make a new copy every time
                        tempgly[i] = additional_glycans[0] + tempgly[i]
                        linearcode_list.append("".join(tempgly))
                        logging.debug('This, %s,  is one of the  permutations', "".join(tempgly))
                else:
                    perms = create_permutations(len(gly), len(additional_glycans))
                    logging.debug('There are %s permutations', len(perms))
                    for itm in perms:
                        tempgly = list(gly)  # make sure I make a new copy every time
                        # .refactor
                        for idx in range(0, len(additional_glycans)):
                            tempgly[int(itm[idx])] = additional_glycans[idx] + tempgly[int(itm[idx])]
                        logging.debug('This, %s,  is one of the  permutations', "".join(tempgly))
                        linearcode_list.append("".join(tempgly))

            else:  # append xylose tag
                # create the additional branch and keep things sorted
                additional_branch = "("
                for subtype in sorted(set(additional_glycans)):
                    base = "("
                    for i in range(0, additional_glycans.count(subtype)):
                        base += subtype
                    base += ")"
                    additional_branch += base
                additional_branch += "X??)"
                logging.debug('These are the %s additional glycans after munging', additional_branch)
                # How many branches
                gly = splitkeepsep(glycan[0], "(")

                # . check if the branch needs brackets
                if ")(" not in gly[-2]:  # then need to bracket this
                    gly[-2] = gly[-2][:-1] + ")("  # bracket this
                    gly[0] = "(" + gly[0]
                    logging.debug('The glycan with brackets enclosing previous main branch%s', gly)
                else:
                    # .. todo check all others until fails and bracket.
                    pass

                # prepend the additional branch
                gly.insert(0, additional_branch)
                logging.debug('There are %s additional leaves', len(gly))  # initial part + splits
                logging.debug('These are the %s additional leaves', gly)
                linearcode_list.append("".join(gly))
        else:
            if line != "":
                linearcode_list.append(line)
    return linearcode_list


if __name__ == "__main__":
    from optparse import OptionParser

    usage = "usage: python %prog [options]\n"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", action="store", type="string", dest="i", default="input",
                      help="input glycan linear code ")
    parser.add_option("-l", action="store", type="string", dest="l", default="linearcode.log",
                      help="log file name ")
    parser.add_option("-o", action="store", type="string", dest="o", default="output",
                      help="output file name (linear code) ")
    parser.add_option("-e", action="store_true", dest="e", default=False,
                      help="turn on extender, which creates all possible permutations . Append by default")
    (options, args) = parser.parse_args()

    try:
        instream = file(options.i, 'r')
    except Exception as e:
        raise IOError(e, "the input file specified does not exist. Use -h flag for help")
    print options.e
    allpermutes = read_linearcode(instream, options.l, options.e)
    try:
        out = file(options.o, "w")
    except Exception as e:
        raise IOError(e, "cannot write to output file. Use -h flag for help")
    try:
        for item in allpermutes:
            out.write(item + "\r\n")
    except Exception as e:
        raise e
    finally:
        out.close()