view spring_map.py @ 38:80a4b98121b6 draft

"planemo upload commit 22cd6b0fa88ce0ddc4052beab306f5ba10754f12"
author guerler
date Wed, 25 Nov 2020 17:38:24 +0000
parents 0be0af9e695d
children 172398348efd
line wrap: on
line source

#! /usr/bin/env python3
import argparse
import os

from spring_package.DBKit import createFile
from spring_package.Molecule import Molecule


def getId(line):
    line = line.strip()
    if len(line) != 6 or line[4:5] != "_":
        raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line)
    return line[:4].upper() + line[4:6]


def getPDB(line, args):
    pdb = line[:4].lower()
    pdbChain = line[5:6]
    pdbFile = "%s/temp.pdb" % args.temp
    pdbDatabaseId = "%s.pdb" % pdb
    createFile(pdbDatabaseId, args.index, args.database, pdbFile)
    return pdbFile, pdbChain


def getSequences(fileName):
    sequences = dict()
    with open(fileName) as file:
        for line in file:
            if line.startswith(">"):
                name = getId(line.split()[0][1:])
                nextLine = next(file)
                sequences[name] = nextLine
    return sequences


def getHomologue(queryFile, queryResultFile, databaseFile):
    if not os.path.isfile(queryResultFile):
        os.system("psiblast -query %s -db %s -out %s" % (queryFile,
                  databaseFile, queryResultFile))
    maxMatch = None
    try:
        with open(queryResultFile) as file:
            for i in range(38):
                line = next(file)
            maxMatch = getId(line.split()[0])
    except Exception:
        return None
    return maxMatch


def main(args):
    logFile = open(args.log, "w")
    temp = args.temp.rstrip("/")
    templates = set()
    os.system("mkdir -p %s" % temp)
    templateSequenceFile = "%s/templates.fasta" % temp
    if not os.path.isfile(templateSequenceFile):
        templateSequences = open(templateSequenceFile, "w")
        with open(args.list) as file:
            for rawId in file:
                templateId = getId(rawId)
                templates.add(templateId)
                pdbFile, pdbChain = getPDB(templateId, args)
                try:
                    templateMol = Molecule(pdbFile)
                    templateSeq = templateMol.getSequence(pdbChain)
                    templateSequences.write(">%s\n" % templateId)
                    templateSequences.write("%s\n" % templateSeq)
                except Exception:
                    logFile.write("Warning: File not found [%s].\n" % pdbFile)
        templateSequences.close()
        os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile)
    else:
        logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile)
    logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list))
    logFile.flush()

    crossReference = list()
    with open(args.cross) as file:
        for line in file:
            cols = line.split()
            if len(cols) != 2:
                raise Exception("Invalid line in crossreference [%s]." % line)
            crossReference.append(dict(core=cols[0], partner=cols[1]))
    logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference))
    logFile.flush()

    for refEntry in crossReference:
        coreId = refEntry["core"]
        partnerId = refEntry["partner"]
        partnerFile = "%s/%s.fasta" % (temp, partnerId)
        partnerResultFile = "%s.result" % partnerFile
        if partnerId in templates:
            refEntry["match"] = partnerId
            logFile.write("Found partner in template list alignment [%s].\n" % partnerId)
        else:
            logFile.write("Processing %s.\n" % partnerId)
            if not os.path.isfile(partnerResultFile):
                pdbFile, pdbChain = getPDB(partnerId, args)
                partnerMol = Molecule(pdbFile)
                partnerSeq = partnerMol.getSequence(pdbChain)
                with open(partnerFile, "w") as partnerFasta:
                    partnerFasta.write(">%s\n" % partnerId)
                    partnerFasta.write("%s" % partnerSeq)
            else:
                logFile.write("Using existing results. [%s].\n" % partnerId)
            matchedId = getHomologue(partnerFile, partnerResultFile,
                                     templateSequenceFile)
            if matchedId is None:
                logFile.write("Warning: Failed alignment [%s].\n" % partnerId)
            else:
                logFile.write("Found matching entry %s.\n" % matchedId)
                refEntry["match"] = matchedId
        logFile.flush()

    finalSet = set()
    for refEntry in crossReference:
        coreId = refEntry["core"]
        partnerId = refEntry["partner"]
        if "match" in refEntry:
            entry = "%s\t%s" % (coreId, refEntry["match"])
            if entry not in finalSet:
                finalSet.add(entry)
        else:
            logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId))
    logFile.write("Found %s cross reference entries.\n" % len(finalSet))
    logFile.close()

    with open(args.output, 'w') as output_file:
        for entry in sorted(finalSet):
            output_file.write("%s\n" % entry)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Maps binding partners to template library')
    parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True)
    parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True)
    parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True)
    parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True)
    parser.add_argument('-o', '--output', help='Cross reference', required=True)
    parser.add_argument('-g', '--log', help='Log File', required=True)
    parser.add_argument('-t', '--temp', help='Temporary directory', required=True)
    args = parser.parse_args()
    main(args)