Mercurial > repos > guerler > springsuite
diff spring_map.py @ 39:172398348efd draft
"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
author | guerler |
---|---|
date | Fri, 22 Jan 2021 15:50:27 +0000 |
parents | 80a4b98121b6 |
children |
line wrap: on
line diff
--- a/spring_map.py Wed Nov 25 17:38:24 2020 +0000 +++ b/spring_map.py Fri Jan 22 15:50:27 2021 +0000 @@ -1,24 +1,19 @@ #! /usr/bin/env python3 import argparse -import os +from os import mkdir, system +from os.path import isfile, isdir -from spring_package.DBKit import createFile +from spring_package.DBKit import DBKit from spring_package.Molecule import Molecule +from spring_package.Utilities import getId, getChain, getName -def getId(line): - line = line.strip() - if len(line) != 6 or line[4:5] != "_": - raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line) - return line[:4].upper() + line[4:6] - - -def getPDB(line, args): - pdb = line[:4].lower() - pdbChain = line[5:6] - pdbFile = "%s/temp.pdb" % args.temp +def getPDB(line, pdbDatabase): + pdb = getName(line) + pdbChain = getChain(line) + pdbFile = "temp/temp.pdb" pdbDatabaseId = "%s.pdb" % pdb - createFile(pdbDatabaseId, args.index, args.database, pdbFile) + pdbDatabase.createFile(pdbDatabaseId, pdbFile) return pdbFile, pdbChain @@ -33,43 +28,64 @@ return sequences -def getHomologue(queryFile, queryResultFile, databaseFile): - if not os.path.isfile(queryResultFile): - os.system("psiblast -query %s -db %s -out %s" % (queryFile, - databaseFile, queryResultFile)) +def findMatch(identifier, templates, databaseFile, pdbDatabase, evalue=0.0): + if identifier in templates: + return identifier + resultSub = identifier[:2] + fastaFile = "temp/%s/%s.fasta" % (resultSub, identifier) + resultFile = "%s.result" % fastaFile + if not isfile(resultFile): + resultDir = "temp/%s" % resultSub + if not isdir(resultDir): + mkdir(resultDir) + pdbFile, pdbChain = getPDB(identifier, pdbDatabase) + mol = Molecule(pdbFile) + seq = mol.getSequence(pdbChain) + with open(fastaFile, "w") as fasta: + fasta.write(">%s\n" % identifier) + fasta.write("%s" % seq) + system("psiblast -query %s -db %s -out %s" % (fastaFile, databaseFile, resultFile)) maxMatch = None try: - with open(queryResultFile) as file: + with open(resultFile) as file: for i in range(38): line = next(file) - maxMatch = getId(line.split()[0]) + columns = line.split() + maxMatch = getId(columns[0]) + maxScore = float(columns[2]) + if maxScore > evalue: + return None except Exception: return None return maxMatch def main(args): + if not isdir("temp"): + mkdir("temp") logFile = open(args.log, "w") - temp = args.temp.rstrip("/") + templateSequenceFile = "temp/templates.fasta" + pdbDatabase = DBKit(args.index, args.database) templates = set() - os.system("mkdir -p %s" % temp) - templateSequenceFile = "%s/templates.fasta" % temp - if not os.path.isfile(templateSequenceFile): + with open(args.list) as file: + for rawId in file: + templateId = getId(rawId) + templates.add(templateId) + if not isfile(templateSequenceFile): templateSequences = open(templateSequenceFile, "w") with open(args.list) as file: for rawId in file: templateId = getId(rawId) - templates.add(templateId) - pdbFile, pdbChain = getPDB(templateId, args) + pdbFile, pdbChain = getPDB(templateId, pdbDatabase) try: templateMol = Molecule(pdbFile) templateSeq = templateMol.getSequence(pdbChain) templateSequences.write(">%s\n" % templateId) templateSequences.write("%s\n" % templateSeq) except Exception: - logFile.write("Warning: File not found [%s].\n" % pdbFile) + logFile.write("Warning: File not found [%s].\n" % templateId) templateSequences.close() - os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) + system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) else: logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) @@ -87,58 +103,42 @@ for refEntry in crossReference: coreId = refEntry["core"] + logFile.write("Processing %s.\n" % coreId) + coreMatch = findMatch(coreId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) partnerId = refEntry["partner"] - partnerFile = "%s/%s.fasta" % (temp, partnerId) - partnerResultFile = "%s.result" % partnerFile - if partnerId in templates: - refEntry["match"] = partnerId - logFile.write("Found partner in template list alignment [%s].\n" % partnerId) + logFile.write("Processing %s.\n" % partnerId) + partnerMatch = findMatch(partnerId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) + if partnerMatch is None or coreMatch is None: + logFile.write("Warning: Failed alignment [%s, %s].\n" % (coreId, partnerId)) else: - logFile.write("Processing %s.\n" % partnerId) - if not os.path.isfile(partnerResultFile): - pdbFile, pdbChain = getPDB(partnerId, args) - partnerMol = Molecule(pdbFile) - partnerSeq = partnerMol.getSequence(pdbChain) - with open(partnerFile, "w") as partnerFasta: - partnerFasta.write(">%s\n" % partnerId) - partnerFasta.write("%s" % partnerSeq) - else: - logFile.write("Using existing results. [%s].\n" % partnerId) - matchedId = getHomologue(partnerFile, partnerResultFile, - templateSequenceFile) - if matchedId is None: - logFile.write("Warning: Failed alignment [%s].\n" % partnerId) - else: - logFile.write("Found matching entry %s.\n" % matchedId) - refEntry["match"] = matchedId + logFile.write("Found matching entries [%s, %s].\n" % (coreMatch, partnerMatch)) + refEntry["coreMatch"] = coreMatch + refEntry["partnerMatch"] = partnerMatch logFile.flush() - finalSet = set() - for refEntry in crossReference: - coreId = refEntry["core"] - partnerId = refEntry["partner"] - if "match" in refEntry: - entry = "%s\t%s" % (coreId, refEntry["match"]) - if entry not in finalSet: - finalSet.add(entry) - else: - logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) - logFile.write("Found %s cross reference entries.\n" % len(finalSet)) + entryCount = 0 + with open(args.output, 'w') as output_file: + for refEntry in crossReference: + coreId = refEntry["core"] + partnerId = refEntry["partner"] + if "coreMatch" in refEntry and "partnerMatch" in refEntry: + entry = "%s\t%s\t%s\t%s\n" % (refEntry["coreMatch"], refEntry["partnerMatch"], refEntry["core"], refEntry["partner"]) + output_file.write(entry) + entryCount = entryCount + 1 + else: + logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) + logFile.write("Found %s cross reference entries.\n" % entryCount) logFile.close() - with open(args.output, 'w') as output_file: - for entry in sorted(finalSet): - output_file.write("%s\n" % entry) - if __name__ == "__main__": parser = argparse.ArgumentParser(description='Maps binding partners to template library') parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) - parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True) - parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True) + parser.add_argument('-i', '--index', help='PDB Database Index file (ffindex)', required=True) + parser.add_argument('-d', '--database', help='PDB Database files (ffdata)', required=True) parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) parser.add_argument('-o', '--output', help='Cross reference', required=True) parser.add_argument('-g', '--log', help='Log File', required=True) - parser.add_argument('-t', '--temp', help='Temporary directory', required=True) + parser.add_argument('-e', '--evalue', help='e-Value threshold', type=float, default=0.0) args = parser.parse_args() main(args)