diff spring_map.py @ 39:172398348efd draft

"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
author guerler
date Fri, 22 Jan 2021 15:50:27 +0000
parents 80a4b98121b6
children
line wrap: on
line diff
--- a/spring_map.py	Wed Nov 25 17:38:24 2020 +0000
+++ b/spring_map.py	Fri Jan 22 15:50:27 2021 +0000
@@ -1,24 +1,19 @@
 #! /usr/bin/env python3
 import argparse
-import os
+from os import mkdir, system
+from os.path import isfile, isdir
 
-from spring_package.DBKit import createFile
+from spring_package.DBKit import DBKit
 from spring_package.Molecule import Molecule
+from spring_package.Utilities import getId, getChain, getName
 
 
-def getId(line):
-    line = line.strip()
-    if len(line) != 6 or line[4:5] != "_":
-        raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line)
-    return line[:4].upper() + line[4:6]
-
-
-def getPDB(line, args):
-    pdb = line[:4].lower()
-    pdbChain = line[5:6]
-    pdbFile = "%s/temp.pdb" % args.temp
+def getPDB(line, pdbDatabase):
+    pdb = getName(line)
+    pdbChain = getChain(line)
+    pdbFile = "temp/temp.pdb"
     pdbDatabaseId = "%s.pdb" % pdb
-    createFile(pdbDatabaseId, args.index, args.database, pdbFile)
+    pdbDatabase.createFile(pdbDatabaseId, pdbFile)
     return pdbFile, pdbChain
 
 
@@ -33,43 +28,64 @@
     return sequences
 
 
-def getHomologue(queryFile, queryResultFile, databaseFile):
-    if not os.path.isfile(queryResultFile):
-        os.system("psiblast -query %s -db %s -out %s" % (queryFile,
-                  databaseFile, queryResultFile))
+def findMatch(identifier, templates, databaseFile, pdbDatabase, evalue=0.0):
+    if identifier in templates:
+        return identifier
+    resultSub = identifier[:2]
+    fastaFile = "temp/%s/%s.fasta" % (resultSub, identifier)
+    resultFile = "%s.result" % fastaFile
+    if not isfile(resultFile):
+        resultDir = "temp/%s" % resultSub
+        if not isdir(resultDir):
+            mkdir(resultDir)
+        pdbFile, pdbChain = getPDB(identifier, pdbDatabase)
+        mol = Molecule(pdbFile)
+        seq = mol.getSequence(pdbChain)
+        with open(fastaFile, "w") as fasta:
+            fasta.write(">%s\n" % identifier)
+            fasta.write("%s" % seq)
+        system("psiblast -query %s -db %s -out %s" % (fastaFile, databaseFile, resultFile))
     maxMatch = None
     try:
-        with open(queryResultFile) as file:
+        with open(resultFile) as file:
             for i in range(38):
                 line = next(file)
-            maxMatch = getId(line.split()[0])
+            columns = line.split()
+            maxMatch = getId(columns[0])
+            maxScore = float(columns[2])
+            if maxScore > evalue:
+                return None
     except Exception:
         return None
     return maxMatch
 
 
 def main(args):
+    if not isdir("temp"):
+        mkdir("temp")
     logFile = open(args.log, "w")
-    temp = args.temp.rstrip("/")
+    templateSequenceFile = "temp/templates.fasta"
+    pdbDatabase = DBKit(args.index, args.database)
     templates = set()
-    os.system("mkdir -p %s" % temp)
-    templateSequenceFile = "%s/templates.fasta" % temp
-    if not os.path.isfile(templateSequenceFile):
+    with open(args.list) as file:
+        for rawId in file:
+            templateId = getId(rawId)
+            templates.add(templateId)
+    if not isfile(templateSequenceFile):
         templateSequences = open(templateSequenceFile, "w")
         with open(args.list) as file:
             for rawId in file:
                 templateId = getId(rawId)
-                templates.add(templateId)
-                pdbFile, pdbChain = getPDB(templateId, args)
+                pdbFile, pdbChain = getPDB(templateId, pdbDatabase)
                 try:
                     templateMol = Molecule(pdbFile)
                     templateSeq = templateMol.getSequence(pdbChain)
                     templateSequences.write(">%s\n" % templateId)
                     templateSequences.write("%s\n" % templateSeq)
                 except Exception:
-                    logFile.write("Warning: File not found [%s].\n" % pdbFile)
+                    logFile.write("Warning: File not found [%s].\n" % templateId)
         templateSequences.close()
-        os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile)
+        system("makeblastdb -in %s -dbtype prot" % templateSequenceFile)
     else:
         logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile)
     logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list))
@@ -87,58 +103,42 @@
 
     for refEntry in crossReference:
         coreId = refEntry["core"]
+        logFile.write("Processing %s.\n" % coreId)
+        coreMatch = findMatch(coreId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue)
         partnerId = refEntry["partner"]
-        partnerFile = "%s/%s.fasta" % (temp, partnerId)
-        partnerResultFile = "%s.result" % partnerFile
-        if partnerId in templates:
-            refEntry["match"] = partnerId
-            logFile.write("Found partner in template list alignment [%s].\n" % partnerId)
+        logFile.write("Processing %s.\n" % partnerId)
+        partnerMatch = findMatch(partnerId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue)
+        if partnerMatch is None or coreMatch is None:
+            logFile.write("Warning: Failed alignment [%s, %s].\n" % (coreId, partnerId))
         else:
-            logFile.write("Processing %s.\n" % partnerId)
-            if not os.path.isfile(partnerResultFile):
-                pdbFile, pdbChain = getPDB(partnerId, args)
-                partnerMol = Molecule(pdbFile)
-                partnerSeq = partnerMol.getSequence(pdbChain)
-                with open(partnerFile, "w") as partnerFasta:
-                    partnerFasta.write(">%s\n" % partnerId)
-                    partnerFasta.write("%s" % partnerSeq)
-            else:
-                logFile.write("Using existing results. [%s].\n" % partnerId)
-            matchedId = getHomologue(partnerFile, partnerResultFile,
-                                     templateSequenceFile)
-            if matchedId is None:
-                logFile.write("Warning: Failed alignment [%s].\n" % partnerId)
-            else:
-                logFile.write("Found matching entry %s.\n" % matchedId)
-                refEntry["match"] = matchedId
+            logFile.write("Found matching entries [%s, %s].\n" % (coreMatch, partnerMatch))
+            refEntry["coreMatch"] = coreMatch
+            refEntry["partnerMatch"] = partnerMatch
         logFile.flush()
 
-    finalSet = set()
-    for refEntry in crossReference:
-        coreId = refEntry["core"]
-        partnerId = refEntry["partner"]
-        if "match" in refEntry:
-            entry = "%s\t%s" % (coreId, refEntry["match"])
-            if entry not in finalSet:
-                finalSet.add(entry)
-        else:
-            logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId))
-    logFile.write("Found %s cross reference entries.\n" % len(finalSet))
+    entryCount = 0
+    with open(args.output, 'w') as output_file:
+        for refEntry in crossReference:
+            coreId = refEntry["core"]
+            partnerId = refEntry["partner"]
+            if "coreMatch" in refEntry and "partnerMatch" in refEntry:
+                entry = "%s\t%s\t%s\t%s\n" % (refEntry["coreMatch"], refEntry["partnerMatch"], refEntry["core"], refEntry["partner"])
+                output_file.write(entry)
+                entryCount = entryCount + 1
+            else:
+                logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId))
+    logFile.write("Found %s cross reference entries.\n" % entryCount)
     logFile.close()
 
-    with open(args.output, 'w') as output_file:
-        for entry in sorted(finalSet):
-            output_file.write("%s\n" % entry)
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Maps binding partners to template library')
     parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True)
-    parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True)
-    parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True)
+    parser.add_argument('-i', '--index', help='PDB Database Index file (ffindex)', required=True)
+    parser.add_argument('-d', '--database', help='PDB Database files (ffdata)', required=True)
     parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True)
     parser.add_argument('-o', '--output', help='Cross reference', required=True)
     parser.add_argument('-g', '--log', help='Log File', required=True)
-    parser.add_argument('-t', '--temp', help='Temporary directory', required=True)
+    parser.add_argument('-e', '--evalue', help='e-Value threshold', type=float, default=0.0)
     args = parser.parse_args()
     main(args)