comparison spring_map.py @ 39:172398348efd draft

"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
author guerler
date Fri, 22 Jan 2021 15:50:27 +0000
parents 80a4b98121b6
children
comparison
equal deleted inserted replaced
38:80a4b98121b6 39:172398348efd
1 #! /usr/bin/env python3 1 #! /usr/bin/env python3
2 import argparse 2 import argparse
3 import os 3 from os import mkdir, system
4 from os.path import isfile, isdir
4 5
5 from spring_package.DBKit import createFile 6 from spring_package.DBKit import DBKit
6 from spring_package.Molecule import Molecule 7 from spring_package.Molecule import Molecule
8 from spring_package.Utilities import getId, getChain, getName
7 9
8 10
9 def getId(line): 11 def getPDB(line, pdbDatabase):
10 line = line.strip() 12 pdb = getName(line)
11 if len(line) != 6 or line[4:5] != "_": 13 pdbChain = getChain(line)
12 raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line) 14 pdbFile = "temp/temp.pdb"
13 return line[:4].upper() + line[4:6]
14
15
16 def getPDB(line, args):
17 pdb = line[:4].lower()
18 pdbChain = line[5:6]
19 pdbFile = "%s/temp.pdb" % args.temp
20 pdbDatabaseId = "%s.pdb" % pdb 15 pdbDatabaseId = "%s.pdb" % pdb
21 createFile(pdbDatabaseId, args.index, args.database, pdbFile) 16 pdbDatabase.createFile(pdbDatabaseId, pdbFile)
22 return pdbFile, pdbChain 17 return pdbFile, pdbChain
23 18
24 19
25 def getSequences(fileName): 20 def getSequences(fileName):
26 sequences = dict() 21 sequences = dict()
31 nextLine = next(file) 26 nextLine = next(file)
32 sequences[name] = nextLine 27 sequences[name] = nextLine
33 return sequences 28 return sequences
34 29
35 30
36 def getHomologue(queryFile, queryResultFile, databaseFile): 31 def findMatch(identifier, templates, databaseFile, pdbDatabase, evalue=0.0):
37 if not os.path.isfile(queryResultFile): 32 if identifier in templates:
38 os.system("psiblast -query %s -db %s -out %s" % (queryFile, 33 return identifier
39 databaseFile, queryResultFile)) 34 resultSub = identifier[:2]
35 fastaFile = "temp/%s/%s.fasta" % (resultSub, identifier)
36 resultFile = "%s.result" % fastaFile
37 if not isfile(resultFile):
38 resultDir = "temp/%s" % resultSub
39 if not isdir(resultDir):
40 mkdir(resultDir)
41 pdbFile, pdbChain = getPDB(identifier, pdbDatabase)
42 mol = Molecule(pdbFile)
43 seq = mol.getSequence(pdbChain)
44 with open(fastaFile, "w") as fasta:
45 fasta.write(">%s\n" % identifier)
46 fasta.write("%s" % seq)
47 system("psiblast -query %s -db %s -out %s" % (fastaFile, databaseFile, resultFile))
40 maxMatch = None 48 maxMatch = None
41 try: 49 try:
42 with open(queryResultFile) as file: 50 with open(resultFile) as file:
43 for i in range(38): 51 for i in range(38):
44 line = next(file) 52 line = next(file)
45 maxMatch = getId(line.split()[0]) 53 columns = line.split()
54 maxMatch = getId(columns[0])
55 maxScore = float(columns[2])
56 if maxScore > evalue:
57 return None
46 except Exception: 58 except Exception:
47 return None 59 return None
48 return maxMatch 60 return maxMatch
49 61
50 62
51 def main(args): 63 def main(args):
64 if not isdir("temp"):
65 mkdir("temp")
52 logFile = open(args.log, "w") 66 logFile = open(args.log, "w")
53 temp = args.temp.rstrip("/") 67 templateSequenceFile = "temp/templates.fasta"
68 pdbDatabase = DBKit(args.index, args.database)
54 templates = set() 69 templates = set()
55 os.system("mkdir -p %s" % temp) 70 with open(args.list) as file:
56 templateSequenceFile = "%s/templates.fasta" % temp 71 for rawId in file:
57 if not os.path.isfile(templateSequenceFile): 72 templateId = getId(rawId)
73 templates.add(templateId)
74 if not isfile(templateSequenceFile):
58 templateSequences = open(templateSequenceFile, "w") 75 templateSequences = open(templateSequenceFile, "w")
59 with open(args.list) as file: 76 with open(args.list) as file:
60 for rawId in file: 77 for rawId in file:
61 templateId = getId(rawId) 78 templateId = getId(rawId)
62 templates.add(templateId) 79 pdbFile, pdbChain = getPDB(templateId, pdbDatabase)
63 pdbFile, pdbChain = getPDB(templateId, args)
64 try: 80 try:
65 templateMol = Molecule(pdbFile) 81 templateMol = Molecule(pdbFile)
66 templateSeq = templateMol.getSequence(pdbChain) 82 templateSeq = templateMol.getSequence(pdbChain)
67 templateSequences.write(">%s\n" % templateId) 83 templateSequences.write(">%s\n" % templateId)
68 templateSequences.write("%s\n" % templateSeq) 84 templateSequences.write("%s\n" % templateSeq)
69 except Exception: 85 except Exception:
70 logFile.write("Warning: File not found [%s].\n" % pdbFile) 86 logFile.write("Warning: File not found [%s].\n" % templateId)
71 templateSequences.close() 87 templateSequences.close()
72 os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) 88 system("makeblastdb -in %s -dbtype prot" % templateSequenceFile)
73 else: 89 else:
74 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) 90 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile)
75 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) 91 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list))
76 logFile.flush() 92 logFile.flush()
77 93
85 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) 101 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference))
86 logFile.flush() 102 logFile.flush()
87 103
88 for refEntry in crossReference: 104 for refEntry in crossReference:
89 coreId = refEntry["core"] 105 coreId = refEntry["core"]
106 logFile.write("Processing %s.\n" % coreId)
107 coreMatch = findMatch(coreId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue)
90 partnerId = refEntry["partner"] 108 partnerId = refEntry["partner"]
91 partnerFile = "%s/%s.fasta" % (temp, partnerId) 109 logFile.write("Processing %s.\n" % partnerId)
92 partnerResultFile = "%s.result" % partnerFile 110 partnerMatch = findMatch(partnerId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue)
93 if partnerId in templates: 111 if partnerMatch is None or coreMatch is None:
94 refEntry["match"] = partnerId 112 logFile.write("Warning: Failed alignment [%s, %s].\n" % (coreId, partnerId))
95 logFile.write("Found partner in template list alignment [%s].\n" % partnerId)
96 else: 113 else:
97 logFile.write("Processing %s.\n" % partnerId) 114 logFile.write("Found matching entries [%s, %s].\n" % (coreMatch, partnerMatch))
98 if not os.path.isfile(partnerResultFile): 115 refEntry["coreMatch"] = coreMatch
99 pdbFile, pdbChain = getPDB(partnerId, args) 116 refEntry["partnerMatch"] = partnerMatch
100 partnerMol = Molecule(pdbFile)
101 partnerSeq = partnerMol.getSequence(pdbChain)
102 with open(partnerFile, "w") as partnerFasta:
103 partnerFasta.write(">%s\n" % partnerId)
104 partnerFasta.write("%s" % partnerSeq)
105 else:
106 logFile.write("Using existing results. [%s].\n" % partnerId)
107 matchedId = getHomologue(partnerFile, partnerResultFile,
108 templateSequenceFile)
109 if matchedId is None:
110 logFile.write("Warning: Failed alignment [%s].\n" % partnerId)
111 else:
112 logFile.write("Found matching entry %s.\n" % matchedId)
113 refEntry["match"] = matchedId
114 logFile.flush() 117 logFile.flush()
115 118
116 finalSet = set() 119 entryCount = 0
117 for refEntry in crossReference: 120 with open(args.output, 'w') as output_file:
118 coreId = refEntry["core"] 121 for refEntry in crossReference:
119 partnerId = refEntry["partner"] 122 coreId = refEntry["core"]
120 if "match" in refEntry: 123 partnerId = refEntry["partner"]
121 entry = "%s\t%s" % (coreId, refEntry["match"]) 124 if "coreMatch" in refEntry and "partnerMatch" in refEntry:
122 if entry not in finalSet: 125 entry = "%s\t%s\t%s\t%s\n" % (refEntry["coreMatch"], refEntry["partnerMatch"], refEntry["core"], refEntry["partner"])
123 finalSet.add(entry) 126 output_file.write(entry)
124 else: 127 entryCount = entryCount + 1
125 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) 128 else:
126 logFile.write("Found %s cross reference entries.\n" % len(finalSet)) 129 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId))
130 logFile.write("Found %s cross reference entries.\n" % entryCount)
127 logFile.close() 131 logFile.close()
128
129 with open(args.output, 'w') as output_file:
130 for entry in sorted(finalSet):
131 output_file.write("%s\n" % entry)
132 132
133 133
134 if __name__ == "__main__": 134 if __name__ == "__main__":
135 parser = argparse.ArgumentParser(description='Maps binding partners to template library') 135 parser = argparse.ArgumentParser(description='Maps binding partners to template library')
136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) 136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True)
137 parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True) 137 parser.add_argument('-i', '--index', help='PDB Database Index file (ffindex)', required=True)
138 parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True) 138 parser.add_argument('-d', '--database', help='PDB Database files (ffdata)', required=True)
139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) 139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True)
140 parser.add_argument('-o', '--output', help='Cross reference', required=True) 140 parser.add_argument('-o', '--output', help='Cross reference', required=True)
141 parser.add_argument('-g', '--log', help='Log File', required=True) 141 parser.add_argument('-g', '--log', help='Log File', required=True)
142 parser.add_argument('-t', '--temp', help='Temporary directory', required=True) 142 parser.add_argument('-e', '--evalue', help='e-Value threshold', type=float, default=0.0)
143 args = parser.parse_args() 143 args = parser.parse_args()
144 main(args) 144 main(args)