Mercurial > repos > guerler > springsuite
comparison spring_map.py @ 39:172398348efd draft
"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
author | guerler |
---|---|
date | Fri, 22 Jan 2021 15:50:27 +0000 |
parents | 80a4b98121b6 |
children |
comparison
equal
deleted
inserted
replaced
38:80a4b98121b6 | 39:172398348efd |
---|---|
1 #! /usr/bin/env python3 | 1 #! /usr/bin/env python3 |
2 import argparse | 2 import argparse |
3 import os | 3 from os import mkdir, system |
4 from os.path import isfile, isdir | |
4 | 5 |
5 from spring_package.DBKit import createFile | 6 from spring_package.DBKit import DBKit |
6 from spring_package.Molecule import Molecule | 7 from spring_package.Molecule import Molecule |
8 from spring_package.Utilities import getId, getChain, getName | |
7 | 9 |
8 | 10 |
9 def getId(line): | 11 def getPDB(line, pdbDatabase): |
10 line = line.strip() | 12 pdb = getName(line) |
11 if len(line) != 6 or line[4:5] != "_": | 13 pdbChain = getChain(line) |
12 raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line) | 14 pdbFile = "temp/temp.pdb" |
13 return line[:4].upper() + line[4:6] | |
14 | |
15 | |
16 def getPDB(line, args): | |
17 pdb = line[:4].lower() | |
18 pdbChain = line[5:6] | |
19 pdbFile = "%s/temp.pdb" % args.temp | |
20 pdbDatabaseId = "%s.pdb" % pdb | 15 pdbDatabaseId = "%s.pdb" % pdb |
21 createFile(pdbDatabaseId, args.index, args.database, pdbFile) | 16 pdbDatabase.createFile(pdbDatabaseId, pdbFile) |
22 return pdbFile, pdbChain | 17 return pdbFile, pdbChain |
23 | 18 |
24 | 19 |
25 def getSequences(fileName): | 20 def getSequences(fileName): |
26 sequences = dict() | 21 sequences = dict() |
31 nextLine = next(file) | 26 nextLine = next(file) |
32 sequences[name] = nextLine | 27 sequences[name] = nextLine |
33 return sequences | 28 return sequences |
34 | 29 |
35 | 30 |
36 def getHomologue(queryFile, queryResultFile, databaseFile): | 31 def findMatch(identifier, templates, databaseFile, pdbDatabase, evalue=0.0): |
37 if not os.path.isfile(queryResultFile): | 32 if identifier in templates: |
38 os.system("psiblast -query %s -db %s -out %s" % (queryFile, | 33 return identifier |
39 databaseFile, queryResultFile)) | 34 resultSub = identifier[:2] |
35 fastaFile = "temp/%s/%s.fasta" % (resultSub, identifier) | |
36 resultFile = "%s.result" % fastaFile | |
37 if not isfile(resultFile): | |
38 resultDir = "temp/%s" % resultSub | |
39 if not isdir(resultDir): | |
40 mkdir(resultDir) | |
41 pdbFile, pdbChain = getPDB(identifier, pdbDatabase) | |
42 mol = Molecule(pdbFile) | |
43 seq = mol.getSequence(pdbChain) | |
44 with open(fastaFile, "w") as fasta: | |
45 fasta.write(">%s\n" % identifier) | |
46 fasta.write("%s" % seq) | |
47 system("psiblast -query %s -db %s -out %s" % (fastaFile, databaseFile, resultFile)) | |
40 maxMatch = None | 48 maxMatch = None |
41 try: | 49 try: |
42 with open(queryResultFile) as file: | 50 with open(resultFile) as file: |
43 for i in range(38): | 51 for i in range(38): |
44 line = next(file) | 52 line = next(file) |
45 maxMatch = getId(line.split()[0]) | 53 columns = line.split() |
54 maxMatch = getId(columns[0]) | |
55 maxScore = float(columns[2]) | |
56 if maxScore > evalue: | |
57 return None | |
46 except Exception: | 58 except Exception: |
47 return None | 59 return None |
48 return maxMatch | 60 return maxMatch |
49 | 61 |
50 | 62 |
51 def main(args): | 63 def main(args): |
64 if not isdir("temp"): | |
65 mkdir("temp") | |
52 logFile = open(args.log, "w") | 66 logFile = open(args.log, "w") |
53 temp = args.temp.rstrip("/") | 67 templateSequenceFile = "temp/templates.fasta" |
68 pdbDatabase = DBKit(args.index, args.database) | |
54 templates = set() | 69 templates = set() |
55 os.system("mkdir -p %s" % temp) | 70 with open(args.list) as file: |
56 templateSequenceFile = "%s/templates.fasta" % temp | 71 for rawId in file: |
57 if not os.path.isfile(templateSequenceFile): | 72 templateId = getId(rawId) |
73 templates.add(templateId) | |
74 if not isfile(templateSequenceFile): | |
58 templateSequences = open(templateSequenceFile, "w") | 75 templateSequences = open(templateSequenceFile, "w") |
59 with open(args.list) as file: | 76 with open(args.list) as file: |
60 for rawId in file: | 77 for rawId in file: |
61 templateId = getId(rawId) | 78 templateId = getId(rawId) |
62 templates.add(templateId) | 79 pdbFile, pdbChain = getPDB(templateId, pdbDatabase) |
63 pdbFile, pdbChain = getPDB(templateId, args) | |
64 try: | 80 try: |
65 templateMol = Molecule(pdbFile) | 81 templateMol = Molecule(pdbFile) |
66 templateSeq = templateMol.getSequence(pdbChain) | 82 templateSeq = templateMol.getSequence(pdbChain) |
67 templateSequences.write(">%s\n" % templateId) | 83 templateSequences.write(">%s\n" % templateId) |
68 templateSequences.write("%s\n" % templateSeq) | 84 templateSequences.write("%s\n" % templateSeq) |
69 except Exception: | 85 except Exception: |
70 logFile.write("Warning: File not found [%s].\n" % pdbFile) | 86 logFile.write("Warning: File not found [%s].\n" % templateId) |
71 templateSequences.close() | 87 templateSequences.close() |
72 os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) | 88 system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) |
73 else: | 89 else: |
74 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) | 90 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) |
75 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) | 91 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) |
76 logFile.flush() | 92 logFile.flush() |
77 | 93 |
85 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) | 101 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) |
86 logFile.flush() | 102 logFile.flush() |
87 | 103 |
88 for refEntry in crossReference: | 104 for refEntry in crossReference: |
89 coreId = refEntry["core"] | 105 coreId = refEntry["core"] |
106 logFile.write("Processing %s.\n" % coreId) | |
107 coreMatch = findMatch(coreId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) | |
90 partnerId = refEntry["partner"] | 108 partnerId = refEntry["partner"] |
91 partnerFile = "%s/%s.fasta" % (temp, partnerId) | 109 logFile.write("Processing %s.\n" % partnerId) |
92 partnerResultFile = "%s.result" % partnerFile | 110 partnerMatch = findMatch(partnerId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) |
93 if partnerId in templates: | 111 if partnerMatch is None or coreMatch is None: |
94 refEntry["match"] = partnerId | 112 logFile.write("Warning: Failed alignment [%s, %s].\n" % (coreId, partnerId)) |
95 logFile.write("Found partner in template list alignment [%s].\n" % partnerId) | |
96 else: | 113 else: |
97 logFile.write("Processing %s.\n" % partnerId) | 114 logFile.write("Found matching entries [%s, %s].\n" % (coreMatch, partnerMatch)) |
98 if not os.path.isfile(partnerResultFile): | 115 refEntry["coreMatch"] = coreMatch |
99 pdbFile, pdbChain = getPDB(partnerId, args) | 116 refEntry["partnerMatch"] = partnerMatch |
100 partnerMol = Molecule(pdbFile) | |
101 partnerSeq = partnerMol.getSequence(pdbChain) | |
102 with open(partnerFile, "w") as partnerFasta: | |
103 partnerFasta.write(">%s\n" % partnerId) | |
104 partnerFasta.write("%s" % partnerSeq) | |
105 else: | |
106 logFile.write("Using existing results. [%s].\n" % partnerId) | |
107 matchedId = getHomologue(partnerFile, partnerResultFile, | |
108 templateSequenceFile) | |
109 if matchedId is None: | |
110 logFile.write("Warning: Failed alignment [%s].\n" % partnerId) | |
111 else: | |
112 logFile.write("Found matching entry %s.\n" % matchedId) | |
113 refEntry["match"] = matchedId | |
114 logFile.flush() | 117 logFile.flush() |
115 | 118 |
116 finalSet = set() | 119 entryCount = 0 |
117 for refEntry in crossReference: | 120 with open(args.output, 'w') as output_file: |
118 coreId = refEntry["core"] | 121 for refEntry in crossReference: |
119 partnerId = refEntry["partner"] | 122 coreId = refEntry["core"] |
120 if "match" in refEntry: | 123 partnerId = refEntry["partner"] |
121 entry = "%s\t%s" % (coreId, refEntry["match"]) | 124 if "coreMatch" in refEntry and "partnerMatch" in refEntry: |
122 if entry not in finalSet: | 125 entry = "%s\t%s\t%s\t%s\n" % (refEntry["coreMatch"], refEntry["partnerMatch"], refEntry["core"], refEntry["partner"]) |
123 finalSet.add(entry) | 126 output_file.write(entry) |
124 else: | 127 entryCount = entryCount + 1 |
125 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) | 128 else: |
126 logFile.write("Found %s cross reference entries.\n" % len(finalSet)) | 129 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) |
130 logFile.write("Found %s cross reference entries.\n" % entryCount) | |
127 logFile.close() | 131 logFile.close() |
128 | |
129 with open(args.output, 'w') as output_file: | |
130 for entry in sorted(finalSet): | |
131 output_file.write("%s\n" % entry) | |
132 | 132 |
133 | 133 |
134 if __name__ == "__main__": | 134 if __name__ == "__main__": |
135 parser = argparse.ArgumentParser(description='Maps binding partners to template library') | 135 parser = argparse.ArgumentParser(description='Maps binding partners to template library') |
136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) | 136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) |
137 parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True) | 137 parser.add_argument('-i', '--index', help='PDB Database Index file (ffindex)', required=True) |
138 parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True) | 138 parser.add_argument('-d', '--database', help='PDB Database files (ffdata)', required=True) |
139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) | 139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) |
140 parser.add_argument('-o', '--output', help='Cross reference', required=True) | 140 parser.add_argument('-o', '--output', help='Cross reference', required=True) |
141 parser.add_argument('-g', '--log', help='Log File', required=True) | 141 parser.add_argument('-g', '--log', help='Log File', required=True) |
142 parser.add_argument('-t', '--temp', help='Temporary directory', required=True) | 142 parser.add_argument('-e', '--evalue', help='e-Value threshold', type=float, default=0.0) |
143 args = parser.parse_args() | 143 args = parser.parse_args() |
144 main(args) | 144 main(args) |