Mercurial > repos > guerler > springsuite
comparison spring_map.py @ 39:172398348efd draft
"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
| author | guerler |
|---|---|
| date | Fri, 22 Jan 2021 15:50:27 +0000 |
| parents | 80a4b98121b6 |
| children |
comparison
equal
deleted
inserted
replaced
| 38:80a4b98121b6 | 39:172398348efd |
|---|---|
| 1 #! /usr/bin/env python3 | 1 #! /usr/bin/env python3 |
| 2 import argparse | 2 import argparse |
| 3 import os | 3 from os import mkdir, system |
| 4 from os.path import isfile, isdir | |
| 4 | 5 |
| 5 from spring_package.DBKit import createFile | 6 from spring_package.DBKit import DBKit |
| 6 from spring_package.Molecule import Molecule | 7 from spring_package.Molecule import Molecule |
| 8 from spring_package.Utilities import getId, getChain, getName | |
| 7 | 9 |
| 8 | 10 |
| 9 def getId(line): | 11 def getPDB(line, pdbDatabase): |
| 10 line = line.strip() | 12 pdb = getName(line) |
| 11 if len(line) != 6 or line[4:5] != "_": | 13 pdbChain = getChain(line) |
| 12 raise Exception("Invalid list entry (`PDB_CHAIN`): %s." % line) | 14 pdbFile = "temp/temp.pdb" |
| 13 return line[:4].upper() + line[4:6] | |
| 14 | |
| 15 | |
| 16 def getPDB(line, args): | |
| 17 pdb = line[:4].lower() | |
| 18 pdbChain = line[5:6] | |
| 19 pdbFile = "%s/temp.pdb" % args.temp | |
| 20 pdbDatabaseId = "%s.pdb" % pdb | 15 pdbDatabaseId = "%s.pdb" % pdb |
| 21 createFile(pdbDatabaseId, args.index, args.database, pdbFile) | 16 pdbDatabase.createFile(pdbDatabaseId, pdbFile) |
| 22 return pdbFile, pdbChain | 17 return pdbFile, pdbChain |
| 23 | 18 |
| 24 | 19 |
| 25 def getSequences(fileName): | 20 def getSequences(fileName): |
| 26 sequences = dict() | 21 sequences = dict() |
| 31 nextLine = next(file) | 26 nextLine = next(file) |
| 32 sequences[name] = nextLine | 27 sequences[name] = nextLine |
| 33 return sequences | 28 return sequences |
| 34 | 29 |
| 35 | 30 |
| 36 def getHomologue(queryFile, queryResultFile, databaseFile): | 31 def findMatch(identifier, templates, databaseFile, pdbDatabase, evalue=0.0): |
| 37 if not os.path.isfile(queryResultFile): | 32 if identifier in templates: |
| 38 os.system("psiblast -query %s -db %s -out %s" % (queryFile, | 33 return identifier |
| 39 databaseFile, queryResultFile)) | 34 resultSub = identifier[:2] |
| 35 fastaFile = "temp/%s/%s.fasta" % (resultSub, identifier) | |
| 36 resultFile = "%s.result" % fastaFile | |
| 37 if not isfile(resultFile): | |
| 38 resultDir = "temp/%s" % resultSub | |
| 39 if not isdir(resultDir): | |
| 40 mkdir(resultDir) | |
| 41 pdbFile, pdbChain = getPDB(identifier, pdbDatabase) | |
| 42 mol = Molecule(pdbFile) | |
| 43 seq = mol.getSequence(pdbChain) | |
| 44 with open(fastaFile, "w") as fasta: | |
| 45 fasta.write(">%s\n" % identifier) | |
| 46 fasta.write("%s" % seq) | |
| 47 system("psiblast -query %s -db %s -out %s" % (fastaFile, databaseFile, resultFile)) | |
| 40 maxMatch = None | 48 maxMatch = None |
| 41 try: | 49 try: |
| 42 with open(queryResultFile) as file: | 50 with open(resultFile) as file: |
| 43 for i in range(38): | 51 for i in range(38): |
| 44 line = next(file) | 52 line = next(file) |
| 45 maxMatch = getId(line.split()[0]) | 53 columns = line.split() |
| 54 maxMatch = getId(columns[0]) | |
| 55 maxScore = float(columns[2]) | |
| 56 if maxScore > evalue: | |
| 57 return None | |
| 46 except Exception: | 58 except Exception: |
| 47 return None | 59 return None |
| 48 return maxMatch | 60 return maxMatch |
| 49 | 61 |
| 50 | 62 |
| 51 def main(args): | 63 def main(args): |
| 64 if not isdir("temp"): | |
| 65 mkdir("temp") | |
| 52 logFile = open(args.log, "w") | 66 logFile = open(args.log, "w") |
| 53 temp = args.temp.rstrip("/") | 67 templateSequenceFile = "temp/templates.fasta" |
| 68 pdbDatabase = DBKit(args.index, args.database) | |
| 54 templates = set() | 69 templates = set() |
| 55 os.system("mkdir -p %s" % temp) | 70 with open(args.list) as file: |
| 56 templateSequenceFile = "%s/templates.fasta" % temp | 71 for rawId in file: |
| 57 if not os.path.isfile(templateSequenceFile): | 72 templateId = getId(rawId) |
| 73 templates.add(templateId) | |
| 74 if not isfile(templateSequenceFile): | |
| 58 templateSequences = open(templateSequenceFile, "w") | 75 templateSequences = open(templateSequenceFile, "w") |
| 59 with open(args.list) as file: | 76 with open(args.list) as file: |
| 60 for rawId in file: | 77 for rawId in file: |
| 61 templateId = getId(rawId) | 78 templateId = getId(rawId) |
| 62 templates.add(templateId) | 79 pdbFile, pdbChain = getPDB(templateId, pdbDatabase) |
| 63 pdbFile, pdbChain = getPDB(templateId, args) | |
| 64 try: | 80 try: |
| 65 templateMol = Molecule(pdbFile) | 81 templateMol = Molecule(pdbFile) |
| 66 templateSeq = templateMol.getSequence(pdbChain) | 82 templateSeq = templateMol.getSequence(pdbChain) |
| 67 templateSequences.write(">%s\n" % templateId) | 83 templateSequences.write(">%s\n" % templateId) |
| 68 templateSequences.write("%s\n" % templateSeq) | 84 templateSequences.write("%s\n" % templateSeq) |
| 69 except Exception: | 85 except Exception: |
| 70 logFile.write("Warning: File not found [%s].\n" % pdbFile) | 86 logFile.write("Warning: File not found [%s].\n" % templateId) |
| 71 templateSequences.close() | 87 templateSequences.close() |
| 72 os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) | 88 system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) |
| 73 else: | 89 else: |
| 74 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) | 90 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) |
| 75 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) | 91 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) |
| 76 logFile.flush() | 92 logFile.flush() |
| 77 | 93 |
| 85 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) | 101 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) |
| 86 logFile.flush() | 102 logFile.flush() |
| 87 | 103 |
| 88 for refEntry in crossReference: | 104 for refEntry in crossReference: |
| 89 coreId = refEntry["core"] | 105 coreId = refEntry["core"] |
| 106 logFile.write("Processing %s.\n" % coreId) | |
| 107 coreMatch = findMatch(coreId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) | |
| 90 partnerId = refEntry["partner"] | 108 partnerId = refEntry["partner"] |
| 91 partnerFile = "%s/%s.fasta" % (temp, partnerId) | 109 logFile.write("Processing %s.\n" % partnerId) |
| 92 partnerResultFile = "%s.result" % partnerFile | 110 partnerMatch = findMatch(partnerId, templates, templateSequenceFile, pdbDatabase, evalue=args.evalue) |
| 93 if partnerId in templates: | 111 if partnerMatch is None or coreMatch is None: |
| 94 refEntry["match"] = partnerId | 112 logFile.write("Warning: Failed alignment [%s, %s].\n" % (coreId, partnerId)) |
| 95 logFile.write("Found partner in template list alignment [%s].\n" % partnerId) | |
| 96 else: | 113 else: |
| 97 logFile.write("Processing %s.\n" % partnerId) | 114 logFile.write("Found matching entries [%s, %s].\n" % (coreMatch, partnerMatch)) |
| 98 if not os.path.isfile(partnerResultFile): | 115 refEntry["coreMatch"] = coreMatch |
| 99 pdbFile, pdbChain = getPDB(partnerId, args) | 116 refEntry["partnerMatch"] = partnerMatch |
| 100 partnerMol = Molecule(pdbFile) | |
| 101 partnerSeq = partnerMol.getSequence(pdbChain) | |
| 102 with open(partnerFile, "w") as partnerFasta: | |
| 103 partnerFasta.write(">%s\n" % partnerId) | |
| 104 partnerFasta.write("%s" % partnerSeq) | |
| 105 else: | |
| 106 logFile.write("Using existing results. [%s].\n" % partnerId) | |
| 107 matchedId = getHomologue(partnerFile, partnerResultFile, | |
| 108 templateSequenceFile) | |
| 109 if matchedId is None: | |
| 110 logFile.write("Warning: Failed alignment [%s].\n" % partnerId) | |
| 111 else: | |
| 112 logFile.write("Found matching entry %s.\n" % matchedId) | |
| 113 refEntry["match"] = matchedId | |
| 114 logFile.flush() | 117 logFile.flush() |
| 115 | 118 |
| 116 finalSet = set() | 119 entryCount = 0 |
| 117 for refEntry in crossReference: | 120 with open(args.output, 'w') as output_file: |
| 118 coreId = refEntry["core"] | 121 for refEntry in crossReference: |
| 119 partnerId = refEntry["partner"] | 122 coreId = refEntry["core"] |
| 120 if "match" in refEntry: | 123 partnerId = refEntry["partner"] |
| 121 entry = "%s\t%s" % (coreId, refEntry["match"]) | 124 if "coreMatch" in refEntry and "partnerMatch" in refEntry: |
| 122 if entry not in finalSet: | 125 entry = "%s\t%s\t%s\t%s\n" % (refEntry["coreMatch"], refEntry["partnerMatch"], refEntry["core"], refEntry["partner"]) |
| 123 finalSet.add(entry) | 126 output_file.write(entry) |
| 124 else: | 127 entryCount = entryCount + 1 |
| 125 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) | 128 else: |
| 126 logFile.write("Found %s cross reference entries.\n" % len(finalSet)) | 129 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) |
| 130 logFile.write("Found %s cross reference entries.\n" % entryCount) | |
| 127 logFile.close() | 131 logFile.close() |
| 128 | |
| 129 with open(args.output, 'w') as output_file: | |
| 130 for entry in sorted(finalSet): | |
| 131 output_file.write("%s\n" % entry) | |
| 132 | 132 |
| 133 | 133 |
| 134 if __name__ == "__main__": | 134 if __name__ == "__main__": |
| 135 parser = argparse.ArgumentParser(description='Maps binding partners to template library') | 135 parser = argparse.ArgumentParser(description='Maps binding partners to template library') |
| 136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) | 136 parser.add_argument('-l', '--list', help='List of template entries `PDB_CHAIN`', required=True) |
| 137 parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True) | 137 parser.add_argument('-i', '--index', help='PDB Database Index file (ffindex)', required=True) |
| 138 parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True) | 138 parser.add_argument('-d', '--database', help='PDB Database files (ffdata)', required=True) |
| 139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) | 139 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) |
| 140 parser.add_argument('-o', '--output', help='Cross reference', required=True) | 140 parser.add_argument('-o', '--output', help='Cross reference', required=True) |
| 141 parser.add_argument('-g', '--log', help='Log File', required=True) | 141 parser.add_argument('-g', '--log', help='Log File', required=True) |
| 142 parser.add_argument('-t', '--temp', help='Temporary directory', required=True) | 142 parser.add_argument('-e', '--evalue', help='e-Value threshold', type=float, default=0.0) |
| 143 args = parser.parse_args() | 143 args = parser.parse_args() |
| 144 main(args) | 144 main(args) |
