annotate spring_package/Alignment.py @ 39:172398348efd draft

"planemo upload commit 26b4018c88041ee0ca7c2976e0a012015173d7b6-dirty"
author guerler
date Fri, 22 Jan 2021 15:50:27 +0000
parents 0be0af9e695d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
37
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
1 from Bio import pairwise2
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
2
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
3
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
4 class Alignment:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
5 def __init__(self, fileName):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
6 self.queryName = None
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
7 self.queryStart = list()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
8 self.queryAlignment = list()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
9 self.templateName = None
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
10 self.templateStart = list()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
11 self.templateAlignment = list()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
12 self.readFile(fileName)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
13
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
14 def readFile(self, fileName):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
15 with open(fileName) as file:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
16 for line in file:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
17 cols = line.split()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
18 if len(cols) > 1 and cols[0] == "Query":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
19 self.queryName = cols[1].split()[0][0:14]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
20 if len(cols) > 1 and cols[0].startswith(">"):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
21 self.templateName = cols[0][1:]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
22 if self.queryName and self.templateName:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
23 if len(cols) > 2:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
24 if cols[0] == "Q" and cols[1] == self.queryName:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
25 self.queryStart.append(self.getStart(cols[2]))
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
26 self.queryAlignment.append(cols[3])
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
27 if cols[0] == "T" and cols[1] == self.templateName:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
28 self.templateStart.append(self.getStart(cols[2]))
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
29 self.templateAlignment.append(cols[3])
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
30 if len(cols) > 1 and cols[0] == "No" and cols[1] == "2":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
31 break
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
32
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
33 def createModel(self, templateChain):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
34 hhrMapping = self.mapSequence(templateChain)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
35 previousResidue = dict()
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
36 for residueNumber in templateChain:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
37 templateResidue = templateChain[residueNumber]["residue"]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
38 previousResidue[residueNumber] = templateResidue
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
39 templateChain[residueNumber]["residue"] = None
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
40 tCount = 0
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
41 for i in range(len(self.templateAlignment)):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
42 templateSequence = self.templateAlignment[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
43 querySequence = self.queryAlignment[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
44 queryStart = self.queryStart[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
45 n = len(querySequence)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
46 qCount = 0
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
47 for j in range(n):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
48 qs = querySequence[j]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
49 ts = templateSequence[j]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
50 rs = hhrMapping[tCount]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
51 if rs != "-" and qs != "-" and ts != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
52 residueNumber = rs
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
53 if residueNumber in templateChain:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
54 pr = previousResidue[residueNumber]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
55 if pr != self.toThreeAmino(ts):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
56 print("Warning: Ignoring mismatching residue [%s != %s]." % (pr, self.toThreeAmino(ts)))
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
57 templateChain[residueNumber]["residue"] = self.toThreeAmino(qs)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
58 templateChain[residueNumber]["residueNumber"] = queryStart + qCount
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
59 else:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
60 print("Warning: Skipping missing residue [%s]." % residueNumber)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
61 if qs != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
62 qCount = qCount + 1
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
63 if ts != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
64 tCount = tCount + 1
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
65
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
66 def mapSequence(self, templateChain):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
67 pdbSequence = ""
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
68 for residueNumber in templateChain:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
69 templateResidue = templateChain[residueNumber]["residue"]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
70 pdbSequence = pdbSequence + self.toSingleAmino(templateResidue)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
71 hhrSequence = ""
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
72 for i in range(len(self.templateAlignment)):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
73 templateSequence = self.templateAlignment[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
74 for s in templateSequence:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
75 if s != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
76 hhrSequence = hhrSequence + s
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
77 alignments = pairwise2.align.globalxx(pdbSequence, hhrSequence)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
78 pdbAlignment = alignments[0].seqA
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
79 hhrAlignment = alignments[0].seqB
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
80 pCount = 0
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
81 hhrMapping = []
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
82 for i in range(len(pdbAlignment)):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
83 p = pdbAlignment[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
84 h = hhrAlignment[i]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
85 if h != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
86 residueIndex = pCount if p != "-" else p
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
87 hhrMapping.append(residueIndex)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
88 if p != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
89 pCount = pCount + 1
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
90 sortedResidueIndex = sorted(templateChain.keys())
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
91 for i in range(len(hhrMapping)):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
92 if hhrMapping[i] != "-":
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
93 hhrMapping[i] = sortedResidueIndex[hhrMapping[i]]
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
94 return hhrMapping
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
95
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
96 def getStart(self, x):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
97 try:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
98 return int(x)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
99 except Exception:
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
100 raise Exception("Invalid start index in alignment [%s]." % x)
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
101
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
102 def toThreeAmino(self, seq):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
103 code = dict(G="GLY", A="ALA", V="VAL", L="LEU", I="ILE", M="MET", F="PHE", P="PRO", Y="TYR", W="TRP",
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
104 K="LYS", S="SER", C="CYS", N="ASN", Q="GLN", H="HIS", T="THR", E="GLU", D="ASP", R="ARG")
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
105 return code[seq] if seq in code else "XXX"
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
106
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
107 def toSingleAmino(self, seq):
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
108 code = dict(GLY="G", ALA="A", VAL="V", LEU="L", ILE="I", MET="M", PHE="F", PRO="P", TYR="Y", TRP="W",
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
109 LYS="K", SER="S", CYS="C", ASN="N", GLN="Q", HIS="H", THR="T", GLU="E", ASP="D", ARG="R")
0be0af9e695d "planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
guerler
parents:
diff changeset
110 return code[seq] if seq in code else "X"