0
|
1 #!/usr/bin/env python3
|
|
2 """
|
|
3 Copyright 2019 Ryan Wick (rrwick@gmail.com)
|
|
4 https://github.com/rrwick/Compare-annotations/
|
|
5 This program is free software: you can redistribute it and/or modify it under the terms of the GNU
|
|
6 General Public License as published by the Free Software Foundation, either version 3 of the
|
|
7 License, or (at your option) any later version.
|
|
8 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
|
|
9 even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
10 General Public License for more details.
|
|
11 You should have received a copy of the GNU General Public License along with this program. If not,
|
|
12 see <https://www.gnu.org/licenses/>.
|
|
13 """
|
|
14
|
|
15 import argparse
|
|
16 from Bio import SeqIO
|
|
17 from Bio import pairwise2
|
|
18 from Bio.pairwise2 import format_alignment
|
|
19 import itertools
|
|
20 import sys
|
|
21
|
|
22
|
|
23 def addArr(arrA, arrB):
|
|
24 res = []
|
|
25 for x in range(0, min(len(arrA), len(arrB))):
|
|
26 res.append(arrA[x] + arrB[x])
|
|
27 return res
|
|
28
|
|
29 def get_arguments():
|
|
30 parser = argparse.ArgumentParser(description='Compare GenBank annotations',
|
|
31 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
|
32
|
|
33 parser.add_argument('annotation_1', type=str,
|
|
34 help='First annotated genome in Genbank format')
|
|
35 parser.add_argument('annotation_2', type=str,
|
|
36 help='Second annotated genome in Genbank format')
|
|
37
|
|
38
|
|
39 parser.add_argument('--match_identity_threshold', type=float, default=0.7,
|
|
40 help='Two genes must have at least this identity to be considerd the same (0.0 to 1.0)')
|
|
41 parser.add_argument('--allowed_skipped_genes', type=int, default=10,
|
|
42 help='This many missing genes are allowed when aligning the annotations')
|
|
43 parser.add_argument("--addNotes", action="store_true", help="Add Note fields")
|
|
44
|
|
45 parser.add_argument(
|
|
46 "-sumOut", type=argparse.FileType("w"), help="Summary out file"
|
|
47 )
|
|
48 args = parser.parse_args()
|
|
49 return args
|
|
50
|
|
51
|
|
52 def main():
|
|
53 args = get_arguments()
|
|
54
|
|
55 # Load in the CDS features from the two assemblies.
|
|
56 old = SeqIO.parse(args.annotation_1, 'genbank')
|
|
57 new = SeqIO.parse(args.annotation_2, 'genbank')
|
|
58 old_record = next(old)
|
|
59 new_record = next(new)
|
|
60 old_features, new_features = [], []
|
|
61 for f in old_record.features:
|
|
62 if f.type == 'CDS':
|
|
63 old_features.append(f)
|
|
64 for f in new_record.features:
|
|
65 if f.type == 'CDS':
|
|
66 new_features.append(f)
|
|
67
|
|
68 args.sumOut.write('Features in First Genbank\'s assembly:\t' + str(len(old_features)) + "\n")
|
|
69 args.sumOut.write('Features in Second Genbank\'s assembly:\t' + str(len(new_features)) + "\n\n")
|
|
70
|
|
71 # Align the features to each other.
|
|
72 offsets = sorted(list(itertools.product(range(args.allowed_skipped_genes + 1),
|
|
73 range(args.allowed_skipped_genes + 1))),
|
|
74 key=lambda x: x[0]+x[1])
|
|
75 old_i, new_i = 0, 0
|
|
76 exactRec = 0
|
|
77 inexactRec = [0, 0 ,0]
|
|
78 hypoRec = [0, 0, 0]
|
|
79 newCount = 0
|
|
80 oldCount = 0
|
|
81 if args.addNotes:
|
|
82 print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\tFirst Record's Notes\tSecond Record's Notes\n")
|
|
83 else:
|
|
84 print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\n")
|
|
85 while True:
|
|
86 if old_i >= len(old_features) and new_i >= len(new_features):
|
|
87 break
|
|
88
|
|
89 for old_offset, new_offset in offsets:
|
|
90 try:
|
|
91 old_feature = old_features[old_i + old_offset]
|
|
92 except IndexError:
|
|
93 old_feature = None
|
|
94 try:
|
|
95 new_feature = new_features[new_i + new_offset]
|
|
96 except IndexError:
|
|
97 new_feature = None
|
|
98 try:
|
|
99 match, identity, length_diff = compare_features(old_feature, new_feature, old_record, new_record, args.match_identity_threshold)
|
|
100 except TypeError:
|
|
101 break
|
|
102 if match:
|
|
103 for j in range(old_offset):
|
|
104 print_in_old_not_new(old_features[old_i + j])
|
|
105 oldCount += 1
|
|
106 for j in range(new_offset):
|
|
107 print_in_new_not_old(new_features[new_i + j])
|
|
108 newCount += 1
|
|
109 if identity == 1.0:
|
|
110 exactRec += 1
|
|
111 res1, res2 = print_match(old_features[old_i + old_offset], new_features[new_i + new_offset], identity, length_diff, args.addNotes)
|
|
112 inexactRec = addArr(inexactRec, res1)
|
|
113 hypoRec = addArr(hypoRec, res2)
|
|
114 old_i += old_offset
|
|
115 new_i += new_offset
|
|
116 break
|
|
117 else:
|
|
118 sys.stderr.write("Exceeded allowed number of skipped genes (" + str(args.allowed_skipped_genes) + "), unable to maintain alignment and continue comparison.\n")
|
|
119 exit(2)
|
|
120
|
|
121 if old_feature is None and new_feature is None:
|
|
122 break
|
|
123
|
|
124 old_i += 1
|
|
125 new_i += 1
|
|
126
|
|
127 args.sumOut.write('Exact Match:\t' + str(exactRec) + "\n\n")
|
|
128
|
|
129 args.sumOut.write('Inexact Match:\t' + str(inexactRec[0] + inexactRec[1] + inexactRec[2]) + "\n")
|
|
130 args.sumOut.write(' Same length:\t' + str(inexactRec[0]) + "\n")
|
|
131 args.sumOut.write(' Second Gbk Seq longer:\t' + str(inexactRec[2]) + "\n")
|
|
132 args.sumOut.write(' First Gbk Seq longer:\t' + str(inexactRec[1]) + "\n\n")
|
|
133
|
|
134 args.sumOut.write('In Second Gbk but not in first:\t' + str(newCount) + "\n")
|
|
135 args.sumOut.write('In First Gbk but not in second:\t' + str(oldCount) + "\n\n")
|
|
136
|
|
137 args.sumOut.write('Hypothetical Annotation Change:\t' + str(hypoRec[1] + hypoRec[2]) + "\n")
|
|
138 args.sumOut.write('Hypothetical:\t' + str(hypoRec[0] + hypoRec[2]) + "\n")
|
|
139
|
|
140
|
|
141 def print_match(f1, f2, identity, length_diff, outNotes):
|
|
142 #print('', flush=True)
|
|
143 line = f1.qualifiers['product'][0] + "\t"
|
|
144 matchArr = [0, 0, 0]
|
|
145 hypoArr = [0, 0, 0]
|
|
146 if identity == 1.0:
|
|
147 # print('Exact match')
|
|
148 line += 'Exact match\t' + f2.qualifiers['product'][0] + "\t100.0\tSame Length\t"
|
|
149 else:
|
|
150 # print('Inexact match (' + '%.2f' % (identity * 100.0) + '% ID, ', end='')
|
|
151 line += 'Inexact match\t' + f2.qualifiers['product'][0] + "\t%.2f\t" % (identity * 100.0)
|
|
152 if length_diff == 0:
|
|
153 # print('same length)')
|
|
154 line +="Same Length\t"
|
|
155 matchArr[0] += 1
|
|
156 elif length_diff > 0:
|
|
157 # print('old seq longer)')
|
|
158 line +="First Gbk Seq Longer\t"
|
|
159 matchArr[1] += 1
|
|
160 elif length_diff < 0:
|
|
161 # print('new seq longer)')
|
|
162 line +="Second Gbk Seq Longer\t"
|
|
163 matchArr[2] += 1
|
|
164 # print(' old: ', end='')
|
|
165 # print_feature_one_line(f1)
|
|
166 line += print_feature_one_line(f1) + "\t"
|
|
167 # print(' new: ', end='')
|
|
168 # print_feature_one_line(f2)
|
|
169 line += print_feature_one_line(f2) + "\t"
|
|
170 p1 = f1.qualifiers['product'][0].lower()
|
|
171 p2 = f2.qualifiers['product'][0].lower()
|
|
172 if 'hypothetical' in p1 and 'hypothetical' in p2:
|
|
173 # print(' still hypothetical')
|
|
174 line += "Hypothetical\t"
|
|
175 hypoArr[0] += 1
|
|
176 elif 'hypothetical' in p1 and 'hypothetical' not in p2:
|
|
177 # print(' no longer hypothetical')
|
|
178 line += "No Longer Hypothetical\t"
|
|
179 hypoArr[1] += 1
|
|
180 elif 'hypothetical' not in p1 and 'hypothetical' in p2:
|
|
181 # print(' became hypothetical')
|
|
182 line += "Became Hypothetical\t"
|
|
183 hypoArr[2] += 1
|
|
184 else:
|
|
185 line += "'Hypothetical' not in second nor first Gbk's product tag"
|
|
186
|
|
187 if outNotes:
|
|
188 line += "\t"
|
|
189 if "note" in f1.qualifiers.keys():
|
|
190 for x in f1.qualifiers["note"]:
|
|
191 line += x
|
|
192 line += "\t"
|
|
193 else:
|
|
194 line += "N/A\t"
|
|
195 if "note" in f2.qualifiers.keys():
|
|
196 for x in f2.qualifiers["note"]:
|
|
197 line += x
|
|
198 else:
|
|
199 line += "N/A"
|
|
200
|
|
201 print(line)
|
|
202 return matchArr, hypoArr
|
|
203
|
|
204 def print_in_old_not_new(f): # rename file outputs
|
|
205 line = f.qualifiers['product'][0] + "\tIn First Gbk but not Second\tN/A\t0.00\t" + str(f.location.end - f.location.start) + "\t" + print_feature_one_line(f) + "\tN/A\tN/A"
|
|
206 # print('')
|
|
207 # print('In old but not in new:')
|
|
208 # print(' ', end='')
|
|
209 # print_feature_one_line(f)
|
|
210 print(line)
|
|
211
|
|
212
|
|
213 def print_in_new_not_old(f): # rename file outputs
|
|
214 line = "N/A\tIn Second Gbk but not First\t" + f.qualifiers['product'][0] + "\t0.00\t" + str(f.location.end - f.location.start) + "\tN/A\t" + print_feature_one_line(f) + "\tN/A"
|
|
215 #print('')
|
|
216 #print('In new but not in old:')
|
|
217 #print(' ', end='')
|
|
218 #print_feature_one_line(f)
|
|
219 print(line)
|
|
220
|
|
221
|
|
222 def print_feature_one_line(f):
|
|
223 #f_str = f.qualifiers['product'][0]
|
|
224 f_str = ""
|
|
225 strand = '+' if f.location.strand == 1 else '-'
|
|
226 f_str += '(' + str(f.location.start) + '-' + str(f.location.end) + ' ' + strand + ', '
|
|
227 f_str += str(f.location.end - f.location.start) + ' bp)'
|
|
228 return(f_str)
|
|
229
|
|
230
|
|
231 def compare_features(f1, f2, r1, r2, match_identity_threshold):
|
|
232 if f1 is None or f2 is None:
|
|
233 return False
|
|
234
|
|
235 s1 = f1.extract(r1).seq
|
|
236 s2 = f2.extract(r2).seq
|
|
237 score = pairwise2.align.globalms(s1, s2, 1, 0, 0, 0, score_only=True)
|
|
238 identity = score / max(len(s1), len(s2))
|
|
239 match = identity >= match_identity_threshold
|
|
240 length_diff = len(s1) - len(s2)
|
|
241 return match, identity, length_diff
|
|
242
|
|
243
|
|
244 if __name__ == '__main__':
|
|
245 main()
|