Mercurial > repos > cpt > cpt_disruptin_proximity
comparison disruptin_proximity_2_lysis_genes.py @ 1:5081ba961953 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:40:58 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:6661bb42b5a9 | 1:5081ba961953 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and genes encoding transmembrane domains. The goal is narrow a list of disruptin candidates by identifying the sequences close to the other lysis genes in the phage genome. | |
4 Inputs for this program include a .fasta file with protein sequences of lysis gene candidates from the phage genome, a .gff3 file with the tmhmm results from the genome, a .gff3 file with the results from interproscan of the genome, a .gff3 file of the genome, window size in number of base pairs, a tab separated list of endolysin domains, and optional names of output files. | |
5 The program outputs lists of lysis gene candidates that are close to protein codings sequences with endolysin domains or to sequences with transmembrane domains and lists of the proteins in proximity to the lysis gene candidates (one list for proteins with endolysin domains and one list for TMD-containing proteins). | |
6 """ | |
7 | |
8 from Bio import SeqIO | |
9 import argparse | |
10 import sys | |
11 from CPT_GFFParser import gffParse, gffWrite | |
12 from Bio.SeqRecord import SeqRecord | |
13 from Bio.SeqFeature import SeqFeature | |
14 from Bio.Seq import Seq | |
15 from intervaltree import IntervalTree, Interval | |
16 from gff3 import feature_lambda, feature_test_type | |
17 | |
18 # Used for genome in fasta format | |
19 # outputs the start and end coordinates from a record in fasta format | |
20 # Should work for seqrecords from NCBI database | |
21 # def location_start_end(seqrec): | |
22 # F = seqrec.description | |
23 # description_subparts = F.split(' ') | |
24 # for i in range(len(description_subparts)): | |
25 # if description_subparts[i].startswith('[location'): | |
26 # location = description_subparts[i][10:-1] | |
27 # location_se = location.split('..') | |
28 # location_start = location_se[0] | |
29 # location_end = location_se[1] | |
30 # | |
31 # return location_start, location_end | |
32 | |
33 # adapted from intersect_and_adjacent.py | |
34 def treeFeatures(features): | |
35 for feat in features: | |
36 # used with genome in fasta format | |
37 # start, end = location_start_end(feat) | |
38 | |
39 # Interval(begin, end, data) | |
40 yield Interval(int(feat.location.start), int(feat.location.end), feat.id) | |
41 | |
42 | |
43 # Function to read enzyme domain names and ids from the enzyme list | |
44 # Enzyme list must be a tab separated txt file with the format with four columns: Predicted catalytic or binding domain, Abbreviation, Conserved domain, Phage example | |
45 # The first column is used here as the domain name, and the 3rd column is the domain id | |
46 def read_enzyme_list(enzyme_file=None): | |
47 enzyme_file.seek(0) | |
48 domains = [] | |
49 # domain_names = [] | |
50 | |
51 for line in enzyme_file: | |
52 if not line.startswith("*"): | |
53 words = line.split("\t") | |
54 if len(words) > 3: | |
55 domains += [words[2]] | |
56 # domain_names += [words[0]] | |
57 | |
58 return domains[1:] | |
59 | |
60 | |
61 # adapted from intersect_and_adjacent.py | |
62 def intersect(rec_a, rec_b, window): | |
63 if len(rec_a) > 0 and len(rec_b) > 0: | |
64 | |
65 # builds interval tree from Interval objects of form (start, end, id) for each feature | |
66 tree_a = IntervalTree(list(treeFeatures(rec_a))) | |
67 tree_b = IntervalTree(list(treeFeatures(rec_b))) | |
68 | |
69 # Used to map ids back to features later | |
70 rec_a_map = {f.id: f for f in rec_a} | |
71 rec_b_map = {f.id: f for f in rec_b} | |
72 | |
73 rec_a_hits_in_b = [] | |
74 rec_b_hits_in_a = [] | |
75 | |
76 for feature in rec_a: | |
77 # Used with genome in fasta format | |
78 # start, end = location_start_end(feature) | |
79 # Save each feature in rec_a that overlaps a feature in rec_b | |
80 # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end))) | |
81 hits = tree_b[ | |
82 (int(feature.location.start) - window) : ( | |
83 int(feature.location.end) + window | |
84 ) | |
85 ] | |
86 # feature id is saved in interval result.data, use map to get full feature | |
87 for hit in hits: | |
88 rec_a_hits_in_b.append(rec_b_map[hit.data]) | |
89 | |
90 for feature in rec_b: | |
91 # Used with genome in fasta format | |
92 # start, end = location_start_end(feature) | |
93 # Save each feature in rec_a that overlaps a feature in rec_b | |
94 # hits = tree_a.find_range((int(feature.location.start), int(feature.location.end))) | |
95 hits = tree_a[ | |
96 (int(feature.location.start) - window) : ( | |
97 int(feature.location.end) + window | |
98 ) | |
99 ] | |
100 # feature id is saved in interval result.data, use map to get full feature | |
101 for hit in hits: | |
102 rec_b_hits_in_a.append(rec_a_map[hit.data]) | |
103 | |
104 # Remove duplicate features using sets | |
105 rec_a = set(rec_a_hits_in_b) | |
106 rec_b = set(rec_b_hits_in_a) | |
107 | |
108 else: | |
109 # If one input is empty, output two empty result files. | |
110 rec_a = SeqRecord(Seq(""), "none") | |
111 rec_b = SeqRecord(Seq(""), "none") | |
112 return rec_a, rec_b | |
113 | |
114 | |
115 # Function to identify enzyme domains associated with endolysin function from the interproscan results file | |
116 def find_endolysins(rec_ipro, enzyme_domain_ids): | |
117 | |
118 # print(rec_ipro) | |
119 | |
120 if len(rec_ipro) > 0: | |
121 endo_rec_names = [] | |
122 endo_rec_domain_ids = [] | |
123 rec_domain_name = [] | |
124 | |
125 # Check each feature in the ipro file if the domain id/feature qualifier "Name" is included in the domain list. | |
126 for seq in rec_ipro: | |
127 for i in range(len(seq.features)): | |
128 f = seq.features[i] | |
129 if f.type == "protein_match": | |
130 # print(f) | |
131 unwanted = ["TRANS", "SIGNAL", "CYTO", "Coil", "Signal"] | |
132 # Ignores feature with unwanted key words in the feature name | |
133 if all(x not in f.qualifiers["Name"][0] for x in unwanted): | |
134 # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored | |
135 domain_description = [str(f.qualifiers["Name"][0])] | |
136 if "signature_desc" in f.qualifiers: | |
137 domain_description += [ | |
138 str(f.qualifiers["signature_desc"][0]) | |
139 ] | |
140 | |
141 for i in enzyme_domain_ids: | |
142 for y in domain_description: | |
143 if i in y: | |
144 endo_rec_domain_ids += [f.qualifiers["Name"][0]] | |
145 | |
146 # e_index = enzyme_domain_ids.index(i) | |
147 # rec_domain_name += [enzyme_domain_names[e_index]] | |
148 | |
149 target = f.qualifiers["Target"][0] | |
150 target = target.split(" ") | |
151 protein_name = str(target[0]) + "**" | |
152 endo_rec_names += [protein_name] | |
153 | |
154 return endo_rec_names, endo_rec_domain_ids | |
155 | |
156 | |
157 def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): | |
158 rec_lgc = list(SeqIO.parse(lgc, "fasta")) | |
159 rec_tmhmm = gffParse(tmhmm) | |
160 rec_ipro = gffParse(ipro) | |
161 recTemp = gffParse(genome)[0] | |
162 tempFeats = feature_lambda( | |
163 recTemp.features, | |
164 feature_test_type, | |
165 {"types": ["CDS"]}, | |
166 subfeatures=True, | |
167 ) | |
168 recTemp.features = tempFeats | |
169 rec_genome_ini = [recTemp] | |
170 | |
171 # genome.seek(0) | |
172 # examiner = GFFExaminer() | |
173 # print(examiner.available_limits(genome)) | |
174 | |
175 enzyme_domain_ids = read_enzyme_list(enzyme) | |
176 | |
177 if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0: | |
178 | |
179 # find names of the proteins containing endolysin associated domains | |
180 endo_names, endo_domain_ids = find_endolysins(rec_ipro, list(enzyme_domain_ids)) | |
181 | |
182 # find names of proteins containing transmembrane domains | |
183 tmhmm_protein_names = [] | |
184 for seq in rec_tmhmm: | |
185 tmhmm_protein_names += [str(seq.id) + "**"] | |
186 | |
187 lgc_names = [] | |
188 for seq in rec_lgc: | |
189 lgc_names += [str(seq.id) + "**"] | |
190 | |
191 adjacent_endo = {} | |
192 adjacent_lgc_to_endo = {} | |
193 adjacent_tm = {} | |
194 adjacent_lgc_to_tm = {} | |
195 | |
196 # print(len(tmhmm_protein_names), len(endo_names)) | |
197 # print(rec_genome_ini) | |
198 # print(len(rec_genome_ini)) | |
199 | |
200 for i in range(len(rec_genome_ini)): | |
201 rec_genome = rec_genome_ini[i] | |
202 | |
203 # find records for proteins containing endolysin domains and tmds from genome fasta file | |
204 tm_seqrec = [] | |
205 endolysin_seqrec = [] | |
206 lgc_seqrec = [] | |
207 | |
208 # print(tmhmm_protein_names) | |
209 # print(endo_names) | |
210 # print(lgc_names) | |
211 # print(rec_genome) | |
212 | |
213 for feat in rec_genome.features: | |
214 # print(feat) | |
215 # searches for synonyms and | |
216 if feat.type == "CDS": | |
217 feat_names = [] | |
218 feat_names.append(str(feat.id) + "**") | |
219 if "locus_tag" in feat.qualifiers: | |
220 feat_names.append(str(feat.qualifiers["locus_tag"][0]) + "**") | |
221 if "protein_id" in feat.qualifiers: | |
222 feat_names.append(str(feat.qualifiers["protein_id"][0]) + "**") | |
223 if "Name" in feat.qualifiers: | |
224 if len(str(feat.qualifiers["Name"][0])) > 5: | |
225 feat_names.append(str(feat.qualifiers["Name"][0]) + "**") | |
226 | |
227 # print(str(feat_names)) | |
228 # print(str(feat.qualifiers)) | |
229 for i in range(len(feat_names)): | |
230 if str(feat_names[i]) in str(lgc_names): | |
231 lgc_seqrec += [feat] | |
232 # check if gene annotated as holin using key words/synonyms | |
233 holin_annotations = ["holin"] | |
234 if "product" in feat.qualifiers: | |
235 if any( | |
236 x for x in holin_annotations if (x in str(feat.qualifiers)) | |
237 ): | |
238 tm_seqrec += [feat] | |
239 # check if protein contains a TMD | |
240 for i in range(len(feat_names)): | |
241 if str(feat_names[i]) in str(tmhmm_protein_names): | |
242 # print(feat_names[i]) | |
243 tm_seqrec += [feat] | |
244 | |
245 # check if gene annotated as endolysin using key words/synonyms | |
246 endolysin_annotations = ["lysin", "lysozyme"] | |
247 if "product" in feat.qualifiers: | |
248 if any( | |
249 x | |
250 for x in endolysin_annotations | |
251 if (x in str(feat.qualifiers)) | |
252 ): | |
253 endolysin_seqrec += [feat] | |
254 # check if protein contains an endolysin-associated domain | |
255 for i in range(len(feat_names)): | |
256 if str(feat_names[i]) in str(endo_names): | |
257 endolysin_seqrec += [feat] | |
258 | |
259 # print(endolysin_seqrec, tm_seqrec, lgc_seqrec) | |
260 # find possible endolysins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates | |
261 # if len(endolysin_seqrec) > 0: | |
262 adjacent_lgc_to_endo_i, adjacent_endo_i = intersect( | |
263 endolysin_seqrec, lgc_seqrec, window | |
264 ) | |
265 # find TMD-containing proteins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates | |
266 # if len(tm_seqrec) > 0: | |
267 adjacent_lgc_to_tm_i, adjacent_tm_i = intersect( | |
268 tm_seqrec, lgc_seqrec, window | |
269 ) | |
270 | |
271 # print(len(endolysin_seqrec), len(lgc_seqrec), len(tm_seqrec)) | |
272 adjacent_endo[rec_genome.id] = adjacent_endo_i | |
273 adjacent_lgc_to_endo[rec_genome.id] = adjacent_lgc_to_endo_i | |
274 adjacent_tm[rec_genome.id] = adjacent_tm_i | |
275 adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i | |
276 # print(rec_genome.id) | |
277 else: | |
278 return 0, 0, 0, 0 | |
279 # print(adjacent_endo) | |
280 return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm | |
281 | |
282 | |
283 if __name__ == "__main__": | |
284 parser = argparse.ArgumentParser( | |
285 description="Identify lysis gene candidates next to possible endolysin or holin genes", | |
286 epilog="", | |
287 ) | |
288 parser.add_argument( | |
289 "lgc", | |
290 type=argparse.FileType("r"), | |
291 help="fasta file with protein coding sequences of lysis gene candidates", | |
292 ) | |
293 parser.add_argument( | |
294 "tmhmm", | |
295 type=argparse.FileType("r"), | |
296 help="gff3 file with tmhmm results from the coding sequences in the genome", | |
297 ) | |
298 parser.add_argument( | |
299 "ipro", | |
300 type=argparse.FileType("r"), | |
301 help="gff3 file with interpro results from protein sequences in the genome", | |
302 ) | |
303 parser.add_argument( | |
304 "genome", | |
305 type=argparse.FileType("r"), | |
306 help="fasta file with protein coding sequences for all genes in the genome", | |
307 ) | |
308 parser.add_argument( | |
309 "window", | |
310 type=int, | |
311 default=1000, | |
312 help="Allows features this far away to still be considered 'adjacent'", | |
313 ) | |
314 parser.add_argument( | |
315 "enzyme", | |
316 type=argparse.FileType("r"), | |
317 help="tab delimited text file including list of conserved protein domains linked to endolysin function", | |
318 ) | |
319 parser.add_argument("--oa", type=str, default="possible_endolysin.gff3") | |
320 parser.add_argument( | |
321 "--ob", type=str, default="lysis_gene_candidates_near_endolysin.gff3" | |
322 ) | |
323 parser.add_argument("--oc", type=str, default="possible_holin.gff3") | |
324 parser.add_argument( | |
325 "--od", type=str, default="lysis_gene_candidates_near_holin.gff3" | |
326 ) | |
327 args = parser.parse_args() | |
328 | |
329 endo, lgc_endo, tm, lgc_tm = adjacent_lgc( | |
330 args.lgc, args.tmhmm, args.ipro, args.genome, args.enzyme, args.window | |
331 ) | |
332 | |
333 if endo == 0: | |
334 with open(args.oa, "w") as handle: | |
335 handle.write("##gff-version 3") | |
336 with open(args.ob, "w") as handle: | |
337 handle.write("##gff-version 3") | |
338 with open(args.oc, "w") as handle: | |
339 handle.write("##gff-version 3") | |
340 with open(args.od, "w") as handle: | |
341 handle.write("##gff-version 3") | |
342 return | |
343 | |
344 args.genome.seek(0) | |
345 rec = list(gffParse(args.genome)) | |
346 | |
347 with open(args.oa, "w") as handle: | |
348 for i in range(len(rec)): | |
349 rec_i = rec[i] | |
350 if endo.get(rec_i.id, "") is not "": | |
351 rec_i.features = endo[rec_i.id] | |
352 gffWrite([rec_i], handle) | |
353 | |
354 with open(args.ob, "w") as handle: | |
355 for i in range(len(rec)): | |
356 rec_i = rec[i] | |
357 if lgc_endo.get(rec_i.id, "") is not "": | |
358 rec_i.features = lgc_endo[rec_i.id] | |
359 gffWrite([rec_i], handle) | |
360 | |
361 with open(args.oc, "w") as handle: | |
362 for i in range(len(rec)): | |
363 rec_i = rec[i] | |
364 if tm.get(rec_i.id, "") is not "": | |
365 rec_i.features = tm[rec_i.id] | |
366 gffWrite([rec_i], handle) | |
367 | |
368 with open(args.od, "w") as handle: | |
369 for i in range(len(rec)): | |
370 rec_i = rec[i] | |
371 if lgc_tm.get(rec_i.id, "") is not "": | |
372 rec_i.features = lgc_tm[rec_i.id] | |
373 gffWrite([rec_i], handle) |