Mercurial > repos > cpt > cpt_blast_to_xmfa
comparison view: blast2pxmfa.py @ 1:c66d6978a5f8 (draft)
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c

author:   cpt
date:     Mon, 05 Jun 2023 02:14:17 +0000
parents:  (none)
children: (none)

comparison legend: equal / deleted / inserted / replaced
revisions compared: 0:84761ea60231 vs 1:c66d6978a5f8
1 #!/usr/bin/env python | |
2 import os | |
3 import argparse | |
4 from CPT_GFFParser import gffParse, gffWrite | |
5 from Bio import SeqIO | |
6 from Bio.SeqRecord import SeqRecord | |
7 from gff3 import feature_lambda, feature_test_true | |
8 from Bio.Align.Applications import ClustalwCommandline | |
9 from Bio import AlignIO | |
10 import tempfile | |
11 import logging | |
12 | |
# Module-wide root logger; DEBUG level so the progress/debug messages
# emitted throughout this script are visible.
log = logging.getLogger()
log.setLevel(logging.DEBUG)
15 | |
16 | |
def parse_gff3(annotations, genome):
    """Index every top-level GFF3 feature by its ID.

    Rewinds both handles, then returns a dict mapping each feature ID to
    its record ID, location, and translated protein sequence (NCBI table
    11, with the trailing stop residue trimmed off).
    """
    annotations.seek(0)
    genome.seek(0)

    seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
    data = {}
    for record in gffParse(annotations, base_dict=seq_dict):
        top_level = feature_lambda(
            record.features, feature_test_true, {}, subfeatures=False
        )
        for feature in top_level:
            protein = feature.extract(record).seq.translate(table=11, cds=False)
            data[feature.id] = {
                "rec": record.id,
                "loc": feature.location,
                # Drop the stop codon's residue from the translation.
                "seq": protein[0:-1],
            }
    return data
33 | |
34 | |
def get_seqids(genome):
    """Return the IDs of all FASTA records in *genome*, in file order."""
    genome.seek(0)
    return [record.id for record in SeqIO.parse(genome, "fasta")]
41 | |
42 | |
def gen_relationships(blast):
    """Yield one relationship dict per row of tabular BLAST output.

    Only the query ("from"), subject ("to"), percent identity, and
    e-value columns of the 12-column tabular format are kept.
    """
    for line in blast:
        fields = line.split("\t")
        yield {
            "from": fields[0],
            "to": fields[1],
            "pident": fields[2],
            "evalue": fields[10],
        }
47 | |
48 | |
def cluster_relationships(data):
    """Single-linkage cluster relationship pairs into sets of feature IDs.

    data: iterable of dicts with at least "from" and "to" keys.
    Returns a list of disjoint sets; IDs connected (transitively) by any
    relationship end up in the same set.

    Bug fix: the previous implementation stopped at the FIRST cluster that
    shared a member with the new pair, so a pair bridging two existing
    clusters left them unmerged and an ID could appear in two clusters.
    We now merge every cluster the pair touches.
    """
    generated_clusters_ids = []

    for relationship in data:
        pair = {relationship["from"], relationship["to"]}

        # Indices of all existing clusters sharing a member with this pair.
        touching = [
            i for i, cluster in enumerate(generated_clusters_ids) if cluster & pair
        ]

        if not touching:
            generated_clusters_ids.append(pair)
            continue

        # Merge the pair plus every touching cluster into the first one,
        # deleting the now-absorbed clusters (back-to-front so the indices
        # collected above stay valid).
        merged = generated_clusters_ids[touching[0]] | pair
        for i in reversed(touching[1:]):
            merged |= generated_clusters_ids[i]
            del generated_clusters_ids[i]
        generated_clusters_ids[touching[0]] = merged

    return generated_clusters_ids
64 | |
65 | |
def align_sequences(SequenceList):
    """Align a list of SeqRecords with ClustalW and return the alignment.

    Returns None when there are fewer than two sequences or when ClustalW /
    alignment parsing fails.

    Side effect: each input record's .id is overwritten with its list index
    before writing; the original IDs are restored on the returned alignment
    objects only, not on the input records themselves.
    """
    # No sense in aligning one sequence.
    if len(SequenceList) < 2:
        return None

    # Clustal mangles IDs. Fricking Clustal. Specifically it truncates them
    # meaning we have to RE-ID every single sequence that goes into it. Lovely.
    id_map = {}
    for idx, seq in enumerate(SequenceList):
        id_map[str(idx)] = seq.id
        seq.id = str(idx)

    # delete=False so the files survive close() and clustalw can open them
    # by name; they are unlinked manually below.
    t_fa = tempfile.NamedTemporaryFile(prefix="blastxmfa.", delete=False)
    t_aln = tempfile.NamedTemporaryFile(prefix="blastxmfa.", delete=False)
    # Write fasta to file
    # NOTE(review): str.encode(t_fa.name) hands SeqIO.write a *bytes* path;
    # this presumably works because open() accepts bytes paths, but the plain
    # str name looks like what was intended — confirm before simplifying.
    SeqIO.write(SequenceList, str.encode(t_fa.name), "fasta")
    # Flush & close (must happen before clustalw reads the file)
    t_fa.flush()
    t_fa.close()
    t_aln.close()

    # Build clustal CLI
    d = ClustalwCommandline(infile=t_fa.name, outfile=t_aln.name)
    logging.debug(d)
    # Call clustal (runs the external clustalw process)
    d()
    # Get our alignment back
    try:
        aln = AlignIO.read(t_aln.name, "clustal")
        # Now we replace the IDs with the correct, full length ones
        for a in aln:
            a.id = id_map[a.id]
        # Cleanup
        os.unlink(t_fa.name)
        os.unlink(t_aln.name)
        return aln
    except Exception as e:
        # The input fasta (t_fa) is deliberately left on disk for post-mortem
        # debugging; its path is included in the error message.
        logging.error("%s, %s", e, t_fa.name)
        os.unlink(t_aln.name)
        return None
106 | |
107 | |
def split_by_n(seq, n):
    """Yield successive n-sized chunks of *seq*; the last may be shorter."""
    for start in range(0, len(seq), n):
        yield seq[start : start + n]
114 | |
115 | |
def larger_than_one(it):
    """Pass through only the items having more than one member."""
    yield from (item for item in it if len(item) > 1)
120 | |
121 | |
def smaller_than_3n(x, it):
    """Pass through items with at most 3*x members; log and drop larger ones.

    x: scale factor for the cutoff (the caller passes the number of FASTA
       records, so clusters much larger than 3 members-per-genome are
       treated as suspect and discarded).
    """
    THRESH = 3 * x
    for item in it:
        if len(item) <= THRESH:
            yield item
        else:
            # Logger.warn() is a deprecated alias; use warning() instead.
            log.warning(
                "Cluster with %s (>%s) members, seems excessive", len(item), THRESH
            )
131 | |
132 | |
class XmfaWriter(object):
    """Minimal writer for Mauve-style XMFA output.

    Emits the "#FormatVersion Mauve1" preamble on construction; each call
    to write() adds one sequence entry (a header line followed by the
    sequence wrapped at 80 columns), and end() closes the current block
    with "=".
    """

    HEADER_TPL = "> {seqnum}:{start}-{end} {strand} {file} # {realname}\n"

    def __init__(self, handle):
        # All output goes straight to the caller-supplied handle.
        self.output = handle
        self.output.write("#FormatVersion Mauve1\n")

    def write(self, seqnum, start, end, strand, file, realname, sequence):
        header = self.HEADER_TPL.format(
            seqnum=seqnum,
            start=start,
            end=end,
            strand=strand,
            file=file,
            realname=realname,
        )
        self.output.write(header)

        # Wrap the sequence at 80 characters per line.
        for chunk in split_by_n(sequence, 80):
            self.output.write(chunk + "\n")

    def end(self):
        # "=" terminates an alignment block in XMFA.
        self.output.write("=\n")
157 | |
158 | |
def blast2pxmfa(blast, fasta, gff3, output, genomic=False):
    """Convert tabular BLAST output into a protein XMFA alignment file.

    blast:   handle to 12-column tabular BLAST output
    fasta:   handle to the genomes' FASTA file
    gff3:    handle to the matching GFF3 gene calls
    output:  writable handle that receives the XMFA result
    genomic: accepted from the CLI but never referenced in this function
    """
    logging.info("Parsing sequence")
    locations = parse_gff3(gff3, fasta)
    logging.info("Parsed locations, clustering")
    recids = get_seqids(fasta)
    logging.info("Found %s records in fasta", len(recids))

    xmw = XmfaWriter(output)

    # First let's generate some blastclust style clusters: pair up BLAST
    # hits, single-linkage cluster them, then drop singleton clusters and
    # implausibly large ones (> 3 * number of genomes).
    clusters = list(
        smaller_than_3n(
            len(recids),
            larger_than_one(cluster_relationships(gen_relationships(blast))),
        )
    )
    logging.debug("%s clusters generated", len(clusters))

    def sortIndexForFeatId(element):
        # Orders features first by their genome's position in the FASTA,
        # then by start coordinate within the genome.
        # Will not work correctly if genome length is >1mb
        general = recids.index(locations[element]["rec"]) * 1000000
        specific = locations[element]["loc"].start
        return general + specific

    for idx, cluster in enumerate(clusters):
        logging.debug("Cluster %s/%s, size=%s", idx + 1, len(clusters), len(cluster))
        # We're considering 1 LCB :: 1 cluster
        seqs = []
        for element in cluster:
            if element not in locations:
                logging.warning("Could not find this feature %s", element)
                continue

            # Wrap the stored protein sequence as a SeqRecord so it can be
            # fed to the aligner; the description carries the coordinates.
            sr = SeqRecord(
                locations[element]["seq"],
                id=element,
                description="[{0.start}:{0.end}:{0.strand}]".format(
                    locations[element]["loc"]
                ),
            )
            seqs.append(sr)

        aligned_seqs = align_sequences(seqs)
        if aligned_seqs is None:
            logging.error("Error aligning cluster [%s]", "".join(cluster))
            continue

        # Sort the IDs and the aligned records by the same genomic key so
        # they can be zipped together below.
        sortedCluster = sorted(cluster, key=lambda x: sortIndexForFeatId(x))
        sortedAligned = sorted(aligned_seqs, key=lambda x: sortIndexForFeatId(x.id))
        # print(sortedCluster, [x.id for x in sortedAligned])

        # Pre-check the asserts: every cluster ID must line up with the
        # corresponding aligned record's ID.
        goodToGo = all(
            [
                element == aligned_seq.id
                for (element, aligned_seq) in zip(sortedCluster, sortedAligned)
            ]
        )
        if not goodToGo:
            logging.info(
                "Skipping one grouping: %s != %s",
                ",".join(sortedCluster),
                ",".join([x.id for x in sortedAligned]),
            )
            # Skip this loop iteration
            continue
        # print(aligned_seqs)
        for element, aligned_seq in zip(sortedCluster, sortedAligned):
            if element not in locations:
                logging.warning("Could not find this feature %s", element)
                continue

            # Emit one XMFA entry per feature; seqnum is the 1-based index
            # of the feature's genome in the FASTA file.
            eloc = locations[element]
            xmw.write(
                seqnum=recids.index(eloc["rec"]) + 1,
                start=eloc["loc"].start,
                end=eloc["loc"].end,
                strand="+" if eloc["loc"].strand == 1 else "-",
                file=eloc["rec"] + ".fa",
                realname=element,
                sequence=str(aligned_seq.seq),
            )
        xmw.end()
242 | |
243 | |
if __name__ == "__main__":
    # CLI entry point: wire the file arguments straight into blast2pxmfa().
    ap = argparse.ArgumentParser(
        description="Convert Blast TSV to protein XMFA", epilog=""
    )
    ap.add_argument("blast", type=argparse.FileType("r"), help="Blast TSV Output")
    ap.add_argument("fasta", type=argparse.FileType("r"), help="Blast Input Fasta")
    ap.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 Gene Calls")
    ap.add_argument(
        "--genomic",
        action="store_true",
        help="Further reduce protein results into genomic-level results.",
    )
    ap.add_argument(
        "output", type=argparse.FileType("w"), help="Output file or - for stdout"
    )
    cli = ap.parse_args()
    blast2pxmfa(cli.blast, cli.fasta, cli.gff3, cli.output, genomic=cli.genomic)