comparison blast2pxmfa.py @ 1:c66d6978a5f8 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:14:17 +0000
parents
children
comparison
equal deleted inserted replaced
0:84761ea60231 1:c66d6978a5f8
1 #!/usr/bin/env python
2 import os
3 import argparse
4 from CPT_GFFParser import gffParse, gffWrite
5 from Bio import SeqIO
6 from Bio.SeqRecord import SeqRecord
7 from gff3 import feature_lambda, feature_test_true
8 from Bio.Align.Applications import ClustalwCommandline
9 from Bio import AlignIO
10 import tempfile
11 import logging
12
13 log = logging.getLogger()
14 log.setLevel(logging.DEBUG)
15
16
17 def parse_gff3(annotations, genome):
18 annotations.seek(0)
19 genome.seek(0)
20
21 data = {}
22 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
23 for record in gffParse(annotations, base_dict=seq_dict):
24 for feature in feature_lambda(
25 record.features, feature_test_true, {}, subfeatures=False
26 ):
27 data[feature.id] = {
28 "rec": record.id,
29 "loc": feature.location,
30 "seq": feature.extract(record).seq.translate(table=11, cds=False)[0:-1],
31 }
32 return data
33
34
35 def get_seqids(genome):
36 genome.seek(0)
37 recids = []
38 for record in SeqIO.parse(genome, "fasta"):
39 recids.append(record.id)
40 return recids
41
42
43 def gen_relationships(blast):
44 for row in blast:
45 line = row.split("\t")
46 yield {"from": line[0], "to": line[1], "pident": line[2], "evalue": line[10]}
47
48
49 def cluster_relationships(data):
50 generated_clusters_ids = []
51
52 for relationship in data:
53 hit = False
54 relationship_set = set((relationship["from"], relationship["to"]))
55 for idx, cluster in enumerate(generated_clusters_ids):
56 if relationship["from"] in cluster or relationship["to"] in cluster:
57 generated_clusters_ids[idx] = cluster.__or__(relationship_set)
58 hit = True
59 break
60
61 if not hit:
62 generated_clusters_ids.append(relationship_set)
63 return generated_clusters_ids
64
65
66 def align_sequences(SequenceList):
67 # No sense in aligning one sequence.
68 if len(SequenceList) < 2:
69 return None
70
71 # Clustal mangles IDs. Fricking Clustal. Specifically it truncates them
72 # meaning we have to RE-ID every single sequence that goes into it. Lovely.
73 id_map = {}
74 for idx, seq in enumerate(SequenceList):
75 id_map[str(idx)] = seq.id
76 seq.id = str(idx)
77
78 t_fa = tempfile.NamedTemporaryFile(prefix="blastxmfa.", delete=False)
79 t_aln = tempfile.NamedTemporaryFile(prefix="blastxmfa.", delete=False)
80 # Write fasta to file
81 SeqIO.write(SequenceList, str.encode(t_fa.name), "fasta")
82 # Flush & close
83 t_fa.flush()
84 t_fa.close()
85 t_aln.close()
86
87 # Build clustal CLI
88 d = ClustalwCommandline(infile=t_fa.name, outfile=t_aln.name)
89 logging.debug(d)
90 # Call clustal
91 d()
92 # Get our alignment back
93 try:
94 aln = AlignIO.read(t_aln.name, "clustal")
95 # Now we replace the IDs with the correct, full length ones
96 for a in aln:
97 a.id = id_map[a.id]
98 # Cleanup
99 os.unlink(t_fa.name)
100 os.unlink(t_aln.name)
101 return aln
102 except Exception as e:
103 logging.error("%s, %s", e, t_fa.name)
104 os.unlink(t_aln.name)
105 return None
106
107
108 def split_by_n(seq, n):
109 """A generator to divide a sequence into chunks of n units."""
110 # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
111 while seq:
112 yield seq[:n]
113 seq = seq[n:]
114
115
116 def larger_than_one(it):
117 for item in it:
118 if len(item) > 1:
119 yield item
120
121
122 def smaller_than_3n(x, it):
123 THRESH = 3 * x
124 for item in it:
125 if len(item) <= THRESH:
126 yield item
127 else:
128 log.warn(
129 "Cluster with %s (>%s) members, seems excessive", len(item), THRESH
130 )
131
132
133 class XmfaWriter(object):
134 HEADER_TPL = "> {seqnum}:{start}-{end} {strand} {file} # {realname}\n"
135
136 def __init__(self, handle):
137 self.output = handle
138 self.output.write("#FormatVersion Mauve1\n")
139
140 def write(self, seqnum, start, end, strand, file, realname, sequence):
141 self.output.write(
142 self.HEADER_TPL.format(
143 seqnum=seqnum,
144 start=start,
145 end=end,
146 strand=strand,
147 file=file,
148 realname=realname,
149 )
150 )
151
152 for line in split_by_n(sequence, 80):
153 self.output.write(line + "\n")
154
155 def end(self):
156 self.output.write("=\n")
157
158
159 def blast2pxmfa(blast, fasta, gff3, output, genomic=False):
160 logging.info("Parsing sequence")
161 locations = parse_gff3(gff3, fasta)
162 logging.info("Parsed locations, clustering")
163 recids = get_seqids(fasta)
164 logging.info("Found %s records in fasta", len(recids))
165
166 xmw = XmfaWriter(output)
167
168 # First let's generate some blastclust style clusters.
169 clusters = list(
170 smaller_than_3n(
171 len(recids),
172 larger_than_one(cluster_relationships(gen_relationships(blast))),
173 )
174 )
175 logging.debug("%s clusters generated", len(clusters))
176
177 def sortIndexForFeatId(element):
178 # Will not work correctly if genome length is >1mb
179 general = recids.index(locations[element]["rec"]) * 1000000
180 specific = locations[element]["loc"].start
181 return general + specific
182
183 for idx, cluster in enumerate(clusters):
184 logging.debug("Cluster %s/%s, size=%s", idx + 1, len(clusters), len(cluster))
185 # We're considering 1 LCB :: 1 cluster
186 seqs = []
187 for element in cluster:
188 if element not in locations:
189 logging.warning("Could not find this feature %s", element)
190 continue
191
192 sr = SeqRecord(
193 locations[element]["seq"],
194 id=element,
195 description="[{0.start}:{0.end}:{0.strand}]".format(
196 locations[element]["loc"]
197 ),
198 )
199 seqs.append(sr)
200
201 aligned_seqs = align_sequences(seqs)
202 if aligned_seqs is None:
203 logging.error("Error aligning cluster [%s]", "".join(cluster))
204 continue
205
206 sortedCluster = sorted(cluster, key=lambda x: sortIndexForFeatId(x))
207 sortedAligned = sorted(aligned_seqs, key=lambda x: sortIndexForFeatId(x.id))
208 # print(sortedCluster, [x.id for x in sortedAligned])
209
210 # Pre-check the asserts.
211 goodToGo = all(
212 [
213 element == aligned_seq.id
214 for (element, aligned_seq) in zip(sortedCluster, sortedAligned)
215 ]
216 )
217 if not goodToGo:
218 logging.info(
219 "Skipping one grouping: %s != %s",
220 ",".join(sortedCluster),
221 ",".join([x.id for x in sortedAligned]),
222 )
223 # Skip this look
224 continue
225 # print(aligned_seqs)
226 for element, aligned_seq in zip(sortedCluster, sortedAligned):
227 if element not in locations:
228 logging.warning("Could not find this feature %s", element)
229 continue
230
231 eloc = locations[element]
232 xmw.write(
233 seqnum=recids.index(eloc["rec"]) + 1,
234 start=eloc["loc"].start,
235 end=eloc["loc"].end,
236 strand="+" if eloc["loc"].strand == 1 else "-",
237 file=eloc["rec"] + ".fa",
238 realname=element,
239 sequence=str(aligned_seq.seq),
240 )
241 xmw.end()
242
243
244 if __name__ == "__main__":
245 parser = argparse.ArgumentParser(
246 description="Convert Blast TSV to protein XMFA", epilog=""
247 )
248 parser.add_argument("blast", type=argparse.FileType("r"), help="Blast TSV Output")
249 parser.add_argument("fasta", type=argparse.FileType("r"), help="Blast Input Fasta")
250 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 Gene Calls")
251 parser.add_argument(
252 "--genomic",
253 action="store_true",
254 help="Further reduce protein results into genomic-level results.",
255 )
256 parser.add_argument(
257 "output", type=argparse.FileType("w"), help="Output file or - for stdout"
258 )
259 args = parser.parse_args()
260
261 blast2pxmfa(**vars(args))