cpt / cpt_cluster_lcbs: comparison of cluster_lcbs.py @ 1:e6d8cdb65df0 (draft)

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c

author   | cpt
date     | Mon, 05 Jun 2023 02:40:20 +0000
parents  |
children |
compared revisions: 0:85cd5feab3b7 | 1:e6d8cdb65df0

#!/usr/bin/env python
from Bio import SeqIO
import tempfile
import sys
import argparse


def parse_xmfa(xmfa):
    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544"""
    current_lcb = []
    current_seq = {}
    for line in xmfa.readlines():
        if line.startswith("#"):
            continue

        if line.strip() == "=":
            if "id" in current_seq:
                current_lcb.append(current_seq)
                current_seq = {}
            yield current_lcb
            current_lcb = []
        else:
            line = line.strip()
            if line.startswith(">"):
                if "id" in current_seq:
                    current_lcb.append(current_seq)
                    current_seq = {}
                data = line.strip().split()
                # 0 1 2 3 4 5
                # > 1:5986-6406 + CbK.fa # CbK_gp011
                id, loc = data[1].split(":")
                start, end = loc.split("-")
                current_seq = {
                    "rid": "_".join(data[1:]),
                    "id": id,
                    "start": int(start),
                    "end": int(end),
                    "strand": 1 if data[2] == "+" else -1,
                    "file": data[3],
                    "seq": "",
                    "comment": "",
                }
                if len(data) > 5:
                    current_seq["comment"] = " ".join(data[5:])
            else:
                current_seq["seq"] += line.strip()


HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
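# Renders one XMFA sequence header, e.g. "> 1:5986-6406 + CbK.fa # CbK_gp011"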


def split_by_n(seq, n):
    """A generator to divide a sequence into chunks of n units."""
    # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
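    # e.g. list(split_by_n("ACGTACG", 3)) -> ["ACG", "TAC", "G"]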
    while seq:
        yield seq[:n]
        seq = seq[n:]


def to_xmfa(lcbs, handle=sys.stdout):
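    # Write the LCBs back out as XMFA: a Mauve format line, then each aligned
    # sequence header and its sequence wrapped at 80 columns, with "=" closing
    # every block.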
    handle.write("#FormatVersion Mauve1\n")
    for lcb in lcbs:
        for aln in lcb:
            handle.write(
                HEADER_TPL.format(
                    id=aln["id"],
                    start=aln["start"],
                    end=aln["end"],
                    strand="+" if aln["strand"] > 0 else "-",
                    file=aln["file"],
                    comment=aln["comment"],
                )
            )

            for line in split_by_n(aln["seq"], 80):
                handle.write(line + "\n")
        handle.write("=\n")


def percent_identity(a, b):
    """Calculate % identity, ignoring gaps in the host sequence"""
    match = 0
    mismatch = 0
    for char_a, char_b in zip(list(a), list(b)):
        if char_a == "-":
            continue
        if char_a == char_b:
            match += 1
        else:
            mismatch += 1

    if match + mismatch == 0:
        return 0.0
    return 100 * float(match) / (match + mismatch)


def id_tn_dict(sequences, tmpfile=False):
    """Figure out sequence IDs"""
    label_convert = {}
    correct_chrom = None
    if not isinstance(sequences, list):
        sequences = [sequences]

    i = 0
    for sequence_file in sequences:
        for record in SeqIO.parse(sequence_file, "fasta"):
            if correct_chrom is None:
                correct_chrom = record.id

            i += 1
            key = str(i)
            label_convert[key] = {"record_id": record.id, "len": len(record.seq)}

            if tmpfile:
                label_convert[key] = tempfile.NamedTemporaryFile(delete=False)

    return label_convert


def filter_lcbs_for_seq(xmfa):
    """clusters lcbs based on which sequences they involve"""
    strand_info = {"1": "+", "-1": "-"}
    clusters = {}

    for i in list(parse_xmfa(xmfa)):
        cluster_name = ""

        for g in i:
            cluster_name += g["id"] + strand_info[str(g["strand"])]
        # allow clusters with all opposite strands to be together (alt name is opposite strand of orig)
        alt_name = cluster_name.replace("+", "*").replace("-", "+").replace("*", "-")

        orig_not_in_clusters = cluster_name not in clusters
        alt_not_in_clusters = alt_name not in clusters

        if orig_not_in_clusters and alt_not_in_clusters:
            # if original or alternate names not already in clusters
            clusters[cluster_name] = [i]
        else:
            if not orig_not_in_clusters:  # if original name is already in clusters
                clusters[cluster_name].append(i)
            if not alt_not_in_clusters:  # if alt name is already in clusters
                clusters[alt_name].append(i)

    return clusters
    # to_xmfa(clusters['123456'])


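# Merge two LCBs position by position: assumes both cover the same sequences in the
# same order, keeps the widest start/end span and concatenates the aligned sequences.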
def merge_lcbs(lcb1, lcb2):
    for num, i in enumerate(lcb1):
        i["start"] = min([i["start"], lcb2[num]["start"]])
        i["end"] = max([i["end"], lcb2[num]["end"]])
        i["seq"] += lcb2[num]["seq"]

    return lcb1


def resolve_clusters(clusters):
    merged = []
    for lcbs in clusters:
        if len(lcbs) == 1:
            merged.append(lcbs[0])
            continue
        merging = lcbs[0]
        for lcb in lcbs[1:]:
            merging = merge_lcbs(merging, lcb)
        merged.append(merging)

    return merged


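# True if this LCB has not yet been placed into any cluster.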
def new(clusters, lcb):
    new = True
    for c in clusters:
        if lcb in c:
            new = False
    return new


def cluster_lcbs(lcbs, threshold):
    """clusters lcbs based on how far apart they are"""

    clusters = []
    for o, i in enumerate(lcbs):
        cluster = []

        if not new(clusters, i):
            continue

        cluster.append(i)
        compare_against = i

        for n, j in enumerate(lcbs):

            if not new(clusters, j) or i == j or compare_against == j:
                continue

            close = True
            for num, k in enumerate(compare_against):
                # for num, k in enumerate(i):
                if j[num]["start"] - k["end"] > threshold:
                    close = False

            if close:
                cluster.append(j)
                compare_against = j

        clusters.append(cluster)
    return resolve_clusters(clusters)


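# Example invocation (hypothetical file names):
#   python cluster_lcbs.py input.xmfa 1000 > clustered.xmfa
# groups LCBs that involve the same sequences/strands, chains LCBs lying within
# 1000 bp of each other, and writes the merged blocks to stdout.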
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="process XMFA")
    parser.add_argument("xmfa", type=argparse.FileType("r"), help="XMFA file")
    parser.add_argument(
        "threshold",
        type=int,
        help="maximum number of nucleotides between lcbs in a cluster",
    )
    args = parser.parse_args()

    # assuming lcbs are filtered
    final_lcbs = []
    lcbs_filtered_for_seq = filter_lcbs_for_seq(args.xmfa)
    for i in lcbs_filtered_for_seq:
        final_lcbs += cluster_lcbs(lcbs_filtered_for_seq[i], args.threshold)
    to_xmfa(final_lcbs)