# HG changeset patch
# User cpt
# Date 1656998636 0
# Node ID 85cd5feab3b7180d542923ac0f0c54d1a06ae000
Uploaded
diff -r 000000000000 -r 85cd5feab3b7 cpt_cluster_lcbs/cluster_lcbs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_cluster_lcbs/cluster_lcbs.py Tue Jul 05 05:23:56 2022 +0000
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import tempfile
+import sys
+import argparse
+
+
+def parse_xmfa(xmfa):
+ """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
+ """
+ current_lcb = []
+ current_seq = {}
+ for line in xmfa.readlines():
+ if line.startswith("#"):
+ continue
+
+ if line.strip() == "=":
+ if "id" in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ yield current_lcb
+ current_lcb = []
+ else:
+ line = line.strip()
+ if line.startswith(">"):
+ if "id" in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ data = line.strip().split()
+ # 0 1 2 3 4 5
+ # > 1:5986-6406 + CbK.fa # CbK_gp011
+                seq_id, loc = data[1].split(":")
+                start, end = loc.split("-")
+                current_seq = {
+                    "rid": "_".join(data[1:]),
+                    "id": seq_id,
+ "start": int(start),
+ "end": int(end),
+ "strand": 1 if data[2] == "+" else -1,
+ "file": data[3],
+ "seq": "",
+ "comment": "",
+ }
+ if len(data) > 5:
+ current_seq["comment"] = " ".join(data[5:])
+ else:
+ current_seq["seq"] += line.strip()
+
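+# Illustrative sketch (not part of the original tool): for the header line
+# "> 1:5986-6406 + CbK.fa # CbK_gp011" followed by sequence lines, parse_xmfa
+# builds one dict per alignment,
+#   {"rid": "1:5986-6406_+_CbK.fa_#_CbK_gp011", "id": "1", "start": 5986,
+#    "end": 6406, "strand": 1, "file": "CbK.fa", "seq": "...", "comment": "CbK_gp011"},
+# and yields one list of such dicts per LCB when the closing "=" is reached.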
+
+HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
+
+
+def split_by_n(seq, n):
+ """A generator to divide a sequence into chunks of n units."""
+ # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
+ while seq:
+ yield seq[:n]
+ seq = seq[n:]
+
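+# For example, list(split_by_n("ACGTACGTAC", 4)) == ["ACGT", "ACGT", "AC"];
+# to_xmfa below uses n=80 to wrap alignment rows.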
+
+def to_xmfa(lcbs, handle=sys.stdout):
+ handle.write("#FormatVersion Mauve1\n")
+ for lcb in lcbs:
+ for aln in lcb:
+ handle.write(
+ HEADER_TPL.format(
+ id=aln["id"],
+ start=aln["start"],
+ end=aln["end"],
+ strand="+" if aln["strand"] > 0 else "-",
+ file=aln["file"],
+ comment=aln["comment"],
+ )
+ )
+
+ for line in split_by_n(aln["seq"], 80):
+ handle.write(line + "\n")
+ handle.write("=\n")
+
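+# Sketch of the emitted format (mirrors HEADER_TPL above): each alignment in
+# an LCB is written as a header plus 80-column sequence lines, and a lone "="
+# closes the LCB, e.g.
+#   > 1:5986-6406 + CbK.fa # CbK_gp011
+#   ATGACC...
+#   =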
+
+def percent_identity(a, b):
+ """Calculate % identity, ignoring gaps in the host sequence
+ """
+ match = 0
+ mismatch = 0
+    for char_a, char_b in zip(a, b):
+        if char_a == "-":
+            # skip alignment columns that are gaps in the host sequence
+            continue
+ if char_a == char_b:
+ match += 1
+ else:
+ mismatch += 1
+
+ if match + mismatch == 0:
+ return 0.0
+ return 100 * float(match) / (match + mismatch)
+
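+# Worked example: percent_identity("A-CGT", "ATCGA") skips the column gapped
+# in the host sequence a, counts A/A, C/C, G/G as matches and T/A as a
+# mismatch, and returns 100 * 3 / 4 = 75.0.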
+
+def id_tn_dict(sequences, tmpfile=False):
+ """Figure out sequence IDs
+ """
+ label_convert = {}
+ correct_chrom = None
+ if not isinstance(sequences, list):
+ sequences = [sequences]
+
+ i = 0
+ for sequence_file in sequences:
+ for record in SeqIO.parse(sequence_file, "fasta"):
+ if correct_chrom is None:
+ correct_chrom = record.id
+
+ i += 1
+ key = str(i)
+ label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
+
+ if tmpfile:
+ label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
+
+ return label_convert
+
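+# Illustrative (hypothetical record names): a FASTA with records "CbK"
+# (42000 bp) and "phiX" (5386 bp) maps to
+#   {"1": {"record_id": "CbK", "len": 42000},
+#    "2": {"record_id": "phiX", "len": 5386}}
+# With tmpfile=True each value is a NamedTemporaryFile handle instead.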
+
+def filter_lcbs_for_seq(xmfa):
+ """ clusters lcbs based on which sequences they involve """
+ strand_info = {"1": "+", "-1": "-"}
+ clusters = {}
+
+    for i in parse_xmfa(xmfa):
+ cluster_name = ""
+
+ for g in i:
+ cluster_name += g["id"] + strand_info[str(g["strand"])]
+ # allow clusters with all opposite strands to be together (alt name is opposite strand of orig)
+ alt_name = cluster_name.replace("+", "*").replace("-", "+").replace("*", "-")
+
+        if cluster_name not in clusters and alt_name not in clusters:
+            clusters[cluster_name] = [i]
+        else:
+            if cluster_name in clusters:
+                clusters[cluster_name].append(i)
+            if alt_name in clusters:
+                clusters[alt_name].append(i)
+
+ return clusters
+
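+# Sketch of the key scheme: an LCB spanning sequences 1(+), 2(-), 3(+) gets
+# the key "1+2-3+", and its strand-flipped form "1-2+3-" is treated as the
+# same cluster, so fully reverse-complemented LCBs are grouped together.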
+
+def merge_lcbs(lcb1, lcb2):
+    """Fold lcb2 into lcb1 entry-by-entry: take the union of the coordinates
+    and concatenate the aligned sequences."""
+    for num, i in enumerate(lcb1):
+        i["start"] = min(i["start"], lcb2[num]["start"])
+        i["end"] = max(i["end"], lcb2[num]["end"])
+        i["seq"] += lcb2[num]["seq"]
+
+    return lcb1
+
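+# Illustrative: merging an entry with start=10, end=50, seq="AAA" and one
+# with start=60, end=90, seq="CCC" yields start=10, end=90, seq="AAACCC".
+# Entries are paired positionally, so both LCBs must order their per-genome
+# alignments identically.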
+
+def resolve_clusters(clusters):
+ merged = []
+ for lcbs in clusters:
+ if len(lcbs) == 1:
+ merged.append(lcbs[0])
+ continue
+ merging = lcbs[0]
+ for lcb in lcbs[1:]:
+ merging = merge_lcbs(merging, lcb)
+ merged.append(merging)
+
+ return merged
+
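+# Illustrative: for clusters [[A], [B, C]], the singleton A passes through
+# unchanged while C is folded into B via merge_lcbs, giving [A, merge(B, C)].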
+
+def new(clusters, lcb):
+    """Return True if lcb is not already a member of any existing cluster."""
+    for c in clusters:
+        if lcb in c:
+            return False
+    return True
+
+
+def cluster_lcbs(lcbs, threshold):
+ """ clusters lcbs based on how far apart they are"""
+
+ clusters = []
+    for i in lcbs:
+ cluster = []
+
+ if not new(clusters, i):
+ continue
+
+ cluster.append(i)
+ compare_against = i
+
+        for j in lcbs:
+            if not new(clusters, j) or i == j or compare_against == j:
+                continue
+
+            close = True
+            for num, k in enumerate(compare_against):
+                if j[num]["start"] - k["end"] > threshold:
+                    close = False
+
+ if close:
+ cluster.append(j)
+ compare_against = j
+
+ clusters.append(cluster)
+ return resolve_clusters(clusters)
+
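+# Example of the distance test (illustrative): with threshold=1000, an LCB
+# ending at 6406 in every genome and one starting at 7000 in every genome
+# are close (7000 - 6406 <= 1000) and join one cluster; a gap over 1000 nt
+# in any single genome keeps them apart.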
+
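+# Example invocation (assuming an XMFA produced by progressiveMauve):
+#   python cluster_lcbs.py alignment.xmfa 1000 > clustered.xmfa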
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="process XMFA")
+ parser.add_argument("xmfa", type=argparse.FileType("r"), help="XMFA file")
+ parser.add_argument(
+ "threshold",
+ type=int,
+ help="maximum number of nucleotides between lcbs in a cluster",
+ )
+ args = parser.parse_args()
+
+    # group LCBs by sequence content, then cluster each group by distance
+ final_lcbs = []
+ lcbs_filtered_for_seq = filter_lcbs_for_seq(args.xmfa)
+ for i in lcbs_filtered_for_seq:
+ final_lcbs += cluster_lcbs(lcbs_filtered_for_seq[i], args.threshold)
+ to_xmfa(final_lcbs)
diff -r 000000000000 -r 85cd5feab3b7 cpt_cluster_lcbs/cluster_lcbs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_cluster_lcbs/cluster_lcbs.xml Tue Jul 05 05:23:56 2022 +0000
@@ -0,0 +1,34 @@
+
+
+
+
+ macros.xml
+ cpt-macros.xml
+
+
+ $output
+]]>
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 85cd5feab3b7 cpt_cluster_lcbs/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_cluster_lcbs/cpt-macros.xml Tue Jul 05 05:23:56 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 85cd5feab3b7 cpt_cluster_lcbs/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_cluster_lcbs/macros.xml Tue Jul 05 05:23:56 2022 +0000
@@ -0,0 +1,61 @@
+
+
+
+
+ progressivemauve
+ python
+ biopython
+ cpt_gffparser
+
+
+
+ 2.4.0
+
+ 10.1371/journal.pone.0011147
+
+
+ 10.1093/bioinformatics/btm039
+
+
+
+ "$xmfa"
+
+
+
+
+
+
+ "$sequences"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "$gff3_data"
+
+
+ genomeref.fa
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
+
+
+