# HG changeset patch
# User cpt
# Date 1685932820 0
# Node ID e6d8cdb65df0ba36bdd2e512c440022ffcabb861
# Parent 85cd5feab3b7180d542923ac0f0c54d1a06ae000
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cluster_lcbs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster_lcbs.py Mon Jun 05 02:40:20 2023 +0000
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import tempfile
+import sys
+import argparse
+
+
+def parse_xmfa(xmfa):
+ """Simple XMFA parser until https://github.com/biopython/biopython/pull/544"""
+ current_lcb = []
+ current_seq = {}
+ for line in xmfa.readlines():
+ if line.startswith("#"):
+ continue
+
+ if line.strip() == "=":
+ if "id" in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ yield current_lcb
+ current_lcb = []
+ else:
+ line = line.strip()
+ if line.startswith(">"):
+ if "id" in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ data = line.strip().split()
+ # 0 1 2 3 4 5
+ # > 1:5986-6406 + CbK.fa # CbK_gp011
+ id, loc = data[1].split(":")
+ start, end = loc.split("-")
+ current_seq = {
+ "rid": "_".join(data[1:]),
+ "id": id,
+ "start": int(start),
+ "end": int(end),
+ "strand": 1 if data[2] == "+" else -1,
+ "file": data[3],
+ "seq": "",
+ "comment": "",
+ }
+ if len(data) > 5:
+ current_seq["comment"] = " ".join(data[5:])
+ else:
+ current_seq["seq"] += line.strip()
+
+
+HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
+
+
+def split_by_n(seq, n):
+ """A generator to divide a sequence into chunks of n units."""
+ # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
+ while seq:
+ yield seq[:n]
+ seq = seq[n:]
+
+
+def to_xmfa(lcbs, handle=sys.stdout):
+ handle.write("#FormatVersion Mauve1\n")
+ for lcb in lcbs:
+ for aln in lcb:
+ handle.write(
+ HEADER_TPL.format(
+ id=aln["id"],
+ start=aln["start"],
+ end=aln["end"],
+ strand="+" if aln["strand"] > 0 else "-",
+ file=aln["file"],
+ comment=aln["comment"],
+ )
+ )
+
+ for line in split_by_n(aln["seq"], 80):
+ handle.write(line + "\n")
+ handle.write("=\n")
+
+
+def percent_identity(a, b):
+ """Calculate % identity, ignoring gaps in the host sequence"""
+ match = 0
+ mismatch = 0
+ for char_a, char_b in zip(list(a), list(b)):
+ if char_a == "-":
+ continue
+ if char_a == char_b:
+ match += 1
+ else:
+ mismatch += 1
+
+ if match + mismatch == 0:
+ return 0.0
+ return 100 * float(match) / (match + mismatch)
+
+
+def id_tn_dict(sequences, tmpfile=False):
+ """Figure out sequence IDs"""
+ label_convert = {}
+ correct_chrom = None
+ if not isinstance(sequences, list):
+ sequences = [sequences]
+
+ i = 0
+ for sequence_file in sequences:
+ for record in SeqIO.parse(sequence_file, "fasta"):
+ if correct_chrom is None:
+ correct_chrom = record.id
+
+ i += 1
+ key = str(i)
+ label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
+
+ if tmpfile:
+ label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
+
+ return label_convert
+
+
+def filter_lcbs_for_seq(xmfa):
+ """clusters lcbs based on which sequences they involve"""
+ strand_info = {"1": "+", "-1": "-"}
+ clusters = {}
+
+ for i in list(parse_xmfa(xmfa)):
+ cluster_name = ""
+
+ for g in i:
+ cluster_name += g["id"] + strand_info[str(g["strand"])]
+ # allow clusters with all opposite strands to be together (alt name is opposite strand of orig)
+ alt_name = cluster_name.replace("+", "*").replace("-", "+").replace("*", "-")
+
+ orig_not_in_clusters = cluster_name not in clusters
+ alt_not_in_clusters = alt_name not in clusters
+
+ if orig_not_in_clusters and alt_not_in_clusters:
+ # if original or alternate names not already in clusters
+ clusters[cluster_name] = [i]
+ else:
+ if not orig_not_in_clusters: # if original name is already in clusters
+ clusters[cluster_name].append(i)
+ if not alt_not_in_clusters: # if alt name is already in clusters
+ clusters[alt_name].append(i)
+
+ return clusters
+ # to_xmfa(clusters['123456'])
+
+
+def merge_lcbs(lcb1, lcb2):
+ for num, i in enumerate(lcb1):
+ i["start"] = min([i["start"], lcb2[num]["start"]])
+ i["end"] = max([i["end"], lcb2[num]["end"]])
+ i["seq"] += lcb2[num]["seq"]
+
+ return lcb1
+
+
+def resolve_clusters(clusters):
+ merged = []
+ for lcbs in clusters:
+ if len(lcbs) == 1:
+ merged.append(lcbs[0])
+ continue
+ merging = lcbs[0]
+ for lcb in lcbs[1:]:
+ merging = merge_lcbs(merging, lcb)
+ merged.append(merging)
+
+ return merged
+
+
+def new(clusters, lcb):
+ new = True
+ for c in clusters:
+ if lcb in c:
+ new = False
+ return new
+
+
+def cluster_lcbs(lcbs, threshold):
+ """clusters lcbs based on how far apart they are"""
+
+ clusters = []
+ for o, i in enumerate(lcbs):
+ cluster = []
+
+ if not new(clusters, i):
+ continue
+
+ cluster.append(i)
+ compare_against = i
+
+ for n, j in enumerate(lcbs):
+
+ if not new(clusters, j) or i == j or compare_against == j:
+ continue
+
+ close = True
+ for num, k in enumerate(compare_against):
+ # for num, k in enumerate(i):
+ if j[num]["start"] - k["end"] > threshold:
+ close = False
+
+ if close:
+ cluster.append(j)
+ compare_against = j
+
+ clusters.append(cluster)
+ return resolve_clusters(clusters)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="process XMFA")
+ parser.add_argument("xmfa", type=argparse.FileType("r"), help="XMFA file")
+ parser.add_argument(
+ "threshold",
+ type=int,
+ help="maximum number of nucleotides between lcbs in a cluster",
+ )
+ args = parser.parse_args()
+
+ # assuming lcbs are filtered
+ final_lcbs = []
+ lcbs_filtered_for_seq = filter_lcbs_for_seq(args.xmfa)
+ for i in lcbs_filtered_for_seq:
+ final_lcbs += cluster_lcbs(lcbs_filtered_for_seq[i], args.threshold)
+ to_xmfa(final_lcbs)
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cluster_lcbs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster_lcbs.xml Mon Jun 05 02:40:20 2023 +0000
@@ -0,0 +1,33 @@
+
+
+
+ macros.xml
+ cpt-macros.xml
+
+
+ '$output'
+]]>
+
+
+
+
+
+
+
+
+
+
+
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:40:20 2023 +0000
@@ -0,0 +1,115 @@
+
+
+
+ python
+ biopython
+ requests
+ cpt_gffparser
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cpt_cluster_lcbs/cluster_lcbs.py
--- a/cpt_cluster_lcbs/cluster_lcbs.py Tue Jul 05 05:23:56 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from Bio import SeqIO
-import tempfile
-import sys
-import argparse
-
-
-def parse_xmfa(xmfa):
- """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
- """
- current_lcb = []
- current_seq = {}
- for line in xmfa.readlines():
- if line.startswith("#"):
- continue
-
- if line.strip() == "=":
- if "id" in current_seq:
- current_lcb.append(current_seq)
- current_seq = {}
- yield current_lcb
- current_lcb = []
- else:
- line = line.strip()
- if line.startswith(">"):
- if "id" in current_seq:
- current_lcb.append(current_seq)
- current_seq = {}
- data = line.strip().split()
- # 0 1 2 3 4 5
- # > 1:5986-6406 + CbK.fa # CbK_gp011
- id, loc = data[1].split(":")
- start, end = loc.split("-")
- current_seq = {
- "rid": "_".join(data[1:]),
- "id": id,
- "start": int(start),
- "end": int(end),
- "strand": 1 if data[2] == "+" else -1,
- "file": data[3],
- "seq": "",
- "comment": "",
- }
- if len(data) > 5:
- current_seq["comment"] = " ".join(data[5:])
- else:
- current_seq["seq"] += line.strip()
-
-
-HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
-
-
-def split_by_n(seq, n):
- """A generator to divide a sequence into chunks of n units."""
- # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
- while seq:
- yield seq[:n]
- seq = seq[n:]
-
-
-def to_xmfa(lcbs, handle=sys.stdout):
- handle.write("#FormatVersion Mauve1\n")
- for lcb in lcbs:
- for aln in lcb:
- handle.write(
- HEADER_TPL.format(
- id=aln["id"],
- start=aln["start"],
- end=aln["end"],
- strand="+" if aln["strand"] > 0 else "-",
- file=aln["file"],
- comment=aln["comment"],
- )
- )
-
- for line in split_by_n(aln["seq"], 80):
- handle.write(line + "\n")
- handle.write("=\n")
-
-
-def percent_identity(a, b):
- """Calculate % identity, ignoring gaps in the host sequence
- """
- match = 0
- mismatch = 0
- for char_a, char_b in zip(list(a), list(b)):
- if char_a == "-":
- continue
- if char_a == char_b:
- match += 1
- else:
- mismatch += 1
-
- if match + mismatch == 0:
- return 0.0
- return 100 * float(match) / (match + mismatch)
-
-
-def id_tn_dict(sequences, tmpfile=False):
- """Figure out sequence IDs
- """
- label_convert = {}
- correct_chrom = None
- if not isinstance(sequences, list):
- sequences = [sequences]
-
- i = 0
- for sequence_file in sequences:
- for record in SeqIO.parse(sequence_file, "fasta"):
- if correct_chrom is None:
- correct_chrom = record.id
-
- i += 1
- key = str(i)
- label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
-
- if tmpfile:
- label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
-
- return label_convert
-
-
-def filter_lcbs_for_seq(xmfa):
- """ clusters lcbs based on which sequences they involve """
- strand_info = {"1": "+", "-1": "-"}
- clusters = {}
-
- for i in list(parse_xmfa(xmfa)):
- cluster_name = ""
-
- for g in i:
- cluster_name += g["id"] + strand_info[str(g["strand"])]
- # allow clusters with all opposite strands to be together (alt name is opposite strand of orig)
- alt_name = cluster_name.replace("+", "*").replace("-", "+").replace("*", "-")
-
- orig_not_in_clusters = cluster_name not in clusters
- alt_not_in_clusters = alt_name not in clusters
-
- if orig_not_in_clusters and alt_not_in_clusters:
- # if original or alternate names not already in clusters
- clusters[cluster_name] = [i]
- else:
- if not orig_not_in_clusters: # if original name is already in clusters
- clusters[cluster_name].append(i)
- if not alt_not_in_clusters: # if alt name is already in clusters
- clusters[alt_name].append(i)
-
- return clusters
- # to_xmfa(clusters['123456'])
-
-
-def merge_lcbs(lcb1, lcb2):
- for num, i in enumerate(lcb1):
- i["start"] = min([i["start"], lcb2[num]["start"]])
- i["end"] = max([i["end"], lcb2[num]["end"]])
- i["seq"] += lcb2[num]["seq"]
-
- return lcb1
-
-
-def resolve_clusters(clusters):
- merged = []
- for lcbs in clusters:
- if len(lcbs) == 1:
- merged.append(lcbs[0])
- continue
- merging = lcbs[0]
- for lcb in lcbs[1:]:
- merging = merge_lcbs(merging, lcb)
- merged.append(merging)
-
- return merged
-
-
-def new(clusters, lcb):
- new = True
- for c in clusters:
- if lcb in c:
- new = False
- return new
-
-
-def cluster_lcbs(lcbs, threshold):
- """ clusters lcbs based on how far apart they are"""
-
- clusters = []
- for o, i in enumerate(lcbs):
- cluster = []
-
- if not new(clusters, i):
- continue
-
- cluster.append(i)
- compare_against = i
-
- for n, j in enumerate(lcbs):
-
- if not new(clusters, j) or i == j or compare_against == j:
- continue
-
- close = True
- for num, k in enumerate(compare_against):
- # for num, k in enumerate(i):
- if j[num]["start"] - k["end"] > threshold:
- close = False
-
- if close:
- cluster.append(j)
- compare_against = j
-
- clusters.append(cluster)
- return resolve_clusters(clusters)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="process XMFA")
- parser.add_argument("xmfa", type=argparse.FileType("r"), help="XMFA file")
- parser.add_argument(
- "threshold",
- type=int,
- help="maximum number of nucleotides between lcbs in a cluster",
- )
- args = parser.parse_args()
-
- # assuming lcbs are filtered
- final_lcbs = []
- lcbs_filtered_for_seq = filter_lcbs_for_seq(args.xmfa)
- for i in lcbs_filtered_for_seq:
- final_lcbs += cluster_lcbs(lcbs_filtered_for_seq[i], args.threshold)
- to_xmfa(final_lcbs)
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cpt_cluster_lcbs/cluster_lcbs.xml
--- a/cpt_cluster_lcbs/cluster_lcbs.xml Tue Jul 05 05:23:56 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,34 +0,0 @@
-
-
-
-
- macros.xml
- cpt-macros.xml
-
-
- $output
-]]>
-
-
-
-
-
-
-
-
-
-
-
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cpt_cluster_lcbs/cpt-macros.xml
--- a/cpt_cluster_lcbs/cpt-macros.xml Tue Jul 05 05:23:56 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-
-
-
-
- python
- biopython
- requests
-
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
diff -r 85cd5feab3b7 -r e6d8cdb65df0 cpt_cluster_lcbs/macros.xml
--- a/cpt_cluster_lcbs/macros.xml Tue Jul 05 05:23:56 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-
-
-
-
- progressivemauve
- python
- biopython
- cpt_gffparser
-
-
-
- 2.4.0
-
- 10.1371/journal.pone.0011147
-
-
- 10.1093/bioinformatics/btm039
-
-
-
- "$xmfa"
-
-
-
-
-
-
- "$sequences"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- "$gff3_data"
-
-
- genomeref.fa
-
-
- ln -s $genome_fasta genomeref.fa;
-
-
- genomeref.fa
-
-
-
-
-
diff -r 85cd5feab3b7 -r e6d8cdb65df0 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:40:20 2023 +0000
@@ -0,0 +1,74 @@
+
+
+
+ progressivemauve
+
+ bcbiogff
+
+
+
+ 2.4.0
+
+ 10.1371/journal.pone.0011147
+
+
+ 10.1093/bioinformatics/btm039
+
+
+ '$xmfa'
+
+
+
+
+
+ '$sequences'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ '$gff3_data'
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+
+ #if $reference_genome.reference_genome_source == 'history':
+ ln -s '$reference_genome.genome_fasta' genomeref.fa;
+ #end if
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+