# HG changeset patch # User cpt # Date 1656998387 0 # Node ID 21d00cf831372bcc3912ba231605308b9da2c70d Uploaded diff -r 000000000000 -r 21d00cf83137 cpt_xmfa_split/cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/cpt-macros.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,115 @@ + + + + + python + biopython + requests + + + + + + + + 10.1371/journal.pcbi.1008214 + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + @unpublished{galaxyTools, + author = {C. 
# ---------------------------------------------------------------------------
# Residue of the enclosing Mercurial patch that shared these lines with the
# Python module (tail of the cpt-macros.xml hunk and the lcb_split.py diff
# header). Kept verbatim, as comments, so no patch text is lost.
#
# Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + +
# diff -r 000000000000 -r 21d00cf83137 cpt_xmfa_split/lcb_split.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/lcb_split.py Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,94 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python
"""Split XMFA locally collinear blocks (LCBs) into smaller, gap-consistent LCBs."""
import argparse
import copy
import logging
from itertools import groupby

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def split_lcb(lcb, window_size=10, threshold=0.7):
    """Split one LCB into runs of windows that keep the same set of members.

    The alignment is cut into consecutive windows of ``window_size`` columns.
    In each window a member is "present" when its number of non-gap
    (non ``"-"``) columns divided by ``window_size`` is at least
    ``threshold``. Adjacent windows with an identical list of present members
    are merged, and every merged run that has at least one member becomes a
    new LCB.

    Args:
        lcb: Sequence of member dicts. Each dict must provide ``"seq"``
            (indexable/sliceable aligned sequence), ``"start"`` and ``"end"``.
            The alignment length is taken from the first member.
        window_size: Number of alignment columns per window (>= 1).
        threshold: Minimum non-gap fraction (0-1) for a member to be kept
            in a window.

    Returns:
        A list of new LCBs. Each new LCB is a list of deep copies of the
        retained members with ``"seq"``, ``"start"`` and ``"end"`` rewritten
        for the run. The input is never mutated.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    if not lcb:
        # Nothing to split; the original code raised IndexError here.
        return []

    align_len = len(lcb[0]["seq"])

    # One entry per window: indices of the members that pass the threshold.
    # NOTE: the divisor is always window_size, so a shorter final window needs
    # proportionally more ungapped columns to pass (behaviour kept as-is).
    window_members = []
    for win_start in range(0, align_len, window_size):
        win_stop = min(win_start + window_size, align_len)
        window_members.append(
            [
                idx
                for idx, member in enumerate(lcb)
                if sum(ch != "-" for ch in member["seq"][win_start:win_stop])
                / window_size
                >= threshold
            ]
        )

    # Collapse identical neighbouring windows. For example, the per-window
    # lists [[4], [4], [2, 3], [2, 3], [2, 3]] yield one LCB holding member 4
    # over windows 0-1 and one LCB holding members 2 and 3 over windows 2-4.
    new_lcbs = []
    position = 0  # index of the first window of the current run
    for members, run in groupby(window_members):
        count = sum(1 for _ in run)
        col_lo = window_size * position
        col_hi = window_size * (position + count)

        pieces = []
        for idx in members:
            piece = copy.deepcopy(lcb[idx])
            piece["seq"] = piece["seq"][col_lo:col_hi]
            # Coordinates advance 3 units per alignment column -- presumably
            # amino-acid columns mapped onto nucleotide coordinates; confirm
            # against the XMFA producer.
            piece["start"] = piece["start"] + 3 * col_lo
            # "end" is derived from the window count, not from the actual
            # slice length or the member's original "end".
            piece["end"] = piece["start"] + 3 * window_size * count
            pieces.append(piece)

        if pieces:
            new_lcbs.append(pieces)
        position += count

    return new_lcbs


def split_lcbs(lcbs, window_size=10, threshold=0.7):
    """Apply :func:`split_lcb` to every LCB and concatenate the results.

    Args:
        lcbs: Iterable of LCBs (each a sequence of member dicts).
        window_size: Number of alignment columns per window.
        threshold: Minimum non-gap fraction (0-1) per window. The default
            matches ``split_lcb`` and the command line; the former default of
            100 could never be reached by a fraction and therefore discarded
            every member.

    Returns:
        A flat list of all new LCBs, in input order.
    """
    new_lcbs = []
    for lcb in lcbs:
        new_lcbs.extend(split_lcb(lcb, window_size=window_size, threshold=threshold))
    return new_lcbs


if __name__ == "__main__":
    # Project-local XMFA reader/writer. Only the command line needs it, so it
    # is imported here to keep the splitting functions importable on their own.
    import xmfa

    parser = argparse.ArgumentParser(
        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
    )
    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
    parser.add_argument(
        "--window_size", type=int, help="Window size for analysis", default=10
    )
    parser.add_argument(
        "--threshold",
        type=float,
        help=(
            "Minimum fraction (0-1) of ungapped positions a genome needs "
            "within a window to be kept"
        ),
        default=0.7,
    )
    args = parser.parse_args()

    # Parse -> split -> write.
    parsed_lcbs = xmfa.parse_xmfa(args.xmfa_file)
    smaller_lcbs = split_lcbs(
        parsed_lcbs,
        window_size=args.window_size,
        threshold=args.threshold,
    )
    xmfa.to_xmfa(smaller_lcbs)

# ---------------------------------------------------------------------------
# Residue of the enclosing Mercurial patch that followed the Python module on
# the same line (lcb_split.xml hunk and the start of the macros.xml hunk).
# Kept verbatim, as comments, so no patch text is lost.
#
# diff -r 000000000000 -r 21d00cf83137 cpt_xmfa_split/lcb_split.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/lcb_split.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,36 @@ + + + + + macros.xml + cpt-macros.xml + + + $output +]]> + + + + + + + + + + + +
# diff -r 000000000000 -r 21d00cf83137 cpt_xmfa_split/macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/macros.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,61 @@ + + + + + progressivemauve + python + biopython + cpt_gffparser + + + + 2.4.0 + +
# ---------------------------------------------------------------------------
10.1371/journal.pone.0011147 + + + 10.1093/bioinformatics/btm039 + + + + "$xmfa" + + + + + + + "$sequences" + + + + + + + + + + + + + + + + + "$gff3_data" + + + genomeref.fa + + + ln -s $genome_fasta genomeref.fa; + + + genomeref.fa + + + + +