Mercurial > repos > cpt > cpt_xmfa_split
changeset 1:5d9bc33ec5d3 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:54:34 +0000 |
parents | 21d00cf83137 |
children | e4207a7661e7 |
files | cpt-macros.xml cpt_xmfa_split/cpt-macros.xml cpt_xmfa_split/lcb_split.py cpt_xmfa_split/lcb_split.xml cpt_xmfa_split/macros.xml lcb_split.py lcb_split.xml macros.xml |
diffstat | 8 files changed, 318 insertions(+), 306 deletions(-) [+] |
line wrap: on
line diff
<macros>
    <!-- cpt-macros.xml: requirement and citation macros shared by CPT Galaxy
         tools (file added at the repository root in changeset 5d9bc33ec5d3).
         BibTeX author lists below use "and" between people; a comma would be
         parsed by BibTeX as a single "Last, First" name. -->

    <!-- Pinned packages for GFF-handling tools. Callers can append further
         requirement elements through the yield slot. The version command
         reports the git HEAD of the tool directory. -->
    <xml name="gff_requirements">
        <requirements>
            <requirement type="package" version="2.7">python</requirement>
            <requirement type="package" version="1.65">biopython</requirement>
            <requirement type="package" version="2.12.1">requests</requirement>
            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
            <yield/>
        </requirements>
        <version_command>
            <![CDATA[
                cd '$__tool_directory__' && git rev-parse HEAD
            ]]>
        </version_command>
    </xml>

    <!-- Bare citation pair (no citations wrapper) for use inside an existing
         citations element. -->
    <xml name="citation/mijalisrasche">
        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
        <citation type="bibtex">@unpublished{galaxyTools,
            author = {E. Mijalis and H. Rasche},
            title = {CPT Galaxy Tools},
            year = {2013-2017},
            note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
        </citation>
    </xml>

    <!-- Default citations block: Mijalis and Rasche. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {E. Mijalis and H. Rasche},
                title = {CPT Galaxy Tools},
                year = {2013-2017},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <yield/>
        </citations>
    </xml>

    <!-- Citations block: C. Ross. -->
    <xml name="citations-crr">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {C. Ross},
                title = {CPT Galaxy Tools},
                year = {2020-},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <yield/>
        </citations>
    </xml>

    <!-- Citations block: Mijalis and Rasche plus A. Criscione. -->
    <xml name="citations-2020">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {E. Mijalis and H. Rasche},
                title = {CPT Galaxy Tools},
                year = {2013-2017},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {A. Criscione},
                title = {CPT Galaxy Tools},
                year = {2019-2021},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <yield/>
        </citations>
    </xml>

    <!-- Citations block: A. Criscione only. -->
    <xml name="citations-2020-AJC-solo">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {A. Criscione},
                title = {CPT Galaxy Tools},
                year = {2019-2021},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <yield/>
        </citations>
    </xml>

    <!-- Citations block: C. Maughmer. -->
    <xml name="citations-clm">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
                @unpublished{galaxyTools,
                author = {C. Maughmer},
                title = {CPT Galaxy Tools},
                year = {2017-2020},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
                }
            </citation>
            <yield/>
        </citations>
    </xml>

    <!-- Single C. Maughmer citation without a citations wrapper. -->
    <xml name="sl-citations-clm">
        <citation type="bibtex">
            @unpublished{galaxyTools,
            author = {C. Maughmer},
            title = {CPT Galaxy Tools},
            year = {2017-2020},
            note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
        </citation>
        <yield/>
    </xml>
</macros>
--- a/cpt_xmfa_split/cpt-macros.xml Tue Jul 05 05:19:47 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. 
Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_xmfa_split/lcb_split.py Tue Jul 05 05:19:47 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -#!/usr/bin/env python -import argparse -import copy -import logging -import xmfa -from itertools import groupby - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger(__name__) - - -def split_lcb(lcb, window_size=10, threshold=0.7): - # Transpose sequence - lines = [] - max_align_num = len(lcb[0]["seq"]) - for i in range(max_align_num): - lines.append([]) - for j in range(len(lcb)): - c = lcb[j]["seq"][i] - if c != "-": - lines[i].append(j) - - count_groups = [] - for i in range(0, len(lines), window_size): - current_lines = lines[i : i + window_size] - flat_list = [a for b in current_lines for a in b] - counts = [] - for i in range(len(lcb)): - value = float(flat_list.count(i)) / window_size - if value >= threshold: - counts.append(i) - count_groups.append(counts) - - # groups = [(next(j), len(list(j)) + 1) for i, j in ] - # [([4], 2), ([2, 3, 4, 5, 6], 2), ([0, 1, 2, 3, 4, 5, 6], 14), ([0, 3], 1)] - # This says for 2 window sizes, we emit a new LCB with just [0:10] and - # [10:20] for lcb #4, then one with all but 0/1 for 2, then all for 14. 
- new_lcbs = [] - position = 0 - for i, j in groupby(count_groups): - tmp = list(j) - count = len(tmp) - members = tmp[0] - local_members = [] - for member in members: - tmp_member = copy.deepcopy(lcb[member]) - tmp_member["seq"] = tmp_member["seq"][ - window_size * position : window_size * (position + count) - ] - tmp_member["start"] = tmp_member["start"] + (3 * window_size * position) - tmp_member["end"] = tmp_member["start"] + (3 * window_size * count) - local_members.append(tmp_member) - if len(local_members) > 0: - new_lcbs.append(local_members) - - position += count - return new_lcbs - - -def split_lcbs(lcbs, window_size=10, threshold=100): - new_lcbs = [] - for lcb in lcbs: - new_lcbs.extend(split_lcb(lcb, window_size=window_size, threshold=threshold)) - return new_lcbs - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Split XMFA alignments", prog="xmfa2smallerXmfa" - ) - parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File") - - parser.add_argument( - "--window_size", type=int, help="Window size for analysis", default=10 - ) - parser.add_argument( - "--threshold", - type=float, - help="All genomes must meet N percent similarity", - default=0.7, - ) - - args = parser.parse_args() - - # Write - xmfa.to_xmfa( - # Split - split_lcbs( - # Parse - xmfa.parse_xmfa(args.xmfa_file), - window_size=args.window_size, - threshold=args.threshold, - ) - )
--- a/cpt_xmfa_split/lcb_split.xml Tue Jul 05 05:19:47 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0"> - <description></description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/lcb_split.py -@XMFA_INPUT@ ---window_size $window_size ---threshold $threshold -> $output -]]></command> - <inputs> - <expand macro="xmfa_input" /> - <param type="integer" name="window_size" value="10" label="Default window size generating smaller LCBs" /> - <param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs" /> - </inputs> - <outputs> - <data format="xmfa" name="output" /> - </outputs> - <help><![CDATA[ -**What it does** - -Helps reduce large and non-sensical protein LCBs into real protein alignments. - -**WARNING** - -Probably does not work if you have - strand genes. Need to test. - -]]></help> -<!-- TODO --> - <expand macro="citations" /> -</tool>
--- a/cpt_xmfa_split/macros.xml Tue Jul 05 05:19:47 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package">progressivemauve</requirement> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <token name="@WRAPPER_VERSION@">2.4.0</token> - <xml name="citation/progressive_mauve"> - <citation type="doi">10.1371/journal.pone.0011147</citation> - </xml> - <xml name="citation/gepard"> - <citation type="doi">10.1093/bioinformatics/btm039</citation> - </xml> - - <token name="@XMFA_INPUT@"> - "$xmfa" - </token> - <xml name="xmfa_input" - token_formats="xmfa"> - <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" /> - </xml> - - <token name="@XMFA_FA_INPUT@"> - "$sequences" - </token> - <xml name="xmfa_fa_input"> - <param type="data" format="fasta" name="sequences" label="Sequences in alignment" - help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. 
Failing that, they should be provided in the same order as in original progressiveMauve run"/> - - </xml> - <xml name="genome_selector"> - <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> - </xml> - <xml name="gff3_input"> - <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> - </xml> - <xml name="input/gff3+fasta"> - <expand macro="gff3_input" /> - <expand macro="genome_selector" /> - </xml> - <token name="@INPUT_GFF@"> - "$gff3_data" - </token> - <token name="@INPUT_FASTA@"> - genomeref.fa - </token> - <token name="@GENOME_SELECTOR_PRE@"> - ln -s $genome_fasta genomeref.fa; - </token> - <token name="@GENOME_SELECTOR@"> - genomeref.fa - </token> - <xml name="input/fasta"> - <param label="Fasta file" name="sequences" type="data" format="fasta"/> - </xml> -</macros>
#!/usr/bin/env python
# lcb_split.py: split the LCBs of an XMFA alignment into smaller LCBs
# (file added at the repository root in changeset 5d9bc33ec5d3).
import argparse
import copy
import logging
from collections import Counter
from itertools import groupby

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def split_lcb(lcb, window_size=10, threshold=0.7):
    """Split one LCB into smaller LCBs according to per-window gap coverage.

    The alignment is cut into consecutive windows of ``window_size`` columns.
    For every window, a sequence is a "member" when the fraction of the
    window's columns in which it has a non-gap character is at least
    ``threshold``. Consecutive windows with the same member list are merged
    into one new LCB that contains only those members.

    :param lcb: list of dicts, each with at least the keys ``"seq"`` (aligned
        string, ``"-"`` marks a gap), ``"start"`` and ``"end"``. All ``seq``
        values are indexed up to the length of the first one.
    :param window_size: number of alignment columns per window.
    :param threshold: minimum fraction (0-1) of non-gap columns in a window
        for a sequence to be kept in that window.
    :returns: list of new LCBs (each a list of deep-copied, trimmed entries);
        windows with no member produce nothing. The input is not modified.
    """
    # An LCB without sequences has nothing to split (lcb[0] would fail).
    if not lcb:
        return []

    align_len = len(lcb[0]["seq"])

    # Transpose: for each alignment column, the indices of the sequences
    # that have a residue (not a gap) in that column.
    columns = [
        [idx for idx, entry in enumerate(lcb) if entry["seq"][col] != "-"]
        for col in range(align_len)
    ]

    # Member list of every window, in alignment order.
    window_members = []
    for begin in range(0, align_len, window_size):
        window = columns[begin : begin + window_size]
        present = Counter(idx for column in window for idx in column)
        # Score against the columns the window really has, so a short
        # trailing window is not penalised for columns that do not exist.
        width = float(len(window))
        window_members.append(
            [idx for idx in range(len(lcb)) if present[idx] / width >= threshold]
        )

    # Merge runs of identical member lists, e.g.
    # [([4], 2), ([2, 3, 4, 5, 6], 2), ([0, 1, 2, 3, 4, 5, 6], 14), ([0, 3], 1)]
    # means: two windows with only sequence 4, then two windows with
    # sequences 2-6, then fourteen windows with everything, and so on.
    new_lcbs = []
    position = 0  # number of windows consumed so far
    for members, run in groupby(window_members):
        count = len(list(run))
        col_start = window_size * position
        col_end = window_size * (position + count)
        local_members = []
        for member in members:
            piece = copy.deepcopy(lcb[member])
            piece["seq"] = piece["seq"][col_start:col_end]
            # Each alignment column is counted as 3 coordinate units
            # (presumably one amino acid = 3 nucleotides; confirm).
            # NOTE(review): the offset counts columns, not residues, so gaps
            # before the slice are included; confirm this is intended.
            piece["start"] = piece["start"] + 3 * col_start
            # Use the real slice length so the last, possibly short, window
            # does not push "end" past the data actually emitted.
            piece["end"] = piece["start"] + 3 * len(piece["seq"])
            local_members.append(piece)
        if local_members:
            new_lcbs.append(local_members)
        position += count
    return new_lcbs


def split_lcbs(lcbs, window_size=10, threshold=0.7):
    """Apply :func:`split_lcb` to every LCB and concatenate the results.

    :param lcbs: iterable of LCBs (see :func:`split_lcb` for the layout).
    :param window_size: number of alignment columns per window.
    :param threshold: coverage fraction (0-1); the default matches
        :func:`split_lcb` and the command-line default.
    :returns: flat list of all new LCBs, in input order.
    """
    new_lcbs = []
    for lcb in lcbs:
        new_lcbs.extend(split_lcb(lcb, window_size=window_size, threshold=threshold))
    return new_lcbs


if __name__ == "__main__":
    # Project-local XMFA reader/writer; only the command-line path needs it,
    # so the splitting functions stay importable without it.
    import xmfa

    parser = argparse.ArgumentParser(
        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
    )
    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")

    parser.add_argument(
        "--window_size", type=int, help="Window size for analysis", default=10
    )
    parser.add_argument(
        "--threshold",
        type=float,
        help="Minimum fraction (0-1) of a window's columns in which a genome "
        "must be ungapped to stay in the new LCB",
        default=0.7,
    )

    args = parser.parse_args()

    # Parse -> split -> write
    xmfa.to_xmfa(
        split_lcbs(
            xmfa.parse_xmfa(args.xmfa_file),
            window_size=args.window_size,
            threshold=args.threshold,
        )
    )
<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0">
    <!-- lcb_split.xml: Galaxy wrapper around lcb_split.py (file added at the
         repository root in changeset 5d9bc33ec5d3). -->
    <description/>
    <macros>
        <import>macros.xml</import>
        <import>cpt-macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Only the script path is quoted. Quoting the interpreter together with
         the path would make the shell look for one executable whose name is
         the whole string. The split alignment is written to stdout and
         redirected into the output dataset. -->
    <command detect_errors="aggressive"><![CDATA[
python '$__tool_directory__/lcb_split.py'
@XMFA_INPUT@
--window_size '$window_size'
--threshold '$threshold'
> '$output'
]]></command>
    <inputs>
        <expand macro="xmfa_input"/>
        <param type="integer" name="window_size" value="10" label="Default window size generating smaller LCBs"/>
        <param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs"/>
    </inputs>
    <outputs>
        <data format="xmfa" name="output"/>
    </outputs>
    <help><![CDATA[
**What it does**

Helps reduce large and non-sensical protein LCBs into real protein alignments.

**WARNING**

Probably does not work if you have - strand genes. Need to test.

]]></help>
    <!-- TODO -->
    <expand macro="citations"/>
</tool>
<macros>
    <!-- macros.xml: requirements, tokens and input macros for the XMFA tools
         (file added at the repository root in changeset 5d9bc33ec5d3). -->

    <!-- Packages needed by the wrappers; callers may add more through the
         yield slot. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package">progressivemauve</requirement>
            <!--<requirement type="package" version="2.7">python</requirement>-->
            <requirement type="package" version="0.6.4">bcbiogff</requirement>
            <yield/>
        </requirements>
    </xml>

    <token name="@WRAPPER_VERSION@">2.4.0</token>

    <xml name="citation/progressive_mauve">
        <citation type="doi">10.1371/journal.pone.0011147</citation>
    </xml>
    <xml name="citation/gepard">
        <citation type="doi">10.1093/bioinformatics/btm039</citation>
    </xml>

    <!-- XMFA alignment input: the parameter and its quoted command-line
         form. The accepted format is xmfa unless the caller passes another
         value for the formats token. -->
    <token name="@XMFA_INPUT@">
        '$xmfa'
    </token>
    <xml name="xmfa_input" token_formats="xmfa">
        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
    </xml>

    <!-- FASTA sequences that accompany an XMFA alignment. -->
    <token name="@XMFA_FA_INPUT@">
        '$sequences'
    </token>
    <xml name="xmfa_fa_input">
        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
    </xml>

    <!-- Reference genome chooser: a FASTA dataset from the history (default)
         or an entry of the all_fasta data table. -->
    <xml name="genome_selector">
        <conditional name="reference_genome">
            <param name="reference_genome_source" type="select" label="Reference Genome">
                <option value="history" selected="True">From History</option>
                <option value="cached">Locally Cached</option>
            </param>
            <when value="cached">
                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
                    <options from_data_table="all_fasta"/>
                </param>
            </when>
            <when value="history">
                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
            </when>
        </conditional>
    </xml>

    <xml name="gff3_input">
        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
    </xml>
    <xml name="input/gff3+fasta">
        <expand macro="gff3_input"/>
        <expand macro="genome_selector"/>
    </xml>
    <token name="@INPUT_GFF@">
        '$gff3_data'
    </token>

    <!-- Path of the selected reference FASTA: the cached file itself, or the
         genomeref.fa link that the PRE token below creates for a history
         dataset. -->
    <token name="@INPUT_FASTA@">
        #if str($reference_genome.reference_genome_source) == 'cached':
            '${reference_genome.fasta_indexes.fields.path}'
        #else if str($reference_genome.reference_genome_source) == 'history':
            genomeref.fa
        #end if
    </token>

    <!-- Must run before the FASTA tokens are used: links a history dataset to
         genomeref.fa. The select value is compared through str(), like the
         two sibling tokens do. -->
    <token name="@GENOME_SELECTOR_PRE@">
        #if str($reference_genome.reference_genome_source) == 'history':
            ln -s '$reference_genome.genome_fasta' genomeref.fa;
        #end if
    </token>

    <!-- Same expansion as the INPUT_FASTA token above. -->
    <token name="@GENOME_SELECTOR@">
        #if str($reference_genome.reference_genome_source) == 'cached':
            '${reference_genome.fasta_indexes.fields.path}'
        #else if str($reference_genome.reference_genome_source) == 'history':
            genomeref.fa
        #end if
    </token>
</macros>