Mercurial > repos > cpt > cpt_xmfa_split
changeset 0:21d00cf83137 draft
Uploaded
author | cpt |
---|---|
date | Tue, 05 Jul 2022 05:19:47 +0000 |
parents | |
children | 5d9bc33ec5d3 |
files | cpt_xmfa_split/cpt-macros.xml cpt_xmfa_split/lcb_split.py cpt_xmfa_split/lcb_split.xml cpt_xmfa_split/macros.xml |
diffstat | 4 files changed, 306 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/cpt-macros.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. 
Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
#!/usr/bin/env python
"""Split the LCBs of an XMFA alignment into smaller LCBs.

Source file: cpt_xmfa_split/lcb_split.py
"""
import argparse
import copy
import logging
from itertools import groupby

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def split_lcb(lcb, window_size=10, threshold=0.7):
    """Split one LCB into runs of windows that share the same members.

    The alignment is cut into windows of ``window_size`` columns, measured
    on the first member's sequence. Within a window a member is kept when
    its count of non-gap characters (anything but ``"-"``) divided by
    ``window_size`` is at least ``threshold``. Consecutive windows with an
    identical list of kept members are merged into one new LCB.

    Args:
        lcb: Sequence of member dicts, each providing ``"seq"``,
            ``"start"`` and ``"end"``. The input is not modified; members
            are deep-copied before being trimmed.
        window_size: Number of alignment columns per window.
        threshold: Minimum non-gap fraction for a member to be kept in a
            window.

    Returns:
        A list of new LCBs, each a list of member dicts whose ``"seq"`` is
        sliced to the merged window range and whose ``"start"``/``"end"``
        are recomputed. Window runs in which no member qualifies produce
        no LCB. An empty ``lcb`` yields an empty list.
    """
    if not lcb:
        return []

    align_len = len(lcb[0]["seq"])

    # For every window, record which members are sufficiently present.
    # NOTE(review): a trailing partial window is still divided by the full
    # window_size, so it needs proportionally more coverage to pass.
    count_groups = []
    for offset in range(0, align_len, window_size):
        kept = []
        for index, member in enumerate(lcb):
            window = member["seq"][offset : offset + window_size]
            present = sum(1 for char in window if char != "-")
            if float(present) / window_size >= threshold:
                kept.append(index)
        count_groups.append(kept)

    # groupby merges consecutive windows with identical member lists, e.g.
    # [[4], [4], [2, 3], [2, 3], [0, 1, 2, 3]] becomes three runs: two
    # windows of member 4, two windows of members 2/3, one window of all.
    new_lcbs = []
    position = 0  # index of the first window of the current run
    for members, run in groupby(count_groups):
        count = sum(1 for _ in run)
        first_col = window_size * position
        last_col = window_size * (position + count)

        local_members = []
        for member in members:
            piece = copy.deepcopy(lcb[member])
            piece["seq"] = piece["seq"][first_col:last_col]
            # Coordinates advance 3 per alignment column -- presumably the
            # columns are amino acids and start/end are nucleotide
            # positions; TODO confirm. Gaps before the run are not
            # discounted from the offset.
            piece["start"] = piece["start"] + (3 * first_col)
            piece["end"] = piece["start"] + (3 * window_size * count)
            local_members.append(piece)
        if local_members:
            new_lcbs.append(local_members)

        position += count
    return new_lcbs


def split_lcbs(lcbs, window_size=10, threshold=0.7):
    """Apply :func:`split_lcb` to every LCB and concatenate the results.

    Args:
        lcbs: Iterable of LCBs (each a sequence of member dicts).
        window_size: Number of alignment columns per window.
        threshold: Minimum non-gap fraction for a member to be kept in a
            window. The default matches ``split_lcb`` and the command-line
            default; the fraction never exceeds 1.0, so a larger value
            rejects every window.

    Returns:
        A flat list with the new LCBs of all inputs, in input order.
    """
    new_lcbs = []
    for lcb in lcbs:
        new_lcbs.extend(split_lcb(lcb, window_size=window_size, threshold=threshold))
    return new_lcbs


def main():
    """Parse the command line, split the XMFA file and emit the result."""
    # Only the command-line entry point needs the XMFA reader/writer, so
    # it is imported here; the splitting functions stay importable alone.
    import xmfa

    parser = argparse.ArgumentParser(
        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
    )
    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")

    parser.add_argument(
        "--window_size", type=int, help="Window size for analysis", default=10
    )
    parser.add_argument(
        "--threshold",
        type=float,
        help="All genomes must meet N percent similarity",
        default=0.7,
    )

    args = parser.parse_args()

    # Parse, then split, then write.
    xmfa.to_xmfa(
        split_lcbs(
            xmfa.parse_xmfa(args.xmfa_file),
            window_size=args.window_size,
            threshold=args.threshold,
        )
    )


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/lcb_split.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,36 @@ +<?xml version="1.0"?> +<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0"> + <description></description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +python $__tool_directory__/lcb_split.py +@XMFA_INPUT@ +--window_size $window_size +--threshold $threshold +> $output +]]></command> + <inputs> + <expand macro="xmfa_input" /> + <param type="integer" name="window_size" value="10" label="Default window size generating smaller LCBs" /> + <param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs" /> + </inputs> + <outputs> + <data format="xmfa" name="output" /> + </outputs> + <help><![CDATA[ +**What it does** + +Helps reduce large and nonsensical protein LCBs into real protein alignments. + +**WARNING** + +Probably does not work if you have minus (-) strand genes; this still needs to be tested. + +]]></help> +<!-- TODO --> + <expand macro="citations" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_xmfa_split/macros.xml Tue Jul 05 05:19:47 2022 +0000 @@ -0,0 +1,61 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <requirement type="package" version="3.8.13">python</requirement> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + + <token name="@XMFA_INPUT@"> + "$xmfa" + </token> + <xml name="xmfa_input" + token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" /> + </xml> + + <token name="@XMFA_FA_INPUT@"> + "$sequences" + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" + help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. 
Failing that, they should be provided in the same order as in original progressiveMauve run"/> + + </xml> + <xml name="genome_selector"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> + genomeref.fa + </token> + <token name="@GENOME_SELECTOR_PRE@"> + ln -s $genome_fasta genomeref.fa; + </token> + <token name="@GENOME_SELECTOR@"> + genomeref.fa + </token> + <xml name="input/fasta"> + <param label="Fasta file" name="sequences" type="data" format="fasta"/> + </xml> +</macros>