Mercurial > repos > cpt > cpt_xmfa_split

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml	Mon Jun 05 02:54:34 2023 +0000
@@ -0,0 +1,115 @@
+<macros> <!-- Shared CPT macros: gff_requirements (package pins plus a git-based version command) and citation macros crediting different CPT authors. -->
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+		<![CDATA[
+			cd '$__tool_directory__' && git rev-parse HEAD
+		]]>
+		</version_command>
+    </xml>
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis and H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis and H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis and H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+        <yield/>
+    </xml>
+</macros>
--- a/cpt_xmfa_split/cpt-macros.xml	Tue Jul 05 05:19:47 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="gff_requirements">
-		<requirements>
-			<requirement type="package" version="2.7">python</requirement>
-			<requirement type="package" version="1.65">biopython</requirement>
-			<requirement type="package" version="2.12.1">requests</requirement>
-			<yield/>
-		</requirements>
-		<version_command>
-		<![CDATA[
-			cd $__tool_directory__ && git rev-parse HEAD
-		]]>
-		</version_command>
-	</xml>
-	<xml name="citation/mijalisrasche">
-		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-		<citation type="bibtex">@unpublished{galaxyTools,
-		author = {E. Mijalis, H. Rasche},
-		title = {CPT Galaxy Tools},
-		year = {2013-2017},
-		note = {https://github.com/tamu-cpt/galaxy-tools/}
-		}
-		</citation>
-	</xml>
-	<xml name="citations">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-    	<xml name="citations-crr">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Ross},
-				title = {CPT Galaxy Tools},
-				year = {2020-},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-		<yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {E. Mijalis, H. Rasche},
-				title = {CPT Galaxy Tools},
-				year = {2013-2017},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-2020-AJC-solo">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-                        <citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {A. Criscione},
-				title = {CPT Galaxy Tools},
-				year = {2019-2021},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-                        </citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="citations-clm">
-		<citations>
-			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-		</citations>
-	</xml>
-        <xml name="sl-citations-clm">
-			<citation type="bibtex">
-			@unpublished{galaxyTools,
-				author = {C. Maughmer},
-				title = {CPT Galaxy Tools},
-				year = {2017-2020},
-				note = {https://github.com/tamu-cpt/galaxy-tools/}
-			}
-			</citation>
-                        <yield/>
-	</xml>
-</macros>
--- a/cpt_xmfa_split/lcb_split.py	Tue Jul 05 05:19:47 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,94 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import copy
-import logging
-import xmfa
-from itertools import groupby
-
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-
-
-def split_lcb(lcb, window_size=10, threshold=0.7):
-    # Transpose sequence
-    lines = []
-    max_align_num = len(lcb[0]["seq"])
-    for i in range(max_align_num):
-        lines.append([])
-        for j in range(len(lcb)):
-            c = lcb[j]["seq"][i]
-            if c != "-":
-                lines[i].append(j)
-
-    count_groups = []
-    for i in range(0, len(lines), window_size):
-        current_lines = lines[i : i + window_size]
-        flat_list = [a for b in current_lines for a in b]
-        counts = []
-        for i in range(len(lcb)):
-            value = float(flat_list.count(i)) / window_size
-            if value >= threshold:
-                counts.append(i)
-        count_groups.append(counts)
-
-    # groups = [(next(j), len(list(j)) + 1) for i, j in ]
-    # [([4], 2), ([2, 3, 4, 5, 6], 2), ([0, 1, 2, 3, 4, 5, 6], 14), ([0, 3], 1)]
-    # This says for 2 window sizes, we emit a new LCB with just [0:10] and
-    # [10:20] for lcb #4, then one with all but 0/1 for 2, then all for 14.
-    new_lcbs = []
-    position = 0
-    for i, j in groupby(count_groups):
-        tmp = list(j)
-        count = len(tmp)
-        members = tmp[0]
-        local_members = []
-        for member in members:
-            tmp_member = copy.deepcopy(lcb[member])
-            tmp_member["seq"] = tmp_member["seq"][
-                window_size * position : window_size * (position + count)
-            ]
-            tmp_member["start"] = tmp_member["start"] + (3 * window_size * position)
-            tmp_member["end"] = tmp_member["start"] + (3 * window_size * count)
-            local_members.append(tmp_member)
-        if len(local_members) > 0:
-            new_lcbs.append(local_members)
-
-        position += count
-    return new_lcbs
-
-
-def split_lcbs(lcbs, window_size=10, threshold=100):
-    new_lcbs = []
-    for lcb in lcbs:
-        new_lcbs.extend(split_lcb(lcb, window_size=window_size, threshold=threshold))
-    return new_lcbs
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
-    )
-    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
-
-    parser.add_argument(
-        "--window_size", type=int, help="Window size for analysis", default=10
-    )
-    parser.add_argument(
-        "--threshold",
-        type=float,
-        help="All genomes must meet N percent similarity",
-        default=0.7,
-    )
-
-    args = parser.parse_args()
-
-    # Write
-    xmfa.to_xmfa(
-        # Split
-        split_lcbs(
-            # Parse
-            xmfa.parse_xmfa(args.xmfa_file),
-            window_size=args.window_size,
-            threshold=args.threshold,
-        )
-    )
--- a/cpt_xmfa_split/lcb_split.xml	Tue Jul 05 05:19:47 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-<?xml version="1.0"?>
-<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0">
-	<description></description>
-	<macros>
-		<import>macros.xml</import>
-		<import>cpt-macros.xml</import>
-	</macros>
-	<expand macro="requirements"/>
-	<command detect_errors="aggressive"><![CDATA[
-python $__tool_directory__/lcb_split.py
-@XMFA_INPUT@
---window_size $window_size
---threshold $threshold
-> $output
-]]></command>
-	<inputs>
-		<expand macro="xmfa_input" />
-		<param type="integer" name="window_size" value="10" label="Default window size generating smaller LCBs" />
-		<param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs" />
-	</inputs>
-	<outputs>
-		<data format="xmfa" name="output" />
-	</outputs>
-	<help><![CDATA[
-**What it does**
-
-Helps reduce large and non-sensical protein LCBs into real protein alignments.
-
-**WARNING**
-
-Probably does not work if you have - strand genes. Need to test.
-
-]]></help>
-<!-- TODO -->
-		<expand macro="citations" />
-</tool>
--- a/cpt_xmfa_split/macros.xml	Tue Jul 05 05:19:47 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-<?xml version="1.0"?>
-<macros>
-	<xml name="requirements">
-		<requirements>
-			<requirement type="package">progressivemauve</requirement>
-			<requirement type="package" version="3.8.13">python</requirement>
-			<requirement type="package" version="1.79">biopython</requirement>
-			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
-			<yield/>
-		</requirements>
-	</xml>
-	<token name="@WRAPPER_VERSION@">2.4.0</token>
-	<xml name="citation/progressive_mauve">
-		<citation type="doi">10.1371/journal.pone.0011147</citation>
-	</xml>
-	<xml name="citation/gepard">
-		<citation type="doi">10.1093/bioinformatics/btm039</citation>
-	</xml>
-
-	<token name="@XMFA_INPUT@">
-		"$xmfa"
-	</token>
-	<xml name="xmfa_input"
-		token_formats="xmfa">
-		<param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" />
-	</xml>
-
-	<token name="@XMFA_FA_INPUT@">
-		"$sequences"
-	</token>
-	<xml name="xmfa_fa_input">
-		<param type="data" format="fasta" name="sequences" label="Sequences in alignment"
-			help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
-
-	</xml>
-	<xml name="genome_selector">
-		<param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
-	</xml>
-	<xml name="gff3_input">
-		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
-	</xml>
-	<xml name="input/gff3+fasta">
-		<expand macro="gff3_input" />
-		<expand macro="genome_selector" />
-	</xml>
-	<token name="@INPUT_GFF@">
-	"$gff3_data"
-	</token>
-	<token name="@INPUT_FASTA@">
-		genomeref.fa
-	</token>
-	<token name="@GENOME_SELECTOR_PRE@">
-		ln -s $genome_fasta genomeref.fa;
-	</token>
-	<token name="@GENOME_SELECTOR@">
-		genomeref.fa
-	</token>
-        <xml name="input/fasta">
-		<param label="Fasta file" name="sequences" type="data" format="fasta"/>
-	</xml>
-</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lcb_split.py	Mon Jun 05 02:54:34 2023 +0000
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+import argparse
+import copy
+import logging
+import xmfa
+from itertools import groupby
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def split_lcb(lcb, window_size=10, threshold=0.7):
+    """Split one LCB into runs of windows that keep the same set of rows.
+
+    Only the "seq", "start" and "end" keys of each row dict in ``lcb`` are used.
+    The alignment is cut into windows of ``window_size`` columns; a row is kept
+    for a window when its non-gap columns / ``window_size`` >= ``threshold`` (a
+    short final window is still divided by the full ``window_size``).
+
+    Returns a list of LCBs, one per run of consecutive windows with identical
+    rows; rows are deep copies with "seq" sliced and "start"/"end" shifted by 3
+    per column.  Runs without rows are dropped; an empty ``lcb`` returns [].
+    """
+    if not lcb:
+        return []  # nothing to transpose; lcb[0] below would raise IndexError
+    rows = range(len(lcb))
+    # Transpose: per alignment column, the rows that do not have a gap there.
+    columns = [
+        [row for row in rows if lcb[row]["seq"][col] != "-"]
+        for col in range(len(lcb[0]["seq"]))
+    ]
+
+    # Per window, the rows whose ungapped fraction reaches the threshold.
+    count_groups = []
+    for offset in range(0, len(columns), window_size):
+        hits = [r for col in columns[offset : offset + window_size] for r in col]
+        count_groups.append(
+            [row for row in rows if float(hits.count(row)) / window_size >= threshold]
+        )
+
+    # groupby merges consecutive windows that kept exactly the same rows.
+    new_lcbs = []
+    position = 0  # index of the first window in the current run
+    for members, run in groupby(count_groups):
+        count = len(list(run))
+        first_col = window_size * position
+        local_members = []
+        for member in members:
+            tmp_member = copy.deepcopy(lcb[member])
+            tmp_member["seq"] = tmp_member["seq"][first_col : first_col + window_size * count]
+            tmp_member["start"] = tmp_member["start"] + (3 * first_col)
+            tmp_member["end"] = tmp_member["start"] + (3 * window_size * count)
+            local_members.append(tmp_member)
+        if local_members:
+            new_lcbs.append(local_members)
+        position += count
+    return new_lcbs
+
+
+def split_lcbs(lcbs, window_size=10, threshold=0.7):
+    """Split every LCB in ``lcbs`` with split_lcb; return all pieces as one flat list."""
+    # threshold is a 0-1 fraction; the former default of 100 could never be reached.
+    pieces = (split_lcb(lcb, window_size=window_size, threshold=threshold) for lcb in lcbs)
+    return [piece for group in pieces for piece in group]
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse an XMFA file, split its LCBs, emit XMFA.
+    parser = argparse.ArgumentParser(
+        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
+    )
+    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
+
+    parser.add_argument(
+        "--window_size", type=int, help="Window size for analysis", default=10
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        # A 0-1 fraction, not a percentage: split_lcb compares it to ungapped/window.
+        help="Fraction (0-1) of a window a genome must cover without gaps to be kept",
+        default=0.7,
+    )
+
+    args = parser.parse_args()
+
+    # Parse -> split -> write.  xmfa is a project module; where to_xmfa writes is
+    # not visible here (the Galaxy wrapper redirects stdout to the output file).
+    split = split_lcbs(
+        xmfa.parse_xmfa(args.xmfa_file),
+        window_size=args.window_size,
+        threshold=args.threshold,
+    )
+    xmfa.to_xmfa(split)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lcb_split.xml	Mon Jun 05 02:54:34 2023 +0000
@@ -0,0 +1,35 @@
+<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0">
+    <description/> <!-- Wrapper around lcb_split.py: reads one XMFA dataset and writes the split XMFA. -->
+    <macros>
+        <import>macros.xml</import>
+        <import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/lcb_split.py'
+@XMFA_INPUT@
+--window_size '$window_size'
+--threshold '$threshold'
+> '$output'
+]]></command> <!-- python stays an unquoted word; only the script path is quoted. stdout becomes the output dataset. -->
+    <inputs>
+        <expand macro="xmfa_input"/>
+        <param type="integer" name="window_size" value="10" label="Default window size generating smaller LCBs"/>
+        <param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs"/>
+    </inputs>
+    <outputs>
+        <data format="xmfa" name="output"/>
+    </outputs>
+    <help><![CDATA[
+**What it does**
+
+Helps reduce large and non-sensical protein LCBs into real protein alignments.
+
+**WARNING**
+
+Probably does not work if you have - strand genes. Need to test.
+
+]]></help>
+    <!-- TODO -->
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jun 05 02:54:34 2023 +0000
@@ -0,0 +1,74 @@
+<macros> <!-- Shared progressiveMauve/XMFA macros: package requirements, the wrapper version token, input params, and Cheetah tokens for XMFA, GFF3 and reference FASTA selection. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">progressivemauve</requirement>
+            <!--<requirement type="package" version="2.7">python</requirement>-->
+            <requirement type="package" version="0.6.4">bcbiogff</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+    <xml name="citation/progressive_mauve">
+        <citation type="doi">10.1371/journal.pone.0011147</citation>
+    </xml>
+    <xml name="citation/gepard">
+        <citation type="doi">10.1093/bioinformatics/btm039</citation>
+    </xml>
+    <token name="@XMFA_INPUT@">
+		'$xmfa'
+	</token>
+    <xml name="xmfa_input" token_formats="xmfa">
+        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
+    </xml>
+    <token name="@XMFA_FA_INPUT@">
+		'$sequences'
+	</token>
+    <xml name="xmfa_fa_input">
+        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+    </xml>
+    <xml name="genome_selector">
+        <conditional name="reference_genome">
+            <param name="reference_genome_source" type="select" label="Reference Genome">
+                <option value="history" selected="True">From History</option>
+                <option value="cached">Locally Cached</option>
+            </param>
+            <when value="cached">
+                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <xml name="input/gff3+fasta">
+        <expand macro="gff3_input"/>
+        <expand macro="genome_selector"/>
+    </xml>
+    <token name="@INPUT_GFF@">
+	    '$gff3_data'
+	</token>
+    <token name="@INPUT_FASTA@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR_PRE@">
+    #if str($reference_genome.reference_genome_source) == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+    #end if
+	</token>
+    <token name="@GENOME_SELECTOR@">
+    #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+    #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+    #end if
+	</token>
+</macros>