Mercurial > repos > cpt > cpt_xmfa_split

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_xmfa_split/cpt-macros.xml	Tue Jul 05 05:19:47 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_xmfa_split/lcb_split.py	Tue Jul 05 05:19:47 2022 +0000
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+import argparse
+import copy
+import logging
+import xmfa
+from itertools import groupby
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def split_lcb(lcb, window_size=10, threshold=0.7):
+    """Split one LCB into smaller LCBs based on per-window sequence presence.
+
+    The alignment is scanned in consecutive windows of ``window_size``
+    columns. A member counts as present in a window when the fraction of
+    its columns there that are not gaps ("-") is at least ``threshold``.
+    Runs of consecutive windows with the same set of present members are
+    merged and emitted as one new LCB holding only those members.
+
+    Args:
+        lcb: Sequence of dicts, each with a "seq" string and numeric "start"
+            and "end" entries. Every "seq" is indexed up to the length of
+            the first one, so all are assumed to be equally long -- TODO
+            confirm against the XMFA parser.
+        window_size: Number of alignment columns per window; must be >= 1.
+        threshold: Minimum fraction (0-1) of non-gap columns a member needs
+            in a window to be kept for that window.
+
+    Returns:
+        A list of new LCBs, each a list of deep-copied member dicts whose
+        "seq", "start" and "end" were narrowed to the run of windows.
+        Runs in which no member meets the threshold yield no LCB.
+
+    Raises:
+        ValueError: If ``window_size`` is smaller than 1.
+    """
+    if window_size < 1:
+        raise ValueError("window_size must be a positive integer")
+    if not lcb:
+        return []
+
+    # Transpose the alignment: for every column, collect the indices of the
+    # members that have a residue (not a gap) at that column.
+    align_len = len(lcb[0]["seq"])
+    columns = [
+        [idx for idx, member in enumerate(lcb) if member["seq"][col] != "-"]
+        for col in range(align_len)
+    ]
+
+    # For each window, the member indices that meet the threshold.
+    count_groups = []
+    for offset in range(0, align_len, window_size):
+        window = columns[offset : offset + window_size]
+        # The final window may be narrower than window_size; dividing by its
+        # real width keeps a fully present member from being dropped there.
+        width = float(len(window))
+        count_groups.append(
+            [
+                idx
+                for idx in range(len(lcb))
+                if sum(idx in col for col in window) / width >= threshold
+            ]
+        )
+
+    # groupby merges consecutive windows with identical member sets, e.g.
+    # [([4], 2), ([2, 3, 4, 5, 6], 2), ([0, 1, 2, 3, 4, 5, 6], 14), ([0, 3], 1)]
+    # means: two windows with only member 4, then two windows with members
+    # 2-6, then fourteen windows with every member, then one with 0 and 3.
+    new_lcbs = []
+    position = 0  # index of the first window of the current run
+    for members, run in groupby(count_groups):
+        count = sum(1 for _ in run)
+        first_col = window_size * position
+        last_col = window_size * (position + count)
+        local_members = []
+        for member in members:
+            piece = copy.deepcopy(lcb[member])
+            piece["seq"] = piece["seq"][first_col:last_col]
+            # The factor 3 presumably converts protein alignment columns to
+            # nucleotide coordinates -- TODO confirm. Gap columns are not
+            # discounted, and "end" is not clamped for a short last window.
+            piece["start"] = piece["start"] + (3 * first_col)
+            piece["end"] = piece["start"] + (3 * window_size * count)
+            local_members.append(piece)
+        if local_members:
+            new_lcbs.append(local_members)
+
+        position += count
+    return new_lcbs
+
+
+def split_lcbs(lcbs, window_size=10, threshold=0.7):
+    """Apply ``split_lcb`` to every LCB and concatenate the results.
+
+    Args:
+        lcbs: Iterable of LCBs, each in the form ``split_lcb`` accepts.
+        window_size: Window width in alignment columns, passed through.
+        threshold: Per-window presence fraction (0-1), passed through. The
+            default matches ``split_lcb`` and the command-line default; the
+            per-window ratio never exceeds 1, so a value above 1 would
+            filter out every member.
+
+    Returns:
+        A flat list with the smaller LCBs of all inputs, in input order.
+    """
+    return [
+        piece
+        for lcb in lcbs
+        for piece in split_lcb(lcb, window_size=window_size, threshold=threshold)
+    ]
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse an XMFA file, split every LCB into
+    # smaller LCBs, and pass the result to xmfa.to_xmfa for output.
+    parser = argparse.ArgumentParser(
+        description="Split XMFA alignments", prog="xmfa2smallerXmfa"
+    )
+    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
+
+    parser.add_argument(
+        "--window_size", type=int, help="Window size for analysis", default=10
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        help=(
+            "Minimum fraction (0-1) of non-gap columns a genome needs in a "
+            "window to be kept in the resulting LCB"
+        ),
+        default=0.7,
+    )
+
+    args = parser.parse_args()
+    # A window of zero or negative width cannot be iterated meaningfully;
+    # reject it with a usage error instead of failing inside the split.
+    if args.window_size < 1:
+        parser.error("--window_size must be a positive integer")
+
+    # Parse -> split -> write, in that order.
+    parsed_lcbs = xmfa.parse_xmfa(args.xmfa_file)
+    smaller_lcbs = split_lcbs(
+        parsed_lcbs,
+        window_size=args.window_size,
+        threshold=args.threshold,
+    )
+    xmfa.to_xmfa(smaller_lcbs)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_xmfa_split/lcb_split.xml	Tue Jul 05 05:19:47 2022 +0000
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<tool id="edu.tamu.cpt.xmfa.split" name="Split LCBs into smaller LCBs" version="@WRAPPER_VERSION@.0">
+	<description></description>
+	<macros>
+		<import>macros.xml</import>
+		<import>cpt-macros.xml</import>
+	</macros>
+	<expand macro="requirements"/>
+	<command detect_errors="aggressive"><![CDATA[
+python $__tool_directory__/lcb_split.py
+@XMFA_INPUT@
+--window_size $window_size
+--threshold $threshold
+> $output
+]]></command>
+	<inputs>
+		<expand macro="xmfa_input" />
+		<param type="integer" name="window_size" value="10" label="Window size used when generating smaller LCBs" />
+		<param type="float" name="threshold" value="0.7" min="0" max="1" label="Threshold at which a given genome is part of the new small LCBs" />
+	</inputs>
+	<outputs>
+		<data format="xmfa" name="output" />
+	</outputs>
+	<help><![CDATA[
+**What it does**
+
+Helps reduce large and nonsensical protein LCBs into real protein alignments.
+
+**WARNING**
+
+This tool probably does not work correctly for genes on the minus (-) strand; this still needs to be tested.
+
+]]></help>
+<!-- TODO -->
+		<expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_xmfa_split/macros.xml	Tue Jul 05 05:19:47 2022 +0000
@@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="requirements">
+		<requirements>
+			<requirement type="package">progressivemauve</requirement>
+			<requirement type="package" version="3.8.13">python</requirement>
+			<requirement type="package" version="1.79">biopython</requirement>
+			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+			<yield/>
+		</requirements>
+	</xml>
+	<token name="@WRAPPER_VERSION@">2.4.0</token>
+	<xml name="citation/progressive_mauve">
+		<citation type="doi">10.1371/journal.pone.0011147</citation>
+	</xml>
+	<xml name="citation/gepard">
+		<citation type="doi">10.1093/bioinformatics/btm039</citation>
+	</xml>
+
+	<token name="@XMFA_INPUT@">
+		"$xmfa"
+	</token>
+	<xml name="xmfa_input"
+		token_formats="xmfa">
+		<param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" />
+	</xml>
+
+	<token name="@XMFA_FA_INPUT@">
+		"$sequences"
+	</token>
+	<xml name="xmfa_fa_input">
+		<param type="data" format="fasta" name="sequences" label="Sequences in alignment"
+			help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in the original progressiveMauve run."/>
+
+	</xml>
+	<xml name="genome_selector">
+		<param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+	</xml>
+	<xml name="gff3_input">
+		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+	</xml>
+	<xml name="input/gff3+fasta">
+		<expand macro="gff3_input" />
+		<expand macro="genome_selector" />
+	</xml>
+	<token name="@INPUT_GFF@">
+	"$gff3_data"
+	</token>
+	<token name="@INPUT_FASTA@">
+		genomeref.fa
+	</token>
+	<token name="@GENOME_SELECTOR_PRE@">
+		ln -s $genome_fasta genomeref.fa;
+	</token>
+	<token name="@GENOME_SELECTOR@">
+		genomeref.fa
+	</token>
+        <xml name="input/fasta">
+		<param label="Fasta file" name="sequences" type="data" format="fasta"/>
+	</xml>
+</macros>