Mercurial > repos > iuc > tn93_cluster
changeset 1:112d80c9ccca draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author | iuc |
---|---|
date | Wed, 20 Apr 2022 17:00:11 +0000 |
parents | af03f3398f03 |
children | eb6f0ec5b95e |
files | macros.xml test-data/filter-out1.fasta tn93_cluster.py tn93_cluster.xml tn93_filter.py |
diffstat | 5 files changed, 39 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Fri Apr 23 03:04:15 2021 +0000 +++ b/macros.xml Wed Apr 20 17:00:11 2022 +0000 @@ -1,6 +1,12 @@ <?xml version="1.0"?> <macros> - <token name="@VERSION@">1.0.6</token> + <token name="@TOOL_VERSION@">1.0.6</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">tn93</requirement> + <yield /> + </requirements> + </xml> <xml name="citations"> <citations> <citation type="bibtex">
--- a/test-data/filter-out1.fasta Fri Apr 23 03:04:15 2021 +0000 +++ b/test-data/filter-out1.fasta Wed Apr 20 17:00:11 2022 +0000 @@ -13,3 +13,8 @@ >gb_MW518841_Organism_Severe_acute_respiratory_syndrome_coronavirus_2_Strain_Name_SARS_CoV_2_human_USA_CA_CDC_STM_220_2020_Segment_null_1 ATGTTAGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTTGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATCATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTTAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGGTGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACA + +>epi_isl_1041403/hCoV-19/USA/NY-PRL-2021_02_08_05H08/2021 +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +>REFERENCE +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGACG \ No newline at end of file
--- a/tn93_cluster.py Fri Apr 23 03:04:15 2021 +0000 +++ b/tn93_cluster.py Wed Apr 20 17:00:11 2022 +0000 @@ -2,7 +2,6 @@ import json import os import shlex -import shutil import subprocess import sys @@ -41,27 +40,22 @@ def main(arguments): threshold = arguments.threshold step = threshold * 0.25 - shutil.copy(arguments.input, os.path.join(os.getcwd(), 'reference_msa.fa')) - shutil.copy(arguments.input, os.path.join(os.getcwd(), 'reference_msa.fa.bak')) with open(arguments.reference) as fh: for line in fh: if line[0] == '>': _ref_seq_name = line[1:].split(' ')[0].strip() break - while True and threshold <= 1: - command = 'tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f reference_msa.fa' % (threshold, arguments.ambigs, arguments.cluster_type, arguments.overlap, arguments.fraction) + while threshold <= 1: + command = 'tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f %s' % (threshold, arguments.ambigs, arguments.cluster_type, arguments.overlap, arguments.fraction, arguments.input) return_code = run_command(command) if return_code != 0: return return_code - input_stamp, cluster_count = cluster_to_fasta('clusters.json', 'reference_msa.fa.bak', _ref_seq_name) - if cluster_count <= arguments.cluster_count or threshold == 1: + input_stamp, cluster_count = cluster_to_fasta('clusters.json', 'clusters.fa', _ref_seq_name) + if cluster_count <= arguments.cluster_count: break else: threshold += step print('Found %d clusters at threshold %f' % (cluster_count, threshold)) - shutil.copy('reference_msa.fa.bak', arguments.compressed) - shutil.copy('clusters.json', arguments.output) - os.remove('reference_msa.fa.bak') return 0
--- a/tn93_cluster.xml Fri Apr 23 03:04:15 2021 +0000 +++ b/tn93_cluster.xml Wed Apr 20 17:00:11 2022 +0000 @@ -1,12 +1,11 @@ -<tool id="tn93_cluster" name="TN93 Cluster" version="@VERSION@"> +<tool id="tn93_cluster" name="TN93 Cluster" version="@TOOL_VERSION@+galaxy1"> <description>sequences that lie within a specific distance of each other</description> <macros> <import>macros.xml</import> </macros> - <requirements> - <requirement type="package" version="@VERSION@">tn93</requirement> + <expand macro="requirements"> <requirement type="package" version="3.9">python</requirement> - </requirements> + </expand> <version_command><![CDATA[tn93 --version]]></version_command> <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/tn93_cluster.py' --input '$input_fasta' --reference '$reference' --output '$tn93_clusters' @@ -25,7 +24,7 @@ <param name="reference" type="data" format="fasta" label="Reference in FASTA format" /> <param argument="--compress" type="boolean" truevalue="--compress" falsevalue="" label="Output additional fasta dataset with compressed clusters" /> <param argument="--cluster-count" type="integer" value="200" label="Only retain this many clusters" /> - <param argument="--threshold" type="float" min="0" value="0.0005" label="Distance threshold" help="Sequences which lie within this distance will be clustered" /> + <param argument="--threshold" type="float" value="0.0005" min="0" max="1" label="Distance threshold" help="Sequences which lie within this distance will be clustered" /> <param argument="--ambigs" type="select" label="Strategy for ambiguous nucleotides"> <option value="resolve" selected="true">resolve</option> <option value="average">average</option> @@ -42,8 +41,8 @@ label="Maximum tolerated fraction of ambiguous characters" /> </inputs> <outputs> - <data format="json" name="tn93_clusters" /> - <data format="fasta" name="tn93_compressed_clusters"> + <data format="json" name="tn93_clusters" from_work_dir="clusters.json" /> + <data format="fasta" name="tn93_compressed_clusters" from_work_dir="clusters.fa"> <filter>compress</filter> </data> </outputs>
--- a/tn93_filter.py Fri Apr 23 03:04:15 2021 +0000 +++ b/tn93_filter.py Wed Apr 20 17:00:11 2022 +0000 @@ -1,5 +1,6 @@ import argparse import csv +import random from Bio import SeqIO @@ -8,15 +9,22 @@ arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str) arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str) arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str) -arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str) +arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str) settings = arguments.parse_args() reference_name = 'REFERENCE' reference_seq = '' + +def unique_id(new_id, existing_ids): + while new_id in existing_ids: + new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10)) + return new_id + + with open(settings.reference) as seq_fh: for seq_record in SeqIO.parse(seq_fh, 'fasta'): - reference_name = seq_record.name + reference_name = seq_record.name.split(' ')[0] reference_seq = seq_record.seq break @@ -27,17 +35,19 @@ for line in reader: if line[1] not in seqs_to_filter: seqs_to_filter.add(line[1]) + else: + seqs_to_filter.add(unique_id(line[1], seqs_to_filter)) if reference_name in seqs_to_filter: seqs_to_filter.remove(reference_name) with open(settings.reads, "a+") as fh: seqs_filtered = list() for seq_record in SeqIO.parse(settings.clusters, "fasta"): + if seq_record.name.split(' ')[0] == reference_name: + continue if seq_record.name not in seqs_to_filter: - if seq_record.name == reference_name: - if seq_record.name not in seqs_filtered: - seqs_filtered.append(seq_record.name) - else: - continue + unique_name = unique_id(seq_record.name, seqs_filtered) + fh.write('\n>%s\n%s' % (unique_name, seq_record.seq)) + seqs_filtered.append(unique_name) if reference_name not in seqs_filtered: fh.write('\n>REFERENCE\n%s' % reference_seq)