Mercurial > repos > artbio > cherry_pick_fasta
changeset 7:6c0aefd9fee3 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit 849d6d2087dadb81f1b790e3bcb5bda40c3c83af
| author | artbio | 
|---|---|
| date | Thu, 29 Dec 2022 11:53:05 +0000 | 
| parents | d8fa616a228a | 
| children | ee689b6999d5 | 
| files | cherry_pick_fasta.py cherry_pick_fasta.xml | 
| diffstat | 2 files changed, 42 insertions(+), 17 deletions(-) [+] | 
line wrap: on
 line diff
```diff
--- a/cherry_pick_fasta.py Fri Apr 08 16:56:42 2022 +0000
+++ b/cherry_pick_fasta.py Thu Dec 29 11:53:05 2022 +0000
@@ -1,4 +1,5 @@
 import argparse
+from collections import defaultdict
 
 
 def Parser():
@@ -28,23 +29,49 @@
 
 
 def parse_fasta_dict(query, fasta_dict, mode):
+
     if not isinstance(query, list):
         query = [query]
+
+    def kmers(string, ksize, index):
+        if ksize > len(string):
+            return
+        for i in range(len(string) - ksize + 1):
+            kmer = string[i:i+ksize]
+            index[kmer].append(string)
+
+    def consult_index(word, index):
+        accumulator = []
+        print(len(index[word]))
+        for title in index[word]:
+            accumulator.append(title)
+        print(len(accumulator))
+        for title in set(accumulator):
+            print(title)
+
     accumulator = []
     if mode == 'includes':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string in seq_id:
-                    accumulator.append(seq_id)
-                    continue
+        kmersizes = set([len(word) for word in query])
+        index = defaultdict(list)
+        for size in kmersizes:
+            for header in fasta_dict:
+                kmers(header, size, index)
+        for keyword in query:
+            for header in index[keyword]:
+                accumulator.append(header)
+        accumulator = set(accumulator)
+        res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
+        return res_dict
     elif mode == 'exact':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string == seq_id:
-                    accumulator.append(seq_id)
-                    continue
-    res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
-    return res_dict
+        for keyword in query:
+            try:
+                len(fasta_dict[keyword])
+                accumulator.append(keyword)
+            except KeyError:
+                pass
+        accumulator = set(accumulator)
+        res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
+        return res_dict
 
 
 def complement_fasta_dict(fasta_dict, subfasta_dict):
```
--- a/cherry_pick_fasta.xml Fri Apr 08 16:56:42 2022 +0000 +++ b/cherry_pick_fasta.xml Thu Dec 29 11:53:05 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.3"> +<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="4.0"> <description>with header satisfying a string query</description> <requirements> <requirement type="package" version="3.8.0">python</requirement> @@ -93,8 +93,6 @@ <param name="match" value="exact" /> <output name="output" ftype="fasta" file="output_exact.fa" /> </test> - - <test> <param ftype="fasta" name="input" value="input.fa" /> <param name="options_selector" value="textdataset" /> @@ -111,8 +109,6 @@ <param name="match" value="exact" /> <output name="output" ftype="fasta" file="output_alt_termlist.fa" /> </test> - - <!-- partial matches --> <test> <param ftype="fasta" name="input" value="input.fa" /> @@ -147,5 +143,7 @@ This tool retrieves nucleotide/peptide sequences from a fasta file whose headers match or do not match a given string, or a list of strings. +Note that the version 4 of the tools is amazingly accelerated ! + </help> </tool>
