Mercurial > repos > galaxyp > filter_by_fasta_ids
changeset 4:cd22452edec2 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
author | galaxyp |
---|---|
date | Thu, 18 Apr 2019 02:45:18 -0400 |
parents | 3c623e81be77 |
children | dff7df6fcab5 |
files | filter_by_fasta_ids.py filter_by_fasta_ids.xml test-data/discarded_header_regexp_sequence_regexp.fasta test-data/output_header_regexp_sequence_regexp.fasta |
diffstat | 4 files changed, 69 insertions(+), 45 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_by_fasta_ids.py Fri Feb 15 16:38:31 2019 -0500 +++ b/filter_by_fasta_ids.py Thu Apr 18 02:45:18 2019 -0400 @@ -41,29 +41,27 @@ yield Sequence(header, sequence_parts) -def target_match(targets, search_entry, pattern='>([^| ]+)'): +def target_match(targets, search_entry, pattern): ''' Matches ''' search_entry = search_entry.upper() - m = re.search(pattern,search_entry) + m = pattern.search(search_entry) if m: target = m.group(len(m.groups())) if target in targets: return target else: - print( 'No ID match: %s' % search_entry, file=sys.stdout) + print('No ID match: %s' % search_entry, file=sys.stdout) return None def main(): - ''' the main function''' - parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, help='Path to input FASTA file') parser.add_argument('-o', required=True, help='Path to output FASTA file') parser.add_argument('-d', help='Path to discarded entries file') header_criteria = parser.add_mutually_exclusive_group() header_criteria.add_argument('--id_list', help='Path to the ID list file') - parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry') + parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry') header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') sequence_criteria = parser.add_mutually_exclusive_group() sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') @@ -71,22 +69,20 @@ parser.add_argument('--max_length', type=int, help='Maximum sequence length') parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') options = parser.parse_args() - - + if options.pattern: - pattern = options.pattern - if not re.match('^.*[(](?![?]:).*[)].*$',pattern): - print('pattern: "%s" did not include capture group "()" in regex ' % pattern) - exit(1) - + if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern): + sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern) + pattern = re.compile(options.pattern) + if options.min_length is not None and options.max_length is None: options.max_length = sys.maxsize if options.header_regexp: - regexp = re.compile(options.header_regexp) + header_regexp = re.compile(options.header_regexp) if options.sequence_regexp: - regexp = re.compile(options.sequence_regexp) + sequence_regexp = re.compile(options.sequence_regexp) - work_summary = {'found': 0} + work_summary = {'found': 0, 'discarded': 0} if options.dedup: used_sequences = set() @@ -95,7 +91,7 @@ if options.id_list: targets = [] with open(options.id_list) as f_target: - for line in f_target.readlines(): + for line in f_target: targets.append(line.strip().upper()) work_summary['wanted'] = len(targets) @@ -107,22 +103,20 @@ for entry in homd_db: print_entry = True if options.id_list: - target_matched_results = target_match(targets, entry.header, pattern=pattern) + target_matched_results = target_match(targets, entry.header, pattern) if target_matched_results: - work_summary['found'] += 1 targets.remove(target_matched_results) else: print_entry = False - elif options.header_regexp: - if regexp.search(entry.header) is None: + if header_regexp.search(entry.header) is None: print_entry = False if options.min_length is not None: sequence_length = len(entry.sequence) if not(options.min_length <= sequence_length <= options.max_length): print_entry = False elif options.sequence_regexp: - if regexp.search(entry.sequence) is None: + if sequence_regexp.search(entry.sequence) is None: print_entry = False if print_entry: if options.dedup: @@ -131,9 +125,15 @@ continue else: used_sequences.add(entry.sequence) + work_summary['found'] += 1 entry.print(output) - elif options.d: - entry.print(discarded) + else: + work_summary['discarded'] += 1 + if options.d: + entry.print(discarded) + + if options.d: + discarded.close() for parm, count in work_summary.items(): print('%s ==> %d' % (parm, count))
--- a/filter_by_fasta_ids.xml Fri Feb 15 16:38:31 2019 -0500 +++ b/filter_by_fasta_ids.xml Thu Apr 18 02:45:18 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.1"> +<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.2"> <description>on the headers and/or the sequences</description> <macros> <xml name="regexp_macro" token_label="Regular expression pattern"> @@ -25,8 +25,8 @@ --id_list '$header_criteria.identifiers' #if $header_criteria.id_regex.find == 'pattern': --pattern '$header_criteria.id_regex.pattern' - #elif $header_criteria.id_regex.find == 'beginning': - --pattern '$header_criteria.id_regex.pattern' + #else: + --pattern '>([^| ]+)' #end if #elif $header_criteria.header_criteria_select == 'regexp' --header_regexp '$header_criteria.regexp' @@ -56,30 +56,21 @@ <when value="" /> <when value="id_list"> <param name="identifiers" type="data" format="txt" label="List of IDs to extract sequences for"/> - - <conditional name="id_regex"> - <param name="find" type="select" label="Match IDs by"> + <param name="find" type="select" label="Match IDs by" + help="Default: >ID will use search pattern >([^| ]+) to input ID; Use custom regex to change"> <option value="beginning">Default: ID is expected at the beginning: >ID </option> - <help>Default: >ID will use search pattern >([^| ]+) to input ID; Use custom regex to change</help> <option value="pattern">Custom regex pattern</option> </param> - <when value="beginning"> - <param name="pattern" type="hidden" value=">([^| ]+)" label="regex search pattern for ID" > - <sanitizer sanitize="False"/> - <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator> - </param> - </when> + <when value="beginning" /> <when value="pattern"> - <param name="pattern" type="text" value="" label="regex search pattern for ID"> - <help>search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$ </help> + <param name="pattern" type="text" value="" label="Regex search pattern for ID" + help="Search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$"> <sanitizer sanitize="False"/> <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator> </param> </when> </conditional> - - </when> <when value="regexp"> <expand macro="regexp_macro" label="Regular expression pattern the header should match" /> @@ -137,8 +128,10 @@ </test> <test expect_num_outputs="2"> <param name="input" ftype="fasta" value="input.fasta" /> - <param name="header_criteria_select" value="regexp" /> - <param name="regexp" value="2" /> + <conditional name="header_criteria"> + <param name="header_criteria_select" value="regexp" /> + <param name="regexp" value="2" /> + </conditional> <param name="dedup" value="False" /> <param name="output_discarded" value="True" /> <output name="output" file="output_header_regexp.fasta" /> @@ -164,13 +157,30 @@ </test> <test expect_num_outputs="2"> <param name="input" ftype="fasta" value="input.fasta" /> - <param name="sequence_criteria_select" value="regexp" /> - <param name="regexp" value="T{2,}" /> + <conditional name="sequence_criteria"> + <param name="sequence_criteria_select" value="regexp" /> + <param name="regexp" value="T{2,}" /> + </conditional> <param name="dedup" value="False" /> <param name="output_discarded" value="True" /> <output name="output" file="output_sequence_regexp.fasta" /> <output name="discarded" file="discarded_sequence_regexp.fasta" /> </test> + <test expect_num_outputs="2"> + <param name="input" ftype="fasta" value="input.fasta" /> + <conditional name="header_criteria"> + <param name="header_criteria_select" value="regexp" /> + <param name="regexp" value="3|5" /> + </conditional> + <conditional name="sequence_criteria"> + <param name="sequence_criteria_select" value="regexp" /> + <param name="regexp" value="ACGT" /> + </conditional> + <param name="dedup" value="False" /> + <param name="output_discarded" value="True" /> + <output name="output" file="output_header_regexp_sequence_regexp.fasta" /> + <output name="discarded" file="discarded_header_regexp_sequence_regexp.fasta" /> + </test> </tests> <help><![CDATA[ **What it does**