# HG changeset patch
# User galaxyp
# Date 1555569918 14400
# Node ID cd22452edec2d116e4bc5e79153a5a38f1f52d4f
# Parent 3c623e81be778d4a29b8233643041955e0ee3877
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af

diff -r 3c623e81be77 -r cd22452edec2 filter_by_fasta_ids.py
--- a/filter_by_fasta_ids.py	Fri Feb 15 16:38:31 2019 -0500
+++ b/filter_by_fasta_ids.py	Thu Apr 18 02:45:18 2019 -0400
@@ -41,29 +41,27 @@
         yield Sequence(header, sequence_parts)


-def target_match(targets, search_entry, pattern='>([^| ]+)'):
+def target_match(targets, search_entry, pattern):
     ''' Matches '''
     search_entry = search_entry.upper()
-    m = re.search(pattern,search_entry)
+    m = pattern.search(search_entry)
     if m:
         target = m.group(len(m.groups()))
         if target in targets:
             return target
     else:
-        print( 'No ID match: %s' % search_entry, file=sys.stdout)
+        print('No ID match: %s' % search_entry, file=sys.stdout)
     return None


 def main():
-    ''' the main function'''
-
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', required=True, help='Path to input FASTA file')
     parser.add_argument('-o', required=True, help='Path to output FASTA file')
     parser.add_argument('-d', help='Path to discarded entries file')
     header_criteria = parser.add_mutually_exclusive_group()
     header_criteria.add_argument('--id_list', help='Path to the ID list file')
-    parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
+    parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry')
     header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
     sequence_criteria = parser.add_mutually_exclusive_group()
     sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
@@ -71,22 +69,20 @@
     parser.add_argument('--max_length', type=int, help='Maximum sequence length')
     parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
     options = parser.parse_args()
-
-
+
     if options.pattern:
-        pattern = options.pattern
-        if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
-            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
-            exit(1)
-
+        if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
+            sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
+        pattern = re.compile(options.pattern)
+
     if options.min_length is not None and options.max_length is None:
         options.max_length = sys.maxsize
     if options.header_regexp:
-        regexp = re.compile(options.header_regexp)
+        header_regexp = re.compile(options.header_regexp)
     if options.sequence_regexp:
-        regexp = re.compile(options.sequence_regexp)
+        sequence_regexp = re.compile(options.sequence_regexp)

-    work_summary = {'found': 0}
+    work_summary = {'found': 0, 'discarded': 0}

     if options.dedup:
         used_sequences = set()
@@ -95,7 +91,7 @@
     if options.id_list:
         targets = []
         with open(options.id_list) as f_target:
-            for line in f_target.readlines():
+            for line in f_target:
                 targets.append(line.strip().upper())
         work_summary['wanted'] = len(targets)

@@ -107,22 +103,20 @@
     for entry in homd_db:
         print_entry = True
         if options.id_list:
-            target_matched_results = target_match(targets, entry.header, pattern=pattern)
+            target_matched_results = target_match(targets, entry.header, pattern)
             if target_matched_results:
-                work_summary['found'] += 1
                 targets.remove(target_matched_results)
             else:
                 print_entry = False
-
         elif options.header_regexp:
-            if regexp.search(entry.header) is None:
+            if header_regexp.search(entry.header) is None:
                 print_entry = False
         if options.min_length is not None:
             sequence_length = len(entry.sequence)
             if not(options.min_length <= sequence_length <= options.max_length):
                 print_entry = False
         elif options.sequence_regexp:
-            if regexp.search(entry.sequence) is None:
+            if sequence_regexp.search(entry.sequence) is None:
                 print_entry = False
         if print_entry:
             if options.dedup:
@@ -131,9 +125,15 @@
                     continue
                 else:
                     used_sequences.add(entry.sequence)
+            work_summary['found'] += 1
             entry.print(output)
-        elif options.d:
-            entry.print(discarded)
+        else:
+            work_summary['discarded'] += 1
+            if options.d:
+                entry.print(discarded)
+
+    if options.d:
+        discarded.close()

     for parm, count in work_summary.items():
         print('%s ==> %d' % (parm, count))
diff -r 3c623e81be77 -r cd22452edec2 filter_by_fasta_ids.xml
--- a/filter_by_fasta_ids.xml	Fri Feb 15 16:38:31 2019 -0500
+++ b/filter_by_fasta_ids.xml	Thu Apr 18 02:45:18 2019 -0400
@@ -1,4 +1,4 @@
-
+
 on the headers and/or the sequences
@@ -25,8 +25,8 @@
             --id_list '$header_criteria.identifiers'
             #if $header_criteria.id_regex.find == 'pattern':
                 --pattern '$header_criteria.id_regex.pattern'
-            #elif $header_criteria.id_regex.find == 'beginning':
-                --pattern '$header_criteria.id_regex.pattern'
+            #else:
+                --pattern '>([^| ]+)'
             #end if
         #elif $header_criteria.header_criteria_select == 'regexp'
             --header_regexp '$header_criteria.regexp'
@@ -56,30 +56,21 @@
-
-
-
+
-            Default: >ID will use search pattern >([^| ]+) to input ID; Use custom regex to change
-
-
-
-            ^.*[(](?![?]:).*[)].*$
-
-
+
-
-            search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$
+            ^.*[(](?![?]:).*[)].*$
-
-
@@ -137,8 +128,10 @@
-
-
+
+
+
+
@@ -164,13 +157,30 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+>1
+TGAC
+>2
+AAAAAAAA
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
+>5
+TTTT
diff -r 3c623e81be77 -r cd22452edec2 test-data/output_header_regexp_sequence_regexp.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_header_regexp_sequence_regexp.fasta	Thu Apr 18 02:45:18 2019 -0400
@@ -0,0 +1,2 @@
+>3
+ACGT