Mercurial > repos > galaxyp > filter_by_fasta_ids
diff filter_by_fasta_ids.py @ 4:cd22452edec2 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
author | galaxyp |
---|---|
date | Thu, 18 Apr 2019 02:45:18 -0400 |
parents | 3c623e81be77 |
children | dff7df6fcab5 |
line wrap: on
line diff
--- a/filter_by_fasta_ids.py Fri Feb 15 16:38:31 2019 -0500 +++ b/filter_by_fasta_ids.py Thu Apr 18 02:45:18 2019 -0400 @@ -41,29 +41,27 @@ yield Sequence(header, sequence_parts) -def target_match(targets, search_entry, pattern='>([^| ]+)'): +def target_match(targets, search_entry, pattern): ''' Matches ''' search_entry = search_entry.upper() - m = re.search(pattern,search_entry) + m = pattern.search(search_entry) if m: target = m.group(len(m.groups())) if target in targets: return target else: - print( 'No ID match: %s' % search_entry, file=sys.stdout) + print('No ID match: %s' % search_entry, file=sys.stdout) return None def main(): - ''' the main function''' - parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, help='Path to input FASTA file') parser.add_argument('-o', required=True, help='Path to output FASTA file') parser.add_argument('-d', help='Path to discarded entries file') header_criteria = parser.add_mutually_exclusive_group() header_criteria.add_argument('--id_list', help='Path to the ID list file') - parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry') + parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry') header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') sequence_criteria = parser.add_mutually_exclusive_group() sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') @@ -71,22 +69,20 @@ parser.add_argument('--max_length', type=int, help='Maximum sequence length') parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') options = parser.parse_args() - - + if options.pattern: - pattern = options.pattern - if not re.match('^.*[(](?![?]:).*[)].*$',pattern): - print('pattern: "%s" did not include capture group "()" in regex ' % pattern) - exit(1) - + if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern): + sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern) + pattern = re.compile(options.pattern) + if options.min_length is not None and options.max_length is None: options.max_length = sys.maxsize if options.header_regexp: - regexp = re.compile(options.header_regexp) + header_regexp = re.compile(options.header_regexp) if options.sequence_regexp: - regexp = re.compile(options.sequence_regexp) + sequence_regexp = re.compile(options.sequence_regexp) - work_summary = {'found': 0} + work_summary = {'found': 0, 'discarded': 0} if options.dedup: used_sequences = set() @@ -95,7 +91,7 @@ if options.id_list: targets = [] with open(options.id_list) as f_target: - for line in f_target.readlines(): + for line in f_target: targets.append(line.strip().upper()) work_summary['wanted'] = len(targets) @@ -107,22 +103,20 @@ for entry in homd_db: print_entry = True if options.id_list: - target_matched_results = target_match(targets, entry.header, pattern=pattern) + target_matched_results = target_match(targets, entry.header, pattern) if target_matched_results: - work_summary['found'] += 1 targets.remove(target_matched_results) else: print_entry = False - elif options.header_regexp: - if regexp.search(entry.header) is None: + if header_regexp.search(entry.header) is None: print_entry = False if options.min_length is not None: sequence_length = len(entry.sequence) if not(options.min_length <= sequence_length <= options.max_length): print_entry = False elif options.sequence_regexp: - if regexp.search(entry.sequence) is None: + if sequence_regexp.search(entry.sequence) is None: print_entry = False if print_entry: if options.dedup: @@ -131,9 +125,15 @@ continue else: used_sequences.add(entry.sequence) + work_summary['found'] += 1 entry.print(output) - elif options.d: - entry.print(discarded) + else: + work_summary['discarded'] += 1 + if options.d: + entry.print(discarded) + + if options.d: + discarded.close() for parm, count in work_summary.items(): print('%s ==> %d' % (parm, count))