filter_by_fasta_ids: filter_by_fasta

comparison filter_by_fasta_ids.py @ 4:cd22452edec2 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af

author	galaxyp
date	Thu, 18 Apr 2019 02:45:18 -0400
parents	3c623e81be77
children	dff7df6fcab5

comparison

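Legend: lines prefixed with a space are equal in both revisions; "-" marks a line deleted from the parent revision; "+" marks a line inserted in the new revision; a "-" line immediately followed by a "+" line is a replaced line.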

-:3c623e81be77
+:cd22452edec2
 sequence_parts.append(line.rstrip())
 line = fasta_file.readline()
 yield Sequence(header, sequence_parts)
-def target_match(targets, search_entry, pattern='>([^| ]+)'):
+def target_match(targets, search_entry, pattern):
 ''' Matches '''
 search_entry = search_entry.upper()
-m = re.search(pattern,search_entry)
+m = pattern.search(search_entry)
 if m:
 target = m.group(len(m.groups()))
 if target in targets:
 return target
 else:
-print( 'No ID match: %s' % search_entry, file=sys.stdout)
+print('No ID match: %s' % search_entry, file=sys.stdout)
 return None
 def main():
-''' the main function'''
 parser = argparse.ArgumentParser()
 parser.add_argument('-i', required=True, help='Path to input FASTA file')
 parser.add_argument('-o', required=True, help='Path to output FASTA file')
 parser.add_argument('-d', help='Path to discarded entries file')
 header_criteria = parser.add_mutually_exclusive_group()
 header_criteria.add_argument('--id_list', help='Path to the ID list file')
-parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
+parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry')
 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
 sequence_criteria = parser.add_mutually_exclusive_group()
 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match')
 parser.add_argument('--max_length', type=int, help='Maximum sequence length')
 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
 options = parser.parse_args()
 if options.pattern:
-pattern =  options.pattern
+if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
-if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
+sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
-print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
+pattern = re.compile(options.pattern)
-exit(1)
 if options.min_length is not None and options.max_length is None:
 options.max_length = sys.maxsize
 if options.header_regexp:
-regexp = re.compile(options.header_regexp)
+header_regexp = re.compile(options.header_regexp)
 if options.sequence_regexp:
-regexp = re.compile(options.sequence_regexp)
+sequence_regexp = re.compile(options.sequence_regexp)
-work_summary = {'found': 0}
+work_summary = {'found': 0, 'discarded': 0}
 if options.dedup:
 used_sequences = set()
 work_summary['duplicates'] = 0
 if options.id_list:
 targets = []
 with open(options.id_list) as f_target:
-for line in f_target.readlines():
+for line in f_target:
 targets.append(line.strip().upper())
 work_summary['wanted'] = len(targets)
 homd_db = FASTAReader_gen(options.i)
 if options.d:
 with open(options.o, "w") as output:
 for entry in homd_db:
 print_entry = True
 if options.id_list:
-target_matched_results = target_match(targets, entry.header, pattern=pattern)
+target_matched_results = target_match(targets, entry.header, pattern)
 if target_matched_results:
-work_summary['found'] += 1
 targets.remove(target_matched_results)
 else:
 print_entry = False
 elif options.header_regexp:
-if regexp.search(entry.header) is None:
+if header_regexp.search(entry.header) is None:
 print_entry = False
 if options.min_length is not None:
 sequence_length = len(entry.sequence)
 if not(options.min_length <= sequence_length <= options.max_length):
 print_entry = False
 elif options.sequence_regexp:
-if regexp.search(entry.sequence) is None:
+if sequence_regexp.search(entry.sequence) is None:
 print_entry = False
 if print_entry:
 if options.dedup:
 if entry.sequence in used_sequences:
 work_summary['duplicates'] += 1
 continue
 else:
 used_sequences.add(entry.sequence)
+work_summary['found'] += 1
 entry.print(output)
-elif options.d:
+else:
-entry.print(discarded)
+work_summary['discarded'] += 1
+if options.d:
+entry.print(discarded)
+if options.d:
+discarded.close()
 for parm, count in work_summary.items():
 print('%s ==> %d' % (parm, count))

Mercurial > repos > galaxyp > filter_by_fasta_ids

comparison filter_by_fasta_ids.py @ 4:cd22452edec2 draft