filter_by_fasta_ids: filter_by_fasta

comparison filter_by_fasta_ids.py @ 3:3c623e81be77 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76

author	galaxyp
date	Fri, 15 Feb 2019 16:38:31 -0500
parents	1bd985f14938
children	cd22452edec2

comparison

Legend: equal, deleted, inserted, replaced

-:1bd985f14938
+:3c623e81be77
 sequence_parts.append(line.rstrip())
 line = fasta_file.readline()
 yield Sequence(header, sequence_parts)
-def target_match(targets, header):
+def target_match(targets, search_entry, pattern='>([^| ]+)'):
 ''' Matches '''
-# Remove '>' and initial spaces from the header
+search_entry = search_entry.upper()
-header = header[1:].lstrip().upper()
+m = re.search(pattern,search_entry)
-# Search for an exact match among the targets
+if m:
-if header in targets:
+target = m.group(len(m.groups()))
-return header
+if target in targets:
-# Try to find an exact match for the first "word" in the header
+return target
-header = header.split()[0]
+else:
-if header in targets:
+print( 'No ID match: %s' % search_entry, file=sys.stdout)
-return header
 return None
 def main():
 ''' the main function'''
 parser.add_argument('-i', required=True, help='Path to input FASTA file')
 parser.add_argument('-o', required=True, help='Path to output FASTA file')
 parser.add_argument('-d', help='Path to discarded entries file')
 header_criteria = parser.add_mutually_exclusive_group()
 header_criteria.add_argument('--id_list', help='Path to the ID list file')
+parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
 sequence_criteria = parser.add_mutually_exclusive_group()
 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match')
 parser.add_argument('--max_length', type=int, help='Maximum sequence length')
 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
 options = parser.parse_args()
+if options.pattern:
+pattern =  options.pattern
+if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
+print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
+exit(1)
 if options.min_length is not None and options.max_length is None:
 options.max_length = sys.maxsize
 if options.header_regexp:
 regexp = re.compile(options.header_regexp)
 if options.sequence_regexp:
 with open(options.o, "w") as output:
 for entry in homd_db:
 print_entry = True
 if options.id_list:
-target_matched_results = target_match(targets, entry.header)
+target_matched_results = target_match(targets, entry.header, pattern=pattern)
 if target_matched_results:
 work_summary['found'] += 1
 targets.remove(target_matched_results)
 else:
 print_entry = False
 elif options.header_regexp:
 if regexp.search(entry.header) is None:
 print_entry = False
 if options.min_length is not None:
 sequence_length = len(entry.sequence)

Mercurial > repos > galaxyp > filter_by_fasta_ids

comparison filter_by_fasta_ids.py @ 3:3c623e81be77 draft