diff filter_by_fasta_ids.py @ 4:cd22452edec2 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
author galaxyp
date Thu, 18 Apr 2019 02:45:18 -0400
parents 3c623e81be77
children dff7df6fcab5
line wrap: on
line diff
--- a/filter_by_fasta_ids.py	Fri Feb 15 16:38:31 2019 -0500
+++ b/filter_by_fasta_ids.py	Thu Apr 18 02:45:18 2019 -0400
@@ -41,29 +41,27 @@
             yield Sequence(header, sequence_parts)
 
 
-def target_match(targets, search_entry, pattern='>([^| ]+)'):
+def target_match(targets, search_entry, pattern):
     ''' Matches '''
     search_entry = search_entry.upper()
-    m = re.search(pattern,search_entry)
+    m = pattern.search(search_entry)
     if m:
         target = m.group(len(m.groups()))
         if target in targets:
             return target
     else:
-         print( 'No ID match: %s' % search_entry, file=sys.stdout)
+        print('No ID match: %s' % search_entry, file=sys.stdout)
     return None
 
 
 def main():
-    ''' the main function'''
-
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', required=True, help='Path to input FASTA file')
     parser.add_argument('-o', required=True, help='Path to output FASTA file')
     parser.add_argument('-d', help='Path to discarded entries file')
     header_criteria = parser.add_mutually_exclusive_group()
     header_criteria.add_argument('--id_list', help='Path to the ID list file')
-    parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
+    parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry')
     header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
     sequence_criteria = parser.add_mutually_exclusive_group()
     sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
@@ -71,22 +69,20 @@
     parser.add_argument('--max_length', type=int, help='Maximum sequence length')
     parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
     options = parser.parse_args()
-    
-    
+
     if options.pattern:
-        pattern =  options.pattern 
-        if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
-            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
-            exit(1)
-    
+        if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
+            sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
+        pattern = re.compile(options.pattern)
+
     if options.min_length is not None and options.max_length is None:
         options.max_length = sys.maxsize
     if options.header_regexp:
-        regexp = re.compile(options.header_regexp)
+        header_regexp = re.compile(options.header_regexp)
     if options.sequence_regexp:
-        regexp = re.compile(options.sequence_regexp)
+        sequence_regexp = re.compile(options.sequence_regexp)
 
-    work_summary = {'found': 0}
+    work_summary = {'found': 0, 'discarded': 0}
 
     if options.dedup:
         used_sequences = set()
@@ -95,7 +91,7 @@
     if options.id_list:
         targets = []
         with open(options.id_list) as f_target:
-            for line in f_target.readlines():
+            for line in f_target:
                 targets.append(line.strip().upper())
         work_summary['wanted'] = len(targets)
 
@@ -107,22 +103,20 @@
         for entry in homd_db:
             print_entry = True
             if options.id_list:
-                target_matched_results = target_match(targets, entry.header, pattern=pattern)
+                target_matched_results = target_match(targets, entry.header, pattern)
                 if target_matched_results:
-                    work_summary['found'] += 1
                     targets.remove(target_matched_results)
                 else:
                     print_entry = False
-            
             elif options.header_regexp:
-                if regexp.search(entry.header) is None:
+                if header_regexp.search(entry.header) is None:
                     print_entry = False
             if options.min_length is not None:
                 sequence_length = len(entry.sequence)
                 if not(options.min_length <= sequence_length <= options.max_length):
                     print_entry = False
             elif options.sequence_regexp:
-                if regexp.search(entry.sequence) is None:
+                if sequence_regexp.search(entry.sequence) is None:
                     print_entry = False
             if print_entry:
                 if options.dedup:
@@ -131,9 +125,15 @@
                         continue
                     else:
                         used_sequences.add(entry.sequence)
+                work_summary['found'] += 1
                 entry.print(output)
-            elif options.d:
-                entry.print(discarded)
+            else:
+                work_summary['discarded'] += 1
+                if options.d:
+                    entry.print(discarded)
+
+    if options.d:
+        discarded.close()
 
     for parm, count in work_summary.items():
         print('%s ==> %d' % (parm, count))