diff filter_by_fasta_ids.py @ 3:3c623e81be77 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
author galaxyp
date Fri, 15 Feb 2019 16:38:31 -0500
parents 1bd985f14938
children cd22452edec2
line wrap: on
line diff
--- a/filter_by_fasta_ids.py	Sat Apr 28 03:49:28 2018 -0400
+++ b/filter_by_fasta_ids.py	Fri Feb 15 16:38:31 2019 -0500
@@ -41,17 +41,16 @@
             yield Sequence(header, sequence_parts)
 
 
-def target_match(targets, header):
+def target_match(targets, search_entry, pattern='>([^| ]+)'):
     ''' Matches '''
-    # Remove '>' and initial spaces from the header
-    header = header[1:].lstrip().upper()
-    # Search for an exact match among the targets
-    if header in targets:
-        return header
-    # Try to find an exact match for the first "word" in the header
-    header = header.split()[0]
-    if header in targets:
-        return header
+    search_entry = search_entry.upper()
+    m = re.search(pattern,search_entry)
+    if m:
+        target = m.group(len(m.groups()))
+        if target in targets:
+            return target
+    else:
+         print( 'No ID match: %s' % search_entry, file=sys.stdout)
     return None
 
 
@@ -64,6 +63,7 @@
     parser.add_argument('-d', help='Path to discarded entries file')
     header_criteria = parser.add_mutually_exclusive_group()
     header_criteria.add_argument('--id_list', help='Path to the ID list file')
+    parser.add_argument('--pattern', help='regex search pattern for ID in Fasta entry')
     header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
     sequence_criteria = parser.add_mutually_exclusive_group()
     sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
@@ -71,7 +71,14 @@
     parser.add_argument('--max_length', type=int, help='Maximum sequence length')
     parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
     options = parser.parse_args()
-
+    
+    
+    if options.pattern:
+        pattern =  options.pattern 
+        if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
+            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
+            exit(1)
+    
     if options.min_length is not None and options.max_length is None:
         options.max_length = sys.maxsize
     if options.header_regexp:
@@ -100,12 +107,13 @@
         for entry in homd_db:
             print_entry = True
             if options.id_list:
-                target_matched_results = target_match(targets, entry.header)
+                target_matched_results = target_match(targets, entry.header, pattern=pattern)
                 if target_matched_results:
                     work_summary['found'] += 1
                     targets.remove(target_matched_results)
                 else:
                     print_entry = False
+            
             elif options.header_regexp:
                 if regexp.search(entry.header) is None:
                     print_entry = False