Mercurial > repos > galaxyp > filter_by_fasta_ids

--- a/filter_by_fasta_ids.py	Fri Feb 15 16:38:31 2019 -0500
+++ b/filter_by_fasta_ids.py	Thu Apr 18 02:45:18 2019 -0400
@@ -41,29 +41,27 @@
             yield Sequence(header, sequence_parts)


-def target_match(targets, search_entry, pattern='>([^| ]+)'):
+def target_match(targets, search_entry, pattern):
     ''' Matches '''
     search_entry = search_entry.upper()
-    m = re.search(pattern,search_entry)
+    m = pattern.search(search_entry)
     if m:
         target = m.group(len(m.groups()))
         if target in targets:
             return target
     else:
-         print( 'No ID match: %s' % search_entry, file=sys.stdout)
+        print('No ID match: %s' % search_entry, file=sys.stdout)
     return None


 def main():
-    ''' the main function'''
-
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', required=True, help='Path to input FASTA file')
     parser.add_argument('-o', required=True, help='Path to output FASTA file')
     parser.add_argument('-d', help='Path to discarded entries file')
     header_criteria = parser.add_mutually_exclusive_group()
     header_criteria.add_argument('--id_list', help='Path to the ID list file')
-    parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
+    parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry')
     header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
     sequence_criteria = parser.add_mutually_exclusive_group()
     sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
@@ -71,22 +69,20 @@
     parser.add_argument('--max_length', type=int, help='Maximum sequence length')
     parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
     options = parser.parse_args()
-
-
+
     if options.pattern:
-        pattern =  options.pattern
-        if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
-            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
-            exit(1)
-
+        if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
+            sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
+        pattern = re.compile(options.pattern)
+
     if options.min_length is not None and options.max_length is None:
         options.max_length = sys.maxsize
     if options.header_regexp:
-        regexp = re.compile(options.header_regexp)
+        header_regexp = re.compile(options.header_regexp)
     if options.sequence_regexp:
-        regexp = re.compile(options.sequence_regexp)
+        sequence_regexp = re.compile(options.sequence_regexp)

-    work_summary = {'found': 0}
+    work_summary = {'found': 0, 'discarded': 0}

     if options.dedup:
         used_sequences = set()
@@ -95,7 +91,7 @@
     if options.id_list:
         targets = []
         with open(options.id_list) as f_target:
-            for line in f_target.readlines():
+            for line in f_target:
                 targets.append(line.strip().upper())
         work_summary['wanted'] = len(targets)

@@ -107,22 +103,20 @@
         for entry in homd_db:
             print_entry = True
             if options.id_list:
-                target_matched_results = target_match(targets, entry.header, pattern=pattern)
+                target_matched_results = target_match(targets, entry.header, pattern)
                 if target_matched_results:
-                    work_summary['found'] += 1
                     targets.remove(target_matched_results)
                 else:
                     print_entry = False
-
             elif options.header_regexp:
-                if regexp.search(entry.header) is None:
+                if header_regexp.search(entry.header) is None:
                     print_entry = False
             if options.min_length is not None:
                 sequence_length = len(entry.sequence)
                 if not(options.min_length <= sequence_length <= options.max_length):
                     print_entry = False
             elif options.sequence_regexp:
-                if regexp.search(entry.sequence) is None:
+                if sequence_regexp.search(entry.sequence) is None:
                     print_entry = False
             if print_entry:
                 if options.dedup:
@@ -131,9 +125,15 @@
                         continue
                     else:
                         used_sequences.add(entry.sequence)
+                work_summary['found'] += 1
                 entry.print(output)
-            elif options.d:
-                entry.print(discarded)
+            else:
+                work_summary['discarded'] += 1
+                if options.d:
+                    entry.print(discarded)
+
+    if options.d:
+        discarded.close()

     for parm, count in work_summary.items():
         print('%s ==> %d' % (parm, count))
--- a/filter_by_fasta_ids.xml	Fri Feb 15 16:38:31 2019 -0500
+++ b/filter_by_fasta_ids.xml	Thu Apr 18 02:45:18 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.1">
+<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.2">
     <description>on the headers and/or the sequences</description>
     <macros>
         <xml name="regexp_macro" token_label="Regular expression pattern">
@@ -25,8 +25,8 @@
     --id_list '$header_criteria.identifiers'
     #if $header_criteria.id_regex.find == 'pattern':
         --pattern '$header_criteria.id_regex.pattern'
-    #elif $header_criteria.id_regex.find == 'beginning':
-        --pattern '$header_criteria.id_regex.pattern'
+    #else:
+        --pattern '>([^| ]+)'
     #end if
 #elif $header_criteria.header_criteria_select == 'regexp'
     --header_regexp '$header_criteria.regexp'
@@ -56,30 +56,21 @@
             <when value="" />
             <when value="id_list">
                 <param name="identifiers" type="data" format="txt" label="List of IDs to extract sequences for"/>
-
-
                 <conditional name="id_regex">
-                    <param name="find" type="select" label="Match IDs by">
+                    <param name="find" type="select" label="Match IDs by"
+                        help="Default: &gt;ID will use search pattern >([^| ]+) to input ID; Use custom regex to change">
                         <option value="beginning">Default: ID is expected at the beginning: &gt;ID </option>
-                        <help>Default: &gt;ID will use search pattern >([^| ]+) to input ID; Use custom regex to change</help>
                         <option value="pattern">Custom regex pattern</option>
                     </param>
-                    <when value="beginning">
-                        <param name="pattern" type="hidden" value=">([^| ]+)" label="regex search pattern for ID" >
-                            <sanitizer sanitize="False"/>
-                            <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator>
-                        </param>
-                    </when>
+                    <when value="beginning" />
                     <when value="pattern">
-                        <param name="pattern" type="text" value="" label="regex search pattern for ID">
-                            <help>search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$ </help>
+                        <param name="pattern" type="text" value="" label="Regex search pattern for ID"
+                            help="Search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$">
                             <sanitizer sanitize="False"/>
                             <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator>
                         </param>
                     </when>
                 </conditional>
-
-
             </when>
             <when value="regexp">
                 <expand macro="regexp_macro" label="Regular expression pattern the header should match" />
@@ -137,8 +128,10 @@
         </test>
         <test expect_num_outputs="2">
             <param name="input" ftype="fasta" value="input.fasta" />
-            <param name="header_criteria_select" value="regexp" />
-            <param name="regexp" value="2" />
+            <conditional name="header_criteria">
+                <param name="header_criteria_select" value="regexp" />
+                <param name="regexp" value="2" />
+            </conditional>
             <param name="dedup" value="False" />
             <param name="output_discarded" value="True" />
             <output name="output" file="output_header_regexp.fasta" />
@@ -164,13 +157,30 @@
         </test>
         <test expect_num_outputs="2">
             <param name="input" ftype="fasta" value="input.fasta" />
-            <param name="sequence_criteria_select" value="regexp" />
-            <param name="regexp" value="T{2,}" />
+            <conditional name="sequence_criteria">
+                <param name="sequence_criteria_select" value="regexp" />
+                <param name="regexp" value="T{2,}" />
+            </conditional>
             <param name="dedup" value="False" />
             <param name="output_discarded" value="True" />
             <output name="output" file="output_sequence_regexp.fasta" />
             <output name="discarded" file="discarded_sequence_regexp.fasta" />
         </test>
+        <test expect_num_outputs="2">
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <conditional name="header_criteria">
+                <param name="header_criteria_select" value="regexp" />
+                <param name="regexp" value="3|5" />
+            </conditional>
+            <conditional name="sequence_criteria">
+                <param name="sequence_criteria_select" value="regexp" />
+                <param name="regexp" value="ACGT" />
+            </conditional>
+            <param name="dedup" value="False" />
+            <param name="output_discarded" value="True" />
+            <output name="output" file="output_header_regexp_sequence_regexp.fasta" />
+            <output name="discarded" file="discarded_header_regexp_sequence_regexp.fasta" />
+        </test>
     </tests>
     <help><![CDATA[
 **What it does**
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded_header_regexp_sequence_regexp.fasta	Thu Apr 18 02:45:18 2019 -0400
@@ -0,0 +1,12 @@
+>1
+TGAC
+>2
+AAAAAAAA
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
+>5
+TTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_header_regexp_sequence_regexp.fasta	Thu Apr 18 02:45:18 2019 -0400
@@ -0,0 +1,2 @@
+>3
+ACGT