# HG changeset patch
# User galaxyp
# Date 1464109522 14400
# Node ID 8d15aebf55fd85d9f9600108d2b2f32b12127d32
# Parent  463ebeccb8547f457923e4db9448c8f445b278d4
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30

diff -r 463ebeccb854 -r 8d15aebf55fd README.md
--- a/README.md	Fri Sep 26 14:23:16 2014 -0400
+++ b/README.md	Tue May 24 13:05:22 2016 -0400
@@ -1,7 +1,7 @@
 GalaxyP - Filter by FASTA IDs
 =============================
 
-* Home: <https://bitbucket.org/galaxyp/filter_by_fasta_ids>
+* Home: <https://github.com/galaxyproteomics/tools-galaxyp/>
 * Galaxy Tool Shed: <http://toolshed.g2.bx.psu.edu/view/galaxyp/filter_by_fasta_ids>
 * Tool ID: `filter_by_fasta_ids`
 
@@ -15,9 +15,9 @@
 GalaxyP Community
 -----------------
 
-Current governing community policies for [GalaxyP](https://bitbucket.org/galaxyp/) and other information can be found at:
+Current governing community policies for [GalaxyP](https://github.com/galaxyproteomics/) and other information can be found at:
 
-<https://bitbucket.org/galaxyp/galaxyp>
+<https://github.com/galaxyproteomics>
 
 
 License
@@ -35,7 +35,7 @@
 Contributing
 ------------
 
-Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://bitbucket.org/galaxyp/galaxyp/CONTRIBUTORS.md> unless you opt-out.
+Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in <https://github.com/galaxyproteomics/tools-galaxyp/>.
 
 
 Authors
diff -r 463ebeccb854 -r 8d15aebf55fd filter_by_fasta_ids.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_fasta_ids.py	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+""" A script to build specific fasta databases """
+from __future__ import print_function
+import optparse
+
+
+# ===================================== Iterator ===============================
+class Sequence:
+    ''' Holds one FASTA record: its header line and its raw sequence lines '''
+    def __init__(self):
+        # Header line with trailing whitespace stripped (set by FASTAReader).
+        self.header = ""
+        # Sequence lines exactly as read from the file; joined by get_sequence().
+        self.sequence_parts = []
+
+    def get_sequence(self):
+        ''' Return the sequence as a single string without any line breaks '''
+        return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
+
+
+class FASTAReader:
+    """
+        FASTA db iterator. Returns a single FASTA sequence object.
+
+        Also usable as a context manager, which closes the underlying file
+        when the ``with`` block is left.
+    """
+    def __init__(self, fasta_name):
+        self.fasta_file = open(fasta_name)
+        # One line of look-ahead: the first line of the next record, or an
+        # empty string once the end of the file has been reached.
+        self.next_line = self.fasta_file.readline()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def close(self):
+        ''' Close the underlying FASTA file '''
+        self.fasta_file.close()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Return the next record as a Sequence; raise StopIteration at end of file '''
+        next_line = self.next_line
+        if not next_line:
+            raise StopIteration
+
+        seq = Sequence()
+        seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
+
+        # Collect lines until the next header (a line starting with '>') or EOF.
+        next_line = self.fasta_file.readline()
+        while next_line and next_line[0] != '>':
+            seq.sequence_parts.append(next_line)
+            next_line = self.fasta_file.readline()
+        self.next_line = next_line
+        return seq
+
+    # Python 2/3 compat
+    next = __next__
+
+
+def target_match(target, search_entry):
+    ''' Return the first element of target found in the upper-cased search_entry, else None '''
+    search_entry = search_entry.upper()
+    for atarget in target:
+        if atarget in search_entry:
+            return atarget
+    return None
+
+
+def main():
+    ''' Write the FASTA records whose header matches one of the wanted IDs '''
+
+    parser = optparse.OptionParser(usage='%prog [--dedup] IDS_FILE FASTA_FILE OUTPUT_FILE')
+    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
+    (options, args) = parser.parse_args()
+    if len(args) < 3:
+        parser.error('expected IDS_FILE, FASTA_FILE and OUTPUT_FILE arguments')
+
+    # Blank lines are skipped: an empty ID would become the target ">", which
+    # is contained in every FASTA header line and would select an unrelated record.
+    with open(args[0]) as f_target:
+        targets = [">%s" % line.strip().upper() for line in f_target if line.strip()]
+
+    print('Read target file, now looking for %d sequences.' % len(targets))
+
+    work_summary = {'wanted': len(targets), 'found': 0}
+    if options.dedup:
+        used_sequences = set()
+        work_summary['duplicates'] = 0
+
+    with FASTAReader(args[1]) as homd_db, open(args[2], "w") as output:
+        for entry in homd_db:
+            if not targets:
+                # Every wanted ID has been found; skip the rest of the file.
+                break
+            target_matched_results = target_match(targets, entry.header)
+            if target_matched_results:
+                work_summary['found'] += 1
+                # Each ID selects at most one record.
+                targets.remove(target_matched_results)
+                sequence = entry.get_sequence()
+                if options.dedup:
+                    if sequence in used_sequences:
+                        work_summary['duplicates'] += 1
+                        continue
+                    else:
+                        used_sequences.add(sequence)
+                print(entry.header, file=output)
+                print(sequence, file=output)
+
+    print('Completed filtering.')
+    for parm, count in work_summary.items():
+        print('%s ==> %d' % (parm, count))
+
+if __name__ == "__main__":
+    main()
diff -r 463ebeccb854 -r 8d15aebf55fd filter_by_fasta_ids.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_fasta_ids.xml	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,52 @@
+<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs">
+    <description>Extract sequences from a FASTA file based on a list of IDs</description>
+    <command>
+<![CDATA[
+        python '$__tool_directory__/filter_by_fasta_ids.py'
+            $dedup
+            '$identifiers'
+            '$input'
+            '$output'
+]]>
+    </command>
+    <inputs>
+        <param format="fasta" name="input" type="data" label="FASTA sequences"/>
+        <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
+        <param name="dedup" type="boolean" truevalue="--dedup" falsevalue="" checked="true" label="Remove duplicate sequences" />
+    </inputs>
+    <outputs>
+        <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
+    </outputs>
+    <tests>
+        <!-- dedup left at its default (checked): ">2_bis" has the same sequence as ">2" and is dropped -->
+        <test>
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="identifiers" ftype="txt" value="ids.txt" />
+            <output name="output" file="output_dedup.fasta" />
+        </test>
+        <!-- dedup disabled: ">2_bis" is kept even though its sequence repeats ">2" -->
+        <test>
+            <param name="input" ftype="fasta" value="input.fasta" />
+            <param name="identifiers" ftype="txt" value="ids.txt" />
+            <param name="dedup" value="False" />
+            <output name="output" file="output_not_dedup.fasta" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+**What it does**
+
+Extract sequences from a FASTA file based on a list of IDs.
+
+The ID list contains one ID per line. An ID matches a record when ``>ID`` occurs
+in the record's header line (compared case-insensitively), and each ID selects at
+most one record. Because this is a substring match, an ID that is a prefix of
+another header (for example ``2`` and ``>2_bis``) can select that record if it
+appears first in the FASTA file.
+
+Each selected sequence is written on a single line. When *Remove duplicate
+sequences* is enabled, a record is skipped if an identical sequence has already
+been written.
+]]>
+    </help>
+</tool>
diff -r 463ebeccb854 -r 8d15aebf55fd test-data/.gitkeep
diff -r 463ebeccb854 -r 8d15aebf55fd test-data/ids.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ids.txt	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,5 @@
+2
+2_bis
+3
+4
+6
diff -r 463ebeccb854 -r 8d15aebf55fd test-data/input.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,14 @@
+>1
+TGAC
+>2
+AAAAAAAA
+>3
+ACGT
+>2_bis
+AAAA
+AAAA
+>4
+ACGT
+TGAC
+>5
+TTTT
diff -r 463ebeccb854 -r 8d15aebf55fd test-data/output_dedup.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_dedup.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,6 @@
+>2
+AAAAAAAA
+>3
+ACGT
+>4
+ACGTTGAC
diff -r 463ebeccb854 -r 8d15aebf55fd test-data/output_not_dedup.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_not_dedup.fasta	Tue May 24 13:05:22 2016 -0400
@@ -0,0 +1,8 @@
+>2
+AAAAAAAA
+>3
+ACGT
+>2_bis
+AAAAAAAA
+>4
+ACGTTGAC
diff -r 463ebeccb854 -r 8d15aebf55fd tool-data/.gitkeep
diff -r 463ebeccb854 -r 8d15aebf55fd tools/filter_by_fasta_ids.py
--- a/tools/filter_by_fasta_ids.py	Fri Sep 26 14:23:16 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-""" A script to build specific fasta databases """
-from __future__ import print_function
-import sys
-import logging
-
-#===================================== Iterator ===============================
-class Sequence:
-    ''' Holds protein sequence information '''
-    def __init__(self):
-        self.header = ""
-        self.sequence_parts = []
-
-    def get_sequence(self):
-        return "".join([line.rstrip().replace('\n','').replace('\r','') for line in self.sequence_parts])
-
-class FASTAReader:
-    """
-        FASTA db iterator. Returns a single FASTA sequence object.
-    """
-    def __init__(self, fasta_name):
-        self.fasta_file = open(fasta_name)
-        self.next_line = self.fasta_file.readline()
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        ''' Iteration '''
-        #while True:
-        #    line = self.fasta_file.readline()
-        #    if not line:
-        #        raise StopIteration
-        #    if line[0] == '>':
-        #        break
-        next_line = self.next_line
-        if not next_line:
-            raise StopIteration
-
-        seq = Sequence()
-        seq.header = next_line.rstrip().replace('\n','').replace('\r','')
-
-        next_line = self.fasta_file.readline()
-        while next_line and next_line[0] != '>':
-            #tail = self.fasta_file.tell()
-            #line = self.fasta_file.readline()
-            #if not line:
-            #    break
-            #if line[0] == '>':
-            #    self.fasta_file.seek(tail)
-            #    break
-            seq.sequence_parts.append(next_line)
-            next_line = self.fasta_file.readline()
-        self.next_line = next_line
-        return seq
-
-    # Python 2/3 compat
-    next = __next__
-#==============================================================================
-
-def target_match(target, search_entry):
-    ''' Matches '''
-    search_entry = search_entry.upper()
-    for atarget in target:
-        if search_entry.find(atarget) > -1:
-            return atarget
-    return None
-
-
-def main():
-    ''' the main function'''
-    logging.basicConfig(filename='filter_fasta_log',
-        level=logging.INFO,
-        format='%(asctime)s :: %(levelname)s :: %(message)s',)
-
-    used_sequences = set()
-    work_summary = {'wanted': 0, 'found':0, 'duplicates':0}
-    targets = []
-
-    f_target = open(sys.argv[1])
-    for line in f_target.readlines():
-        targets.append(">%s" % line.strip().upper())
-    f_target.close()
-
-    logging.info('Read target file and am now looking for %d %s', len(targets), 'sequences.')
-
-    work_summary['wanted'] = len(targets)
-    homd_db = FASTAReader(sys.argv[2])
-
-    i = 0
-    output = open(sys.argv[3], "w")
-    try:
-        for entry in homd_db:
-            target_matched_results = target_match(targets, entry.header)
-            if target_matched_results:
-                work_summary['found'] += 1
-                targets.remove(target_matched_results)
-                sequence = entry.get_sequence()
-                if sequence in used_sequences:
-                    work_summary['duplicates'] += 1
-                else:
-                    used_sequences.add(sequence)
-                    print(entry.header, file=output)
-                    print(sequence, file=output)
-    finally:
-        output.close()
-
-    logging.info('Completed filtering')
-    for parm, count in work_summary.items():
-        logging.info('%s ==> %d', parm, count)
-
-if __name__ == "__main__":
-    main()
diff -r 463ebeccb854 -r 8d15aebf55fd tools/filter_by_fasta_ids.xml
--- a/tools/filter_by_fasta_ids.xml	Fri Sep 26 14:23:16 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-<tool id="filter_by_fasta_ids" version="1.0" name="Filter by FASTA IDs">
-  <description>Extract sequences from a FASTA file based on a list of IDs</description>
-  <command interpreter="python">filter_by_fasta_ids.py $identifiers $input $output</command>
-  <inputs>
-    <param format="fasta" name="input" type="data" label="FASTA sequences"/>
-    <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
-  </inputs>
-  <outputs>
-    <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
-  </outputs>
-  <help>
-  </help>
-</tool>