diff filter_by_fasta_ids.py @ 0:794a6e864a96 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
author earlhaminst
date Thu, 15 Dec 2016 11:04:25 -0500
parents
children b3833e5b50d4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_fasta_ids.py	Thu Dec 15 11:04:25 2016 -0500
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+""" A script to build specific fasta databases """
+from __future__ import print_function
+
+
+import logging
+import sys
+
+
+# ===================================== Iterator ===============================
+class Sequence:
+    ''' Holds protein sequence information '''
+    def __init__(self):
+        self.header = ""
+        self.sequence_parts = []
+
+    def get_sequence(self):
+        return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
+
+
+class FASTAReader:
+    """
+        FASTA db iterator. Returns a single FASTA sequence object.
+    """
+    def __init__(self, fasta_name):
+        self.fasta_file = open(fasta_name)
+        self.next_line = self.fasta_file.readline()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ''' Iteration '''
+        # while True:
+        #    line = self.fasta_file.readline()
+        #    if not line:
+        #        raise StopIteration
+        #    if line[0] == '>':
+        #        break
+        next_line = self.next_line
+        if not next_line:
+            raise StopIteration
+
+        seq = Sequence()
+        seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
+
+        next_line = self.fasta_file.readline()
+        while next_line and next_line[0] != '>':
+            # tail = self.fasta_file.tell()
+            # line = self.fasta_file.readline()
+            # if not line:
+            #   break
+            # if line[0] == '>':
+            #   self.fasta_file.seek(tail)
+            #   break
+            seq.sequence_parts.append(next_line)
+            next_line = self.fasta_file.readline()
+        self.next_line = next_line
+        return seq
+
+    # Python 2/3 compat
+    next = __next__
+# ==============================================================================
+
+
+def target_match(target, search_entry):
+    ''' Matches '''
+    search_entry = search_entry.upper()
+    for atarget in target:
+        if search_entry.find(atarget) > -1:
+            return atarget
+    return None
+
+
+def main():
+    ''' the main function'''
+    logging.basicConfig(filename='filter_fasta_log',
+                        level=logging.INFO,
+                        format='%(asctime)s :: %(levelname)s :: %(message)s',)
+
+    used_sequences = set()
+    work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0}
+    targets = []
+
+    f_target = open(sys.argv[1])
+    for line in f_target.readlines():
+        targets.append(">%s" % line.strip().upper())
+    f_target.close()
+
+    work_summary['wanted'] = len(targets)
+    homd_db = FASTAReader(sys.argv[2])
+
+    # output = open(sys.argv[3], "w")
+    for entry in homd_db:
+        target_matched_results = target_match(targets, entry.header)
+        if target_matched_results:
+            work_summary['found'] += 1
+            targets.remove(target_matched_results)
+            sequence = entry.get_sequence()
+            used_sequences.add(sequence)
+            print(entry.header)
+            print(sequence)
+    for parm, count in work_summary.items():
+        logging.info('%s ==> %d', parm, count)
+
+
+if __name__ == "__main__":
+    main()