comparison filter_by_fasta_ids.py @ 3:78dd29aa7fc1 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author earlhaminst
date Mon, 20 Feb 2017 06:25:50 -0500
parents b3833e5b50d4
children fa59d6fea7f5
comparison
equal deleted inserted replaced
2:df6527887a18 3:78dd29aa7fc1
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ A script to build specific fasta databases """ 2 """ A script to build specific fasta databases """
3 from __future__ import print_function 3 from __future__ import print_function
4 4
5 5 import collections
6 import logging
7 import sys 6 import sys
8 7
9 8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
10 # ===================================== Iterator ===============================
11 class Sequence:
12 ''' Holds protein sequence information '''
13 def __init__(self):
14 self.header = ""
15 self.sequence_parts = []
16
17 def get_sequence(self):
18 return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
19 9
20 10
21 class FASTAReader: 11 def FASTAReader_gen(fasta_filename):
22 """ 12 fasta_file = open(fasta_filename)
23 FASTA db iterator. Returns a single FASTA sequence object. 13 line = fasta_file.readline()
24 """ 14 while True:
25 def __init__(self, fasta_name): 15 if not line:
26 self.fasta_file = open(fasta_name) 16 return
27 self.next_line = self.fasta_file.readline() 17 assert line.startswith('>'), "FASTA headers must start with >"
28 18 header = line.rstrip()
29 def __iter__(self): 19 sequence_parts = []
30 return self 20 line = fasta_file.readline()
31 21 while line and line[0] != '>':
32 def __next__(self): 22 sequence_parts.append(line.rstrip())
33 ''' Iteration ''' 23 line = fasta_file.readline()
34 # while True: 24 sequence = "".join(sequence_parts)
35 # line = self.fasta_file.readline() 25 yield Sequence(header, sequence)
36 # if not line:
37 # raise StopIteration
38 # if line[0] == '>':
39 # break
40 next_line = self.next_line
41 if not next_line:
42 raise StopIteration
43
44 seq = Sequence()
45 seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
46
47 next_line = self.fasta_file.readline()
48 while next_line and next_line[0] != '>':
49 # tail = self.fasta_file.tell()
50 # line = self.fasta_file.readline()
51 # if not line:
52 # break
53 # if line[0] == '>':
54 # self.fasta_file.seek(tail)
55 # break
56 seq.sequence_parts.append(next_line)
57 next_line = self.fasta_file.readline()
58 self.next_line = next_line
59 return seq
60
61 # Python 2/3 compat
62 next = __next__
63 # ==============================================================================
64 26
65 27
66 def target_match(target, search_entry): 28 def target_match(target, search_entry):
67 ''' Matches ''' 29 ''' Matches '''
68 search_entry = search_entry.upper() 30 search_entry = search_entry.upper()
72 return None 34 return None
73 35
74 36
75 def main(): 37 def main():
76 ''' the main function''' 38 ''' the main function'''
77 logging.basicConfig(filename='filter_fasta_log',
78 level=logging.INFO,
79 format='%(asctime)s :: %(levelname)s :: %(message)s',)
80 39
81 used_sequences = set() 40 used_sequences = set()
82 work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0} 41 work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0}
83 targets = [] 42 targets = []
84 43
85 with open(sys.argv[1]) as f_target: 44 with open(sys.argv[1]) as f_target:
86 for line in f_target.readlines(): 45 for line in f_target.readlines():
87 targets.append(">%s" % line.strip().upper()) 46 targets.append(">%s" % line.strip().upper())
88 47
89 work_summary['wanted'] = len(targets) 48 work_summary['wanted'] = len(targets)
90 homd_db = FASTAReader(sys.argv[2])
91 49
92 # output = open(sys.argv[3], "w") 50 # output = open(sys.argv[3], "w")
93 for entry in homd_db: 51 for entry in FASTAReader_gen(sys.argv[2]):
94 target_matched_results = target_match(targets, entry.header) 52 target_matched_results = target_match(targets, entry.header)
95 if target_matched_results: 53 if target_matched_results:
96 work_summary['found'] += 1 54 work_summary['found'] += 1
97 targets.remove(target_matched_results) 55 targets.remove(target_matched_results)
98 sequence = entry.get_sequence() 56 sequence = entry.sequence
99 used_sequences.add(sequence) 57 used_sequences.add(sequence)
100 print(entry.header) 58 print(entry.header)
101 print(sequence) 59 print(sequence)
102 for parm, count in work_summary.items():
103 logging.info('%s ==> %d', parm, count)
104 60
105 61
106 if __name__ == "__main__": 62 if __name__ == "__main__":
107 main() 63 main()