comparison filter_by_fasta_ids.py @ 1:8d15aebf55fd draft

planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
author galaxyp
date Tue, 24 May 2016 13:05:22 -0400
parents
children 1bd985f14938
comparison
equal deleted inserted replaced
0:463ebeccb854 1:8d15aebf55fd
1 #!/usr/bin/env python
2 """ A script to build specific fasta databases """
3 from __future__ import print_function
4 import optparse
5
6
7 # ===================================== Iterator ===============================
8 class Sequence:
9 ''' Holds protein sequence information '''
10 def __init__(self):
11 self.header = ""
12 self.sequence_parts = []
13
14 def get_sequence(self):
15 return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])
16
17
18 class FASTAReader:
19 """
20 FASTA db iterator. Returns a single FASTA sequence object.
21 """
22 def __init__(self, fasta_name):
23 self.fasta_file = open(fasta_name)
24 self.next_line = self.fasta_file.readline()
25
26 def __iter__(self):
27 return self
28
29 def __next__(self):
30 ''' Iteration '''
31 next_line = self.next_line
32 if not next_line:
33 raise StopIteration
34
35 seq = Sequence()
36 seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')
37
38 next_line = self.fasta_file.readline()
39 while next_line and next_line[0] != '>':
40 seq.sequence_parts.append(next_line)
41 next_line = self.fasta_file.readline()
42 self.next_line = next_line
43 return seq
44
45 # Python 2/3 compat
46 next = __next__
47
48
49 def target_match(target, search_entry):
50 ''' Matches '''
51 search_entry = search_entry.upper()
52 for atarget in target:
53 if search_entry.find(atarget) > -1:
54 return atarget
55 return None
56
57
58 def main():
59 ''' the main function'''
60
61 parser = optparse.OptionParser()
62 parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
63 (options, args) = parser.parse_args()
64
65 targets = []
66
67 with open(args[0]) as f_target:
68 for line in f_target.readlines():
69 targets.append(">%s" % line.strip().upper())
70
71 print('Read target file, now looking for %d sequences.' % len(targets))
72
73 work_summary = {'wanted': len(targets), 'found': 0}
74 if options.dedup:
75 used_sequences = set()
76 work_summary['duplicates'] = 0
77 homd_db = FASTAReader(args[1])
78
79 with open(args[2], "w") as output:
80 for entry in homd_db:
81 target_matched_results = target_match(targets, entry.header)
82 if target_matched_results:
83 work_summary['found'] += 1
84 targets.remove(target_matched_results)
85 sequence = entry.get_sequence()
86 if options.dedup:
87 if sequence in used_sequences:
88 work_summary['duplicates'] += 1
89 continue
90 else:
91 used_sequences.add(sequence)
92 print(entry.header, file=output)
93 print(sequence, file=output)
94
95 print('Completed filtering.')
96 for parm, count in work_summary.items():
97 print('%s ==> %d' % (parm, count))
98
99 if __name__ == "__main__":
100 main()