annotate filter_by_fasta_ids.py @ 1:8d15aebf55fd draft

planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
author galaxyp
date Tue, 24 May 2016 13:05:22 -0400
parents
children 1bd985f14938
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
2 """ A script to build specific fasta databases """
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
3 from __future__ import print_function
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
4 import optparse
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
5
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
6
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
7 # ===================================== Iterator ===============================
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
class Sequence:
    """One FASTA record: a header line plus the raw lines of its sequence."""

    def __init__(self):
        # Header line of the record (empty until assigned).
        self.header = ""
        # Sequence lines in the order they were appended; they may still
        # carry their line endings.
        self.sequence_parts = []

    def get_sequence(self):
        """Return all sequence parts concatenated into one string.

        Trailing whitespace is stripped from every part, and any newline or
        carriage-return characters left inside a part are removed, before
        the parts are joined.
        """
        cleaned = []
        for part in self.sequence_parts:
            text = part.rstrip()
            text = text.replace('\n', '')
            text = text.replace('\r', '')
            cleaned.append(text)
        return "".join(cleaned)
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
16
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
17
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
class FASTAReader:
    """
    FASTA db iterator. Each iteration step returns a single Sequence object.

    The header of a record is the current line with trailing whitespace and
    line-ending characters removed; every following line up to (but not
    including) the next line starting with '>' is appended unmodified to the
    record's ``sequence_parts``.

    The underlying file is closed automatically once the last record has
    been returned. Call ``close()`` or use the reader in a ``with`` statement
    to release the file earlier.
    """
    def __init__(self, fasta_name):
        """Open ``fasta_name`` for reading and prefetch its first line."""
        self.fasta_file = open(fasta_name)
        try:
            self.next_line = self.fasta_file.readline()
        except Exception:
            # Do not leak the handle if the very first read fails.
            self.fasta_file.close()
            raise

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with-block.
        return False

    def close(self):
        """Close the underlying file and end iteration; safe to call twice."""
        # Clearing the look-ahead line makes a later next() raise
        # StopIteration instead of reading from a closed file.
        self.next_line = ''
        self.fasta_file.close()

    def __next__(self):
        ''' Return the next record, or raise StopIteration at end of file. '''
        next_line = self.next_line
        if not next_line:
            # readline() returned '' => end of file: release the handle.
            self.close()
            raise StopIteration

        seq = Sequence()
        seq.header = next_line.rstrip().replace('\n', '').replace('\r', '')

        # Collect lines until the next header line or end of file.
        next_line = self.fasta_file.readline()
        while next_line and next_line[0] != '>':
            seq.sequence_parts.append(next_line)
            next_line = self.fasta_file.readline()
        # Keep the look-ahead line (next header or '') for the following call.
        self.next_line = next_line
        return seq

    # Python 2/3 compat
    next = __next__
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
47
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
48
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
def target_match(target, search_entry):
    """Return the first item of ``target`` found inside ``search_entry``.

    ``search_entry`` is upper-cased before the comparison; the items of
    ``target`` are compared as given. Items are tried in iteration order and
    the first one that occurs as a substring is returned. Returns None when
    no item occurs in the entry.
    """
    haystack = search_entry.upper()
    hits = (candidate for candidate in target if candidate in haystack)
    return next(hits, None)
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
56
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
57
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
def _read_targets(path):
    ''' Return the search keys ('>' + upper-cased ID) listed one per line in path. '''
    targets = []
    with open(path) as f_target:
        for line in f_target:
            identifier = line.strip()
            # A blank line would otherwise become the bare key '>', which is
            # contained in any header that has a '>' and yields a false match.
            if identifier:
                targets.append(">%s" % identifier.upper())
    return targets


def main():
    ''' Filter a FASTA file down to the records whose header contains a wanted ID.

    Positional command-line arguments:
      1. text file with one ID per line,
      2. FASTA file to search,
      3. path of the FASTA file to write.

    With --dedup, a matching record whose sequence was already written is
    counted as a duplicate and not written again. A short summary is printed
    to standard output when filtering is finished.
    '''

    parser = optparse.OptionParser(usage='%prog [--dedup] ID_LIST FASTA_IN FASTA_OUT')
    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    (options, args) = parser.parse_args()

    # Report missing arguments as a usage error rather than an IndexError.
    if len(args) < 3:
        parser.error('expected three arguments: ID_LIST FASTA_IN FASTA_OUT')

    targets = _read_targets(args[0])

    print('Read target file, now looking for %d sequences.' % len(targets))

    work_summary = {'wanted': len(targets), 'found': 0}
    if options.dedup:
        used_sequences = set()
        work_summary['duplicates'] = 0
    homd_db = FASTAReader(args[1])

    try:
        with open(args[2], "w") as output:
            for entry in homd_db:
                # Nothing left to look for: the rest of the database cannot
                # produce any further match.
                if not targets:
                    break
                target_matched_results = target_match(targets, entry.header)
                if target_matched_results:
                    work_summary['found'] += 1
                    # Each ID is satisfied by the first record that matches it.
                    targets.remove(target_matched_results)
                    sequence = entry.get_sequence()
                    if options.dedup:
                        if sequence in used_sequences:
                            work_summary['duplicates'] += 1
                            continue
                        else:
                            used_sequences.add(sequence)
                    print(entry.header, file=output)
                    print(sequence, file=output)
    finally:
        # The reader owns an open file handle; release it on success or error.
        homd_db.fasta_file.close()

    print('Completed filtering.')
    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
98
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
# Run the filter only when this file is executed as a script, so that
# importing it does not trigger argument parsing or file I/O.
if __name__ == "__main__":
    main()