Mercurial > repos > galaxyp > filter_by_fasta_ids
comparison filter_by_fasta_ids.py @ 1:8d15aebf55fd draft
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
author | galaxyp |
---|---|
date | Tue, 24 May 2016 13:05:22 -0400 |
parents | |
children | 1bd985f14938 |
comparison
equal
deleted
inserted
replaced
0:463ebeccb854 | 1:8d15aebf55fd |
---|---|
1 #!/usr/bin/env python | |
2 """ A script to build specific fasta databases """ | |
3 from __future__ import print_function | |
4 import optparse | |
5 | |
6 | |
7 # ===================================== Iterator =============================== | |
8 class Sequence: | |
9 ''' Holds protein sequence information ''' | |
10 def __init__(self): | |
11 self.header = "" | |
12 self.sequence_parts = [] | |
13 | |
14 def get_sequence(self): | |
15 return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts]) | |
16 | |
17 | |
18 class FASTAReader: | |
19 """ | |
20 FASTA db iterator. Returns a single FASTA sequence object. | |
21 """ | |
22 def __init__(self, fasta_name): | |
23 self.fasta_file = open(fasta_name) | |
24 self.next_line = self.fasta_file.readline() | |
25 | |
26 def __iter__(self): | |
27 return self | |
28 | |
29 def __next__(self): | |
30 ''' Iteration ''' | |
31 next_line = self.next_line | |
32 if not next_line: | |
33 raise StopIteration | |
34 | |
35 seq = Sequence() | |
36 seq.header = next_line.rstrip().replace('\n', '').replace('\r', '') | |
37 | |
38 next_line = self.fasta_file.readline() | |
39 while next_line and next_line[0] != '>': | |
40 seq.sequence_parts.append(next_line) | |
41 next_line = self.fasta_file.readline() | |
42 self.next_line = next_line | |
43 return seq | |
44 | |
45 # Python 2/3 compat | |
46 next = __next__ | |
47 | |
48 | |
49 def target_match(target, search_entry): | |
50 ''' Matches ''' | |
51 search_entry = search_entry.upper() | |
52 for atarget in target: | |
53 if search_entry.find(atarget) > -1: | |
54 return atarget | |
55 return None | |
56 | |
57 | |
58 def main(): | |
59 ''' the main function''' | |
60 | |
61 parser = optparse.OptionParser() | |
62 parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') | |
63 (options, args) = parser.parse_args() | |
64 | |
65 targets = [] | |
66 | |
67 with open(args[0]) as f_target: | |
68 for line in f_target.readlines(): | |
69 targets.append(">%s" % line.strip().upper()) | |
70 | |
71 print('Read target file, now looking for %d sequences.' % len(targets)) | |
72 | |
73 work_summary = {'wanted': len(targets), 'found': 0} | |
74 if options.dedup: | |
75 used_sequences = set() | |
76 work_summary['duplicates'] = 0 | |
77 homd_db = FASTAReader(args[1]) | |
78 | |
79 with open(args[2], "w") as output: | |
80 for entry in homd_db: | |
81 target_matched_results = target_match(targets, entry.header) | |
82 if target_matched_results: | |
83 work_summary['found'] += 1 | |
84 targets.remove(target_matched_results) | |
85 sequence = entry.get_sequence() | |
86 if options.dedup: | |
87 if sequence in used_sequences: | |
88 work_summary['duplicates'] += 1 | |
89 continue | |
90 else: | |
91 used_sequences.add(sequence) | |
92 print(entry.header, file=output) | |
93 print(sequence, file=output) | |
94 | |
95 print('Completed filtering.') | |
96 for parm, count in work_summary.items(): | |
97 print('%s ==> %d' % (parm, count)) | |
98 | |
99 if __name__ == "__main__": | |
100 main() |