Mercurial > repos > galaxyp > filter_by_fasta_ids
annotate filter_by_fasta_ids.py @ 1:8d15aebf55fd draft
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
author | galaxyp |
---|---|
date | Tue, 24 May 2016 13:05:22 -0400 |
parents | |
children | 1bd985f14938 |
rev | line source |
---|---|
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
1 #!/usr/bin/env python |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
2 """ A script to build specific fasta databases """ |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
3 from __future__ import print_function |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
4 import optparse |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
5 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
6 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
7 # ===================================== Iterator =============================== |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
class Sequence(object):
    """Hold one FASTA record: its header line and its raw sequence lines.

    Attributes:
        header: Header text of the record. Stored exactly as given.
        sequence_parts: List of raw sequence lines, possibly still carrying
            their line endings; they are cleaned only in ``get_sequence``.
    """

    def __init__(self, header="", sequence_parts=None):
        """Create a record.

        Both arguments are optional, so ``Sequence()`` still yields an empty
        record that the caller fills in afterwards.

        Args:
            header: Initial header text (default: empty string).
            sequence_parts: Optional iterable of raw sequence lines. It is
                copied, so the record never shares a list with the caller.
        """
        self.header = header
        self.sequence_parts = [] if sequence_parts is None else list(sequence_parts)

    def get_sequence(self):
        """Return the sequence lines joined into one string.

        Trailing whitespace is stripped from every part and any remaining
        newline / carriage-return characters inside a part are removed.
        Other interior whitespace is kept.
        """
        return "".join(
            part.rstrip().replace('\n', '').replace('\r', '')
            for part in self.sequence_parts
        )

    def __repr__(self):
        return "%s(header=%r, parts=%d)" % (
            type(self).__name__, self.header, len(self.sequence_parts))
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
16 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
17 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
class FASTAReader(object):
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    The reader keeps a one-line lookahead in ``next_line``: it always holds
    the line that starts the next record, or an empty string once the file
    is exhausted. The underlying file is closed automatically when iteration
    reaches the end; ``close()`` or a ``with`` block releases it earlier.
    """

    def __init__(self, fasta_name):
        """Open *fasta_name* and read its first line as the lookahead."""
        self.fasta_file = open(fasta_name)
        self.next_line = self.fasta_file.readline()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record as a ``Sequence``.

        The lookahead line becomes the header (trailing whitespace and any
        newline / carriage-return characters removed; it is not checked for
        a leading '>'). Every following line up to, but excluding, the next
        line that starts with '>' is appended unmodified to
        ``sequence_parts``.

        Raises:
            StopIteration: when no lines are left; the file is closed first.
        """
        header_line = self.next_line
        if not header_line:
            # Nothing left to read: release the handle instead of leaking it.
            self.close()
            raise StopIteration

        seq = Sequence()
        seq.header = header_line.rstrip().replace('\n', '').replace('\r', '')

        line = self.fasta_file.readline()
        while line and line[0] != '>':
            seq.sequence_parts.append(line)
            line = self.fasta_file.readline()
        # Either the next header line or '' at end of file.
        self.next_line = line
        return seq

    # Python 2/3 compat
    next = __next__

    def close(self):
        """Close the underlying file; safe to call more than once."""
        # Clearing the lookahead makes any later next() stop cleanly rather
        # than try to read from the closed file.
        self.next_line = ""
        self.fasta_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
47 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
48 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
def target_match(target, search_entry):
    """Return the first entry of *target* that occurs in *search_entry*.

    Args:
        target: Iterable of strings to look for. They are compared as given,
            so they must already be upper-case to be able to match.
        search_entry: Text to search in (e.g. a FASTA header); it is
            upper-cased before comparison.

    Returns:
        The first matching element of *target* in iteration order, or
        ``None`` when nothing matches.

    Note:
        Matching is by substring, so a target that is a prefix of a longer
        identifier also matches that longer identifier.
    """
    search_entry = search_entry.upper()
    for atarget in target:
        if atarget in search_entry:
            return atarget
    return None
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
56 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
57 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
def main():
    """Filter a FASTA file down to the records whose header matches an ID list.

    Positional command-line arguments:
        1. ID file: one identifier per line (blank lines are ignored).
        2. FASTA file to search.
        3. Output FASTA file to write.

    Each identifier is upper-cased and prefixed with '>' and then looked up
    in the upper-cased header of every FASTA record. A matched identifier is
    removed from the wanted list, so every identifier selects at most one
    record. With ``--dedup`` a record whose sequence was already written is
    counted as a duplicate and skipped. A short summary is printed to stdout.
    """

    parser = optparse.OptionParser(
        usage='%prog [--dedup] ID_FILE FASTA_FILE OUTPUT_FILE')
    parser.add_option('--dedup', dest='dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    (options, args) = parser.parse_args()
    if len(args) < 3:
        # Report a usage error instead of failing later with an IndexError.
        parser.error('expected ID_FILE, FASTA_FILE and OUTPUT_FILE')

    targets = []

    with open(args[0]) as f_target:
        for line in f_target:
            identifier = line.strip().upper()
            # A blank line would become the target ">", which matches every
            # header and would pull an arbitrary record into the output.
            if identifier:
                targets.append(">%s" % identifier)

    print('Read target file, now looking for %d sequences.' % len(targets))

    work_summary = {'wanted': len(targets), 'found': 0}
    if options.dedup:
        used_sequences = set()
        work_summary['duplicates'] = 0
    homd_db = FASTAReader(args[1])

    try:
        with open(args[2], "w") as output:
            for entry in homd_db:
                if not targets:
                    # Every wanted ID has been matched; nothing further in
                    # the database can be selected.
                    break
                target_matched_results = target_match(targets, entry.header)
                if target_matched_results:
                    work_summary['found'] += 1
                    targets.remove(target_matched_results)
                    sequence = entry.get_sequence()
                    if options.dedup:
                        if sequence in used_sequences:
                            work_summary['duplicates'] += 1
                            continue
                        else:
                            used_sequences.add(sequence)
                    print(entry.header, file=output)
                    print(sequence, file=output)
    finally:
        # FASTAReader opens the file in its constructor; close it here so
        # the handle is released even if filtering stops early or fails.
        homd_db.fasta_file.close()

    print('Completed filtering.')
    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))


if __name__ == "__main__":
    main()