Mercurial > repos > earlhaminst > t_coffee
comparison filter_by_fasta_ids.py @ 3:78dd29aa7fc1 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author | earlhaminst |
---|---|
date | Mon, 20 Feb 2017 06:25:50 -0500 |
parents | b3833e5b50d4 |
children | fa59d6fea7f5 |
comparison
equal
deleted
inserted
replaced
2:df6527887a18 | 3:78dd29aa7fc1 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """ A script to build specific fasta databases """ | 2 """ A script to build specific fasta databases """ |
3 from __future__ import print_function | 3 from __future__ import print_function |
4 | 4 |
5 | 5 import collections |
6 import logging | |
7 import sys | 6 import sys |
8 | 7 |
9 | 8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) |
10 # ===================================== Iterator =============================== | |
11 class Sequence: | |
12 ''' Holds protein sequence information ''' | |
13 def __init__(self): | |
14 self.header = "" | |
15 self.sequence_parts = [] | |
16 | |
17 def get_sequence(self): | |
18 return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts]) | |
19 | 9 |
20 | 10 |
21 class FASTAReader: | 11 def FASTAReader_gen(fasta_filename): |
22 """ | 12 fasta_file = open(fasta_filename) |
23 FASTA db iterator. Returns a single FASTA sequence object. | 13 line = fasta_file.readline() |
24 """ | 14 while True: |
25 def __init__(self, fasta_name): | 15 if not line: |
26 self.fasta_file = open(fasta_name) | 16 return |
27 self.next_line = self.fasta_file.readline() | 17 assert line.startswith('>'), "FASTA headers must start with >" |
28 | 18 header = line.rstrip() |
29 def __iter__(self): | 19 sequence_parts = [] |
30 return self | 20 line = fasta_file.readline() |
31 | 21 while line and line[0] != '>': |
32 def __next__(self): | 22 sequence_parts.append(line.rstrip()) |
33 ''' Iteration ''' | 23 line = fasta_file.readline() |
34 # while True: | 24 sequence = "".join(sequence_parts) |
35 # line = self.fasta_file.readline() | 25 yield Sequence(header, sequence) |
36 # if not line: | |
37 # raise StopIteration | |
38 # if line[0] == '>': | |
39 # break | |
40 next_line = self.next_line | |
41 if not next_line: | |
42 raise StopIteration | |
43 | |
44 seq = Sequence() | |
45 seq.header = next_line.rstrip().replace('\n', '').replace('\r', '') | |
46 | |
47 next_line = self.fasta_file.readline() | |
48 while next_line and next_line[0] != '>': | |
49 # tail = self.fasta_file.tell() | |
50 # line = self.fasta_file.readline() | |
51 # if not line: | |
52 # break | |
53 # if line[0] == '>': | |
54 # self.fasta_file.seek(tail) | |
55 # break | |
56 seq.sequence_parts.append(next_line) | |
57 next_line = self.fasta_file.readline() | |
58 self.next_line = next_line | |
59 return seq | |
60 | |
61 # Python 2/3 compat | |
62 next = __next__ | |
63 # ============================================================================== | |
64 | 26 |
65 | 27 |
66 def target_match(target, search_entry): | 28 def target_match(target, search_entry): |
67 ''' Matches ''' | 29 ''' Matches ''' |
68 search_entry = search_entry.upper() | 30 search_entry = search_entry.upper() |
72 return None | 34 return None |
73 | 35 |
74 | 36 |
75 def main(): | 37 def main(): |
76 ''' the main function''' | 38 ''' the main function''' |
77 logging.basicConfig(filename='filter_fasta_log', | |
78 level=logging.INFO, | |
79 format='%(asctime)s :: %(levelname)s :: %(message)s',) | |
80 | 39 |
81 used_sequences = set() | 40 used_sequences = set() |
82 work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0} | 41 work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0} |
83 targets = [] | 42 targets = [] |
84 | 43 |
85 with open(sys.argv[1]) as f_target: | 44 with open(sys.argv[1]) as f_target: |
86 for line in f_target.readlines(): | 45 for line in f_target.readlines(): |
87 targets.append(">%s" % line.strip().upper()) | 46 targets.append(">%s" % line.strip().upper()) |
88 | 47 |
89 work_summary['wanted'] = len(targets) | 48 work_summary['wanted'] = len(targets) |
90 homd_db = FASTAReader(sys.argv[2]) | |
91 | 49 |
92 # output = open(sys.argv[3], "w") | 50 # output = open(sys.argv[3], "w") |
93 for entry in homd_db: | 51 for entry in FASTAReader_gen(sys.argv[2]): |
94 target_matched_results = target_match(targets, entry.header) | 52 target_matched_results = target_match(targets, entry.header) |
95 if target_matched_results: | 53 if target_matched_results: |
96 work_summary['found'] += 1 | 54 work_summary['found'] += 1 |
97 targets.remove(target_matched_results) | 55 targets.remove(target_matched_results) |
98 sequence = entry.get_sequence() | 56 sequence = entry.sequence |
99 used_sequences.add(sequence) | 57 used_sequences.add(sequence) |
100 print(entry.header) | 58 print(entry.header) |
101 print(sequence) | 59 print(sequence) |
102 for parm, count in work_summary.items(): | |
103 logging.info('%s ==> %d', parm, count) | |
104 | 60 |
105 | 61 |
106 if __name__ == "__main__": | 62 if __name__ == "__main__": |
107 main() | 63 main() |