Mercurial > repos > earlhaminst > t_coffee
comparison filter_by_fasta_ids.py @ 0:794a6e864a96 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
author | earlhaminst |
---|---|
date | Thu, 15 Dec 2016 11:04:25 -0500 |
parents | |
children | b3833e5b50d4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:794a6e864a96 |
---|---|
1 #!/usr/bin/env python | |
2 """ A script to build specific fasta databases """ | |
3 from __future__ import print_function | |
4 | |
5 | |
6 import logging | |
7 import sys | |
8 | |
9 | |
10 # ===================================== Iterator =============================== | |
11 class Sequence: | |
12 ''' Holds protein sequence information ''' | |
13 def __init__(self): | |
14 self.header = "" | |
15 self.sequence_parts = [] | |
16 | |
17 def get_sequence(self): | |
18 return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts]) | |
19 | |
20 | |
21 class FASTAReader: | |
22 """ | |
23 FASTA db iterator. Returns a single FASTA sequence object. | |
24 """ | |
25 def __init__(self, fasta_name): | |
26 self.fasta_file = open(fasta_name) | |
27 self.next_line = self.fasta_file.readline() | |
28 | |
29 def __iter__(self): | |
30 return self | |
31 | |
32 def __next__(self): | |
33 ''' Iteration ''' | |
34 # while True: | |
35 # line = self.fasta_file.readline() | |
36 # if not line: | |
37 # raise StopIteration | |
38 # if line[0] == '>': | |
39 # break | |
40 next_line = self.next_line | |
41 if not next_line: | |
42 raise StopIteration | |
43 | |
44 seq = Sequence() | |
45 seq.header = next_line.rstrip().replace('\n', '').replace('\r', '') | |
46 | |
47 next_line = self.fasta_file.readline() | |
48 while next_line and next_line[0] != '>': | |
49 # tail = self.fasta_file.tell() | |
50 # line = self.fasta_file.readline() | |
51 # if not line: | |
52 # break | |
53 # if line[0] == '>': | |
54 # self.fasta_file.seek(tail) | |
55 # break | |
56 seq.sequence_parts.append(next_line) | |
57 next_line = self.fasta_file.readline() | |
58 self.next_line = next_line | |
59 return seq | |
60 | |
61 # Python 2/3 compat | |
62 next = __next__ | |
63 # ============================================================================== | |
64 | |
65 | |
66 def target_match(target, search_entry): | |
67 ''' Matches ''' | |
68 search_entry = search_entry.upper() | |
69 for atarget in target: | |
70 if search_entry.find(atarget) > -1: | |
71 return atarget | |
72 return None | |
73 | |
74 | |
75 def main(): | |
76 ''' the main function''' | |
77 logging.basicConfig(filename='filter_fasta_log', | |
78 level=logging.INFO, | |
79 format='%(asctime)s :: %(levelname)s :: %(message)s',) | |
80 | |
81 used_sequences = set() | |
82 work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0} | |
83 targets = [] | |
84 | |
85 f_target = open(sys.argv[1]) | |
86 for line in f_target.readlines(): | |
87 targets.append(">%s" % line.strip().upper()) | |
88 f_target.close() | |
89 | |
90 work_summary['wanted'] = len(targets) | |
91 homd_db = FASTAReader(sys.argv[2]) | |
92 | |
93 # output = open(sys.argv[3], "w") | |
94 for entry in homd_db: | |
95 target_matched_results = target_match(targets, entry.header) | |
96 if target_matched_results: | |
97 work_summary['found'] += 1 | |
98 targets.remove(target_matched_results) | |
99 sequence = entry.get_sequence() | |
100 used_sequences.add(sequence) | |
101 print(entry.header) | |
102 print(sequence) | |
103 for parm, count in work_summary.items(): | |
104 logging.info('%s ==> %d', parm, count) | |
105 | |
106 | |
107 if __name__ == "__main__": | |
108 main() |