Mercurial > repos > dereeper > ragoo
comparison RaGOO/ragoo_utilities/SeqReader.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 import gzip | |
2 | |
3 | |
class SeqReader:
    """
    Lazy reader for fasta files, either plain text or gzip-compressed.

    The parse methods are generators yielding one (header, sequence) tuple
    per record. The header is the full header line with trailing whitespace
    removed (the leading '>' is kept); the sequence is the concatenation of
    all following lines up to the next header, with all whitespace removed.
    """

    def __init__(self, in_file):
        """
        Initialize sequence file to be parsed.
        :param in_file: path of the fasta file, given as a string.
        :raises AttributeError: if in_file is not a string.
        """
        if not isinstance(in_file, str):
            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
        self.in_file = in_file

    @staticmethod
    def _parse_lines(lines):
        """
        Generator yielding header and sequence for each record found in
        an iterable of already-decoded text lines.
        :param lines: iterable of str lines (e.g. an open text file).
        :raises RuntimeError: if no line starting with '>' is found.
        """
        # Use one shared iterator so the second loop resumes right after
        # the first header instead of starting over.
        lines = iter(lines)

        # Find first header, skipping anything that precedes it.
        for line in lines:
            if line.startswith('>'):
                header = line.rstrip()
                break
        else:
            # Input exhausted (or empty) without ever seeing a header.
            error = """ This file provided is not in proper fasta format.
            In addition to the usual fasta conventions, be sure that there are
            no blank lines in the file.
            """
            raise RuntimeError(error)

        # Collect the pieces of the current sequence in a list and join once
        # per record; repeated string concatenation can degrade badly on
        # records spanning many lines.
        chunks = []
        for line in lines:
            if line.startswith('>'):
                # Once the sequence is over, (next header begins),
                # yield the previous header and its sequence.
                yield header, ''.join(chunks)
                header = line.rstrip()
                chunks = []
            else:
                # split() with no argument drops every run of whitespace,
                # including the trailing newline.
                chunks.append(''.join(line.split()))
        # The last record has no following header to trigger its yield.
        yield header, ''.join(chunks)

    def parse_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class.
        :raises RuntimeError: if the file contains no header line.
        """
        with open(self.in_file) as fasta_file:
            yield from self._parse_lines(fasta_file)

    def parse_gzip_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class. For gzipped fasta files.
        :raises RuntimeError: if the file contains no header line.
        """
        # gzip.open defaults to binary mode, so each line is decoded
        # from UTF-8 lazily as it is consumed.
        with gzip.open(self.in_file) as fasta_file:
            decoded_lines = (raw.decode('utf-8') for raw in fasta_file)
            yield from self._parse_lines(decoded_lines)