comparison RaGOO/ragoo_utilities/SeqReader.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 import gzip
2
3
class SeqReader:
    """
    Reader for plain-text or gzip-compressed fasta files.

    The parsing methods are generators yielding ``(header, sequence)``
    tuples. ``header`` is the complete header line (leading '>' included)
    with trailing whitespace removed; ``sequence`` is the concatenation of
    the record's sequence lines with all whitespace removed.
    """

    # Message raised when the end of the file is reached without ever
    # seeing a line that starts with '>'.
    _NOT_FASTA_ERROR = (
        ' This file provided is not in proper fasta format.\n'
        'In addition to the usual fasta conventions, be sure that there are\n'
        'no blank lines in the file.\n'
    )

    def __init__(self, in_file):
        """
        Initialize sequence file to be parsed.

        :param in_file: path of the fasta file, as a string.
        :raises AttributeError: if ``in_file`` is not a ``str``.
        """
        # AttributeError (rather than TypeError) is kept on purpose so that
        # existing callers catching it keep working.
        if not isinstance(in_file, str):
            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
        self.in_file = in_file

    @staticmethod
    def _iter_records(lines):
        """
        Split an iterable of text lines into fasta records.

        :param lines: iterable of ``str`` lines (e.g. an open text file).
        :return: generator of ``(header, sequence)`` tuples.
        :raises RuntimeError: if no line starting with '>' is found.
        """
        # A single shared iterator lets the second loop resume exactly
        # where the header search stopped.
        lines = iter(lines)

        # Find first header, skipping anything that precedes it.
        header = None
        for line in lines:
            if line.startswith('>'):
                header = line.rstrip()
                break
        if header is None:
            raise RuntimeError(SeqReader._NOT_FASTA_ERROR)

        # Collect the pieces of the current sequence in a list and join them
        # once per record; repeated ``str +=`` can be quadratic on the very
        # long sequences typical of genome assemblies.
        chunks = []
        for line in lines:
            if line.startswith('>'):
                # Once the sequence is over (next header begins),
                # yield the finished record and start a new one.
                yield header, ''.join(chunks)
                header = line.rstrip()
                chunks = []
            else:
                # split() drops every whitespace character, including the
                # trailing newline, so no separate rstrip() is needed.
                chunks.append(''.join(line.split()))

        # The last record has no following header to trigger its yield.
        yield header, ''.join(chunks)

    def parse_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class.

        :raises RuntimeError: if the file contains no header line.
        """
        with open(self.in_file) as fasta_file:
            for record in self._iter_records(fasta_file):
                yield record

    def parse_gzip_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class. For gzipped fasta files.

        :raises RuntimeError: if the file contains no header line.
        """
        # The archive is read in binary mode and each line is decoded as
        # UTF-8 lazily, right before it is handed to the record splitter.
        with gzip.open(self.in_file) as fasta_file:
            decoded_lines = (raw.decode('utf-8') for raw in fasta_file)
            for record in self._iter_records(decoded_lines):
                yield record