13
|
1 import gzip
|
|
2
|
|
3
|
|
class SeqReader:
    """
    Reader for FASTA files, either plain text or gzip-compressed.

    The parsing methods are generators: records are produced one at a
    time, so only the record currently being assembled is held in memory.
    """

    def __init__(self, in_file):
        """
        Initialize sequence file to be parsed.

        :param in_file: path of the FASTA file, as a string. The file is
            not opened until one of the parse methods is iterated.
        :raises AttributeError: if in_file is not a string.
        """
        if not isinstance(in_file, str):
            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
        self.in_file = in_file

    def parse_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class.

        :return: yields (header, sequence) tuples. The header keeps its
            leading '>' and has trailing whitespace removed; the sequence
            is the record's lines concatenated with all whitespace removed.
        :raises RuntimeError: if the file contains no header line.
        """
        with open(self.in_file) as fasta_file:
            yield from self._parse_records(fasta_file)

    def parse_gzip_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class. For gzipped fasta files.

        :return: yields (header, sequence) tuples, in the same form as
            parse_fasta.
        :raises RuntimeError: if the file contains no header line.
        """
        # Text mode decodes UTF-8 once at the I/O boundary, so the shared
        # parser only ever sees str lines, exactly as in parse_fasta.
        with gzip.open(self.in_file, 'rt', encoding='utf-8') as fasta_file:
            yield from self._parse_records(fasta_file)

    @staticmethod
    def _parse_records(lines):
        """
        Turn an iterable of text lines into (header, sequence) tuples.

        :param lines: iterable of str lines, e.g. an open text file.
        :return: yields one (header, sequence) tuple per record.
        :raises RuntimeError: if no line starting with '>' is found.
        """
        # One shared iterator, so the sequence loop below resumes on the
        # line right after the first header.
        line_iter = iter(lines)

        # Find first header; anything before it is skipped.
        header = None
        for line in line_iter:
            if line.startswith('>'):
                header = line.rstrip()
                break
        if header is None:
            error = """ This file provided is not in proper fasta format.
            In addition to the usual fasta conventions, be sure that there are
            no blank lines in the file.
            """
            raise RuntimeError(error)

        # Get sequence associated with that header. Fragments are collected
        # in a list and joined once per record, instead of growing a string
        # with repeated concatenation.
        fragments = []
        for line in line_iter:
            if line.startswith('>'):
                # Once the sequence is over, (next header begins),
                # yield initial header and sequence.
                yield header, ''.join(fragments)
                header = line.rstrip()
                fragments = []
            else:
                # split() drops every whitespace run, including the newline.
                fragments.extend(line.split())
        # The last record has no following header to trigger its yield.
        yield header, ''.join(fragments)