annotate RaGOO/ragoo_utilities/SeqReader.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 import gzip
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
class SeqReader:
    """
    Lazy reader for FASTA files, either plain text or gzip-compressed.

    Both parsing generators yield ``(header, sequence)`` tuples where the
    header keeps its leading '>' and the sequence has all whitespace removed.
    """

    # Raised when the input ends before any line starting with '>' is seen.
    _FORMAT_ERROR = (
        ' This file provided is not in proper fasta format.\n'
        'In addition to the usual fasta conventions, be sure that there are\n'
        'no blank lines in the file.\n'
    )

    def __init__(self, in_file):
        """
        Initialize sequence file to be parsed.
        :param in_file: path of the fasta file, as a string.
        :raises AttributeError: if in_file is not a string.
        """
        if not isinstance(in_file, str):
            # AttributeError (rather than TypeError) is kept so that existing
            # callers catching it continue to work.
            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
        self.in_file = in_file

    @staticmethod
    def _parse_records(lines):
        """
        Turn an iterable of text lines into (header, sequence) tuples.
        :param lines: iterable of str lines (line endings may be present).
        :raises RuntimeError: if no line starting with '>' is found.
        """
        lines = iter(lines)

        # Find first header; anything before it is skipped.
        for line in lines:
            if line.startswith('>'):
                break
        else:
            # Input exhausted (or empty) without ever seeing a header.
            raise RuntimeError(SeqReader._FORMAT_ERROR)
        header = line.rstrip()

        # Collect pieces in a list and join once per record, which avoids
        # repeatedly re-copying a growing string for long sequences.
        chunks = []
        for line in lines:
            if line.startswith('>'):
                # Once the sequence is over, (next header begins),
                # yield the previous header and its sequence.
                yield header, ''.join(chunks)
                header = line.rstrip()
                chunks = []
            else:
                # split() with no arguments drops every whitespace run,
                # including the trailing newline.
                chunks.extend(line.split())
        yield header, ''.join(chunks)

    def parse_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class.
        :raises RuntimeError: if the file contains no header line.
        """
        with open(self.in_file) as fasta_file:
            yield from self._parse_records(fasta_file)

    def parse_gzip_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class. For gzipped fasta files.
        :raises RuntimeError: if the file contains no header line.
        """
        with gzip.open(self.in_file) as fasta_file:
            # The file is opened in binary mode; decode each line lazily.
            decoded = (raw.decode('utf-8') for raw in fasta_file)
            yield from self._parse_records(decoded)