Mercurial > repos > dereeper > ragoo
view RaGOO/ragoo_utilities/SeqReader.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
line wrap: on
line source
import gzip


class SeqReader:
    """
    Read (header, sequence) records from a plain or gzipped fasta file.

    The file is not opened at construction time; each ``parse_*`` method
    opens ``in_file`` itself when the returned generator is first iterated.
    """

    # Raised (as RuntimeError) when no line starting with '>' is found.
    _NOT_FASTA_ERROR = (
        ' This file provided is not in proper fasta format. '
        'In addition to the usual fasta conventions, be sure that there are '
        'no blank lines in the file. '
    )

    def __init__(self, in_file):
        """
        Initialize sequence file to be parsed.

        :param in_file: path to the fasta file, as a string.
        :raises AttributeError: if in_file is not a str. (The exception type
            is kept as-is so existing callers that catch it keep working.)
        """
        if not isinstance(in_file, str):
            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
        self.in_file = in_file

    @classmethod
    def _parse_records(cls, fasta_file):
        """
        Yield (header, sequence) for each record of an open text-mode file.

        Shared by parse_fasta and parse_gzip_fasta so both behave the same.

        :param fasta_file: an iterable of str lines (an open text file).
        :raises RuntimeError: if no line starting with '>' is found.
        """
        # Find first header; any lines before it are skipped.
        for line in fasta_file:
            if line.startswith('>'):
                header = line.rstrip()
                break
        else:
            raise RuntimeError(cls._NOT_FASTA_ERROR)

        # Collect the pieces of the current sequence and join them once per
        # record instead of growing a string with += on every line.
        chunks = []
        for line in fasta_file:
            if line.startswith('>'):
                # Once the sequence is over, (next header begins),
                # yield initial header and sequence.
                yield header, ''.join(chunks)
                header = line.rstrip()
                chunks = []
            else:
                # split() drops the newline and any whitespace inside the line.
                chunks.extend(line.split())

        # The last record has no following header to trigger the yield above.
        yield header, ''.join(chunks)

    def parse_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class.

        The header is the full header line (including the leading '>') with
        trailing whitespace removed; the sequence has all whitespace removed.

        :raises RuntimeError: on first iteration, if the file has no header.
        """
        with open(self.in_file) as fasta_file:
            yield from self._parse_records(fasta_file)

    def parse_gzip_fasta(self):
        """
        Generator yielding header and sequence, for each sequence
        in the fasta file sent to the class. For gzipped fasta files.

        The file is opened in text mode and decoded as UTF-8 once, rather
        than decoding every line by hand; yielded values are str, in the
        same form as parse_fasta.

        :raises RuntimeError: on first iteration, if the file has no header.
        """
        with gzip.open(self.in_file, 'rt', encoding='utf-8') as fasta_file:
            yield from self._parse_records(fasta_file)