diff RaGOO/ragoo_utilities/SeqReader.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RaGOO/ragoo_utilities/SeqReader.py	Mon Jul 26 18:22:37 2021 +0000
@@ -0,0 +1,76 @@
+import gzip
+
+
+class SeqReader:
+    """Read (header, sequence) records from a plain or gzipped fasta file."""
+
+    def __init__(self, in_file):
+        """
+        Initialize sequence file to be parsed.
+        :param in_file: path of the fasta file, as a string.
+        :raises AttributeError: if in_file is not a string.
+        """
+        if not isinstance(in_file, str):
+            raise AttributeError('Only a string can be used to instantiate a SeqReader object.')
+        self.in_file = in_file
+
+    @staticmethod
+    def _parse_lines(lines, error):
+        """
+        Yield (header, sequence) for each record in an iterable of text lines.
+        The header keeps its leading '>'; the sequence has all whitespace removed.
+        :param lines: iterable of text lines.
+        :param error: message of the RuntimeError raised when no line
+            starts with '>'.
+        """
+        lines = iter(lines)
+        # Find first header; anything in front of it is skipped.
+        for line in lines:
+            if line.startswith('>'):
+                header = line.rstrip()
+                break
+        else:
+            raise RuntimeError(error)
+
+        # Collect the pieces of the current sequence in a list and join
+        # them once per record instead of growing a string line by line.
+        chunks = []
+        for line in lines:
+            if line.startswith('>'):
+                # Once the sequence is over, (next header begins),
+                # yield initial header and sequence.
+                yield header, ''.join(chunks)
+                header = line.rstrip()
+                chunks = []
+            else:
+                # split() drops every whitespace character in the line.
+                chunks.extend(line.split())
+        yield header, ''.join(chunks)
+
+    def parse_fasta(self):
+        """
+        Generator yielding header and sequence, for each sequence
+        in the fasta file sent to the class.
+        :raises RuntimeError: if the file contains no header line.
+        """
+        error = """ This file provided is not in proper fasta format.
+                    In addition to the usual fasta conventions, be sure that there are
+                    no blank lines in the file.
+                    """
+        with open(self.in_file) as fasta_file:
+            yield from self._parse_lines(fasta_file, error)
+
+    def parse_gzip_fasta(self):
+        """
+        Generator yielding header and sequence, for each sequence
+        in the fasta file sent to the class. For gzipped fasta files.
+        :raises RuntimeError: if the file contains no header line.
+        """
+        error = """ This file provided is not in proper fasta format.
+                            In addition to the usual fasta conventions, be sure that there are
+                            no blank lines in the file.
+                            """
+        with gzip.open(self.in_file) as fasta_file:
+            # gzip.open defaults to binary mode, so decode each line.
+            lines = (raw.decode('utf-8') for raw in fasta_file)
+            yield from self._parse_lines(lines, error)
\ No newline at end of file