alphafold2: validate_fasta.py comparison

comparison validate_fasta.py @ 1:6c92e000d684 draft

"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86"

author	galaxy-australia
date	Tue, 01 Mar 2022 02:53:05 +0000
parents	7ae9d78b06f5
children	04e95886cf24

comparison

equal deleted inserted replaced

-:7ae9d78b06f5
+:6c92e000d684
+"""Validate input FASTA sequence."""
+import re
 import argparse
 from typing import List, TextIO
 class Fasta:
 self.header = header_str
 self.aa_seq = seq_str
 class FastaLoader:
-def __init__(self):
+def __init__(self, fasta_path: str):
-"""creates a Fasta() from a file"""
+"""Initialize from FASTA file."""
-self.fastas: List[Fasta] = []
+self.fastas = []
+self.load(fasta_path)
+print("Loaded FASTA sequences:")
+for f in self.fastas:
+print(f.header)
+print(f.aa_seq)
 def load(self, fasta_path: str):
-"""
+"""Load bare or FASTA formatted sequence."""
-load function has to be very flexible.
+with open(fasta_path, 'r') as f:
-file may be normal fasta format (header, seq) or can just be a bare sequence.
+self.content = f.read()
-"""
-with open(fasta_path, 'r') as fp:
+if "__cn__" in self.content:
-header, sequence = self.interpret_first_line(fp)
+# Pasted content with escaped characters
-line = fp.readline().rstrip('\n')
+self.newline = '__cn__'
+self.caret = '__gt__'
-while line:
+else:
-if line.startswith('>'):
+# Uploaded file with normal content
-self.update_fastas(header, sequence)
+self.newline = '\n'
-header = line
+self.caret = '>'
-sequence = ''
-else:
+self.lines = self.content.split(self.newline)
-sequence += line
+header, sequence = self.interpret_first_line()
-line = fp.readline().rstrip('\n')
+i = 0
+while i < len(self.lines):
+line = self.lines[i]
+if line.startswith(self.caret):
+self.update_fastas(header, sequence)
+header = '>' + self.strip_header(line)
+sequence = ''
+else:
+sequence += line.strip('\n ')
+i += 1
 # after reading the whole file, the header & sequence buffers may still hold the final record
 self.update_fastas(header, sequence)
-return self.fastas
-def interpret_first_line(self, fp: TextIO):
+def interpret_first_line(self):
-header = ''
+line = self.lines[0]
-sequence = ''
+if line.startswith(self.caret):
-line = fp.readline().rstrip('\n')
+header = '>' + self.strip_header(line)
-if line.startswith('>'):
+return header, ''
-header = line
 else:
-sequence += line
+return '', line
-return header, sequence
+def strip_header(self, line):
+"""Strip characters escaped with underscores from pasted text."""
+return re.sub(r'\_\_.{2}\_\_', '', line).strip('>')
 def update_fastas(self, header: str, sequence: str):
 # if we have a sequence
-if not sequence == '':
+if sequence:
 # create generic header if not exists
-if header == '':
+if not header:
 fasta_count = len(self.fastas)
 header = f'>sequence_{fasta_count}'
-# create new Fasta
+# Create new Fasta
 self.fastas.append(Fasta(header, sequence))
 class FastaValidator:
 def __init__(self, fasta_list: List[Fasta]):
 self.fasta_list = fasta_list
 self.min_length = 30
 self.max_length = 2000
 self.iupac_characters = {
 'A', 'B', 'C', 'D', 'E', 'F', 'G',
 'H', 'I', 'K', 'L', 'M', 'N', 'P',
 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 'Y', 'Z', '-'
 }
 def validate(self):
 """performs fasta validation"""
 self.validate_num_seqs()
 self.validate_length()
 self.validate_alphabet()
 # not checking for 'X' (unknown) amino acid residues at the moment.
 # alphafold can throw an error if it doesn't like it.
 #self.validate_x()
 def validate_num_seqs(self) -> None:
 if len(self.fasta_list) > 1:
 raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input')
 elif len(self.fasta_list) == 0:
 fasta = self.fasta_list[0]
 if len(fasta.aa_seq) < self.min_length:
 raise Exception(f'Error encountered validating fasta: Sequence too short ({len(fasta.aa_seq)}aa). Must be > 30aa')
 if len(fasta.aa_seq) > self.max_length:
 raise Exception(f'Error encountered validating fasta: Sequence too long ({len(fasta.aa_seq)}aa). Must be < 2000aa')
 def validate_alphabet(self):
 """
 Confirms whether the sequence conforms to IUPAC codes.
 If not, reports the offending character and its position.
 """
 fasta = self.fasta_list[0]
 for i, char in enumerate(fasta.aa_seq.upper()):
 if char not in self.iupac_characters:
-raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: {char}')
+raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: "{char}"')
 def validate_x(self):
 """checks if any bases are X. TODO check whether alphafold accepts X bases. """
 fasta = self.fasta_list[0]
 for i, char in enumerate(fasta.aa_seq.upper()):
 if char == 'X':
 raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')
 def main():
 # load fasta file
 args = parse_args()
-fl = FastaLoader()
+fas = FastaLoader(args.input_fasta)
-fastas = fl.load(args.input_fasta)
 # validate
-fv = FastaValidator(fastas)
+fv = FastaValidator(fas.fastas)
 fv.validate()
 # write cleaned version
 fw = FastaWriter()
-fw.write(fastas[0])
+fw.write(fas.fastas[0])
 def parse_args() -> argparse.Namespace:
 parser = argparse.ArgumentParser()
 parser.add_argument(
 "input_fasta",
 help="input fasta file",
 type=str
 )
 return parser.parse_args()
 if __name__ == '__main__':

Mercurial > repos > galaxy-australia > alphafold2

comparison validate_fasta.py @ 1:6c92e000d684 draft