comparison scripts/validate_fasta.py @ 23:2891385d6ace draft default tip

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit b347c6ccc82b14fcbff360b3357050d1d43e3ef5-dirty
author galaxy-australia
date Wed, 16 Apr 2025 05:46:58 +0000
parents 2f7702fd0a4c
children
comparison
equal deleted inserted replaced
22:3f188450ca4f 23:2891385d6ace
10 10
11 11
12 class Fasta: 12 class Fasta:
13 def __init__(self, header_str: str, seq_str: str): 13 def __init__(self, header_str: str, seq_str: str):
14 self.header = header_str 14 self.header = header_str
15 self.aa_seq = seq_str 15 self.sequence = seq_str
16 16
17 17
18 class FastaLoader: 18 class FastaLoader:
19 def __init__(self, fasta_path: str): 19 def __init__(self, fasta_path: str):
20 """Initialize from FASTA file.""" 20 """Initialize from FASTA file."""
138 138
139 def validate_length(self): 139 def validate_length(self):
140 """Confirm whether sequence length is valid.""" 140 """Confirm whether sequence length is valid."""
141 fasta = self.fasta_list[0] 141 fasta = self.fasta_list[0]
142 if self.min_length: 142 if self.min_length:
143 if len(fasta.aa_seq) < self.min_length: 143 if len(fasta.sequence) < self.min_length:
144 raise ValueError( 144 raise ValueError(
145 'Error encountered validating FASTA:\n Sequence too short' 145 'Error encountered validating FASTA:\n Sequence too short'
146 f' ({len(fasta.aa_seq)}AA).' 146 f' ({len(fasta.sequence)}AA).'
147 f' Minimum length is {self.min_length}AA.') 147 f' Minimum length is {self.min_length}AA.')
148 if self.max_length: 148 if self.max_length:
149 if len(fasta.aa_seq) > self.max_length: 149 if len(fasta.sequence) > self.max_length:
150 raise ValueError( 150 raise ValueError(
151 'Error encountered validating FASTA:\n' 151 'Error encountered validating FASTA:\n'
152 f' Sequence too long ({len(fasta.aa_seq)}AA).' 152 f' Sequence too long ({len(fasta.sequence)}AA).'
153 f' Maximum length is {self.max_length}AA.') 153 f' Maximum length is {self.max_length}AA.')
154 154
155 def validate_alphabet(self): 155 def validate_alphabet(self):
156 """Confirm whether the sequence conforms to IUPAC codes. 156 """Confirm whether the sequence conforms to IUPAC codes.
157 157
158 If not, report the offending character and its position. 158 If not, report the offending character and its position.
159 """ 159 """
160 fasta = self.fasta_list[0] 160 fasta = self.fasta_list[0]
161 for i, char in enumerate(fasta.aa_seq.upper()): 161 for i, char in enumerate(fasta.sequence.upper()):
162 if char not in self.iupac_characters: 162 if char not in self.iupac_characters:
163 raise ValueError( 163 raise ValueError(
164 'Error encountered validating FASTA:\n Invalid amino acid' 164 'Error encountered validating FASTA:\n Invalid amino acid'
165 f' found at pos {i}: "{char}"') 165 f' found at pos {i}: "{char}"')
166 166
167 def validate_x(self): 167 def validate_x(self):
168 """Check for X bases.""" 168 """Check for X bases."""
169 fasta = self.fasta_list[0] 169 fasta = self.fasta_list[0]
170 for i, char in enumerate(fasta.aa_seq.upper()): 170 for i, char in enumerate(fasta.sequence.upper()):
171 if char == 'X': 171 if char == 'X':
172 raise ValueError( 172 raise ValueError(
173 'Error encountered validating FASTA:\n Unsupported AA code' 173 'Error encountered validating FASTA:\n Unsupported AA code'
174 f' "X" found at pos {i}') 174 f' "X" found at pos {i}')
175 175
178 def __init__(self) -> None: 178 def __init__(self) -> None:
179 self.line_wrap = 60 179 self.line_wrap = 60
180 180
181 def write(self, fasta: Fasta): 181 def write(self, fasta: Fasta):
182 header = fasta.header 182 header = fasta.header
183 seq = self.format_sequence(fasta.aa_seq) 183 seq = self.format_sequence(fasta.sequence)
184 sys.stdout.write(header + '\n') 184 sys.stdout.write(header + '\n')
185 sys.stdout.write(seq) 185 sys.stdout.write(seq)
186 186
187 def format_sequence(self, aa_seq: str): 187 def format_sequence(self, sequence: str):
188 formatted_seq = '' 188 formatted_seq = ''
189 for i in range(0, len(aa_seq), self.line_wrap): 189 for i in range(0, len(sequence), self.line_wrap):
190 formatted_seq += aa_seq[i: i + self.line_wrap] + '\n' 190 formatted_seq += sequence[i: i + self.line_wrap] + '\n'
191 return formatted_seq.upper() 191 return formatted_seq.upper()
192 192
193 193
194 def main(): 194 def main():
195 # load fasta file 195 # load fasta file
212 fw.write(fas) 212 fw.write(fas)
213 213
214 sys.stderr.write("Validated FASTA sequence(s):\n\n") 214 sys.stderr.write("Validated FASTA sequence(s):\n\n")
215 for fas in clean_fastas: 215 for fas in clean_fastas:
216 sys.stderr.write(fas.header + '\n') 216 sys.stderr.write(fas.header + '\n')
217 sys.stderr.write(fas.aa_seq + '\n\n') 217 sys.stderr.write(fas.sequence + '\n\n')
218 218
219 except ValueError as exc: 219 except ValueError as exc:
220 sys.stderr.write(f"{exc}\n\n") 220 sys.stderr.write(f"{exc}\n\n")
221 raise exc 221 raise exc
222 222