Mercurial > repos > galaxy-australia > alphafold2
comparison scripts/validate_fasta.py @ 23:2891385d6ace draft default tip
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit b347c6ccc82b14fcbff360b3357050d1d43e3ef5-dirty
author | galaxy-australia |
---|---|
date | Wed, 16 Apr 2025 05:46:58 +0000 |
parents | 2f7702fd0a4c |
children |
comparison
equal
deleted
inserted
replaced
22:3f188450ca4f | 23:2891385d6ace |
---|---|
10 | 10 |
11 | 11 |
12 class Fasta: | 12 class Fasta: |
13 def __init__(self, header_str: str, seq_str: str): | 13 def __init__(self, header_str: str, seq_str: str): |
14 self.header = header_str | 14 self.header = header_str |
15 self.aa_seq = seq_str | 15 self.sequence = seq_str |
16 | 16 |
17 | 17 |
18 class FastaLoader: | 18 class FastaLoader: |
19 def __init__(self, fasta_path: str): | 19 def __init__(self, fasta_path: str): |
20 """Initialize from FASTA file.""" | 20 """Initialize from FASTA file.""" |
138 | 138 |
139 def validate_length(self): | 139 def validate_length(self): |
140 """Confirm whether sequence length is valid.""" | 140 """Confirm whether sequence length is valid.""" |
141 fasta = self.fasta_list[0] | 141 fasta = self.fasta_list[0] |
142 if self.min_length: | 142 if self.min_length: |
143 if len(fasta.aa_seq) < self.min_length: | 143 if len(fasta.sequence) < self.min_length: |
144 raise ValueError( | 144 raise ValueError( |
145 'Error encountered validating FASTA:\n Sequence too short' | 145 'Error encountered validating FASTA:\n Sequence too short' |
146 f' ({len(fasta.aa_seq)}AA).' | 146 f' ({len(fasta.sequence)}AA).' |
147 f' Minimum length is {self.min_length}AA.') | 147 f' Minimum length is {self.min_length}AA.') |
148 if self.max_length: | 148 if self.max_length: |
149 if len(fasta.aa_seq) > self.max_length: | 149 if len(fasta.sequence) > self.max_length: |
150 raise ValueError( | 150 raise ValueError( |
151 'Error encountered validating FASTA:\n' | 151 'Error encountered validating FASTA:\n' |
152 f' Sequence too long ({len(fasta.aa_seq)}AA).' | 152 f' Sequence too long ({len(fasta.sequence)}AA).' |
153 f' Maximum length is {self.max_length}AA.') | 153 f' Maximum length is {self.max_length}AA.') |
154 | 154 |
155 def validate_alphabet(self): | 155 def validate_alphabet(self): |
156 """Confirm whether the sequence conforms to IUPAC codes. | 156 """Confirm whether the sequence conforms to IUPAC codes. |
157 | 157 |
158 If not, report the offending character and its position. | 158 If not, report the offending character and its position. |
159 """ | 159 """ |
160 fasta = self.fasta_list[0] | 160 fasta = self.fasta_list[0] |
161 for i, char in enumerate(fasta.aa_seq.upper()): | 161 for i, char in enumerate(fasta.sequence.upper()): |
162 if char not in self.iupac_characters: | 162 if char not in self.iupac_characters: |
163 raise ValueError( | 163 raise ValueError( |
164 'Error encountered validating FASTA:\n Invalid amino acid' | 164 'Error encountered validating FASTA:\n Invalid amino acid' |
165 f' found at pos {i}: "{char}"') | 165 f' found at pos {i}: "{char}"') |
166 | 166 |
167 def validate_x(self): | 167 def validate_x(self): |
168 """Check for X bases.""" | 168 """Check for X bases.""" |
169 fasta = self.fasta_list[0] | 169 fasta = self.fasta_list[0] |
170 for i, char in enumerate(fasta.aa_seq.upper()): | 170 for i, char in enumerate(fasta.sequence.upper()): |
171 if char == 'X': | 171 if char == 'X': |
172 raise ValueError( | 172 raise ValueError( |
173 'Error encountered validating FASTA:\n Unsupported AA code' | 173 'Error encountered validating FASTA:\n Unsupported AA code' |
174 f' "X" found at pos {i}') | 174 f' "X" found at pos {i}') |
175 | 175 |
178 def __init__(self) -> None: | 178 def __init__(self) -> None: |
179 self.line_wrap = 60 | 179 self.line_wrap = 60 |
180 | 180 |
181 def write(self, fasta: Fasta): | 181 def write(self, fasta: Fasta): |
182 header = fasta.header | 182 header = fasta.header |
183 seq = self.format_sequence(fasta.aa_seq) | 183 seq = self.format_sequence(fasta.sequence) |
184 sys.stdout.write(header + '\n') | 184 sys.stdout.write(header + '\n') |
185 sys.stdout.write(seq) | 185 sys.stdout.write(seq) |
186 | 186 |
187 def format_sequence(self, aa_seq: str): | 187 def format_sequence(self, sequence: str): |
188 formatted_seq = '' | 188 formatted_seq = '' |
189 for i in range(0, len(aa_seq), self.line_wrap): | 189 for i in range(0, len(sequence), self.line_wrap): |
190 formatted_seq += aa_seq[i: i + self.line_wrap] + '\n' | 190 formatted_seq += sequence[i: i + self.line_wrap] + '\n' |
191 return formatted_seq.upper() | 191 return formatted_seq.upper() |
192 | 192 |
193 | 193 |
194 def main(): | 194 def main(): |
195 # load fasta file | 195 # load fasta file |
212 fw.write(fas) | 212 fw.write(fas) |
213 | 213 |
214 sys.stderr.write("Validated FASTA sequence(s):\n\n") | 214 sys.stderr.write("Validated FASTA sequence(s):\n\n") |
215 for fas in clean_fastas: | 215 for fas in clean_fastas: |
216 sys.stderr.write(fas.header + '\n') | 216 sys.stderr.write(fas.header + '\n') |
217 sys.stderr.write(fas.aa_seq + '\n\n') | 217 sys.stderr.write(fas.sequence + '\n\n') |
218 | 218 |
219 except ValueError as exc: | 219 except ValueError as exc: |
220 sys.stderr.write(f"{exc}\n\n") | 220 sys.stderr.write(f"{exc}\n\n") |
221 raise exc | 221 raise exc |
222 | 222 |