comparison validate_fasta.py @ 7:eb085b3dbaf8 draft

"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 8d9f0ae6af9e8d9313c6cdcc551b24c6c44ae341"
author galaxy-australia
date Tue, 19 Apr 2022 00:39:29 +0000
parents 04e95886cf24
children ca90d17ff51b
comparison
equal deleted inserted replaced
6:04e95886cf24 7:eb085b3dbaf8
84 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 84 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
85 'Y', 'Z', '-' 85 'Y', 'Z', '-'
86 } 86 }
87 87
88 def validate(self): 88 def validate(self):
89 """performs fasta validation""" 89 """Perform FASTA validation."""
90 self.validate_num_seqs() 90 self.validate_num_seqs()
91 self.validate_length() 91 self.validate_length()
92 self.validate_alphabet() 92 self.validate_alphabet()
93 93
94 # not checking for 'X' nucleotides at the moment. 94 # not checking for 'X' nucleotides at the moment.
96 # self.validate_x() 96 # self.validate_x()
97 97
98 def validate_num_seqs(self) -> None: 98 def validate_num_seqs(self) -> None:
99 """Assert that only one sequence has been provided.""" 99 """Assert that only one sequence has been provided."""
100 if len(self.fasta_list) > 1: 100 if len(self.fasta_list) > 1:
101 raise Exception( 101 raise ValueError(
102 'Error encountered validating fasta:' 102 'Error encountered validating FASTA:\n'
103 f' More than 1 sequence detected ({len(self.fasta_list)}).' 103 f' More than 1 sequence detected ({len(self.fasta_list)}).'
104 ' Please use single fasta sequence as input.') 104 ' Please use single FASTA sequence as input.')
105 elif len(self.fasta_list) == 0: 105 elif len(self.fasta_list) == 0:
106 raise Exception( 106 raise ValueError(
107 'Error encountered validating fasta:' 107 'Error encountered validating FASTA:\n'
108 ' input file has no fasta sequences') 108 ' input file has no FASTA sequences')
109 109
110 def validate_length(self): 110 def validate_length(self):
111 """Confirm whether sequence length is valid.""" 111 """Confirm whether sequence length is valid."""
112 fasta = self.fasta_list[0] 112 fasta = self.fasta_list[0]
113 if self.min_length: 113 if self.min_length:
114 if len(fasta.aa_seq) < self.min_length: 114 if len(fasta.aa_seq) < self.min_length:
115 raise Exception( 115 raise ValueError(
116 'Error encountered validating fasta: Sequence too short' 116 'Error encountered validating FASTA:\n Sequence too short'
117 f' ({len(fasta.aa_seq)}AA).' 117 f' ({len(fasta.aa_seq)}AA).'
118 f' Minimum length is {self.min_length}AA.') 118 f' Minimum length is {self.min_length}AA.')
119 if self.max_length: 119 if self.max_length:
120 if len(fasta.aa_seq) > self.max_length: 120 if len(fasta.aa_seq) > self.max_length:
121 raise Exception( 121 raise ValueError(
122 'Error encountered validating fasta:' 122 'Error encountered validating FASTA:\n'
123 f' Sequence too long ({len(fasta.aa_seq)}AA).' 123 f' Sequence too long ({len(fasta.aa_seq)}AA).'
124 f' Maximum length is {self.max_length}AA.') 124 f' Maximum length is {self.max_length}AA.')
125 125
126 def validate_alphabet(self): 126 def validate_alphabet(self):
127 """ 127 """Confirm whether the sequence conforms to IUPAC codes.
128 Confirm whether the sequence conforms to IUPAC codes. 128
129 If not, report the offending character and its position. 129 If not, report the offending character and its position.
130 """ 130 """
131 fasta = self.fasta_list[0] 131 fasta = self.fasta_list[0]
132 for i, char in enumerate(fasta.aa_seq.upper()): 132 for i, char in enumerate(fasta.aa_seq.upper()):
133 if char not in self.iupac_characters: 133 if char not in self.iupac_characters:
134 raise Exception( 134 raise ValueError(
135 'Error encountered validating fasta: Invalid amino acid' 135 'Error encountered validating FASTA:\n Invalid amino acid'
136 f' found at pos {i}: "{char}"') 136 f' found at pos {i}: "{char}"')
137 137
138 def validate_x(self): 138 def validate_x(self):
139 """Check for X bases.""" 139 """Check for X bases."""
140 fasta = self.fasta_list[0] 140 fasta = self.fasta_list[0]
141 for i, char in enumerate(fasta.aa_seq.upper()): 141 for i, char in enumerate(fasta.aa_seq.upper()):
142 if char == 'X': 142 if char == 'X':
143 raise Exception( 143 raise ValueError(
144 'Error encountered validating fasta: Unsupported AA code' 144 'Error encountered validating FASTA:\n Unsupported AA code'
145 f' "X" found at pos {i}') 145 f' "X" found at pos {i}')
146 146
147 147
148 class FastaWriter: 148 class FastaWriter:
149 def __init__(self) -> None: 149 def __init__(self) -> None:
162 return formatted_seq 162 return formatted_seq
163 163
164 164
165 def main(): 165 def main():
166 # load fasta file 166 # load fasta file
167 args = parse_args() 167 try:
168 fas = FastaLoader(args.input) 168 args = parse_args()
169 169 fas = FastaLoader(args.input)
170 # validate 170
171 fv = FastaValidator( 171 # validate
172 fas.fastas, 172 fv = FastaValidator(
173 min_length=args.min_length, 173 fas.fastas,
174 max_length=args.max_length, 174 min_length=args.min_length,
175 ) 175 max_length=args.max_length,
176 fv.validate() 176 )
177 177 fv.validate()
178 # write cleaned version 178
179 fw = FastaWriter() 179 # write cleaned version
180 fw.write(fas.fastas[0]) 180 fw = FastaWriter()
181 fw.write(fas.fastas[0])
182
183 except ValueError as exc:
184 sys.stderr.write(f"{exc}\n\n")
185 raise exc
186
187 except Exception as exc:
188 sys.stderr.write(
189 "Input error: FASTA input is invalid. Please check your input.\n\n"
190 )
191 raise exc
181 192
182 193
183 def parse_args() -> argparse.Namespace: 194 def parse_args() -> argparse.Namespace:
184 parser = argparse.ArgumentParser() 195 parser = argparse.ArgumentParser()
185 parser.add_argument( 196 parser.add_argument(