comparison scripts/validate_fasta.py @ 18:e4a053d67e24 draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit ef97511263dcac81f8563ae6a98d1db2400fcf1d
author galaxy-australia
date Fri, 01 Sep 2023 00:09:46 +0000
parents f9eb041c518c
children 2f7702fd0a4c
comparison
equal deleted inserted replaced
17:5b85006245f3 18:e4a053d67e24
4 import re 4 import re
5 import sys 5 import sys
6 from typing import List 6 from typing import List
7 7
8 MULTIMER_MAX_SEQUENCE_COUNT = 10 8 MULTIMER_MAX_SEQUENCE_COUNT = 10
9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' ']
9 10
10 11
11 class Fasta: 12 class Fasta:
12 def __init__(self, header_str: str, seq_str: str): 13 def __init__(self, header_str: str, seq_str: str):
13 self.header = header_str 14 self.header = header_str
64 if sequence: 65 if sequence:
65 # create generic header if not exists 66 # create generic header if not exists
66 if not header: 67 if not header:
67 fasta_count = len(self.fastas) 68 fasta_count = len(self.fastas)
68 header = f'>sequence_{fasta_count}' 69 header = f'>sequence_{fasta_count}'
70
71 for char in STRIP_SEQUENCE_CHARS:
72 sequence = sequence.replace(char, '')
69 73
70 # Create new Fasta 74 # Create new Fasta
71 self.fastas.append(Fasta(header, sequence)) 75 self.fastas.append(Fasta(header, sequence))
72 76
73 77
107 raise ValueError( 111 raise ValueError(
108 'Error encountered validating FASTA:\n' 112 'Error encountered validating FASTA:\n'
109 'Multimer mode requires multiple input sequence.' 113 'Multimer mode requires multiple input sequence.'
110 f' Only {fasta_count} sequences were detected in' 114 f' Only {fasta_count} sequences were detected in'
111 ' the provided file.') 115 ' the provided file.')
112 self.fasta_list = self.fasta_list
113 116
114 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: 117 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT:
115 sys.stderr.write( 118 sys.stderr.write(
116 f'WARNING: detected {fasta_count} sequences but the' 119 f'WARNING: detected {fasta_count} sequences but the'
117 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' 120 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}'