Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/ruamel/yaml/scanner.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
comparison
equal
deleted
inserted
replaced
4:79f47841a781 | 5:9b1c78e6ba9c |
---|---|
1 # coding: utf-8 | |
2 | |
3 from __future__ import print_function, absolute_import, division, unicode_literals | |
4 | |
5 # Scanner produces tokens of the following types: | |
6 # STREAM-START | |
7 # STREAM-END | |
8 # DIRECTIVE(name, value) | |
9 # DOCUMENT-START | |
10 # DOCUMENT-END | |
11 # BLOCK-SEQUENCE-START | |
12 # BLOCK-MAPPING-START | |
13 # BLOCK-END | |
14 # FLOW-SEQUENCE-START | |
15 # FLOW-MAPPING-START | |
16 # FLOW-SEQUENCE-END | |
17 # FLOW-MAPPING-END | |
18 # BLOCK-ENTRY | |
19 # FLOW-ENTRY | |
20 # KEY | |
21 # VALUE | |
22 # ALIAS(value) | |
23 # ANCHOR(value) | |
24 # TAG(value) | |
25 # SCALAR(value, plain, style) | |
26 # | |
27 # RoundTripScanner | |
28 # COMMENT(value) | |
29 # | |
30 # Read comments in the Scanner code for more details. | |
31 # | |
32 | |
33 from ruamel.yaml.error import MarkedYAMLError | |
34 from ruamel.yaml.tokens import * # NOQA | |
35 from ruamel.yaml.compat import utf8, unichr, PY3, check_anchorname_char, nprint # NOQA | |
36 | |
37 if False: # MYPY | |
38 from typing import Any, Dict, Optional, List, Union, Text # NOQA | |
39 from ruamel.yaml.compat import VersionType # NOQA | |
40 | |
41 __all__ = ['Scanner', 'RoundTripScanner', 'ScannerError'] | |
42 | |
43 | |
# Characters that end a line/scalar context: NUL (end of input marker added
# by the Reader) plus all YAML line breaks (LF, CR, NEL, LS, PS).
_THE_END = '\n\0\r\x85\u2028\u2029'
# The same set plus space and tab: characters allowed to follow an indicator.
_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
# Plain inline whitespace.
_SPACE_TAB = ' \t'
47 | |
48 | |
class ScannerError(MarkedYAMLError):
    """Raised when the scanner meets input it cannot turn into tokens."""
51 | |
52 | |
class SimpleKey(object):
    """Record of a potential simple key (see simple-key treatment in Scanner).

    Captures where in the token queue (``token_number``) and where in the
    stream (``index``/``line``/``column``/``mark``) the candidate key begins,
    and whether a following ':' is mandatory (``required``).
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # type: (Any, Any, int, int, int, Any) -> None
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark
64 | |
65 | |
66 class Scanner(object): | |
    def __init__(self, loader=None):
        # type: (Any) -> None
        """Initialize the scanner and register it on *loader* (if given)."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)     # peek the next i-th character
        #   self.prefix(l=1)   # peek the next l characters
        #   self.forward(l=1)  # read the next l characters and move the pointer

        self.loader = loader
        # Attach ourselves to the loader only if it has no scanner yet.
        if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
            self.loader._scanner = self
        self.reset_scanner()
        self.first_time = False
84 | |
85 @property | |
86 def flow_level(self): | |
87 # type: () -> int | |
88 return len(self.flow_context) | |
89 | |
    def reset_scanner(self):
        # type: () -> None
        """Reset all per-stream state and queue the STREAM-START token."""
        # Had we reached the end of the stream?
        self.done = False

        # flow_context is an expanding/shrinking list consisting of '{' and '['
        # for each unclosed flow context. If empty list that means block context
        self.flow_context = []  # type: List[Text]

        # List of processed tokens that are not yet emitted.
        self.tokens = []  # type: List[Any]

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []  # type: List[int]

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        # (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}  # type: Dict[Any, Any]
143 | |
    @property
    def reader(self):
        # type: () -> Any
        """Return (and cache) the Reader that feeds this scanner."""
        try:
            return self._scanner_reader  # type: ignore
        except AttributeError:
            # First access: new-style loaders (those with a 'typ' attribute)
            # expose the reader publicly; old-style keep it private.
            if hasattr(self.loader, 'typ'):
                self._scanner_reader = self.loader.reader
            else:
                self._scanner_reader = self.loader._reader
            return self._scanner_reader
155 | |
    @property
    def scanner_processing_version(self):  # prefix until un-composited
        # type: () -> VersionType
        """YAML version tuple (e.g. (1, 2)) that governs scanning decisions."""
        # New-style loaders hold the version on the resolver.
        if hasattr(self.loader, 'typ'):
            return self.loader.resolver.processing_version  # type: ignore
        return self.loader.processing_version  # type: ignore
162 | |
163 # Public methods. | |
164 | |
165 def check_token(self, *choices): | |
166 # type: (Any) -> bool | |
167 # Check if the next token is one of the given types. | |
168 while self.need_more_tokens(): | |
169 self.fetch_more_tokens() | |
170 if bool(self.tokens): | |
171 if not choices: | |
172 return True | |
173 for choice in choices: | |
174 if isinstance(self.tokens[0], choice): | |
175 return True | |
176 return False | |
177 | |
178 def peek_token(self): | |
179 # type: () -> Any | |
180 # Return the next token, but do not delete if from the queue. | |
181 while self.need_more_tokens(): | |
182 self.fetch_more_tokens() | |
183 if bool(self.tokens): | |
184 return self.tokens[0] | |
185 | |
186 def get_token(self): | |
187 # type: () -> Any | |
188 # Return the next token. | |
189 while self.need_more_tokens(): | |
190 self.fetch_more_tokens() | |
191 if bool(self.tokens): | |
192 self.tokens_taken += 1 | |
193 return self.tokens.pop(0) | |
194 | |
195 # Private methods. | |
196 | |
197 def need_more_tokens(self): | |
198 # type: () -> bool | |
199 if self.done: | |
200 return False | |
201 if not self.tokens: | |
202 return True | |
203 # The current token may be a potential simple key, so we | |
204 # need to look further. | |
205 self.stale_possible_simple_keys() | |
206 if self.next_possible_simple_key() == self.tokens_taken: | |
207 return True | |
208 return False | |
209 | |
    def fetch_comment(self, comment):
        # type: (Any) -> None
        """Hook for comment handling; only RoundTripScanner implements it."""
        raise NotImplementedError
213 | |
    def fetch_more_tokens(self):
        # type: () -> Any
        """Skip whitespace/comments, then dispatch on the next character.

        The positional checks (stream end, directive, document markers) must
        come first; the remaining indicator checks are order-independent.
        """
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        if comment is not None:  # never happens for base scanner
            return self.fetch_comment(comment)
        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        # if ch == u'\uFEFF':
        #     return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == "'":
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError(
            'while scanning for the next token',
            None,
            'found character %r that cannot start any token' % utf8(ch),
            self.reader.get_mark(),
        )
323 | |
324 # Simple keys treatment. | |
325 | |
326 def next_possible_simple_key(self): | |
327 # type: () -> Any | |
328 # Return the number of the nearest possible simple key. Actually we | |
329 # don't need to loop through the whole dictionary. We may replace it | |
330 # with the following code: | |
331 # if not self.possible_simple_keys: | |
332 # return None | |
333 # return self.possible_simple_keys[ | |
334 # min(self.possible_simple_keys.keys())].token_number | |
335 min_token_number = None | |
336 for level in self.possible_simple_keys: | |
337 key = self.possible_simple_keys[level] | |
338 if min_token_number is None or key.token_number < min_token_number: | |
339 min_token_number = key.token_number | |
340 return min_token_number | |
341 | |
    def stale_possible_simple_keys(self):
        # type: () -> None
        """Drop pending simple keys that can no longer be completed.

        Raises ScannerError if a *required* key goes stale.
        """
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line or self.reader.index - key.index > 1024:
                if key.required:
                    raise ScannerError(
                        'while scanning a simple key',
                        key.mark,
                        "could not find expected ':'",
                        self.reader.get_mark(),
                    )
                del self.possible_simple_keys[level]
361 | |
    def save_possible_simple_key(self):
        # type: () -> None
        """Record the current position as a possible simple-key start."""
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position: only a
        # block-context key sitting exactly at the current indent column.
        required = not self.flow_level and self.indent == self.reader.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken + len(self.tokens)
            key = SimpleKey(
                token_number,
                required,
                self.reader.index,
                self.reader.line,
                self.reader.column,
                self.reader.get_mark(),
            )
            self.possible_simple_keys[self.flow_level] = key
385 | |
386 def remove_possible_simple_key(self): | |
387 # type: () -> None | |
388 # Remove the saved possible key position at the current flow level. | |
389 if self.flow_level in self.possible_simple_keys: | |
390 key = self.possible_simple_keys[self.flow_level] | |
391 | |
392 if key.required: | |
393 raise ScannerError( | |
394 'while scanning a simple key', | |
395 key.mark, | |
396 "could not find expected ':'", | |
397 self.reader.get_mark(), | |
398 ) | |
399 | |
400 del self.possible_simple_keys[self.flow_level] | |
401 | |
402 # Indentation functions. | |
403 | |
404 def unwind_indent(self, column): | |
405 # type: (Any) -> None | |
406 # In flow context, tokens should respect indentation. | |
407 # Actually the condition should be `self.indent >= column` according to | |
408 # the spec. But this condition will prohibit intuitively correct | |
409 # constructions such as | |
410 # key : { | |
411 # } | |
412 # #### | |
413 # if self.flow_level and self.indent > column: | |
414 # raise ScannerError(None, None, | |
415 # "invalid intendation or unclosed '[' or '{'", | |
416 # self.reader.get_mark()) | |
417 | |
418 # In the flow context, indentation is ignored. We make the scanner less | |
419 # restrictive then specification requires. | |
420 if bool(self.flow_level): | |
421 return | |
422 | |
423 # In block context, we may need to issue the BLOCK-END tokens. | |
424 while self.indent > column: | |
425 mark = self.reader.get_mark() | |
426 self.indent = self.indents.pop() | |
427 self.tokens.append(BlockEndToken(mark, mark)) | |
428 | |
429 def add_indent(self, column): | |
430 # type: (int) -> bool | |
431 # Check if we need to increase indentation. | |
432 if self.indent < column: | |
433 self.indents.append(self.indent) | |
434 self.indent = column | |
435 return True | |
436 return False | |
437 | |
438 # Fetchers. | |
439 | |
    def fetch_stream_start(self):
        # type: () -> None
        """Queue the initial zero-width STREAM-START token (carries encoding)."""
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.
        # Read the token.
        mark = self.reader.get_mark()
        # Add STREAM-START.
        self.tokens.append(StreamStartToken(mark, mark, encoding=self.reader.encoding))
448 | |
    def fetch_stream_end(self):
        # type: () -> None
        """Close all open block scopes and queue the final STREAM-END token."""
        # Set the current indentation to -1 (forces BLOCK-END for all levels).
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}  # type: Dict[Any, Any]
        # Read the token.
        mark = self.reader.get_mark()
        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))
        # The stream is finished.
        self.done = True
463 | |
    def fetch_directive(self):
        # type: () -> None
        """Handle a '%' directive line (YAML/TAG/unknown)."""
        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
475 | |
    def fetch_document_start(self):
        # type: () -> None
        """Emit DOCUMENT-START for a '---' marker."""
        self.fetch_document_indicator(DocumentStartToken)
479 | |
    def fetch_document_end(self):
        # type: () -> None
        """Emit DOCUMENT-END for a '...' marker."""
        self.fetch_document_indicator(DocumentEndToken)
483 | |
    def fetch_document_indicator(self, TokenClass):
        # type: (Any) -> None
        """Emit DOCUMENT-START/END, first closing all open block scopes."""
        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.reader.get_mark()
        self.reader.forward(3)  # consume the three marker characters
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
499 | |
    def fetch_flow_sequence_start(self):
        # type: () -> None
        """Emit FLOW-SEQUENCE-START for '['."""
        self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[')
503 | |
    def fetch_flow_mapping_start(self):
        # type: () -> None
        """Emit FLOW-MAPPING-START for '{'."""
        self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{')
507 | |
    def fetch_flow_collection_start(self, TokenClass, to_push):
        # type: (Any, Text) -> None
        """Emit a flow-collection start token and enter the flow context."""
        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()
        # Increase the flow level.
        self.flow_context.append(to_push)
        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True
        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
521 | |
    def fetch_flow_sequence_end(self):
        # type: () -> None
        """Emit FLOW-SEQUENCE-END for ']'."""
        self.fetch_flow_collection_end(FlowSequenceEndToken)
525 | |
    def fetch_flow_mapping_end(self):
        # type: () -> None
        """Emit FLOW-MAPPING-END for '}'."""
        self.fetch_flow_collection_end(FlowMappingEndToken)
529 | |
530 def fetch_flow_collection_end(self, TokenClass): | |
531 # type: (Any) -> None | |
532 # Reset possible simple key on the current level. | |
533 self.remove_possible_simple_key() | |
534 # Decrease the flow level. | |
535 try: | |
536 popped = self.flow_context.pop() # NOQA | |
537 except IndexError: | |
538 # We must not be in a list or object. | |
539 # Defer error handling to the parser. | |
540 pass | |
541 # No simple keys after ']' or '}'. | |
542 self.allow_simple_key = False | |
543 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. | |
544 start_mark = self.reader.get_mark() | |
545 self.reader.forward() | |
546 end_mark = self.reader.get_mark() | |
547 self.tokens.append(TokenClass(start_mark, end_mark)) | |
548 | |
    def fetch_flow_entry(self):
        # type: () -> None
        """Emit FLOW-ENTRY for ',' inside a flow collection."""
        # Simple keys are allowed after ','.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add FLOW-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
560 | |
    def fetch_block_entry(self):
        # type: () -> None
        """Emit BLOCK-ENTRY for '-', opening a block sequence if needed."""
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(
                    None, None, 'sequence entries are not allowed here', self.reader.get_mark()
                )
            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))
        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass
        # Simple keys are allowed after '-'.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
588 | |
    def fetch_key(self):
        # type: () -> None
        """Emit KEY for '?', opening a block mapping if needed."""
        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(
                    None, None, 'mapping keys are not allowed here', self.reader.get_mark()
                )

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
616 | |
    def fetch_value(self):
        # type: () -> None
        """Handle ':': emit VALUE, retro-inserting KEY (and possibly
        BLOCK-MAPPING-START) if a pending simple key precedes this value."""
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY: insert it at the queue position recorded when the
            # candidate key was first seen.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(
                key.token_number - self.tokens_taken, KeyToken(key.mark, key.mark)
            )

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(
                        key.token_number - self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark),
                    )

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(
                        None,
                        None,
                        'mapping values are not allowed here',
                        self.reader.get_mark(),
                    )

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.reader.column):
                    mark = self.reader.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
677 | |
    def fetch_alias(self):
        # type: () -> None
        """Scan '*name' and emit an ALIAS token."""
        # ALIAS could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after ALIAS.
        self.allow_simple_key = False
        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
686 | |
    def fetch_anchor(self):
        # type: () -> None
        """Scan '&name' and emit an ANCHOR token."""
        # ANCHOR could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after ANCHOR.
        self.allow_simple_key = False
        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
695 | |
    def fetch_tag(self):
        # type: () -> None
        """Scan '!...' and emit a TAG token."""
        # TAG could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after TAG.
        self.allow_simple_key = False
        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
704 | |
    def fetch_literal(self):
        # type: () -> None
        """Emit a SCALAR for a '|' literal block scalar."""
        self.fetch_block_scalar(style='|')
708 | |
    def fetch_folded(self):
        # type: () -> None
        """Emit a SCALAR for a '>' folded block scalar."""
        self.fetch_block_scalar(style='>')
712 | |
    def fetch_block_scalar(self, style):
        # type: (Any) -> None
        """Scan a block scalar ('|' or '>') and queue the SCALAR token."""
        # A simple key may follow a block scalar.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
721 | |
    def fetch_single(self):
        # type: () -> None
        """Emit a SCALAR for a single-quoted flow scalar."""
        self.fetch_flow_scalar(style="'")
725 | |
    def fetch_double(self):
        # type: () -> None
        """Emit a SCALAR for a double-quoted flow scalar."""
        self.fetch_flow_scalar(style='"')
729 | |
    def fetch_flow_scalar(self, style):
        # type: (Any) -> None
        """Scan a quoted scalar and queue the SCALAR token."""
        # A flow scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after flow scalars.
        self.allow_simple_key = False
        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
738 | |
    def fetch_plain(self):
        # type: () -> None
        """Scan a plain (unquoted) scalar and queue the SCALAR token."""
        # A plain scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False
        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
749 | |
750 # Checkers. | |
751 | |
752 def check_directive(self): | |
753 # type: () -> Any | |
754 # DIRECTIVE: ^ '%' ... | |
755 # The '%' indicator is already checked. | |
756 if self.reader.column == 0: | |
757 return True | |
758 return None | |
759 | |
    def check_document_start(self):
        # type: () -> Any
        """True for '---' at column 0 followed by whitespace or end of line."""
        # DOCUMENT-START: ^ '---' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
                return True
        return None
767 | |
    def check_document_end(self):
        # type: () -> Any
        """True for '...' at column 0 followed by whitespace or end of line."""
        # DOCUMENT-END: ^ '...' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
                return True
        return None
775 | |
    def check_block_entry(self):
        # type: () -> Any
        """True when '-' is followed by whitespace or end of line."""
        # BLOCK-ENTRY: '-' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
780 | |
    def check_key(self):
        # type: () -> Any
        """'?' always starts a key in flow context; in block context it must
        be followed by whitespace or end of line."""
        # KEY(flow context): '?'
        if bool(self.flow_level):
            return True
        # KEY(block context): '?' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
788 | |
    def check_value(self):
        # type: () -> Any
        """Decide whether ':' starts a VALUE token here.

        YAML 1.1: any ':' in flow context qualifies.  YAML 1.2: inside a
        flow sequence, or when the previous token was already a VALUE in a
        flow mapping, the ':' must be followed by whitespace/EOL so that
        e.g. 'a:b' stays a single plain scalar.
        """
        # VALUE(flow context): ':'
        if self.scanner_processing_version == (1, 1):
            if bool(self.flow_level):
                return True
        else:
            if bool(self.flow_level):
                if self.flow_context[-1] == '[':
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                elif self.tokens and isinstance(self.tokens[-1], ValueToken):
                    # mapping flow context scanning a value token
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                return True
        # VALUE(block context): ':' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
807 | |
    def check_plain(self):
        # type: () -> Any
        """Can the current character start a plain (unquoted) scalar?"""
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        srp = self.reader.peek
        ch = srp()
        if self.scanner_processing_version == (1, 1):
            return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
                srp(1) not in _THE_END_SPACE_TAB
                and (ch == '-' or (not self.flow_level and ch in '?:'))
            )
        # YAML 1.2
        if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
            # ###################                ^ ???
            return True
        ch1 = srp(1)
        if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
            return True
        # YAML 1.2 allows ':' adjacent to a value inside flow collections.
        if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
            return True

        return srp(1) not in _THE_END_SPACE_TAB and (
            ch == '-' or (not self.flow_level and ch in '?:')
        )
842 | |
843 # Scanners. | |
844 | |
    def scan_to_next_token(self):
        # type: () -> Any
        """Skip spaces, comments and line breaks up to the next token.

        Sets `allow_simple_key` when a line break is crossed in block
        context.  Always returns None in the base scanner; the round-trip
        scanner overrides this to return collected comments.
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        # Strip a leading BOM only at the very start of the stream.
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        _the_end = _THE_END
        while not found:
            while srp() == ' ':
                srf()
            if srp() == '#':
                # Skip the comment to the end of the line.
                while srp() not in _the_end:
                    srf()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
        return None
883 | |
    def scan_directive(self):
        # type: () -> Any
        """Scan a '%YAML ...', '%TAG ...' or unknown directive line."""
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        start_mark = self.reader.get_mark()
        srf()  # consume the '%'
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            # Unknown directive: skip the rest of the line, value stays None.
            end_mark = self.reader.get_mark()
            while srp() not in _THE_END:
                srf()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
905 | |
906 def scan_directive_name(self, start_mark): | |
907 # type: (Any) -> Any | |
908 # See the specification for details. | |
909 length = 0 | |
910 srp = self.reader.peek | |
911 ch = srp(length) | |
912 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.': | |
913 length += 1 | |
914 ch = srp(length) | |
915 if not length: | |
916 raise ScannerError( | |
917 'while scanning a directive', | |
918 start_mark, | |
919 'expected alphabetic or numeric character, but found %r' % utf8(ch), | |
920 self.reader.get_mark(), | |
921 ) | |
922 value = self.reader.prefix(length) | |
923 self.reader.forward(length) | |
924 ch = srp() | |
925 if ch not in '\0 \r\n\x85\u2028\u2029': | |
926 raise ScannerError( | |
927 'while scanning a directive', | |
928 start_mark, | |
929 'expected alphabetic or numeric character, but found %r' % utf8(ch), | |
930 self.reader.get_mark(), | |
931 ) | |
932 return value | |
933 | |
934 def scan_yaml_directive_value(self, start_mark): | |
935 # type: (Any) -> Any | |
936 # See the specification for details. | |
937 srp = self.reader.peek | |
938 srf = self.reader.forward | |
939 while srp() == ' ': | |
940 srf() | |
941 major = self.scan_yaml_directive_number(start_mark) | |
942 if srp() != '.': | |
943 raise ScannerError( | |
944 'while scanning a directive', | |
945 start_mark, | |
946 "expected a digit or '.', but found %r" % utf8(srp()), | |
947 self.reader.get_mark(), | |
948 ) | |
949 srf() | |
950 minor = self.scan_yaml_directive_number(start_mark) | |
951 if srp() not in '\0 \r\n\x85\u2028\u2029': | |
952 raise ScannerError( | |
953 'while scanning a directive', | |
954 start_mark, | |
955 "expected a digit or ' ', but found %r" % utf8(srp()), | |
956 self.reader.get_mark(), | |
957 ) | |
958 return (major, minor) | |
959 | |
960 def scan_yaml_directive_number(self, start_mark): | |
961 # type: (Any) -> Any | |
962 # See the specification for details. | |
963 srp = self.reader.peek | |
964 srf = self.reader.forward | |
965 ch = srp() | |
966 if not ('0' <= ch <= '9'): | |
967 raise ScannerError( | |
968 'while scanning a directive', | |
969 start_mark, | |
970 'expected a digit, but found %r' % utf8(ch), | |
971 self.reader.get_mark(), | |
972 ) | |
973 length = 0 | |
974 while '0' <= srp(length) <= '9': | |
975 length += 1 | |
976 value = int(self.reader.prefix(length)) | |
977 srf(length) | |
978 return value | |
979 | |
980 def scan_tag_directive_value(self, start_mark): | |
981 # type: (Any) -> Any | |
982 # See the specification for details. | |
983 srp = self.reader.peek | |
984 srf = self.reader.forward | |
985 while srp() == ' ': | |
986 srf() | |
987 handle = self.scan_tag_directive_handle(start_mark) | |
988 while srp() == ' ': | |
989 srf() | |
990 prefix = self.scan_tag_directive_prefix(start_mark) | |
991 return (handle, prefix) | |
992 | |
993 def scan_tag_directive_handle(self, start_mark): | |
994 # type: (Any) -> Any | |
995 # See the specification for details. | |
996 value = self.scan_tag_handle('directive', start_mark) | |
997 ch = self.reader.peek() | |
998 if ch != ' ': | |
999 raise ScannerError( | |
1000 'while scanning a directive', | |
1001 start_mark, | |
1002 "expected ' ', but found %r" % utf8(ch), | |
1003 self.reader.get_mark(), | |
1004 ) | |
1005 return value | |
1006 | |
1007 def scan_tag_directive_prefix(self, start_mark): | |
1008 # type: (Any) -> Any | |
1009 # See the specification for details. | |
1010 value = self.scan_tag_uri('directive', start_mark) | |
1011 ch = self.reader.peek() | |
1012 if ch not in '\0 \r\n\x85\u2028\u2029': | |
1013 raise ScannerError( | |
1014 'while scanning a directive', | |
1015 start_mark, | |
1016 "expected ' ', but found %r" % utf8(ch), | |
1017 self.reader.get_mark(), | |
1018 ) | |
1019 return value | |
1020 | |
1021 def scan_directive_ignored_line(self, start_mark): | |
1022 # type: (Any) -> None | |
1023 # See the specification for details. | |
1024 srp = self.reader.peek | |
1025 srf = self.reader.forward | |
1026 while srp() == ' ': | |
1027 srf() | |
1028 if srp() == '#': | |
1029 while srp() not in _THE_END: | |
1030 srf() | |
1031 ch = srp() | |
1032 if ch not in _THE_END: | |
1033 raise ScannerError( | |
1034 'while scanning a directive', | |
1035 start_mark, | |
1036 'expected a comment or a line break, but found %r' % utf8(ch), | |
1037 self.reader.get_mark(), | |
1038 ) | |
1039 self.scan_line_break() | |
1040 | |
1041 def scan_anchor(self, TokenClass): | |
1042 # type: (Any) -> Any | |
1043 # The specification does not restrict characters for anchors and | |
1044 # aliases. This may lead to problems, for instance, the document: | |
1045 # [ *alias, value ] | |
1046 # can be interpteted in two ways, as | |
1047 # [ "value" ] | |
1048 # and | |
1049 # [ *alias , "value" ] | |
1050 # Therefore we restrict aliases to numbers and ASCII letters. | |
1051 srp = self.reader.peek | |
1052 start_mark = self.reader.get_mark() | |
1053 indicator = srp() | |
1054 if indicator == '*': | |
1055 name = 'alias' | |
1056 else: | |
1057 name = 'anchor' | |
1058 self.reader.forward() | |
1059 length = 0 | |
1060 ch = srp(length) | |
1061 # while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \ | |
1062 # or ch in u'-_': | |
1063 while check_anchorname_char(ch): | |
1064 length += 1 | |
1065 ch = srp(length) | |
1066 if not length: | |
1067 raise ScannerError( | |
1068 'while scanning an %s' % (name,), | |
1069 start_mark, | |
1070 'expected alphabetic or numeric character, but found %r' % utf8(ch), | |
1071 self.reader.get_mark(), | |
1072 ) | |
1073 value = self.reader.prefix(length) | |
1074 self.reader.forward(length) | |
1075 # ch1 = ch | |
1076 # ch = srp() # no need to peek, ch is already set | |
1077 # assert ch1 == ch | |
1078 if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`': | |
1079 raise ScannerError( | |
1080 'while scanning an %s' % (name,), | |
1081 start_mark, | |
1082 'expected alphabetic or numeric character, but found %r' % utf8(ch), | |
1083 self.reader.get_mark(), | |
1084 ) | |
1085 end_mark = self.reader.get_mark() | |
1086 return TokenClass(value, start_mark, end_mark) | |
1087 | |
1088 def scan_tag(self): | |
1089 # type: () -> Any | |
1090 # See the specification for details. | |
1091 srp = self.reader.peek | |
1092 start_mark = self.reader.get_mark() | |
1093 ch = srp(1) | |
1094 if ch == '<': | |
1095 handle = None | |
1096 self.reader.forward(2) | |
1097 suffix = self.scan_tag_uri('tag', start_mark) | |
1098 if srp() != '>': | |
1099 raise ScannerError( | |
1100 'while parsing a tag', | |
1101 start_mark, | |
1102 "expected '>', but found %r" % utf8(srp()), | |
1103 self.reader.get_mark(), | |
1104 ) | |
1105 self.reader.forward() | |
1106 elif ch in _THE_END_SPACE_TAB: | |
1107 handle = None | |
1108 suffix = '!' | |
1109 self.reader.forward() | |
1110 else: | |
1111 length = 1 | |
1112 use_handle = False | |
1113 while ch not in '\0 \r\n\x85\u2028\u2029': | |
1114 if ch == '!': | |
1115 use_handle = True | |
1116 break | |
1117 length += 1 | |
1118 ch = srp(length) | |
1119 handle = '!' | |
1120 if use_handle: | |
1121 handle = self.scan_tag_handle('tag', start_mark) | |
1122 else: | |
1123 handle = '!' | |
1124 self.reader.forward() | |
1125 suffix = self.scan_tag_uri('tag', start_mark) | |
1126 ch = srp() | |
1127 if ch not in '\0 \r\n\x85\u2028\u2029': | |
1128 raise ScannerError( | |
1129 'while scanning a tag', | |
1130 start_mark, | |
1131 "expected ' ', but found %r" % utf8(ch), | |
1132 self.reader.get_mark(), | |
1133 ) | |
1134 value = (handle, suffix) | |
1135 end_mark = self.reader.get_mark() | |
1136 return TagToken(value, start_mark, end_mark) | |
1137 | |
    def scan_block_scalar(self, style, rt=False):
        # type: (Any, Optional[bool]) -> Any
        """Scan a literal ('|') or folded ('>') block scalar.

        Returns a ScalarToken; a header comment (e.g. '|+ # text') is
        attached as a pre-comment, and trailing breaks not chomped into
        the value become a post CommentToken.  With rt=True, fold points
        are marked with '\\a' so a round-trip emitter can restore them.
        See the specification for details.
        """
        srp = self.reader.peek
        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []  # type: List[Any]
        start_mark = self.reader.get_mark()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        # block scalar comment e.g. : |+ # comment text
        block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent + 1
        if increment is None:
            # no increment and top level, min_indent could be 0
            if min_indent < 1 and (
                style not in '|>'
                or (self.scanner_processing_version == (1, 1))
                and getattr(
                    self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False
                )
            ):
                min_indent = 1
            # No explicit indent: take it from the first non-empty line.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            if min_indent < 1:
                min_indent = 1
            indent = min_indent + increment - 1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ""

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and srp() != '\0':
            chunks.extend(breaks)
            # Lines starting with space/tab are never folded into the
            # previous line.
            leading_non_space = srp() not in ' \t'
            length = 0
            while srp(length) not in _THE_END:
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if style in '|>' and min_indent == 0:
                # at the beginning of a line, if in block style see if
                # end of document/start_new_document
                if self.check_document_start() or self.check_document_end():
                    break
            if self.reader.column == indent and srp() != '\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if rt and folded and line_break == '\n':
                    chunks.append('\a')
                if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                # if folded and line_break == u'\n':
                #     if not breaks:
                #         if srp() not in ' \t':
                #             chunks.append(u' ')
                #         else:
                #             chunks.append(line_break)
                #     else:
                #         chunks.append(line_break)
            else:
                break

        # Process trailing line breaks. The 'chomping' setting determines
        # whether they are included in the value.
        trailing = []  # type: List[Any]
        if chomping in [None, True]:
            # clip (None) and keep (True) both retain the final break
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        elif chomping in [None, False]:
            trailing.extend(breaks)

        # We are done.
        token = ScalarToken("".join(chunks), False, start_mark, end_mark, style)
        if block_scalar_comment is not None:
            token.add_pre_comments([block_scalar_comment])
        if len(trailing) > 0:
            # nprint('trailing 1', trailing)  # XXXXX
            # Eat whitespaces and comments until we reach the next token.
            comment = self.scan_to_next_token()
            while comment:
                trailing.append(' ' * comment[1].column + comment[0])
                comment = self.scan_to_next_token()

            # Keep track of the trailing whitespace and following comments
            # as a comment token, if isn't all included in the actual value.
            comment_end_mark = self.reader.get_mark()
            comment = CommentToken("".join(trailing), end_mark, comment_end_mark)
            token.add_post_comment(comment)
        return token
1249 | |
1250 def scan_block_scalar_indicators(self, start_mark): | |
1251 # type: (Any) -> Any | |
1252 # See the specification for details. | |
1253 srp = self.reader.peek | |
1254 chomping = None | |
1255 increment = None | |
1256 ch = srp() | |
1257 if ch in '+-': | |
1258 if ch == '+': | |
1259 chomping = True | |
1260 else: | |
1261 chomping = False | |
1262 self.reader.forward() | |
1263 ch = srp() | |
1264 if ch in '0123456789': | |
1265 increment = int(ch) | |
1266 if increment == 0: | |
1267 raise ScannerError( | |
1268 'while scanning a block scalar', | |
1269 start_mark, | |
1270 'expected indentation indicator in the range 1-9, ' 'but found 0', | |
1271 self.reader.get_mark(), | |
1272 ) | |
1273 self.reader.forward() | |
1274 elif ch in '0123456789': | |
1275 increment = int(ch) | |
1276 if increment == 0: | |
1277 raise ScannerError( | |
1278 'while scanning a block scalar', | |
1279 start_mark, | |
1280 'expected indentation indicator in the range 1-9, ' 'but found 0', | |
1281 self.reader.get_mark(), | |
1282 ) | |
1283 self.reader.forward() | |
1284 ch = srp() | |
1285 if ch in '+-': | |
1286 if ch == '+': | |
1287 chomping = True | |
1288 else: | |
1289 chomping = False | |
1290 self.reader.forward() | |
1291 ch = srp() | |
1292 if ch not in '\0 \r\n\x85\u2028\u2029': | |
1293 raise ScannerError( | |
1294 'while scanning a block scalar', | |
1295 start_mark, | |
1296 'expected chomping or indentation indicators, but found %r' % utf8(ch), | |
1297 self.reader.get_mark(), | |
1298 ) | |
1299 return chomping, increment | |
1300 | |
1301 def scan_block_scalar_ignored_line(self, start_mark): | |
1302 # type: (Any) -> Any | |
1303 # See the specification for details. | |
1304 srp = self.reader.peek | |
1305 srf = self.reader.forward | |
1306 prefix = '' | |
1307 comment = None | |
1308 while srp() == ' ': | |
1309 prefix += srp() | |
1310 srf() | |
1311 if srp() == '#': | |
1312 comment = prefix | |
1313 while srp() not in _THE_END: | |
1314 comment += srp() | |
1315 srf() | |
1316 ch = srp() | |
1317 if ch not in _THE_END: | |
1318 raise ScannerError( | |
1319 'while scanning a block scalar', | |
1320 start_mark, | |
1321 'expected a comment or a line break, but found %r' % utf8(ch), | |
1322 self.reader.get_mark(), | |
1323 ) | |
1324 self.scan_line_break() | |
1325 return comment | |
1326 | |
1327 def scan_block_scalar_indentation(self): | |
1328 # type: () -> Any | |
1329 # See the specification for details. | |
1330 srp = self.reader.peek | |
1331 srf = self.reader.forward | |
1332 chunks = [] | |
1333 max_indent = 0 | |
1334 end_mark = self.reader.get_mark() | |
1335 while srp() in ' \r\n\x85\u2028\u2029': | |
1336 if srp() != ' ': | |
1337 chunks.append(self.scan_line_break()) | |
1338 end_mark = self.reader.get_mark() | |
1339 else: | |
1340 srf() | |
1341 if self.reader.column > max_indent: | |
1342 max_indent = self.reader.column | |
1343 return chunks, max_indent, end_mark | |
1344 | |
1345 def scan_block_scalar_breaks(self, indent): | |
1346 # type: (int) -> Any | |
1347 # See the specification for details. | |
1348 chunks = [] | |
1349 srp = self.reader.peek | |
1350 srf = self.reader.forward | |
1351 end_mark = self.reader.get_mark() | |
1352 while self.reader.column < indent and srp() == ' ': | |
1353 srf() | |
1354 while srp() in '\r\n\x85\u2028\u2029': | |
1355 chunks.append(self.scan_line_break()) | |
1356 end_mark = self.reader.get_mark() | |
1357 while self.reader.column < indent and srp() == ' ': | |
1358 srf() | |
1359 return chunks, end_mark | |
1360 | |
1361 def scan_flow_scalar(self, style): | |
1362 # type: (Any) -> Any | |
1363 # See the specification for details. | |
1364 # Note that we loose indentation rules for quoted scalars. Quoted | |
1365 # scalars don't need to adhere indentation because " and ' clearly | |
1366 # mark the beginning and the end of them. Therefore we are less | |
1367 # restrictive then the specification requires. We only need to check | |
1368 # that document separators are not included in scalars. | |
1369 if style == '"': | |
1370 double = True | |
1371 else: | |
1372 double = False | |
1373 srp = self.reader.peek | |
1374 chunks = [] # type: List[Any] | |
1375 start_mark = self.reader.get_mark() | |
1376 quote = srp() | |
1377 self.reader.forward() | |
1378 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) | |
1379 while srp() != quote: | |
1380 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark)) | |
1381 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) | |
1382 self.reader.forward() | |
1383 end_mark = self.reader.get_mark() | |
1384 return ScalarToken("".join(chunks), False, start_mark, end_mark, style) | |
1385 | |
    # Map from the character that follows a backslash in a double-quoted
    # scalar to its single-character replacement.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '"',
        '/': '/',  # as per http://www.json.org/
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }

    # Numeric escapes: escape letter -> number of hex digits that follow
    # (\xXX, \uXXXX, \UXXXXXXXX).
    ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
1408 | |
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # type: (Any, Any) -> Any
        """Scan the non-whitespace portions of a quoted scalar.

        Handles '' doubling in single-quoted scalars and backslash
        escapes (including \\xXX/\\uXXXX/\\UXXXXXXXX and escaped line
        breaks) in double-quoted ones.  Returns the collected chunks;
        stops (without consuming) at whitespace, a break, NUL or the
        closing quote.
        """
        chunks = []  # type: List[Any]
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            # Copy a run of ordinary characters verbatim.
            length = 0
            while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
                length += 1
            if length != 0:
                chunks.append(self.reader.prefix(length))
                srf(length)
            ch = srp()
            if not double and ch == "'" and srp(1) == "'":
                # '' inside a single-quoted scalar is a literal quote.
                chunks.append("'")
                srf(2)
            elif (double and ch == "'") or (not double and ch in '"\\'):
                # The other style's special characters are plain here.
                chunks.append(ch)
                srf()
            elif double and ch == '\\':
                srf()
                ch = srp()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    srf()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: read the fixed number of hex digits.
                    length = self.ESCAPE_CODES[ch]
                    srf()
                    for k in range(length):
                        if srp(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError(
                                'while scanning a double-quoted scalar',
                                start_mark,
                                'expected escape sequence of %d hexdecimal '
                                'numbers, but found %r' % (length, utf8(srp(k))),
                                self.reader.get_mark(),
                            )
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    srf(length)
                elif ch in '\n\r\x85\u2028\u2029':
                    # An escaped line break is removed along with any
                    # following indentation/blank lines.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        'found unknown escape character %r' % utf8(ch),
                        self.reader.get_mark(),
                    )
            else:
                return chunks
1462 | |
1463 def scan_flow_scalar_spaces(self, double, start_mark): | |
1464 # type: (Any, Any) -> Any | |
1465 # See the specification for details. | |
1466 srp = self.reader.peek | |
1467 chunks = [] | |
1468 length = 0 | |
1469 while srp(length) in ' \t': | |
1470 length += 1 | |
1471 whitespaces = self.reader.prefix(length) | |
1472 self.reader.forward(length) | |
1473 ch = srp() | |
1474 if ch == '\0': | |
1475 raise ScannerError( | |
1476 'while scanning a quoted scalar', | |
1477 start_mark, | |
1478 'found unexpected end of stream', | |
1479 self.reader.get_mark(), | |
1480 ) | |
1481 elif ch in '\r\n\x85\u2028\u2029': | |
1482 line_break = self.scan_line_break() | |
1483 breaks = self.scan_flow_scalar_breaks(double, start_mark) | |
1484 if line_break != '\n': | |
1485 chunks.append(line_break) | |
1486 elif not breaks: | |
1487 chunks.append(' ') | |
1488 chunks.extend(breaks) | |
1489 else: | |
1490 chunks.append(whitespaces) | |
1491 return chunks | |
1492 | |
1493 def scan_flow_scalar_breaks(self, double, start_mark): | |
1494 # type: (Any, Any) -> Any | |
1495 # See the specification for details. | |
1496 chunks = [] # type: List[Any] | |
1497 srp = self.reader.peek | |
1498 srf = self.reader.forward | |
1499 while True: | |
1500 # Instead of checking indentation, we check for document | |
1501 # separators. | |
1502 prefix = self.reader.prefix(3) | |
1503 if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB: | |
1504 raise ScannerError( | |
1505 'while scanning a quoted scalar', | |
1506 start_mark, | |
1507 'found unexpected document separator', | |
1508 self.reader.get_mark(), | |
1509 ) | |
1510 while srp() in ' \t': | |
1511 srf() | |
1512 if srp() in '\r\n\x85\u2028\u2029': | |
1513 chunks.append(self.scan_line_break()) | |
1514 else: | |
1515 return chunks | |
1516 | |
    def scan_plain(self):
        # type: () -> Any
        """Scan a plain (unquoted) scalar into a ScalarToken.

        In flow context plain scalars additionally may not contain ',',
        ': ' and '?'.  Scanning a plain scalar disables simple keys;
        trailing line breaks are preserved on the token as a post
        CommentToken.  See the specification for details.
        """
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ': ' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []  # type: List[Any]
        start_mark = self.reader.get_mark()
        end_mark = start_mark
        indent = self.indent + 1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        # if indent == 0:
        #     indent = 1
        spaces = []  # type: List[Any]
        while True:
            length = 0
            if srp() == '#':
                break
            # Find the end of the next word of the scalar.
            while True:
                ch = srp(length)
                if ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                    # ':' not followed by space/break stays in the scalar.
                    pass
                elif ch == '?' and self.scanner_processing_version != (1, 1):
                    pass
                elif (
                    ch in _THE_END_SPACE_TAB
                    or (
                        not self.flow_level
                        and ch == ':'
                        and srp(length + 1) in _THE_END_SPACE_TAB
                    )
                    or (self.flow_level and ch in ',:?[]{}')
                ):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (
                self.flow_level
                and ch == ':'
                and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
            ):
                srf(length)
                raise ScannerError(
                    'while scanning a plain scalar',
                    start_mark,
                    "found unexpected ':'",
                    self.reader.get_mark(),
                    'Please check '
                    'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                    'for details.',
                )
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            srf(length)
            end_mark = self.reader.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop at a comment, a dedent in block context, or when
            # scan_plain_spaces signalled a document separator (None).
            if (
                not spaces
                or srp() == '#'
                or (not self.flow_level and self.reader.column < indent)
            ):
                break

        token = ScalarToken("".join(chunks), True, start_mark, end_mark)
        if spaces and spaces[0] == '\n':
            # Create a comment token to preserve the trailing line breaks.
            comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark)
            token.add_post_comment(comment)
        return token
1593 | |
    def scan_plain_spaces(self, indent, start_mark):
        # type: (Any, Any) -> Any
        """Scan whitespace/folding between words of a plain scalar.

        Returns the chunks to insert between words, or None when a
        '---'/'...' document separator is found (the caller treats a
        falsy result as end-of-scalar).  Tabs are not accepted here:
        the specification is confusing about tabs in plain scalars, so
        they are forbidden completely.
        """
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        length = 0
        while srp(length) in ' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # A line break in block context re-enables simple keys.
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                # bare return -> None signals a document separator
                return
            breaks = []
            while srp() in ' \r\n\x85\u2028\u2029':
                if srp() == ' ':
                    srf()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.reader.prefix(3)
                    if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                        return
            # Fold a lone '\n' to a single space.
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
1631 | |
1632 def scan_tag_handle(self, name, start_mark): | |
1633 # type: (Any, Any) -> Any | |
1634 # See the specification for details. | |
1635 # For some strange reasons, the specification does not allow '_' in | |
1636 # tag handles. I have allowed it anyway. | |
1637 srp = self.reader.peek | |
1638 ch = srp() | |
1639 if ch != '!': | |
1640 raise ScannerError( | |
1641 'while scanning a %s' % (name,), | |
1642 start_mark, | |
1643 "expected '!', but found %r" % utf8(ch), | |
1644 self.reader.get_mark(), | |
1645 ) | |
1646 length = 1 | |
1647 ch = srp(length) | |
1648 if ch != ' ': | |
1649 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_': | |
1650 length += 1 | |
1651 ch = srp(length) | |
1652 if ch != '!': | |
1653 self.reader.forward(length) | |
1654 raise ScannerError( | |
1655 'while scanning a %s' % (name,), | |
1656 start_mark, | |
1657 "expected '!', but found %r" % utf8(ch), | |
1658 self.reader.get_mark(), | |
1659 ) | |
1660 length += 1 | |
1661 value = self.reader.prefix(length) | |
1662 self.reader.forward(length) | |
1663 return value | |
1664 | |
1665 def scan_tag_uri(self, name, start_mark): | |
1666 # type: (Any, Any) -> Any | |
1667 # See the specification for details. | |
1668 # Note: we do not check if URI is well-formed. | |
1669 srp = self.reader.peek | |
1670 chunks = [] | |
1671 length = 0 | |
1672 ch = srp(length) | |
1673 while ( | |
1674 '0' <= ch <= '9' | |
1675 or 'A' <= ch <= 'Z' | |
1676 or 'a' <= ch <= 'z' | |
1677 or ch in "-;/?:@&=+$,_.!~*'()[]%" | |
1678 ): | |
1679 if ch == '%': | |
1680 chunks.append(self.reader.prefix(length)) | |
1681 self.reader.forward(length) | |
1682 length = 0 | |
1683 chunks.append(self.scan_uri_escapes(name, start_mark)) | |
1684 else: | |
1685 length += 1 | |
1686 ch = srp(length) | |
1687 if length != 0: | |
1688 chunks.append(self.reader.prefix(length)) | |
1689 self.reader.forward(length) | |
1690 length = 0 | |
1691 if not chunks: | |
1692 raise ScannerError( | |
1693 'while parsing a %s' % (name,), | |
1694 start_mark, | |
1695 'expected URI, but found %r' % utf8(ch), | |
1696 self.reader.get_mark(), | |
1697 ) | |
1698 return "".join(chunks) | |
1699 | |
1700 def scan_uri_escapes(self, name, start_mark): | |
1701 # type: (Any, Any) -> Any | |
1702 # See the specification for details. | |
1703 srp = self.reader.peek | |
1704 srf = self.reader.forward | |
1705 code_bytes = [] # type: List[Any] | |
1706 mark = self.reader.get_mark() | |
1707 while srp() == '%': | |
1708 srf() | |
1709 for k in range(2): | |
1710 if srp(k) not in '0123456789ABCDEFabcdef': | |
1711 raise ScannerError( | |
1712 'while scanning a %s' % (name,), | |
1713 start_mark, | |
1714 'expected URI escape sequence of 2 hexdecimal numbers,' | |
1715 ' but found %r' % utf8(srp(k)), | |
1716 self.reader.get_mark(), | |
1717 ) | |
1718 if PY3: | |
1719 code_bytes.append(int(self.reader.prefix(2), 16)) | |
1720 else: | |
1721 code_bytes.append(chr(int(self.reader.prefix(2), 16))) | |
1722 srf(2) | |
1723 try: | |
1724 if PY3: | |
1725 value = bytes(code_bytes).decode('utf-8') | |
1726 else: | |
1727 value = unicode(b"".join(code_bytes), 'utf-8') | |
1728 except UnicodeDecodeError as exc: | |
1729 raise ScannerError('while scanning a %s' % (name,), start_mark, str(exc), mark) | |
1730 return value | |
1731 | |
1732 def scan_line_break(self): | |
1733 # type: () -> Any | |
1734 # Transforms: | |
1735 # '\r\n' : '\n' | |
1736 # '\r' : '\n' | |
1737 # '\n' : '\n' | |
1738 # '\x85' : '\n' | |
1739 # '\u2028' : '\u2028' | |
1740 # '\u2029 : '\u2029' | |
1741 # default : '' | |
1742 ch = self.reader.peek() | |
1743 if ch in '\r\n\x85': | |
1744 if self.reader.prefix(2) == '\r\n': | |
1745 self.reader.forward(2) | |
1746 else: | |
1747 self.reader.forward() | |
1748 return '\n' | |
1749 elif ch in '\u2028\u2029': | |
1750 self.reader.forward() | |
1751 return ch | |
1752 return "" | |
1753 | |
1754 | |
1755 class RoundTripScanner(Scanner): | |
1756 def check_token(self, *choices): | |
1757 # type: (Any) -> bool | |
1758 # Check if the next token is one of the given types. | |
1759 while self.need_more_tokens(): | |
1760 self.fetch_more_tokens() | |
1761 self._gather_comments() | |
1762 if bool(self.tokens): | |
1763 if not choices: | |
1764 return True | |
1765 for choice in choices: | |
1766 if isinstance(self.tokens[0], choice): | |
1767 return True | |
1768 return False | |
1769 | |
1770 def peek_token(self): | |
1771 # type: () -> Any | |
1772 # Return the next token, but do not delete if from the queue. | |
1773 while self.need_more_tokens(): | |
1774 self.fetch_more_tokens() | |
1775 self._gather_comments() | |
1776 if bool(self.tokens): | |
1777 return self.tokens[0] | |
1778 return None | |
1779 | |
1780 def _gather_comments(self): | |
1781 # type: () -> Any | |
1782 """combine multiple comment lines""" | |
1783 comments = [] # type: List[Any] | |
1784 if not self.tokens: | |
1785 return comments | |
1786 if isinstance(self.tokens[0], CommentToken): | |
1787 comment = self.tokens.pop(0) | |
1788 self.tokens_taken += 1 | |
1789 comments.append(comment) | |
1790 while self.need_more_tokens(): | |
1791 self.fetch_more_tokens() | |
1792 if not self.tokens: | |
1793 return comments | |
1794 if isinstance(self.tokens[0], CommentToken): | |
1795 self.tokens_taken += 1 | |
1796 comment = self.tokens.pop(0) | |
1797 # nprint('dropping2', comment) | |
1798 comments.append(comment) | |
1799 if len(comments) >= 1: | |
1800 self.tokens[0].add_pre_comments(comments) | |
1801 # pull in post comment on e.g. ':' | |
1802 if not self.done and len(self.tokens) < 2: | |
1803 self.fetch_more_tokens() | |
1804 | |
    def get_token(self):
        # type: () -> Any
        """Return the next token, folding a following CommentToken into it
        as a post-comment where appropriate (round-trip preservation)."""
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (ScalarToken, ValueToken, FlowSequenceEndToken, FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # Same-line comment: attach it (and any directly following
                # comment lines, re-indented) to the token.
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], ScalarToken)
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # Comment on a later line than the scalar: keep the blank
                # lines and column offset in front of the comment text.
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None
1858 | |
1859 def fetch_comment(self, comment): | |
1860 # type: (Any) -> None | |
1861 value, start_mark, end_mark = comment | |
1862 while value and value[-1] == ' ': | |
1863 # empty line within indented key context | |
1864 # no need to update end-mark, that is not used | |
1865 value = value[:-1] | |
1866 self.tokens.append(CommentToken(value, start_mark, end_mark)) | |
1867 | |
1868 # scanner | |
1869 | |
    def scan_to_next_token(self):
        # type: () -> Any
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        #
        # NOTE(review): unlike the base Scanner version, this round-trip
        # variant appears to return a (comment_text, start_mark, end_mark)
        # triple when it skips a comment or empty top-level lines, and None
        # when it stops at a real token — confirm against the caller.

        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()  # strip a leading byte-order mark
        found = False
        while not found:
            while srp() == ' ':
                srf()
            ch = srp()
            if ch == '#':
                # comment: capture its full text up to end of line
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment too
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if bool(self.scan_line_break()):
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    # consume the whole run of empty lines (spaces/tabs
                    # included via empty_line=True)
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None
1942 | |
1943 def scan_line_break(self, empty_line=False): | |
1944 # type: (bool) -> Text | |
1945 # Transforms: | |
1946 # '\r\n' : '\n' | |
1947 # '\r' : '\n' | |
1948 # '\n' : '\n' | |
1949 # '\x85' : '\n' | |
1950 # '\u2028' : '\u2028' | |
1951 # '\u2029 : '\u2029' | |
1952 # default : '' | |
1953 ch = self.reader.peek() # type: Text | |
1954 if ch in '\r\n\x85': | |
1955 if self.reader.prefix(2) == '\r\n': | |
1956 self.reader.forward(2) | |
1957 else: | |
1958 self.reader.forward() | |
1959 return '\n' | |
1960 elif ch in '\u2028\u2029': | |
1961 self.reader.forward() | |
1962 return ch | |
1963 elif empty_line and ch in '\t ': | |
1964 self.reader.forward() | |
1965 return ch | |
1966 return "" | |
1967 | |
    def scan_block_scalar(self, style, rt=True):
        # type: (Any, Optional[bool]) -> Any
        # Delegate to the base Scanner implementation, but default rt=True so
        # the round-trip scanner keeps block-scalar formatting information.
        return Scanner.scan_block_scalar(self, style, rt=rt)
1971 | |
1972 | |
1973 # try: | |
1974 # import psyco | |
1975 # psyco.bind(Scanner) | |
1976 # except ImportError: | |
1977 # pass |