comparison env/lib/python3.7/site-packages/ruamel/yaml/scanner.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # coding: utf-8
2
3 from __future__ import print_function, absolute_import, division, unicode_literals
4
5 # Scanner produces tokens of the following types:
6 # STREAM-START
7 # STREAM-END
8 # DIRECTIVE(name, value)
9 # DOCUMENT-START
10 # DOCUMENT-END
11 # BLOCK-SEQUENCE-START
12 # BLOCK-MAPPING-START
13 # BLOCK-END
14 # FLOW-SEQUENCE-START
15 # FLOW-MAPPING-START
16 # FLOW-SEQUENCE-END
17 # FLOW-MAPPING-END
18 # BLOCK-ENTRY
19 # FLOW-ENTRY
20 # KEY
21 # VALUE
22 # ALIAS(value)
23 # ANCHOR(value)
24 # TAG(value)
25 # SCALAR(value, plain, style)
26 #
27 # RoundTripScanner
28 # COMMENT(value)
29 #
30 # Read comments in the Scanner code for more details.
31 #
32
33 from ruamel.yaml.error import MarkedYAMLError
34 from ruamel.yaml.tokens import * # NOQA
35 from ruamel.yaml.compat import utf8, unichr, PY3, check_anchorname_char, nprint # NOQA
36
37 if False: # MYPY
38 from typing import Any, Dict, Optional, List, Union, Text # NOQA
39 from ruamel.yaml.compat import VersionType # NOQA
40
# Public names of this module.
__all__ = ['Scanner', 'RoundTripScanner', 'ScannerError']


# Line-break characters plus NUL (the Reader appends NUL to mark the end of
# the input).
_THE_END = '\n\0\r\x85\u2028\u2029'
# The characters of _THE_END plus space and tab.
_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
# Space and tab only.
_SPACE_TAB = ' \t'
47
48
class ScannerError(MarkedYAMLError):
    """Error raised by the scanner when the input cannot be tokenized."""
51
52
class SimpleKey(object):
    """Record of a position at which a simple key may start.

    Stores the number of the token that would begin the key, whether a key
    is required at that position, and the reader position (index, line,
    column, mark) at which it was recorded.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # type: (Any, Any, int, int, int, Any) -> None
        self.token_number, self.required = token_number, required
        self.index, self.line, self.column = index, line, column
        self.mark = mark
64
65
66 class Scanner(object):
67 def __init__(self, loader=None):
68 # type: (Any) -> None
69 """Initialize the scanner."""
70 # It is assumed that Scanner and Reader will have a common descendant.
71 # Reader do the dirty work of checking for BOM and converting the
72 # input data to Unicode. It also adds NUL to the end.
73 #
74 # Reader supports the following methods
75 # self.peek(i=0) # peek the next i-th character
76 # self.prefix(l=1) # peek the next l characters
77 # self.forward(l=1) # read the next l characters and move the pointer
78
79 self.loader = loader
80 if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
81 self.loader._scanner = self
82 self.reset_scanner()
83 self.first_time = False
84
@property
def flow_level(self):
    # type: () -> int
    """Return the flow nesting depth, i.e. the length of ``flow_context``.

    Zero means the scanner is in block context.
    """
    return len(self.flow_context)
89
90 def reset_scanner(self):
91 # type: () -> None
92 # Had we reached the end of the stream?
93 self.done = False
94
95 # flow_context is an expanding/shrinking list consisting of '{' and '['
96 # for each unclosed flow context. If empty list that means block context
97 self.flow_context = [] # type: List[Text]
98
99 # List of processed tokens that are not yet emitted.
100 self.tokens = [] # type: List[Any]
101
102 # Add the STREAM-START token.
103 self.fetch_stream_start()
104
105 # Number of tokens that were emitted through the `get_token` method.
106 self.tokens_taken = 0
107
108 # The current indentation level.
109 self.indent = -1
110
111 # Past indentation levels.
112 self.indents = [] # type: List[int]
113
114 # Variables related to simple keys treatment.
115
116 # A simple key is a key that is not denoted by the '?' indicator.
117 # Example of simple keys:
118 # ---
119 # block simple key: value
120 # ? not a simple key:
121 # : { flow simple key: value }
122 # We emit the KEY token before all keys, so when we find a potential
123 # simple key, we try to locate the corresponding ':' indicator.
124 # Simple keys should be limited to a single line and 1024 characters.
125
126 # Can a simple key start at the current position? A simple key may
127 # start:
128 # - at the beginning of the line, not counting indentation spaces
129 # (in block context),
130 # - after '{', '[', ',' (in the flow context),
131 # - after '?', ':', '-' (in the block context).
132 # In the block context, this flag also signifies if a block collection
133 # may start at the current position.
134 self.allow_simple_key = True
135
136 # Keep track of possible simple keys. This is a dictionary. The key
137 # is `flow_level`; there can be no more that one possible simple key
138 # for each level. The value is a SimpleKey record:
139 # (token_number, required, index, line, column, mark)
140 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
141 # '[', or '{' tokens.
142 self.possible_simple_keys = {} # type: Dict[Any, Any]
143
144 @property
145 def reader(self):
146 # type: () -> Any
147 try:
148 return self._scanner_reader # type: ignore
149 except AttributeError:
150 if hasattr(self.loader, 'typ'):
151 self._scanner_reader = self.loader.reader
152 else:
153 self._scanner_reader = self.loader._reader
154 return self._scanner_reader
155
156 @property
157 def scanner_processing_version(self): # prefix until un-composited
158 # type: () -> VersionType
159 if hasattr(self.loader, 'typ'):
160 return self.loader.resolver.processing_version # type: ignore
161 return self.loader.processing_version # type: ignore
162
163 # Public methods.
164
165 def check_token(self, *choices):
166 # type: (Any) -> bool
167 # Check if the next token is one of the given types.
168 while self.need_more_tokens():
169 self.fetch_more_tokens()
170 if bool(self.tokens):
171 if not choices:
172 return True
173 for choice in choices:
174 if isinstance(self.tokens[0], choice):
175 return True
176 return False
177
178 def peek_token(self):
179 # type: () -> Any
180 # Return the next token, but do not delete if from the queue.
181 while self.need_more_tokens():
182 self.fetch_more_tokens()
183 if bool(self.tokens):
184 return self.tokens[0]
185
186 def get_token(self):
187 # type: () -> Any
188 # Return the next token.
189 while self.need_more_tokens():
190 self.fetch_more_tokens()
191 if bool(self.tokens):
192 self.tokens_taken += 1
193 return self.tokens.pop(0)
194
195 # Private methods.
196
197 def need_more_tokens(self):
198 # type: () -> bool
199 if self.done:
200 return False
201 if not self.tokens:
202 return True
203 # The current token may be a potential simple key, so we
204 # need to look further.
205 self.stale_possible_simple_keys()
206 if self.next_possible_simple_key() == self.tokens_taken:
207 return True
208 return False
209
def fetch_comment(self, comment):
    # type: (Any) -> None
    """Handle a comment returned by ``scan_to_next_token``.

    Not implemented in this class; always raises NotImplementedError.
    """
    raise NotImplementedError
213
214 def fetch_more_tokens(self):
215 # type: () -> Any
216 # Eat whitespaces and comments until we reach the next token.
217 comment = self.scan_to_next_token()
218 if comment is not None: # never happens for base scanner
219 return self.fetch_comment(comment)
220 # Remove obsolete possible simple keys.
221 self.stale_possible_simple_keys()
222
223 # Compare the current indentation and column. It may add some tokens
224 # and decrease the current indentation level.
225 self.unwind_indent(self.reader.column)
226
227 # Peek the next character.
228 ch = self.reader.peek()
229
230 # Is it the end of stream?
231 if ch == '\0':
232 return self.fetch_stream_end()
233
234 # Is it a directive?
235 if ch == '%' and self.check_directive():
236 return self.fetch_directive()
237
238 # Is it the document start?
239 if ch == '-' and self.check_document_start():
240 return self.fetch_document_start()
241
242 # Is it the document end?
243 if ch == '.' and self.check_document_end():
244 return self.fetch_document_end()
245
246 # TODO: support for BOM within a stream.
247 # if ch == u'\uFEFF':
248 # return self.fetch_bom() <-- issue BOMToken
249
250 # Note: the order of the following checks is NOT significant.
251
252 # Is it the flow sequence start indicator?
253 if ch == '[':
254 return self.fetch_flow_sequence_start()
255
256 # Is it the flow mapping start indicator?
257 if ch == '{':
258 return self.fetch_flow_mapping_start()
259
260 # Is it the flow sequence end indicator?
261 if ch == ']':
262 return self.fetch_flow_sequence_end()
263
264 # Is it the flow mapping end indicator?
265 if ch == '}':
266 return self.fetch_flow_mapping_end()
267
268 # Is it the flow entry indicator?
269 if ch == ',':
270 return self.fetch_flow_entry()
271
272 # Is it the block entry indicator?
273 if ch == '-' and self.check_block_entry():
274 return self.fetch_block_entry()
275
276 # Is it the key indicator?
277 if ch == '?' and self.check_key():
278 return self.fetch_key()
279
280 # Is it the value indicator?
281 if ch == ':' and self.check_value():
282 return self.fetch_value()
283
284 # Is it an alias?
285 if ch == '*':
286 return self.fetch_alias()
287
288 # Is it an anchor?
289 if ch == '&':
290 return self.fetch_anchor()
291
292 # Is it a tag?
293 if ch == '!':
294 return self.fetch_tag()
295
296 # Is it a literal scalar?
297 if ch == '|' and not self.flow_level:
298 return self.fetch_literal()
299
300 # Is it a folded scalar?
301 if ch == '>' and not self.flow_level:
302 return self.fetch_folded()
303
304 # Is it a single quoted scalar?
305 if ch == "'":
306 return self.fetch_single()
307
308 # Is it a double quoted scalar?
309 if ch == '"':
310 return self.fetch_double()
311
312 # It must be a plain scalar then.
313 if self.check_plain():
314 return self.fetch_plain()
315
316 # No? It's an error. Let's produce a nice error message.
317 raise ScannerError(
318 'while scanning for the next token',
319 None,
320 'found character %r that cannot start any token' % utf8(ch),
321 self.reader.get_mark(),
322 )
323
324 # Simple keys treatment.
325
326 def next_possible_simple_key(self):
327 # type: () -> Any
328 # Return the number of the nearest possible simple key. Actually we
329 # don't need to loop through the whole dictionary. We may replace it
330 # with the following code:
331 # if not self.possible_simple_keys:
332 # return None
333 # return self.possible_simple_keys[
334 # min(self.possible_simple_keys.keys())].token_number
335 min_token_number = None
336 for level in self.possible_simple_keys:
337 key = self.possible_simple_keys[level]
338 if min_token_number is None or key.token_number < min_token_number:
339 min_token_number = key.token_number
340 return min_token_number
341
342 def stale_possible_simple_keys(self):
343 # type: () -> None
344 # Remove entries that are no longer possible simple keys. According to
345 # the YAML specification, simple keys
346 # - should be limited to a single line,
347 # - should be no longer than 1024 characters.
348 # Disabling this procedure will allow simple keys of any length and
349 # height (may cause problems if indentation is broken though).
350 for level in list(self.possible_simple_keys):
351 key = self.possible_simple_keys[level]
352 if key.line != self.reader.line or self.reader.index - key.index > 1024:
353 if key.required:
354 raise ScannerError(
355 'while scanning a simple key',
356 key.mark,
357 "could not find expected ':'",
358 self.reader.get_mark(),
359 )
360 del self.possible_simple_keys[level]
361
362 def save_possible_simple_key(self):
363 # type: () -> None
364 # The next token may start a simple key. We check if it's possible
365 # and save its position. This function is called for
366 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
367
368 # Check if a simple key is required at the current position.
369 required = not self.flow_level and self.indent == self.reader.column
370
371 # The next token might be a simple key. Let's save it's number and
372 # position.
373 if self.allow_simple_key:
374 self.remove_possible_simple_key()
375 token_number = self.tokens_taken + len(self.tokens)
376 key = SimpleKey(
377 token_number,
378 required,
379 self.reader.index,
380 self.reader.line,
381 self.reader.column,
382 self.reader.get_mark(),
383 )
384 self.possible_simple_keys[self.flow_level] = key
385
386 def remove_possible_simple_key(self):
387 # type: () -> None
388 # Remove the saved possible key position at the current flow level.
389 if self.flow_level in self.possible_simple_keys:
390 key = self.possible_simple_keys[self.flow_level]
391
392 if key.required:
393 raise ScannerError(
394 'while scanning a simple key',
395 key.mark,
396 "could not find expected ':'",
397 self.reader.get_mark(),
398 )
399
400 del self.possible_simple_keys[self.flow_level]
401
402 # Indentation functions.
403
404 def unwind_indent(self, column):
405 # type: (Any) -> None
406 # In flow context, tokens should respect indentation.
407 # Actually the condition should be `self.indent >= column` according to
408 # the spec. But this condition will prohibit intuitively correct
409 # constructions such as
410 # key : {
411 # }
412 # ####
413 # if self.flow_level and self.indent > column:
414 # raise ScannerError(None, None,
415 # "invalid intendation or unclosed '[' or '{'",
416 # self.reader.get_mark())
417
418 # In the flow context, indentation is ignored. We make the scanner less
419 # restrictive then specification requires.
420 if bool(self.flow_level):
421 return
422
423 # In block context, we may need to issue the BLOCK-END tokens.
424 while self.indent > column:
425 mark = self.reader.get_mark()
426 self.indent = self.indents.pop()
427 self.tokens.append(BlockEndToken(mark, mark))
428
429 def add_indent(self, column):
430 # type: (int) -> bool
431 # Check if we need to increase indentation.
432 if self.indent < column:
433 self.indents.append(self.indent)
434 self.indent = column
435 return True
436 return False
437
438 # Fetchers.
439
def fetch_stream_start(self):
    # type: () -> None
    """Queue the STREAM-START token.

    STREAM-START is always the first token and STREAM-END the last one.
    """
    here = self.reader.get_mark()
    token = StreamStartToken(here, here, encoding=self.reader.encoding)
    self.tokens.append(token)
448
def fetch_stream_end(self):
    # type: () -> None
    """Close all open blocks, queue STREAM-END and mark the scanner done."""
    # Unwinding to -1 closes every open block.
    self.unwind_indent(-1)

    # No simple keys can follow.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    self.possible_simple_keys = {}  # type: Dict[Any, Any]

    here = self.reader.get_mark()
    self.tokens.append(StreamEndToken(here, here))

    # The stream is finished.
    self.done = True
463
464 def fetch_directive(self):
465 # type: () -> None
466 # Set the current intendation to -1.
467 self.unwind_indent(-1)
468
469 # Reset simple keys.
470 self.remove_possible_simple_key()
471 self.allow_simple_key = False
472
473 # Scan and add DIRECTIVE.
474 self.tokens.append(self.scan_directive())
475
def fetch_document_start(self):
    # type: () -> None
    """Queue a DOCUMENT-START token via ``fetch_document_indicator``."""
    self.fetch_document_indicator(DocumentStartToken)
479
def fetch_document_end(self):
    # type: () -> None
    """Queue a DOCUMENT-END token via ``fetch_document_indicator``."""
    self.fetch_document_indicator(DocumentEndToken)
483
484 def fetch_document_indicator(self, TokenClass):
485 # type: (Any) -> None
486 # Set the current intendation to -1.
487 self.unwind_indent(-1)
488
489 # Reset simple keys. Note that there could not be a block collection
490 # after '---'.
491 self.remove_possible_simple_key()
492 self.allow_simple_key = False
493
494 # Add DOCUMENT-START or DOCUMENT-END.
495 start_mark = self.reader.get_mark()
496 self.reader.forward(3)
497 end_mark = self.reader.get_mark()
498 self.tokens.append(TokenClass(start_mark, end_mark))
499
def fetch_flow_sequence_start(self):
    # type: () -> None
    """Queue FLOW-SEQUENCE-START and push '[' onto the flow context."""
    self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[')
503
def fetch_flow_mapping_start(self):
    # type: () -> None
    """Queue FLOW-MAPPING-START and push '{' onto the flow context."""
    self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{')
507
508 def fetch_flow_collection_start(self, TokenClass, to_push):
509 # type: (Any, Text) -> None
510 # '[' and '{' may start a simple key.
511 self.save_possible_simple_key()
512 # Increase the flow level.
513 self.flow_context.append(to_push)
514 # Simple keys are allowed after '[' and '{'.
515 self.allow_simple_key = True
516 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
517 start_mark = self.reader.get_mark()
518 self.reader.forward()
519 end_mark = self.reader.get_mark()
520 self.tokens.append(TokenClass(start_mark, end_mark))
521
def fetch_flow_sequence_end(self):
    # type: () -> None
    """Queue a FLOW-SEQUENCE-END token via ``fetch_flow_collection_end``."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
525
def fetch_flow_mapping_end(self):
    # type: () -> None
    """Queue a FLOW-MAPPING-END token via ``fetch_flow_collection_end``."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
529
530 def fetch_flow_collection_end(self, TokenClass):
531 # type: (Any) -> None
532 # Reset possible simple key on the current level.
533 self.remove_possible_simple_key()
534 # Decrease the flow level.
535 try:
536 popped = self.flow_context.pop() # NOQA
537 except IndexError:
538 # We must not be in a list or object.
539 # Defer error handling to the parser.
540 pass
541 # No simple keys after ']' or '}'.
542 self.allow_simple_key = False
543 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
544 start_mark = self.reader.get_mark()
545 self.reader.forward()
546 end_mark = self.reader.get_mark()
547 self.tokens.append(TokenClass(start_mark, end_mark))
548
def fetch_flow_entry(self):
    # type: () -> None
    """Consume ',' and queue a FLOW-ENTRY token."""
    # A simple key may follow ','.
    self.allow_simple_key = True

    # Forget the candidate stored for the current level.
    self.remove_possible_simple_key()

    begin = self.reader.get_mark()
    self.reader.forward()
    finish = self.reader.get_mark()
    self.tokens.append(FlowEntryToken(begin, finish))
560
def fetch_block_entry(self):
    # type: () -> None
    """Consume '-' and queue BLOCK-ENTRY, opening a block sequence if needed.

    Raises ScannerError in block context when a new entry is not allowed.
    """
    if not self.flow_level:
        # Block context: an entry may only start where a simple key may.
        if not self.allow_simple_key:
            raise ScannerError(
                None, None, 'sequence entries are not allowed here', self.reader.get_mark()
            )
        # A deeper column opens a new block sequence.
        if self.add_indent(self.reader.column):
            here = self.reader.get_mark()
            self.tokens.append(BlockSequenceStartToken(here, here))
    # A block entry inside flow context is an error, but detecting it is
    # left to the parser.

    # A simple key may follow '-'.
    self.allow_simple_key = True
    self.remove_possible_simple_key()

    begin = self.reader.get_mark()
    self.reader.forward()
    finish = self.reader.get_mark()
    self.tokens.append(BlockEntryToken(begin, finish))
588
def fetch_key(self):
    # type: () -> None
    """Consume '?' and queue KEY, opening a block mapping if needed.

    Raises ScannerError in block context when a key is not allowed.
    """
    in_block = not self.flow_level
    if in_block:
        # A key (not necessarily a simple one) may only start where a
        # simple key may.
        if not self.allow_simple_key:
            raise ScannerError(
                None, None, 'mapping keys are not allowed here', self.reader.get_mark()
            )
        # A deeper column opens a new block mapping.
        if self.add_indent(self.reader.column):
            here = self.reader.get_mark()
            self.tokens.append(BlockMappingStartToken(here, here))

    # After '?' a simple key is allowed in block context only.
    self.allow_simple_key = in_block
    self.remove_possible_simple_key()

    begin = self.reader.get_mark()
    self.reader.forward()
    finish = self.reader.get_mark()
    self.tokens.append(KeyToken(begin, finish))
616
def fetch_value(self):
    # type: () -> None
    """Consume ':' and queue VALUE, inserting KEY for a pending simple key.

    When a simple-key candidate exists for the current flow level, a KEY
    token (and, in block context, possibly BLOCK-MAPPING-START) is
    inserted at the candidate's position in the queue.  Otherwise the
    ':' belongs to a complex key.  Raises ScannerError in block context
    when a value is not allowed.
    """
    level = self.flow_level
    in_block = not level

    if level in self.possible_simple_keys:
        # The candidate turned out to be a real simple key.
        key = self.possible_simple_keys.pop(level)
        position = key.token_number - self.tokens_taken
        self.tokens.insert(position, KeyToken(key.mark, key.mark))

        # A key at a deeper column opens a new block mapping; its start
        # token goes in front of the KEY token.
        if in_block and self.add_indent(key.column):
            self.tokens.insert(position, BlockMappingStartToken(key.mark, key.mark))

        # Two simple keys cannot follow one another.
        self.allow_simple_key = False
    else:
        # Part of a complex key.
        if in_block:
            # A complex value may start only where a simple key may.
            # (The parser would catch this as well.)
            if not self.allow_simple_key:
                raise ScannerError(
                    None,
                    None,
                    'mapping values are not allowed here',
                    self.reader.get_mark(),
                )
            # A value at a deeper column opens a new block mapping; the
            # parser will report it as an error later.
            if self.add_indent(self.reader.column):
                here = self.reader.get_mark()
                self.tokens.append(BlockMappingStartToken(here, here))

        # After ':' a simple key is allowed in block context only.
        self.allow_simple_key = in_block
        self.remove_possible_simple_key()

    begin = self.reader.get_mark()
    self.reader.forward()
    finish = self.reader.get_mark()
    self.tokens.append(ValueToken(begin, finish))
677
def fetch_alias(self):
    # type: () -> None
    """Scan an alias and queue the resulting ALIAS token."""
    # An alias may itself be a simple key ...
    self.save_possible_simple_key()
    # ... but no simple key may follow it.
    self.allow_simple_key = False
    alias = self.scan_anchor(AliasToken)
    self.tokens.append(alias)
686
def fetch_anchor(self):
    # type: () -> None
    """Scan an anchor and queue the resulting ANCHOR token."""
    # An anchor may begin a simple key ...
    self.save_possible_simple_key()
    # ... but no simple key may follow it.
    self.allow_simple_key = False
    anchor = self.scan_anchor(AnchorToken)
    self.tokens.append(anchor)
695
696 def fetch_tag(self):
697 # type: () -> None
698 # TAG could start a simple key.
699 self.save_possible_simple_key()
700 # No simple keys after TAG.
701 self.allow_simple_key = False
702 # Scan and add TAG.
703 self.tokens.append(self.scan_tag())
704
def fetch_literal(self):
    # type: () -> None
    """Queue a block scalar scanned with style '|'."""
    self.fetch_block_scalar(style='|')
708
def fetch_folded(self):
    # type: () -> None
    """Queue a block scalar scanned with style '>'."""
    self.fetch_block_scalar(style='>')
712
713 def fetch_block_scalar(self, style):
714 # type: (Any) -> None
715 # A simple key may follow a block scalar.
716 self.allow_simple_key = True
717 # Reset possible simple key on the current level.
718 self.remove_possible_simple_key()
719 # Scan and add SCALAR.
720 self.tokens.append(self.scan_block_scalar(style))
721
def fetch_single(self):
    # type: () -> None
    """Queue a flow scalar scanned with the single-quote style."""
    self.fetch_flow_scalar(style="'")
725
def fetch_double(self):
    # type: () -> None
    """Queue a flow scalar scanned with the double-quote style."""
    self.fetch_flow_scalar(style='"')
729
730 def fetch_flow_scalar(self, style):
731 # type: (Any) -> None
732 # A flow scalar could be a simple key.
733 self.save_possible_simple_key()
734 # No simple keys after flow scalars.
735 self.allow_simple_key = False
736 # Scan and add SCALAR.
737 self.tokens.append(self.scan_flow_scalar(style))
738
739 def fetch_plain(self):
740 # type: () -> None
741 # A plain scalar could be a simple key.
742 self.save_possible_simple_key()
743 # No simple keys after plain scalars. But note that `scan_plain` will
744 # change this flag if the scan is finished at the beginning of the
745 # line.
746 self.allow_simple_key = False
747 # Scan and add SCALAR. May change `allow_simple_key`.
748 self.tokens.append(self.scan_plain())
749
750 # Checkers.
751
752 def check_directive(self):
753 # type: () -> Any
754 # DIRECTIVE: ^ '%' ...
755 # The '%' indicator is already checked.
756 if self.reader.column == 0:
757 return True
758 return None
759
760 def check_document_start(self):
761 # type: () -> Any
762 # DOCUMENT-START: ^ '---' (' '|'\n')
763 if self.reader.column == 0:
764 if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
765 return True
766 return None
767
768 def check_document_end(self):
769 # type: () -> Any
770 # DOCUMENT-END: ^ '...' (' '|'\n')
771 if self.reader.column == 0:
772 if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
773 return True
774 return None
775
def check_block_entry(self):
    # type: () -> Any
    """Tell whether the character after the current one ends the '-' indicator.

    BLOCK-ENTRY: '-' (' '|'\n')
    True when the following character is in ``_THE_END_SPACE_TAB``.
    """
    return self.reader.peek(1) in _THE_END_SPACE_TAB
780
781 def check_key(self):
782 # type: () -> Any
783 # KEY(flow context): '?'
784 if bool(self.flow_level):
785 return True
786 # KEY(block context): '?' (' '|'\n')
787 return self.reader.peek(1) in _THE_END_SPACE_TAB
788
789 def check_value(self):
790 # type: () -> Any
791 # VALUE(flow context): ':'
792 if self.scanner_processing_version == (1, 1):
793 if bool(self.flow_level):
794 return True
795 else:
796 if bool(self.flow_level):
797 if self.flow_context[-1] == '[':
798 if self.reader.peek(1) not in _THE_END_SPACE_TAB:
799 return False
800 elif self.tokens and isinstance(self.tokens[-1], ValueToken):
801 # mapping flow context scanning a value token
802 if self.reader.peek(1) not in _THE_END_SPACE_TAB:
803 return False
804 return True
805 # VALUE(block context): ':' (' '|'\n')
806 return self.reader.peek(1) in _THE_END_SPACE_TAB
807
808 def check_plain(self):
809 # type: () -> Any
810 # A plain scalar may start with any non-space character except:
811 # '-', '?', ':', ',', '[', ']', '{', '}',
812 # '#', '&', '*', '!', '|', '>', '\'', '\"',
813 # '%', '@', '`'.
814 #
815 # It may also start with
816 # '-', '?', ':'
817 # if it is followed by a non-space character.
818 #
819 # Note that we limit the last rule to the block context (except the
820 # '-' character) because we want the flow context to be space
821 # independent.
822 srp = self.reader.peek
823 ch = srp()
824 if self.scanner_processing_version == (1, 1):
825 return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
826 srp(1) not in _THE_END_SPACE_TAB
827 and (ch == '-' or (not self.flow_level and ch in '?:'))
828 )
829 # YAML 1.2
830 if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
831 # ################### ^ ???
832 return True
833 ch1 = srp(1)
834 if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
835 return True
836 if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
837 return True
838
839 return srp(1) not in _THE_END_SPACE_TAB and (
840 ch == '-' or (not self.flow_level and ch in '?:')
841 )
842
843 # Scanners.
844
845 def scan_to_next_token(self):
846 # type: () -> Any
847 # We ignore spaces, line breaks and comments.
848 # If we find a line break in the block context, we set the flag
849 # `allow_simple_key` on.
850 # The byte order mark is stripped if it's the first character in the
851 # stream. We do not yet support BOM inside the stream as the
852 # specification requires. Any such mark will be considered as a part
853 # of the document.
854 #
855 # TODO: We need to make tab handling rules more sane. A good rule is
856 # Tabs cannot precede tokens
857 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
858 # KEY(block), VALUE(block), BLOCK-ENTRY
859 # So the checking code is
860 # if <TAB>:
861 # self.allow_simple_keys = False
862 # We also need to add the check for `allow_simple_keys == True` to
863 # `unwind_indent` before issuing BLOCK-END.
864 # Scanners for block, flow, and plain scalars need to be modified.
865 srp = self.reader.peek
866 srf = self.reader.forward
867 if self.reader.index == 0 and srp() == '\uFEFF':
868 srf()
869 found = False
870 _the_end = _THE_END
871 while not found:
872 while srp() == ' ':
873 srf()
874 if srp() == '#':
875 while srp() not in _the_end:
876 srf()
877 if self.scan_line_break():
878 if not self.flow_level:
879 self.allow_simple_key = True
880 else:
881 found = True
882 return None
883
def scan_directive(self):
    # type: () -> Any
    """Scan a '%' directive line and return a DirectiveToken.

    The value is scanned for the 'YAML' and 'TAG' directives; for any
    other name the rest of the line is skipped and the value is None.
    See the specification for details.
    """
    peek = self.reader.peek
    advance = self.reader.forward
    start_mark = self.reader.get_mark()
    advance()  # skip the '%'
    name = self.scan_directive_name(start_mark)

    value = None
    if name == 'YAML':
        value = self.scan_yaml_directive_value(start_mark)
    elif name == 'TAG':
        value = self.scan_tag_directive_value(start_mark)
    end_mark = self.reader.get_mark()

    if name not in ('YAML', 'TAG'):
        # Unknown directive: skip its parameters.
        while peek() not in _THE_END:
            advance()

    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)
905
906 def scan_directive_name(self, start_mark):
907 # type: (Any) -> Any
908 # See the specification for details.
909 length = 0
910 srp = self.reader.peek
911 ch = srp(length)
912 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.':
913 length += 1
914 ch = srp(length)
915 if not length:
916 raise ScannerError(
917 'while scanning a directive',
918 start_mark,
919 'expected alphabetic or numeric character, but found %r' % utf8(ch),
920 self.reader.get_mark(),
921 )
922 value = self.reader.prefix(length)
923 self.reader.forward(length)
924 ch = srp()
925 if ch not in '\0 \r\n\x85\u2028\u2029':
926 raise ScannerError(
927 'while scanning a directive',
928 start_mark,
929 'expected alphabetic or numeric character, but found %r' % utf8(ch),
930 self.reader.get_mark(),
931 )
932 return value
933
934 def scan_yaml_directive_value(self, start_mark):
935 # type: (Any) -> Any
936 # See the specification for details.
937 srp = self.reader.peek
938 srf = self.reader.forward
939 while srp() == ' ':
940 srf()
941 major = self.scan_yaml_directive_number(start_mark)
942 if srp() != '.':
943 raise ScannerError(
944 'while scanning a directive',
945 start_mark,
946 "expected a digit or '.', but found %r" % utf8(srp()),
947 self.reader.get_mark(),
948 )
949 srf()
950 minor = self.scan_yaml_directive_number(start_mark)
951 if srp() not in '\0 \r\n\x85\u2028\u2029':
952 raise ScannerError(
953 'while scanning a directive',
954 start_mark,
955 "expected a digit or ' ', but found %r" % utf8(srp()),
956 self.reader.get_mark(),
957 )
958 return (major, minor)
959
960 def scan_yaml_directive_number(self, start_mark):
961 # type: (Any) -> Any
962 # See the specification for details.
963 srp = self.reader.peek
964 srf = self.reader.forward
965 ch = srp()
966 if not ('0' <= ch <= '9'):
967 raise ScannerError(
968 'while scanning a directive',
969 start_mark,
970 'expected a digit, but found %r' % utf8(ch),
971 self.reader.get_mark(),
972 )
973 length = 0
974 while '0' <= srp(length) <= '9':
975 length += 1
976 value = int(self.reader.prefix(length))
977 srf(length)
978 return value
979
980 def scan_tag_directive_value(self, start_mark):
981 # type: (Any) -> Any
982 # See the specification for details.
983 srp = self.reader.peek
984 srf = self.reader.forward
985 while srp() == ' ':
986 srf()
987 handle = self.scan_tag_directive_handle(start_mark)
988 while srp() == ' ':
989 srf()
990 prefix = self.scan_tag_directive_prefix(start_mark)
991 return (handle, prefix)
992
993 def scan_tag_directive_handle(self, start_mark):
994 # type: (Any) -> Any
995 # See the specification for details.
996 value = self.scan_tag_handle('directive', start_mark)
997 ch = self.reader.peek()
998 if ch != ' ':
999 raise ScannerError(
1000 'while scanning a directive',
1001 start_mark,
1002 "expected ' ', but found %r" % utf8(ch),
1003 self.reader.get_mark(),
1004 )
1005 return value
1006
1007 def scan_tag_directive_prefix(self, start_mark):
1008 # type: (Any) -> Any
1009 # See the specification for details.
1010 value = self.scan_tag_uri('directive', start_mark)
1011 ch = self.reader.peek()
1012 if ch not in '\0 \r\n\x85\u2028\u2029':
1013 raise ScannerError(
1014 'while scanning a directive',
1015 start_mark,
1016 "expected ' ', but found %r" % utf8(ch),
1017 self.reader.get_mark(),
1018 )
1019 return value
1020
def scan_directive_ignored_line(self, start_mark):
    # type: (Any) -> None
    """Consume the rest of a directive line: spaces, a comment, the break.

    Raises ScannerError when anything other than spaces and a comment
    precedes the end of the line.  See the specification for details.
    """
    peek, advance = self.reader.peek, self.reader.forward

    while peek() == ' ':
        advance()
    if peek() == '#':
        # Skip the comment up to the end of the line.
        while peek() not in _THE_END:
            advance()

    ch = peek()
    if ch not in _THE_END:
        raise ScannerError(
            'while scanning a directive',
            start_mark,
            'expected a comment or a line break, but found %r' % utf8(ch),
            self.reader.get_mark(),
        )
    self.scan_line_break()
1040
def scan_anchor(self, TokenClass):
    # type: (Any) -> Any
    """Scan an anchor ('&name') or alias ('*name') and return a token.

    The specification does not restrict the characters allowed in anchors
    and aliases.  This can be ambiguous, for instance the document:
      [ *alias, value ]
    can be interpreted in two ways, as
      [ "value" ]
    and
      [ *alias , "value" ]
    Therefore the name is limited to the characters accepted by
    check_anchorname_char.

    Returns TokenClass(value, start_mark, end_mark).  Raises ScannerError
    when the name is empty or is followed by an unexpected character.
    """
    reader = self.reader
    start_mark = reader.get_mark()
    # The indicator character only decides the wording of error messages.
    name = 'alias' if reader.peek() == '*' else 'anchor'
    reader.forward()
    length = 0
    ch = reader.peek(length)
    while check_anchorname_char(ch):
        length += 1
        ch = reader.peek(length)
    if length == 0:
        raise ScannerError(
            'while scanning an %s' % (name,),
            start_mark,
            'expected alphabetic or numeric character, but found %r' % utf8(ch),
            reader.get_mark(),
        )
    value = reader.prefix(length)
    reader.forward(length)
    # ch still holds the first character after the name, so no new peek is
    # needed to validate the terminator.
    if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`':
        raise ScannerError(
            'while scanning an %s' % (name,),
            start_mark,
            'expected alphabetic or numeric character, but found %r' % utf8(ch),
            reader.get_mark(),
        )
    end_mark = reader.get_mark()
    return TokenClass(value, start_mark, end_mark)
1087
def scan_tag(self):
    # type: () -> Any
    """Scan a tag starting at '!' and return a TagToken.

    Three forms are recognised, selected by the character after the '!':
      * '!<uri>'  -> value (None, uri)
      * '!' alone (followed by space, tab, break or end) -> value (None, '!')
      * '!suffix' or '!handle!suffix' -> value (handle, suffix)
    Raises ScannerError on a missing '>' or when the tag is not followed by
    a space, a line break or the end of the stream.
    """
    reader = self.reader
    start_mark = reader.get_mark()
    ch = reader.peek(1)
    if ch == '<':
        # Verbatim tag: skip '!<', read the URI, require the closing '>'.
        handle = None
        reader.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        closing = reader.peek()
        if closing != '>':
            raise ScannerError(
                'while parsing a tag',
                start_mark,
                "expected '>', but found %r" % utf8(closing),
                reader.get_mark(),
            )
        reader.forward()
    elif ch in _THE_END_SPACE_TAB:
        # A lone '!'.
        handle = None
        suffix = '!'
        reader.forward()
    else:
        # Look ahead for a second '!' before the next space/break: if one
        # exists the tag has an explicit handle, otherwise the handle is '!'.
        offset = 1
        while ch not in '\0 \r\n\x85\u2028\u2029':
            if ch == '!':
                handle = self.scan_tag_handle('tag', start_mark)
                break
            offset += 1
            ch = reader.peek(offset)
        else:
            handle = '!'
            reader.forward()
        suffix = self.scan_tag_uri('tag', start_mark)
    ch = reader.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError(
            'while scanning a tag',
            start_mark,
            "expected ' ', but found %r" % utf8(ch),
            reader.get_mark(),
        )
    value = (handle, suffix)
    end_mark = reader.get_mark()
    return TagToken(value, start_mark, end_mark)
1137
def scan_block_scalar(self, style, rt=False):
    # type: (Any, Optional[bool]) -> Any
    """Scan a literal ('|') or folded ('>') block scalar into a ScalarToken.

    style is the indicator character; '>' selects folding.  When rt is true
    a '\\a' marker is appended at every position where a folded '\\n' line
    break was seen (NOTE(review): presumably so a round-trip dumper can
    restore the original fold positions - confirm against the consumer).

    A comment on the header line is attached to the token through
    add_pre_comments; line breaks that chomping excludes from the value,
    plus any comments that follow them, are attached as a post comment.
    """
    # See the specification for details.
    srp = self.reader.peek
    if style == '>':
        folded = True
    else:
        folded = False

    chunks = []  # type: List[Any]
    start_mark = self.reader.get_mark()

    # Scan the header.
    self.reader.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    # block scalar comment e.g. : |+ # comment text
    block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)

    # Determine the indentation level and go to the first non-empty line.
    min_indent = self.indent + 1
    if increment is None:
        # no increment and top level, min_indent could be 0
        if min_indent < 1 and (
            style not in '|>'
            or (self.scanner_processing_version == (1, 1))
            and getattr(
                self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False
            )
        ):
            min_indent = 1
        # No explicit indicator: the indentation is detected from the
        # leading lines, but is never smaller than min_indent.
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    else:
        if min_indent < 1:
            min_indent = 1
        # Explicit indicator: it is relative to the enclosing indentation.
        indent = min_indent + increment - 1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    line_break = ""

    # Scan the inner part of the block scalar.
    while self.reader.column == indent and srp() != '\0':
        chunks.extend(breaks)
        # Remember whether this line starts with text; used by the folding
        # decision below.
        leading_non_space = srp() not in ' \t'
        length = 0
        while srp(length) not in _THE_END:
            length += 1
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if style in '|>' and min_indent == 0:
            # at the beginning of a line, if in block style see if
            # end of document/start_new_document
            if self.check_document_start() or self.check_document_end():
                break
        if self.reader.column == indent and srp() != '\0':

            # Unfortunately, folding rules are ambiguous.
            #
            # This is the folding according to the specification:

            if rt and folded and line_break == '\n':
                chunks.append('\a')
            if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                # Both this line and the next start with text: the break
                # becomes a single space unless empty lines intervene.
                if not breaks:
                    chunks.append(' ')
            else:
                chunks.append(line_break)

            # This is Clark Evans's interpretation (also in the spec
            # examples):
            #
            # if folded and line_break == u'\n':
            #     if not breaks:
            #         if srp() not in ' \t':
            #             chunks.append(u' ')
            #         else:
            #             chunks.append(line_break)
            # else:
            #     chunks.append(line_break)
        else:
            break

    # Process trailing line breaks. The 'chomping' setting determines
    # whether they are included in the value.
    # chomping is None (no indicator), True ('+') or False ('-'), as returned
    # by scan_block_scalar_indicators.
    trailing = []  # type: List[Any]
    if chomping in [None, True]:
        chunks.append(line_break)
    if chomping is True:
        chunks.extend(breaks)
    elif chomping in [None, False]:
        trailing.extend(breaks)

    # We are done.
    token = ScalarToken("".join(chunks), False, start_mark, end_mark, style)
    if block_scalar_comment is not None:
        token.add_pre_comments([block_scalar_comment])
    if len(trailing) > 0:
        # nprint('trailing 1', trailing) # XXXXX
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        while comment:
            # comment is indexed as (text, start_mark, ...); the text is
            # re-indented to its original column.
            trailing.append(' ' * comment[1].column + comment[0])
            comment = self.scan_to_next_token()

        # Keep track of the trailing whitespace and following comments
        # as a comment token, if isn't all included in the actual value.
        comment_end_mark = self.reader.get_mark()
        comment = CommentToken("".join(trailing), end_mark, comment_end_mark)
        token.add_post_comment(comment)
    return token
1249
def scan_block_scalar_indicators(self, start_mark):
    # type: (Any) -> Any
    """Scan the optional chomping and indentation indicators of a block scalar.

    The two indicators may appear in either order.  Returns the tuple
    (chomping, increment) where chomping is True for '+', False for '-' and
    None when absent, and increment is an int in 1..9 or None when absent.
    Raises ScannerError for an indentation indicator of 0 or when the
    indicators are not followed by a space, a line break or end of stream.
    """
    reader = self.reader
    digits = '0123456789'

    def take_chomping(sign):
        # Consume the '+' / '-' and translate it to a boolean.
        reader.forward()
        return sign == '+'

    def take_increment(digit):
        # Validate first so the error mark points at the offending digit.
        amount = int(digit)
        if amount == 0:
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                'expected indentation indicator in the range 1-9, ' 'but found 0',
                reader.get_mark(),
            )
        reader.forward()
        return amount

    chomping = None
    increment = None
    ch = reader.peek()
    if ch in '+-':
        chomping = take_chomping(ch)
        ch = reader.peek()
        if ch in digits:
            increment = take_increment(ch)
    elif ch in digits:
        increment = take_increment(ch)
        ch = reader.peek()
        if ch in '+-':
            chomping = take_chomping(ch)
    ch = reader.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError(
            'while scanning a block scalar',
            start_mark,
            'expected chomping or indentation indicators, but found %r' % utf8(ch),
            reader.get_mark(),
        )
    return chomping, increment
1300
def scan_block_scalar_ignored_line(self, start_mark):
    # type: (Any) -> Any
    """Consume the rest of a block scalar header line.

    Returns the comment found on that line, including the spaces that
    precede it, or None when there is no comment.  Raises ScannerError if
    anything other than spaces and a comment precedes the line end.  The
    line break is consumed through scan_line_break.
    """
    reader = self.reader
    pieces = []
    while reader.peek() == ' ':
        pieces.append(reader.peek())
        reader.forward()
    comment = None
    if reader.peek() == '#':
        # Collect the comment text up to, not including, the line end.
        while reader.peek() not in _THE_END:
            pieces.append(reader.peek())
            reader.forward()
        comment = "".join(pieces)
    ch = reader.peek()
    if ch not in _THE_END:
        raise ScannerError(
            'while scanning a block scalar',
            start_mark,
            'expected a comment or a line break, but found %r' % utf8(ch),
            reader.get_mark(),
        )
    self.scan_line_break()
    return comment
1326
def scan_block_scalar_indentation(self):
    # type: () -> Any
    """Skip leading blank lines of a block scalar and detect its indentation.

    Consumes spaces and line breaks.  Returns (breaks, max_indent, end_mark):
    the line breaks that were consumed, the largest column reached while
    skipping spaces, and the mark taken after the last line break (or at
    entry when there was none).
    """
    reader = self.reader
    breaks = []
    deepest = 0
    end_mark = reader.get_mark()
    while True:
        ch = reader.peek()
        if ch == ' ':
            reader.forward()
            deepest = max(deepest, reader.column)
        elif ch in '\r\n\x85\u2028\u2029':
            breaks.append(self.scan_line_break())
            end_mark = reader.get_mark()
        else:
            break
    return breaks, deepest, end_mark
1344
def scan_block_scalar_breaks(self, indent):
    # type: (int) -> Any
    """Consume empty lines inside a block scalar.

    Spaces are skipped only while the reader column is below indent, so
    content indented deeper than indent is left untouched.  Returns
    (breaks, end_mark): the consumed line breaks and the mark taken after
    the last of them (or at entry when there was none).
    """
    reader = self.reader

    def skip_indentation():
        while reader.column < indent and reader.peek() == ' ':
            reader.forward()

    collected = []
    end_mark = reader.get_mark()
    skip_indentation()
    while reader.peek() in '\r\n\x85\u2028\u2029':
        collected.append(self.scan_line_break())
        end_mark = reader.get_mark()
        skip_indentation()
    return collected, end_mark
1360
def scan_flow_scalar(self, style):
    # type: (Any) -> Any
    """Scan a single- or double-quoted scalar and return a ScalarToken.

    Indentation rules are deliberately not enforced for quoted scalars: the
    quotes clearly delimit them, so this is less restrictive than the
    specification.  Only document separators inside the scalar are
    rejected (by the helpers this method calls).

    style is the quote style; '"' enables escape processing.
    """
    double = style == '"'
    reader = self.reader
    pieces = []  # type: List[Any]
    start_mark = reader.get_mark()
    # The closing quote must match the character the scalar opened with.
    quote = reader.peek()
    reader.forward()
    pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    while reader.peek() != quote:
        pieces.extend(self.scan_flow_scalar_spaces(double, start_mark))
        pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    reader.forward()
    end_mark = reader.get_mark()
    return ScalarToken("".join(pieces), False, start_mark, end_mark, style)
1385
# Single-character escapes of double-quoted scalars: maps the character that
# follows the backslash to the text it is replaced with.
ESCAPE_REPLACEMENTS = {
    '0': '\0',  # NUL
    'a': '\x07',  # bell
    'b': '\x08',  # backspace
    't': '\x09',  # horizontal tab
    '\t': '\x09',  # backslash followed by a literal tab is a tab as well
    'n': '\x0A',  # line feed
    'v': '\x0B',  # vertical tab
    'f': '\x0C',  # form feed
    'r': '\x0D',  # carriage return
    'e': '\x1B',  # escape
    ' ': '\x20',  # space
    '"': '"',
    '/': '/',  # as per http://www.json.org/
    '\\': '\\',
    'N': '\x85',  # next line (NEL)
    '_': '\xA0',  # non-breaking space
    'L': '\u2028',  # line separator
    'P': '\u2029',  # paragraph separator
}

# Numeric escapes: maps the introducing character to the number of
# hexadecimal digits that must follow it.
ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
1408
def scan_flow_scalar_non_spaces(self, double, start_mark):
    # type: (Any, Any) -> Any
    """Scan the non-blank part of a quoted scalar and return its chunks.

    double is true for a double-quoted scalar (escape sequences are
    processed) and false for a single-quoted one ('' stands for a single
    quote).  Scanning stops, without consuming, at the first space, tab,
    line break, end of stream or closing quote.

    Raises ScannerError for an unknown escape, for an escape with too few
    hexadecimal digits, and for a numeric escape whose value is not a valid
    code point (previously the latter leaked a bare ValueError or
    OverflowError out of the scanner).
    """
    chunks = []  # type: List[Any]
    srp = self.reader.peek
    srf = self.reader.forward
    while True:
        # Copy the longest run of ordinary characters in one piece.
        length = 0
        while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
            length += 1
        if length != 0:
            chunks.append(self.reader.prefix(length))
            srf(length)
        ch = srp()
        if not double and ch == "'" and srp(1) == "'":
            # '' inside a single-quoted scalar is an escaped quote.
            chunks.append("'")
            srf(2)
        elif (double and ch == "'") or (not double and ch in '"\\'):
            # Characters that are special only in the other quoting style.
            chunks.append(ch)
            srf()
        elif double and ch == '\\':
            srf()
            ch = srp()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                srf()
            elif ch in self.ESCAPE_CODES:
                length = self.ESCAPE_CODES[ch]
                srf()
                for k in range(length):
                    if srp(k) not in '0123456789ABCDEFabcdef':
                        raise ScannerError(
                            'while scanning a double-quoted scalar',
                            start_mark,
                            'expected escape sequence of %d hexdecimal '
                            'numbers, but found %r' % (length, utf8(srp(k))),
                            self.reader.get_mark(),
                        )
                code = int(self.reader.prefix(length), 16)
                try:
                    char = unichr(code)
                except (ValueError, OverflowError):
                    # e.g. "\UFFFFFFFF": eight hex digits can encode values
                    # beyond the Unicode range; report it as a scan error.
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        'found escape sequence for invalid code point 0x%X' % code,
                        self.reader.get_mark(),
                    )
                chunks.append(char)
                srf(length)
            elif ch in '\n\r\x85\u2028\u2029':
                # Escaped line break: the break itself is dropped.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError(
                    'while scanning a double-quoted scalar',
                    start_mark,
                    'found unknown escape character %r' % utf8(ch),
                    self.reader.get_mark(),
                )
        else:
            return chunks
1462
def scan_flow_scalar_spaces(self, double, start_mark):
    # type: (Any, Any) -> Any
    """Scan blanks and line breaks inside a quoted scalar; return chunks.

    A run of spaces/tabs not followed by a line break is returned verbatim.
    When a line break follows, the blanks are dropped and the break is
    folded: a lone '\\n' becomes a single space, '\\n' followed by further
    breaks yields just those breaks, and any other break is kept.
    Raises ScannerError when the end of the stream is reached.
    """
    reader = self.reader
    width = 0
    while reader.peek(width) in ' \t':
        width += 1
    blanks = reader.prefix(width)
    reader.forward(width)
    ch = reader.peek()
    if ch == '\0':
        raise ScannerError(
            'while scanning a quoted scalar',
            start_mark,
            'found unexpected end of stream',
            reader.get_mark(),
        )
    if ch not in '\r\n\x85\u2028\u2029':
        return [blanks]
    line_break = self.scan_line_break()
    breaks = self.scan_flow_scalar_breaks(double, start_mark)
    folded = []
    if line_break != '\n':
        folded.append(line_break)
    elif not breaks:
        folded.append(' ')
    folded.extend(breaks)
    return folded
1492
def scan_flow_scalar_breaks(self, double, start_mark):
    # type: (Any, Any) -> Any
    """Consume blank lines inside a quoted scalar and return their breaks.

    Leading spaces and tabs on each line are skipped.  Instead of checking
    indentation, every line start is checked for a document separator
    ('---' or '...' followed by blank, break or end), which raises
    ScannerError.
    """
    reader = self.reader
    collected = []  # type: List[Any]
    while True:
        head = reader.prefix(3)
        if head in ('---', '...') and reader.peek(3) in _THE_END_SPACE_TAB:
            raise ScannerError(
                'while scanning a quoted scalar',
                start_mark,
                'found unexpected document separator',
                reader.get_mark(),
            )
        while reader.peek() in ' \t':
            reader.forward()
        if reader.peek() not in '\r\n\x85\u2028\u2029':
            return collected
        collected.append(self.scan_line_break())
1516
def scan_plain(self):
    # type: () -> Any
    """Scan a plain (unquoted) scalar and return it as a ScalarToken.

    The scalar is collected word by word: each outer-loop pass reads one run
    of non-blank characters and then the blanks/line breaks after it (via
    scan_plain_spaces), which are only added to the value if another word
    follows.  Sets self.allow_simple_key to False once text is consumed.
    If the whitespace after the scalar starts with a line break it is
    attached to the token as a post comment.
    """
    # See the specification for details.
    # We add an additional restriction for the flow context:
    # plain scalars in the flow context cannot contain ',', ': ' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    srp = self.reader.peek
    srf = self.reader.forward
    chunks = []  # type: List[Any]
    start_mark = self.reader.get_mark()
    end_mark = start_mark
    indent = self.indent + 1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    # if indent == 0:
    #     indent = 1
    spaces = []  # type: List[Any]
    while True:
        length = 0
        # A '#' at the start of a word begins a comment and ends the scalar.
        if srp() == '#':
            break
        # Find the end of the current word without consuming anything.
        while True:
            ch = srp(length)
            if ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                # ':' not followed by blank/break/end is ordinary text
                pass
            elif ch == '?' and self.scanner_processing_version != (1, 1):
                # '?' is ordinary text unless processing as YAML 1.1
                pass
            elif (
                ch in _THE_END_SPACE_TAB
                or (
                    not self.flow_level
                    and ch == ':'
                    and srp(length + 1) in _THE_END_SPACE_TAB
                )
                or (self.flow_level and ch in ',:?[]{}')
            ):
                break
            length += 1
        # It's not clear what we should do with ':' in the flow context.
        if (
            self.flow_level
            and ch == ':'
            and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
        ):
            # advance first so the error mark points at the ':'
            srf(length)
            raise ScannerError(
                'while scanning a plain scalar',
                start_mark,
                "found unexpected ':'",
                self.reader.get_mark(),
                'Please check '
                'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                'for details.',
            )
        if length == 0:
            break
        self.allow_simple_key = False
        # The blanks seen after the previous word belong to the value now
        # that another word follows them.
        chunks.extend(spaces)
        chunks.append(self.reader.prefix(length))
        srf(length)
        end_mark = self.reader.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        # scan_plain_spaces returns None at a document separator, which is
        # falsy and therefore also ends the scalar here.
        if (
            not spaces
            or srp() == '#'
            or (not self.flow_level and self.reader.column < indent)
        ):
            break

    token = ScalarToken("".join(chunks), True, start_mark, end_mark)
    if spaces and spaces[0] == '\n':
        # Create a comment token to preserve the trailing line breaks.
        comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark)
        token.add_post_comment(comment)
    return token
1593
def scan_plain_spaces(self, indent, start_mark):
    # type: (Any, Any) -> Any
    """Scan the blanks and line breaks that follow a word of a plain scalar.

    Returns a list of chunks: the run of spaces itself when no line break
    follows (empty list if there were no spaces), otherwise the folded line
    breaks (a lone '\\n' becomes ' ').  Returns None when a document
    separator is found at the start of a line.  Sets self.allow_simple_key
    to True once a line break has been consumed.

    The specification is confusing about tabs in plain scalars, so only
    spaces are treated as blanks here.  indent and start_mark are accepted
    but not used.
    """
    reader = self.reader

    def at_document_separator():
        head = reader.prefix(3)
        return (head == '---' or head == '...') and reader.peek(3) in _THE_END_SPACE_TAB

    width = 0
    while reader.peek(width) == ' ':
        width += 1
    blanks = reader.prefix(width)
    reader.forward(width)
    if reader.peek() not in '\r\n\x85\u2028\u2029':
        return [blanks] if blanks else []

    line_break = self.scan_line_break()
    self.allow_simple_key = True
    if at_document_separator():
        return None
    breaks = []
    while True:
        ch = reader.peek()
        if ch == ' ':
            reader.forward()
        elif ch in '\r\n\x85\u2028\u2029':
            breaks.append(self.scan_line_break())
            if at_document_separator():
                return None
        else:
            break
    folded = []
    if line_break != '\n':
        folded.append(line_break)
    elif not breaks:
        folded.append(' ')
    folded.extend(breaks)
    return folded
1631
def scan_tag_handle(self, name, start_mark):
    # type: (Any, Any) -> Any
    """Scan a tag handle ('!', '!!' or '!word!') and return it.

    name is only used in error messages.  Besides ASCII letters, digits and
    '-', the underscore is accepted inside the handle, although the
    specification does not allow it.  A '!' directly followed by a space is
    returned as the handle '!'.  Raises ScannerError when the handle does
    not start with '!' or is not closed by a second '!'.
    """
    reader = self.reader
    ch = reader.peek()
    if ch != '!':
        raise ScannerError(
            'while scanning a %s' % (name,),
            start_mark,
            "expected '!', but found %r" % utf8(ch),
            reader.get_mark(),
        )
    length = 1
    ch = reader.peek(length)
    if ch != ' ':
        while ch in '-_' or '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
            length += 1
            ch = reader.peek(length)
        if ch != '!':
            # Advance first so the error mark points at the bad character.
            reader.forward(length)
            raise ScannerError(
                'while scanning a %s' % (name,),
                start_mark,
                "expected '!', but found %r" % utf8(ch),
                reader.get_mark(),
            )
        length += 1
    handle = reader.prefix(length)
    reader.forward(length)
    return handle
1664
def scan_tag_uri(self, name, start_mark):
    # type: (Any, Any) -> Any
    """Scan the URI part of a tag or %TAG directive and return it as text.

    Consumes ASCII letters, digits and the characters in
    "-;/?:@&=+$,_.!~*'()[]%"; '%' escapes are decoded through
    scan_uri_escapes.  The URI is not checked for being well-formed.
    name is only used in error messages.  Raises ScannerError when nothing
    was consumed.
    """
    reader = self.reader
    pieces = []
    pending = 0  # characters looked at but not yet consumed
    ch = reader.peek(pending)
    while (
        ch in "-;/?:@&=+$,_.!~*'()[]%"
        or '0' <= ch <= '9'
        or 'A' <= ch <= 'Z'
        or 'a' <= ch <= 'z'
    ):
        if ch == '%':
            # Flush the literal text seen so far, then decode the escapes.
            pieces.append(reader.prefix(pending))
            reader.forward(pending)
            pending = 0
            pieces.append(self.scan_uri_escapes(name, start_mark))
        else:
            pending += 1
        ch = reader.peek(pending)
    if pending:
        pieces.append(reader.prefix(pending))
        reader.forward(pending)
    if not pieces:
        raise ScannerError(
            'while parsing a %s' % (name,),
            start_mark,
            'expected URI, but found %r' % utf8(ch),
            reader.get_mark(),
        )
    return "".join(pieces)
1699
def scan_uri_escapes(self, name, start_mark):
    # type: (Any, Any) -> Any
    """Decode a run of consecutive '%XX' URI escapes into text.

    The escaped octets are collected and decoded together as UTF-8, so a
    multi-byte character may be spread over several escapes.  name is only
    used in error messages.  Raises ScannerError when an escape does not
    consist of two hexadecimal digits or the octets are not valid UTF-8.
    """
    reader = self.reader
    code_bytes = []  # type: List[Any]
    mark = reader.get_mark()
    while reader.peek() == '%':
        reader.forward()
        for k in range(2):
            digit = reader.peek(k)
            if digit not in '0123456789ABCDEFabcdef':
                raise ScannerError(
                    'while scanning a %s' % (name,),
                    start_mark,
                    'expected URI escape sequence of 2 hexdecimal numbers,'
                    ' but found %r' % utf8(digit),
                    reader.get_mark(),
                )
        octet = int(reader.prefix(2), 16)
        # Python 3 builds a bytes object from ints, Python 2 joins
        # one-character byte strings.
        code_bytes.append(octet if PY3 else chr(octet))
        reader.forward(2)
    try:
        if PY3:
            value = bytes(code_bytes).decode('utf-8')
        else:
            value = unicode(b"".join(code_bytes), 'utf-8')
    except UnicodeDecodeError as exc:
        raise ScannerError('while scanning a %s' % (name,), start_mark, str(exc), mark)
    return value
1731
def scan_line_break(self):
    # type: () -> Any
    """Consume one line break at the current position and return it normalised.

    Transforms:
      '\\r\\n'   : '\\n'
      '\\r'     : '\\n'
      '\\n'     : '\\n'
      '\\x85'   : '\\n'
      '\\u2028' : '\\u2028'
      '\\u2029' : '\\u2029'
      default  : ''   (nothing is consumed)
    """
    reader = self.reader
    ch = reader.peek()
    if ch in '\u2028\u2029':
        reader.forward()
        return ch
    if ch not in '\r\n\x85':
        return ""
    # A CR LF pair counts as one break.
    if reader.prefix(2) == '\r\n':
        reader.forward(2)
    else:
        reader.forward()
    return '\n'
1753
1754
class RoundTripScanner(Scanner):
    """Scanner variant that keeps comments instead of discarding them.

    scan_to_next_token returns comment text instead of silently skipping it,
    fetch_comment queues it as a CommentToken, and the token accessors
    (check_token / peek_token / get_token) move those CommentTokens out of
    the queue and attach them to neighbouring tokens through
    add_pre_comments / add_post_comment.
    """

    def check_token(self, *choices):
        # type: (Any) -> bool
        # Check if the next token is one of the given types.
        # With no choices given, any available token counts as a match.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        # leading CommentTokens are folded into the token that follows them
        self._gather_comments()
        if bool(self.tokens):
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self):
        # type: () -> Any
        # Return the next token, but do not delete if from the queue.
        # Returns None when the queue is empty.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            return self.tokens[0]
        return None

    def _gather_comments(self):
        # type: () -> Any
        """combine multiple comment lines"""
        # Pops CommentTokens from the front of the queue and attaches them as
        # pre comments to the first non-comment token.  Returns the collected
        # list only on the early exits where the queue is empty; otherwise
        # falls through and returns None.
        comments = []  # type: List[Any]
        if not self.tokens:
            return comments
        if isinstance(self.tokens[0], CommentToken):
            comment = self.tokens.pop(0)
            self.tokens_taken += 1
            comments.append(comment)
        while self.need_more_tokens():
            self.fetch_more_tokens()
            if not self.tokens:
                return comments
            if isinstance(self.tokens[0], CommentToken):
                self.tokens_taken += 1
                comment = self.tokens.pop(0)
                # nprint('dropping2', comment)
                comments.append(comment)
        if len(comments) >= 1:
            self.tokens[0].add_pre_comments(comments)
        # pull in post comment on e.g. ':'
        if not self.done and len(self.tokens) < 2:
            self.fetch_more_tokens()

    def get_token(self):
        # type: () -> Any
        # Return the next token.
        # Before it is removed from the queue, CommentTokens that directly
        # follow it may be merged and attached to it as a post comment.
        # Returns None when the queue is empty.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if bool(self.tokens):
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (ScalarToken, ValueToken, FlowSequenceEndToken, FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # comment starts on the line where the token ends
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                # merge directly following comment tokens into the first one,
                # re-indenting each to its original column
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], ScalarToken)
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # comment starts on a different line than the scalar ends:
                # prefix it with the line breaks and indentation in between
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                # c was attached above; later merges still update it because
                # the same object is modified in place
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None

    def fetch_comment(self, comment):
        # type: (Any) -> None
        # comment is a (value, start_mark, end_mark) triple, the shape
        # returned by scan_to_next_token; it is queued as a CommentToken
        # with trailing spaces stripped from the value.
        value, start_mark, end_mark = comment
        while value and value[-1] == ' ':
            # empty line within indented key context
            # no need to update end-mark, that is not used
            value = value[:-1]
        self.tokens.append(CommentToken(value, start_mark, end_mark))

    # scanner

    def scan_to_next_token(self):
        # type: () -> Any
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        #
        # Returns a (comment, start_mark, end_mark) triple when a comment or
        # a run of empty lines was consumed, and None once the next token
        # has been reached.

        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        while not found:
            while srp() == ' ':
                srf()
            ch = srp()
            if ch == '#':
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                # copies the comment text; the character that ends the line is
                # appended and consumed as well before the loop condition
                # stops on it
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment too
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if bool(self.scan_line_break()):
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    # with empty_line=True single spaces/tabs are consumed
                    # too, so this collects whitespace-only lines until
                    # something else is found
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None

    def scan_line_break(self, empty_line=False):
        # type: (bool) -> Text
        # Transforms:
        #   '\r\n'    : '\n'
        #   '\r'      : '\n'
        #   '\n'      : '\n'
        #   '\x85'    : '\n'
        #   '\u2028'  : '\u2028'
        #   '\u2029'  : '\u2029'
        #   default   : ''
        # When empty_line is true, a single tab or space is consumed and
        # returned unchanged as well.
        ch = self.reader.peek()  # type: Text
        if ch in '\r\n\x85':
            if self.reader.prefix(2) == '\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return '\n'
        elif ch in '\u2028\u2029':
            self.reader.forward()
            return ch
        elif empty_line and ch in '\t ':
            self.reader.forward()
            return ch
        return ""

    def scan_block_scalar(self, style, rt=True):
        # type: (Any, Optional[bool]) -> Any
        # Same as Scanner.scan_block_scalar, but rt defaults to True here.
        return Scanner.scan_block_scalar(self, style, rt=rt)
1971
1972
1973 # try:
1974 # import psyco
1975 # psyco.bind(Scanner)
1976 # except ImportError:
1977 # pass