comparison env/lib/python3.7/site-packages/yaml/scanner.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
comparison
equal deleted inserted replaced
4:79f47841a781 5:9b1c78e6ba9c
1
2 # Scanner produces tokens of the following types:
3 # STREAM-START
4 # STREAM-END
5 # DIRECTIVE(name, value)
6 # DOCUMENT-START
7 # DOCUMENT-END
8 # BLOCK-SEQUENCE-START
9 # BLOCK-MAPPING-START
10 # BLOCK-END
11 # FLOW-SEQUENCE-START
12 # FLOW-MAPPING-START
13 # FLOW-SEQUENCE-END
14 # FLOW-MAPPING-END
15 # BLOCK-ENTRY
16 # FLOW-ENTRY
17 # KEY
18 # VALUE
19 # ALIAS(value)
20 # ANCHOR(value)
21 # TAG(value)
22 # SCALAR(value, plain, style)
23 #
24 # Read comments in the Scanner code for more details.
25 #
26
27 __all__ = ['Scanner', 'ScannerError']
28
29 from .error import MarkedYAMLError
30 from .tokens import *
31
class ScannerError(MarkedYAMLError):
    """Error raised when the input stream cannot be split into tokens."""
    pass
34
class SimpleKey:
    """Bookkeeping record for a potential simple key.

    See the simple-key treatment in Scanner for how these records are
    created, invalidated, and resolved into KEY tokens.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # Store all positional bookkeeping data in one unpacking assignment.
        (self.token_number, self.required, self.index,
         self.line, self.column, self.mark) = (
            token_number, required, index, line, column, mark)
45
46 class Scanner:
47
    def __init__(self):
        """Initialize the scanner state and emit the STREAM-START token."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer.

        # Had we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
110
111 # Public methods.
112
113 def check_token(self, *choices):
114 # Check if the next token is one of the given types.
115 while self.need_more_tokens():
116 self.fetch_more_tokens()
117 if self.tokens:
118 if not choices:
119 return True
120 for choice in choices:
121 if isinstance(self.tokens[0], choice):
122 return True
123 return False
124
125 def peek_token(self):
126 # Return the next token, but do not delete if from the queue.
127 # Return None if no more tokens.
128 while self.need_more_tokens():
129 self.fetch_more_tokens()
130 if self.tokens:
131 return self.tokens[0]
132 else:
133 return None
134
135 def get_token(self):
136 # Return the next token.
137 while self.need_more_tokens():
138 self.fetch_more_tokens()
139 if self.tokens:
140 self.tokens_taken += 1
141 return self.tokens.pop(0)
142
143 # Private methods.
144
145 def need_more_tokens(self):
146 if self.done:
147 return False
148 if not self.tokens:
149 return True
150 # The current token may be a potential simple key, so we
151 # need to look further.
152 self.stale_possible_simple_keys()
153 if self.next_possible_simple_key() == self.tokens_taken:
154 return True
155
    def fetch_more_tokens(self):
        """Scan the next token from the stream and append it to the queue.

        Dispatches on the first character of the upcoming token.
        Raises ScannerError if no token can start at the current position.
        """

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == '\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar? (Block scalars exist only in block context.)
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == '\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token" % ch,
                self.get_mark())
261
262 # Simple keys treatment.
263
264 def next_possible_simple_key(self):
265 # Return the number of the nearest possible simple key. Actually we
266 # don't need to loop through the whole dictionary. We may replace it
267 # with the following code:
268 # if not self.possible_simple_keys:
269 # return None
270 # return self.possible_simple_keys[
271 # min(self.possible_simple_keys.keys())].token_number
272 min_token_number = None
273 for level in self.possible_simple_keys:
274 key = self.possible_simple_keys[level]
275 if min_token_number is None or key.token_number < min_token_number:
276 min_token_number = key.token_number
277 return min_token_number
278
    def stale_possible_simple_keys(self):
        """Discard saved simple keys that can no longer be valid.

        According to the YAML specification, simple keys
        - should be limited to a single line,
        - should be no longer than 1024 characters.
        Raises ScannerError if a *required* key is invalidated.
        """
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.line  \
                    or self.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.mark,
                            "could not find expected ':'", self.get_mark())
                del self.possible_simple_keys[level]
294
    def save_possible_simple_key(self):
        """Record the current position as a potential simple key start."""
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
311
312 def remove_possible_simple_key(self):
313 # Remove the saved possible key position at the current flow level.
314 if self.flow_level in self.possible_simple_keys:
315 key = self.possible_simple_keys[self.flow_level]
316
317 if key.required:
318 raise ScannerError("while scanning a simple key", key.mark,
319 "could not find expected ':'", self.get_mark())
320
321 del self.possible_simple_keys[self.flow_level]
322
323 # Indentation functions.
324
    def unwind_indent(self, column):
        """Pop indentation levels deeper than `column`, emitting BLOCK-END."""

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according to
        ## the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ## key : {
        ## }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
348
349 def add_indent(self, column):
350 # Check if we need to increase indentation.
351 if self.indent < column:
352 self.indents.append(self.indent)
353 self.indent = column
354 return True
355 return False
356
357 # Fetchers.
358
    def fetch_stream_start(self):
        """Emit the STREAM-START token; called exactly once from __init__."""
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START.
        # NOTE: `self.encoding` is provided by the Reader mixin.
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))
369
370
    def fetch_stream_end(self):
        """Emit the STREAM-END token and mark the scanner as finished."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True
389
    def fetch_directive(self):
        """Scan a '%' directive line and emit a DIRECTIVE token."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
401
402 def fetch_document_start(self):
403 self.fetch_document_indicator(DocumentStartToken)
404
405 def fetch_document_end(self):
406 self.fetch_document_indicator(DocumentEndToken)
407
    def fetch_document_indicator(self, TokenClass):
        """Emit a DOCUMENT-START/END token for a '---'/'...' marker."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
423
424 def fetch_flow_sequence_start(self):
425 self.fetch_flow_collection_start(FlowSequenceStartToken)
426
427 def fetch_flow_mapping_start(self):
428 self.fetch_flow_collection_start(FlowMappingStartToken)
429
    def fetch_flow_collection_start(self, TokenClass):
        """Emit FLOW-SEQUENCE-START or FLOW-MAPPING-START for '[' / '{'."""

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
446
447 def fetch_flow_sequence_end(self):
448 self.fetch_flow_collection_end(FlowSequenceEndToken)
449
450 def fetch_flow_mapping_end(self):
451 self.fetch_flow_collection_end(FlowMappingEndToken)
452
    def fetch_flow_collection_end(self, TokenClass):
        """Emit FLOW-SEQUENCE-END or FLOW-MAPPING-END for ']' / '}'."""

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
469
    def fetch_flow_entry(self):
        """Emit a FLOW-ENTRY token for ','."""

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
483
484 def fetch_block_entry(self):
485
486 # Block context needs additional checks.
487 if not self.flow_level:
488
489 # Are we allowed to start a new entry?
490 if not self.allow_simple_key:
491 raise ScannerError(None, None,
492 "sequence entries are not allowed here",
493 self.get_mark())
494
495 # We may need to add BLOCK-SEQUENCE-START.
496 if self.add_indent(self.column):
497 mark = self.get_mark()
498 self.tokens.append(BlockSequenceStartToken(mark, mark))
499
500 # It's an error for the block entry to occur in the flow context,
501 # but we let the parser detect this.
502 else:
503 pass
504
505 # Simple keys are allowed after '-'.
506 self.allow_simple_key = True
507
508 # Reset possible simple key on the current level.
509 self.remove_possible_simple_key()
510
511 # Add BLOCK-ENTRY.
512 start_mark = self.get_mark()
513 self.forward()
514 end_mark = self.get_mark()
515 self.tokens.append(BlockEntryToken(start_mark, end_mark))
516
    def fetch_key(self):
        """Emit a KEY token for the explicit '?' key indicator."""

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessary a simple)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
544
    def fetch_value(self):
        """Emit a VALUE token for ':', resolving any pending simple key.

        If a simple key was saved for the current flow level, a KEY token
        (and possibly BLOCK-MAPPING-START) is inserted retroactively at
        the saved token position before the VALUE token is appended.
        """

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
600
    def fetch_alias(self):
        """Scan a '*' alias and emit an ALIAS token."""

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
611
    def fetch_anchor(self):
        """Scan a '&' anchor and emit an ANCHOR token."""

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
622
    def fetch_tag(self):
        """Scan a '!' tag and emit a TAG token."""

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
633
634 def fetch_literal(self):
635 self.fetch_block_scalar(style='|')
636
637 def fetch_folded(self):
638 self.fetch_block_scalar(style='>')
639
    def fetch_block_scalar(self, style):
        """Scan a literal ('|') or folded ('>') block scalar token."""

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
650
651 def fetch_single(self):
652 self.fetch_flow_scalar(style='\'')
653
654 def fetch_double(self):
655 self.fetch_flow_scalar(style='"')
656
    def fetch_flow_scalar(self, style):
        """Scan a quoted flow scalar (single or double quoted)."""

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
667
    def fetch_plain(self):
        """Scan a plain (unquoted) scalar token."""

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
680
681 # Checkers.
682
683 def check_directive(self):
684
685 # DIRECTIVE: ^ '%' ...
686 # The '%' indicator is already checked.
687 if self.column == 0:
688 return True
689
690 def check_document_start(self):
691
692 # DOCUMENT-START: ^ '---' (' '|'\n')
693 if self.column == 0:
694 if self.prefix(3) == '---' \
695 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
696 return True
697
698 def check_document_end(self):
699
700 # DOCUMENT-END: ^ '...' (' '|'\n')
701 if self.column == 0:
702 if self.prefix(3) == '...' \
703 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
704 return True
705
    def check_block_entry(self):
        """Return True if '-' is followed by whitespace or a line break."""
        # BLOCK-ENTRY: '-' (' '|'\n')
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
710
711 def check_key(self):
712
713 # KEY(flow context): '?'
714 if self.flow_level:
715 return True
716
717 # KEY(block context): '?' (' '|'\n')
718 else:
719 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
720
721 def check_value(self):
722
723 # VALUE(flow context): ':'
724 if self.flow_level:
725 return True
726
727 # VALUE(block context): ':' (' '|'\n')
728 else:
729 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
730
    def check_plain(self):
        """Return True if a plain scalar may start at the current position."""
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in '?:')))
749
750 # Scanners.
751
    def scan_to_next_token(self):
        """Skip spaces, comments, and line breaks up to the next token.

        In block context, a line break re-enables `allow_simple_key`.
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.index == 0 and self.peek() == '\uFEFF':
            self.forward()
        found = False
        while not found:
            while self.peek() == ' ':
                self.forward()
            if self.peek() == '#':
                while self.peek() not in '\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
786
    def scan_directive(self):
        """Scan a '%' directive line and return a DirectiveToken.

        Known directives (YAML, TAG) get a parsed value; unknown
        directives are skipped to the end of the line with value None.
        """
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            end_mark = self.get_mark()
            # Unknown directive: skip the rest of the line.
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
805
    def scan_directive_name(self, start_mark):
        """Scan and return the directive name (alphanumeric, '-', '_')."""
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        # The name must be terminated by whitespace or a line break.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        return value
826
    def scan_yaml_directive_value(self, start_mark):
        """Scan the '%YAML' version value and return a (major, minor) tuple."""
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.peek() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r" % self.peek(),
                    self.get_mark())
        return (major, minor)
843
    def scan_yaml_directive_number(self, start_mark):
        """Scan and return one integer component of a YAML version number."""
        # See the specification for details.
        ch = self.peek()
        if not ('0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch, self.get_mark())
        length = 0
        while '0' <= self.peek(length) <= '9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value
856
    def scan_tag_directive_value(self, start_mark):
        """Scan the '%TAG' value and return a (handle, prefix) tuple."""
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == ' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)
866
    def scan_tag_directive_handle(self, start_mark):
        """Scan the tag handle of a '%TAG' directive; must end with a space."""
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.peek()
        if ch != ' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value
875
    def scan_tag_directive_prefix(self, start_mark):
        """Scan the tag prefix of a '%TAG' directive; must end the line."""
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_mark)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value
884
    def scan_directive_ignored_line(self, start_mark):
        """Consume trailing spaces, an optional comment, and the line break."""
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in '\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                        % ch, self.get_mark())
        self.scan_line_break()
898
    def scan_anchor(self, TokenClass):
        """Scan an anchor ('&') or alias ('*') name and return a token.

        TokenClass is AnchorToken or AliasToken; the indicator character
        determines the name used in error messages.
        """
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_mark = self.get_mark()
        indicator = self.peek()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.forward()
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'  \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        # The name must be followed by whitespace or a flow indicator.
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        end_mark = self.get_mark()
        return TokenClass(value, start_mark, end_mark)
934
935 def scan_tag(self):
936 # See the specification for details.
937 start_mark = self.get_mark()
938 ch = self.peek(1)
939 if ch == '<':
940 handle = None
941 self.forward(2)
942 suffix = self.scan_tag_uri('tag', start_mark)
943 if self.peek() != '>':
944 raise ScannerError("while parsing a tag", start_mark,
945 "expected '>', but found %r" % self.peek(),
946 self.get_mark())
947 self.forward()
948 elif ch in '\0 \t\r\n\x85\u2028\u2029':
949 handle = None
950 suffix = '!'
951 self.forward()
952 else:
953 length = 1
954 use_handle = False
955 while ch not in '\0 \r\n\x85\u2028\u2029':
956 if ch == '!':
957 use_handle = True
958 break
959 length += 1
960 ch = self.peek(length)
961 handle = '!'
962 if use_handle:
963 handle = self.scan_tag_handle('tag', start_mark)
964 else:
965 handle = '!'
966 self.forward()
967 suffix = self.scan_tag_uri('tag', start_mark)
968 ch = self.peek()
969 if ch not in '\0 \r\n\x85\u2028\u2029':
970 raise ScannerError("while scanning a tag", start_mark,
971 "expected ' ', but found %r" % ch, self.get_mark())
972 value = (handle, suffix)
973 end_mark = self.get_mark()
974 return TagToken(value, start_mark, end_mark)
975
    def scan_block_scalar(self, style):
        """Scan a literal ('|') or folded ('>') block scalar.

        Returns a ScalarToken whose value has chomping and (for '>')
        folding applied.
        """
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: auto-detect from content.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != '\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in ' \t'
            length = 0
            while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != '\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == '\n'    \
                        and leading_non_space and self.peek() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == '\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        # chomping is None (clip): keep the final break, drop trailing blanks;
        # True ('+', keep): keep everything; False ('-', strip): drop all.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)
1051
1052 def scan_block_scalar_indicators(self, start_mark):
1053 # See the specification for details.
1054 chomping = None
1055 increment = None
1056 ch = self.peek()
1057 if ch in '+-':
1058 if ch == '+':
1059 chomping = True
1060 else:
1061 chomping = False
1062 self.forward()
1063 ch = self.peek()
1064 if ch in '0123456789':
1065 increment = int(ch)
1066 if increment == 0:
1067 raise ScannerError("while scanning a block scalar", start_mark,
1068 "expected indentation indicator in the range 1-9, but found 0",
1069 self.get_mark())
1070 self.forward()
1071 elif ch in '0123456789':
1072 increment = int(ch)
1073 if increment == 0:
1074 raise ScannerError("while scanning a block scalar", start_mark,
1075 "expected indentation indicator in the range 1-9, but found 0",
1076 self.get_mark())
1077 self.forward()
1078 ch = self.peek()
1079 if ch in '+-':
1080 if ch == '+':
1081 chomping = True
1082 else:
1083 chomping = False
1084 self.forward()
1085 ch = self.peek()
1086 if ch not in '\0 \r\n\x85\u2028\u2029':
1087 raise ScannerError("while scanning a block scalar", start_mark,
1088 "expected chomping or indentation indicators, but found %r"
1089 % ch, self.get_mark())
1090 return chomping, increment
1091
1092 def scan_block_scalar_ignored_line(self, start_mark):
1093 # See the specification for details.
1094 while self.peek() == ' ':
1095 self.forward()
1096 if self.peek() == '#':
1097 while self.peek() not in '\0\r\n\x85\u2028\u2029':
1098 self.forward()
1099 ch = self.peek()
1100 if ch not in '\0\r\n\x85\u2028\u2029':
1101 raise ScannerError("while scanning a block scalar", start_mark,
1102 "expected a comment or a line break, but found %r" % ch,
1103 self.get_mark())
1104 self.scan_line_break()
1105
1106 def scan_block_scalar_indentation(self):
1107 # See the specification for details.
1108 chunks = []
1109 max_indent = 0
1110 end_mark = self.get_mark()
1111 while self.peek() in ' \r\n\x85\u2028\u2029':
1112 if self.peek() != ' ':
1113 chunks.append(self.scan_line_break())
1114 end_mark = self.get_mark()
1115 else:
1116 self.forward()
1117 if self.column > max_indent:
1118 max_indent = self.column
1119 return chunks, max_indent, end_mark
1120
1121 def scan_block_scalar_breaks(self, indent):
1122 # See the specification for details.
1123 chunks = []
1124 end_mark = self.get_mark()
1125 while self.column < indent and self.peek() == ' ':
1126 self.forward()
1127 while self.peek() in '\r\n\x85\u2028\u2029':
1128 chunks.append(self.scan_line_break())
1129 end_mark = self.get_mark()
1130 while self.column < indent and self.peek() == ' ':
1131 self.forward()
1132 return chunks, end_mark
1133
1134 def scan_flow_scalar(self, style):
1135 # See the specification for details.
1136 # Note that we loose indentation rules for quoted scalars. Quoted
1137 # scalars don't need to adhere indentation because " and ' clearly
1138 # mark the beginning and the end of them. Therefore we are less
1139 # restrictive then the specification requires. We only need to check
1140 # that document separators are not included in scalars.
1141 if style == '"':
1142 double = True
1143 else:
1144 double = False
1145 chunks = []
1146 start_mark = self.get_mark()
1147 quote = self.peek()
1148 self.forward()
1149 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1150 while self.peek() != quote:
1151 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1152 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1153 self.forward()
1154 end_mark = self.get_mark()
1155 return ScalarToken(''.join(chunks), False, start_mark, end_mark,
1156 style)
1157
    # Single-character escape sequences for double-quoted scalars:
    # maps the character following '\' to the character it denotes.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',        # null
        'a': '\x07',      # bell
        'b': '\x08',      # backspace
        't': '\x09',      # horizontal tab
        '\t': '\x09',     # a literal tab after '\' also means tab
        'n': '\x0A',      # line feed
        'v': '\x0B',      # vertical tab
        'f': '\x0C',      # form feed
        'r': '\x0D',      # carriage return
        'e': '\x1B',      # escape
        ' ': '\x20',      # space
        '\"': '\"',       # double quote
        '\\': '\\',       # backslash
        '/': '/',         # slash
        'N': '\x85',      # Unicode next line (NEL)
        '_': '\xA0',      # Unicode non-breaking space
        'L': '\u2028',    # Unicode line separator
        'P': '\u2029',    # Unicode paragraph separator
    }

    # Numeric escape introducers: maps the escape letter to the number
    # of hexadecimal digits that must follow ('\xXX', '\uXXXX', '\UXXXXXXXX').
    ESCAPE_CODES = {
        'x': 2,
        'u': 4,
        'U': 8,
    }
1184
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        """Scan the non-whitespace portion of a quoted scalar.

        Consumes runs of plain characters plus any quote/escape
        constructs, and returns the decoded chunks.  Stops (and returns)
        as soon as the next character is whitespace, a line break, NUL,
        or the closing quote.  `double` selects double-quoted semantics
        (backslash escapes) versus single-quoted ('' -> ').
        """
        chunks = []
        while True:
            # Take the longest run of ordinary characters in one slice.
            length = 0
            while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == '\'' and self.peek(1) == '\'':
                # In single-quoted scalars, '' is an escaped quote.
                chunks.append('\'')
                self.forward(2)
            elif (double and ch == '\'') or (not double and ch in '\"\\'):
                # The other style's quote char (or backslash when
                # single-quoted) is an ordinary character here.
                chunks.append(ch)
                self.forward()
            elif double and ch == '\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Simple one-character escape, e.g. \n, \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k)), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(chr(code))
                    self.forward(length)
                elif ch in '\r\n\x85\u2028\u2029':
                    # An escaped line break is removed along with any
                    # following indentation/breaks.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch, self.get_mark())
            else:
                # Whitespace, line break, NUL or the closing quote:
                # the caller handles what comes next.
                return chunks
1227
1228 def scan_flow_scalar_spaces(self, double, start_mark):
1229 # See the specification for details.
1230 chunks = []
1231 length = 0
1232 while self.peek(length) in ' \t':
1233 length += 1
1234 whitespaces = self.prefix(length)
1235 self.forward(length)
1236 ch = self.peek()
1237 if ch == '\0':
1238 raise ScannerError("while scanning a quoted scalar", start_mark,
1239 "found unexpected end of stream", self.get_mark())
1240 elif ch in '\r\n\x85\u2028\u2029':
1241 line_break = self.scan_line_break()
1242 breaks = self.scan_flow_scalar_breaks(double, start_mark)
1243 if line_break != '\n':
1244 chunks.append(line_break)
1245 elif not breaks:
1246 chunks.append(' ')
1247 chunks.extend(breaks)
1248 else:
1249 chunks.append(whitespaces)
1250 return chunks
1251
1252 def scan_flow_scalar_breaks(self, double, start_mark):
1253 # See the specification for details.
1254 chunks = []
1255 while True:
1256 # Instead of checking indentation, we check for document
1257 # separators.
1258 prefix = self.prefix(3)
1259 if (prefix == '---' or prefix == '...') \
1260 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
1261 raise ScannerError("while scanning a quoted scalar", start_mark,
1262 "found unexpected document separator", self.get_mark())
1263 while self.peek() in ' \t':
1264 self.forward()
1265 if self.peek() in '\r\n\x85\u2028\u2029':
1266 chunks.append(self.scan_line_break())
1267 else:
1268 return chunks
1269
    def scan_plain(self):
        """Scan a plain (unquoted) scalar and return a ScalarToken.

        In the flow context plain scalars additionally must not contain
        ',' or '?'.  The `allow_simple_key` flag is cleared as soon as
        any content is consumed.  Indentation rules are loosened for the
        flow context.
        """
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            # A '#' at a chunk boundary starts a comment, ending the scalar.
            if self.peek() == '#':
                break
            while True:
                ch = self.peek(length)
                # A plain scalar chunk ends at whitespace/break/EOF, at a
                # ':' followed by a separator (or a flow indicator when in
                # flow context), or at any flow indicator in flow context.
                if ch in '\0 \t\r\n\x85\u2028\u2029' \
                        or (ch == ':' and
                            self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
                            + (u',[]{}' if self.flow_level else u''))\
                        or (self.flow_level and ch in ',?[]{}'):
                    break
                length += 1
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # scan_plain_spaces returns a falsy value (None or []) when
            # the scalar cannot continue, e.g. at a document separator.
            if not spaces or self.peek() == '#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(''.join(chunks), True, start_mark, end_mark)
1310
1311 def scan_plain_spaces(self, indent, start_mark):
1312 # See the specification for details.
1313 # The specification is really confusing about tabs in plain scalars.
1314 # We just forbid them completely. Do not use tabs in YAML!
1315 chunks = []
1316 length = 0
1317 while self.peek(length) in ' ':
1318 length += 1
1319 whitespaces = self.prefix(length)
1320 self.forward(length)
1321 ch = self.peek()
1322 if ch in '\r\n\x85\u2028\u2029':
1323 line_break = self.scan_line_break()
1324 self.allow_simple_key = True
1325 prefix = self.prefix(3)
1326 if (prefix == '---' or prefix == '...') \
1327 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
1328 return
1329 breaks = []
1330 while self.peek() in ' \r\n\x85\u2028\u2029':
1331 if self.peek() == ' ':
1332 self.forward()
1333 else:
1334 breaks.append(self.scan_line_break())
1335 prefix = self.prefix(3)
1336 if (prefix == '---' or prefix == '...') \
1337 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
1338 return
1339 if line_break != '\n':
1340 chunks.append(line_break)
1341 elif not breaks:
1342 chunks.append(' ')
1343 chunks.extend(breaks)
1344 elif whitespaces:
1345 chunks.append(whitespaces)
1346 return chunks
1347
1348 def scan_tag_handle(self, name, start_mark):
1349 # See the specification for details.
1350 # For some strange reasons, the specification does not allow '_' in
1351 # tag handles. I have allowed it anyway.
1352 ch = self.peek()
1353 if ch != '!':
1354 raise ScannerError("while scanning a %s" % name, start_mark,
1355 "expected '!', but found %r" % ch, self.get_mark())
1356 length = 1
1357 ch = self.peek(length)
1358 if ch != ' ':
1359 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
1360 or ch in '-_':
1361 length += 1
1362 ch = self.peek(length)
1363 if ch != '!':
1364 self.forward(length)
1365 raise ScannerError("while scanning a %s" % name, start_mark,
1366 "expected '!', but found %r" % ch, self.get_mark())
1367 length += 1
1368 value = self.prefix(length)
1369 self.forward(length)
1370 return value
1371
1372 def scan_tag_uri(self, name, start_mark):
1373 # See the specification for details.
1374 # Note: we do not check if URI is well-formed.
1375 chunks = []
1376 length = 0
1377 ch = self.peek(length)
1378 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
1379 or ch in '-;/?:@&=+$,_.!~*\'()[]%':
1380 if ch == '%':
1381 chunks.append(self.prefix(length))
1382 self.forward(length)
1383 length = 0
1384 chunks.append(self.scan_uri_escapes(name, start_mark))
1385 else:
1386 length += 1
1387 ch = self.peek(length)
1388 if length:
1389 chunks.append(self.prefix(length))
1390 self.forward(length)
1391 length = 0
1392 if not chunks:
1393 raise ScannerError("while parsing a %s" % name, start_mark,
1394 "expected URI, but found %r" % ch, self.get_mark())
1395 return ''.join(chunks)
1396
1397 def scan_uri_escapes(self, name, start_mark):
1398 # See the specification for details.
1399 codes = []
1400 mark = self.get_mark()
1401 while self.peek() == '%':
1402 self.forward()
1403 for k in range(2):
1404 if self.peek(k) not in '0123456789ABCDEFabcdef':
1405 raise ScannerError("while scanning a %s" % name, start_mark,
1406 "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
1407 % self.peek(k), self.get_mark())
1408 codes.append(int(self.prefix(2), 16))
1409 self.forward(2)
1410 try:
1411 value = bytes(codes).decode('utf-8')
1412 except UnicodeDecodeError as exc:
1413 raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
1414 return value
1415
1416 def scan_line_break(self):
1417 # Transforms:
1418 # '\r\n' : '\n'
1419 # '\r' : '\n'
1420 # '\n' : '\n'
1421 # '\x85' : '\n'
1422 # '\u2028' : '\u2028'
1423 # '\u2029 : '\u2029'
1424 # default : ''
1425 ch = self.peek()
1426 if ch in '\r\n\x85':
1427 if self.prefix(2) == '\r\n':
1428 self.forward(2)
1429 else:
1430 self.forward()
1431 return '\n'
1432 elif ch in '\u2028\u2029':
1433 self.forward()
1434 return ch
1435 return ''