Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/yaml/scanner.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
comparison
equal
deleted
inserted
replaced
4:79f47841a781 | 5:9b1c78e6ba9c |
---|---|
1 | |
2 # Scanner produces tokens of the following types: | |
3 # STREAM-START | |
4 # STREAM-END | |
5 # DIRECTIVE(name, value) | |
6 # DOCUMENT-START | |
7 # DOCUMENT-END | |
8 # BLOCK-SEQUENCE-START | |
9 # BLOCK-MAPPING-START | |
10 # BLOCK-END | |
11 # FLOW-SEQUENCE-START | |
12 # FLOW-MAPPING-START | |
13 # FLOW-SEQUENCE-END | |
14 # FLOW-MAPPING-END | |
15 # BLOCK-ENTRY | |
16 # FLOW-ENTRY | |
17 # KEY | |
18 # VALUE | |
19 # ALIAS(value) | |
20 # ANCHOR(value) | |
21 # TAG(value) | |
22 # SCALAR(value, plain, style) | |
23 # | |
24 # Read comments in the Scanner code for more details. | |
25 # | |
26 | |
27 __all__ = ['Scanner', 'ScannerError'] | |
28 | |
29 from .error import MarkedYAMLError | |
30 from .tokens import * | |
31 | |
32 class ScannerError(MarkedYAMLError): | |
33 pass | |
34 | |
35 class SimpleKey: | |
36 # See below simple keys treatment. | |
37 | |
38 def __init__(self, token_number, required, index, line, column, mark): | |
39 self.token_number = token_number | |
40 self.required = required | |
41 self.index = index | |
42 self.line = line | |
43 self.column = column | |
44 self.mark = mark | |
45 | |
46 class Scanner: | |
47 | |
48 def __init__(self): | |
49 """Initialize the scanner.""" | |
50 # It is assumed that Scanner and Reader will have a common descendant. | |
51 # Reader does the dirty work of checking for BOM and converting the | |
52 # input data to Unicode. It also adds NUL to the end. | |
53 # | |
54 # Reader supports the following methods | |
55 # self.peek(i=0) # peek the next i-th character | |
56 # self.prefix(l=1) # peek the next l characters | |
57 # self.forward(l=1) # read the next l characters and move the pointer. | |
58 | |
59 # Had we reached the end of the stream? | |
60 self.done = False | |
61 | |
62 # The number of unclosed '{' and '['. `flow_level == 0` means block | |
63 # context. | |
64 self.flow_level = 0 | |
65 | |
66 # List of processed tokens that are not yet emitted. | |
67 self.tokens = [] | |
68 | |
69 # Add the STREAM-START token. | |
70 self.fetch_stream_start() | |
71 | |
72 # Number of tokens that were emitted through the `get_token` method. | |
73 self.tokens_taken = 0 | |
74 | |
75 # The current indentation level. | |
76 self.indent = -1 | |
77 | |
78 # Past indentation levels. | |
79 self.indents = [] | |
80 | |
81 # Variables related to simple keys treatment. | |
82 | |
83 # A simple key is a key that is not denoted by the '?' indicator. | |
84 # Example of simple keys: | |
85 # --- | |
86 # block simple key: value | |
87 # ? not a simple key: | |
88 # : { flow simple key: value } | |
89 # We emit the KEY token before all keys, so when we find a potential | |
90 # simple key, we try to locate the corresponding ':' indicator. | |
91 # Simple keys should be limited to a single line and 1024 characters. | |
92 | |
93 # Can a simple key start at the current position? A simple key may | |
94 # start: | |
95 # - at the beginning of the line, not counting indentation spaces | |
96 # (in block context), | |
97 # - after '{', '[', ',' (in the flow context), | |
98 # - after '?', ':', '-' (in the block context). | |
99 # In the block context, this flag also signifies if a block collection | |
100 # may start at the current position. | |
101 self.allow_simple_key = True | |
102 | |
103 # Keep track of possible simple keys. This is a dictionary. The key | |
104 # is `flow_level`; there can be no more than one possible simple key | |
105 # for each level. The value is a SimpleKey record: | |
106 # (token_number, required, index, line, column, mark) | |
107 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow), | |
108 # '[', or '{' tokens. | |
109 self.possible_simple_keys = {} | |
110 | |
111 # Public methods. | |
112 | |
113 def check_token(self, *choices): | |
114 # Check if the next token is one of the given types. | |
115 while self.need_more_tokens(): | |
116 self.fetch_more_tokens() | |
117 if self.tokens: | |
118 if not choices: | |
119 return True | |
120 for choice in choices: | |
121 if isinstance(self.tokens[0], choice): | |
122 return True | |
123 return False | |
124 | |
125 def peek_token(self): | |
126 # Return the next token, but do not delete it from the queue. | |
127 # Return None if no more tokens. | |
128 while self.need_more_tokens(): | |
129 self.fetch_more_tokens() | |
130 if self.tokens: | |
131 return self.tokens[0] | |
132 else: | |
133 return None | |
134 | |
135 def get_token(self): | |
136 # Return the next token. | |
137 while self.need_more_tokens(): | |
138 self.fetch_more_tokens() | |
139 if self.tokens: | |
140 self.tokens_taken += 1 | |
141 return self.tokens.pop(0) | |
142 | |
143 # Private methods. | |
144 | |
145 def need_more_tokens(self): | |
146 if self.done: | |
147 return False | |
148 if not self.tokens: | |
149 return True | |
150 # The current token may be a potential simple key, so we | |
151 # need to look further. | |
152 self.stale_possible_simple_keys() | |
153 if self.next_possible_simple_key() == self.tokens_taken: | |
154 return True | |
155 | |
156 def fetch_more_tokens(self): | |
157 | |
158 # Eat whitespaces and comments until we reach the next token. | |
159 self.scan_to_next_token() | |
160 | |
161 # Remove obsolete possible simple keys. | |
162 self.stale_possible_simple_keys() | |
163 | |
164 # Compare the current indentation and column. It may add some tokens | |
165 # and decrease the current indentation level. | |
166 self.unwind_indent(self.column) | |
167 | |
168 # Peek the next character. | |
169 ch = self.peek() | |
170 | |
171 # Is it the end of stream? | |
172 if ch == '\0': | |
173 return self.fetch_stream_end() | |
174 | |
175 # Is it a directive? | |
176 if ch == '%' and self.check_directive(): | |
177 return self.fetch_directive() | |
178 | |
179 # Is it the document start? | |
180 if ch == '-' and self.check_document_start(): | |
181 return self.fetch_document_start() | |
182 | |
183 # Is it the document end? | |
184 if ch == '.' and self.check_document_end(): | |
185 return self.fetch_document_end() | |
186 | |
187 # TODO: support for BOM within a stream. | |
188 #if ch == '\uFEFF': | |
189 # return self.fetch_bom() <-- issue BOMToken | |
190 | |
191 # Note: the order of the following checks is NOT significant. | |
192 | |
193 # Is it the flow sequence start indicator? | |
194 if ch == '[': | |
195 return self.fetch_flow_sequence_start() | |
196 | |
197 # Is it the flow mapping start indicator? | |
198 if ch == '{': | |
199 return self.fetch_flow_mapping_start() | |
200 | |
201 # Is it the flow sequence end indicator? | |
202 if ch == ']': | |
203 return self.fetch_flow_sequence_end() | |
204 | |
205 # Is it the flow mapping end indicator? | |
206 if ch == '}': | |
207 return self.fetch_flow_mapping_end() | |
208 | |
209 # Is it the flow entry indicator? | |
210 if ch == ',': | |
211 return self.fetch_flow_entry() | |
212 | |
213 # Is it the block entry indicator? | |
214 if ch == '-' and self.check_block_entry(): | |
215 return self.fetch_block_entry() | |
216 | |
217 # Is it the key indicator? | |
218 if ch == '?' and self.check_key(): | |
219 return self.fetch_key() | |
220 | |
221 # Is it the value indicator? | |
222 if ch == ':' and self.check_value(): | |
223 return self.fetch_value() | |
224 | |
225 # Is it an alias? | |
226 if ch == '*': | |
227 return self.fetch_alias() | |
228 | |
229 # Is it an anchor? | |
230 if ch == '&': | |
231 return self.fetch_anchor() | |
232 | |
233 # Is it a tag? | |
234 if ch == '!': | |
235 return self.fetch_tag() | |
236 | |
237 # Is it a literal scalar? | |
238 if ch == '|' and not self.flow_level: | |
239 return self.fetch_literal() | |
240 | |
241 # Is it a folded scalar? | |
242 if ch == '>' and not self.flow_level: | |
243 return self.fetch_folded() | |
244 | |
245 # Is it a single quoted scalar? | |
246 if ch == '\'': | |
247 return self.fetch_single() | |
248 | |
249 # Is it a double quoted scalar? | |
250 if ch == '\"': | |
251 return self.fetch_double() | |
252 | |
253 # It must be a plain scalar then. | |
254 if self.check_plain(): | |
255 return self.fetch_plain() | |
256 | |
257 # No? It's an error. Let's produce a nice error message. | |
258 raise ScannerError("while scanning for the next token", None, | |
259 "found character %r that cannot start any token" % ch, | |
260 self.get_mark()) | |
261 | |
262 # Simple keys treatment. | |
263 | |
264 def next_possible_simple_key(self): | |
265 # Return the number of the nearest possible simple key. Actually we | |
266 # don't need to loop through the whole dictionary. We may replace it | |
267 # with the following code: | |
268 # if not self.possible_simple_keys: | |
269 # return None | |
270 # return self.possible_simple_keys[ | |
271 # min(self.possible_simple_keys.keys())].token_number | |
272 min_token_number = None | |
273 for level in self.possible_simple_keys: | |
274 key = self.possible_simple_keys[level] | |
275 if min_token_number is None or key.token_number < min_token_number: | |
276 min_token_number = key.token_number | |
277 return min_token_number | |
278 | |
279 def stale_possible_simple_keys(self): | |
280 # Remove entries that are no longer possible simple keys. According to | |
281 # the YAML specification, simple keys | |
282 # - should be limited to a single line, | |
283 # - should be no longer than 1024 characters. | |
284 # Disabling this procedure will allow simple keys of any length and | |
285 # height (may cause problems if indentation is broken though). | |
286 for level in list(self.possible_simple_keys): | |
287 key = self.possible_simple_keys[level] | |
288 if key.line != self.line \ | |
289 or self.index-key.index > 1024: | |
290 if key.required: | |
291 raise ScannerError("while scanning a simple key", key.mark, | |
292 "could not find expected ':'", self.get_mark()) | |
293 del self.possible_simple_keys[level] | |
294 | |
295 def save_possible_simple_key(self): | |
296 # The next token may start a simple key. We check if it's possible | |
297 # and save its position. This function is called for | |
298 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. | |
299 | |
300 # Check if a simple key is required at the current position. | |
301 required = not self.flow_level and self.indent == self.column | |
302 | |
303 # The next token might be a simple key. Let's save its number and | |
304 # position. | |
305 if self.allow_simple_key: | |
306 self.remove_possible_simple_key() | |
307 token_number = self.tokens_taken+len(self.tokens) | |
308 key = SimpleKey(token_number, required, | |
309 self.index, self.line, self.column, self.get_mark()) | |
310 self.possible_simple_keys[self.flow_level] = key | |
311 | |
312 def remove_possible_simple_key(self): | |
313 # Remove the saved possible key position at the current flow level. | |
314 if self.flow_level in self.possible_simple_keys: | |
315 key = self.possible_simple_keys[self.flow_level] | |
316 | |
317 if key.required: | |
318 raise ScannerError("while scanning a simple key", key.mark, | |
319 "could not find expected ':'", self.get_mark()) | |
320 | |
321 del self.possible_simple_keys[self.flow_level] | |
322 | |
323 # Indentation functions. | |
324 | |
325 def unwind_indent(self, column): | |
326 | |
327 ## In flow context, tokens should respect indentation. | |
328 ## Actually the condition should be `self.indent >= column` according to | |
329 ## the spec. But this condition will prohibit intuitively correct | |
330 ## constructions such as | |
331 ## key : { | |
332 ## } | |
333 #if self.flow_level and self.indent > column: | |
334 # raise ScannerError(None, None, | |
335 # "invalid indentation or unclosed '[' or '{'", | |
336 # self.get_mark()) | |
337 | |
338 # In the flow context, indentation is ignored. We make the scanner less | |
339 # restrictive than the specification requires. | |
340 if self.flow_level: | |
341 return | |
342 | |
343 # In block context, we may need to issue the BLOCK-END tokens. | |
344 while self.indent > column: | |
345 mark = self.get_mark() | |
346 self.indent = self.indents.pop() | |
347 self.tokens.append(BlockEndToken(mark, mark)) | |
348 | |
349 def add_indent(self, column): | |
350 # Check if we need to increase indentation. | |
351 if self.indent < column: | |
352 self.indents.append(self.indent) | |
353 self.indent = column | |
354 return True | |
355 return False | |
356 | |
357 # Fetchers. | |
358 | |
359 def fetch_stream_start(self): | |
360 # We always add STREAM-START as the first token and STREAM-END as the | |
361 # last token. | |
362 | |
363 # Read the token. | |
364 mark = self.get_mark() | |
365 | |
366 # Add STREAM-START. | |
367 self.tokens.append(StreamStartToken(mark, mark, | |
368 encoding=self.encoding)) | |
369 | |
370 | |
371 def fetch_stream_end(self): | |
372 | |
373 # Set the current indentation to -1. | |
374 self.unwind_indent(-1) | |
375 | |
376 # Reset simple keys. | |
377 self.remove_possible_simple_key() | |
378 self.allow_simple_key = False | |
379 self.possible_simple_keys = {} | |
380 | |
381 # Read the token. | |
382 mark = self.get_mark() | |
383 | |
384 # Add STREAM-END. | |
385 self.tokens.append(StreamEndToken(mark, mark)) | |
386 | |
387 # The stream is finished. | |
388 self.done = True | |
389 | |
390 def fetch_directive(self): | |
391 | |
392 # Set the current indentation to -1. | |
393 self.unwind_indent(-1) | |
394 | |
395 # Reset simple keys. | |
396 self.remove_possible_simple_key() | |
397 self.allow_simple_key = False | |
398 | |
399 # Scan and add DIRECTIVE. | |
400 self.tokens.append(self.scan_directive()) | |
401 | |
402 def fetch_document_start(self): | |
403 self.fetch_document_indicator(DocumentStartToken) | |
404 | |
405 def fetch_document_end(self): | |
406 self.fetch_document_indicator(DocumentEndToken) | |
407 | |
408 def fetch_document_indicator(self, TokenClass): | |
409 | |
410 # Set the current indentation to -1. | |
411 self.unwind_indent(-1) | |
412 | |
413 # Reset simple keys. Note that there could not be a block collection | |
414 # after '---'. | |
415 self.remove_possible_simple_key() | |
416 self.allow_simple_key = False | |
417 | |
418 # Add DOCUMENT-START or DOCUMENT-END. | |
419 start_mark = self.get_mark() | |
420 self.forward(3) | |
421 end_mark = self.get_mark() | |
422 self.tokens.append(TokenClass(start_mark, end_mark)) | |
423 | |
424 def fetch_flow_sequence_start(self): | |
425 self.fetch_flow_collection_start(FlowSequenceStartToken) | |
426 | |
427 def fetch_flow_mapping_start(self): | |
428 self.fetch_flow_collection_start(FlowMappingStartToken) | |
429 | |
430 def fetch_flow_collection_start(self, TokenClass): | |
431 | |
432 # '[' and '{' may start a simple key. | |
433 self.save_possible_simple_key() | |
434 | |
435 # Increase the flow level. | |
436 self.flow_level += 1 | |
437 | |
438 # Simple keys are allowed after '[' and '{'. | |
439 self.allow_simple_key = True | |
440 | |
441 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. | |
442 start_mark = self.get_mark() | |
443 self.forward() | |
444 end_mark = self.get_mark() | |
445 self.tokens.append(TokenClass(start_mark, end_mark)) | |
446 | |
447 def fetch_flow_sequence_end(self): | |
448 self.fetch_flow_collection_end(FlowSequenceEndToken) | |
449 | |
450 def fetch_flow_mapping_end(self): | |
451 self.fetch_flow_collection_end(FlowMappingEndToken) | |
452 | |
453 def fetch_flow_collection_end(self, TokenClass): | |
454 | |
455 # Reset possible simple key on the current level. | |
456 self.remove_possible_simple_key() | |
457 | |
458 # Decrease the flow level. | |
459 self.flow_level -= 1 | |
460 | |
461 # No simple keys after ']' or '}'. | |
462 self.allow_simple_key = False | |
463 | |
464 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. | |
465 start_mark = self.get_mark() | |
466 self.forward() | |
467 end_mark = self.get_mark() | |
468 self.tokens.append(TokenClass(start_mark, end_mark)) | |
469 | |
470 def fetch_flow_entry(self): | |
471 | |
472 # Simple keys are allowed after ','. | |
473 self.allow_simple_key = True | |
474 | |
475 # Reset possible simple key on the current level. | |
476 self.remove_possible_simple_key() | |
477 | |
478 # Add FLOW-ENTRY. | |
479 start_mark = self.get_mark() | |
480 self.forward() | |
481 end_mark = self.get_mark() | |
482 self.tokens.append(FlowEntryToken(start_mark, end_mark)) | |
483 | |
484 def fetch_block_entry(self): | |
485 | |
486 # Block context needs additional checks. | |
487 if not self.flow_level: | |
488 | |
489 # Are we allowed to start a new entry? | |
490 if not self.allow_simple_key: | |
491 raise ScannerError(None, None, | |
492 "sequence entries are not allowed here", | |
493 self.get_mark()) | |
494 | |
495 # We may need to add BLOCK-SEQUENCE-START. | |
496 if self.add_indent(self.column): | |
497 mark = self.get_mark() | |
498 self.tokens.append(BlockSequenceStartToken(mark, mark)) | |
499 | |
500 # It's an error for the block entry to occur in the flow context, | |
501 # but we let the parser detect this. | |
502 else: | |
503 pass | |
504 | |
505 # Simple keys are allowed after '-'. | |
506 self.allow_simple_key = True | |
507 | |
508 # Reset possible simple key on the current level. | |
509 self.remove_possible_simple_key() | |
510 | |
511 # Add BLOCK-ENTRY. | |
512 start_mark = self.get_mark() | |
513 self.forward() | |
514 end_mark = self.get_mark() | |
515 self.tokens.append(BlockEntryToken(start_mark, end_mark)) | |
516 | |
517 def fetch_key(self): | |
518 | |
519 # Block context needs additional checks. | |
520 if not self.flow_level: | |
521 | |
522 # Are we allowed to start a key (not necessarily a simple one)? | |
523 if not self.allow_simple_key: | |
524 raise ScannerError(None, None, | |
525 "mapping keys are not allowed here", | |
526 self.get_mark()) | |
527 | |
528 # We may need to add BLOCK-MAPPING-START. | |
529 if self.add_indent(self.column): | |
530 mark = self.get_mark() | |
531 self.tokens.append(BlockMappingStartToken(mark, mark)) | |
532 | |
533 # Simple keys are allowed after '?' in the block context. | |
534 self.allow_simple_key = not self.flow_level | |
535 | |
536 # Reset possible simple key on the current level. | |
537 self.remove_possible_simple_key() | |
538 | |
539 # Add KEY. | |
540 start_mark = self.get_mark() | |
541 self.forward() | |
542 end_mark = self.get_mark() | |
543 self.tokens.append(KeyToken(start_mark, end_mark)) | |
544 | |
545 def fetch_value(self): | |
546 | |
547 # Do we determine a simple key? | |
548 if self.flow_level in self.possible_simple_keys: | |
549 | |
550 # Add KEY. | |
551 key = self.possible_simple_keys[self.flow_level] | |
552 del self.possible_simple_keys[self.flow_level] | |
553 self.tokens.insert(key.token_number-self.tokens_taken, | |
554 KeyToken(key.mark, key.mark)) | |
555 | |
556 # If this key starts a new block mapping, we need to add | |
557 # BLOCK-MAPPING-START. | |
558 if not self.flow_level: | |
559 if self.add_indent(key.column): | |
560 self.tokens.insert(key.token_number-self.tokens_taken, | |
561 BlockMappingStartToken(key.mark, key.mark)) | |
562 | |
563 # There cannot be two simple keys one after another. | |
564 self.allow_simple_key = False | |
565 | |
566 # It must be a part of a complex key. | |
567 else: | |
568 | |
569 # Block context needs additional checks. | |
570 # (Do we really need them? They will be caught by the parser | |
571 # anyway.) | |
572 if not self.flow_level: | |
573 | |
574 # We are allowed to start a complex value if and only if | |
575 # we can start a simple key. | |
576 if not self.allow_simple_key: | |
577 raise ScannerError(None, None, | |
578 "mapping values are not allowed here", | |
579 self.get_mark()) | |
580 | |
581 # If this value starts a new block mapping, we need to add | |
582 # BLOCK-MAPPING-START. It will be detected as an error later by | |
583 # the parser. | |
584 if not self.flow_level: | |
585 if self.add_indent(self.column): | |
586 mark = self.get_mark() | |
587 self.tokens.append(BlockMappingStartToken(mark, mark)) | |
588 | |
589 # Simple keys are allowed after ':' in the block context. | |
590 self.allow_simple_key = not self.flow_level | |
591 | |
592 # Reset possible simple key on the current level. | |
593 self.remove_possible_simple_key() | |
594 | |
595 # Add VALUE. | |
596 start_mark = self.get_mark() | |
597 self.forward() | |
598 end_mark = self.get_mark() | |
599 self.tokens.append(ValueToken(start_mark, end_mark)) | |
600 | |
601 def fetch_alias(self): | |
602 | |
603 # ALIAS could be a simple key. | |
604 self.save_possible_simple_key() | |
605 | |
606 # No simple keys after ALIAS. | |
607 self.allow_simple_key = False | |
608 | |
609 # Scan and add ALIAS. | |
610 self.tokens.append(self.scan_anchor(AliasToken)) | |
611 | |
612 def fetch_anchor(self): | |
613 | |
614 # ANCHOR could start a simple key. | |
615 self.save_possible_simple_key() | |
616 | |
617 # No simple keys after ANCHOR. | |
618 self.allow_simple_key = False | |
619 | |
620 # Scan and add ANCHOR. | |
621 self.tokens.append(self.scan_anchor(AnchorToken)) | |
622 | |
623 def fetch_tag(self): | |
624 | |
625 # TAG could start a simple key. | |
626 self.save_possible_simple_key() | |
627 | |
628 # No simple keys after TAG. | |
629 self.allow_simple_key = False | |
630 | |
631 # Scan and add TAG. | |
632 self.tokens.append(self.scan_tag()) | |
633 | |
634 def fetch_literal(self): | |
635 self.fetch_block_scalar(style='|') | |
636 | |
637 def fetch_folded(self): | |
638 self.fetch_block_scalar(style='>') | |
639 | |
640 def fetch_block_scalar(self, style): | |
641 | |
642 # A simple key may follow a block scalar. | |
643 self.allow_simple_key = True | |
644 | |
645 # Reset possible simple key on the current level. | |
646 self.remove_possible_simple_key() | |
647 | |
648 # Scan and add SCALAR. | |
649 self.tokens.append(self.scan_block_scalar(style)) | |
650 | |
651 def fetch_single(self): | |
652 self.fetch_flow_scalar(style='\'') | |
653 | |
654 def fetch_double(self): | |
655 self.fetch_flow_scalar(style='"') | |
656 | |
657 def fetch_flow_scalar(self, style): | |
658 | |
659 # A flow scalar could be a simple key. | |
660 self.save_possible_simple_key() | |
661 | |
662 # No simple keys after flow scalars. | |
663 self.allow_simple_key = False | |
664 | |
665 # Scan and add SCALAR. | |
666 self.tokens.append(self.scan_flow_scalar(style)) | |
667 | |
668 def fetch_plain(self): | |
669 | |
670 # A plain scalar could be a simple key. | |
671 self.save_possible_simple_key() | |
672 | |
673 # No simple keys after plain scalars. But note that `scan_plain` will | |
674 # change this flag if the scan is finished at the beginning of the | |
675 # line. | |
676 self.allow_simple_key = False | |
677 | |
678 # Scan and add SCALAR. May change `allow_simple_key`. | |
679 self.tokens.append(self.scan_plain()) | |
680 | |
681 # Checkers. | |
682 | |
683 def check_directive(self): | |
684 | |
685 # DIRECTIVE: ^ '%' ... | |
686 # The '%' indicator is already checked. | |
687 if self.column == 0: | |
688 return True | |
689 | |
690 def check_document_start(self): | |
691 | |
692 # DOCUMENT-START: ^ '---' (' '|'\n') | |
693 if self.column == 0: | |
694 if self.prefix(3) == '---' \ | |
695 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': | |
696 return True | |
697 | |
698 def check_document_end(self): | |
699 | |
700 # DOCUMENT-END: ^ '...' (' '|'\n') | |
701 if self.column == 0: | |
702 if self.prefix(3) == '...' \ | |
703 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': | |
704 return True | |
705 | |
706 def check_block_entry(self): | |
707 | |
708 # BLOCK-ENTRY: '-' (' '|'\n') | |
709 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' | |
710 | |
711 def check_key(self): | |
712 | |
713 # KEY(flow context): '?' | |
714 if self.flow_level: | |
715 return True | |
716 | |
717 # KEY(block context): '?' (' '|'\n') | |
718 else: | |
719 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' | |
720 | |
721 def check_value(self): | |
722 | |
723 # VALUE(flow context): ':' | |
724 if self.flow_level: | |
725 return True | |
726 | |
727 # VALUE(block context): ':' (' '|'\n') | |
728 else: | |
729 return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' | |
730 | |
731 def check_plain(self): | |
732 | |
733 # A plain scalar may start with any non-space character except: | |
734 # '-', '?', ':', ',', '[', ']', '{', '}', | |
735 # '#', '&', '*', '!', '|', '>', '\'', '\"', | |
736 # '%', '@', '`'. | |
737 # | |
738 # It may also start with | |
739 # '-', '?', ':' | |
740 # if it is followed by a non-space character. | |
741 # | |
742 # Note that we limit the last rule to the block context (except the | |
743 # '-' character) because we want the flow context to be space | |
744 # independent. | |
745 ch = self.peek() | |
746 return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \ | |
747 or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029' | |
748 and (ch == '-' or (not self.flow_level and ch in '?:'))) | |
749 | |
750 # Scanners. | |
751 | |
752 def scan_to_next_token(self): | |
753 # We ignore spaces, line breaks and comments. | |
754 # If we find a line break in the block context, we set the flag | |
755 # `allow_simple_key` on. | |
756 # The byte order mark is stripped if it's the first character in the | |
757 # stream. We do not yet support BOM inside the stream as the | |
758 # specification requires. Any such mark will be considered as a part | |
759 # of the document. | |
760 # | |
761 # TODO: We need to make tab handling rules more sane. A good rule is | |
762 # Tabs cannot precede tokens | |
763 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, | |
764 # KEY(block), VALUE(block), BLOCK-ENTRY | |
765 # So the checking code is | |
766 # if <TAB>: | |
767 # self.allow_simple_key = False | |
768 # We also need to add the check for `allow_simple_key == True` to | |
769 # `unwind_indent` before issuing BLOCK-END. | |
770 # Scanners for block, flow, and plain scalars need to be modified. | |
771 | |
772 if self.index == 0 and self.peek() == '\uFEFF': | |
773 self.forward() | |
774 found = False | |
775 while not found: | |
776 while self.peek() == ' ': | |
777 self.forward() | |
778 if self.peek() == '#': | |
779 while self.peek() not in '\0\r\n\x85\u2028\u2029': | |
780 self.forward() | |
781 if self.scan_line_break(): | |
782 if not self.flow_level: | |
783 self.allow_simple_key = True | |
784 else: | |
785 found = True | |
786 | |
787 def scan_directive(self): | |
788 # See the specification for details. | |
789 start_mark = self.get_mark() | |
790 self.forward() | |
791 name = self.scan_directive_name(start_mark) | |
792 value = None | |
793 if name == 'YAML': | |
794 value = self.scan_yaml_directive_value(start_mark) | |
795 end_mark = self.get_mark() | |
796 elif name == 'TAG': | |
797 value = self.scan_tag_directive_value(start_mark) | |
798 end_mark = self.get_mark() | |
799 else: | |
800 end_mark = self.get_mark() | |
801 while self.peek() not in '\0\r\n\x85\u2028\u2029': | |
802 self.forward() | |
803 self.scan_directive_ignored_line(start_mark) | |
804 return DirectiveToken(name, value, start_mark, end_mark) | |
805 | |
806 def scan_directive_name(self, start_mark): | |
807 # See the specification for details. | |
808 length = 0 | |
809 ch = self.peek(length) | |
810 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ | |
811 or ch in '-_': | |
812 length += 1 | |
813 ch = self.peek(length) | |
814 if not length: | |
815 raise ScannerError("while scanning a directive", start_mark, | |
816 "expected alphabetic or numeric character, but found %r" | |
817 % ch, self.get_mark()) | |
818 value = self.prefix(length) | |
819 self.forward(length) | |
820 ch = self.peek() | |
821 if ch not in '\0 \r\n\x85\u2028\u2029': | |
822 raise ScannerError("while scanning a directive", start_mark, | |
823 "expected alphabetic or numeric character, but found %r" | |
824 % ch, self.get_mark()) | |
825 return value | |
826 | |
827 def scan_yaml_directive_value(self, start_mark): | |
828 # See the specification for details. | |
829 while self.peek() == ' ': | |
830 self.forward() | |
831 major = self.scan_yaml_directive_number(start_mark) | |
832 if self.peek() != '.': | |
833 raise ScannerError("while scanning a directive", start_mark, | |
834 "expected a digit or '.', but found %r" % self.peek(), | |
835 self.get_mark()) | |
836 self.forward() | |
837 minor = self.scan_yaml_directive_number(start_mark) | |
838 if self.peek() not in '\0 \r\n\x85\u2028\u2029': | |
839 raise ScannerError("while scanning a directive", start_mark, | |
840 "expected a digit or ' ', but found %r" % self.peek(), | |
841 self.get_mark()) | |
842 return (major, minor) | |
843 | |
844 def scan_yaml_directive_number(self, start_mark): | |
845 # See the specification for details. | |
846 ch = self.peek() | |
847 if not ('0' <= ch <= '9'): | |
848 raise ScannerError("while scanning a directive", start_mark, | |
849 "expected a digit, but found %r" % ch, self.get_mark()) | |
850 length = 0 | |
851 while '0' <= self.peek(length) <= '9': | |
852 length += 1 | |
853 value = int(self.prefix(length)) | |
854 self.forward(length) | |
855 return value | |
856 | |
857 def scan_tag_directive_value(self, start_mark): | |
858 # See the specification for details. | |
859 while self.peek() == ' ': | |
860 self.forward() | |
861 handle = self.scan_tag_directive_handle(start_mark) | |
862 while self.peek() == ' ': | |
863 self.forward() | |
864 prefix = self.scan_tag_directive_prefix(start_mark) | |
865 return (handle, prefix) | |
866 | |
867 def scan_tag_directive_handle(self, start_mark): | |
868 # See the specification for details. | |
869 value = self.scan_tag_handle('directive', start_mark) | |
870 ch = self.peek() | |
871 if ch != ' ': | |
872 raise ScannerError("while scanning a directive", start_mark, | |
873 "expected ' ', but found %r" % ch, self.get_mark()) | |
874 return value | |
875 | |
876 def scan_tag_directive_prefix(self, start_mark): | |
877 # See the specification for details. | |
878 value = self.scan_tag_uri('directive', start_mark) | |
879 ch = self.peek() | |
880 if ch not in '\0 \r\n\x85\u2028\u2029': | |
881 raise ScannerError("while scanning a directive", start_mark, | |
882 "expected ' ', but found %r" % ch, self.get_mark()) | |
883 return value | |
884 | |
885 def scan_directive_ignored_line(self, start_mark): | |
886 # See the specification for details. | |
887 while self.peek() == ' ': | |
888 self.forward() | |
889 if self.peek() == '#': | |
890 while self.peek() not in '\0\r\n\x85\u2028\u2029': | |
891 self.forward() | |
892 ch = self.peek() | |
893 if ch not in '\0\r\n\x85\u2028\u2029': | |
894 raise ScannerError("while scanning a directive", start_mark, | |
895 "expected a comment or a line break, but found %r" | |
896 % ch, self.get_mark()) | |
897 self.scan_line_break() | |
898 | |
899 def scan_anchor(self, TokenClass): | |
900 # The specification does not restrict characters for anchors and | |
901 # aliases. This may lead to problems, for instance, the document: | |
902 # [ *alias, value ] | |
903 # can be interpreted in two ways, as | |
904 # [ "value" ] | |
905 # and | |
906 # [ *alias , "value" ] | |
907 # Therefore we restrict anchors and aliases to ASCII letters, digits, '-' and '_'. | |
908 start_mark = self.get_mark() | |
909 indicator = self.peek() | |
910 if indicator == '*': | |
911 name = 'alias' | |
912 else: | |
913 name = 'anchor' | |
914 self.forward() | |
915 length = 0 | |
916 ch = self.peek(length) | |
917 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ | |
918 or ch in '-_': | |
919 length += 1 | |
920 ch = self.peek(length) | |
921 if not length: | |
922 raise ScannerError("while scanning an %s" % name, start_mark, | |
923 "expected alphabetic or numeric character, but found %r" | |
924 % ch, self.get_mark()) | |
925 value = self.prefix(length) | |
926 self.forward(length) | |
927 ch = self.peek() | |
928 if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`': | |
929 raise ScannerError("while scanning an %s" % name, start_mark, | |
930 "expected alphabetic or numeric character, but found %r" | |
931 % ch, self.get_mark()) | |
932 end_mark = self.get_mark() | |
933 return TokenClass(value, start_mark, end_mark) | |
934 | |
def scan_tag(self):
    """Scan a tag and return a ``TagToken`` whose value is ``(handle, suffix)``.

    Three shapes are recognised, decided by the character after the
    leading indicator:

    * ``<`` - a URI terminated by ``>``; the handle is ``None``.
    * whitespace or end of stream - the handle is ``None`` and the
      suffix is ``'!'``.
    * anything else - a handle followed by a suffix.  If another ``!``
      occurs before the next space or line break, the handle is read
      with ``scan_tag_handle``; otherwise the handle is ``'!'``.

    Raises ``ScannerError`` if ``>`` is missing or the tag is not followed
    by a space, a line break or the end of the stream.
    """
    tag_end = '\0 \r\n\x85\u2028\u2029'
    start_mark = self.get_mark()
    ch = self.peek(1)
    if ch == '<':
        handle = None
        self.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        closing = self.peek()
        if closing != '>':
            raise ScannerError("while parsing a tag", start_mark,
                    "expected '>', but found %r" % closing,
                    self.get_mark())
        self.forward()
    elif ch in '\0 \t\r\n\x85\u2028\u2029':
        handle = None
        suffix = '!'
        self.forward()
    else:
        # Look ahead for a second '!' that would close a named handle.
        offset = 1
        while ch not in tag_end and ch != '!':
            offset += 1
            ch = self.peek(offset)
        if ch == '!':
            handle = self.scan_tag_handle('tag', start_mark)
        else:
            handle = '!'
            self.forward()
        suffix = self.scan_tag_uri('tag', start_mark)

    ch = self.peek()
    if ch not in tag_end:
        raise ScannerError("while scanning a tag", start_mark,
                "expected ' ', but found %r" % ch, self.get_mark())
    return TagToken((handle, suffix), start_mark, self.get_mark())
975 | |
def scan_block_scalar(self, style):
    """Scan a block scalar and return it as a non-plain ``ScalarToken``.

    ``style`` equal to ``'>'`` selects folding; any other value leaves
    line breaks untouched.  The header may carry a chomping indicator and
    an indentation increment (see ``scan_block_scalar_indicators``):

    * chomping ``None``  - keep the final line break only;
    * chomping ``True``  - keep the final line break and trailing breaks;
    * chomping ``False`` - keep neither.
    """
    folded = (style == '>')
    pieces = []
    start_mark = self.get_mark()

    # Header: the style indicator, optional indicators, rest of the line.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)

    # Work out the content indentation and move to the first non-empty line.
    min_indent = max(self.indent + 1, 1)
    if increment is not None:
        indent = min_indent + increment - 1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    else:
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    line_break = ''

    # Content lines.
    while self.column == indent and self.peek() != '\0':
        pieces.extend(breaks)
        leading_non_space = self.peek() not in ' \t'
        length = 0
        while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
            length += 1
        pieces.append(self.prefix(length))
        self.forward(length)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if self.column != indent or self.peek() == '\0':
            break

        # The folding rules are ambiguous; this follows the specification
        # text: a single '\n' between two lines that both start with a
        # non-space character folds into one space (or into nothing when
        # empty lines follow).  An alternative reading (Clark Evans's, also
        # used by the spec examples) would not require the previous line
        # to start with a non-space character.
        foldable = (folded and line_break == '\n'
                    and leading_non_space and self.peek() not in ' \t')
        if not foldable:
            pieces.append(line_break)
        elif not breaks:
            pieces.append(' ')

    # Chomp the tail.
    if chomping is not False:
        pieces.append(line_break)
    if chomping is True:
        pieces.extend(breaks)

    return ScalarToken(''.join(pieces), False, start_mark, end_mark,
            style)
1051 | |
def scan_block_scalar_indicators(self, start_mark):
    """Scan the optional block scalar header indicators.

    Returns ``(chomping, increment)``.  ``chomping`` is ``True`` for
    ``+``, ``False`` for ``-`` and ``None`` when absent; ``increment`` is
    the indentation digit as an ``int`` (1-9) or ``None`` when absent.
    The two indicators may appear in either order, each at most once.

    Raises ``ScannerError`` for an indentation indicator of ``0`` or when
    the indicators are not followed by a space, a line break or the end
    of the stream.
    """
    chomping = None
    increment = None
    # At most two indicators, one of each kind, in either order.
    for _ in range(2):
        ch = self.peek()
        if chomping is None and ch in '+-':
            chomping = (ch == '+')
        elif increment is None and ch in '0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
        else:
            break
        self.forward()
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected chomping or indentation indicators, but found %r"
                % ch, self.get_mark())
    return chomping, increment
1091 | |
def scan_block_scalar_ignored_line(self, start_mark):
    """Consume the remainder of a block scalar header line.

    Skips spaces, then an optional ``#`` comment, and finally the line
    break itself.  Raises ``ScannerError`` if anything other than a
    comment, a line break or the end of the stream is found.
    """
    line_end = '\0\r\n\x85\u2028\u2029'
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        # A comment runs up to (not including) the end of the line.
        while self.peek() not in line_end:
            self.forward()
    found = self.peek()
    if found not in line_end:
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected a comment or a line break, but found %r" % found,
                self.get_mark())
    self.scan_line_break()
1105 | |
def scan_block_scalar_indentation(self):
    """Skip leading blank space and detect the block scalar indentation.

    Consumes spaces and line breaks.  Returns ``(breaks, max_indent,
    end_mark)``: the line breaks that were scanned, the largest column
    reached while skipping spaces, and the mark taken after the last line
    break (or at entry if there was none).
    """
    breaks = []
    max_indent = 0
    end_mark = self.get_mark()
    while self.peek() in ' \r\n\x85\u2028\u2029':
        if self.peek() == ' ':
            self.forward()
            max_indent = max(max_indent, self.column)
        else:
            breaks.append(self.scan_line_break())
            end_mark = self.get_mark()
    return breaks, max_indent, end_mark
1120 | |
def scan_block_scalar_breaks(self, indent):
    """Scan the blank lines between block scalar content lines.

    Skips indentation spaces up to column ``indent`` and collects every
    line break found.  Returns ``(breaks, end_mark)`` where ``end_mark``
    is the mark taken after the last line break (or at entry if there was
    none).
    """
    breaks = []
    end_mark = self.get_mark()
    while True:
        # Eat indentation, but never past the content column.
        while self.column < indent and self.peek() == ' ':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return breaks, end_mark
        breaks.append(self.scan_line_break())
        end_mark = self.get_mark()
1133 | |
def scan_flow_scalar(self, style):
    """Scan a quoted scalar and return it as a non-plain ``ScalarToken``.

    ``style`` equal to ``'"'`` selects double-quoted handling; any other
    value is scanned as single-quoted.

    Indentation rules are deliberately loosened for quoted scalars: the
    quote characters clearly mark the beginning and the end, so the
    content does not need to adhere to indentation.  This makes the
    scanner less restrictive than the specification requires; the only
    check kept is that document separators are not included in scalars.
    """
    double = (style == '"')
    start_mark = self.get_mark()
    quote = self.peek()
    self.forward()
    pieces = list(self.scan_flow_scalar_non_spaces(double, start_mark))
    # Alternate whitespace runs and text runs until the closing quote.
    while self.peek() != quote:
        pieces += self.scan_flow_scalar_spaces(double, start_mark)
        pieces += self.scan_flow_scalar_non_spaces(double, start_mark)
    self.forward()
    end_mark = self.get_mark()
    return ScalarToken(''.join(pieces), False, start_mark, end_mark,
            style)
1157 | |
# Single-character escapes of double-quoted scalars: the character that
# follows the backslash maps to the text it stands for.
ESCAPE_REPLACEMENTS = {
    '0':    '\0',       # NUL
    'a':    '\x07',     # bell
    'b':    '\x08',     # backspace
    't':    '\x09',     # tab
    '\t':   '\x09',     # a literal tab after the backslash
    'n':    '\x0A',     # line feed
    'v':    '\x0B',     # vertical tab
    'f':    '\x0C',     # form feed
    'r':    '\x0D',     # carriage return
    'e':    '\x1B',     # escape
    ' ':    '\x20',     # space
    '\"':   '\"',       # double quote
    '\\':   '\\',       # backslash
    '/':    '/',        # slash
    'N':    '\x85',     # next line (NEL)
    '_':    '\xA0',     # non-breaking space
    'L':    '\u2028',   # line separator
    'P':    '\u2029',   # paragraph separator
}

# Numeric escapes: the introducing character maps to the number of
# hexadecimal digits that must follow it.
ESCAPE_CODES = {
    'x':    2,
    'u':    4,
    'U':    8,
}
1184 | |
def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan a run of non-whitespace content inside a quoted scalar.

    ``double`` selects double-quoted rules (backslash escapes) over
    single-quoted rules (``''`` stands for one quote).  Returns the list
    of decoded text chunks; scanning stops, without consuming it, at
    whitespace, a line break, the end of the stream or the closing quote.

    Raises ``ScannerError`` for a malformed or unknown escape sequence,
    including a numeric escape whose value lies above U+10FFFF.
    """
    chunks = []
    while True:
        # Plain text up to the next quote, backslash or blank.
        length = 0
        while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == '\'' and self.peek(1) == '\'':
            # A doubled quote in a single-quoted scalar is one quote.
            chunks.append('\'')
            self.forward(2)
        elif (double and ch == '\'') or (not double and ch in '\"\\'):
            # The other style's special characters are ordinary text here.
            chunks.append(ch)
            self.forward()
        elif double and ch == '\\':
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in '0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                    (length, self.peek(k)), self.get_mark())
                code = int(self.prefix(length), 16)
                # chr() rejects values above U+10FFFF with ValueError or
                # OverflowError; report them as a scanner error so callers
                # only ever see YAML errors for bad input.
                if code > 0x10FFFF:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "expected a Unicode code point no greater than 0x10FFFF, but found 0x%X"
                            % code, self.get_mark())
                chunks.append(chr(code))
                self.forward(length)
            elif ch in '\r\n\x85\u2028\u2029':
                # An escaped line break: the break itself is dropped.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch, self.get_mark())
        else:
            return chunks
1227 | |
def scan_flow_scalar_spaces(self, double, start_mark):
    """Scan a run of whitespace inside a quoted scalar.

    Returns the list of text chunks the run contributes:

    * spaces and tabs followed by more content are kept verbatim;
    * spaces and tabs followed by a line break are dropped; a lone
      ``'\\n'`` break becomes a single space, and when further breaks
      follow only those are kept; any other kind of line break is kept
      as it is.

    Raises ``ScannerError`` if the stream ends inside the scalar.
    """
    length = 0
    while self.peek(length) in ' \t':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)

    ch = self.peek()
    if ch == '\0':
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    if ch not in '\r\n\x85\u2028\u2029':
        return [whitespaces]

    line_break = self.scan_line_break()
    breaks = self.scan_flow_scalar_breaks(double, start_mark)
    chunks = []
    if line_break != '\n':
        chunks.append(line_break)
    elif not breaks:
        chunks.append(' ')
    chunks.extend(breaks)
    return chunks
1251 | |
def scan_flow_scalar_breaks(self, double, start_mark):
    """Collect the consecutive line breaks inside a quoted scalar.

    Leading spaces and tabs on each line are skipped.  Instead of checking
    indentation, every line start is checked for a document separator
    (``---`` or ``...`` followed by whitespace or the end of the stream),
    which raises ``ScannerError``.  Returns the list of line breaks.
    """
    breaks = []
    while True:
        marker = self.prefix(3)
        if marker in ('---', '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        while self.peek() in ' \t':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return breaks
        breaks.append(self.scan_line_break())
1269 | |
def scan_plain(self):
    """Scan a plain (unquoted) scalar and return it as a plain ``ScalarToken``.

    Differences from the specification:

    * in the flow context a plain scalar cannot contain ``,`` or ``?``
      (nor any of ``[]{}``);
    * the ``allow_simple_key`` flag is maintained here;
    * indentation rules are loosened for the flow context;
    * zero indentation is allowed, so document separators at the start
      of a line are checked for (in ``scan_plain_spaces``).
    """
    blank = '\0 \t\r\n\x85\u2028\u2029'
    pieces = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent + 1
    pending_spaces = []
    # A '#' at the start of a run begins a comment, which ends the scalar.
    while self.peek() != '#':
        # Measure the next run of scalar characters.
        length = 0
        while True:
            ch = self.peek(length)
            if ch in blank:
                break
            # ':' ends the scalar only when followed by a blank (or, in the
            # flow context, by a flow indicator).
            if ch == ':' and self.peek(length + 1) in \
                    blank + (',[]{}' if self.flow_level else ''):
                break
            if self.flow_level and ch in ',?[]{}':
                break
            length += 1
        if length == 0:
            break
        self.allow_simple_key = False
        pieces.extend(pending_spaces)
        pieces.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()
        pending_spaces = self.scan_plain_spaces(indent, start_mark)
        if not pending_spaces or self.peek() == '#' \
                or (not self.flow_level and self.column < indent):
            break
    return ScalarToken(''.join(pieces), True, start_mark, end_mark)
1310 | |
def scan_plain_spaces(self, indent, start_mark):
    """Scan the whitespace that follows a chunk of a plain scalar.

    Returns the list of text chunks the whitespace contributes, or
    ``None`` when a document separator (``---`` or ``...`` followed by
    whitespace or the end of the stream) is found at the start of a line.
    Sets ``allow_simple_key`` once a line break has been crossed.

    The specification is confusing about tabs in plain scalars, so only
    spaces are treated as separators here; tabs are not accepted.
    """
    def at_document_separator():
        # True when the reader sits on '---' or '...' followed by a blank.
        marker = self.prefix(3)
        return (marker == '---' or marker == '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029'

    length = 0
    while self.peek(length) == ' ':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)

    if self.peek() not in '\r\n\x85\u2028\u2029':
        # Same line: the spaces, if any, belong to the scalar.
        return [whitespaces] if whitespaces else []

    line_break = self.scan_line_break()
    self.allow_simple_key = True
    if at_document_separator():
        return
    breaks = []
    while self.peek() in ' \r\n\x85\u2028\u2029':
        if self.peek() == ' ':
            self.forward()
            continue
        breaks.append(self.scan_line_break())
        if at_document_separator():
            return

    chunks = []
    if line_break != '\n':
        chunks.append(line_break)
    elif not breaks:
        chunks.append(' ')
    chunks.extend(breaks)
    return chunks
1347 | |
def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle and return it.

    The handle is either a lone ``!`` (when a space follows) or ``!``
    followed by zero or more word characters and a closing ``!``.  The
    specification does not allow ``_`` in tag handles; it is accepted
    here anyway, alongside ASCII letters, digits and ``-``.

    ``name`` is only used in error messages.  Raises ``ScannerError``
    when the opening or the closing ``!`` is missing.
    """
    first = self.peek()
    if first != '!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % first, self.get_mark())
    length = 1
    ch = self.peek(length)
    if ch != ' ':
        while ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
               or ch in '-_'):
            length += 1
            ch = self.peek(length)
        if ch != '!':
            # Move to the offending character so the error mark points at it.
            self.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch, self.get_mark())
        length += 1
    handle = self.prefix(length)
    self.forward(length)
    return handle
1371 | |
def scan_tag_uri(self, name, start_mark):
    """Scan a tag URI and return it as a string.

    Consumes ASCII letters, digits and the punctuation accepted in URIs;
    ``%``-escapes are decoded through ``scan_uri_escapes``.  The URI is
    not checked for being well-formed.  ``name`` is only used in error
    messages.  Raises ``ScannerError`` when no URI character is found.
    """
    pieces = []
    length = 0
    ch = self.peek(length)
    while ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
           or ch in '-;/?:@&=+$,_.!~*\'()[]%'):
        if ch != '%':
            length += 1
        else:
            # Flush the literal text seen so far, then decode the escapes.
            pieces.append(self.prefix(length))
            self.forward(length)
            length = 0
            pieces.append(self.scan_uri_escapes(name, start_mark))
        ch = self.peek(length)
    if length:
        pieces.append(self.prefix(length))
        self.forward(length)
    if not pieces:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch, self.get_mark())
    return ''.join(pieces)
1396 | |
def scan_uri_escapes(self, name, start_mark):
    """Decode a run of consecutive ``%XX`` escapes into text.

    Each escape contributes one byte; the collected bytes are decoded as
    UTF-8 and returned as a string.  ``name`` is only used in error
    messages.  Raises ``ScannerError`` when an escape is not followed by
    two hexadecimal digits or when the bytes are not valid UTF-8.
    """
    hex_digits = '0123456789ABCDEFabcdef'
    first_escape_mark = self.get_mark()
    byte_values = []
    while self.peek() == '%':
        self.forward()
        for offset in (0, 1):
            digit = self.peek(offset)
            if digit not in hex_digits:
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
                        % digit, self.get_mark())
        byte_values.append(int(self.prefix(2), 16))
        self.forward(2)
    try:
        return bytes(byte_values).decode('utf-8')
    except UnicodeDecodeError as exc:
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), first_escape_mark)
1415 | |
def scan_line_break(self):
    """Consume one line break, if present, and return its normalized form.

    Transforms:
        '\\r\\n'    ->  '\\n'
        '\\r'      ->  '\\n'
        '\\n'      ->  '\\n'
        '\\x85'    ->  '\\n'
        '\\u2028'  ->  '\\u2028'
        '\\u2029'  ->  '\\u2029'
        default   ->  ''  (nothing is consumed)
    """
    ch = self.peek()
    if ch in '\u2028\u2029':
        # Unicode line and paragraph separators are preserved as they are.
        self.forward()
        return ch
    if ch not in '\r\n\x85':
        return ''
    # A CR LF pair counts as a single break.
    if self.prefix(2) == '\r\n':
        self.forward(2)
    else:
        self.forward()
    return '\n'