comparison: planemo/lib/python3.7/site-packages/lxml/html/diff.py @ 1:56ad4e20f292 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"

author  guerler
date    Fri, 31 Jul 2020 00:32:28 -0400
comparing 0:d30785e31577 with 1:56ad4e20f292

# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML. The first argument is text and never includes any
    markup. The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup). We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

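# A hypothetical custom markup callable, sketched only to illustrate the
# signature html_annotate expects: it receives the plain word text plus the
# version annotation and must return HTML (the names below are examples,
# not part of this module's API).
#
#     def bold_markup(text, version):
#         return '<b class="%s">%s</b>' % (html_escape(_unicode(version), 1), text)
#
#     html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#                   markup=bold_markup)
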
def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
            not tok.pre_tags and
            result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place). """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

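# Rough illustration of the compression step (output shown is approximate):
# two adjacent tokens carrying the same annotation, with no markup between
# them, collapse into a single token.
#
#     t1 = tokenize_annotated('Hello ', 'v1')[0]   # note the trailing space
#     t2 = tokenize_annotated('World', 'v1')[0]
#     compress_tokens([t1, t2])
#     # -> [token('Hello World', ...)] still annotated 'v1'
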
def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document. The documents are HTML
    *fragments* (str/UTF8 or unicode); they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup). Only the
    words in the HTML are diffed. The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

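# Usage sketch with two small fragments (the output shown is approximate;
# exact whitespace can differ):
#
#     htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#     # -> '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'
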
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences. The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document. We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away. Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point. Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that. """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>. To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup. First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

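# Sketch of cleanup_delete on a hypothetical chunk list (spacing of the
# result is approximate):
#
#     chunks = ['<p>', 'Hello ', DEL_START, 'cruel ', DEL_END, 'world ', '</p>']
#     cleanup_delete(chunks)
#     # -> ['<p>', 'Hello ', '<del>', 'cruel', '</del> ', 'world ', '</p>']
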
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span. Similarly, unbalanced_end is a list of
    tags that are closed but were not opened. Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

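# For instance, given a hypothetical chunk list with a stray close tag and a
# stray open tag:
#
#     split_unbalanced(['Hello', '</div>', '<b>', 'World', '</b>', '<p>'])
#     # -> (['<p>'], ['Hello', '<b>', 'World', '</b>'], ['</div>'])
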
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END). Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split). This moves that point (by
    popping items from one and pushing them onto the other). It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document. If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further? Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document. """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user. Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags). Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

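# Constructing a token by hand (illustrative values):
#
#     t = token('Hello', pre_tags=['<p>'], post_tags=['</p>'], trailing_whitespace=' ')
#     t.html()       # -> 'Hello'
#     t.pre_tags     # -> ['<p>']
#     # the token *is* the word text, so str(t) == 'Hello'
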
class tag_token(token):

    """ Represents a token that is actually a tag. Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag. """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag. Unlike other words, we only
    show the href when it changes. """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional. The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

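# Tokenizing a small hypothetical fragment (reprs abbreviated):
#
#     tokenize('<p>Hello <b>World</b></p>')
#     # -> [token('Hello', pre_tags=['<p>'], trailing_whitespace=' '),
#     #     token('World', pre_tags=['<b>'], post_tags=['</b>', '</p>'])]
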
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element. Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed. """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag. Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
    end_words = split_words(el.tail)
    for word in end_words:
        yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate. """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

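# For example:
#
#     split_words('Hello  World\n')
#     # -> ['Hello  ', 'World\n']
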
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag. Includes
    trailing whitespace when appropriate. """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

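# For example (matching the transformation described above):
#
#     fixup_ins_del_tags('<ins><p>word</p></ins>')
#     # -> '<p><ins>word</ins></p>'
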
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML. The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags. """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

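# Sketch of the threshold's effect on hypothetical word lists: an isolated
# one-word match inside a large changed region is discarded, where plain
# difflib.SequenceMatcher would keep it.
#
#     a = 'the quick brown fox jumped over the lazy dog'.split()
#     b = 'a completely different sentence with fox somewhere inside it'.split()
#     InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()
#     # -> only the terminating zero-length block remains; the lone 'fox'
#     #    match falls below the threshold and is dropped
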
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()