Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/lxml/html/soupparser.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 
1
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
1 """External interface to the BeautifulSoup HTML parser. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
2 """ | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
3 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
4 __all__ = ["fromstring", "parse", "convert_tree"] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
5 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
6 import re | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
7 from lxml import etree, html | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
8 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
9 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
10 from bs4 import ( | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
11 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
12 Declaration, Doctype) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
13 _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
14 except ImportError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
15 from BeautifulSoup import ( | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
16 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
17 Declaration) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
18 _DECLARATION_OR_DOCTYPE = Declaration | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
19 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
20 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
21 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
22 """Parse a string of HTML data into an Element tree using the | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
23 BeautifulSoup parser. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
24 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
25 Returns the root ``<html>`` Element of the tree. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
26 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
27 You can pass a different BeautifulSoup parser through the | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
28 `beautifulsoup` keyword, and a diffent Element factory function | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
29 through the `makeelement` keyword. By default, the standard | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
30 ``BeautifulSoup`` class and the default factory of `lxml.html` are | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
31 used. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
32 """ | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
33 return _parse(data, beautifulsoup, makeelement, **bsargs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
34 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
35 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
36 def parse(file, beautifulsoup=None, makeelement=None, **bsargs): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
37 """Parse a file into an ElemenTree using the BeautifulSoup parser. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
38 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
39 You can pass a different BeautifulSoup parser through the | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
40 `beautifulsoup` keyword, and a diffent Element factory function | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
41 through the `makeelement` keyword. By default, the standard | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
42 ``BeautifulSoup`` class and the default factory of `lxml.html` are | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
43 used. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
44 """ | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
45 if not hasattr(file, 'read'): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
46 file = open(file) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
47 root = _parse(file, beautifulsoup, makeelement, **bsargs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
48 return etree.ElementTree(root) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
49 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
50 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
51 def convert_tree(beautiful_soup_tree, makeelement=None): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
52 """Convert a BeautifulSoup tree to a list of Element trees. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
53 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
54 Returns a list instead of a single root Element to support | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
55 HTML-like soup with more than one root element. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
56 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
57 You can pass a different Element factory through the `makeelement` | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
58 keyword. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
59 """ | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
60 root = _convert_tree(beautiful_soup_tree, makeelement) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
61 children = root.getchildren() | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
62 for child in children: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
63 root.remove(child) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
64 return children | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
65 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
66 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
67 # helpers | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
68 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
69 def _parse(source, beautifulsoup, makeelement, **bsargs): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
70 if beautifulsoup is None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
71 beautifulsoup = BeautifulSoup | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
72 if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
73 if 'convertEntities' not in bsargs: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
74 bsargs['convertEntities'] = 'html' | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
75 if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
76 if 'features' not in bsargs: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
77 bsargs['features'] = 'html.parser' # use Python html parser | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
78 tree = beautifulsoup(source, **bsargs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
79 root = _convert_tree(tree, makeelement) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
80 # from ET: wrap the document in a html root element, if necessary | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
81 if len(root) == 1 and root[0].tag == "html": | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
82 return root[0] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
83 root.tag = "html" | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
84 return root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
85 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
86 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
87 _parse_doctype_declaration = re.compile( | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
88 r'(?:\s|[<!])*DOCTYPE\s*HTML' | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
89 r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?' | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
90 r'(?:\s+(\'[^\']*\'|"[^"]*"))?', | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
91 re.IGNORECASE).match | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
92 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
93 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
94 class _PseudoTag: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
95 # Minimal imitation of BeautifulSoup.Tag | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
96 def __init__(self, contents): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
97 self.name = 'html' | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
98 self.attrs = [] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
99 self.contents = contents | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
100 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
101 def __iter__(self): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
102 return self.contents.__iter__() | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
103 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
104 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
105 def _convert_tree(beautiful_soup_tree, makeelement): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
106 if makeelement is None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
107 makeelement = html.html_parser.makeelement | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
108 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
109 # Split the tree into three parts: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
110 # i) everything before the root element: document type | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
111 # declaration, comments, processing instructions, whitespace | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
112 # ii) the root(s), | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
113 # iii) everything after the root: comments, processing | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
114 # instructions, whitespace | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
115 first_element_idx = last_element_idx = None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
116 html_root = declaration = None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
117 for i, e in enumerate(beautiful_soup_tree): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
118 if isinstance(e, Tag): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
119 if first_element_idx is None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
120 first_element_idx = i | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
121 last_element_idx = i | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
122 if html_root is None and e.name and e.name.lower() == 'html': | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
123 html_root = e | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
124 elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
125 declaration = e | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
126 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
127 # For a nice, well-formatted document, the variable roots below is | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
128 # a list consisting of a single <html> element. However, the document | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
129 # may be a soup like '<meta><head><title>Hello</head><body>Hi | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
130 # all<\p>'. In this example roots is a list containing meta, head | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
131 # and body elements. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
132 if first_element_idx is None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
133 pre_root = post_root = [] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
134 roots = beautiful_soup_tree.contents | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
135 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
136 pre_root = beautiful_soup_tree.contents[:first_element_idx] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
137 roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
138 post_root = beautiful_soup_tree.contents[last_element_idx+1:] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
139 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
140 # Reorganize so that there is one <html> root... | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
141 if html_root is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
142 # ... use existing one if possible, ... | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
143 i = roots.index(html_root) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
144 html_root.contents = roots[:i] + html_root.contents + roots[i+1:] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
145 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
146 # ... otherwise create a new one. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
147 html_root = _PseudoTag(roots) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
148 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
149 convert_node = _init_node_converters(makeelement) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
150 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
151 # Process pre_root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
152 res_root = convert_node(html_root) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
153 prev = res_root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
154 for e in reversed(pre_root): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
155 converted = convert_node(e) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
156 if converted is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
157 prev.addprevious(converted) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
158 prev = converted | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
159 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
160 # ditto for post_root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
161 prev = res_root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
162 for e in post_root: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
163 converted = convert_node(e) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
164 if converted is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
165 prev.addnext(converted) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
166 prev = converted | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
167 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
168 if declaration is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
169 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
170 # bs4 provides full Doctype string | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
171 doctype_string = declaration.output_ready() | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
172 except AttributeError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
173 doctype_string = declaration.string | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
174 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
175 match = _parse_doctype_declaration(doctype_string) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
176 if not match: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
177 # Something is wrong if we end up in here. Since soupparser should | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
178 # tolerate errors, do not raise Exception, just let it pass. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
179 pass | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
180 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
181 external_id, sys_uri = match.groups() | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
182 docinfo = res_root.getroottree().docinfo | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
183 # strip quotes and update DOCTYPE values (any of None, '', '...') | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
184 docinfo.public_id = external_id and external_id[1:-1] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
185 docinfo.system_url = sys_uri and sys_uri[1:-1] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
186 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
187 return res_root | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
188 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
189 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
190 def _init_node_converters(makeelement): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
191 converters = {} | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
192 ordered_node_types = [] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
193 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
194 def converter(*types): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
195 def add(handler): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
196 for t in types: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
197 converters[t] = handler | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
198 ordered_node_types.append(t) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
199 return handler | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
200 return add | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
201 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
202 def find_best_converter(node): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
203 for t in ordered_node_types: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
204 if isinstance(node, t): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
205 return converters[t] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
206 return None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
207 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
208 def convert_node(bs_node, parent=None): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
209 # duplicated in convert_tag() below | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
210 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
211 handler = converters[type(bs_node)] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
212 except KeyError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
213 handler = converters[type(bs_node)] = find_best_converter(bs_node) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
214 if handler is None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
215 return None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
216 return handler(bs_node, parent) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
217 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
218 def map_attrs(bs_attrs): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
219 if isinstance(bs_attrs, dict): # bs4 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
220 attribs = {} | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
221 for k, v in bs_attrs.items(): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
222 if isinstance(v, list): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
223 v = " ".join(v) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
224 attribs[k] = unescape(v) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
225 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
226 attribs = dict((k, unescape(v)) for k, v in bs_attrs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
227 return attribs | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
228 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
229 def append_text(parent, text): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
230 if len(parent) == 0: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
231 parent.text = (parent.text or '') + text | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
232 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
233 parent[-1].tail = (parent[-1].tail or '') + text | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
234 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
235 # converters are tried in order of their definition | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
236 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
237 @converter(Tag, _PseudoTag) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
238 def convert_tag(bs_node, parent): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
239 attrs = bs_node.attrs | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
240 if parent is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
241 attribs = map_attrs(attrs) if attrs else None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
242 res = etree.SubElement(parent, bs_node.name, attrib=attribs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
243 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
244 attribs = map_attrs(attrs) if attrs else {} | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
245 res = makeelement(bs_node.name, attrib=attribs) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
246 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
247 for child in bs_node: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
248 # avoid double recursion by inlining convert_node(), see above | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
249 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
250 handler = converters[type(child)] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
251 except KeyError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
252 pass | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
253 else: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
254 if handler is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
255 handler(child, res) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
256 continue | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
257 convert_node(child, res) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
258 return res | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
259 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
260 @converter(Comment) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
261 def convert_comment(bs_node, parent): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
262 res = html.HtmlComment(bs_node) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
263 if parent is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
264 parent.append(res) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
265 return res | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
266 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
267 @converter(ProcessingInstruction) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
268 def convert_pi(bs_node, parent): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
269 if bs_node.endswith('?'): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
270 # The PI is of XML style (<?as df?>) but BeautifulSoup | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
271 # interpreted it as being SGML style (<?as df>). Fix. | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
272 bs_node = bs_node[:-1] | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
273 res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
274 if parent is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
275 parent.append(res) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
276 return res | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
277 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
278 @converter(NavigableString) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
279 def convert_text(bs_node, parent): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
280 if parent is not None: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
281 append_text(parent, unescape(bs_node)) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
282 return None | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
283 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
284 return convert_node | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
285 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
286 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
287 # copied from ET's ElementSoup | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
288 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
289 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
290 from html.entities import name2codepoint # Python 3 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
291 except ImportError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
292 from htmlentitydefs import name2codepoint | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
293 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
294 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
295 handle_entities = re.compile(r"&(\w+);").sub | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
296 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
297 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
298 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
299 unichr | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
300 except NameError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
301 # Python 3 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
302 unichr = chr | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
303 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
304 | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
305 def unescape(string): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
306 if not string: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
307 return '' | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
308 # work around oddities in BeautifulSoup's entity handling | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
309 def unescape_entity(m): | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
310 try: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
311 return unichr(name2codepoint[m.group(1)]) | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
312 except KeyError: | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
313 return m.group(0) # use as is | 
| 
 
56ad4e20f292
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 
guerler 
parents:  
diff
changeset
 | 
314 return handle_entities(unescape_entity, string) | 
