Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/boto/cloudsearch/document.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:18:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d30785e31577 |
---|---|
1 # Copyright (c) 2012 Mitch Garnaat http://garnaat.org/ | |
2 # Copyright (c) 2012 Amazon.com, Inc. or its affiliates. | |
3 # All Rights Reserved | |
4 # | |
5 # Permission is hereby granted, free of charge, to any person obtaining a | |
6 # copy of this software and associated documentation files (the | |
7 # "Software"), to deal in the Software without restriction, including | |
8 # without limitation the rights to use, copy, modify, merge, publish, dis- | |
9 # tribute, sublicense, and/or sell copies of the Software, and to permit | |
10 # persons to whom the Software is furnished to do so, subject to the fol- | |
11 # lowing conditions: | |
12 # | |
13 # The above copyright notice and this permission notice shall be included | |
14 # in all copies or substantial portions of the Software. | |
15 # | |
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
17 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- | |
18 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT | |
19 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
20 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
22 # IN THE SOFTWARE. | |
23 # | |
24 | |
25 import boto.exception | |
26 from boto.compat import json | |
27 import requests | |
28 import boto | |
29 | |
30 class SearchServiceException(Exception): | |
31 pass | |
32 | |
33 | |
34 class CommitMismatchError(Exception): | |
35 pass | |
36 | |
37 class EncodingError(Exception): | |
38 """ | |
39 Content sent for Cloud Search indexing was incorrectly encoded. | |
40 | |
41 This usually happens when a document is marked as unicode but non-unicode | |
42 characters are present. | |
43 """ | |
44 pass | |
45 | |
46 class ContentTooLongError(Exception): | |
47 """ | |
48 Content sent for Cloud Search indexing was too long | |
49 | |
50 This will usually happen when documents queued for indexing add up to more | |
51 than the limit allowed per upload batch (5MB) | |
52 | |
53 """ | |
54 pass | |
55 | |
56 class DocumentServiceConnection(object): | |
57 """ | |
58 A CloudSearch document service. | |
59 | |
60 The DocumentServiceConection is used to add, remove and update documents in | |
61 CloudSearch. Commands are uploaded to CloudSearch in SDF (Search Document Format). | |
62 | |
63 To generate an appropriate SDF, use :func:`add` to add or update documents, | |
64 as well as :func:`delete` to remove documents. | |
65 | |
66 Once the set of documents is ready to be index, use :func:`commit` to send the | |
67 commands to CloudSearch. | |
68 | |
69 If there are a lot of documents to index, it may be preferable to split the | |
70 generation of SDF data and the actual uploading into CloudSearch. Retrieve | |
71 the current SDF with :func:`get_sdf`. If this file is the uploaded into S3, | |
72 it can be retrieved back afterwards for upload into CloudSearch using | |
73 :func:`add_sdf_from_s3`. | |
74 | |
75 The SDF is not cleared after a :func:`commit`. If you wish to continue | |
76 using the DocumentServiceConnection for another batch upload of commands, | |
77 you will need to :func:`clear_sdf` first to stop the previous batch of | |
78 commands from being uploaded again. | |
79 | |
80 """ | |
81 | |
82 def __init__(self, domain=None, endpoint=None): | |
83 self.domain = domain | |
84 self.endpoint = endpoint | |
85 if not self.endpoint: | |
86 self.endpoint = domain.doc_service_endpoint | |
87 self.documents_batch = [] | |
88 self._sdf = None | |
89 | |
90 def add(self, _id, version, fields, lang='en'): | |
91 """ | |
92 Add a document to be processed by the DocumentService | |
93 | |
94 The document will not actually be added until :func:`commit` is called | |
95 | |
96 :type _id: string | |
97 :param _id: A unique ID used to refer to this document. | |
98 | |
99 :type version: int | |
100 :param version: Version of the document being indexed. If a file is | |
101 being reindexed, the version should be higher than the existing one | |
102 in CloudSearch. | |
103 | |
104 :type fields: dict | |
105 :param fields: A dictionary of key-value pairs to be uploaded . | |
106 | |
107 :type lang: string | |
108 :param lang: The language code the data is in. Only 'en' is currently | |
109 supported | |
110 """ | |
111 | |
112 d = {'type': 'add', 'id': _id, 'version': version, 'lang': lang, | |
113 'fields': fields} | |
114 self.documents_batch.append(d) | |
115 | |
116 def delete(self, _id, version): | |
117 """ | |
118 Schedule a document to be removed from the CloudSearch service | |
119 | |
120 The document will not actually be scheduled for removal until :func:`commit` is called | |
121 | |
122 :type _id: string | |
123 :param _id: The unique ID of this document. | |
124 | |
125 :type version: int | |
126 :param version: Version of the document to remove. The delete will only | |
127 occur if this version number is higher than the version currently | |
128 in the index. | |
129 """ | |
130 | |
131 d = {'type': 'delete', 'id': _id, 'version': version} | |
132 self.documents_batch.append(d) | |
133 | |
134 def get_sdf(self): | |
135 """ | |
136 Generate the working set of documents in Search Data Format (SDF) | |
137 | |
138 :rtype: string | |
139 :returns: JSON-formatted string of the documents in SDF | |
140 """ | |
141 | |
142 return self._sdf if self._sdf else json.dumps(self.documents_batch) | |
143 | |
144 def clear_sdf(self): | |
145 """ | |
146 Clear the working documents from this DocumentServiceConnection | |
147 | |
148 This should be used after :func:`commit` if the connection will be reused | |
149 for another set of documents. | |
150 """ | |
151 | |
152 self._sdf = None | |
153 self.documents_batch = [] | |
154 | |
155 def add_sdf_from_s3(self, key_obj): | |
156 """ | |
157 Load an SDF from S3 | |
158 | |
159 Using this method will result in documents added through | |
160 :func:`add` and :func:`delete` being ignored. | |
161 | |
162 :type key_obj: :class:`boto.s3.key.Key` | |
163 :param key_obj: An S3 key which contains an SDF | |
164 """ | |
165 #@todo:: (lucas) would be nice if this could just take an s3://uri..." | |
166 | |
167 self._sdf = key_obj.get_contents_as_string() | |
168 | |
169 def commit(self): | |
170 """ | |
171 Actually send an SDF to CloudSearch for processing | |
172 | |
173 If an SDF file has been explicitly loaded it will be used. Otherwise, | |
174 documents added through :func:`add` and :func:`delete` will be used. | |
175 | |
176 :rtype: :class:`CommitResponse` | |
177 :returns: A summary of documents added and deleted | |
178 """ | |
179 | |
180 sdf = self.get_sdf() | |
181 | |
182 if ': null' in sdf: | |
183 boto.log.error('null value in sdf detected. This will probably raise ' | |
184 '500 error.') | |
185 index = sdf.index(': null') | |
186 boto.log.error(sdf[index - 100:index + 100]) | |
187 | |
188 url = "http://%s/2011-02-01/documents/batch" % (self.endpoint) | |
189 | |
190 # Keep-alive is automatic in a post-1.0 requests world. | |
191 session = requests.Session() | |
192 adapter = requests.adapters.HTTPAdapter( | |
193 pool_connections=20, | |
194 pool_maxsize=50, | |
195 max_retries=5 | |
196 ) | |
197 session.mount('http://', adapter) | |
198 session.mount('https://', adapter) | |
199 r = session.post(url, data=sdf, headers={'Content-Type': 'application/json'}) | |
200 | |
201 return CommitResponse(r, self, sdf) | |
202 | |
203 | |
204 class CommitResponse(object): | |
205 """Wrapper for response to Cloudsearch document batch commit. | |
206 | |
207 :type response: :class:`requests.models.Response` | |
208 :param response: Response from Cloudsearch /documents/batch API | |
209 | |
210 :type doc_service: :class:`boto.cloudsearch.document.DocumentServiceConnection` | |
211 :param doc_service: Object containing the documents posted and methods to | |
212 retry | |
213 | |
214 :raises: :class:`boto.exception.BotoServerError` | |
215 :raises: :class:`boto.cloudsearch.document.SearchServiceException` | |
216 :raises: :class:`boto.cloudsearch.document.EncodingError` | |
217 :raises: :class:`boto.cloudsearch.document.ContentTooLongError` | |
218 """ | |
219 def __init__(self, response, doc_service, sdf): | |
220 self.response = response | |
221 self.doc_service = doc_service | |
222 self.sdf = sdf | |
223 | |
224 _body = response.content.decode('utf-8') | |
225 | |
226 try: | |
227 self.content = json.loads(_body) | |
228 except: | |
229 boto.log.error('Error indexing documents.\nResponse Content:\n{0}\n\n' | |
230 'SDF:\n{1}'.format(_body, self.sdf)) | |
231 raise boto.exception.BotoServerError(self.response.status_code, '', | |
232 body=_body) | |
233 | |
234 self.status = self.content['status'] | |
235 if self.status == 'error': | |
236 self.errors = [e.get('message') for e in self.content.get('errors', | |
237 [])] | |
238 for e in self.errors: | |
239 if "Illegal Unicode character" in e: | |
240 raise EncodingError("Illegal Unicode character in document") | |
241 elif e == "The Content-Length is too long": | |
242 raise ContentTooLongError("Content was too long") | |
243 if 'adds' not in self.content or 'deletes' not in self.content: | |
244 raise SearchServiceException("Error indexing documents" | |
245 " => %s" % self.content.get('message', '')) | |
246 else: | |
247 self.errors = [] | |
248 | |
249 self.adds = self.content['adds'] | |
250 self.deletes = self.content['deletes'] | |
251 self._check_num_ops('add', self.adds) | |
252 self._check_num_ops('delete', self.deletes) | |
253 | |
254 def _check_num_ops(self, type_, response_num): | |
255 """Raise exception if number of ops in response doesn't match commit | |
256 | |
257 :type type_: str | |
258 :param type_: Type of commit operation: 'add' or 'delete' | |
259 | |
260 :type response_num: int | |
261 :param response_num: Number of adds or deletes in the response. | |
262 | |
263 :raises: :class:`boto.cloudsearch.document.CommitMismatchError` | |
264 """ | |
265 commit_num = len([d for d in self.doc_service.documents_batch | |
266 if d['type'] == type_]) | |
267 | |
268 if response_num != commit_num: | |
269 raise CommitMismatchError( | |
270 'Incorrect number of {0}s returned. Commit: {1} Response: {2}'\ | |
271 .format(type_, commit_num, response_num)) |