comparison planemo/lib/python3.7/site-packages/boto/cloudsearch2/search.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 # Copyright (c) 2014 Amazon.com, Inc. or its affiliates.
2 # All Rights Reserved
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the
6 # "Software"), to deal in the Software without restriction, including
7 # without limitation the rights to use, copy, modify, merge, publish, dis-
8 # tribute, sublicense, and/or sell copies of the Software, and to permit
9 # persons to whom the Software is furnished to do so, subject to the fol-
10 # lowing conditions:
11 #
12 # The above copyright notice and this permission notice shall be included
13 # in all copies or substantial portions of the Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
17 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
18 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 from math import ceil
24 from boto.compat import json, map, six
25 import requests
26 from boto.cloudsearchdomain.layer1 import CloudSearchDomainConnection
27
# Names of the query parsers that can be passed as the ``parser`` argument
# of a search ('simple', 'structured', 'lucene' or 'dismax').
SIMPLE = 'simple'
STRUCTURED = 'structured'
LUCENE = 'lucene'
DISMAX = 'dismax'
32
33
class SearchServiceException(Exception):
    """Error raised when a search request cannot be completed.

    Raised for authentication failures, non-JSON responses and error
    payloads returned by the search endpoint.
    """
36
37
class SearchResults(object):
    """A single page of results produced by a search call.

    The keyword arguments are the decoded response (``status``, ``hits``
    and, optionally, ``facets``) together with the ``query`` that produced
    it and the ``search_service`` callable used to request further pages.
    """

    def __init__(self, **attrs):
        status = attrs['status']
        self.rid = status['rid']
        self.time_ms = status['time-ms']

        hit_info = attrs['hits']
        self.hits = hit_info['found']
        self.docs = hit_info['hit']
        self.start = hit_info['start']

        self.query = attrs['query']
        self.search_service = attrs['search_service']

        # Keep only facets that report buckets, flattened to {value: count}.
        self.facets = {}
        for name, values in attrs.get('facets', {}).items():
            if 'buckets' in values:
                self.facets[name] = {
                    bucket['value']: bucket['count']
                    for bucket in values['buckets']
                }

        # Total hits divided by the effective page size, rounded up.
        self.num_pages_needed = ceil(self.hits / self.query.real_size)

    def __len__(self):
        """Return the number of documents on this page."""
        return len(self.docs)

    def __iter__(self):
        """Iterate over the documents on this page."""
        return iter(self.docs)

    def next_page(self):
        """Call Cloudsearch to get the next page of search results

        Advances the shared query's ``start`` and ``page`` in place before
        re-issuing it through ``search_service``.

        :rtype: :class:`boto.cloudsearch2.search.SearchResults`
        :return: the following page of search results
        :raises StopIteration: when the query's page counter has passed
            ``num_pages_needed``
        """
        current = self.query
        if current.page > self.num_pages_needed:
            raise StopIteration

        current.start += current.real_size
        current.page += 1
        return self.search_service(current)
74
75
class Query(object):
    """Holds the criteria of one search and converts them to request params."""

    # Largest page size ever requested; also used when ``size`` is 0.
    RESULTS_PER_PAGE = 500

    def __init__(self, q=None, parser=None, fq=None, expr=None,
                 return_fields=None, size=10, start=0, sort=None,
                 facet=None, highlight=None, partial=None, options=None):
        self.q = q
        self.parser = parser
        self.fq = fq
        self.options = options
        self.partial = partial
        self.start = start
        self.page = 0

        # Collection-valued criteria fall back to a fresh empty container.
        self.expr = expr or {}
        self.facet = facet or {}
        self.highlight = highlight or {}
        self.sort = sort or []
        self.return_fields = return_fields or []

        self.update_size(size)

    def update_size(self, new_size):
        """Record the requested size and derive the effective page size.

        ``real_size`` is capped at ``RESULTS_PER_PAGE``; a requested size
        of 0 also maps to ``RESULTS_PER_PAGE``.
        """
        self.size = new_size
        limit = Query.RESULTS_PER_PAGE
        if new_size > limit or new_size == 0:
            self.real_size = limit
        else:
            self.real_size = new_size

    def to_params(self):
        """Transform search parameters from instance properties to a dictionary

        Expression, facet and highlight entries are flattened into
        ``expr.<name>``, ``facet.<name>`` and ``highlight.<name>`` keys.

        :rtype: dict
        :return: search parameters
        """
        params = {'start': self.start, 'size': self.real_size}

        simple_fields = (
            ('q', self.q),
            ('q.parser', self.parser),
            ('fq', self.fq),
        )
        for key, value in simple_fields:
            if value:
                params[key] = value

        if self.expr:
            for name, value in six.iteritems(self.expr):
                params['expr.%s' % name] = value

        if self.facet:
            for name, spec in six.iteritems(self.facet):
                # Non-string facet options are serialised to JSON.
                params['facet.%s' % name] = (
                    spec if isinstance(spec, six.string_types)
                    else json.dumps(spec)
                )

        if self.highlight:
            for name, value in six.iteritems(self.highlight):
                params['highlight.%s' % name] = value

        if self.options:
            params['q.options'] = self.options

        if self.return_fields:
            params['return'] = ','.join(self.return_fields)

        # ``partial`` is sent whenever it was given, including False.
        if self.partial is not None:
            params['partial'] = self.partial

        if self.sort:
            params['sort'] = ','.join(self.sort)

        return params

    def to_domain_connection_params(self):
        """
        Transform search parameters from instance properties to a dictionary
        that CloudSearchDomainConnection can accept

        Expression, facet and highlight entries are grouped into nested
        dictionaries under the ``expr``, ``facet`` and ``highlight`` keys.

        :rtype: dict
        :return: search parameters
        """
        params = {'start': self.start, 'size': self.real_size}

        simple_fields = (
            ('q', self.q),
            ('query_parser', self.parser),
            ('filter_query', self.fq),
        )
        for key, value in simple_fields:
            if value:
                params[key] = value

        if self.expr:
            params['expr'] = dict(
                ('expr.%s' % name, value)
                for name, value in six.iteritems(self.expr)
            )

        if self.facet:
            facet = {}
            for name, spec in six.iteritems(self.facet):
                # Non-string facet options are serialised to JSON.
                if not isinstance(spec, six.string_types):
                    spec = json.dumps(spec)
                facet['facet.%s' % name] = spec
            params['facet'] = facet

        if self.highlight:
            params['highlight'] = dict(
                ('highlight.%s' % name, value)
                for name, value in six.iteritems(self.highlight)
            )

        if self.options:
            params['query_options'] = self.options

        if self.return_fields:
            params['ret'] = ','.join(self.return_fields)

        # ``partial`` is sent whenever it was given, including False.
        if self.partial is not None:
            params['partial'] = self.partial

        if self.sort:
            params['sort'] = ','.join(self.sort)

        return params
203
204
class SearchConnection(object):
    """Callable client for the search endpoint of a CloudSearch domain.

    Requests are either sent unsigned through a ``requests`` session or,
    when the domain's layer1 connection has ``sign_request`` set, through a
    :class:`CloudSearchDomainConnection`.
    """

    def __init__(self, domain=None, endpoint=None):
        """
        :param domain: Domain object supplying ``search_service_endpoint``
            and ``layer1``; may be omitted when ``endpoint`` is given.

        :type endpoint: string
        :param endpoint: Host name of the search endpoint. Defaults to
            ``domain.search_service_endpoint``.
        """
        self.domain = domain
        self.endpoint = endpoint
        self.session = requests.Session()

        # Endpoint needs to be set before initializing CloudSearchDomainConnection
        if not endpoint:
            self.endpoint = domain.search_service_endpoint

        # Copy proxy settings from connection and check if request should be signed
        self.sign_request = False
        if self.domain and self.domain.layer1:
            if self.domain.layer1.use_proxy:
                self.session.proxies['http'] = self.domain.layer1.get_proxy_url_with_auth()

            self.sign_request = getattr(self.domain.layer1, 'sign_request', False)

            if self.sign_request:
                layer1 = self.domain.layer1
                self.domain_connection = CloudSearchDomainConnection(
                    host=self.endpoint,
                    aws_access_key_id=layer1.aws_access_key_id,
                    aws_secret_access_key=layer1.aws_secret_access_key,
                    region=layer1.region,
                    provider=layer1.provider
                )

    def build_query(self, q=None, parser=None, fq=None, rank=None, return_fields=None,
                    size=10, start=0, facet=None, highlight=None, sort=None,
                    partial=None, options=None):
        """Build a :class:`Query` from search arguments.

        Arguments have the same meaning as for :meth:`search`; ``rank`` is
        passed to the query as its ``expr`` argument.

        :rtype: :class:`boto.cloudsearch2.search.Query`
        :return: the assembled query
        """
        return Query(q=q, parser=parser, fq=fq, expr=rank, return_fields=return_fields,
                     size=size, start=start, facet=facet, highlight=highlight,
                     sort=sort, partial=partial, options=options)

    def search(self, q=None, parser=None, fq=None, rank=None, return_fields=None,
               size=10, start=0, facet=None, highlight=None, sort=None, partial=None,
               options=None):
        """
        Send a query to CloudSearch

        Each search query should use at least the q argument to specify
        the search parameter. The other options are used to specify the
        criteria of the search.

        :type q: string
        :param q: A string to search the default search fields for.

        :type parser: string
        :param parser: The parser to use. 'simple', 'structured', 'lucene', 'dismax'

        :type fq: string
        :param fq: The filter query to use.

        :type rank: dict
        :param rank: Dictionary of expressions, handed to the query as its
            ``expr`` argument.

        :type sort: List of strings
        :param sort: A list of fields or rank expressions used to order the
            search results. Order is handled by adding 'desc' or 'asc' after the field name.
            ``['year desc', 'author asc']``

        :type return_fields: List of strings
        :param return_fields: A list of fields which should be returned by the
            search. If this field is not specified, only IDs will be returned.
            ``['headline']``

        :type size: int
        :param size: Number of search results to specify

        :type start: int
        :param start: Offset of the first search result to return (can be used
            for paging)

        :type facet: dict
        :param facet: Dictionary of fields for which facets should be returned
            The facet value is string of JSON options
            ``{'year': '{sort:"bucket", size:3}', 'genres': '{buckets:["Action","Adventure","Sci-Fi"]}'}``

        :type highlight: dict
        :param highlight: Dictionary of fields for which highlights should be returned
            The highlight value is string of JSON options
            ``{'genres': '{format:'text',max_phrases:2,pre_tag:'<b>',post_tag:'</b>'}'}``

        :type partial: bool
        :param partial: Should partial results from a partitioned service be returned if
            one or more index partitions are unreachable.

        :type options: str
        :param options: Options for the query parser specified in *parser*.
            Specified as a string in JSON format.
            ``{fields: ['title^5', 'description']}``

        :rtype: :class:`boto.cloudsearch2.search.SearchResults`
        :return: Returns the results of this search

        The following examples all assume we have indexed a set of documents
        with fields: *author*, *date*, *headline*

        A simple search will look for documents whose default text search
        fields will contain the search word exactly:

        >>> search(q='Tim') # Return documents with the word Tim in them (but not Timothy)

        A simple search with more keywords will return documents whose default
        text search fields contain the search strings together or separately.

        >>> search(q='Tim apple') # Will match "tim" and "apple"

        More complex searches require the boolean search operator.

        Wildcard searches can be used to search for any words that start with
        the search string.

        >>> search(q="'Tim*'") # Return documents with words like Tim or Timothy

        Search terms can also be combined. Allowed operators are "and", "or",
        "not", "field", "optional", "token", "phrase", or "filter"

        >>> search(q="(and 'Tim' (field author 'John Smith'))", parser='structured')

        Facets allow you to show classification information about the search
        results. For example, you can retrieve the authors who have written
        about Tim with a max of 3

        >>> search(q='Tim', facet={'Author': '{sort:"bucket", size:3}'})
        """

        query = self.build_query(q=q, parser=parser, fq=fq, rank=rank,
                                 return_fields=return_fields,
                                 size=size, start=start, facet=facet,
                                 highlight=highlight, sort=sort,
                                 partial=partial, options=options)
        return self(query)

    def _search_with_auth(self, params):
        """Run a signed search through ``self.domain_connection``.

        Note that ``'q'`` is popped from ``params`` (the dict is mutated)
        and passed positionally; the remaining entries become keyword
        arguments.
        """
        return self.domain_connection.search(params.pop("q", ""), **params)

    def _search_without_auth(self, params, api_version):
        """Run an unsigned HTTP GET against the search endpoint.

        :rtype: dict
        :return: ``{'body': <response text decoded as UTF-8>,
            'status_code': <HTTP status>}``
        """
        url = "http://%s/%s/search" % (self.endpoint, api_version)
        resp = self.session.get(url, params=params)

        return {'body': resp.content.decode('utf-8'), 'status_code': resp.status_code}

    def __call__(self, query):
        """Make a call to CloudSearch

        :type query: :class:`boto.cloudsearch2.search.Query`
        :param query: A group of search criteria

        :rtype: :class:`boto.cloudsearch2.search.SearchResults`
        :return: search results

        :raises SearchServiceException: on a 403 or other non-JSON
            response, on a fatal message in the response, or on an
            ``error`` entry without messages
        """
        api_version = '2013-01-01'
        if self.domain and self.domain.layer1:
            api_version = self.domain.layer1.APIVersion

        if self.sign_request:
            params = query.to_domain_connection_params()
            # _search_with_auth pops 'q' from the dict it receives; give it
            # a copy so the error message below still shows every parameter.
            data = self._search_with_auth(dict(params))
        else:
            params = query.to_params()
            r = self._search_without_auth(params, api_version)

            _body = r['body']
            _status_code = r['status_code']

            try:
                data = json.loads(_body)
            except ValueError:
                if _status_code == 403:
                    import re
                    msg = ''
                    # Pull the explanatory text that follows the 403 heading,
                    # when the HTML error page has the expected shape.
                    g = re.search('<html><body><h1>403 Forbidden</h1>([^<]+)<', _body)
                    if g is not None:
                        msg = ': %s' % (g.groups()[0].strip())
                    raise SearchServiceException('Authentication error from Amazon%s' % msg)
                raise SearchServiceException("Got non-json response from Amazon. %s" % _body, query)

        if 'messages' in data and 'error' in data:
            for m in data['messages']:
                if m['severity'] == 'fatal':
                    # ``params`` is bound in both branches above; previously
                    # it was undefined here and this raise failed with
                    # NameError instead.
                    raise SearchServiceException("Error processing search %s "
                                                 "=> %s" % (params, m['message']), query)
        elif 'error' in data:
            raise SearchServiceException("Unknown error processing search %s"
                                         % json.dumps(data), query)

        data['query'] = query
        data['search_service'] = self

        return SearchResults(**data)

    def get_all_paged(self, query, per_page):
        """Get a generator to iterate over all pages of search results

        The query is modified in place: its size is set to ``per_page`` and
        its ``start`` is advanced after every page.

        :type query: :class:`boto.cloudsearch2.search.Query`
        :param query: A group of search criteria

        :type per_page: int
        :param per_page: Number of docs in each :class:`boto.cloudsearch2.search.SearchResults` object.

        :rtype: generator
        :return: Generator containing :class:`boto.cloudsearch2.search.SearchResults`
        """
        query.update_size(per_page)
        page = 0
        num_pages_needed = 0
        # NOTE(review): page starts at 0 and the test is ``<=``, so the loop
        # runs once more than num_pages_needed. Left unchanged; confirm
        # whether callers rely on the trailing request before tightening.
        while page <= num_pages_needed:
            results = self(query)
            num_pages_needed = results.num_pages_needed
            yield results
            query.start += query.real_size
            page += 1

    def get_all_hits(self, query):
        """Get a generator to iterate over all search results

        Transparently handles the results paging from Cloudsearch
        search results so even if you have many thousands of results
        you can iterate over all results in a reasonably efficient
        manner. The query's ``start`` is advanced in place after every page.

        :type query: :class:`boto.cloudsearch2.search.Query`
        :param query: A group of search criteria

        :rtype: generator
        :return: All docs matching query
        """
        page = 0
        num_pages_needed = 0
        # NOTE(review): same ``<=`` loop bound as get_all_paged; it issues
        # one request more than num_pages_needed. Left unchanged.
        while page <= num_pages_needed:
            results = self(query)
            num_pages_needed = results.num_pages_needed
            for doc in results:
                yield doc
            query.start += query.real_size
            page += 1

    def get_num_hits(self, query):
        """Return the total number of hits for query

        The query's size is set to 1 in place before it is executed.

        :type query: :class:`boto.cloudsearch2.search.Query`
        :param query: a group of search criteria

        :rtype: int
        :return: Total number of hits for query
        """
        query.update_size(1)
        return self(query).hits