annotate planemo/lib/python3.7/site-packages/boto/cloudsearch/document.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
1 # Copyright (c) 2012 Mitch Garnaat http://garnaat.org/
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
2 # Copyright (c) 2012 Amazon.com, Inc. or its affiliates.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
3 # All Rights Reserved
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
4 #
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
5 # Permission is hereby granted, free of charge, to any person obtaining a
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
6 # copy of this software and associated documentation files (the
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
7 # "Software"), to deal in the Software without restriction, including
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
8 # without limitation the rights to use, copy, modify, merge, publish, dis-
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
9 # tribute, sublicense, and/or sell copies of the Software, and to permit
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
10 # persons to whom the Software is furnished to do so, subject to the fol-
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
11 # lowing conditions:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
12 #
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
13 # The above copyright notice and this permission notice shall be included
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
14 # in all copies or substantial portions of the Software.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
15 #
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
17 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
18 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
19 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
20 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
21 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
22 # IN THE SOFTWARE.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
23 #
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
24
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
25 import boto.exception
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
26 from boto.compat import json
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
27 import requests
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
28 import boto
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
29
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
30 class SearchServiceException(Exception):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
31 pass
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
32
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
33
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
34 class CommitMismatchError(Exception):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
35 pass
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
36
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
37 class EncodingError(Exception):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
38 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
39 Content sent for Cloud Search indexing was incorrectly encoded.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
40
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
41 This usually happens when a document is marked as unicode but non-unicode
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
42 characters are present.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
43 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
44 pass
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
45
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
46 class ContentTooLongError(Exception):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
47 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
48 Content sent for Cloud Search indexing was too long
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
49
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
50 This will usually happen when documents queued for indexing add up to more
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
51 than the limit allowed per upload batch (5MB)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
52
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
53 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
54 pass
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
55
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
56 class DocumentServiceConnection(object):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
57 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
58 A CloudSearch document service.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
59
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
60 The DocumentServiceConection is used to add, remove and update documents in
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
61 CloudSearch. Commands are uploaded to CloudSearch in SDF (Search Document Format).
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
62
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
63 To generate an appropriate SDF, use :func:`add` to add or update documents,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
64 as well as :func:`delete` to remove documents.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
65
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
66 Once the set of documents is ready to be index, use :func:`commit` to send the
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
67 commands to CloudSearch.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
68
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
69 If there are a lot of documents to index, it may be preferable to split the
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
70 generation of SDF data and the actual uploading into CloudSearch. Retrieve
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
71 the current SDF with :func:`get_sdf`. If this file is the uploaded into S3,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
72 it can be retrieved back afterwards for upload into CloudSearch using
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
73 :func:`add_sdf_from_s3`.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
74
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
75 The SDF is not cleared after a :func:`commit`. If you wish to continue
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
76 using the DocumentServiceConnection for another batch upload of commands,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
77 you will need to :func:`clear_sdf` first to stop the previous batch of
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
78 commands from being uploaded again.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
79
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
80 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
81
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
82 def __init__(self, domain=None, endpoint=None):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
83 self.domain = domain
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
84 self.endpoint = endpoint
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
85 if not self.endpoint:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
86 self.endpoint = domain.doc_service_endpoint
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
87 self.documents_batch = []
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
88 self._sdf = None
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
89
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
90 def add(self, _id, version, fields, lang='en'):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
91 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
92 Add a document to be processed by the DocumentService
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
93
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
94 The document will not actually be added until :func:`commit` is called
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
95
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
96 :type _id: string
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
97 :param _id: A unique ID used to refer to this document.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
98
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
99 :type version: int
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
100 :param version: Version of the document being indexed. If a file is
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
101 being reindexed, the version should be higher than the existing one
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
102 in CloudSearch.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
103
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
104 :type fields: dict
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
105 :param fields: A dictionary of key-value pairs to be uploaded .
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
106
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
107 :type lang: string
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
108 :param lang: The language code the data is in. Only 'en' is currently
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
109 supported
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
110 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
111
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
112 d = {'type': 'add', 'id': _id, 'version': version, 'lang': lang,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
113 'fields': fields}
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
114 self.documents_batch.append(d)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
115
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
116 def delete(self, _id, version):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
117 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
118 Schedule a document to be removed from the CloudSearch service
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
119
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
120 The document will not actually be scheduled for removal until :func:`commit` is called
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
121
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
122 :type _id: string
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
123 :param _id: The unique ID of this document.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
124
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
125 :type version: int
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
126 :param version: Version of the document to remove. The delete will only
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
127 occur if this version number is higher than the version currently
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
128 in the index.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
129 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
130
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
131 d = {'type': 'delete', 'id': _id, 'version': version}
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
132 self.documents_batch.append(d)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
133
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
134 def get_sdf(self):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
135 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
136 Generate the working set of documents in Search Data Format (SDF)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
137
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
138 :rtype: string
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
139 :returns: JSON-formatted string of the documents in SDF
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
140 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
141
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
142 return self._sdf if self._sdf else json.dumps(self.documents_batch)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
143
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
144 def clear_sdf(self):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
145 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
146 Clear the working documents from this DocumentServiceConnection
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
147
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
148 This should be used after :func:`commit` if the connection will be reused
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
149 for another set of documents.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
150 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
151
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
152 self._sdf = None
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
153 self.documents_batch = []
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
154
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
155 def add_sdf_from_s3(self, key_obj):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
156 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
157 Load an SDF from S3
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
158
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
159 Using this method will result in documents added through
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
160 :func:`add` and :func:`delete` being ignored.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
161
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
162 :type key_obj: :class:`boto.s3.key.Key`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
163 :param key_obj: An S3 key which contains an SDF
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
164 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
165 #@todo:: (lucas) would be nice if this could just take an s3://uri..."
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
166
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
167 self._sdf = key_obj.get_contents_as_string()
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
168
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
169 def commit(self):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
170 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
171 Actually send an SDF to CloudSearch for processing
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
172
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
173 If an SDF file has been explicitly loaded it will be used. Otherwise,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
174 documents added through :func:`add` and :func:`delete` will be used.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
175
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
176 :rtype: :class:`CommitResponse`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
177 :returns: A summary of documents added and deleted
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
178 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
179
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
180 sdf = self.get_sdf()
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
181
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
182 if ': null' in sdf:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
183 boto.log.error('null value in sdf detected. This will probably raise '
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
184 '500 error.')
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
185 index = sdf.index(': null')
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
186 boto.log.error(sdf[index - 100:index + 100])
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
187
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
188 url = "http://%s/2011-02-01/documents/batch" % (self.endpoint)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
189
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
190 # Keep-alive is automatic in a post-1.0 requests world.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
191 session = requests.Session()
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
192 adapter = requests.adapters.HTTPAdapter(
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
193 pool_connections=20,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
194 pool_maxsize=50,
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
195 max_retries=5
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
196 )
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
197 session.mount('http://', adapter)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
198 session.mount('https://', adapter)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
199 r = session.post(url, data=sdf, headers={'Content-Type': 'application/json'})
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
200
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
201 return CommitResponse(r, self, sdf)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
202
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
203
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
204 class CommitResponse(object):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
205 """Wrapper for response to Cloudsearch document batch commit.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
206
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
207 :type response: :class:`requests.models.Response`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
208 :param response: Response from Cloudsearch /documents/batch API
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
209
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
210 :type doc_service: :class:`boto.cloudsearch.document.DocumentServiceConnection`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
211 :param doc_service: Object containing the documents posted and methods to
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
212 retry
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
213
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
214 :raises: :class:`boto.exception.BotoServerError`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
215 :raises: :class:`boto.cloudsearch.document.SearchServiceException`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
216 :raises: :class:`boto.cloudsearch.document.EncodingError`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
217 :raises: :class:`boto.cloudsearch.document.ContentTooLongError`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
218 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
219 def __init__(self, response, doc_service, sdf):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
220 self.response = response
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
221 self.doc_service = doc_service
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
222 self.sdf = sdf
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
223
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
224 _body = response.content.decode('utf-8')
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
225
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
226 try:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
227 self.content = json.loads(_body)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
228 except:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
229 boto.log.error('Error indexing documents.\nResponse Content:\n{0}\n\n'
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
230 'SDF:\n{1}'.format(_body, self.sdf))
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
231 raise boto.exception.BotoServerError(self.response.status_code, '',
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
232 body=_body)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
233
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
234 self.status = self.content['status']
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
235 if self.status == 'error':
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
236 self.errors = [e.get('message') for e in self.content.get('errors',
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
237 [])]
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
238 for e in self.errors:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
239 if "Illegal Unicode character" in e:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
240 raise EncodingError("Illegal Unicode character in document")
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
241 elif e == "The Content-Length is too long":
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
242 raise ContentTooLongError("Content was too long")
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
243 if 'adds' not in self.content or 'deletes' not in self.content:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
244 raise SearchServiceException("Error indexing documents"
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
245 " => %s" % self.content.get('message', ''))
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
246 else:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
247 self.errors = []
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
248
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
249 self.adds = self.content['adds']
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
250 self.deletes = self.content['deletes']
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
251 self._check_num_ops('add', self.adds)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
252 self._check_num_ops('delete', self.deletes)
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
253
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
254 def _check_num_ops(self, type_, response_num):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
255 """Raise exception if number of ops in response doesn't match commit
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
256
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
257 :type type_: str
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
258 :param type_: Type of commit operation: 'add' or 'delete'
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
259
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
260 :type response_num: int
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
261 :param response_num: Number of adds or deletes in the response.
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
262
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
263 :raises: :class:`boto.cloudsearch.document.CommitMismatchError`
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
264 """
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
265 commit_num = len([d for d in self.doc_service.documents_batch
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
266 if d['type'] == type_])
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
267
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
268 if response_num != commit_num:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
269 raise CommitMismatchError(
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
270 'Incorrect number of {0}s returned. Commit: {1} Response: {2}'\
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
271 .format(type_, commit_num, response_num))