Run really large queries as a series of smaller ones
author Jonathan Griffin <jgriffin@mozilla.com>
Thu, 07 Jul 2011 16:31:50 -0700
changeset 38 cd11afd7e9bfaf6499d0d7770ccfde3d5d54ff70
parent 37 d04bff5fca62eb243f0d12ea0564356e0af06c29
child 39 07a51ccc9cd10de9c4ebed949ef294057a602813
push id 39
push user jgriffin@mozilla.com
push date Thu, 07 Jul 2011 23:32:01 +0000
Run really large queries as a series of smaller ones
mozautoeslib/eslib.py
--- a/mozautoeslib/eslib.py
+++ b/mozautoeslib/eslib.py
@@ -125,18 +125,16 @@ class ESLib(object):
                                     doc_types=self.doc_type)
 
     if result and result['hits'] and result['hits']['hits']:
       # partially flatten the data
       for hit in result['hits']['hits']:
         if not '_source' in hit:
           raise Exception("Key ['_source'] not found in response hit")
         resultlist.append(hit['_source'])
-    else:
-      raise Exception("Key ['hits']['hits'] not found in response data")
 
     return resultlist
 
   def query(self, include={}, exclude={}, size=None, doc_type=None, sort=None,
             withSource=False):
     """Return a list of hits which match all the fields in 'include',
        but none of the fields in 'exclude', up to a maximum of 'size' hits,
        or all hits when 'size' is None.
@@ -171,33 +169,37 @@ class ESLib(object):
       if not 'count' in count:
         raise Exception("Key ['count'] not found in count response data")
       query_size = count['count']
 
     # there's no data to return, so don't bother searching
     if query_size == 0:
       return []
 
-    q = Search(query=boolquery, sort=sort, size=query_size)
-    result = self.connection.search(query=q,
-                                    indexes=[self.read_index],
-                                    doc_types=self.doc_type)
-    #print json.dumps(result, indent=2)
+    chunk_size = 2000
+    for x in range(0,(query_size-1)/chunk_size + 1):
+      start = x * chunk_size
+      this_size = query_size - x*chunk_size if query_size - x*chunk_size < chunk_size else chunk_size
+      if this_size > 0:
+        q = Search(query=boolquery, sort=sort, size=this_size, start=start)
+        result = self.connection.search(query=q,
+                                        indexes=[self.read_index],
+                                        doc_types=self.doc_type)
 
-    if result and result['hits'] and result['hits']['hits']:
-      # partially flatten the data
-      for hit in result['hits']['hits']:
-        if withSource:
-          resultlist.append(hit)
+        if result and result['hits'] and result['hits']['hits']:
+          # partially flatten the data
+          for hit in result['hits']['hits']:
+            if withSource:
+              resultlist.append(hit)
+            else:
+              if not '_source' in hit:
+                raise Exception("Key ['_source'] not found in response hit")
+              resultlist.append(hit['_source'])
         else:
-          if not '_source' in hit:
-            raise Exception("Key ['_source'] not found in response hit")
-          resultlist.append(hit['_source'])
-    else:
-      raise Exception("Key ['hits']['hits'] not found in response data")
+          raise Exception("Key ['hits']['hits'] not found in response data")
 
     return resultlist
 
   def aggregates(self, include={}, exclude={}, aggregate_by={}, doc_type=None):
     """Return a count of hits that match all possible combinations of fields
        in aggregate_by.
 
        Example: