bug 469051: make gloda indexer index folders in two passes, one to determine how many messages need indexing and a second to actually index them; r=standard8, sr=bienvenu
author: Myk Melez <myk@mozilla.org>
Mon, 04 May 2009 12:57:08 -0700
changeset 2550 7991bb9932ca527d4b2ea92bdf50b119b0f33341
parent 2549 2580d4920e207ab45bec9f24c36233eecc8b9b9c
child 2551 bf8d5258553b1f8c67f211318b1a1d92a0a07f50
push id: unknown
push user: unknown
push date: unknown
reviewers: standard8, bienvenu
bugs: 469051
bug 469051: make gloda indexer index folders in two passes, one to determine how many messages need indexing and a second to actually index them; r=standard8, sr=bienvenu
mailnews/db/gloda/modules/indexer.js
--- a/mailnews/db/gloda/modules/indexer.js
+++ b/mailnews/db/gloda/modules/indexer.js
@@ -60,16 +60,19 @@ Cu.import("resource://app/modules/gloda/
 Cu.import("resource://app/modules/gloda/collection.js");
 Cu.import("resource://app/modules/gloda/connotent.js");
 
 Cu.import("resource://app/modules/gloda/mimemsg.js");
 
 // Components.results does not have mailnews error codes!
 const NS_MSG_ERROR_FOLDER_SUMMARY_OUT_OF_DATE = 0x80550005;
 
+const GLODA_MESSAGE_ID_PROPERTY = "gloda-id";
+const GLODA_DIRTY_PROPERTY = "gloda-dirty";
+
 // for list comprehension fun
 function range(begin, end) {
   for (let i = begin; i < end; ++i) {
     yield i;
   }
 }
 
 const GFI = Log4Moz.repository.getLogger("gloda.fixiterator");
@@ -137,27 +140,25 @@ function fixIterator(aEnum, aIface) {
     }
     return { __iterator__: iter };
   } catch(ex) {}
 }
 
 function MakeCleanMsgHdrCallback(aMsgHdr, aGlodaMessageID) {
   return function() {
     // Mark this message as indexed
-    aMsgHdr.setUint32Property(GlodaIndexer.GLODA_MESSAGE_ID_PROPERTY,
-                              aGlodaMessageID);
+    aMsgHdr.setUint32Property(GLODA_MESSAGE_ID_PROPERTY, aGlodaMessageID);
     // If there is a gloda-dirty flag on there, clear it by writing a 0.  (But
     //  don't do this if we didn't have a dirty flag on there in the first
     //  case.)  It sounds like we would actually prefer to "cut" the "cell",
     //  but I don't see any in-domain means of doing that.
     try {
-      let isDirty = aMsgHdr.getUint32Property(
-        GlodaIndexer.GLODA_DIRTY_PROPERTY);
+      let isDirty = aMsgHdr.getUint32Property(GLODA_DIRTY_PROPERTY);
       if (isDirty)
-        aMsgHdr.setUint32Property(GlodaIndexer.GLODA_DIRTY_PROPERTY, 0);
+        aMsgHdr.setUint32Property(GLODA_DIRTY_PROPERTY, 0);
     }
     catch (ex) {}
   };
 }
 
 /**
  * @class Capture the indexing batch concept explicitly.
  *
@@ -763,18 +764,16 @@ var GlodaIndexer = {
    * Indicates that we have pending deletions to process, meaning that there
    *  are gloda message rows flagged for deletion.  If this value is a boolean,
    *  it means the value is known reliably.  If this value is null, it means
    *  that we don't know, likely because we have started up and have not checked
    *  the database.
    */
   pendingDeletions: null,
   
-  GLODA_MESSAGE_ID_PROPERTY: "gloda-id",
-  GLODA_DIRTY_PROPERTY: "gloda-dirty",
   /**
    * The message (or folder state) is believed up-to-date.
    */
   kMessageClean: 0,
   /**
    * The message (or folder) is known to not be up-to-date. In the case of
    *  folders, this means that some of the messages in the folder may be dirty.
    *  However, because of the way our indexing works, it is possible there may
@@ -1572,18 +1571,16 @@ var GlodaIndexer = {
    * Index the contents of a folder.
    */
   _worker_folderIndex: function gloda_worker_folderIndex(aJob) {
     yield this._indexerEnterFolder(aJob.id, true);
     
     if (!this.shouldIndexFolder(this._indexingFolder))
       yield this.kWorkDone;
     
-    aJob.goal = this._indexingFolder.getTotalMessages(false);
-    
     // there is of course a cost to all this header investigation even if we
     //  don't do something.  so we will yield with kWorkSync for every block. 
     const HEADER_CHECK_BLOCK_SIZE = 10;
     
     let isLocal = this._indexingFolder instanceof Ci.nsIMsgLocalMailFolder;
     // we can safely presume if we are here that this folder has been selected
     //  for offline processing...
 
@@ -1602,56 +1599,84 @@ var GlodaIndexer = {
     if (glodaFolder.dirtyStatus == glodaFolder.kFolderFilthy) {
       let count = 0;
       for (let msgHdr in this._indexingIterator) {
         // we still need to avoid locking up the UI, pause periodically...
         if (++count % HEADER_CHECK_BLOCK_SIZE == 0)
           yield this.kWorkSync;
         
         let glodaMessageId = msgHdr.getUint32Property(
-                             this.GLODA_MESSAGE_ID_PROPERTY);
+          GLODA_MESSAGE_ID_PROPERTY);
         // if it has a gloda message id, we need to mark it filthy
         if (glodaMessageId != 0)
-          msgHdr.setUint32Property(this.GLODA_DIRTY_PROPERTY,
-                                   this.kMessageFilthy);
+          msgHdr.setUint32Property(GLODA_DIRTY_PROPERTY, this.kMessageFilthy);
         // if it doesn't have a gloda message id, we will definitely index it,
         //  so no action is required.
       }
       // this will automatically persist to the database
       glodaFolder.dirtyStatus = glodaFolder.kFolderDirty;
       
       // We used up the iterator, get a new one.
       this._indexerGetIterator();
     }
 
+    // Whether or not the given message should be indexed.  Messages should
+    // be indexed if they're indexable (local or offline and not expunged)
+    // and either haven't been indexed or are dirty.
+    let shouldIndexMessage = function(msgHdr) {
+      if ((!isLocal &&
+           !(msgHdr.flags & Components.interfaces.nsMsgMessageFlags.Offline)) ||
+          (msgHdr.flags & Components.interfaces.nsMsgMessageFlags.Expunged))
+        return false;
+
+      // returns 0 when missing, which means this message hasn't been indexed
+      if (msgHdr.getUint32Property(GLODA_MESSAGE_ID_PROPERTY) == 0)
+        return true;
+
+      // returns 0 when missing, which means this message is clean
+      return (msgHdr.getUint32Property(GLODA_DIRTY_PROPERTY) != 0);
+    };
+
+    // Pass 1: count the number of messages to index.
+    //  We do this in order to be able to report to the user what we're doing.
+    // To avoid traversing the entire folder again in the second pass, we could
+    //  cache headers that need indexing here, which would work fine for sparse
+    //  indexing but might eat too much memory for dense indexing.  Perhaps we
+    //  could employ a hybrid approach where we cache up to a certain number
+    //  of headers before falling back to full traversal in the second pass.
+    // TODO: give up after reaching a certain number of messages in folders
+    //  with ridiculous numbers of messages and make the interface just say
+    //  something like "over N messages to go."
+    let count = 0;
+    let numMessagesToIndex = 0;
+    for (let msgHdr in this._indexingIterator) {
+      // we still need to avoid locking up the UI, pause periodically...
+      if (++count % HEADER_CHECK_BLOCK_SIZE == 0)
+        yield this.kWorkSync;
+
+      if (shouldIndexMessage(msgHdr))
+        ++numMessagesToIndex;
+    }
+
+    // We used up the iterator, get a new one.
+    this._indexerGetIterator();
+
+    aJob.goal = numMessagesToIndex;
+
+    // Pass 2: index the messages.
+    count = 0;
     for (let msgHdr in this._indexingIterator) {
       // per above, we want to periodically release control while doing all
       //  this header traversal/investigation.
-      if (++aJob.offset % HEADER_CHECK_BLOCK_SIZE == 0) {
+      if (++count % HEADER_CHECK_BLOCK_SIZE == 0) {
         yield this.kWorkSync;
       }
-      
-      if ((isLocal ||
-           (msgHdr.flags & Components.interfaces.nsMsgMessageFlags.Offline)) &&
-          !(msgHdr.flags & Components.interfaces.nsMsgMessageFlags.Expunged)) {
-        // this returns 0 when missing
-        let glodaMessageId = msgHdr.getUint32Property(
-                             this.GLODA_MESSAGE_ID_PROPERTY);
-        
-        // if it has a gloda message id, it has been indexed, but it still
-        //  could be dirty.
-        if (glodaMessageId != 0) {
-          // (returns 0 when missing)
-          let isDirty = msgHdr.getUint32Property(this.GLODA_DIRTY_PROPERTY)!= 0;
 
-          // it's up to date if it's not dirty 
-          if (!isDirty)
-            continue;
-        }
-        
+      if (shouldIndexMessage(msgHdr)) {
+        ++aJob.offset;
         this._log.debug(">>>  _indexMessage");
         yield this._callbackHandle.pushAndGo(this._indexMessage(msgHdr,
             this._callbackHandle));
         this._log.debug("<<<  _indexMessage");
       }
     }
     
     glodaFolder.dirtyStatus = glodaFolder.kFolderClean;
@@ -1962,17 +1987,17 @@ var GlodaIndexer = {
       
       let glodaMessageIds = [];
       
       let deleteJob = new IndexingJob("message", -1, null);
       for (let iMsgHdr = 0; iMsgHdr < aMsgHdrs.length; iMsgHdr++) {
         let msgHdr = aMsgHdrs.queryElementAt(iMsgHdr, Ci.nsIMsgDBHdr);
         try {
           glodaMessageIds.push(msgHdr.getUint32Property(
-            this.indexer.GLODA_MESSAGE_ID_PROPERTY));
+            GLODA_MESSAGE_ID_PROPERTY));
         }
         catch (ex) {}
       }
       
       if (glodaMessageIds.length) {
         this.indexer._datastore.markMessagesDeletedByIDs(glodaMessageIds);
         this.indexer.pendingDeletions = true;
       }
@@ -2026,17 +2051,17 @@ var GlodaIndexer = {
             let newMessageKeys = [];
             for each (let destMsgHdr in fixIterator(aDestFolder.messages,
                                                     Ci.nsIMsgDBHdr)) {
               let destMsgId = destMsgHdr.messageId;
               let matchingSrcHdr = srcMsgIdToHdr[destMsgId];
               if (matchingSrcHdr) {
                 try {
                   let glodaId = matchingSrcHdr.getUint32Property(
-                    this.indexer.GLODA_MESSAGE_ID_PROPERTY); 
+                    GLODA_MESSAGE_ID_PROPERTY);
                   glodaIds.push(glodaId);
                   newMessageKeys.push(destMsgHdr.messageKey);
                 }
                 // no gloda id means it hasn't been indexed, so the move isn't
                 //  required.
                 catch (ex) {}
               }
             }
@@ -2227,17 +2252,17 @@ var GlodaIndexer = {
       let isFolderLocal = msgFolder instanceof Ci.nsIMsgLocalMailFolder;
       if (!isFolderLocal && !(msgFolder.flags&Ci.nsMsgFolderFlags.Offline))
         return;
     
       // mark the message as dirty
       // (We could check for the presence of the gloda message id property
       //  first to know whether we technically need the dirty property.  I'm
       //  not sure whether it is worth the high-probability exception cost.) 
-      aMsgHdr.setUint32Property(this.indexer.GLODA_DIRTY_PROPERTY, 1);
+      aMsgHdr.setUint32Property(GLODA_DIRTY_PROPERTY, 1);
       // mark the folder dirty too, so we know to look inside
       let glodaFolder = GlodaDatastore._mapFolder(msgFolder);
       glodaFolder.dirtyStatus = true;
       
       if (this.indexer._pendingAddJob === null) {
         this.indexer._pendingAddJob = new IndexingJob("message", 1, null);
         this.indexer._indexQueue.push(this.indexer._pendingAddJob);
         this.indexer._indexingJobGoal++;