Backed out changeset: baff9bc304b6
author: Serge Gautherie <sgautherie.bz@free.fr>
Wed, 08 Oct 2008 21:07:01 +0200
changeset 557 027d53b6838c9a4d0b1a7bf709b4396134aec438
parent 556 baff9bc304b62c390c8806b937e6f3970695ec6f
child 558 1d810e40645fde2b7e895b556166cd9a3934b1d1
push id: unknown
push user: unknown
push date: unknown
bugs: 453881
Backed out changeset: baff9bc304b6 Bug 453881 - Split Bayes tokenizer into message and store versions; r=bugzilla sr=bienvenu
mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@@ -17,17 +17,16 @@
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2002
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Patrick C. Beard <beard@netscape.com>
  *   Seth Spitzer <sspitzer@netscape.com>
- *   Kent James <kent@caspia.com>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either of the GNU General Public License Version 2 or later (the "GPL"),
  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -101,225 +100,180 @@ static int kMaxLengthForToken = 12; // u
 #ifndef M_LN2
 #define M_LN2 0.69314718055994530942
 #endif
 
 #ifndef M_E
 #define M_E   2.7182818284590452354
 #endif
 
-// provide base implementation of hash lookup of a string
-struct BaseToken : public PLDHashEntryHdr
-{
+struct Token : public PLDHashEntryHdr {
     const char* mWord;
-};
-
-// token for a particular message
-struct Token : public BaseToken {
-    PRUint32 mCount;
+    PRUint32 mLength;
+    PRUint32 mCount;            // TODO:  put good/bad count values in same token object.
     double mProbability;        // TODO:  cache probabilities
     double mDistance;
 };
 
-// token stored in a training file for a group of messages
-struct CorpusToken : public BaseToken
-{
-    PRUint32 mJunkCount;
-    PRUint32 mGoodCount;
-};
-
 TokenEnumeration::TokenEnumeration(PLDHashTable* table)
     :   mEntrySize(table->entrySize),
         mEntryCount(table->entryCount),
         mEntryOffset(0),
         mEntryAddr(table->entryStore)
 {
     PRUint32 capacity = PL_DHASH_TABLE_SIZE(table);
     mEntryLimit = mEntryAddr + capacity * mEntrySize;
 }
 
 inline PRBool TokenEnumeration::hasMoreTokens()
 {
     return (mEntryOffset < mEntryCount);
 }
 
-inline BaseToken* TokenEnumeration::nextToken()
+inline Token* TokenEnumeration::nextToken()
 {
-    BaseToken* token = nsnull;
+    Token* token = NULL;
     PRUint32 entrySize = mEntrySize;
     char *entryAddr = mEntryAddr, *entryLimit = mEntryLimit;
     while (entryAddr < entryLimit) {
         PLDHashEntryHdr* entry = (PLDHashEntryHdr*) entryAddr;
         entryAddr += entrySize;
         if (PL_DHASH_ENTRY_IS_LIVE(entry)) {
-            token = static_cast<BaseToken*>(entry);
+            token = static_cast<Token*>(entry);
             ++mEntryOffset;
             break;
         }
     }
     mEntryAddr = entryAddr;
     return token;
 }
 
 struct VisitClosure {
-    PRBool (*f) (BaseToken*, void*);
+    PRBool (*f) (Token*, void*);
     void* data;
 };
 
 static PLDHashOperator PR_CALLBACK VisitEntry(PLDHashTable* table, PLDHashEntryHdr* entry,
                                               PRUint32 number, void* arg)
 {
     VisitClosure* closure = reinterpret_cast<VisitClosure*>(arg);
-    BaseToken* token = static_cast<BaseToken*>(entry);
+    Token* token = static_cast<Token*>(entry);
     return (closure->f(token, closure->data) ? PL_DHASH_NEXT : PL_DHASH_STOP);
 }
 
 // member variables
 static const PLDHashTableOps gTokenTableOps = {
     PL_DHashAllocTable,
     PL_DHashFreeTable,
     PL_DHashStringKey,
     PL_DHashMatchStringKey,
     PL_DHashMoveEntryStub,
     PL_DHashClearEntryStub,
     PL_DHashFinalizeStub
 };
 
-TokenHash::TokenHash(PRUint32 aEntrySize)
+Tokenizer::Tokenizer()
 {
-    mEntrySize = aEntrySize;
     PL_INIT_ARENA_POOL(&mWordPool, "Words Arena", 16384);
-    PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull,
-                                  aEntrySize, 256);
+    PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
     NS_ASSERTION(ok, "mTokenTable failed to initialize");
     if (!ok)
       PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("mTokenTable failed to initialize"));
 }
 
-TokenHash::~TokenHash()
+Tokenizer::~Tokenizer()
 {
     if (mTokenTable.entryStore)
         PL_DHashTableFinish(&mTokenTable);
     PL_FinishArenaPool(&mWordPool);
 }
 
-nsresult TokenHash::clearTokens()
+nsresult Tokenizer::clearTokens()
 {
     // we re-use the tokenizer when classifying multiple messages,
     // so this gets called after every message classification.
     PRBool ok = PR_TRUE;
     if (mTokenTable.entryStore)
     {
         PL_DHashTableFinish(&mTokenTable);
         PL_FreeArenaPool(&mWordPool);
-        ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull,
-                               mEntrySize, 256);
+        ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
         NS_ASSERTION(ok, "mTokenTable failed to initialize");
         if (!ok)
           PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("mTokenTable failed to initialize in clearTokens()"));
     }
     return (ok) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
 }
 
-char* TokenHash::copyWord(const char* word, PRUint32 len)
+char* Tokenizer::copyWord(const char* word, PRUint32 len)
 {
     void* result;
     PRUint32 size = 1 + len;
     PL_ARENA_ALLOCATE(result, &mWordPool, size);
     if (result)
         memcpy(result, word, size);
     return reinterpret_cast<char*>(result);
 }
 
-inline BaseToken* TokenHash::get(const char* word)
+inline Token* Tokenizer::get(const char* word)
 {
     PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_LOOKUP);
     if (PL_DHASH_ENTRY_IS_BUSY(entry))
-        return static_cast<BaseToken*>(entry);
+        return static_cast<Token*>(entry);
     return NULL;
 }
 
-BaseToken* TokenHash::add(const char* word)
+Token* Tokenizer::add(const char* word, PRUint32 count)
 {
-    if (!word || !*word)
-    {
-      NS_ERROR("Trying to add a null word");
-      return nsnull;
-    }
-
-    PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("add word: %s", word));
+    PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("add word: %s (count=%d)", word, count));
 
     PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
-    BaseToken* token = static_cast<BaseToken*>(entry);
+    Token* token = static_cast<Token*>(entry);
     if (token) {
         if (token->mWord == NULL) {
             PRUint32 len = strlen(word);
             NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
             if (!len)
               PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("adding zero length word to tokenizer"));
             token->mWord = copyWord(word, len);
             NS_ASSERTION(token->mWord, "copyWord failed");
             if (!token->mWord) {
                 PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("copyWord failed: %s (%d)", word, len));
                 PL_DHashTableRawRemove(&mTokenTable, entry);
                 return NULL;
             }
+            token->mLength = len;
+            token->mCount = count;
+            token->mProbability = 0;
+            PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("adding word to tokenizer: %s (len=%d) (count=%d)", word, len, count));
+        } else {
+            token->mCount += count;
+            PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count, token->mCount));
         }
     }
     return token;
 }
 
-void TokenHash::visit(PRBool (*f) (BaseToken*, void*), void* data)
-{
-    VisitClosure closure = { f, data };
-    PRUint32 visitCount = PL_DHashTableEnumerate(&mTokenTable, VisitEntry, &closure);
-    NS_ASSERTION(visitCount == mTokenTable.entryCount, "visitCount != entryCount!");
-    if (visitCount != mTokenTable.entryCount) {
-      PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("visitCount != entryCount!: %d vs %d", visitCount, mTokenTable.entryCount));
-    }
-}
-
-inline PRUint32 TokenHash::countTokens()
-{
-  return mTokenTable.entryCount;
-}
-
-inline TokenEnumeration TokenHash::getTokens()
-{
-  return TokenEnumeration(&mTokenTable);
-}
-
-Tokenizer::Tokenizer() :
-  TokenHash(sizeof(Token))
+void Tokenizer::remove(const char* word, PRUint32 count)
 {
-}
-
-Tokenizer::~Tokenizer()
-{
-}
-
-inline Token* Tokenizer::get(const char* word)
-{
-  return static_cast<Token*>(TokenHash::get(word));
-}
-
-Token* Tokenizer::add(const char* word, PRUint32 count)
-{
-  PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("add word: %s (count=%d)",
-         word, count));
-
-  Token* token = static_cast<Token*>(TokenHash::add(word));
-  if (token) 
-  {
-    token->mCount += count; // hash code initializes this to zero
-    PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG,
-           ("adding word to tokenizer: %s (count=%d) (mCount=%d)",
-           word, count, token->mCount));
-  }
-  return token;
+    PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("remove word: %s (count=%d)", word, count));
+    Token* token = get(word);
+    if (token) {
+        PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG,
+          ("remove word: %s (count=%d) (mCount=%d)", word, count, token->mCount));
+        
+        if (token->mCount >= count)
+            token->mCount -= count;
+        else
+            token->mCount = 0;
+         
+        if (token->mCount == 0)
+            PL_DHashTableRawRemove(&mTokenTable, token);
+        
+    }
 }
 
 static PRBool isDecimalNumber(const char* word)
 {
     const char* p = word;
     if (*p == '-') ++p;
     char c;
     while ((c = *p++)) {
@@ -540,17 +494,17 @@ enum char_class{
     kanji,
     kuten,
     touten,
     kigou,
     fwlatain,
     ascii
 };
 
-static char_class getCharClass(PRUnichar c)
+char_class getCharClass(PRUnichar c)
 {
   char_class charClass = others;
 
   if(IS_JA_HIRAGANA(c))
     charClass = hiragana;
   else if(IS_JA_KATAKANA(c))
     charClass = katakana;
   else if(IS_JA_KANJI(c))
@@ -574,17 +528,17 @@ static PRBool isJapanese(const char* wor
   // it is japanese chunk if it contains any hiragana or katakana.
   while((c = *p++))
     if( IS_JAPANESE_SPECIFIC(c))
       return PR_TRUE;
 
   return PR_FALSE;
 }
 
-static PRBool isFWNumeral(const PRUnichar* p1, const PRUnichar* p2)
+PRBool isFWNumeral(const PRUnichar* p1, const PRUnichar* p2)
 {
   for(;p1<p2;p1++)
     if(!IS_JA_FWNUMERAL(*p1))
       return PR_FALSE;
 
   return PR_TRUE;
 }
 
@@ -706,32 +660,52 @@ void Tokenizer::tokenize(const char* aTe
                     break;
                 }
             }
         }
     }
   }
 }
 
+void Tokenizer::visit(PRBool (*f) (Token*, void*), void* data)
+{
+    VisitClosure closure = { f, data };
+    PRUint32 visitCount = PL_DHashTableEnumerate(&mTokenTable, VisitEntry, &closure);
+    NS_ASSERTION(visitCount == mTokenTable.entryCount, "visitCount != entryCount!");
+    if (visitCount != mTokenTable.entryCount) {
+      PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("visitCount != entryCount!: %d vs %d", visitCount, mTokenTable.entryCount));
+    }
+}
+
+inline PRUint32 Tokenizer::countTokens()
+{
+    return mTokenTable.entryCount;
+}
+
 Token* Tokenizer::copyTokens()
 {
     PRUint32 count = countTokens();
     if (count > 0) {
         Token* tokens = new Token[count];
         if (tokens) {
             Token* tp = tokens;
             TokenEnumeration e(&mTokenTable);
             while (e.hasMoreTokens())
-                *tp++ = *(static_cast<Token*>(e.nextToken()));
+                *tp++ = *e.nextToken();
         }
         return tokens;
     }
     return NULL;
 }
 
+inline TokenEnumeration Tokenizer::getTokens()
+{
+    return TokenEnumeration(&mTokenTable);
+}
+
 class TokenAnalyzer {
 public:
     virtual ~TokenAnalyzer() {}
 
     virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
     void setTokenListener(nsIStreamListener *aTokenListener)
     {
       mTokenListener = aTokenListener;
@@ -947,52 +921,60 @@ NS_IMETHODIMP TokenStreamListener::OnSto
 
     return NS_OK;
 }
 
 /* Implementation file */
 NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
 
 nsBayesianFilter::nsBayesianFilter()
-    :   mTrainingDataDirty(PR_FALSE)
+    :   mGoodCount(0), mBadCount(0), mTrainingDataDirty(PR_FALSE)
 {
     if (!BayesianFilterLogModule)
       BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
 
     PRInt32 junkThreshold = 0;
     nsresult rv;
     nsCOMPtr<nsIPrefBranch> pPrefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
     if (pPrefBranch)
       pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold", &junkThreshold);
 
     mJunkProbabilityThreshold = ((double) junkThreshold) / 100;
     if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
       mJunkProbabilityThreshold = kDefaultJunkThreshold;
 
     PR_LOG(BayesianFilterLogModule, PR_LOG_WARNING, ("junk probability threshold: %f", mJunkProbabilityThreshold));
 
-    mCorpus.readTrainingData();
+    getTrainingFile(getter_AddRefs(mTrainingFile));
+
+    PRBool ok = (mGoodTokens && mBadTokens);
+    NS_ASSERTION(ok, "error allocating tokenizers");
+    if (ok)
+        readTrainingData();
+    else {
+      PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("error allocating tokenizers"));
+    }
 
     // get parameters for training data flushing, from the prefs
 
     nsCOMPtr<nsIPrefBranch> prefBranch;
 
     nsCOMPtr<nsIPrefService> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
     NS_ASSERTION(NS_SUCCEEDED(rv),"failed accessing preferences service");
     rv = prefs->GetBranch(nsnull, getter_AddRefs(prefBranch));
     NS_ASSERTION(NS_SUCCEEDED(rv),"failed getting preferences branch");
 
     rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.minimum_interval",&mMinFlushInterval);
     // it is not a good idea to allow a minimum interval of under 1 second
     if (NS_FAILED(rv) || (mMinFlushInterval <= 1000) )
         mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
-
+        
     rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.junk_maxtokens", &mMaximumTokenCount);
     if (NS_FAILED(rv))
-      mMaximumTokenCount = 0; // which means do not limit token counts
+      mMaximumTokenCount = 0; // which means do not limit token counts    
     PR_LOG(BayesianFilterLogModule, PR_LOG_WARNING, ("maximum junk tokens: %d", mMaximumTokenCount));
 
     mTimer = do_CreateInstance(NS_TIMER_CONTRACTID, &rv);
     NS_ASSERTION(NS_SUCCEEDED(rv), "unable to create a timer; training data will only be written on exit");
 
     // the timer is not used on object construction, since for
     // the time being there are no dirying messages
 
@@ -1000,18 +982,17 @@ nsBayesianFilter::nsBayesianFilter()
 
 void
 nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure)
 {
     // we will flush the training data to disk after enough time has passed
     // since the first time a message has been classified after the last flush
 
     nsBayesianFilter *filter = static_cast<nsBayesianFilter *>(aClosure);
-    filter->mCorpus.writeTrainingData(filter->mMaximumTokenCount);
-    filter->mTrainingDataDirty = PR_FALSE;
+    filter->writeTrainingData();
 }
 
 nsBayesianFilter::~nsBayesianFilter()
 {
     if (mTimer)
     {
         mTimer->Cancel();
         mTimer = nsnull;
@@ -1135,42 +1116,41 @@ void nsBayesianFilter::classifyMessage(T
     // that won't be the case with users who first use the junk mail feature
     // so, we do certain things to encourage them to train.
     //
     // if there are no good tokens, assume the message is junk
     // this will "encourage" the user to train
     // and if there are no bad tokens, assume the message is not junk
     // this will also "encourage" the user to train
     // see bug #194238
-    if (listener && !mCorpus.mGoodMessageCount)
-    {
+    if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
       PR_LOG(BayesianFilterLogModule, PR_LOG_WARNING, ("no good tokens, assume junk"));
       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK),
         nsIJunkMailPlugin::IS_SPAM_SCORE);
       return;
     }
-    if (listener && !mCorpus.mJunkMessageCount)
-    {
+    if (listener && !mBadCount && !mBadTokens.countTokens()) {
       PR_LOG(BayesianFilterLogModule, PR_LOG_WARNING, ("no bad tokens, assume good"));
       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
         nsIJunkMailPlugin::IS_HAM_SCORE);
       return;
     }
 
     /* this part is similar to the Graham algorithm with some adjustments. */
     PRUint32 i, goodclues=0, count = tokenizer.countTokens();
-    double ngood = mCorpus.mGoodMessageCount,
-           nbad = mCorpus.mJunkMessageCount, prob;
+    double ngood = mGoodCount, nbad = mBadCount, prob;
 
     for (i = 0; i < count; ++i)
     {
-      Token& token = tokens[i];
-      CorpusToken *t = mCorpus.get(token.mWord);
-      double hamcount = ((t != nsnull) ? t->mGoodCount : 0);
-      double spamcount = ((t != nsnull) ? t->mJunkCount : 0);
+        Token& token = tokens[i];
+        const char* word = token.mWord;
+        Token* t = mGoodTokens.get(word);
+      double hamcount = ((t != NULL) ? t->mCount : 0);
+        t = mBadTokens.get(word);
+       double spamcount = ((t != NULL) ? t->mCount : 0);
 
       // if hamcount and spam count are both 0, we could end up with a divide by 0 error,
       // tread carefully here. (Bug #240819)
       double probDenom = (hamcount *nbad + spamcount*ngood);
       if (probDenom == 0.0) // nGood and nbad are known to be non zero or we wouldn't be here
         probDenom = nbad + ngood; // error case use a value of 1 for hamcount and spamcount if they are both zero.
 
       prob = (spamcount * ngood)/probDenom;
@@ -1183,17 +1163,17 @@ void nsBayesianFilter::classifyMessage(T
          token.mDistance = distance;
          token.mProbability = prob;
         }
       else
         token.mDistance = -1; //ignore clue
     }
 
     // sort the array by the token distances
-    NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
+        NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
     PRUint32 first, last = count;
     first = (goodclues > 150) ? count - 150 : 0;
 
     double H = 1.0, S = 1.0;
     PRInt32 Hexp = 0, Sexp = 0;
     goodclues=0;
     int e;
 
@@ -1233,51 +1213,48 @@ void nsBayesianFilter::classifyMessage(T
             prob = (S-H +1.0) / 2.0;
         else
             prob = 0.5;
     }
     else
         prob = 0.5;
 
     PRBool isJunk = (prob >= mJunkProbabilityThreshold);
-    PRUint32 junkPercent = static_cast<PRUint32>(prob*100. + .5);
+    PRUint32 junkPercent = prob*100. + .5;
     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("%s is junk probability = (%f)  HAM SCORE:%f SPAM SCORE:%f", messageURI, prob,H,S));
 
     delete[] tokens;
 
     if (listener)
         listener->OnMessageClassified(messageURI, isJunk ?
           nsMsgJunkStatus(nsIJunkMailPlugin::JUNK) : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
           junkPercent);
 }
 
 /* void shutdown (); */
 NS_IMETHODIMP nsBayesianFilter::Shutdown()
 {
-  if (mTrainingDataDirty)
-    mCorpus.writeTrainingData(mMaximumTokenCount);
-  mTrainingDataDirty = PR_FALSE;
-
-  return NS_OK;
+    if (mTrainingDataDirty)
+        writeTrainingData();
+    return NS_OK;
 }
 
 /* readonly attribute boolean shouldDownloadAllHeaders; */
 NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(PRBool *aShouldDownloadAllHeaders)
 {
     // bayesian filters work on the whole msg body currently.
     *aShouldDownloadAllHeaders = PR_FALSE;
     return NS_OK;
 }
 
 /* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener aListener); */
 NS_IMETHODIMP nsBayesianFilter::ClassifyMessage(const char *aMessageURL, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
 {
     MessageClassifier* analyzer = new MessageClassifier(this, aListener, aMsgWindow, 1, &aMessageURL);
-    if (!analyzer)
-      return NS_ERROR_OUT_OF_MEMORY;
+    if (!analyzer) return NS_ERROR_OUT_OF_MEMORY;
     TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
     analyzer->setTokenListener(tokenListener);
     return tokenizeMessage(aMessageURL, aMsgWindow, analyzer);
 }
 
 /* void classifyMessages (in unsigned long aCount, [array, size_is (aCount)] in string aMsgURLs, in nsIJunkMailClassificationListener aListener); */
 NS_IMETHODIMP nsBayesianFilter::ClassifyMessages(PRUint32 aCount, const char **aMsgURLs, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
 {
@@ -1312,16 +1289,36 @@ public:
 private:
     nsBayesianFilter* mFilter;
     nsCOMPtr<nsISupports> mSupports;
     nsCOMPtr<nsIJunkMailClassificationListener> mListener;
     nsMsgJunkStatus mOldClassification;
     nsMsgJunkStatus mNewClassification;
 };
 
+static void forgetTokens(Tokenizer& corpus, TokenEnumeration tokens)
+{
+    // if we are forgetting the tokens for a message, should only
+    // subtract 1 from the occurrence count for that token in the training set
+    // because we assume we only bumped the training set count once per messages
+    // containing the token.
+    while (tokens.hasMoreTokens()) {
+        Token* token = tokens.nextToken();
+        corpus.remove(token->mWord);
+    }
+}
+
+static void rememberTokens(Tokenizer& corpus, TokenEnumeration tokens)
+{
+    while (tokens.hasMoreTokens()) {
+        Token* token = tokens.nextToken();
+        corpus.add(token->mWord);
+    }
+}
+
 void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageURL,
                                       nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification,
                                       nsIJunkMailClassificationListener* listener)
 {
     PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));
 
     PRBool trainingDataWasDirty = mTrainingDataDirty;
     TokenEnumeration tokens = tokenizer.getTokens();
@@ -1333,47 +1330,47 @@ void nsBayesianFilter::observeMessage(To
     // just re-training. But this then allows users to re-classify the same message on the same training set over and over again
     // leading to data skew. But that's all I can think to do right now to address this.....
     if (oldClassification != newClassification)
     {
       // remove the tokens from the token set it is currently in
     switch (oldClassification) {
     case nsIJunkMailPlugin::JUNK:
         // remove tokens from junk corpus.
-        if (mCorpus.mJunkMessageCount > 0) {
-            --mCorpus.mJunkMessageCount;
-            mCorpus.forgetTokens(tokens, 1, 0);
+        if (mBadCount > 0) {
+            --mBadCount;
+            forgetTokens(mBadTokens, tokens);
             mTrainingDataDirty = PR_TRUE;
         }
         break;
     case nsIJunkMailPlugin::GOOD:
         // remove tokens from good corpus.
-        if (mCorpus.mGoodMessageCount > 0) {
-            --mCorpus.mGoodMessageCount;
-            mCorpus.forgetTokens(tokens, 0, 1);
+        if (mGoodCount > 0) {
+            --mGoodCount;
+            forgetTokens(mGoodTokens, tokens);
             mTrainingDataDirty = PR_TRUE;
         }
         break;
     }
     }
 
 
     PRUint32 junkPercent;
     switch (newClassification) {
     case nsIJunkMailPlugin::JUNK:
         // put tokens into junk corpus.
-        ++mCorpus.mJunkMessageCount;
-        mCorpus.rememberTokens(tokens, 1, 0);
+        ++mBadCount;
+        rememberTokens(mBadTokens, tokens);
         mTrainingDataDirty = PR_TRUE;
         junkPercent = nsIJunkMailPlugin::IS_SPAM_SCORE;
         break;
     case nsIJunkMailPlugin::GOOD:
         // put tokens into good corpus.
-        ++mCorpus.mGoodMessageCount;
-        mCorpus.rememberTokens(tokens, 0, 1);
+        ++mGoodCount;
+        rememberTokens(mGoodTokens, tokens);
         mTrainingDataDirty = PR_TRUE;
         junkPercent = nsIJunkMailPlugin::IS_HAM_SCORE;
         break;
     }
 
     if (listener)
         listener->OnMessageClassified(messageURL, newClassification, junkPercent);
 
@@ -1383,168 +1380,90 @@ void nsBayesianFilter::observeMessage(To
         // mMinFlushInterval msec from now
         PR_LOG(
             BayesianFilterLogModule, PR_LOG_DEBUG,
             ("starting training data flush timer %i msec", mMinFlushInterval));
         mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, this, mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
     }
 }
 
-NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(PRBool *aResult)
-{
-  *aResult = ((mCorpus.mGoodMessageCount + mCorpus.mJunkMessageCount) &&
-              mCorpus.countTokens());
-  return NS_OK;
-}
-
-/* void setMessageClassification (in string aMsgURL,
-   in long aOldClassification, in long aNewClassification); */
-NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(
-                const char *aMsgURL,
-                nsMsgJunkStatus aOldClassification,
-                nsMsgJunkStatus aNewClassification,
-                nsIMsgWindow *aMsgWindow,
-                nsIJunkMailClassificationListener *aListener)
-{
-  MessageObserver* analyzer = new MessageObserver(this, 
-      aOldClassification, aNewClassification, aListener);
-  if (!analyzer)
-    return NS_ERROR_OUT_OF_MEMORY;
-
-  TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
-  if (!tokenListener)
-    return NS_ERROR_OUT_OF_MEMORY;
-
-  analyzer->setTokenListener(tokenListener);
-  return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
-}
-
-NS_IMETHODIMP nsBayesianFilter::ResetTrainingData()
-{
-  if (mCorpus)
-    return mCorpus.resetTrainingData();
-  return NS_ERROR_FAILURE;
-}
-
-/* Corpus Store */
-
 /*
     Format of the training file for version 1:
     [0xFEEDFACE]
     [number good messages][number bad messages]
     [number good tokens]
     [count][length of word]word
     ...
     [number bad tokens]
     [count][length of word]word
     ...
  */
 
-CorpusStore::CorpusStore() :
-  TokenHash(sizeof(CorpusToken)), mGoodMessageCount(0), mJunkMessageCount(0)
-{
-  getTrainingFile(getter_AddRefs(mTrainingFile));
-}
-
-CorpusStore::~CorpusStore()
-{
-}
-
 inline int writeUInt32(FILE* stream, PRUint32 value)
 {
     value = PR_htonl(value);
     return fwrite(&value, sizeof(PRUint32), 1, stream);
 }
 
 inline int readUInt32(FILE* stream, PRUint32* value)
 {
     int n = fread(value, sizeof(PRUint32), 1, stream);
     if (n == 1) {
         *value = PR_ntohl(*value);
     }
     return n;
 }
 
-void CorpusStore::forgetTokens(TokenEnumeration tokens,
-                    PRUint32 aJunkCount, PRUint32 aGoodCount)
+static PRBool writeTokens(FILE* stream, Tokenizer& tokenizer, PRBool shrink)
 {
-  // if we are forgetting the tokens for a message, should only
-  // subtract 1 from the occurrence count for that token in the training set
-  // because we assume we only bumped the training set count once per messages
-  // containing the token.
-  while (tokens.hasMoreTokens())
-  {
-    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
-    remove(token->mWord, aJunkCount, aGoodCount);
-  }
-}
+    PRUint32 tokenCount = tokenizer.countTokens();
+    PRUint32 newTokenCount = 0;
+    
+    if (shrink) {
+      // Shrinking the token database is accomplished by dividing all token counts by 2.
+      // Recalculate the shrunk token count, keeping tokens with a count > 1
+      
+      TokenEnumeration tokens = tokenizer.getTokens();
+      for (PRUint32 i = 0; i < tokenCount; ++i) {
+        Token* token = tokens.nextToken();
+        if (token->mCount > 1)
+          newTokenCount++;
+      }
+    }
+    else // Use the original token count
+      newTokenCount = tokenCount;
+    
+    if (writeUInt32(stream, newTokenCount) != 1)
+        return PR_FALSE;
 
-void CorpusStore::rememberTokens(TokenEnumeration tokens,
-                    PRUint32 aJunkCount, PRUint32 aGoodCount)
-{
-  while (tokens.hasMoreTokens())
-  {
-    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
-    if (!token)
-    {
-      NS_ERROR("null token");
-      continue;
+    if (newTokenCount > 0) {
+      TokenEnumeration tokens = tokenizer.getTokens();
+      for (PRUint32 i = 0; i < tokenCount; ++i) {
+            Token* token = tokens.nextToken();
+            PRUint32 wordCount = token->mCount;
+            if (shrink) {
+              if (wordCount > 1)
+                wordCount /= 2;
+              else
+                continue;
+            }
+            if (writeUInt32(stream, wordCount) != 1)
+                break;
+            PRUint32 tokenLength = token->mLength;
+            if (writeUInt32(stream, tokenLength) != 1)
+                break;
+            if (fwrite(token->mWord, tokenLength, 1, stream) != 1)
+                break;
+        }
     }
-    add(token->mWord, aJunkCount, aGoodCount);
-  }
+
+    return PR_TRUE;
 }
 
-PRBool CorpusStore::writeTokens(FILE* stream, PRBool shrink, PRBool aIsJunk)
-{
-  PRUint32 tokenCount = countTokens();
-  PRUint32 newTokenCount = 0;
-  TokenEnumeration tokens = getTokens();
-
-  // Shrinking the token database is accomplished by dividing all token
-  // counts by 2. If shrinking, recalculate the shrunk token count,
-  // keeping tokens with a count > 1. Otherwise, keep tokens with
-  // count > 0
-
-  for (PRUint32 i = 0; i < tokenCount; ++i)
-  {
-    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
-    {
-      PRUint32 count = aIsJunk ? token->mJunkCount : token->mGoodCount;
-      if (count > 1 || (!shrink && count == 1))
-        newTokenCount++;
-    }
-  }
-
-  if (writeUInt32(stream, newTokenCount) != 1)
-    return PR_FALSE;
-
-  if (newTokenCount > 0)
-  {
-    TokenEnumeration tokens = getTokens();
-    for (PRUint32 i = 0; i < tokenCount; ++i)
-    {
-      CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
-      PRUint32 wordCount = aIsJunk ? token->mJunkCount : token->mGoodCount;
-      if (shrink)
-        wordCount /= 2;
-      if (!wordCount)
-        continue; // Don't output zero count words
-      if (writeUInt32(stream, wordCount) != 1)
-        return PR_FALSE;
-      PRUint32 tokenLength = strlen(token->mWord);
-      if (writeUInt32(stream, tokenLength) != 1)
-        return PR_FALSE;
-      if (fwrite(token->mWord, tokenLength, 1, stream) != 1)
-        return PR_FALSE;
-    }
-  }
-  return PR_TRUE;
-}
-
-PRBool CorpusStore::readTokens(FILE* stream, PRInt64 fileSize, PRBool isJunk)
+static PRBool readTokens(FILE* stream, Tokenizer& tokenizer, PRInt64 fileSize)
 {
     PRUint32 tokenCount;
     if (readUInt32(stream, &tokenCount) != 1)
         return PR_FALSE;
 
     PRInt64 fpos = ftell(stream);
     if (fpos < 0)
         return PR_FALSE;
@@ -1574,100 +1493,100 @@ PRBool CorpusStore::readTokens(FILE* str
             }
             buffer = new char[bufferSize];
             if (!buffer) return PR_FALSE;
         }
         if (fread(buffer, size, 1, stream) != 1)
             break;
         fpos += size;
         buffer[size] = '\0';
-        if (isJunk)
-          add(buffer, count, 0);
-        else
-          add(buffer, 0, count);
+        tokenizer.add(buffer, count);
     }
 
     delete[] buffer;
 
     return PR_TRUE;
 }
 
 
-nsresult CorpusStore::getTrainingFile(nsILocalFile ** aTrainingFile)
+nsresult nsBayesianFilter::getTrainingFile(nsILocalFile ** aTrainingFile)
 {
   // should we cache the profile manager's directory?
   nsCOMPtr<nsIFile> profileDir;
 
   nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir));
   NS_ENSURE_SUCCESS(rv, rv);
   rv = profileDir->Append(NS_LITERAL_STRING("training.dat"));
   NS_ENSURE_SUCCESS(rv, rv);
 
   return profileDir->QueryInterface(NS_GET_IID(nsILocalFile), (void **) aTrainingFile);
 }
 
 static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' };
 
-void CorpusStore::writeTrainingData(PRInt32 aMaximumTokenCount)
+void nsBayesianFilter::writeTrainingData()
 {
   PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG, ("writeTrainingData() entered"));
   if (!mTrainingFile)
     return;
 
   // open the file, and write out training data
   FILE* stream;
   nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream);
   if (NS_FAILED(rv))
     return;
-
+    
   // If the number of tokens exceeds our limit, set the shrink flag
   PRBool shrink = false;
-  if ((aMaximumTokenCount > 0) && // if 0, do not limit tokens
-      (countTokens() > aMaximumTokenCount))
-  {
+  if ((mMaximumTokenCount > 0) && // if 0, do not limit tokens
+      (mGoodTokens.countTokens() + mBadTokens.countTokens() > mMaximumTokenCount)) {
     shrink = true;
     PR_LOG(BayesianFilterLogModule, PR_LOG_WARNING, ("shrinking token data file"));
   }
 
-  // We implement shrink by dividing counts by two
-  PRUint32 shrinkFactor = shrink ? 2 : 1;
-
   if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) &&
-         writeUInt32(stream, mGoodMessageCount / shrinkFactor) &&
-         writeUInt32(stream, mJunkMessageCount / shrinkFactor) &&
-         writeTokens(stream, shrink, PR_FALSE) &&
-         writeTokens(stream, shrink, PR_TRUE)))
+        (writeUInt32(stream, shrink ? mGoodCount/2 : mGoodCount) == 1) &&
+        (writeUInt32(stream, shrink ? mBadCount/2 : mBadCount) == 1) &&
+         writeTokens(stream, mGoodTokens, shrink) &&
+         writeTokens(stream, mBadTokens, shrink)))
   {
     NS_WARNING("failed to write training data.");
     fclose(stream);
     // delete the training data file, since it is potentially corrupt.
     mTrainingFile->Remove(PR_FALSE);
   }
   else
   {
     fclose(stream);
 
     if (shrink) {
 
       // We'll clear the tokens, and read them back in from the file.
       // Yes this is slower than in place, but this is a rare event.
 
-      if (countTokens())
+      if (mGoodTokens && mGoodTokens.countTokens())
       {
-        clearTokens();
-        mGoodMessageCount = 0;
-        mJunkMessageCount = 0;
+        mGoodTokens.clearTokens();
+        mGoodCount = 0;
+      }
+
+      if (mBadTokens && mBadTokens.countTokens())
+      {
+        mBadTokens.clearTokens();
+        mBadCount = 0;
       }
 
       readTrainingData();
     }
+
+    mTrainingDataDirty = PR_FALSE;
   }
 }
 
-void CorpusStore::readTrainingData()
+void nsBayesianFilter::readTrainingData()
 {
   if (!mTrainingFile)
     return;
 
   PRBool exists;
   nsresult rv = mTrainingFile->Exists(&exists);
   if (NS_FAILED(rv) || !exists)
     return;
@@ -1681,79 +1600,62 @@ void CorpusStore::readTrainingData()
   rv = mTrainingFile->GetFileSize(&fileSize);
   if (NS_FAILED(rv))
     return;
 
   // FIXME:  should make sure that the tokenizers are empty.
   char cookie[4];
   if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
         (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
-        (readUInt32(stream, &mGoodMessageCount) == 1) &&
-        (readUInt32(stream, &mJunkMessageCount) == 1) &&
-         readTokens(stream, fileSize, PR_FALSE) &&
-         readTokens(stream, fileSize, PR_TRUE))) {
+        (readUInt32(stream, &mGoodCount) == 1) &&
+        (readUInt32(stream, &mBadCount) == 1) &&
+         readTokens(stream, mGoodTokens, fileSize) &&
+         readTokens(stream, mBadTokens, fileSize))) {
       NS_WARNING("failed to read training data.");
       PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("failed to read training data."));
   }
 
   fclose(stream);
 }
 
-nsresult CorpusStore::resetTrainingData()
+NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(PRBool *aResult)
 {
-  // clear out our in memory training tokens...
-  if (countTokens())
-    clearTokens();
-
-  mGoodMessageCount = 0;
-  mJunkMessageCount = 0;
-
-  if (mTrainingFile)
-    mTrainingFile->Remove(PR_FALSE);
+  *aResult = (mGoodCount && mGoodTokens.countTokens() ||
+              mBadCount && mBadTokens.countTokens());
   return NS_OK;
 }
 
-inline CorpusToken* CorpusStore::get(const char* word)
-{
-    return static_cast<CorpusToken*>(TokenHash::get(word));
-}
-
-CorpusToken* CorpusStore::add(const char* word, PRUint32 aJunkCount,
-                              PRUint32 aGoodCount)
+/* void setMessageClassification (in string aMsgURL, in long aOldClassification, in long aNewClassification); */
+NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(const char *aMsgURL,
+                                                         nsMsgJunkStatus aOldClassification,
+                                                         nsMsgJunkStatus aNewClassification,
+                                                         nsIMsgWindow *aMsgWindow,
+                                                         nsIJunkMailClassificationListener *aListener)
 {
-  PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG,
-         ("add word: %s (aJunkCount=%d) (aGoodCount=%d)", word, aJunkCount,
-         aGoodCount));
-  CorpusToken* token = static_cast<CorpusToken*>(TokenHash::add(word));
-  if (token)
-  {
-    token->mJunkCount += aJunkCount;
-    token->mGoodCount += aGoodCount;
-  }
-  PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG,
-         ("adding word to corpus store: %s (junkCount=%d) (goodCount=%d)",
-         word, token->mJunkCount, token->mGoodCount));
-  return token;
+    MessageObserver* analyzer = new MessageObserver(this, aOldClassification, aNewClassification, aListener);
+    if (!analyzer) return NS_ERROR_OUT_OF_MEMORY;
+    TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+    analyzer->setTokenListener(tokenListener);
+    return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
 }
 
-void CorpusStore::remove(const char* word, PRUint32 aJunkCount,
-                         PRUint32 aGoodCount)
+NS_IMETHODIMP nsBayesianFilter::ResetTrainingData()
 {
-  PR_LOG(BayesianFilterLogModule, PR_LOG_DEBUG,
-         ("remove word: %s (junkCount=%d) (goodCount=%d)",
-         word, aJunkCount, aGoodCount));
-  CorpusToken* token = get(word);
-  if (token)
+  // clear out our in memory training tokens...
+  if (mGoodCount && mGoodTokens.countTokens())
+  {
+    mGoodTokens.clearTokens();
+    mGoodCount = 0;
+  }
+
+  if (mBadCount && mBadTokens.countTokens())
   {
-    if (token->mJunkCount >= aJunkCount)
-      token->mJunkCount -= aJunkCount;
-    else
-      token->mJunkCount = 0;
+    mBadTokens.clearTokens();
+    mBadCount = 0;
+  }
 
-    if (token->mGoodCount >= aGoodCount)
-      token->mGoodCount -= aGoodCount;
-    else
-      token->mGoodCount = 0;
+  // now remove training.dat
+  if (mTrainingFile)
+    mTrainingFile->Remove(PR_FALSE);
 
-    if (token->mGoodCount == 0 && token->mJunkCount == 0)
-      PL_DHashTableRawRemove(&mTokenTable, token);
-  }
+  return NS_OK;
 }
+
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
@@ -16,17 +16,16 @@
  *
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2002
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Patrick C. Beard <beard@netscape.com>
- *   Kent James <kent@caspia.com>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either of the GNU General Public License Version 2 or later (the "GPL"),
  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -53,212 +52,112 @@
 #define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES             15*60*1000
 
 struct Token;
 class TokenEnumeration;
 class TokenAnalyzer;
 class nsIMsgWindow;
 class nsIMimeHeaders;
 class nsIUTF8StringEnumerator;
-struct BaseToken;
-struct CorpusToken;
 
 /**
  * Helper class to enumerate Token objects in a PLDHashTable
  * safely and without copying (see bugzilla #174859). The
  * enumeration is safe to use until a PL_DHASH_ADD
  * or PL_DHASH_REMOVE is performed on the table.
  */
 class TokenEnumeration {
 public:
     TokenEnumeration(PLDHashTable* table);
     PRBool hasMoreTokens();
-    BaseToken* nextToken();
+    Token* nextToken();
     
 private:
     PRUint32 mEntrySize, mEntryCount, mEntryOffset;
     char *mEntryAddr, *mEntryLimit;
 };
 
-class TokenHash {
-public:
-
-    virtual ~TokenHash();
-    /**
-     * Clears out the previous message tokens.
-     */
-    nsresult clearTokens();
-    operator int() { return mTokenTable.entryStore != NULL; }
-    PRUint32 countTokens();
-    TokenEnumeration getTokens();
-    BaseToken* add(const char* word);
-    
-protected:
-    TokenHash(PRUint32 entrySize);
-    PLArenaPool mWordPool;
-    PRUint32 mEntrySize;
-    PLDHashTable mTokenTable;
-    char* copyWord(const char* word, PRUint32 len);
-    /**
-     * Calls passed-in function for each token in the table.
-     */
-    void visit(PRBool (*f) (BaseToken*, void*), void* data);
-    BaseToken* get(const char* word);
-
-};
-
-class Tokenizer: public TokenHash {
+class Tokenizer {
 public:
     Tokenizer();
     ~Tokenizer();
+
+    operator int() { return mTokenTable.entryStore != NULL; }
     
     Token* get(const char* word);
 
     // The training set keeps an occurrence count on each word. This count 
     // is supposed to count the # of messsages it occurs in.
     // When add/remove is called while tokenizing a message and NOT the training set,
     // 
     Token* add(const char* word, PRUint32 count = 1);
+    void remove(const char* word, PRUint32 count = 1);
     
+    PRUint32 countTokens();
     Token* copyTokens();
+    TokenEnumeration getTokens();
+
+    /**
+     * Clears out the previous message tokens.
+     */
+    nsresult clearTokens();
 
     void tokenize(const char* text);
 
     /**
      *  Creates specific tokens based on the mime headers for the message being tokenized
      */
     void tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues);
 
     void tokenizeAttachment(const char * aContentType, const char * aFileName);
 
+    /**
+     * Calls passed-in function for each token in the table.
+     */
+    void visit(PRBool (*f) (Token*, void*), void* data);
+
 private:
-
+    char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
     void tokenize_japanese_word(char* chunk);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);
 
 private:
+    PLDHashTable mTokenTable;
+    PLArenaPool mWordPool;
     nsCOMPtr<nsISemanticUnitScanner> mScanner;
 };
 
-/**
- * Implements storage of a collection of message tokens and counts for
- * a corpus of classified messages
- */
-
-class CorpusStore: public TokenHash {
-public:
-    CorpusStore();
-    ~CorpusStore();
-
-    /**
-     * retrieve the token structure for a particular string
-     *
-     * @param word  the character representation of the token
-     *
-     * @return      token structure containing counts, null if not found
-     */
-    CorpusToken* get(const char* word);
-
-    /**
-     * add tokens to the storage, or increment counts if already exists.
-     *
-     * @param tokens     enumerator for the list of tokens to remember
-     * @param aJunkCount number of new messages classified as junk with this
-     *                   token list
-     * @param aGoodCount number of new messages classified as good with this
-     *                   token list
-     */
-    void rememberTokens(TokenEnumeration tokens, PRUint32 aJunkCount,
-                        PRUint32 aGoodCount);
-    
-    /**
-     * decrement counts for tokens in the storage, removing if all counts
-     * are zero
-     *
-     * @param tokens     enumerator for the list of tokens to forget
-     * @param aJunkCount number of messages classified as junk with this token
-     *                   to be removed
-     * @param aGoodCount number of new messages classified as good with this
-     *                   token to be removed
-     */
-    void forgetTokens(TokenEnumeration tokens, PRUint32 aJunkCount, PRUint32 aGoodCount);
-    
-    /**
-     * write the corpus information to file storage
-     *
-     * @param aMaximumTokenCount  prune tokens if number of tokens exceeds
-     *                            this value.  == 0  for no pruning
-     */
-    void writeTrainingData(PRInt32 aMaximumTokenCount);
-    
-    /**
-     * read the corpus information from file storage
-     */
-    void readTrainingData();
-    
-    /**
-     * delete the local corpus storage file and data
-     */
-    nsresult resetTrainingData();
-
-    PRUint32 mGoodMessageCount;    // count of good messages in the store
-    PRUint32 mJunkMessageCount;    // count of junk messages in the store
-
-protected:
-    
-    /**
-     * return the local corpus storage file
-     */
-    nsresult getTrainingFile(nsILocalFile ** aFile);
-
-    /**
-     * read token strings from the data file
-     */
-    PRBool readTokens(FILE* stream, PRInt64 fileSize, PRBool isJunk);
-    
-    /**
-     * write token strings to the data file
-     */
-    PRBool writeTokens(FILE* stream, PRBool shrink, PRBool aIsJunk);
-    
-    /**
-     * remove counts for a token string, and delete it if all counts are zero
-     */
-    void remove(const char* word, PRUint32 aJunkCount, PRUint32 aGoodCount);
-
-    /**
-     * add counts for a token string, adding the token string if new
-     */
-    CorpusToken* add(const char* word, PRUint32 aJunkCount,
-                     PRUint32 aGoodCount);
-    nsCOMPtr<nsILocalFile> mTrainingFile; // file used to store training data
-};
-
 class nsBayesianFilter : public nsIJunkMailPlugin {
 public:
     NS_DECL_ISUPPORTS
     NS_DECL_NSIMSGFILTERPLUGIN
     NS_DECL_NSIJUNKMAILPLUGIN
     
     nsBayesianFilter();
     virtual ~nsBayesianFilter();
     
     nsresult tokenizeMessage(const char* messageURI, nsIMsgWindow *aMsgWindow, TokenAnalyzer* analyzer);
     void classifyMessage(Tokenizer& tokens, const char* messageURI, nsIJunkMailClassificationListener* listener);
     void observeMessage(Tokenizer& tokens, const char* messageURI, nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification, 
                         nsIJunkMailClassificationListener* listener);
 
+    void writeTrainingData();
+    void readTrainingData();
+    nsresult getTrainingFile(nsILocalFile ** aFile);
+    
 protected:
 
     static void TimerCallback(nsITimer* aTimer, void* aClosure);
 
-    CorpusStore mCorpus;
+    Tokenizer mGoodTokens, mBadTokens;
     double   mJunkProbabilityThreshold;
+    PRUint32 mGoodCount, mBadCount;
     PRInt32 mMaximumTokenCount;
     PRPackedBool mTrainingDataDirty;
     PRInt32 mMinFlushInterval; // in milliseconds, must be positive
                                //and not too close to 0
     nsCOMPtr<nsITimer> mTimer;
+    nsCOMPtr<nsILocalFile> mTrainingFile;
 };
 
 #endif // _nsBayesianFilter_h__
--- a/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
+++ b/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
@@ -38,18 +38,18 @@
 
 // main setup
 
 do_import_script("../mailnews/extensions/bayesian-spam-filter/test/resources/trainingfile.js");
 
 const nsIPrefBranch = Cc["@mozilla.org/preferences-service;1"]
                         .getService(Ci.nsIPrefBranch);
 
-// before shrink, the trained messages have 78 tokens. Force shrink.
-nsIPrefBranch.setIntPref("mailnews.bayesian_spam_filter.junk_maxtokens", 77);
+// before shrink, the trained messages have 84 tokens. Force shrink.
+nsIPrefBranch.setIntPref("mailnews.bayesian_spam_filter.junk_maxtokens", 83);
 
 const nsIJunkMailPlugin = Cc["@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"]
                             .getService(Ci.nsIJunkMailPlugin);
 const nsIIOService = Cc["@mozilla.org/network/io-service;1"]
                        .getService(Ci.nsIIOService);
 
 // local constants
 const kUnclassified = nsIJunkMailPlugin.UNCLASSIFIED;
@@ -104,17 +104,17 @@ var doTestingListener =
     for (var token in trainingData.mGoodCounts)
       dump("count: " + trainingData.mGoodCounts[token] + " token: " + token + "\n");
     print("Junk Counts");
     for (var token in trainingData.mJunkCounts)
       dump("count: " + trainingData.mJunkCounts[token] + " token: " + token + "\n");
     */
     
     /* Selected pre-shrink counts after training
-    training.data results: goodMessages=2 junkMessages = 4 tokens = 78
+    training.data results: goodMessages=2 junkMessages = 4 goodTokens = 34 junkTokens = 50
     Good counts
     count: 1 token: subject:report
     count: 2 token: important
     count: 2 token: to:careful reader <reader@example.org>
 
     Junk Counts
     count: 3 token: make
     count: 4 token: money