bug 471885 bayes analysis should set probability to 0 or 1 with unbalanced tokens, r/sr=bienvenu
author: Kent James <kent@caspia.com>
Sun, 18 Jan 2009 13:17:53 -0800
changeset 1674 629517ab551c0aa99862596fbfc7f475fade6bc4
parent 1673 9b2b51394a182142d39317ea0dc16f6752892ab5
child 1675 13557e287fe63782d1f886b2a64553e610358c53
push id: 1340
push user: bienvenu@nventure.com
push date: Sun, 18 Jan 2009 21:17:52 +0000
treeherder: comm-central@629517ab551c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
bugs: 471885
bug 471885 bayes analysis should set probability to 0 or 1 with unbalanced tokens, r/sr=bienvenu
mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@@ -1275,31 +1275,39 @@ void nsBayesianFilter::classifyMessage(
         continue;
       for (PRUint32 traitIndex = 0; traitIndex < traitCount; traitIndex++)
       {
         double proCount =
           static_cast<double>(mCorpus.getTraitCount(t, aProTraits[traitIndex]));
         double antiCount =
           static_cast<double>(mCorpus.getTraitCount(t, aAntiTraits[traitIndex]));
 
-        // if proCount and antiCount are both 0, we could end up with a
-        // divide by 0 error, tread carefully here. (Bug #240819)
-        double probDenom = (proCount * numAntiMessages[traitIndex] +
-                            antiCount * numProMessages[traitIndex]);
-        if (probDenom != 0.0)
+        double prob, proDenom, antiDenom;
+        // Prevent a divide by zero error by setting defaults for prob
+
+        // If there are no matching tokens at all, ignore.
+        if (antiCount == 0.0 && proCount == 0.0)
+          continue;
+        // if only anti match, set probability to 0%
+        if ((proDenom = proCount * numProMessages[traitIndex]) == 0.0)
+          prob = 0.0;
+        // if only pro match, set probability to 100%
+        else if ((antiDenom = antiCount * numAntiMessages[traitIndex]) == 0.0)
+          prob = 1.0;
+        else
+          prob = (proCount * numAntiMessages[traitIndex]) /
+                 (proDenom + antiDenom);
+
+        double n = proCount + antiCount;
+        prob =  (0.225 + n * prob) / (.45 + n);
+        double distance = PR_ABS(prob - 0.5);
+        if (distance >= .1)
         {
-          double prob = (proCount * numAntiMessages[traitIndex])/probDenom;
-          double n = proCount + antiCount;
-          prob =  (0.225 + n * prob) / (.45 + n);
-          double distance = PR_ABS(prob - 0.5);
-          if (distance >= .1)
-          {
-            nsresult rv = setAnalysis(token, traitIndex, distance, prob);
-            NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
-          }
+          nsresult rv = setAnalysis(token, traitIndex, distance, prob);
+          NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
         }
       }
     }
 
     for (PRUint32 traitIndex = 0; traitIndex < traitCount; traitIndex++)
     {
       nsAutoTArray<TraitAnalysis, 1024> traitAnalyses;
       // copy valid tokens into an array to sort
--- a/mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
+++ b/mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
@@ -90,20 +90,20 @@ var tests =
    traitListener: false,
    junkListener: true},
   // train 1 ham message
   {command: kTrainT,
    fileName: "ham1.eml",
    junkPercent: 0,
    traitListener: false,
    junkListener: true},
-  // with ham but no spam training, percents still 50 but classifies as ham
+  // with ham but no spam training, percents are 0 and classifies as ham
   {command: kClassT,
    fileName: "ham1.eml",
-   junkPercent: 50,
+   junkPercent: 0,
    traitListener: false,
    junkListener: true},
   // train 1 spam message
   {command: kTrainT,
    fileName: "spam1.eml",
    junkPercent: 100,
    traitListener: true,
    junkListener: false},