Don't require a minimum data threshold for encodings that correspond to the selected language.
Bug 431054, r=Masatoshi Kimura (:emk) <VYV03354@nifty.ne.jp>
--- a/extensions/universalchardet/src/base/CharDistribution.cpp
+++ b/extensions/universalchardet/src/base/CharDistribution.cpp
@@ -44,22 +44,23 @@
#include "GB2312Freq.tab"
#define SURE_YES 0.99f
#define SURE_NO 0.01f
#define MINIMUM_DATA_THRESHOLD 4
//return confidence base on received data
-float CharDistributionAnalysis::GetConfidence()
+float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//if we didn't receive any character in our consideration range, or the
// number of frequent characters is below the minimum threshold, return
// negative answer
- if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
+ if (mTotalChars <= 0 ||
+ !aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO;
if (mTotalChars != mFreqChars) {
float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
if (r < SURE_YES)
return r;
}
--- a/extensions/universalchardet/src/base/CharDistribution.h
+++ b/extensions/universalchardet/src/base/CharDistribution.h
@@ -66,17 +66,17 @@ public:
{
if (512 > mCharToFreqOrder[order])
mFreqChars++;
}
}
}
//return confidence base on existing data
- float GetConfidence();
+ float GetConfidence(PRBool aIsPreferredLanguage);
//Reset analyser, clear any state
void Reset(void)
{
mDone = PR_FALSE;
mTotalChars = 0;
mFreqChars = 0;
}
--- a/extensions/universalchardet/src/base/JpCntx.cpp
+++ b/extensions/universalchardet/src/base/JpCntx.cpp
@@ -176,20 +176,20 @@ void JapaneseContextAnalysis::Reset(void
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
mRelSample[i] = 0;
mNeedToSkipCharNum = 0;
mLastCharOrder = -1;
mDone = PR_FALSE;
}
#define DONT_KNOW (float)-1
-float JapaneseContextAnalysis::GetConfidence()
+float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//This is just one way to calculate confidence. It works well for me.
- if (mTotalRel > MINIMUM_DATA_THRESHOLD)
+ if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else
return (float)DONT_KNOW;
}
PRInt32 SJISContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
{
--- a/extensions/universalchardet/src/base/JpCntx.h
+++ b/extensions/universalchardet/src/base/JpCntx.h
@@ -69,17 +69,17 @@ public:
{
mTotalRel++;
//count this sequence to its category counter
mRelSample[jp2CharContext[mLastCharOrder][order]]++;
}
mLastCharOrder = order;
}
- float GetConfidence();
+ float GetConfidence(PRBool aIsPreferredLanguage);
void Reset(void);
void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
protected:
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
virtual PRInt32 GetOrder(const char* str) = 0;
--- a/extensions/universalchardet/src/base/nsBig5Prober.cpp
+++ b/extensions/universalchardet/src/base/nsBig5Prober.cpp
@@ -76,13 +76,13 @@ nsProbingState nsBig5Prober::HandleData(
if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
mState = eFoundIt;
return mState;
}
float nsBig5Prober::GetConfidence(void)
{
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}
--- a/extensions/universalchardet/src/base/nsBig5Prober.h
+++ b/extensions/universalchardet/src/base/nsBig5Prober.h
@@ -39,18 +39,20 @@
#define nsBig5Prober_h__
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsBig5Prober: public nsCharSetProber {
public:
- nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
- Reset();}
+ nsBig5Prober(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&Big5SMModel);
+ Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
@@ -58,14 +60,15 @@ protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
//Big5ContextAnalysis mContextAnalyser;
Big5DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsBig5Prober_h__ */
--- a/extensions/universalchardet/src/base/nsEUCJPProber.cpp
+++ b/extensions/universalchardet/src/base/nsEUCJPProber.cpp
@@ -86,14 +86,14 @@ nsProbingState nsEUCJPProber::HandleData
if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
mState = eFoundIt;
return mState;
}
float nsEUCJPProber::GetConfidence(void)
{
- float contxtCf = mContextAnalyser.GetConfidence();
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}
--- a/extensions/universalchardet/src/base/nsEUCJPProber.h
+++ b/extensions/universalchardet/src/base/nsEUCJPProber.h
@@ -45,31 +45,34 @@
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "JpCntx.h"
#include "CharDistribution.h"
class nsEUCJPProber: public nsCharSetProber {
public:
- nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
- Reset();}
+ nsEUCJPProber(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
+ Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
protected:
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
EUCJPContextAnalysis mContextAnalyser;
EUCJPDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsEUCJPProber_h__ */
--- a/extensions/universalchardet/src/base/nsEUCKRProber.cpp
+++ b/extensions/universalchardet/src/base/nsEUCKRProber.cpp
@@ -79,13 +79,13 @@ nsProbingState nsEUCKRProber::HandleData
// else
// mDistributionAnalyser.HandleData(aBuf, aLen);
return mState;
}
float nsEUCKRProber::GetConfidence(void)
{
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}
--- a/extensions/universalchardet/src/base/nsEUCKRProber.h
+++ b/extensions/universalchardet/src/base/nsEUCKRProber.h
@@ -39,18 +39,21 @@
#define nsEUCKRProber_h__
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsEUCKRProber: public nsCharSetProber {
public:
- nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
- Reset();}
+ nsEUCKRProber(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
+ Reset();
+ }
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-KR";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
@@ -58,14 +61,15 @@ protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
//EUCKRContextAnalysis mContextAnalyser;
EUCKRDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsEUCKRProber_h__ */
--- a/extensions/universalchardet/src/base/nsEUCTWProber.cpp
+++ b/extensions/universalchardet/src/base/nsEUCTWProber.cpp
@@ -79,13 +79,13 @@ nsProbingState nsEUCTWProber::HandleData
// else
// mDistributionAnalyser.HandleData(aBuf, aLen);
return mState;
}
float nsEUCTWProber::GetConfidence(void)
{
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}
--- a/extensions/universalchardet/src/base/nsEUCTWProber.h
+++ b/extensions/universalchardet/src/base/nsEUCTWProber.h
@@ -39,18 +39,20 @@
#define nsEUCTWProber_h__
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
class nsEUCTWProber: public nsCharSetProber {
public:
- nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
- Reset();}
+ nsEUCTWProber(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
+ Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
@@ -58,14 +60,15 @@ protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
//EUCTWContextAnalysis mContextAnalyser;
EUCTWDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsEUCTWProber_h__ */
--- a/extensions/universalchardet/src/base/nsGB2312Prober.cpp
+++ b/extensions/universalchardet/src/base/nsGB2312Prober.cpp
@@ -84,13 +84,13 @@ nsProbingState nsGB18030Prober::HandleDa
// else
// mDistributionAnalyser.HandleData(aBuf, aLen);
return mState;
}
float nsGB18030Prober::GetConfidence(void)
{
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}
--- a/extensions/universalchardet/src/base/nsGB2312Prober.h
+++ b/extensions/universalchardet/src/base/nsGB2312Prober.h
@@ -41,18 +41,20 @@
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "CharDistribution.h"
// We use gb18030 to replace gb2312, because 18030 is a superset.
class nsGB18030Prober: public nsCharSetProber {
public:
- nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
- Reset();}
+ nsGB18030Prober(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
+ Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
@@ -60,14 +62,15 @@ protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
//GB2312ContextAnalysis mContextAnalyser;
GB2312DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsGB2312Prober_h__ */
--- a/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp
+++ b/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp
@@ -58,27 +58,27 @@ const char *ProberName[] =
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
mProbers[i] = nsnull;
mProbers[0] = new nsUTF8Prober();
if (aLanguageFilter & NS_FILTER_JAPANESE)
{
- mProbers[1] = new nsSJISProber();
- mProbers[2] = new nsEUCJPProber();
+ mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
+ mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
}
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
- mProbers[3] = new nsGB18030Prober();
+ mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
- mProbers[4] = new nsEUCKRProber();
+ mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
- mProbers[5] = new nsBig5Prober();
- mProbers[6] = new nsEUCTWProber();
+ mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
+ mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
Reset();
}
nsMBCSGroupProber::~nsMBCSGroupProber()
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
{
--- a/extensions/universalchardet/src/base/nsSJISProber.cpp
+++ b/extensions/universalchardet/src/base/nsSJISProber.cpp
@@ -85,14 +85,14 @@ nsProbingState nsSJISProber::HandleData(
if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
mState = eFoundIt;
return mState;
}
float nsSJISProber::GetConfidence(void)
{
- float contxtCf = mContextAnalyser.GetConfidence();
- float distribCf = mDistributionAnalyser.GetConfidence();
+ float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
+ float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}
--- a/extensions/universalchardet/src/base/nsSJISProber.h
+++ b/extensions/universalchardet/src/base/nsSJISProber.h
@@ -46,32 +46,35 @@
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
#include "JpCntx.h"
#include "CharDistribution.h"
class nsSJISProber: public nsCharSetProber {
public:
- nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
- Reset();}
+ nsSJISProber(PRBool aIsPreferredLanguage)
+ :mIsPreferredLanguage(aIsPreferredLanguage)
+ {mCodingSM = new nsCodingStateMachine(&SJISSMModel);
+ Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {}
protected:
nsCodingStateMachine* mCodingSM;
nsProbingState mState;
SJISContextAnalysis mContextAnalyser;
SJISDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
+ PRBool mIsPreferredLanguage;
};
#endif /* nsSJISProber_h__ */
--- a/extensions/universalchardet/tests/Makefile.in
+++ b/extensions/universalchardet/tests/Makefile.in
@@ -78,12 +78,15 @@ relativesrcdir = extensions/universalcha
bug306272_text.html \
test_bug306272.html \
bug421271_text.html \
test_bug421271.html \
bug426271_text-euc-jp.html \
test_bug426271-euc-jp.html \
bug426271_text-utf-8.html \
test_bug426271-utf-8.html \
+ bug431054_text.html \
+ test_bug431054.html \
+ test_bug431054-japanese.html \
$(NULL)
libs:: $(_TEST_FILES)
$(INSTALL) $(foreach f,$^,"$f") $(DEPTH)/_tests/testing/mochitest/tests/$(relativesrcdir)
new file mode 100644
--- /dev/null
+++ b/extensions/universalchardet/tests/bug431054_text.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+¥í¥°¥¤¥ó
+</body>
+</html>
--- a/extensions/universalchardet/tests/test_bug306272.html
+++ b/extensions/universalchardet/tests/test_bug306272.html
@@ -17,17 +17,16 @@ https://bugzilla.mozilla.org/show_bug.cg
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/** Test for Bug 306272 **/
CharsetDetectionTests("bug306272_text.html",
"UTF-8",
new Array("ja_parallel_state_machine",
- "ko_parallel_state_machine",
"zh_parallel_state_machine",
"zhtw_parallel_state_machine",
"zhcn_parallel_state_machine",
"cjk_parallel_state_machine",
"universal_charset_detector"));
</script>
</pre>
</body>
new file mode 100644
--- /dev/null
+++ b/extensions/universalchardet/tests/test_bug431054-japanese.html
@@ -0,0 +1,28 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=431054
+-->
+<head>
+ <title>Test for Bug 431054</title>
+ <script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
+ <script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <script type="text/javascript" src="CharsetDetectionTests.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+</div>
+<iframe id="testframe"></iframe>
+<pre id="test">
+<script class="testbody" type="text/javascript">
+/** Test for Bug 431054 **/
+CharsetDetectionTests("bug431054_text.html",
+ "EUC-JP",
+ new Array("ja_parallel_state_machine"));
+</script>
+</pre>
+</body>
+</html>
new file mode 100644
--- /dev/null
+++ b/extensions/universalchardet/tests/test_bug431054.html
@@ -0,0 +1,32 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=431054
+-->
+<head>
+ <title>Test for Bug 431054</title>
+ <script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
+ <script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+ <script type="text/javascript" src="CharsetDetectionTests.js"></script>
+ <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
+<p id="display"></p>
+<div id="content" style="display: none">
+</div>
+<iframe id="testframe"></iframe>
+<pre id="test">
+<script class="testbody" type="text/javascript">
+/** Test for Bug 431054 **/
+CharsetDetectionTests("bug431054_text.html",
+ "windows-1252",
+ new Array("zhtw_parallel_state_machine",
+ "zhcn_parallel_state_machine",
+ "zh_parallel_state_machine",
+ "cjk_parallel_state_machine",
+ "universal_charset_detector"));
+</script>
+</pre>
+</body>
+</html>