1 /* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5 */
6
// For Japanese encodings, detection relies on the following characteristics:
// 1. Kana characters (or hankaku?) tend to appear with high frequency.
// 2. Kana characters often occur in groups.
// 3. Certain combinations of kana are never used in the Japanese language.
11
12 #include "nsEUCJPProber.h"
13
14 namespace kencodingprober
15 {
Reset(void)16 void nsEUCJPProber::Reset(void)
17 {
18 mCodingSM->Reset();
19 mState = eDetecting;
20 mContextAnalyser.Reset();
21 mDistributionAnalyser.Reset();
22 }
23
HandleData(const char * aBuf,unsigned int aLen)24 nsProbingState nsEUCJPProber::HandleData(const char *aBuf, unsigned int aLen)
25 {
26 if (aLen == 0) {
27 return mState;
28 }
29
30 nsSMState codingState;
31
32 for (unsigned int i = 0; i < aLen; i++) {
33 codingState = mCodingSM->NextState(aBuf[i]);
34 if (codingState == eError) {
35 mState = eNotMe;
36 break;
37 }
38 if (codingState == eItsMe) {
39 mState = eFoundIt;
40 break;
41 }
42 if (codingState == eStart) {
43 unsigned int charLen = mCodingSM->GetCurrentCharLen();
44
45 if (i == 0) {
46 mLastChar[1] = aBuf[0];
47 mContextAnalyser.HandleOneChar(mLastChar, charLen);
48 mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
49 } else {
50 mContextAnalyser.HandleOneChar(aBuf + i - 1, charLen);
51 mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen);
52 }
53 }
54 }
55
56 mLastChar[0] = aBuf[aLen - 1];
57
58 if (mState == eDetecting) {
59 if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
60 mState = eFoundIt;
61 }
62 }
63
64 return mState;
65 }
66
GetConfidence(void)67 float nsEUCJPProber::GetConfidence(void)
68 {
69 float contxtCf = mContextAnalyser.GetConfidence();
70 float distribCf = mDistributionAnalyser.GetConfidence();
71
72 return (contxtCf > distribCf ? contxtCf : distribCf);
73 }
74 }
75