1 #include "CharsetConv.h"
2 #include "uchardect/src/nscore.h"
3 #include "uchardect/src/nsUniversalDetector.h"
4 #include <cstring>
5 #include <errno.h>
6 #include <iconv.h>
7 #include <iostream>
8 using namespace std;
9 using namespace mous;
10
11 namespace mous {
12
13 class nsDetectorWrapper: public nsUniversalDetector
14 {
15 public:
nsDetectorWrapper()16 nsDetectorWrapper():
17 //nsUniversalDetector(NS_FILTER_ALL)
18 //nsUniversalDetector(NS_FILTER_CJK)
19 nsUniversalDetector(NS_FILTER_CHINESE_SIMPLIFIED)
20 {
21 }
22
~nsDetectorWrapper()23 virtual ~nsDetectorWrapper()
24 {
25 }
26
GetCharset() const27 string GetCharset() const
28 {
29 return mCharset;
30 }
31
Reset()32 virtual void Reset()
33 {
34 nsUniversalDetector::Reset();
35 mCharset.clear();
36 }
37
38 private:
Report(const char * charset)39 virtual void Report(const char* charset)
40 {
41 mCharset.assign(charset);
42 }
43
44 private:
45 string mCharset;
46 };
47
48 }
49
CharsetConv(int buflen)50 CharsetConv::CharsetConv(int buflen):
51 mDetector(new nsDetectorWrapper),
52 mBuffer(new char[buflen]),
53 mBufferLen(buflen)
54 {
55 }
56
~CharsetConv()57 CharsetConv::~CharsetConv()
58 {
59 if (mDetector != NULL)
60 delete mDetector;
61 if (mBuffer != NULL)
62 delete[] mBuffer;
63 }
64
Probe(const char * buf,size_t len)65 string CharsetConv::Probe(const char* buf, size_t len)
66 {
67 /*
68 //#include <unicode/ucsdet.h>
69 //-licuio
70 int max = -1;
71 int maxi = -1;
72
73 UErrorCode uerr = U_ZERO_ERROR;
74 int32_t found = 1;
75 UCharsetDetector* udec = ucsdet_open(&uerr);
76 ucsdet_setText(udec, buf, len, &uerr);
77 const UCharsetMatch** match = ucsdet_detectAll(udec, &found, &uerr);
78 for (int i = 0; i < found; ++i) {
79 int conf = ucsdet_getConfidence(match[i], &uerr);
80 if (conf > max) {
81 max = conf;
82 maxi = i;
83 }
84 cout << ucsdet_getName(match[i], &uerr) << '\t';
85 cout << conf << endl;
86 }
87 cout << found << endl;
88 ucsdet_close(udec);
89
90 if (maxi != -1)
91 return ucsdet_getName(match[maxi], &uerr);
92 else
93 return "";
94 */
95
96 string charset;
97 int ret = mDetector->HandleData(buf, (PRUint32)len);
98 mDetector->DataEnd();
99 if (ret == NS_OK) {
100 charset.assign(mDetector->GetCharset());
101 }
102 mDetector->Reset();
103 return charset;
104 }
105
AutoConv(const char * buf,size_t len,string & content)106 bool CharsetConv::AutoConv(const char* buf, size_t len, string& content)
107 {
108 return AutoConvTo("UTF-8", buf, len, content);
109 }
110
AutoConvTo(const string & wanted,const char * buf,size_t len,string & content)111 bool CharsetConv::AutoConvTo(const string& wanted, const char* buf, size_t len, string& content)
112 {
113 string charset(Probe(buf, len));
114 if (!charset.empty()) {
115 return ConvFromTo(charset, wanted, buf, len, content);
116 } else {
117 return false;
118 }
119 }
120
ConvFromTo(const string & from,const string & wanted,const char * buf,size_t len,string & content)121 bool CharsetConv::ConvFromTo(const string& from, const string& wanted, const char* buf, size_t len, string& content)
122 {
123 typedef size_t (*StdIconv)(iconv_t, const char**, size_t*, char**, size_t*);
124 StdIconv std_iconv = (StdIconv)iconv;
125
126 if (from.empty() || wanted.empty())
127 return false;
128 if (from == wanted)
129 return false;
130
131 cout << "set:" << from << endl;
132 bool ok = true;
133 const char* inbuf = buf;
134 size_t inleft = len;
135
136 char* outstart;
137 size_t outlen;
138 char* outbuf;
139 size_t outleft;
140 if (len <= mBufferLen) {
141 outstart = mBuffer;
142 outlen = mBufferLen;
143 } else {
144 outstart = new char[len+4];
145 outlen = len+4;
146 }
147 outbuf = outstart;
148 outleft = outlen;
149
150 size_t converted = 0;
151 do {
152 iconv_t cd = iconv_open(wanted.c_str(), from.c_str());
153 if (cd == (iconv_t)-1) {
154 ok = false;
155 break;
156 }
157
158 errno = 0;
159 converted = std_iconv(cd, &inbuf, &inleft, &outbuf, &outleft);
160 if (converted != (size_t)-1) {
161 cout << "done" << endl;
162 break;
163 } else if (errno != E2BIG) {
164 cout << strerror(errno) << endl;
165 ok = false;
166 break;
167 }
168 inbuf = buf;
169 inleft = len;
170 if (outstart != mBuffer) {
171 delete[] outstart;
172 }
173 outlen = (outlen << 1);
174 outstart = new char[outlen];
175 outbuf = outstart;
176 outleft = outlen;
177
178 iconv_close(cd);
179 } while(true);
180
181 if (ok) {
182 content.assign(outstart, outlen-outleft);
183 }
184 if (outstart != mBuffer) {
185 delete[] outstart;
186 }
187 return ok;
188 }
189