1 #include "CharsetConv.h"
2 #include "uchardect/src/nscore.h"
3 #include "uchardect/src/nsUniversalDetector.h"
4 #include <cstring>
5 #include <errno.h>
6 #include <iconv.h>
7 #include <iostream>
8 using namespace std;
9 using namespace mous;
10 
11 namespace mous {
12 
13 class nsDetectorWrapper: public nsUniversalDetector
14 {
15 public:
nsDetectorWrapper()16     nsDetectorWrapper():
17         //nsUniversalDetector(NS_FILTER_ALL)
18         //nsUniversalDetector(NS_FILTER_CJK)
19         nsUniversalDetector(NS_FILTER_CHINESE_SIMPLIFIED)
20     {
21     }
22 
~nsDetectorWrapper()23     virtual ~nsDetectorWrapper()
24     {
25     }
26 
GetCharset() const27     string GetCharset() const
28     {
29         return mCharset;
30     }
31 
Reset()32     virtual void Reset()
33     {
34         nsUniversalDetector::Reset();
35         mCharset.clear();
36     }
37 
38 private:
Report(const char * charset)39     virtual void Report(const char* charset)
40     {
41         mCharset.assign(charset);
42     }
43 
44 private:
45     string mCharset;
46 };
47 
48 }
49 
CharsetConv(int buflen)50 CharsetConv::CharsetConv(int buflen):
51     mDetector(new nsDetectorWrapper),
52     mBuffer(new char[buflen]),
53     mBufferLen(buflen)
54 {
55 }
56 
~CharsetConv()57 CharsetConv::~CharsetConv()
58 {
59     if (mDetector != NULL)
60         delete mDetector;
61     if (mBuffer != NULL)
62         delete[] mBuffer;
63 }
64 
Probe(const char * buf,size_t len)65 string CharsetConv::Probe(const char* buf, size_t len)
66 {
67     /*
68     //#include <unicode/ucsdet.h>
69     //-licuio
70     int max = -1;
71     int maxi = -1;
72 
73     UErrorCode uerr = U_ZERO_ERROR;
74     int32_t found = 1;
75     UCharsetDetector* udec = ucsdet_open(&uerr);
76     ucsdet_setText(udec, buf, len, &uerr);
77     const UCharsetMatch** match = ucsdet_detectAll(udec, &found, &uerr);
78     for (int i = 0; i < found; ++i) {
79         int conf = ucsdet_getConfidence(match[i], &uerr);
80         if (conf > max) {
81             max = conf;
82             maxi = i;
83         }
84         cout << ucsdet_getName(match[i], &uerr) << '\t';
85         cout << conf << endl;
86     }
87     cout << found << endl;
88     ucsdet_close(udec);
89 
90     if (maxi != -1)
91         return ucsdet_getName(match[maxi], &uerr);
92     else
93         return "";
94     */
95 
96     string charset;
97     int ret = mDetector->HandleData(buf, (PRUint32)len);
98     mDetector->DataEnd();
99     if (ret == NS_OK) {
100         charset.assign(mDetector->GetCharset());
101     }
102     mDetector->Reset();
103     return charset;
104 }
105 
AutoConv(const char * buf,size_t len,string & content)106 bool CharsetConv::AutoConv(const char* buf, size_t len, string& content)
107 {
108     return AutoConvTo("UTF-8", buf, len, content);
109 }
110 
AutoConvTo(const string & wanted,const char * buf,size_t len,string & content)111 bool CharsetConv::AutoConvTo(const string& wanted, const char* buf, size_t len, string& content)
112 {
113     string charset(Probe(buf, len));
114     if (!charset.empty()) {
115         return ConvFromTo(charset, wanted, buf, len, content);
116     } else {
117         return false;
118     }
119 }
120 
ConvFromTo(const string & from,const string & wanted,const char * buf,size_t len,string & content)121 bool CharsetConv::ConvFromTo(const string& from, const string& wanted, const char* buf, size_t len, string& content)
122 {
123     typedef size_t (*StdIconv)(iconv_t, const char**, size_t*, char**, size_t*);
124     StdIconv std_iconv = (StdIconv)iconv;
125 
126     if (from.empty() || wanted.empty())
127         return false;
128     if (from == wanted)
129         return false;
130 
131     cout << "set:" << from << endl;
132     bool ok = true;
133     const char* inbuf = buf;
134     size_t inleft = len;
135 
136     char* outstart;
137     size_t outlen;
138     char* outbuf;
139     size_t outleft;
140     if (len <= mBufferLen) {
141         outstart = mBuffer;
142         outlen = mBufferLen;
143     } else {
144         outstart = new char[len+4];
145         outlen = len+4;
146     }
147     outbuf = outstart;
148     outleft = outlen;
149 
150     size_t converted = 0;
151     do {
152         iconv_t cd = iconv_open(wanted.c_str(), from.c_str());
153         if (cd == (iconv_t)-1) {
154             ok = false;
155             break;
156         }
157 
158         errno = 0;
159         converted = std_iconv(cd, &inbuf, &inleft, &outbuf, &outleft);
160         if (converted != (size_t)-1) {
161             cout << "done" << endl;
162             break;
163         } else if (errno != E2BIG) {
164             cout << strerror(errno) << endl;
165             ok = false;
166             break;
167         }
168         inbuf = buf;
169         inleft = len;
170         if (outstart != mBuffer) {
171             delete[] outstart;
172         }
173         outlen = (outlen << 1);
174         outstart = new char[outlen];
175         outbuf = outstart;
176         outleft = outlen;
177 
178         iconv_close(cd);
179     } while(true);
180 
181     if (ok) {
182         content.assign(outstart, outlen-outleft);
183     }
184     if (outstart != mBuffer) {
185         delete[] outstart;
186     }
187     return ok;
188 }
189