1 //   MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 //   Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 //   Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #ifndef MECAB_UTILS_H
7 #define MECAB_UTILS_H
8 
9 #include <algorithm>
10 #include <cstdlib>
11 #include <cstdio>
12 #include <cstring>
13 #include <string>
14 #include <vector>
15 #include "common.h"
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
// Fixed-width integer types (uint8_t, uint32_t, uint64_t) used by this header.
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else  // HAVE_STDINT_H
#if defined(_WIN32) && !defined(__CYGWIN__)
#if defined(_MSC_VER) && (_MSC_VER <= 1500)
// Visual C++ 2008 (_MSC_VER 1500) and older ship no <stdint.h>, so the types
// are spelled out by hand; `unsigned long` is 32 bits wide with that compiler.
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned long long uint64_t;
#else  // _MSC_VER
#include <stdint.h>
#endif  // _MSC_VER
#else   // _WIN32
// Not Windows: take the types from <stdint.h>. The hand-written typedefs that
// used to live in this branch relied on the Microsoft-only `__int64` keyword
// and declared uint32_t as `unsigned long`, which is 64 bits wide on LP64
// platforms.
#include <stdint.h>
#endif  // _WIN32
#endif  // HAVE_STDINT_H
38 
39 namespace MeCab {
40 
// Forward declaration only; Param is not defined in the visible part of this
// header. It is used below as a reference/pointer parameter type.
class Param;

// Character-set identifiers (unnamed enum, values 0..6 in the order listed).
// NOTE(review): presumably these are the values decode_charset() returns --
// confirm against its definition.
enum { EUC_JP, CP932, UTF8, UTF16, UTF16LE, UTF16BE, ASCII };
// Takes a character-set name and returns an int. Only declared here; the
// mapping from name to value lives in the definition.
int decode_charset(const char *charset);
45 
// Writes `val` in fixed-point notation ("%f": six digits after the decimal
// point) into the caller-provided buffer `s` as a NUL-terminated string.
//
// `s` must be large enough for the whole formatted value plus the terminator;
// the buffer size is not passed in, so no bounds check is possible here.
//
// The earlier implementation printed with "%-16f" and then cut the string at
// the first padding space. A value whose text is 16 characters or longer
// (for example 1e10) receives no padding, so that scan walked past the
// terminator. Printing with plain "%f" produces the same text and needs no
// scan at all.
inline void dtoa(double val, char *s) {
  std::sprintf(s, "%f", val);
}
53 
// Writes the decimal representation of the integer `val` into the
// caller-provided buffer `s` as a NUL-terminated string. Negative values get
// a leading '-'; zero is written as "0".
//
// `s` must have room for every digit, the optional sign and the terminator.
//
// The value is never negated: negating the most negative value of a signed
// type overflows. Instead each digit is taken from the remainder, whose sign
// follows `val` (integer division truncates toward zero since C++11), and
// only that single-digit remainder is made positive.
template <class T>
inline void itoa(T val, char *s) {
  if (val < 0) {
    *s++ = '-';
  }

  // Digits are produced least-significant first and reversed afterwards.
  char *t = s;
  do {
    int digit = static_cast<int>(val % 10);
    if (digit < 0) digit = -digit;
    *t++ = static_cast<char>('0' + digit);
    val /= 10;
  } while (val != 0);

  *t = '\0';
  std::reverse(s, t);
}
77 
// Writes the decimal representation of `val` into the caller-provided buffer
// `s` as a NUL-terminated string. No sign handling is done, so this is meant
// for unsigned values; zero is written as "0".
//
// `s` must have room for every digit plus the terminator.
template <class T>
inline void uitoa(T val, char *s) {
  char *cursor = s;

  // Emit digits least-significant first. The do/while guarantees that a zero
  // input still produces the single digit '0'.
  do {
    *cursor++ = static_cast<char>('0' + static_cast<int>(val % 10));
    val /= 10;
  } while (val);

  *cursor = '\0';
  std::reverse(s, cursor);  // put the most significant digit first
}
94 
// Returns the current position of the cursor `*ptr` and then advances the
// cursor by `size` bytes. No bounds checking is performed.
inline const char *read_ptr(const char **ptr, size_t size) {
  const char *const begin = *ptr;
  *ptr = begin + size;
  return begin;
}
100 
101 template <class T>
read_static(const char ** ptr,T & value)102 inline void read_static(const char **ptr, T& value) {
103   const char *r = read_ptr(ptr, sizeof(T));
104   memcpy(&value, r, sizeof(T));
105 }
106 
// The functions below are only declared in this header; their definitions
// are not visible here. The notes describe what the names and signatures
// suggest and should be confirmed against the implementations.

// Presumably reports whether `filename` exists -- confirm.
bool file_exists(const char *filename);

// Produces an int "request type" from `param`; the meaning of the value is
// not visible here.
int load_request_type(const Param &param);

// Takes a mutable Param; presumably fills it with dictionary resource
// settings and reports success -- confirm.
bool load_dictionary_resource(Param *);

// Modifies `*w` in place; presumably quotes/escapes it for use as one CSV
// field -- confirm the exact escaping rules and the meaning of the result.
bool escape_csv_element(std::string *w);

// Presumably appends the CSV dictionary files found under `path` to `*dics`
// -- confirm.
void enum_csv_dictionaries(const char *path,
                           std::vector<std::string> *dics);

// Progress reporting helper taking a label and a current/total pair; the
// meaning of the int result is not visible here.
int  progress_bar(const char* message, size_t current, size_t total);

// Modifies the string in place; presumably lower-cases it -- confirm.
bool toLower(std::string *);

// Presumably joins a directory `path` and a `file` name into one path --
// confirm the separator handling.
std::string create_filename(const std::string &path,
                            const std::string &file);
// In-place path edits on `*s`. NOTE(review): which part each one strips
// (file name vs. directory part) should be confirmed against the definitions.
void remove_filename(std::string *s);
void remove_pathname(std::string *s);
// In-place replacement in `*s` of `src` by `dst`. NOTE(review): whether every
// occurrence or only the first is replaced is not visible here.
void replace_string(std::string *s,
                    const std::string &src,
                    const std::string &dst);
129 
// Splits the comma-separated line `str` in place and stores a pointer to the
// start of each field through `out`. Returns the number of fields stored,
// which never exceeds `max`.
//
//  - Spaces and tabs in front of a field are skipped.
//  - A field that starts with '"' is treated as quoted: the surrounding
//    quotes are removed, a doubled quote ("") becomes a single '"', and the
//    unescaped text is compacted in place. Anything between the closing quote
//    and the next comma is dropped.
//  - Every field except the `max`-th is NUL-terminated by overwriting its
//    delimiter. The `max`-th field is left unterminated on purpose, so it
//    extends to the end of the original string (remaining commas included).
//  - An empty input yields 0 fields, and a trailing comma does not produce a
//    final empty field.
//  - `max == 0` stores nothing and returns 0. Previously the internal
//    countdown wrapped around in that case and fields were written through
//    `out` without any limit.
template <class Iterator>
inline size_t tokenizeCSV(char *str,
                          Iterator out, size_t max) {
  if (max == 0) return 0;

  char *const eos = str + std::strlen(str);
  size_t n = 0;

  for (; str < eos; ++str) {
    // skip white spaces
    while (*str == ' ' || *str == '\t') ++str;

    char *start;
    char *end;
    if (*str == '"') {
      // Quoted field: copy characters down over the escape quotes.
      start = ++str;
      end = start;
      for (; str < eos; ++str) {
        if (*str == '"') {
          ++str;
          if (*str != '"') break;  // lone quote closes the field
        }
        *end++ = *str;
      }
      str = std::find(str, eos, ',');
    } else {
      start = str;
      str = std::find(str, eos, ',');
      end = str;
    }

    // Terminate every field except the last one the caller has room for.
    if (n + 1 < max) *end = '\0';
    *out++ = start;
    if (++n == max) break;
  }

  return n;
}
166 
// Splits `str` in place at every character contained in `del` and stores a
// pointer to the start of each piece through `out`. Each delimiter that ends
// a stored piece is overwritten with NUL. Adjacent delimiters produce empty
// pieces, and an empty input produces one empty piece. At most `max` pieces
// are stored; the number stored is returned.
template <class Iterator>
inline size_t tokenize(char *str, const char *del,
                       Iterator out, size_t max) {
  char *const text_end = str + std::strlen(str);
  const char *const del_end = del + std::strlen(del);

  size_t count = 0;
  for (char *piece = str; count < max; ) {
    char *const stop = std::find_first_of(piece, text_end, del, del_end);
    const bool reached_end = (stop == text_end);

    *stop = '\0';
    *out++ = piece;
    ++count;

    if (reached_end) break;
    piece = stop + 1;
  }

  return count;
}
185 
// Like tokenize(), but a continuous run of delimiters is regarded as a single
// delimiter: empty pieces are skipped instead of being stored.
//
// Splits `str` in place at every character contained in `del`, overwriting
// the delimiters it passes with NUL, and stores a pointer to the start of
// each non-empty piece through `out`. At most `max` pieces are stored; the
// number stored is returned.
template <class Iterator>
inline size_t tokenize2(char *str, const char *del,
                        Iterator out, size_t max) {
  char *const text_end = str + std::strlen(str);
  const char *const del_end = del + std::strlen(del);

  size_t count = 0;
  char *piece = str;
  while (count < max) {
    char *const stop = std::find_first_of(piece, text_end, del, del_end);
    *stop = '\0';

    // The piece is empty exactly when the delimiter (or the end of the text)
    // sits right at its start.
    if (stop != piece) {
      *out++ = piece;
      ++count;
    }

    if (stop == text_end) break;
    piece = stop + 1;
  }

  return count;
}
207 
// Returns log(exp(x) + exp(y)), computed relative to the larger argument so
// that the exponential cannot overflow. When `flg` is set (init mode) `y` is
// returned unchanged. If the two arguments differ by more than
// MINUS_LOG_EPSILON the smaller one is ignored and the larger is returned.
inline double logsumexp(double x, double y, bool flg) {
#define MINUS_LOG_EPSILON  50

  if (flg) return y;  // init mode

  const double larger = std::max<double>(x, y);
  const double smaller = std::min<double>(x, y);

  // The smaller term is too small to matter next to the larger one.
  if (larger > smaller + MINUS_LOG_EPSILON) return larger;

  return larger + std::log(std::exp(smaller - larger) + 1.0);
}
220 
// Converts `d` into a short cost value: computes -n * d, clamps the result
// to the range [-32767, 32767] and truncates it to a short.
inline short int tocost(double d, int n) {
  const double kUpper = 32767.0;
  const double kLower = -32767.0;

  double cost = -n * d;
  if (kUpper < cost) cost = kUpper;
  if (cost < kLower) cost = kLower;

  return static_cast<short>(cost);
}
230 
// Maps the letter that follows a backslash in an escape sequence to the
// character it stands for: '0' -> NUL, 'a', 'b', 't', 'n', 'v', 'f', 'r' ->
// the corresponding C escapes, 's' -> space, '\\' -> backslash. Any other
// input yields '\0'.
inline char getEscapedChar(const char p) {
  // Parallel tables: kLetters[i] is the escape letter, kValues[i] its value.
  static const char kLetters[] = {
    '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 's', '\\'
  };
  static const char kValues[] = {
    '\0', '\a', '\b', '\t', '\n', '\v', '\f', '\r', ' ', '\\'
  };

  for (size_t i = 0; i < sizeof(kLetters); ++i) {
    if (kLetters[i] == p) return kValues[i];
  }

  return '\0';  // unknown escape letter
}
248 
// Return a 64-bit hash. Two overloads: one takes a character buffer with an
// explicit `size`, the other a std::string. Only declared here; the hash
// algorithm is part of the definition.
uint64_t fingerprint(const char *str, size_t size);
uint64_t fingerprint(const std::string &str);

// Declared only for native Windows builds (_WIN32 without Cygwin).
// NOTE(review): judging by the names these convert between UTF-8 encoded
// std::string and std::wstring -- confirm against the definitions.
#if defined(_WIN32) && !defined(__CYGWIN__)
std::wstring Utf8ToWide(const std::string &input);
std::string WideToUtf8(const std::wstring &input);
#endif
257 }
258 #endif
259