1 
2 /******************************************************************************
3 * MODULE     : hyphenate.cpp
4 * DESCRIPTION: hyphenation by Liang's algorithm
5 * COPYRIGHT  : (C) 1999  Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11 
12 #include "file.hpp"
13 #include "hyphenate.hpp"
14 #include "analyze.hpp"
15 #include "converter.hpp"
16 
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 
21 #define MAX_SEARCH 10
22 #define MAX_BUFFER_SIZE 256
23 
24 /*
25 static bool
26 my_strncmp (char* s1, char* s2, int len) {
27   int i;
28   for (i=0; i<len; i++) if (s1[i]!=s2[i]) return false;
29   return true;
30 }
31 */
32 
33 static string
unpattern(string s)34 unpattern (string s) {
35   int i, n= N(s);
36   string r;
37   for (i=0; i<n; ) {
38     while ((i<n) && (s[i]>='0') && (s[i]<='9')) i++;
39     if (i<n) r << s[i++];
40   }
41   return r;
42 }
43 
44 static string
hyphen_normalize(string s)45 hyphen_normalize (string s) {
46   int i;
47   string r (0);
48   for (i=0; i<N(s); i++)
49     if ((i+3<N(s)) && (s[i]=='^') && (s[i+1]=='^')) {
50       r << from_hexadecimal (s (i+2, i+4));
51       i+=3;
52     }
53     else r << s[i];
54   return r;
55 }
56 
57 void
load_hyphen_tables(string file_name,hashmap<string,string> & patterns,hashmap<string,string> & hyphenations,bool toCork)58 load_hyphen_tables (string file_name,
59                     hashmap<string,string>& patterns,
60                     hashmap<string,string>& hyphenations, bool toCork) {
61   string s;
62   file_name= string ("hyphen.") * file_name;
63   load_string (url ("$TEXMACS_PATH/langs/natural/hyphen", file_name), s, true);
64   if (DEBUG_VERBOSE)
65     debug_automatic << "TeXmacs] Loading " << file_name << "\n";
66 
67   if (toCork) s= utf8_to_cork (s);
68 
69   hashmap<string,string> H ("?");
70   bool pattern_flag=false;
71   bool hyphenation_flag=false;
72   int i=0, n= N(s);
73   while (i<n) {
74     string buffer;
75     while ((i<n) && (s[i]!=' ') && (s[i]!='\t') && (s[i]!='\n') && (s[i]!='\r')) {
76       if (s[i] != '%') buffer << s[i++];
77       else while ((i<n) && (s[i]!='\n')) i++;
78     }
79     if (i<n) i++;
80     if (buffer == "}") {
81       pattern_flag=false;
82       hyphenation_flag=false;
83     }
84     if (pattern_flag && i != 0 && N(buffer) != 0) {
85       string norm= hyphen_normalize (buffer);
86       patterns (unpattern (norm))= norm;
87       //cout << unpattern (norm) << " ==> " << norm << "\n";
88     }
89     if (hyphenation_flag && i != 0 && N(buffer) != 0) {
90       string word= replace (buffer, "-", "");
91       hyphenations (word)= buffer;
92       //cout << word << " --> " << buffer << "\n";
93     }
94     if (buffer == "\\patterns{") pattern_flag=true;
95     if (buffer == "\\hyphenation{") hyphenation_flag=true;
96   }
97 }
98 
99 static string
lower_case(string s)100 lower_case (string s) {
101   int i;
102   string r (N(s));
103   for (i=0; i<N(s); i++) {
104     if ((s[i]>='A') && (s[i]<='Z'))
105       r[i]= (char) (((int) s[i])+ ((int) 'a')- ((int) 'A'));
106     else r[i]=s[i];
107   }
108   return r;
109 }
110 
111 array<int>
get_hyphens(string s,hashmap<string,string> patterns,hashmap<string,string> hyphenations)112 get_hyphens (string s,
113              hashmap<string,string> patterns,
114              hashmap<string,string> hyphenations) {
115   return get_hyphens (s, patterns, hyphenations, false);
116 }
117 
118 void
goto_next_char(string s,int & i,bool utf8)119 goto_next_char (string s, int &i, bool utf8) {
120   if (utf8) decode_from_utf8 (s, i);
121   else if (i < N(s)) {
122     if (s[i] == '<') {
123       i++;
124       while (i < N(s) && s[i] != '>') i++;
125       if (i < N(s)) i++;
126     }
127     else i++;
128   }
129 }
130 
131 int
str_length(string s,bool utf8)132 str_length (string s, bool utf8) {
133   if (utf8) {
134     int i=0, r=0;
135     while (i < N(s)) {
136       decode_from_utf8 (s, i);
137       r++;
138     }
139     return r;
140   }
141   else return N(s);
142 }
143 
144 array<int>
get_hyphens(string s,hashmap<string,string> patterns,hashmap<string,string> hyphenations,bool utf8)145 get_hyphens (string s,
146              hashmap<string,string> patterns,
147              hashmap<string,string> hyphenations, bool utf8) {
148   ASSERT (N(s) != 0, "hyphenation of empty string");
149 
150   if (utf8) s= cork_to_utf8 (s);
151 
152   if (hyphenations->contains (s)) {
153     string h= hyphenations [s];
154     array<int> penalty (str_length (s, utf8)-1);
155     int i=0, j=0;
156     while (h[j] == '-') j++;
157     i++; goto_next_char (h, j, utf8);
158     while (i < N(penalty)+1) {
159       penalty[i-1]= HYPH_INVALID;
160       while (j < N(h) && h[j] == '-') {
161         penalty[i-1]= HYPH_STD;
162         j++;
163       }
164       i++;
165       goto_next_char (h, j, utf8);
166     }
167     //cout << s << " --> " << penalty << "\n";
168     return penalty;
169   }
170   else {
171     s= "." * lower_case (s) * ".";
172     // cout << s << "\n";
173     int i, j, k, l, m, len;
174     array<int> T (str_length (s, utf8)+1);
175     for (i=0; i<N(T); i++) T[i]=0;
176     for (len=1; len < MAX_SEARCH; len++)
177       for (i=0, l=0; i<N(s) - len; goto_next_char (s, i, utf8), l++) {
178         string r= patterns [s (i, i+len)];
179         if (!(r == "?")) {
180           // cout << "  " << s (i, i+len) << " => " << r << "\n";
181           for (j=0, k=0; j<=len; j++, k++) {
182             if ((k<N(r)) && (r[k]>='0') && (r[k]<='9')) {
183               m=((int) r[k])-((int) '0');
184               k++;
185             }
186             else m=0;
187             if (m>T[l+j]) T[l+j]=m;
188           }
189         }
190       }
191 
192     array<int> penalty (N(T)-4);
193     for (i=2; i < N(T)-4; i++)
194       penalty [i-2]= (((T[i]&1)==1)? HYPH_STD: HYPH_INVALID);
195     if (N(penalty)>0) penalty[0] = penalty[N(penalty)-1] = HYPH_INVALID;
196     if (N(penalty)>1) penalty[1] = penalty[N(penalty)-2] = HYPH_INVALID;
197     if (N(penalty)>2) penalty[N(penalty)-3] = HYPH_INVALID;
198     // cout << s << " --> " << penalty << "\n";
199     return penalty;
200   }
201 }
202 
203 void
std_hyphenate(string s,int after,string & left,string & right,int penalty)204 std_hyphenate (string s, int after, string& left, string& right, int penalty) {
205   std_hyphenate (s, after, left, right, penalty, false);
206 }
207 
208 void
std_hyphenate(string s,int after,string & left,string & right,int penalty,bool utf8)209 std_hyphenate (string s, int after, string& left, string& right, int penalty,
210                bool utf8) {
211   //cout << "Hyphen " << s << ", " << after << "\n";
212   if (!utf8) {
213     left = s (0, after+1);
214     right= s (after+1, N(s));
215   }
216   else {
217     int i= 0, l= 0;
218     while (i < N(s) && l <= after) {
219       if (s[i] == '<') {
220         while (i < N(s) && s[i] != '>') i++;
221         if (i < N(s)) i++;
222       }
223       else i++;
224       l++;
225     }
226     left = s (0, i);
227     right= s (i, N(s));
228     if (i == N(s)) return;
229   }
230   if (penalty >= HYPH_INVALID) left << string ("\\");
231   else left << string ("-");
232   //cout << "Yields " << left << ", " << right << "\n";
233 }
234