1 /* Sentence handling.
2 Copyright (C) 2015 Free Software Foundation, Inc.
3 Written by Daiki Ueno <ueno@gnu.org>, 2015.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "sentence.h"
24
25 #include <stdlib.h>
26 #include <string.h>
27 #include "unistr.h"
28
29
30 /* The number of consecutive spaces (' ' or U+00A0), after an end marker and
31 any closing punctuation, at which sentence_end reports a match. */
32 int sentence_end_required_spaces = 1;
33
34 /* This function works in a similar way to 'forward-sentence' in
35 Emacs, which basically does a regular expression matching of:
36
37 [.?!\u2026]
38 []"'\u201d)}]*
39 \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)
40
41 Since we are lacking a regular expression routine capable of
42 Unicode (though gnulib-lib/lib/regex.c provides a locale-dependent
43 version, we would rather avoid depending on it), apply a manually
44 constructed DFA, which consists of 8 states where 4 of them are a
45 terminal. */
46 const char *
sentence_end(const char * string,ucs4_t * ending_charp)47 sentence_end (const char *string, ucs4_t *ending_charp)
48 {
49 const char *str = string;
50 const char *str_limit = string + strlen (str);
51 /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal. */
52 int state = 0;
53 /* Previous character before an end marker. */
54 ucs4_t ending_char = 0xfffd;
55 /* Possible starting position of the match, and the next starting
56 position if the current match fails. */
57 const char *match_start = NULL, *match_next = NULL;
58 /* Number of spaces. */
59 int spaces = 0;
60
61 while (str <= str_limit)
62 {
63 ucs4_t uc;
64 size_t length;
65
66 length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
67
68 if (state == 0)
69 {
70 switch (uc)
71 {
72 case '.': case '?': case '!': case 0x2026:
73 state = 1;
74 match_start = str;
75 match_next = str + length;
76 ending_char = uc;
77 spaces = 0;
78 break;
79
80 default:
81 break;
82 }
83
84 str += length;
85 continue;
86 }
87
88 if (state == 1)
89 {
90 switch (uc)
91 {
92 case ']': case '"': case '\'': case ')': case '}': case 0x201d:
93 state = 2;
94 break;
95
96 case '\0': case '\n':
97 /* State 3. */
98 *ending_charp = ending_char;
99 return match_start;
100
101 case ' ': case 0x00a0:
102 if (++spaces == sentence_end_required_spaces)
103 {
104 /* State 7. */
105 *ending_charp = ending_char;
106 return match_start;
107 }
108 state = 4;
109 break;
110
111 case '\t':
112 /* State 5. */
113 *ending_charp = ending_char;
114 return match_start;
115
116 default:
117 str = match_next;
118 state = 0;
119 continue;
120 }
121
122 str += length;
123 continue;
124 }
125
126 if (state == 2)
127 {
128 switch (uc)
129 {
130 case ']': case '"': case '\'': case ')': case '}': case 0x201d:
131 break;
132
133 case '\0': case '\n':
134 /* State 3. */
135 *ending_charp = ending_char;
136 return match_start;
137
138 case ' ': case 0x00a0:
139 if (++spaces == sentence_end_required_spaces)
140 {
141 /* State 7. */
142 *ending_charp = ending_char;
143 return match_start;
144 }
145 state = 4;
146 break;
147
148 case '\t':
149 /* State 5. */
150 *ending_charp = ending_char;
151 return match_start;
152
153 default:
154 state = 0;
155 str = match_next;
156 continue;
157 }
158
159 str += length;
160 continue;
161 }
162
163 if (state == 4)
164 {
165 switch (uc)
166 {
167 case '\0': case '\n':
168 /* State 6. */
169 *ending_charp = ending_char;
170 return match_start;
171
172 case ' ': case 0x00a0:
173 if (++spaces == sentence_end_required_spaces)
174 {
175 /* State 7. */
176 *ending_charp = ending_char;
177 return match_start;
178 }
179 break;
180
181 default:
182 state = 0;
183 str = match_next;
184 continue;
185 }
186
187 str += length;
188 continue;
189 }
190 }
191
192 *ending_charp = 0xfffd;
193 return str_limit;
194 }
195