1 /* Pango
2  * testboundaries.c: Test text boundary algorithms
3  *
4  * Copyright (C) 1999-2000 Red Hat Software
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public
17  * License along with this library; if not, write to the
18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19  * Boston, MA 02111-1307, USA.
20  */
21 
22 #include <string.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 
26 #include <glib.h>
27 #include <pango/pango.h>
28 
29 #ifndef G_OS_WIN32
30 #include <unistd.h>
31 #endif
32 
/* printf-style conversion used when a character (gunichar) is reported in a
 * test message: hexadecimal in alternate form ('#', i.e. "0x"-prefixed for
 * non-zero values), zero-padded to a minimum field width of six.
 */
#define CHFORMAT "%0#6x"
34 
/* FIXME: for now this only tests that the breaking of some sample
 * text conforms to certain rules and invariants. Eventually we
 * should also have test-result pairs, i.e. a string plus some
 * encoding of the correct way to break it, so that we can check
 * more precisely that things worked.
 */
41 
42 
43 static int offset = 0;
44 static int line = 0;
45 static gunichar current_wc = 0;
46 static const char *line_start = NULL;
47 static const char *line_end = NULL;
48 
49 typedef void (* CharForeachFunc) (gunichar      wc,
50 				  gunichar      prev_wc,
51 				  gunichar      next_wc,
52 				  GUnicodeType  type,
53 				  GUnicodeType  prev_type,
54 				  GUnicodeType  next_type,
55 				  PangoLogAttr *attr,
56 				  PangoLogAttr *prev_attr,
57 				  PangoLogAttr *next_attr,
58 				  gpointer      data);
59 
60 static void
log_attr_foreach(const char * text,PangoLogAttr * attrs,CharForeachFunc func,gpointer data)61 log_attr_foreach (const char     *text,
62 		  PangoLogAttr   *attrs,
63 		  CharForeachFunc func,
64 		  gpointer        data)
65 {
66   const gchar *next = text;
67   gint length = strlen (text);
68   const gchar *end = text + length;
69   gint i = 0;
70   gunichar prev_wc;
71   gunichar next_wc;
72   GUnicodeType prev_type;
73   GUnicodeType next_type;
74 
75   if (next == end)
76     return;
77 
78   offset = 0;
79   line = 1;
80 
81   prev_type = (GUnicodeType) -1;
82   prev_wc = 0;
83 
84   next_wc = g_utf8_get_char (next);
85   next_type = g_unichar_type (next_wc);
86 
87   line_start = text;
88   line_end = text;
89 
90   while (next_wc != 0)
91     {
92       GUnicodeType type;
93       gunichar wc;
94 
95       wc = next_wc;
96       type = next_type;
97 
98       current_wc = wc;
99 
100       next = g_utf8_next_char (next);
101       line_end = next;
102 
103       if (next >= end)
104 	next_wc = 0;
105       else
106 	next_wc = g_utf8_get_char (next);
107 
108       if (next_wc)
109 	next_type = g_unichar_type (next_wc);
110 
111       (* func) (wc, prev_wc, next_wc,
112 		type, prev_type, next_type,
113 		&attrs[i],
114 		i != 0 ? &attrs[i-1] : NULL,
115 		next_wc != 0 ? &attrs[i+1] : NULL,
116 		data);
117 
118       prev_type = type;
119       prev_wc = wc;
120       ++i;
121       ++offset;
122       if (wc == '\n')
123 	{
124 	  ++line;
125 	  offset = 0;
126 	  line_start = next;
127 	  line_end = next;
128 	}
129     }
130 }
131 
132 static void
check_line_char(gunichar wc,gunichar prev_wc,gunichar next_wc,GUnicodeType type,GUnicodeType prev_type,GUnicodeType next_type,PangoLogAttr * attr,PangoLogAttr * prev_attr,PangoLogAttr * next_attr,gpointer data)133 check_line_char (gunichar      wc,
134 		 gunichar      prev_wc,
135 		 gunichar      next_wc,
136 		 GUnicodeType  type,
137 		 GUnicodeType  prev_type,
138 		 GUnicodeType  next_type,
139 		 PangoLogAttr *attr,
140 		 PangoLogAttr *prev_attr,
141 		 PangoLogAttr *next_attr,
142 		 gpointer      data)
143 {
144   GUnicodeBreakType break_type;
145   GUnicodeBreakType prev_break_type;
146 
147   break_type = g_unichar_break_type (wc);
148   if (prev_wc)
149     prev_break_type = g_unichar_break_type (prev_wc);
150   else
151     prev_break_type = G_UNICODE_BREAK_UNKNOWN;
152 
153   if (wc == '\n')
154     {
155       if (prev_wc == '\r')
156 	{
157           g_test_message ("Do not line break between \\r and \\n");
158           g_assert_false (attr->is_line_break);
159 	}
160 
161       if (next_attr != NULL)
162         {
163           g_test_message ("Line break after \\n");
164           g_assert_true (next_attr->is_line_break);
165 	}
166     }
167 
168   if (attr->is_line_break)
169     {
170       g_test_message ("first char in string should not be marked as a line break");
171       g_assert_false (prev_wc == 0);
172     }
173 
174   if (break_type == G_UNICODE_BREAK_SPACE)
175     {
176       g_test_message ("can't break lines before a space unless a mandatory break char precedes it or a combining mark follows; prev char was: " CHFORMAT, prev_wc);
177       g_assert_false (attr->is_line_break && prev_attr != NULL &&
178                       !attr->is_mandatory_break &&
179                       !(next_wc && g_unichar_break_type (next_wc) == G_UNICODE_BREAK_COMBINING_MARK));
180     }
181 
182   if (attr->is_mandatory_break)
183     {
184       g_test_message ("mandatory breaks must also be marked as regular breaks");
185       g_assert_true (attr->is_line_break);
186     }
187 
188 
189   /* FIXME use the break tables from break.c to automatically
190    * check invariants for each cell in the table. Shouldn't
191    * be that hard to do.
192    */
193 
194   g_test_message ("can't break between two open punctuation chars");
195   g_assert_false (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
196                   prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
197                   attr->is_line_break &&
198                   !attr->is_mandatory_break);
199 
200   g_test_message ("can't break between two close punctuation chars");
201   g_assert_false (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
202                   prev_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
203                   attr->is_line_break &&
204                   !attr->is_mandatory_break);
205 
206   g_test_message ("can't break letter-quotemark sequence");
207   g_assert_false (break_type == G_UNICODE_BREAK_QUOTATION &&
208                   prev_break_type == G_UNICODE_BREAK_ALPHABETIC &&
209                   attr->is_line_break &&
210                   !attr->is_mandatory_break);
211 }
212 
213 static void
check_line_invariants(const char * text,PangoLogAttr * attrs)214 check_line_invariants (const char   *text,
215 		       PangoLogAttr *attrs)
216 {
217   log_attr_foreach (text, attrs, check_line_char, NULL);
218 }
219 
220 static void
check_word_invariants(const char * text,PangoLogAttr * attrs)221 check_word_invariants (const char   *text,
222 		       PangoLogAttr *attrs)
223 {
224 
225 
226 }
227 
228 static void
check_sentence_invariants(const char * text,PangoLogAttr * attrs)229 check_sentence_invariants (const char   *text,
230 			   PangoLogAttr *attrs)
231 {
232 
233 
234 }
235 
236 static void
check_grapheme_invariants(const char * text,PangoLogAttr * attrs)237 check_grapheme_invariants (const char   *text,
238 			   PangoLogAttr *attrs)
239 {
240 
241 
242 }
243 
#if 0
/* Debugging aid (compiled out): prints @text to stdout split at the
 * positions that @attrs marks as sentence boundaries, one piece per line.
 *
 * @text:  NUL-terminated UTF-8 text.
 * @attrs: log attributes indexed by character position.
 *
 * Each time a boundary is reached, the text between the previous boundary
 * and the current position is printed.  The loop stops at the terminating
 * NUL, so whatever follows the last boundary is printed after the loop;
 * otherwise the final sentence would never be shown.
 */
static void
print_sentences (const char   *text,
                 PangoLogAttr *attrs)
{
  const char *last = text;
  const char *p;
  int i = 0;

  for (p = text; *p != '\0'; p = g_utf8_next_char (p), ++i)
    {
      if (attrs[i].is_sentence_boundary)
        {
          char *s = g_strndup (last, p - last);

          printf ("%s\n", s);
          g_free (s);
          last = p;
        }
    }

  /* Flush the text after the last boundary; last is NUL-terminated here. */
  if (p != last)
    printf ("%s\n", last);
}
#endif
273 
274 static void
check_invariants(const char * text)275 check_invariants (const char *text)
276 {
277   int len;
278   PangoLogAttr *attrs;
279 
280   g_assert_true (g_utf8_validate (text, -1, NULL));
281 
282   len = g_utf8_strlen (text, -1);
283   attrs = g_new0 (PangoLogAttr, len + 1);
284 
285   pango_get_log_attrs (text,
286 		       -1,
287 		       0,
288 		       pango_language_from_string ("C"),
289 		       attrs,
290 		       len + 1);
291 
292   check_line_invariants (text, attrs);
293   check_sentence_invariants (text, attrs);
294   check_grapheme_invariants (text, attrs);
295   check_word_invariants (text, attrs);
296 
297 #if 0
298   print_sentences (text, attrs);
299 #endif
300 
301   g_free (attrs);
302 }
303 
304 static void
test_boundaries(void)305 test_boundaries (void)
306 {
307   const char *filename;
308   GError *error = NULL;
309   char *text;
310 
311   filename = g_test_get_filename (G_TEST_DIST, "boundaries.utf8", NULL);
312 
313   g_test_message ("sample file: %s\n", filename);
314 
315   g_file_get_contents (filename, &text, NULL, &error);
316   g_assert_no_error (error);
317 
318   check_invariants (text);
319 
320   g_free (text);
321 }
322 
323 int
main(int argc,char * argv[])324 main (int argc, char *argv[])
325 {
326   g_test_init (&argc, &argv, NULL);
327 
328   g_test_add_func ("/text/boundaries", test_boundaries);
329 
330   return g_test_run ();
331 }
332