1 /*
2  * Copyright (C) 2008 Red Hat, Inc.
3  *
4  * This is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Library General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU Library General Public
15  * License along with this program; if not, write to the Free Software
16  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17  *
18  * Author(s):
19  * 	Behdad Esfahbod
20  */
21 
22 #include <config.h>
23 
24 #include "vteunistr.h"
25 
26 #include <string.h>
27 
28 
29 /* Overview:
30  *
31  * The way vteunistr is implemented is very simple: Unicode only defines
32  * codepoints less than 0x110000.  That leaves plenty of room in a guint32 to
33  * use for other things.  So, whenever our "string" contains only one Unicode
34  * character, we use its code as our vteunistr.  Otherwise, we register the
35  * string in a central registry and assign a unique number to it and use that.
36  * This number is "our own private internal non-unicode code for this
37  * sequence of characters".
38  *
39  * The rest of the problem would be how to efficiently implement this
40  * registry.  It does *NOT* really have to be efficient at all, as it will
41  * only be accessed in case of combining marks.  And the strings are pretty
42  * short (two or three characters).  But our implementation is quite efficient
43  * anyway.
44  *
45  * The access pattern of using vteunistr's is that we have a vteunistr in a
46  * terminal cell, a new gunichar comes in and we decide to combine with it,
47  * and we combine them and get a new vteunistr.  So, that is exactly how we
48  * encode vteunistr's: all we need to know about a vteunistr to be able to
49  * reconstruct its string is the vteunistr and the gunichar that joined to
50  * form it.  That's what VteUnistrDecomp is.  That is the decomposition.
51  *
52  * We start giving new vteunistr's unique numbers starting at
53  * %VTE_UNISTR_START+1 and going up.  We keep the decompositions in a GArray,
54  * called unistr_decomp.  The first entry of the array is unused (that's why
55  * we start from %VTE_UNISTR_START plus one).  The decomposition table provides
56  * enough information to efficiently answer questions like "what's the first
57  * gunichar in this vteunistr?", "what's the sequence of gunichar's in this
58  * vteunistr?", and "how many gunichar's are there in this vteunistr?".
59  *
60  * We still do not have any efficient way to construct new vteunistr's though.
61  * Given a vteunistr and a gunichar, we have to walk over the entire
62  * decomposition table to see if we have already registered (encoded) this
63  * combination.  To make that operation fast, we use a reverse map, that is,
64  * a GHashTable named unistr_comp.  The hash table maps a decomposition to its
65  * encoded vteunistr value.  The value obviously fits in a pointer and does
66  * not need memory allocation.  We also want to avoid allocating memory for
67  * the key of the hash table entries, as we already have those decompositions
68  * in the memory in the unistr_decomp array.  We cannot use direct pointers
69  * though as when growing, the GArray may resize and move to a new memory
70  * buffer, rendering all our pointers invalid.  For this reason, we keep the
71  * index into the array as our hash table key.  When we want to perform a
72  * lookup in the hash table, we insert the decomposition that we are searching
73  * for as item zero in the unistr_decomp table, then lookup for an equal entry
74  * of it in the hash table.  Finally, if the hash lookup fails, we add the new
75  * decomposition to the lookup array and the hash table, and return the newly
76  * encoded vteunistr value.
77  */
78 
79 #define VTE_UNISTR_START 0x80000000
80 
81 static vteunistr unistr_next = VTE_UNISTR_START + 1;
82 
83 struct VteUnistrDecomp {
84 	vteunistr prefix;
85 	gunichar  suffix;
86 };
87 
88 GArray     *unistr_decomp;
89 GHashTable *unistr_comp;
90 
91 #define DECOMP_FROM_INDEX(i)	g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
92 #define DECOMP_FROM_UNISTR(s)	DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)
93 
94 static guint
unistr_comp_hash(gconstpointer key)95 unistr_comp_hash (gconstpointer key)
96 {
97 	struct VteUnistrDecomp *decomp;
98 	decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
99 	return decomp->prefix ^ decomp->suffix;
100 }
101 
102 static gboolean
unistr_comp_equal(gconstpointer a,gconstpointer b)103 unistr_comp_equal (gconstpointer a, gconstpointer b)
104 {
105 	return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
106 			    &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
107 			    sizeof (struct VteUnistrDecomp));
108 }
109 
110 vteunistr
_vte_unistr_append_unichar(vteunistr s,gunichar c)111 _vte_unistr_append_unichar (vteunistr s, gunichar c)
112 {
113 	struct VteUnistrDecomp decomp;
114 	vteunistr ret = 0;
115 
116 	decomp.prefix = s;
117 	decomp.suffix = c;
118 
119 	if (G_UNLIKELY (!unistr_decomp)) {
120 		unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
121 		g_array_set_size (unistr_decomp, 1);
122 		unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
123 	} else {
124 		DECOMP_FROM_INDEX (0) = decomp;
125 		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
126 	}
127 
128 	if (G_UNLIKELY (!ret)) {
129 		/* sanity check to avoid OOM */
130 		if (G_UNLIKELY (_vte_unistr_strlen (s) > 10 || unistr_next - VTE_UNISTR_START > 100000))
131 			return s;
132 
133 		ret = unistr_next++;
134 		g_array_append_val (unistr_decomp, decomp);
135 		g_hash_table_insert (unistr_comp,
136 				     GUINT_TO_POINTER (ret - VTE_UNISTR_START),
137 				     GUINT_TO_POINTER (ret));
138 	}
139 
140 	return ret;
141 }
142 
143 gunichar
_vte_unistr_get_base(vteunistr s)144 _vte_unistr_get_base (vteunistr s)
145 {
146 	g_return_val_if_fail (s < unistr_next, s);
147 	while (G_UNLIKELY (s >= VTE_UNISTR_START))
148 		s = DECOMP_FROM_UNISTR (s).prefix;
149 	return (gunichar) s;
150 }
151 
152 void
_vte_unistr_append_to_string(vteunistr s,GString * gs)153 _vte_unistr_append_to_string (vteunistr s, GString *gs)
154 {
155 	g_return_if_fail (s < unistr_next);
156 	if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
157 		struct VteUnistrDecomp *decomp;
158 		decomp = &DECOMP_FROM_UNISTR (s);
159 		_vte_unistr_append_to_string (decomp->prefix, gs);
160 		s = decomp->suffix;
161 	}
162 	g_string_append_unichar (gs, (gunichar) s);
163 }
164 
165 int
_vte_unistr_strlen(vteunistr s)166 _vte_unistr_strlen (vteunistr s)
167 {
168 	int len = 1;
169 	g_return_val_if_fail (s < unistr_next, len);
170 	while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
171 		s = DECOMP_FROM_UNISTR (s).prefix;
172 		len++;
173 	}
174 	return len;
175 }
176