1 /*
2  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
3  *
4  * Sccsid @(#)colldata.h	1.5 (gritter) 5/1/04
5  */
6 /*  UNIX(R) Regular Expression Library
7  *
8  *  Note: Code is released under the GNU LGPL
9  *
10  *  Copyright (C) 2001 Caldera International, Inc.
11  *
12  *  This library is free software; you can redistribute it and/or
13  *  modify it under the terms of the GNU Lesser General Public
14  *  License as published by the Free Software Foundation; either
15  *  version 2 of the License, or (at your option) any later version.
16  *
17  *  This library is distributed in the hope that it will be useful,
18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  *  Lesser General Public License for more details.
21  *
22  *  You should have received a copy of the GNU Lesser General Public
23  *  License along with this library; if not, write to:
24  *        Free Software Foundation, Inc.
25  *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26  */
27 
28 #ifndef	LIBUXRE_COLLDATA_H
29 #define	LIBUXRE_COLLDATA_H
30 
31 typedef struct	/* header record: offsets and length of the xnd and subnd tables; NOTE(review): presumably the older collation format (cf. the disabled libuxre_old_collate prototype) -- confirm */
32 {
33 	long	coll_offst;	/* offset to xnd table */
34 	long	sub_cnt;	/* length of subnd table (unit not stated here; presumably an entry count -- TODO confirm) */
35 	long	sub_offst;	/* offset to subnd table */
36 	long	str_offst;	/* offset to strings for subnd table */
37 	long	flags;		/* nonzero if regular expressions are used */
38 } hd;
39 
40 typedef struct	/* one entry of the xnd table: four unsigned char fields */
41 {
42 	unsigned char	ch;	/* character, or number of followers (which of the two applies is not visible here) */
43 	unsigned char	pwt;	/* primary weight */
44 	unsigned char	swt;	/* secondary weight */
45 	unsigned char	ns;	/* index of follower state list */
46 } xnd;
47 
48 typedef struct	/* one entry of the subnd table: an expression and the string that replaces it */
49 {
50 	char	*exp;	/* expression to be replaced */
51 	long	explen; /* length of the expression in exp */
52 	char	*repl;	/* replacement string */
53 } subnd;
54 
55 /*----------------------------------*/
56 
57 #include <wcharm.h>
58 #include <limits.h>
59 /*	#include <stdlock.h>	*/
60 
61 /*
62 * Structure of a collation file:
63 *  1. CollHead (maintbl is 0 if CHF_ENCODED)
64 *   if !CHF_ENCODED then
65 *    2. CollElem[bytes] (256 for 8 bit bytes)
66 *    3. if CHF_INDEXED then
67 *	 CollElem[wides] (nmain-256 for 8 bit bytes)
68 *	else
69 *	 CollMult[wides]
70 *    4. CollMult[*] (none if multtbl is 0)
71 *    5. wuchar_type[*] (none if repltbl is 0)
72 *    6. CollSubn[*] (none if subntbl is 0)
73 *    7. strings (first is pathname for .so if CHF_DYNAMIC)
74 *
75 * The actual location of parts 2 through 7 is not important.
76 *
77 * The main table is in encoded value order.
78 *
79 * All indices/offsets must be nonzero to be effective; zero is reserved
80 * to indicate no-such-entry.  This implies either that an unused initial
81 * entry is placed in each of (4) through (7), or that the "start offset"
82 * given by the header is artificially pushed back by an entry size.
83 *
84 * Note that if CHF_ENCODED is not set, then nweight must be positive.
85 *
86 * If an element can begin a multiple character element, it contains a
87 * nonzero multbeg which is the initial index into (4) for its list;
88 * the list is terminated by a CollMult with a ch of zero.
89 *
90 * If there are elements with the same primary weight (weight[1]), then
91 * for each such element, it must have a CollMult list.  The CollMult
92 * that terminates the list (ch==0) notes the lowest and highest basic
93 * weights for those elements with that same primary weight value
94 * respectively in weight[0] and weight[1].  If there are some basic
95 * weights between these values that do not have the same primary
96 * weight--are not in the equivalence class--then the terminator also
97 * has a SUBN_SPECIAL mark.  Note that this list terminator should be
98 * shared when the elements are not multiple character collating
99 * elements because they wouldn't otherwise have a CollMult list.
100 *
101 * WGHT_IGNORE is used to denote ignored collating elements for a
102 * particular collation ordering pass.  All main table entries other
103 * than for '\0' will have a non-WGHT_IGNORE weight[0].  However, it is
104 * possible for CollMult entries from (4) to have a WGHT_IGNORE
105 * weight[0]:  If, for example, "xyz" is a multiple character collating
106 * element, but "xy" is not, then the CollMult for "y" will have a
107 * WGHT_IGNORE weight[0].  Also, WGHT_IGNORE is used to terminate each
108 * list of replacement weights.
109 *
110 * Within (3), it is possible to describe a sequence of unremarkable
111 * collating elements with a single CollMult entry.  If the SUBN_SPECIAL
112 * bit is set, the rest of subnbeg represents the number of collating
113 * elements covered by this entry.  The weight[0] values are determined
114 * by adding the difference between the encoded value and the entry's ch
115 * value to the entry's weight[0].  This value is then substituted for
116 * any weight[n], n>0 that has only the WGHT_SPECIAL bit set. libuxre_collelem()
117 * hides any match to such an entry by filling in a "spare" CollElem.
118 *
119 * If there are substitution strings, then for each character that begins
120 * a string, it has a nonzero subnbeg which is similarly the initial
121 * index into (6).  The indices in (6) refer to offsets within (7).
122 */
123 
124 #define TOPBIT(t)	(((t)1) << (sizeof(t) * CHAR_BIT - 1))	/* value with only the most significant bit of integer type t set; t is a type name, CHAR_BIT comes from <limits.h>; this header applies it to unsigned types only */
125 
126 #define CHF_ENCODED	0x1	/* CollHead.flags bit: collation by encoded values only (maintbl is 0, no tables follow) */
127 #define CHF_INDEXED	0x2	/* CollHead.flags bit: main table indexed by encoded values (wide part is CollElem[], else CollMult[]) */
128 #define CHF_MULTICH	0x4	/* CollHead.flags bit: a multiple-character collating element exists */
129 #define CHF_DYNAMIC	0x8	/* CollHead.flags bit: shared object has collation functions (first string is the .so pathname) */
130 
131 #define CWF_BACKWARD	0x1	/* order[] bit: reversed ordering for this weight */
132 #define CWF_POSITION	0x2	/* order[] bit: this weight takes position into account */
133 
134 #define CLVERS		1	/* most recent version; presumably the value expected in CollHead.version -- TODO confirm */
135 
136 #define WGHT_IGNORE	0	/* weight value: ignore this collating element in this pass; also terminates each list of replacement weights */
137 #define WGHT_SPECIAL	TOPBIT(wuchar_type)	/* top bit of a weight; a weight[n], n>0, with only this bit set is replaced by the computed weight[0] for range entries */
138 #define SUBN_SPECIAL	TOPBIT(unsigned short)	/* top bit of subnbeg; marks a range entry in (3) or a list terminator whose weight range has gaps */
139 
140 #ifndef	COLL_WEIGHTS_MAX	/* fallback only when not already defined (e.g. by <limits.h>, included above) */
141 #define	COLL_WEIGHTS_MAX	1	/* sizes the weight[] and order[] arrays below, so it affects the layout of CollHead, CollElem and struct lc_collate */
142 #endif
143 
144 typedef struct	/* part (1) of a collation file: header locating every other part; a table start of zero means "no such table" */
145 {
146 	unsigned long	maintbl;	/* start of main table (0 if CHF_ENCODED) */
147 	unsigned long	multtbl;	/* start of multi-char table, part (4); 0 if none */
148 	unsigned long	repltbl;	/* start of replacement weights, part (5); 0 if none */
149 	unsigned long	subntbl;	/* start of substitutions, part (6); 0 if none */
150 	unsigned long	strstbl;	/* start of substitution strings, part (7) */
151 	unsigned long	nmain;		/* # entries in main table */
152 	unsigned short	flags;		/* CHF_* bits */
153 	unsigned short	version;	/* format version, to handle future changes (see CLVERS) */
154 	unsigned char	elemsize;	/* # bytes/element (w/padding) */
155 	unsigned char	nweight;	/* # weights/element; must be positive unless CHF_ENCODED is set */
156 	unsigned char	order[COLL_WEIGHTS_MAX]; /* CWF_* bits, one entry per weight */
157 } CollHead;
158 
159 typedef struct	/* one collating element: its weights plus optional links into the multi-char and substitution tables */
160 {
161 	unsigned short	multbeg;	/* nonzero: initial index into the multi-char table (4) for this element's list */
162 	unsigned short	subnbeg;	/* nonzero: initial index into the substitutions (6); may also carry the SUBN_SPECIAL bit */
163 	wuchar_type	weight[COLL_WEIGHTS_MAX];	/* weight[0] is the basic weight, weight[1] the primary weight; WGHT_IGNORE means ignored */
164 } CollElem;
165 
166 typedef struct	/* a CollElem tagged with the character it applies to; used in multi-char lists (4) and, without CHF_INDEXED, for wide characters (3) */
167 {
168 	wchar_t		ch;	/* "this" character (of sequence); a ch of zero terminates a list */
169 	CollElem	elem;	/* its full information */
170 } CollMult;
171 
172 typedef struct	/* one substitution in part (6); its indices refer to offsets within the strings part (7) */
173 {
174 	unsigned short	strbeg;		/* start of match string */
175 	unsigned short	length;		/* length of match string */
176 	unsigned short	repbeg;		/* start of replacement */
177 } CollSubn;
178 
179 struct lc_collate	/* collation state handed to the libuxre_* functions; field names mirror CollHead, but the tables are typed pointers instead of offsets */
180 {
181 	const unsigned char	*strstbl;	/* substitution strings, part (7) */
182 	const wuchar_type	*repltbl;	/* replacement weights, part (5) */
183 	const CollElem		*maintbl;	/* main table, parts (2)/(3) */
184 	const CollMult		*multtbl;	/* multi-char table, part (4) */
185 	const CollSubn		*subntbl;	/* substitutions, part (6) */
186 #ifdef DSHLIB	/* members below exist only when built with DSHLIB; presumably tied to CHF_DYNAMIC -- TODO confirm */
187 	void	*handle;	/* NOTE(review): presumably the loaded shared object's handle -- confirm */
188 	void	(*done)(struct lc_collate *);	/* NOTE(review): presumably a cleanup hook -- confirm */
189 	int	(*strc)(struct lc_collate *, const char *, const char *);	/* signature resembles strcoll() with an added lc_collate argument */
190 	int	(*wcsc)(struct lc_collate *, const wchar_t *, const wchar_t *);	/* signature resembles wcscoll() with an added lc_collate argument */
191 	size_t	(*strx)(struct lc_collate *, char *, const char *, size_t);	/* signature resembles strxfrm() with an added lc_collate argument */
192 	size_t	(*wcsx)(struct lc_collate *, wchar_t *, const wchar_t *, size_t);	/* signature resembles wcsxfrm() with an added lc_collate argument */
193 #endif
194 	const char		*mapobj;	/* NOTE(review): presumably the mapped collation file -- confirm */
195 	size_t			mapsize;	/* NOTE(review): presumably the size of mapobj in bytes -- confirm */
196 	unsigned long		nmain;		/* same name and type as CollHead.nmain (# entries in main table) */
197 	short			nuse;		/* NOTE(review): presumably a use count -- confirm */
198 	unsigned short		flags;		/* same name and type as CollHead.flags (CHF_* bits) */
199 	unsigned char		elemsize;	/* same name and type as CollHead.elemsize (# bytes/element) */
200 	unsigned char		nweight;	/* same name and type as CollHead.nweight (# weights/element) */
201 	unsigned char		order[COLL_WEIGHTS_MAX];	/* same name and type as CollHead.order (CWF_* bits per weight) */
202 };
203 
204 #define ELEM_BADCHAR	((CollElem *)0)	/* sentinel CollElem pointer (null); presumably returned for an invalid character -- TODO confirm */
205 #define ELEM_ENCODED	((CollElem *)-1)	/* sentinel CollElem pointer (-1, never dereferenceable); presumably means "collate by encoded value" -- TODO confirm */
206 
207 /* Prototypes disabled in this header (left commented out):
208 LIBUXRE_STATIC int	libuxre_old_collate(struct lc_collate *);
209 LIBUXRE_STATIC int	libuxre_strqcoll(struct lc_collate *, const char *,
210 				const char *);
211 LIBUXRE_STATIC int	libuxre_wcsqcoll(struct lc_collate *, const wchar_t *,
212 				const wchar_t *);
213 */
214 extern struct lc_collate *libuxre_lc_collate(struct lc_collate *);	/* takes and returns a struct lc_collate *; semantics are not visible in this header */
215 LIBUXRE_STATIC const CollElem	*libuxre_collelem(struct lc_collate *,	/* LIBUXRE_STATIC is not defined in this header; it must be defined before inclusion */
216 					CollElem *, wchar_t);	/* per the format notes above, fills in a "spare" CollElem for range entries; presumably the CollElem * argument -- TODO confirm */
217 LIBUXRE_STATIC const CollElem	*libuxre_collmult(struct lc_collate *,
218 					const CollElem *, wchar_t);	/* NOTE(review): presumably follows the element's multi-char list for the given character -- confirm */
219 /* Also disabled (left commented out):
220 LIBUXRE_STATIC const CollElem	*libuxre_collmbs(struct lc_collate *,
221 					CollElem *, const unsigned char **);
222 LIBUXRE_STATIC const CollElem	*libuxre_collwcs(struct lc_collate *,
223 					CollElem *, const wchar_t **);
224 */
225 
226 #endif	/* !LIBUXRE_COLLDATA_H */
227