1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 2003-2013 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Phong Vo <phongvo@gmail.com> *
18 * *
19 ***********************************************************************/
20 #include "vchdr.h"
21 #include <vctable.h>
22 #include <vcrdb.h>
23
24 /* Transform/compress relational table data.
25 ** This codes the values in each field either by their
26 ** ordinal numbers in some dynamically constructed
27 ** dictionary or by a sequence of integers if the values
28 ** are of the form xxx.yyy.../zzz (the /zzz is optional).
29 ** In this way, the relational table is turned into
30 ** a table with fixed-length rows which is then processed
31 ** by Vctable. The values in dictionaries are collected
32 ** and compressed by Vcbwt.
33 **
34 ** Written by Kiem-Phong Vo (05/01/2008)
35 */
36
37 #define RT_RSEP 1 /* setting fld separ */
38 #define RT_FSEP 2 /* setting rec separ */
39 #define RT_SCHEMA 3 /* defining schema */
40 #define RT_ALIGN 4 /* defining alignment */
41 #define RT_RENEW 5 /* always renew context */
42
43 #define RT_MAXINT (1<<28) /* max codable integer */
44
45 #define RT_SHORT 16 /* code short fld as-is */
46
47 /* codes in persistence data for Rtfld_t.type */
48 #define RT_ISEQ (1<<0) /* integer sequence */
49 #define RT_DASH (1<<1) /* sequence ends with / */
50 #define RT_CLEAR (1<<2) /* clear value set */
51 #define RT_FIXZ (1<<3) /* fixed length field */
52 #define RT_ASIS (1<<4) /* coded as-is in table */
53
54 typedef struct _rtval_s /* mapping value to ordinal */
55 { Dtlink_t olnk; /* search by ordinal */
56 Dtlink_t dlnk; /* search by data */
57 Vcchar_t* data; /* a data field value */
58 ssize_t dtsz; /* length of data */
59 ssize_t ordn; /* Note: ordn >= 1 */
60 } Rtval_t;
61
62 /* A field can be coded either by ordinal numbers of unique
63 ** values or as a sequence of integers if it is of the form
64 ** xxx.yyy.../zzz.
65 */
66 typedef struct _rtfld_s /* all data for a field */
67 { int type; /* field type */
68 Rtval_t** val; /* decoding value list */
69 Dt_t* ddt; /* store values by data */
70 Dt_t* odt; /* store vals by order */
71 ssize_t asis; /* fixed size coding */
72 ssize_t pcnt; /* previous # of values */
73 ssize_t vcnt; /* current # of values */
74 /* or # elts for ISEQ */
75 ssize_t vmax; /* max coding value */
76 } Rtfld_t;
77
78 typedef struct _rtctxt_s /* context of invocation */
79 { Vccontext_t ctxt;
80 int fsep; /* field separator */
81 int rsep; /* record separator */
82 ssize_t algn; /* field alignment */
83 ssize_t recz; /* 0 if not schematic */
84 ssize_t fldn; /* # of fields in use */
85 ssize_t* fldz; /* size of each field */
86 Rtfld_t* fld; /* list of fields */
87 } Rtctxt_t;
88
89 typedef struct _rtable_s
90 { Rtctxt_t* ctxt; /* default context */
91 Vcodex_t* tbl; /* table transform */
92 Vcodex_t* bwt; /* BWT transform */
93 int renew; /* always renew context */
94 int dot; /* the dot character */
95 int dash; /* the dash character */
96 int zero; /* the zero digit */
97 } Rtable_t;
98
99 /* arguments passable to vcopen() */
100 static Vcmtarg_t _Rtargs[] =
101 {
102 { "fsep", "Fields are separated by 'fsep=character'", (Void_t*)RT_FSEP },
103 { "rsep", "Records are separated by 'rsep=character'", (Void_t*)RT_RSEP },
104 { "schema", "Schema is defined as 'schema=[fldlen1,fldlen2,...]'", (Void_t*)RT_SCHEMA },
105 { "align", "Field alignment is given as 'align=value'", (Void_t*)RT_ALIGN },
106 { "renew", "Context renewal defined as 'renew=[01]'", (Void_t*)RT_RENEW },
107 { 0 , "Field and record separators to be computed.", (Void_t*)0 }
108 };
109
110 /* discipline functions and data structures for the CDT library */
111 #if __STD_C
rtdatacmp(Dt_t * dt,void * o1,void * o2,Dtdisc_t * disc)112 static int rtdatacmp(Dt_t* dt, void* o1, void* o2, Dtdisc_t* disc)
113 #else
114 static int rtdatacmp(dt, o1, o2, disc)
115 Dt_t* dt;
116 void* o1;
117 void* o2;
118 Dtdisc_t* disc;
119 #endif
120 {
121 int d;
122 Rtval_t *f1 = (Rtval_t*)o1, *f2 = (Rtval_t*)o2;
123 Vcchar_t *s1 = f1->data, *s2 = f2->data, *ends;
124
125 /* compare two objects by their data */
126 ends = s1 + (f1->dtsz < f2->dtsz ? f1->dtsz : f2->dtsz);
127 for(; s1 < ends; ++s1, ++s2)
128 if((d = (int)s1[0] - (int)s2[0]) != 0 )
129 return d;
130 return f1->dtsz < f2->dtsz ? -1 : f1->dtsz == f2->dtsz ? 0 : 1;
131 }
132
133 #if __STD_C
rtordncmp(Dt_t * dt,void * o1,void * o2,Dtdisc_t * disc)134 static int rtordncmp(Dt_t* dt, void* o1, void* o2, Dtdisc_t* disc)
135 #else
136 static int rtordncmp(dt, o1, o2, disc)
137 Dt_t* dt;
138 void* o1;
139 void* o2;
140 Dtdisc_t* disc;
141 #endif
142 {
143 Rtval_t *f1 = (Rtval_t*)o1, *f2 = (Rtval_t*)o2;
144 return f1->ordn < f2->ordn ? -1 : f1->ordn == f2->ordn ? 0 : 1;
145 }
146
147 #if __STD_C
rthash(Dt_t * dt,void * o,Dtdisc_t * disc)148 static unsigned int rthash(Dt_t* dt, void* o, Dtdisc_t* disc)
149 #else
150 static unsigned int rthash(dt, o, disc)
151 Dt_t* dt;
152 void* o;
153 Dtdisc_t* disc;
154 #endif
155 {
156 Rtval_t *f = (Rtval_t*)o;
157 return f->dtsz == 0 ? 0 : dtstrhash(0, f->data, f->dtsz);
158 }
159
160 #if __STD_C
rtfree(Dt_t * dt,void * o,Dtdisc_t * disc)161 static void rtfree(Dt_t* dt, void* o, Dtdisc_t* disc)
162 #else
163 static void rtfree(dt, o, disc)
164 Dt_t* dt;
165 void* o;
166 Dtdisc_t* disc;
167 #endif
168 {
169 free(o);
170 }
171
172 static Dtdisc_t Rtdata =
173 { 0, 0, DTOFFSET(Rtval_t, dlnk),
174 NIL(Dtmake_f), rtfree,
175 rtdatacmp, rthash,
176 NIL(Dtmemory_f), NIL(Dtevent_f)
177 };
178
179 static Dtdisc_t Rtordn =
180 { 0, 0, DTOFFSET(Rtval_t, olnk),
181 NIL(Dtmake_f), NIL(Dtfree_f),
182 rtordncmp, NIL(Dthash_f),
183 NIL(Dtmemory_f), NIL(Dtevent_f)
184 };
185
186 /* convert a sequence xxx.yyy.../zzz into fixed length integers */
187 #if __STD_C
rta2i(Rtable_t * rt,Rtfld_t * rf,Vcchar_t * dt,ssize_t dz,Vcchar_t * cv,ssize_t cz)188 static ssize_t rta2i(Rtable_t* rt, Rtfld_t* rf, Vcchar_t* dt, ssize_t dz, Vcchar_t* cv, ssize_t cz)
189 #else
190 static ssize_t rta2i(rt, rf, dt, dz, cv, cz)
191 Rtable_t* rt; /* table information */
192 Rtfld_t* rf; /* field in processing */
193 Vcchar_t* dt; /* data to convert */
194 ssize_t dz; /* data length */
195 Vcchar_t* cv; /* buffer to convert to */
196 ssize_t cz; /* buffer length */
197 #endif
198 {
199 Vcchar_t *edt;
200 int zero, nine;
201 ssize_t intv, v, vmax, vcnt, ndash;
202 Vcio_t io;
203
204 if(dz <= 0 || !(rf->type&RT_ISEQ)) /* not convertible */
205 return -1;
206
207 vcioinit(&io, cv, cz);
208 ndash = vcnt = 0; vmax = rf->vmax;
209 zero = rt->zero; nine = zero+9;
210 for(edt = dt+dz; dt < edt; )
211 { intv = *dt++ - zero;
212 if(intv < 0 || intv > 9)
213 return -1; /* must start with a digit */
214 if(intv == 0 && dt < edt && *dt >= zero && *dt <= nine)
215 return -1; /* no leading zero allowed */
216
217 for(; dt < edt; ++dt)
218 { v = *dt - zero;
219 if(v < 0 || v > 9)
220 { if(*dt != rt->dot && *dt != rt->dash)
221 return -1; /* unrecognized character */
222 if(ndash > 0 || (dt+1) >= edt)
223 return -1; /* bad syntax */
224
225 if(*dt == rt->dash)
226 ndash += 1;
227 vcnt += 1;
228
229 dt += 1;
230 break;
231 }
232 intv = intv*10 + v;
233
234 if(intv <= 0 || (cv && intv > vmax) || (!cv && intv >= RT_MAXINT) )
235 return -1; /* value too large */
236 }
237
238 if(cv) /* encoding */
239 { if(vciomore(&io) < vcsizem(vmax))
240 return -1;
241 vcioputm(&io, intv, vmax);
242 }
243 else vmax = vmax > intv ? vmax : intv;
244 }
245
246 vcnt += 1;
247
248 if(rf->vcnt > 0 ) /* make sure format remains exactly the same */
249 { if(vcnt != rf->vcnt) /* element count */
250 return -1;
251 if((rf->type&RT_DASH) && ndash == 0) /* xxx.yyy.../zzz format */
252 return -1;
253 if(!(rf->type&RT_DASH) && ndash > 0)
254 return -1;
255 }
256
257 rf->type |= (ndash ? RT_DASH : 0); /* whether or not there is a dash */
258 rf->vcnt = vcnt; /* sequence count */
259 rf->vmax = vmax > rf->vmax ? vmax : rf->vmax; /* max value to code with */
260 return vciosize(&io);
261 }
262
263 #if __STD_C
rti2a(Rtable_t * rt,Rtfld_t * rf,Vcchar_t * dt,ssize_t dz,Vcchar_t * cv,ssize_t cz)264 static ssize_t rti2a(Rtable_t* rt, Rtfld_t* rf, Vcchar_t* dt, ssize_t dz, Vcchar_t* cv, ssize_t cz)
265 #else
266 static ssize_t rti2a(rt, rf, dt, dz, cv, cz)
267 Rtable_t* rt; /* table information */
268 Rtfld_t* rf; /* field in processing */
269 Vcchar_t* dt; /* data to convert */
270 ssize_t dz; /* data length */
271 Vcchar_t* cv; /* buffer to convert to */
272 ssize_t cz; /* buffer length */
273 #endif
274 {
275 ssize_t k, n;
276 Vcchar_t buf[16], *b, *endb, *c, *endc, zero;
277 Vcio_t io;
278 Vcuint32_t intv, vmax;
279
280 zero = rt->zero; vmax = rf->vmax;
281
282 if(dz <= 0 || (dz%vcsizem(vmax)) != 0 || (dz/vcsizem(vmax)) != rf->vcnt)
283 return -1;
284 vcioinit(&io, dt, dz);
285
286 endb = buf + sizeof(buf);
287 endc = (c = cv) + cz;
288 for(k = dz/vcsizem(vmax);; )
289 { if((intv = vciogetm(&io, vmax)) > vmax)
290 return -1; /* bad encoded value */
291 for(b = endb;; )
292 { *--b = zero + intv%10;
293 if((intv /= 10) == 0)
294 break;
295 }
296 if((n = endb-b) > (endc-c))
297 return -1; /* out of buffer */
298 memcpy(c, b, n); c += n;
299
300 if((k -= 1) == 0)
301 return c-cv;
302
303 if(c >= endc)
304 return -1;
305 *c++ = (k == 1 && (rf->type&RT_DASH)) ? rt->dash : rt->dot;
306 }
307 }
308
309 /* remove a list of values */
310 #if __STD_C
vlclose(Rtval_t ** val,ssize_t vm)311 static void vlclose(Rtval_t** val, ssize_t vm)
312 #else
313 static void vlclose(val, vm)
314 Rtval_t** val; /* list of values */
315 ssize_t vm; /* max index of values */
316 #endif
317 {
318 if(val)
319 { for(; vm >= 0; --vm)
320 if(val[vm])
321 free(val[vm]);
322 free(val);
323 }
324 }
325
326 /* clear field definition and associated data in a context */
327 #if __STD_C
rtctxtclear(Rtctxt_t * rc)328 static void rtctxtclear(Rtctxt_t* rc)
329 #else
330 static void rtctxtclear(rc)
331 Rtctxt_t* rc;
332 #endif
333 {
334 ssize_t k;
335
336 if(rc->fldn > 0)
337 { if(rc->fld)
338 { for(k = 0; k < rc->fldn; ++k)
339 { if(rc->fld[k].ddt)
340 dtclose(rc->fld[k].ddt);
341 if(rc->fld[k].odt)
342 dtclose(rc->fld[k].odt);
343 if(rc->fld[k].val)
344 vlclose(rc->fld[k].val, rc->fld[k].vcnt);
345 }
346 free(rc->fld);
347 }
348
349 if(rc->fldz)
350 free(rc->fldz);
351 }
352
353 rc->fldn = 0;
354 rc->fldz = NIL(ssize_t*);
355 rc->recz = 0;
356 rc->fld = NIL(Rtfld_t*);
357 }
358
359 #if __STD_C
rtable(Vcodex_t * vc,const Void_t * data,size_t size,Void_t ** out)360 static ssize_t rtable(Vcodex_t* vc, const Void_t* data, size_t size, Void_t** out)
361 #else
362 static ssize_t rtable(vc, data, size, out)
363 Vcodex_t* vc;
364 Void_t* data;
365 size_t size;
366 Void_t** out;
367 #endif
368 {
369 Vcchar_t *d, *dt, *edt;
370 ssize_t f, fldn, fmin, recn;
371 ssize_t u, z, tblz, valz, rowz;
372 Rtval_t *vl, val;
373 Vcio_t io;
374 Vcchar_t *output, *valdt, *tbldt;
375 Rtable_t *rt;
376 Rtctxt_t *rc;
377 ssize_t rv = -1;
378
379 if(size <= 0)
380 return 0;
381
382 if(!vc || !data ||
383 !(rt = vcgetmtdata(vc,Rtable_t*)) ||
384 !(rc = vcgetcontext(vc, Rtctxt_t*)) )
385 return -1;
386
387 if(rt->renew) /* clear out dictionaries only */
388 rtctxtclear(rc);
389
390 vc->undone = size; /* start with doing nothing */
391
392 if(rc->recz <= 0 && (rc->rsep <= 0 || rc->fsep <= 0) )
393 { Vcrdformat_t *rdf;
394 if(!(rdf = vcrdformat((Vcchar_t*)data, size, rc->rsep, rc->algn, 1)) )
395 return 0;
396
397 if(rdf->fsep > 0 && rdf->rsep > 0)
398 { rc->fsep = rdf->fsep;
399 rc->rsep = rdf->rsep;
400 }
401 else if(rdf->fldn > 0 && rdf->fldz)
402 { if((rc->fldz = (ssize_t*)malloc(rdf->fldn*sizeof(ssize_t))) )
403 { memcpy(rc->fldz, rdf->fldz, rdf->fldn*sizeof(ssize_t));
404 rc->fldn = rdf->fldn;
405
406 rc->recz = 0; /* compute record length */
407 for(f = 0; f < rc->fldn; ++f)
408 rc->recz += rc->fldz[f];
409 }
410 }
411 free(rdf);
412
413 if(rc->fldn <= 0 && (rc->rsep <= 0 || rc->fsep <= 0) )
414 return 0;
415 }
416
417 /* compute field and record numbers */
418 if(rc->recz > 0) /* schematically defined */
419 { fldn = fmin = rc->fldn;
420 recn = size/rc->recz;
421 size = recn*rc->recz; /* size of data to be processed */
422 }
423 else /* records and fields defined by separators */
424 { if(rc->rsep <= 0) /* not a table */
425 return 0;
426 if(rc->fsep <= 0) /* single field */
427 rc->fsep = rc->rsep;
428
429 recn = fldn = 0; /* count records and number of fields per record */
430 fmin = -1; /* fewest fields in a record */
431 rowz = -1; /* see if this is a fixed length record */
432 for(edt = (dt = (Vcchar_t*)data)+size; dt < edt;)
433 { for(f = 0, d = dt; d < edt; ++d)
434 { if(*d == rc->fsep || *d == rc->rsep)
435 f += 1;
436 if(*d == rc->rsep)
437 break;
438 }
439 if(d >= edt) /* last incomplete record */
440 break;
441
442 if(rowz < 0)
443 rowz = f != 1 ? 0 : (d-dt)+1;
444 else if(rowz > 0)
445 rowz = (f != 1 || (z = (d-dt)+1) != rowz) ? 0 : z;
446
447 if(fmin < 0 || f < fmin) /* smallest # of fields per record */
448 fmin = f;
449 if(f > fldn) /* largest # of fields per record */
450 fldn = f;
451
452 recn += 1; /* count records */
453
454 dt = d+1; /* process next record */
455 }
456 size = dt - (Vcchar_t*)data; /* size of data to be processed */
457
458 if(fldn == 1 && rowz > 0) /* treat this as a fixed-length table */
459 { rtctxtclear(rc);
460 if(!(rc->fldz = (ssize_t*)malloc(sizeof(ssize_t))) )
461 return -1;
462 rc->fldn = 1;
463 rc->fldz[0] = rc->recz = rowz;
464 }
465 }
466 if(fldn <= 0 || recn <= 0)
467 return 0; /* nothing to do */
468 vc->undone -= size; /* a remaining incomplete record, if any */
469
470 if(fldn != rc->fldn) /* records differ from before */
471 { rtctxtclear(rc);
472 rc->fldn = fldn;
473 }
474
475 if(!rc->fld) /* allocate field structures */
476 { if(!(rc->fld = (Rtfld_t*)calloc(fldn, sizeof(Rtfld_t))) )
477 GOTO(done);
478 for(f = 0; f < fldn; ++f)
479 { if(!(rc->fld[f].odt = dtopen(&Rtordn, Dtoset)) )
480 GOTO(done);
481 if(!(rc->fld[f].ddt = dtopen(&Rtdata, Dtset)) )
482 GOTO(done);
483 rc->fld[f].pcnt = rc->fld[f].vcnt = 0;
484 rc->fld[f].vmax = 0;
485 }
486 }
487
488 /* check for fields that will be coded in-place in table */
489 if(rc->recz > 0) /* fields are schematically defined */
490 { for(f = 0; f < fldn; ++f)
491 { rc->fld[f].type = RT_FIXZ;
492 if(rc->fldz[f] <= RT_SHORT || fldn == 1) /* code as-is */
493 { rc->fld[f].type |= RT_ASIS;
494 rc->fld[f].vcnt = rc->fldz[f]; /* size of field */
495 rc->fld[f].vmax = 0xff; /* vcsizem(0xff) == 1, below */
496 }
497 }
498 }
499 else /* using field&record separators */
500 { for(f = 0; f < fldn; ++f)
501 { if(rc->fld[f].pcnt > 0 || fmin != fldn) /* no codable fields */
502 rc->fld[f].type = 0;
503 else
504 { rc->fld[f].type = RT_ASIS|RT_ISEQ;
505 rc->fld[f].asis = -1;
506 }
507 }
508
509 /* check for fields codable in some special way */
510 for(edt = (dt = (Vcchar_t*)data)+size; dt < edt; )
511 { for(u = 0, f = 0;; )
512 { for(d = dt; *d != rc->fsep && *d != rc->rsep; )
513 d += 1;
514 z = d-dt; /* size of field minus separator */
515
516 /* see if codable as a fixed length integer sequence */
517 if(rc->fld[f].type&RT_ISEQ)
518 { if(rta2i(rt, rc->fld+f, dt, z, 0, 0) < 0)
519 { rc->fld[f].type &= ~RT_ISEQ;
520 rc->fld[f].vcnt = rc->fld[f].vmax = 0;
521 }
522 }
523
524 /* see if remaining fixed length */
525 if(rc->fld[f].type&RT_ASIS)
526 { if(rc->fld[f].asis == -1)
527 rc->fld[f].asis = z;
528 else if(z != rc->fld[f].asis)
529 { rc->fld[f].type &= ~RT_ASIS;
530 rc->fld[f].asis = -2;
531 }
532 }
533
534 if(!(rc->fld[f].type & (RT_ISEQ|RT_ASIS)) )
535 u += 1; /* an uncodable field */
536
537 f += 1; /* count field */
538 dt = d+1; /* next field or record */
539 if(*d == rc->rsep) /* end of record */
540 break;
541 } /**/DEBUG_ASSERT(u <= fldn);
542 if(u == fldn) /* no more ISEQ|ASIS fields */
543 break;
544 }
545
546 /* pick between asis and iseq */
547 for(f = 0; f < fldn; ++f)
548 { if((rc->fld[f].type&RT_ISEQ) && (rc->fld[f].type&RT_ASIS) &&
549 rc->fld[f].asis <= (rc->fld[f].vcnt*rc->fld[f].vmax) )
550 rc->fld[f].type &= ~RT_ISEQ;
551
552 if(rc->fld[f].type&RT_ASIS)
553 { /**/DEBUG_ASSERT(!(rc->fld[f].type & RT_ISEQ) );
554 rc->fld[f].vcnt = rc->fld[f].asis;
555 rc->fld[f].vmax = 0xff;
556 }
557 }
558 }
559
560 valz = 0; /* check for new field values and accumulate their total size */
561 for(edt = (dt = (Vcchar_t*)data)+size; dt < edt; )
562 { for(f = 0;; ) /* process a record */
563 { if(rc->fld[f].type&RT_FIXZ) /* fixed length field */
564 d = dt + rc->fldz[f];
565 else for(d = dt; *d != rc->fsep && *d != rc->rsep; )
566 d += 1;
567
568 if(!(rc->fld[f].type&(RT_ASIS|RT_ISEQ)) )
569 { val.data = dt; val.dtsz = d-dt;
570 if(!(vl = dtsearch(rc->fld[f].ddt, &val)) )
571 { /* insert a new value into dictionaries */
572 z = sizeof(Rtval_t) + val.dtsz;
573 if(!(vl = (Rtval_t*)malloc(z)) )
574 GOTO(done);
575 vl->data = (Vcchar_t*)(vl+1);
576 if((vl->dtsz = val.dtsz) > 0)
577 memcpy(vl->data, dt, vl->dtsz);
578 vl->ordn = (rc->fld[f].vcnt += 1);
579
580 valz += vl->dtsz;
581 if(!(rc->fld[f].type&RT_FIXZ) ) /* count separator */
582 valz += sizeof(Vcchar_t);
583
584 dtinsert(rc->fld[f].odt, vl); /* ordinal dictionary */
585 dtinsert(rc->fld[f].ddt, vl); /* data dictionary */
586 }
587 }
588
589 f += 1; /* advance field index */
590 if(rc->fld[f-1].type&RT_FIXZ)
591 { dt = d;
592 if(f >= fldn)
593 break;
594 }
595 else
596 { dt = d+1; /* skip field or record separator */
597 if(*d == rc->rsep)
598 break;
599 }
600 }
601 }
602
603 for(rowz = 0, f = 0; f < fldn; ++f) /* estimate row size of encoding table */
604 { if(rc->fld[f].type&(RT_ISEQ|RT_ASIS)) /* fixed-length coding in table */
605 z = vcsizem(rc->fld[f].vmax) * rc->fld[f].vcnt;
606 else /* code by indices in some value set */
607 { rc->fld[f].vmax = rc->fld[f].vcnt;
608 z = vcsizem(rc->fld[f].vmax);
609 }
610 rowz += z;
611 } /**/DEBUG_PRINT(2,"fldn=%d ",fldn); DEBUG_PRINT(2,"recn=%d ",recn); DEBUG_PRINT(2,"rowz=%d\n",rowz);
612
613 tblz = recn*rowz; /* allocate space for code table */
614 if(!(tbldt = vcbuffer(rt->tbl, NIL(Vcchar_t*), tblz, 0)) )
615 GOTO(done);
616 vcioinit(&io, tbldt, tblz);
617
618 /* build the representation table from indices or fixed-size data */
619 for(edt = (dt = (Vcchar_t*)data)+size; dt < edt; )
620 { for(f = 0;; )
621 { if(rc->fld[f].type&RT_FIXZ) /* fixed length field */
622 d = dt + rc->fldz[f];
623 else for(d = dt; *d != rc->fsep && *d != rc->rsep; )
624 d += 1;
625
626 if(rc->fld[f].type&RT_ASIS) /* code literally */
627 { /**/DEBUG_ASSERT(rc->fld[f].asis == rc->fld[f].vcnt);
628 /**/DEBUG_ASSERT(rc->fld[f].asis == (d-dt));
629 vcioputs(&io, dt, d-dt);
630 }
631 else if(rc->fld[f].type&RT_ISEQ) /* code as an integer sequence */
632 { z = rta2i(rt, rc->fld+f, dt, d-dt, vcionext(&io), vciomore(&io));
633 /**/DEBUG_ASSERT(z == rc->fld[f].vcnt*vcsizem(rc->fld[f].vmax));
634 if(z <= 0)
635 GOTO(done);
636 vcioskip(&io, z);
637 }
638 else /* coding a field value by its ordinal number */
639 { val.data = dt; val.dtsz = d-dt;
640 if(!(vl = dtsearch(rc->fld[f].ddt, &val)) )
641 GOTO(done);
642 /**/DEBUG_ASSERT(vl->ordn <= rc->fld[f].vmax);
643 vcioputm(&io, vl->ordn, rc->fld[f].vmax);
644 }
645
646 f += 1; /* advance field index */
647 if(rc->fld[f-1].type&RT_FIXZ)
648 { dt = d;
649 if(f >= fldn)
650 break;
651 }
652 else
653 { dt = d+1; /* skip separator to start of next field */
654 if(*d == rc->rsep)
655 break;
656 }
657 }
658
659 for(; f < fldn; ++f) /* fill in missing fields with 0's */
660 { /**/DEBUG_ASSERT(rc->fld[f].type == 0);
661 vcioputm(&io, 0, rc->fld[f].vmax);
662 }
663 } /**/DEBUG_ASSERT(vciosize(&io) == tblz);
664
665 /* output new field value data in order of their ordinal numbers */
666 if(!(valdt = vcbuffer(rt->bwt, NIL(Vcchar_t*), valz+1, 0)) )
667 GOTO(done);
668 vcioinit(&io, valdt, valz);
669 for(f = 0; f < fldn; ++f)
670 { if(rc->fld[f].type & (RT_ISEQ|RT_ASIS)) /* data were coded in table */
671 continue;
672
673 val.ordn = rc->fld[f].pcnt+1; /* new data should have ordn > pcnt */
674 for(vl = dtsearch(rc->fld[f].odt,&val); vl; vl = dtnext(rc->fld[f].odt,vl) )
675 { vcioputs(&io, vl->data, vl->dtsz);
676 if(!(rc->fld[f].type&RT_FIXZ) )
677 vcioputc(&io, rc->rsep); /* only rsep is used here */
678 }
679
680 if(rc->fld[f].pcnt == 0) /* dictionary started empty */
681 rc->fld[f].type |= RT_CLEAR;
682
683 rc->fld[f].pcnt = rc->fld[f].vcnt;
684 } /**/DEBUG_ASSERT(vciosize(&io) == valz);
685
686 /**/DEBUG_PRINT(2,"Raw valz=%d\n",valz);
687 if(valz > 0)
688 { rt->bwt->coder = vc->coder; /* transform field values with bwt */
689 if((valz = vcapply(rt->bwt, valdt, valz, &valdt)) < 0 )
690 GOTO(done);
691 /**/DEBUG_PRINT(2, "Processed valz=%d\n", valz);
692 }
693
694 /**/DEBUG_PRINT(2,"Raw tblz=%d\n",tblz);
695 rt->tbl->coder = vc->coder; /* transform table with Vctable */
696 vcsetmtarg(rt->tbl, "columns", (Void_t*)rowz, 2);
697 if((tblz = vcapply(rt->tbl, tbldt, tblz, &tbldt)) < 0 )
698 GOTO(done);
699 /**/DEBUG_PRINT(2, "Processed tblz=%d\n", tblz);
700
701 /* compute size of output data */
702 z = vcsizeu(size) + /* original data size */
703 sizeof(Vcchar_t) + /* the '.' character */
704 sizeof(Vcchar_t) + /* the '/' character */
705 sizeof(Vcchar_t) + /* the '0' character */
706 sizeof(Vcchar_t) + /* record separator */
707 sizeof(Vcchar_t) + /* field separator */
708 vcsizeu(fldn) + /* number of fields per record */
709 vcsizeu(valz) + valz; /* new field values */
710 for(f = 0; f < fldn; ++f) /* field format */
711 { z += sizeof(Vcchar_t) + /* type of field */
712 vcsizeu(rc->fld[f].vmax) + /* max bound of indices */
713 vcsizeu(rc->fld[f].vcnt); /* # of elts in sequence */
714 if(rc->fld[f].type&RT_FIXZ) /* a fixed length field */
715 z += vcsizeu(rc->fldz[f]);
716 }
717 z += vcsizeu(rowz) + /* row size of table data */
718 vcsizeu(tblz) + tblz; /* table data */
719
720 if(!(output = vcbuffer(vc, NIL(Vcchar_t*), z, 0)) )
721 GOTO(done);
722 vcioinit(&io, output, z);
723 vcioputu(&io, size);
724 vcioputc(&io, rt->dot);
725 vcioputc(&io, rt->dash);
726 vcioputc(&io, rt->zero);
727 vcioputc(&io, rc->rsep);
728 vcioputc(&io, rc->fsep);
729 vcioputu(&io, fldn);
730 vcioputu(&io, valz); vcioputs(&io, valdt, valz);
731 for(f = 0; f < fldn; ++f)
732 { vcioputc(&io, rc->fld[f].type);
733 vcioputu(&io, rc->fld[f].vmax);
734 vcioputu(&io, rc->fld[f].vcnt);
735 if(rc->fld[f].type&RT_FIXZ)
736 vcioputu(&io, rc->fldz[f]);
737 }
738 vcioputu(&io, rowz);
739 vcioputu(&io, tblz); vcioputs(&io, tbldt, tblz);
740 /**/DEBUG_ASSERT(vciosize(&io) == z);
741
742 if(out)
743 *out = output;
744 rv = z;
745
746 done: if(rv < 0) /* if an error happened, clear context */
747 rtctxtclear(rc);
748 return rv;
749 }
750
751
752 #if __STD_C
unrtable(Vcodex_t * vc,const Void_t * data,size_t size,Void_t ** out)753 static ssize_t unrtable(Vcodex_t* vc, const Void_t* data, size_t size, Void_t** out)
754 #else
755 static ssize_t unrtable(vc, data, size, out)
756 Vcodex_t* vc;
757 Void_t* data;
758 size_t size;
759 Void_t** out;
760 #endif
761 {
762 ssize_t z, f, v, dtsz, fldn, valz, tblz, rowz;
763 ssize_t type, vmax, vcnt;
764 Vcchar_t *output, *tbldt, *valdt, *dt, *enddt;
765 Rtval_t **val, *vl;
766 Vcio_t io;
767 Rtable_t *rt;
768 Rtctxt_t *rc;
769 ssize_t rv = -1;
770
771 if(size == 0)
772 return 0;
773
774 if(!vc || !data ||
775 !(rt = vcgetmtdata(vc,Rtable_t*)) ||
776 !(rc = vcgetcontext(vc, Rtctxt_t*)) )
777 GOTO(done);
778
779 vcioinit(&io, data, size);
780
781 /* get the size of original data */
782 if(vciomore(&io) <= 0 || (dtsz = vciogetu(&io)) < 0 )
783 GOTO(done);
784
785 /* read the coding characters for integer sequences */
786 if(vciomore(&io) <= 0 || (rt->dot = vciogetc(&io)) < 0 )
787 GOTO(done);
788 if(vciomore(&io) <= 0 || (rt->dash = vciogetc(&io)) < 0 )
789 GOTO(done);
790 if(vciomore(&io) <= 0 || (rt->zero = vciogetc(&io)) < 0 )
791 GOTO(done);
792
793 /* read separators */
794 if(vciomore(&io) <= 0 || (rc->rsep = vciogetc(&io)) < 0 )
795 GOTO(done);
796 if(vciomore(&io) <= 0 || (rc->fsep = vciogetc(&io)) < 0 )
797 GOTO(done);
798
799 /* read field data */
800 if(vciomore(&io) <= 0 || (fldn = vciogetu(&io)) <= 0 )
801 GOTO(done);
802 if(vciomore(&io) <= 0 || (valz = vciogetu(&io)) < 0 )
803 GOTO(done);
804 if(vciomore(&io) < valz)
805 GOTO(done);
806 valdt = vcionext(&io); vcioskip(&io, valz);
807 rt->bwt->coder = vc->coder;
808 if(valz > 0 && (valz = vcapply(rt->bwt, valdt, valz, &valdt)) < 0)
809 GOTO(done);
810
811 if(fldn != rc->fldn || !rc->fld) /* creat arrays of field data */
812 { rtctxtclear(rc);
813 rc->fldn = fldn;
814 if(!(rc->fld = (Rtfld_t*)calloc(fldn, sizeof(Rtfld_t))) ||
815 !(rc->fldz = (ssize_t*)calloc(fldn, sizeof(ssize_t))) )
816 GOTO(done);
817 }
818
819 enddt = valdt + valz; /* reconstruct field data */
820 for(f = 0; f < fldn; ++f)
821 { if(vciomore(&io) <= 0 || (type = vciogetc(&io)) < 0 )
822 GOTO(done);
823 if(vciomore(&io) <= 0 || (vmax = vciogetu(&io)) < 0 )
824 GOTO(done);
825 if(vciomore(&io) <= 0 || (vcnt = vciogetu(&io)) < 0 )
826 GOTO(done);
827 if(type&RT_FIXZ)
828 { if(vciomore(&io) <= 0 || (z = vciogetu(&io)) < 0)
829 GOTO(done);
830 rc->fldz[f] = z;
831 }
832
833 if(type&(RT_CLEAR|RT_ISEQ|RT_ASIS)) /* clear all current values */
834 { if(rc->fld[f].val)
835 vlclose(rc->fld[f].val, rc->fld[f].vcnt);
836 rc->fld[f].val = NIL(Rtval_t**);
837 rc->fld[f].vcnt = 0;
838 }
839
840 rc->fld[f].type = type; /* type of field coding */
841 rc->fld[f].vmax = vmax; /* bound of value index */
842
843 if(type&(RT_ISEQ|RT_ASIS) ) /* values coded directly in table */
844 { rc->fld[f].vcnt = vcnt;
845 continue;
846 }
847
848 if(rc->fld[f].vcnt > 0 && vcnt == rc->fld[f].vcnt)
849 continue; /* no new values */
850
851 /* add new values for this field */
852 if((val = rc->fld[f].val)) /* get space for new values */
853 val = (Rtval_t**)realloc(val,(vcnt+1)*sizeof(Rtval_t*));
854 else val = (Rtval_t**)malloc((vcnt+1)*sizeof(Rtval_t*));
855 if(!val)
856 GOTO(done);
857 rc->fld[f].val = val;
858
859 val[0] = NIL(Rtval_t*); /* values start from location 1 */
860 for(v = rc->fld[f].vcnt+1; v <= vcnt; ++v)
861 { if(type&RT_FIXZ)
862 { dt = valdt + rc->fldz[f];
863 if(dt > enddt)
864 GOTO(done);
865 }
866 else
867 { for(dt = valdt; dt < enddt; ++dt)
868 if(*dt == rc->rsep)
869 break;
870 if(dt >= enddt) /* incomplete data */
871 GOTO(done);
872 }
873
874 if(!(vl = (Rtval_t*)malloc(sizeof(Rtval_t)+(dt-valdt))) )
875 GOTO(done);
876 vl->data = (Vcchar_t*)(vl+1);
877 vl->dtsz = dt - valdt;
878 memcpy(vl->data, valdt, vl->dtsz);
879 val[v] = vl;
880
881 valdt = dt + ((type&RT_FIXZ) ? 0 : 1);
882 }
883 rc->fld[f].vcnt = vcnt; /* vcnt is the index of the last value! */
884 }
885
886 /* get table data */
887 if(vciomore(&io) <= 0 || (rowz = vciogetu(&io)) <= 0 )
888 GOTO(done);
889 if(vciomore(&io) <= 0 || (tblz = vciogetu(&io)) <= 0 )
890 GOTO(done);
891 if(vciomore(&io) < tblz)
892 GOTO(done);
893 tbldt = vcionext(&io); vcioskip(&io, tblz);
894 rt->tbl->coder = vc->coder;
895 if((tblz = vcapply(rt->tbl, tbldt, tblz, &tbldt)) < 0)
896 GOTO(done);
897
898 /* now reconstruct original data */
899 if(!(output = vcbuffer(vc, NIL(Vcchar_t*), dtsz, 0)) )
900 GOTO(done);
901 vcioinit(&io, tbldt, tblz);
902 for(enddt = (dt = output)+dtsz; vciomore(&io) > 0; )
903 { for(f = 0; f < fldn; ++f)
904 { if(rc->fld[f].type&RT_ASIS) /* field's own uncoded data */
905 { if(f > 0 && !(rc->fld[f-1].type&RT_FIXZ))
906 { if(dt >= enddt)
907 GOTO(done);
908 *dt++ = rc->fsep;
909 }
910
911 z = (rc->fld[f].type&RT_FIXZ) ? rc->fldz[f] : rc->fld[f].vcnt;
912 if(z > vciomore(&io) || (enddt-dt) < z)
913 GOTO(done);
914 vciogets(&io, dt, z);
915 dt += z;
916 }
917 else if(rc->fld[f].type&RT_ISEQ) /* field coded as integer sequence */
918 { if(f > 0 && !(rc->fld[f-1].type&RT_FIXZ))
919 { if(dt >= enddt)
920 GOTO(done);
921 *dt++ = rc->fsep;
922 }
923
924 v = rc->fld[f].vcnt * vcsizem(rc->fld[f].vmax);
925 if(vciomore(&io) < v)
926 GOTO(done);
927 valdt = vcionext(&io); vcioskip(&io, v);
928 if((z = rti2a(rt, rc->fld+f, valdt, v, dt, enddt-dt)) < 0)
929 GOTO(done);
930 dt += z;
931 }
932 else /* string data coded by index in value set */
933 { if(vciomore(&io) <= 0 || (v = vciogetm(&io, rc->fld[f].vmax)) < 0)
934 GOTO(done);
935
936 if(v == 0) /* a short record with missing fields */
937 { for(f += 1; f < fldn; ++f) /* skip missing fields */
938 if((v = vciogetm(&io, rc->fld[f].vmax)) != 0)
939 GOTO(done);
940 break;
941 }
942
943 if(f > 0 && !(rc->fld[f-1].type&RT_FIXZ))
944 { if(dt >= enddt)
945 GOTO(done);
946 *dt++ = rc->fsep;
947 }
948
949 if((z = rc->fld[f].val[v]->dtsz) > (enddt-dt) )
950 GOTO(done);
951 memcpy(dt, rc->fld[f].val[v]->data, z);
952 dt += z;
953 }
954 }
955
956 if(!(rc->fld[fldn-1].type&RT_FIXZ) )
957 { if(dt >= enddt)
958 GOTO(done);
959 *dt++ = rc->rsep;
960 }
961 }
962 if(dt != enddt)
963 GOTO(done);
964
965 if(out)
966 *out = output;
967 rv = dtsz;
968
969 done:
970 return rv;
971 }
972
973 #if __STD_C
rtevent(Vcodex_t * vc,int type,Void_t * params)974 static int rtevent(Vcodex_t* vc, int type, Void_t* params)
975 #else
976 static int rtevent(vc, type, params)
977 Vcodex_t* vc;
978 int type;
979 Void_t* params;
980 #endif
981 {
982 Rtable_t *rt;
983 Rtctxt_t *rc;
984 Vcmtarg_t *arg;
985 char *data, val[1024];
986 ssize_t k, fldn, recz, *fldz;
987 int rv = -1;
988
989 if(type == VC_OPENING)
990 { if(!(rt = (Rtable_t*)calloc(1,sizeof(Rtable_t))) )
991 return -1;
992
993 /* handles to transform table and field value data */
994 type = vc->flags & (VC_ENCODE|VC_DECODE);
995 if(!(rt->tbl = vcopen(0, Vctable, 0, 0, type)) )
996 goto do_close;
997 if(!(rt->bwt = vcopen(0, Vcbwt, 0, 0, type)) )
998 goto do_close;
999
1000 /* create default context */
1001 rt->ctxt = (Rtctxt_t*)vcinitcontext(vc, NIL(Vccontext_t*));
1002
1003 /* by default, contexts are maintained across windows */
1004 rt->renew = 0;
1005
1006 /* reference characters for integer sequence conversion */
1007 rt->dot = '.';
1008 rt->dash = '/';
1009 rt->zero = '0';
1010
1011 vcsetmtdata(vc, rt);
1012 goto vc_setarg;
1013 }
1014 else if(type == VC_SETMTARG)
1015 { vc_setarg :
1016 if(!(rc = vcgetcontext(vc, Rtctxt_t*)) )
1017 return -1;
1018 for(data = (char*)params; data; )
1019 { data = vcgetmtarg(data, val, sizeof(val), _Rtargs, &arg);
1020
1021 type = TYPECAST(int,arg->data);
1022 if(type == RT_RSEP || type == RT_FSEP ||
1023 type == RT_ALIGN || type == RT_SCHEMA )
1024 { rtctxtclear(rc);
1025
1026 if(type == RT_ALIGN || type == RT_SCHEMA)
1027 rc->rsep = rc->fsep = 0;
1028 }
1029
1030 switch(TYPECAST(int,arg->data))
1031 { case RT_RENEW :
1032 rt = vcgetmtdata(vc, Rtable_t*);
1033 rt->renew = val[0] == '0' ? 0 : 1;
1034 break;
1035 case RT_RSEP:
1036 rc->rsep = val[0] ? val[0] : 0;
1037 break;
1038 case RT_FSEP:
1039 rc->fsep = val[0] ? val[0] : 0;
1040 break;
1041 case RT_ALIGN:
1042 rc->algn = vcatoi(val);
1043 break;
1044 case RT_SCHEMA:
1045 fldz = NIL(ssize_t*); /* get list of field sizes */
1046 if((fldn = vcstr2list(val, ',', &fldz)) < 0 || (fldn > 0 && !fldz) )
1047 return -1;
1048 for(recz = 0, k = 0; k < fldn; ++k)
1049 { if(fldz[k] <= 0) /* bad schema definition */
1050 { free(fldz);
1051 return -1;
1052 }
1053 recz += fldz[k]; /* count for record length */
1054 }
1055 rc->fldn = fldn;
1056 rc->fldz = fldz;
1057 rc->recz = recz;
1058 break;
1059 }
1060 }
1061 return 0;
1062 }
1063 else if(type == VC_INITCONTEXT) /* create a new context for transformation */
1064 { if(!params)
1065 return 0;
1066
1067 if(!(rc = (Rtctxt_t*)calloc(1,sizeof(Rtctxt_t))) )
1068 return -1;
1069 rc->fsep = 0;
1070 rc->rsep = 0;
1071
1072 *((Rtctxt_t**)params) = rc;
1073 return 1;
1074 }
1075 else if(type == VC_FREECONTEXT) /* delete an existing context */
1076 { if((rc = (Rtctxt_t*)params) )
1077 rtctxtclear(rc);
1078 free(rc);
1079 return 0;
1080 }
1081 else if(type == VC_FREEBUFFER)
1082 { if((rt = vcgetmtdata(vc, Rtable_t*)) )
1083 { if(rt->tbl)
1084 vcbuffer(rt->tbl, NIL(Vcchar_t*), -1, -1);
1085 if(rt->bwt)
1086 vcbuffer(rt->bwt, NIL(Vcchar_t*), -1, -1);
1087 }
1088 return 0;
1089 }
1090 else if(type == VC_CLOSING)
1091 { if(!(rt = vcgetmtdata(vc, Rtable_t*)) )
1092 return -1;
1093 rv = 0;
1094
1095 do_close: /* free any allocated resources */
1096 if(rt->tbl)
1097 { rt->tbl->coder = NIL(Vcodex_t*);
1098 vcclose(rt->tbl);
1099 }
1100 if(rt->bwt)
1101 { rt->bwt->coder = NIL(Vcodex_t*);
1102 vcclose(rt->bwt);
1103 }
1104 free(rt);
1105 vcsetmtdata(vc, NIL(Rtable_t*));
1106
1107 return rv;
1108 }
1109
1110 return 0;
1111 }
1112
1113 Vcmethod_t _Vcrtable =
1114 { rtable,
1115 unrtable,
1116 rtevent,
1117 "rtable", "Transforming/Compressing a relational data table.",
1118 "[-?\n@(#)$Id: vcodex-rtable (AT&T Research) 2011-07-20 $\n]" USAGE_LICENSE,
1119 _Rtargs,
1120 4*1024*1024,
1121 0
1122 };
1123
1124 VCLIB(Vcrtable)
1125