1 /* Copyright(C) 2004 Brazil
2
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17 #include "senna_in.h"
18 #include <fcntl.h>
19 #include <string.h>
20 #include <sys/stat.h>
21
22 #include "str.h"
23 #include "sym.h"
24 #include "inv.h"
25
/* In-memory handle of an inverted index (0.8 on-disk format). */
struct _sen_inv {
  uint8_t v08p;                  /* set to 1 by sen_inv_create08 */
  sen_io *seg;                   /* io object holding array/buffer segments */
  sen_io *chunk;                 /* io object holding chunk data (file "<path>.c") */
  sen_sym *lexicon;              /* lexicon supplied by the caller at create/open */
  struct sen_inv_header *header; /* header area of `seg` (sen_io_header) */
#ifdef USE_QUERY_ABORT
  int (*check_abort)(void*);
  void *check_abort_arg;
#endif /* USE_QUERY_ABORT */
  uint32_t total_chunk_size;     /* running sum of (chunk_size >> 10), maintained by buffer_flush */
  uint16_t ainfo[SEN_INV_MAX_SEGMENT]; /* logical array segment -> physical segment, or SEG_NOT_ASSIGNED */
  uint16_t binfo[SEN_INV_MAX_SEGMENT]; /* logical buffer segment -> physical segment, or SEG_NOT_ASSIGNED */
  uint16_t amax;                 /* highest logical array segment number seen so far */
  uint16_t bmax;                 /* highest logical buffer segment number seen so far */
};
42
/*
 * Cursor state for iterating postings of one term.
 * NOTE(review): none of these fields is used in the part of the file visible
 * here; the per-field notes are inferred from names/types and need confirming
 * against the cursor functions.
 */
typedef struct {
  sen_inv *inv;
  sen_inv_posting pc;        /* presumably the current posting read from the chunk */
  sen_inv_posting pb;        /* presumably the current posting read from the buffer */
  sen_inv_posting *post;     /* presumably points at pc or pb, whichever is current */
  uint8_t *cp;               /* presumably read position inside the chunk data */
  uint8_t *cpe;              /* presumably end of the chunk data */
  uint8_t *bp;               /* presumably read position inside a buffer record */
  sen_io_win iw;
  struct sen_inv_buffer *buf;
  uint16_t stat;
  uint16_t nextb;            /* presumably position of the next buffer record */
  uint32_t buffer_pos;
} sen_inv_cursor08;
57
/*
 * Header stored in the header area of the segment io file.
 * sen_inv_create08 reserves sizeof(struct sen_inv_header) + max_chunk bytes,
 * so `chunks` is really a variable-length byte map indexed by chunk number.
 */
struct sen_inv_header {
  char idstr[16];                 /* SEN_INV_IDSTR is copied here on create */
  uint32_t initial_n_segments;
  // todo: initial_n_segments should be uint16_t
  // uint32_t total_chunk_size; todo: should be added when index format changed
  uint16_t segments[SEN_INV_MAX_SEGMENT]; /* per physical segment: 0 = free, else (type | logical segno) */
  // todo: exchange segments and ainfo,binfo.
  uint8_t chunks[1]; /* dummy */  /* non-zero entry = chunk in use (see chunk_new/chunk_free) */
};
67
/* Format identifier copied into sen_inv_header.idstr (15 chars + NUL). */
#define SEN_INV_IDSTR "SENNA:INV:00.00"
/* Size in bytes of one array/buffer segment. */
#define SEN_INV_SEGMENT_SIZE 0x40000
/* SEN_INV_MAX_SEGMENT == 0x10000 >> 2 */
/* Allocation unit, in bytes, of the chunk file. */
#define SEN_INV_CHUNK_SIZE 0x40000
#define N_CHUNKS_PER_FILE (SEN_IO_FILE_SIZE / SEN_INV_CHUNK_SIZE)
/* log2(SEN_INV_SEGMENT_SIZE): a buffer position >> W_OF_SEGMENT is its logical segment. */
#define W_OF_SEGMENT 18
/* log2 of the number of uint32_t array entries held by one segment. */
#define W_OF_ARRAY (W_OF_SEGMENT - 2)
#define ARRAY_MASK_IN_A_SEGMENT ((SEN_INV_SEGMENT_SIZE >> 2) - 1)
#define BUFFER_MASK_IN_A_SEGMENT (SEN_INV_SEGMENT_SIZE - 1)
/* Sentinels for "no chunk" / "no physical segment". */
#define CHUNK_NOT_ASSIGNED 0xffffffff
#define SEG_NOT_ASSIGNED 0xffff

/* Type flags OR-ed with the logical segment number in header->segments[]. */
#define SEGMENT_ARRAY 0x8000
#define SEGMENT_BUFFER 0x4000
#define SEGMENT_MASK (SEN_INV_MAX_SEGMENT - 1)

/*
 * Bit-field extractors: bits 1..11 and bits 12..31 of x.
 * The argument is parenthesized so that expressions such as `a | b` are
 * shifted as a whole rather than having only their last operand shifted.
 */
#define BIT11_01(x) (((x) >> 1) & 0x7ff)
#define BIT31_12(x) ((x) >> 12)

/* Default for initial_n_segments when the caller passes 0. */
#define SEN_INV_INITIAL_N_SEGMENTS 512
/* Number of chunk slots reserved per initial segment. */
#define MAX_CHUNK_RATIO 64

/* Address of the first byte following the object p points to. */
#define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p))
91
92 /* segment */
93
94 inline static sen_rc
segment_get(sen_inv * inv,uint16_t type,uint16_t segno,uint16_t * pseg)95 segment_get(sen_inv *inv, uint16_t type, uint16_t segno, uint16_t *pseg)
96 {
97 uint16_t s, i, empty = SEN_INV_MAX_SEGMENT;
98 for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
99 if ((s = inv->header->segments[i])) {
100 if (s == (type | segno)) { break; }
101 } else {
102 if (empty == SEN_INV_MAX_SEGMENT) { empty = i; }
103 }
104 }
105 if (i == SEN_INV_MAX_SEGMENT) {
106 void *p = NULL;
107 if (empty == SEN_INV_MAX_SEGMENT) { return sen_memory_exhausted; }
108 inv->header->segments[empty] = type | segno;
109 SEN_IO_SEG_REF(inv->seg, empty, p);
110 if (!p) { return sen_memory_exhausted; }
111 memset(p, 0, SEN_INV_SEGMENT_SIZE);
112 SEN_IO_SEG_UNREF(inv->seg, empty);
113 *pseg = empty;
114 } else {
115 *pseg = i;
116 }
117 return sen_success;
118 }
119
120 inline static sen_rc
segment_new(sen_inv * inv,uint16_t type,uint16_t * segno)121 segment_new(sen_inv *inv, uint16_t type, uint16_t *segno)
122 {
123 sen_rc rc = sen_success;
124 uint16_t s, i, seg, empty = SEN_INV_MAX_SEGMENT;
125 char used[SEN_INV_MAX_SEGMENT];
126 memset(used, 0, SEN_INV_MAX_SEGMENT);
127 for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
128 if ((s = inv->header->segments[i])) {
129 if (s & type) { used[s & SEGMENT_MASK]++; }
130 } else {
131 if (empty == SEN_INV_MAX_SEGMENT) { empty = i; }
132 }
133 }
134 if (empty == SEN_INV_MAX_SEGMENT) { return sen_memory_exhausted; }
135 if (segno && *segno < SEN_INV_MAX_SEGMENT) {
136 if (used[*segno]) { return sen_invalid_argument; }
137 seg = *segno;
138 } else {
139 for (seg = 0; used[seg]; seg++) ;
140 }
141 inv->header->segments[empty] = type | seg;
142 switch (type) {
143 case SEGMENT_ARRAY :
144 inv->ainfo[seg] = empty;
145 if (seg > inv->amax) { inv->amax = seg; }
146 break;
147 case SEGMENT_BUFFER :
148 inv->binfo[seg] = empty;
149 if (seg > inv->bmax) { inv->bmax = seg; }
150 break;
151 }
152 if (segno) { *segno = seg; }
153 return rc;
154 }
155
156 inline static sen_rc
load_all_segments(sen_inv * inv)157 load_all_segments(sen_inv *inv)
158 {
159 sen_rc rc = sen_success;
160 uint16_t s, seg, amax = 0, bmax = 0;
161 char used[SEN_INV_MAX_SEGMENT];
162 memset(used, 0, SEN_INV_MAX_SEGMENT);
163 for (seg = 0; seg < SEN_INV_MAX_SEGMENT; seg++) {
164 if (!(s = inv->header->segments[seg])) { continue; }
165 if (s & SEGMENT_ARRAY) {
166 used[s & SEGMENT_MASK] |= 2;
167 inv->ainfo[s & SEGMENT_MASK] = seg;
168 }
169 if (s & SEGMENT_BUFFER) {
170 used[s & SEGMENT_MASK] |= 1;
171 inv->binfo[s & SEGMENT_MASK] = seg;
172 }
173 }
174 for (seg = 0; seg < SEN_INV_MAX_SEGMENT; seg++) {
175 if ((used[seg] & 2)) { amax = seg; } else { inv->ainfo[seg] = SEG_NOT_ASSIGNED; }
176 if ((used[seg] & 1)) { bmax = seg; } else { inv->binfo[seg] = SEG_NOT_ASSIGNED; }
177 }
178 inv->amax = amax;
179 inv->bmax = bmax;
180 return rc;
181 }
182
183 void
sen_inv_seg_expire08(sen_inv * inv)184 sen_inv_seg_expire08(sen_inv *inv)
185 {
186 uint32_t expire_threshold = inv->header->initial_n_segments * 2;
187 if (inv->seg->nmaps > expire_threshold) {
188 uint16_t seg;
189 for (seg = inv->bmax; seg; seg--) {
190 uint16_t pseg = inv->binfo[seg];
191 if (pseg != SEG_NOT_ASSIGNED) {
192 sen_io_mapinfo *info = &inv->seg->maps[pseg];
193 uint32_t *pnref = &inv->seg->nrefs[pseg];
194 if (info->map && !*pnref) {
195 sen_io_seg_expire(inv->seg, pseg, 100);
196 if (inv->seg->nmaps <= expire_threshold) { return; }
197 }
198 }
199 }
200 for (seg = inv->amax; seg; seg--) {
201 uint16_t pseg = inv->ainfo[seg];
202 if (pseg != SEG_NOT_ASSIGNED) {
203 sen_io_mapinfo *info = &inv->seg->maps[pseg];
204 uint32_t *pnref = &inv->seg->nrefs[pseg];
205 if (info->map && !*pnref) {
206 sen_io_seg_expire(inv->seg, pseg, 100);
207 if (inv->seg->nmaps <= expire_threshold) { return; }
208 }
209 }
210 }
211 }
212 }
213
214 /* chunk */
215
216 inline static sen_rc
chunk_new(sen_inv * inv,uint32_t * res,uint32_t size)217 chunk_new(sen_inv *inv, uint32_t *res, uint32_t size)
218 {
219 int i, j;
220 uint32_t n = size / SEN_INV_CHUNK_SIZE;
221 int max_chunk = inv->header->initial_n_segments * MAX_CHUNK_RATIO;
222 uint32_t base_seg = sen_io_base_seg(inv->chunk);
223 if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
224 for (i = 0, j = -1; i < max_chunk; i++) {
225 if (inv->header->chunks[i]) {
226 j = i;
227 } else {
228 if (i - j == n) {
229 if (res) { *res = j + 1; }
230 while (j < i) {
231 inv->header->chunks[++j] = 1;
232 }
233 return sen_success;
234 }
235 if ((i + base_seg)/ N_CHUNKS_PER_FILE !=
236 (i + base_seg + 1) / N_CHUNKS_PER_FILE) { j = i; }
237 }
238 }
239 SEN_LOG(sen_log_crit, "index full. set bigger value to initial_n_segments. current value = %d",
240 inv->header->initial_n_segments);
241 return sen_memory_exhausted;
242 }
243
244 inline static sen_rc
chunk_free(sen_inv * inv,int start,uint32_t size)245 chunk_free(sen_inv *inv, int start, uint32_t size)
246 {
247 uint32_t i, n = size / SEN_INV_CHUNK_SIZE;
248 if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
249 // sen_log("chunk_free start=%d size=%d(%d)", start, size, n);
250 for (i = 0; i < n; i++) {
251 inv->header->chunks[start + i] = 0;
252 }
253 return sen_success;
254 }
255
256 /* buffer */
257
/* Per-term directory entry stored at the front of a buffer segment. */
typedef struct {
  uint32_t tid;            /* term id; 0 marks a void (reusable) entry */
  uint32_t size_in_chunk;  /* byte length of this term's data in the chunk */
  uint32_t pos_in_chunk;   /* byte offset of this term's data in the chunk */
  uint16_t size_in_buffer; /* incremented by buffer_put for each record appended at the list tail */
  uint16_t pos_in_buffer;  /* position (in buffer_rec units) of the first record, 0 = none */
} buffer_term;

/* Link header preceding every encoded record in a buffer segment. */
typedef struct {
  uint16_t step;           /* position of the next record in the list, 0 = end */
  uint16_t jump;           /* skip pointer: 0 = none, 1 = record deleted, else a record position */
} buffer_rec;

/* Bookkeeping at the very start of a buffer segment. */
typedef struct {
  uint32_t chunk;          /* first chunk number, or CHUNK_NOT_ASSIGNED */
  uint32_t chunk_size;     /* bytes of chunk data in use */
  uint32_t buffer_free;    /* free bytes left in the segment */
  uint16_t nterms;         /* number of entries used in terms[] */
  uint16_t nterms_void;    /* how many of those entries are void */
} buffer_header;

/* A whole buffer segment: header followed by the term directory. */
struct sen_inv_buffer {
  buffer_header header;
  buffer_term terms[(SEN_INV_SEGMENT_SIZE - sizeof(buffer_header))/sizeof(buffer_term)];
};

typedef struct sen_inv_buffer buffer;
285
286 inline static sen_rc
buffer_open(sen_inv * inv,uint32_t pos,buffer_term ** bt,buffer ** b)287 buffer_open(sen_inv *inv, uint32_t pos, buffer_term **bt, buffer **b)
288 {
289 byte *p = NULL;
290 uint16_t lseg = (uint16_t) (pos >> W_OF_SEGMENT);
291 uint16_t pseg = inv->binfo[lseg];
292 if (pseg == SEG_NOT_ASSIGNED ||
293 inv->header->segments[pseg] != (SEGMENT_BUFFER|lseg)) {
294 load_all_segments(inv);
295 pseg = inv->binfo[lseg];
296 if (pseg == SEG_NOT_ASSIGNED ||
297 inv->header->segments[pseg] != (SEGMENT_BUFFER|lseg)) {
298 return sen_invalid_argument;
299 }
300 }
301 SEN_IO_SEG_REF(inv->seg, pseg, p);
302 if (!p) { return sen_memory_exhausted; }
303 if (b) { *b = (buffer *)p; }
304 if (bt) { *bt = (buffer_term *)(p + (pos & BUFFER_MASK_IN_A_SEGMENT)); }
305 return sen_success;
306 }
307
308 inline static sen_rc
buffer_close(sen_inv * inv,uint32_t pos)309 buffer_close(sen_inv *inv, uint32_t pos)
310 {
311 uint16_t pseg = inv->binfo[pos >> W_OF_SEGMENT];
312 if (pseg >= SEN_INV_MAX_SEGMENT) { return sen_invalid_argument; }
313 SEN_IO_SEG_UNREF(inv->seg, pseg);
314 return sen_success;
315 }
316
317 inline static int
buffer_open_if_capable(sen_inv * inv,int32_t seg,int size,buffer ** b)318 buffer_open_if_capable(sen_inv *inv, int32_t seg, int size, buffer **b)
319 {
320 int res, nterms;
321 uint32_t pos = ((uint32_t) seg) * SEN_INV_SEGMENT_SIZE;
322 if (buffer_open(inv, pos, NULL, b)) { return 0; }
323 nterms = (*b)->header.nterms - (*b)->header.nterms_void;
324 res = ((nterms < 4096 ||
325 (inv->total_chunk_size >> ((nterms >> 8) - 6)) > (*b)->header.chunk_size) &&
326 ((*b)->header.buffer_free >= size + sizeof(buffer_term)));
327 if (!res) { buffer_close(inv, pos); }
328 return res;
329 }
330
331 inline static sen_rc
buffer_new(sen_inv * inv,int size,uint32_t * pos,buffer_term ** bt,buffer_rec ** br,buffer ** bp,int hint)332 buffer_new(sen_inv *inv, int size, uint32_t *pos,
333 buffer_term **bt, buffer_rec **br, buffer **bp, int hint)
334 {
335 buffer *b;
336 uint16_t nseg0 = inv->header->initial_n_segments;
337 uint16_t seg, offset, seg0 = hint % nseg0;
338 uint16_t segmax = (uint16_t) (inv->total_chunk_size >> 7) + nseg0;
339 if (size + sizeof(buffer_header) + sizeof(buffer_term) > SEN_INV_SEGMENT_SIZE) {
340 return sen_invalid_argument;
341 }
342 // load_all_segments(inv); todo: ainfo and binfo should be inside the header
343 for (seg = seg0; seg < segmax; seg += nseg0) {
344 if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
345 if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
346 }
347 if (seg >= segmax) {
348 for (seg = (seg0 + 1) % nseg0; seg != seg0; seg = (seg + 1) % nseg0) {
349 if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
350 if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
351 }
352 if (seg == seg0) {
353 for (seg = nseg0; seg < SEN_INV_MAX_SEGMENT; seg++) {
354 if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
355 if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
356 }
357 }
358 }
359 SEN_LOG(sen_log_debug, "inv=%p new seg=%d", inv, seg);
360 if (segment_new(inv, SEGMENT_BUFFER, &seg) ||
361 buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &b)) {
362 return sen_memory_exhausted;
363 }
364 memset(b, 0, SEN_INV_SEGMENT_SIZE);
365 b->header.buffer_free = SEN_INV_SEGMENT_SIZE - sizeof(buffer_header);
366 b->header.chunk = CHUNK_NOT_ASSIGNED;
367 b->header.chunk_size = 0;
368 exit :
369 if (b->header.nterms_void) {
370 for (offset = 0; offset < b->header.nterms; offset++) {
371 if (!b->terms[offset].tid) { break; }
372 }
373 if (offset == b->header.nterms) {
374 SEN_LOG(sen_log_notice, "inconsistent buffer(%d)", seg);
375 b->header.nterms_void = 0;
376 b->header.nterms++;
377 b->header.buffer_free -= size + sizeof(buffer_term);
378 } else {
379 b->header.nterms_void--;
380 b->header.buffer_free -= size;
381 }
382 } else {
383 offset = b->header.nterms++;
384 b->header.buffer_free -= size + sizeof(buffer_term);
385 }
386 *pos = seg * SEN_INV_SEGMENT_SIZE
387 + sizeof(buffer_header) + sizeof(buffer_term) * offset;
388 *bt = &b->terms[offset];
389 *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
390 *bp = b;
391 return sen_success;
392 }
393
/* Document key used to order records: record id, then section id. */
typedef struct {
  uint32_t rid;
  uint32_t sid;
} docid;

/* A record whose jump field is 1 is treated as deleted. */
#define BUFFER_REC_DEL(r) ((r)->jump = 1)
#define BUFFER_REC_DELETED(r) ((r)->jump == 1)

/* Convert between a record pointer and its position, counted in
   sizeof(buffer_rec) units from the start of buffer b. */
#define BUFFER_REC_AT(b,pos) ((buffer_rec *)(b) + (pos))
#define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b)))
404
405 inline static void
buffer_term_dump(buffer * b,buffer_term * bt)406 buffer_term_dump(buffer *b, buffer_term *bt)
407 {
408 int pos, rid, sid;
409 uint8_t *p;
410 buffer_rec *r;
411 SEN_LOG(sen_log_debug,
412 "b=(%x %u %u %u)", b->header.chunk, b->header.chunk_size, b->header.buffer_free, b->header.nterms);
413 SEN_LOG(sen_log_debug,
414 "bt=(%u %u %u %u %u)", bt->tid, bt->size_in_chunk, bt->pos_in_chunk, bt->size_in_buffer, bt->pos_in_buffer);
415 for (pos = bt->pos_in_buffer; pos; pos = r->step) {
416 r = BUFFER_REC_AT(b, pos);
417 p = NEXT_ADDR(r);
418 SEN_B_DEC(rid, p);
419 SEN_B_DEC(sid, p);
420 SEN_LOG(sen_log_debug, "%d=(%d:%d),(%d:%d)", pos, r->jump, r->step, rid, sid);
421 }
422 }
423
/* Debug aid: the term most recently handed to buffer_put; check_jump dumps
   it when it detects a cycle.  File-scope state shared by all callers. */
static buffer_term *tmp_bt;
425
426 inline static sen_rc
check_jump(buffer * b,buffer_rec * r,int j)427 check_jump(buffer *b, buffer_rec *r, int j)
428 {
429 uint16_t i = BUFFER_REC_POS(b, r);
430 uint8_t *p;
431 buffer_rec *r2;
432 docid id, id2;
433 if (!j) { return sen_success; }
434 p = NEXT_ADDR(r);
435 SEN_B_DEC(id.rid, p);
436 SEN_B_DEC(id.sid, p);
437 if (j == 1) {
438 SEN_LOG(sen_log_debug, "deleting! %d(%d:%d)", i, id.rid, id.sid);
439 return sen_success;
440 }
441 r2 = BUFFER_REC_AT(b, j);
442 p = NEXT_ADDR(r2);
443 SEN_B_DEC(id2.rid, p);
444 SEN_B_DEC(id2.sid, p);
445 if (r2->step == i) {
446 SEN_LOG(sen_log_emerg, "cycle! %d(%d:%d)<->%d(%d:%d)", i, id.rid, id.sid, j, id2.rid, id2.sid);
447 buffer_term_dump(b, tmp_bt);
448 return sen_other_error;
449 }
450 if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) {
451 SEN_LOG(sen_log_crit, "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)", i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step, id2.rid, id2.sid);
452 return sen_other_error;
453 }
454 return sen_success;
455 }
456
457 inline static sen_rc
set_jump_r(buffer * b,buffer_rec * from,int to)458 set_jump_r(buffer *b, buffer_rec *from, int to)
459 {
460 int i, j, max_jump = 100;
461 buffer_rec *r, *r2;
462 for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
463 r2 = BUFFER_REC_AT(b, j);
464 if (r == r2) { break; }
465 if (BUFFER_REC_DELETED(r2)) { break; }
466 if (j == (i = r->jump)) { break; }
467 if (j == r->step) { break; }
468 if (check_jump(b, r, j)) { return sen_other_error; }
469 r->jump = j;
470 j = i;
471 if (!r->step) { return sen_other_error; }
472 }
473 return sen_success;
474 }
475
/*
 * Store in n the number of 1 bits of the 32-bit value x, by summing adjacent
 * bit fields of width 1, 2, 4, 8 and 16.  n must be an lvalue wide enough to
 * hold x; x is evaluated once.  Expands to a brace block, not an expression.
 */
#define GET_NUM_BITS(x,n) { \
  n = x; \
  n = (n & 0x55555555) + ((n >> 1) & 0x55555555); \
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333); \
  n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F); \
  n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF); \
  n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF); \
}
484
/*
 * Link the new record `rnew` into the (rid, sid)-ordered record list of term
 * `bt` in buffer `b`.
 *
 *   bs   - encoded record body; size - sizeof(buffer_rec) bytes are copied
 *          right behind rnew's link header
 *   u    - update spec supplying the rid/sid the record is ordered by
 *   size - total record size including the buffer_rec header
 *
 * The list is walked through `step` links, taking `jump` shortcuts when the
 * jump target still sorts before u.  Existing records with the same rid (and
 * the same sid, or any sid when u->sid == 0) are marked deleted and the new
 * record takes their place.
 *
 * Returns sen_success, or sen_invalid_format when the list was found to be
 * out of order; in that case the old list is abandoned and rnew becomes the
 * only record of the term.
 */
inline static sen_rc
buffer_put(buffer *b, buffer_term *bt, buffer_rec *rnew, uint8_t *bs,
           sen_inv_updspec *u, int size)
{
  uint8_t *p;
  sen_rc rc = sen_success;
  docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
  buffer_rec *r_curr, *r_start = NULL;
  /* lastp always points at the link that leads to the record examined next */
  uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
  int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1;

  tmp_bt = bt; // test

  memcpy(NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
  // sen_log("tid=%d u->rid=%d u->sid=%d", bt->tid, u->rid, u->sid);
  for (;;) {
    // sen_log("*lastp=%d", *lastp);
    if (!*lastp) {
      /* reached the end of the list: append rnew as the new tail */
      rnew->step = 0;
      rnew->jump = 0;
      *lastp = pos;
      if (bt->size_in_buffer++ > 1) {
        /* the head record's jump always points at the current tail */
        buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
        rhead->jump = pos;
        if (!(bt->size_in_buffer & 1)) {
          /* on even sizes, follow up to popcount(size) jumps from the second
             record and link the record reached to the previous tail */
          int n;
          buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
          GET_NUM_BITS(bt->size_in_buffer, n);
          while (n-- && (r->jump > 1)) {
            r2 = BUFFER_REC_AT(b, r->jump);
            if (BUFFER_REC_DELETED(r2)) { break; }
            r = r2;
          }
          if (r != rnew) { set_jump_r(b, r, last); }
        }
      }
      break;
    }
    r_curr = BUFFER_REC_AT(b, *lastp);
    p = NEXT_ADDR(r_curr);
    SEN_B_DEC(id_curr.rid, p);
    SEN_B_DEC(id_curr.sid, p);
    /* ids must never decrease along the walk; if they do the list is corrupt */
    if (id_curr.rid < id_post.rid ||
        (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
      SEN_LOG(sen_log_emerg, "loop found!!! (%d:%d)->(%d:%d)",
              id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
      buffer_term_dump(b, bt);
      /* abandon corrupt list */
      bt->pos_in_buffer = 0;
      bt->size_in_buffer = 0;
      lastp = &bt->pos_in_buffer;
      rc = sen_invalid_format;
      continue;
    }
    id_post.rid = id_curr.rid;
    id_post.sid = id_curr.sid;
    if (u->rid < id_curr.rid || (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
      /* insertion point found: rnew goes in front of r_curr */
      uint16_t step = *lastp, jump = r_curr->jump;
      if (u->rid == id_curr.rid) {
        if (u->sid == 0) {
          /* sid 0 supersedes every record of this rid: mark them all deleted */
          while (id_curr.rid == u->rid) {
            BUFFER_REC_DEL(r_curr);
            if (!(step = r_curr->step)) { break; }
            r_curr = BUFFER_REC_AT(b, step);
            p = NEXT_ADDR(r_curr);
            SEN_B_DEC(id_curr.rid, p);
            SEN_B_DEC(id_curr.sid, p);
          }
        } else if (u->sid == id_curr.sid) {
          /* same (rid, sid): the old record is replaced */
          BUFFER_REC_DEL(r_curr);
          step = r_curr->step;
        }
      }
      rnew->step = step;
      /* inherit the old record's jump only if it is valid from rnew */
      rnew->jump = check_jump(b, rnew, jump) ? 0 : jump;
      *lastp = pos;
      break;
    }

    /* bookkeeping that decides where additional skip pointers are installed
       while walking by plain steps */
    if (reset) {
      r_start = r_curr;
      id_start.rid = id_curr.rid;
      id_start.sid = id_curr.sid;
      if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; }
      nhops = 0;
      vhops = 1;
      vdelta = delta0 >> 1;
    } else {
      if (!(delta = id_curr.rid - id_start.rid)) { delta = id_curr.sid - id_start.sid; }
      if (vdelta < delta) {
        vdelta += (delta0 >> ++vhops);
        r_start = r_curr;
      }
      if (nhops > vhops) {
        set_jump_r(b, r_start, *lastp);
      } else {
        nhops++;
      }
    }

    /* advance: by step by default, by jump when the jump target is live and
       still sorts before u */
    last = *lastp;
    lastp = &r_curr->step;
    reset = 0;
    {
      uint16_t posj = r_curr->jump;
      if (posj > 1) {
        buffer_rec *rj = BUFFER_REC_AT(b, posj);
        if (!BUFFER_REC_DELETED(rj)) {
          docid idj;
          p = NEXT_ADDR(rj);
          SEN_B_DEC(idj.rid, p);
          SEN_B_DEC(idj.sid, p);
          if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) {
            last = posj;
            lastp = &rj->step;
          } else {
            reset = 1;
          }
        }
      }
    }
  }
  return rc;
}
609
610 /* array */
611
612 inline static uint32_t *
array_at(sen_inv * inv,uint32_t id)613 array_at(sen_inv *inv, uint32_t id)
614 {
615 byte *p = NULL;
616 uint16_t seg, pseg;
617 if (id > SEN_SYM_MAX_ID) { return NULL; }
618 seg = id >> W_OF_ARRAY;
619 if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
620 load_all_segments(inv);
621 if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
622 return NULL;
623 }
624 }
625 SEN_IO_SEG_REF(inv->seg, pseg, p);
626 if (!p) { return NULL; }
627 return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
628 }
629
630 inline static uint32_t *
array_get(sen_inv * inv,uint32_t id)631 array_get(sen_inv *inv, uint32_t id)
632 {
633 byte *p = NULL;
634 uint16_t seg, pseg;
635 if (id > SEN_SYM_MAX_ID) { return NULL; }
636 seg = id >> W_OF_ARRAY;
637 if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
638 if (segment_get(inv, SEGMENT_ARRAY, seg, &pseg)) { return NULL; }
639 inv->ainfo[seg] = pseg;
640 if (seg > inv->amax) { inv->amax = seg; }
641 }
642 SEN_IO_SEG_REF(inv->seg, pseg, p)
643 if (!p) { return NULL; }
644 return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
645 }
646
647 inline static void
array_unref(sen_inv * inv,uint32_t id)648 array_unref(sen_inv *inv, uint32_t id)
649 {
650 SEN_IO_SEG_UNREF(inv->seg, inv->ainfo[id >> W_OF_ARRAY]);
651 }
652
653 inline static uint8_t *
encode_rec(sen_inv_updspec * u,unsigned int * size,int deletep)654 encode_rec(sen_inv_updspec *u, unsigned int *size, int deletep)
655 {
656 intptr_t s;
657 uint8_t *br, *p;
658 struct _sen_inv_pos *pp;
659 uint32_t lpos, tf = deletep ? 0 : u->tf;
660 if (!(br = SEN_GMALLOC((u->tf + 4) * 5))) {
661 return NULL;
662 }
663 p = br;
664 SEN_B_ENC(u->rid, p);
665 SEN_B_ENC(u->sid, p);
666 if (!u->score) {
667 SEN_B_ENC(tf * 2, p);
668 } else {
669 SEN_B_ENC(tf * 2 + 1, p);
670 SEN_B_ENC(u->score, p);
671 }
672 for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) {
673 SEN_B_ENC(pp->pos - lpos, p);
674 }
675 s = (p - br) + sizeof(buffer_rec);
676 *size = (unsigned int) ((s + 0x03) & ~0x03);
677 return br;
678 }
679
680 inline static int
sym_deletable(uint32_t tid,sen_set * h)681 sym_deletable(uint32_t tid, sen_set *h)
682 {
683 sen_inv_updspec **u;
684 if (!h) { return 1; }
685 if (!sen_set_at(h, &tid, (void **) &u)) { return 1; }
686 if (!(*u)->tf || !(*u)->sid) { return 1; }
687 return 0;
688 }
689
/* Context handed to sis_deletable through its void* argument. */
typedef struct {
  sen_inv *inv;  /* index whose array is consulted */
  sen_set *h;    /* update set forwarded to sym_deletable */
} sis_deletable_arg;
694
695 static int
sis_deletable(sen_id tid,void * arg)696 sis_deletable(sen_id tid, void *arg)
697 {
698 uint32_t *a;
699 sen_set *h = ((sis_deletable_arg *)arg)->h;
700 sen_inv *inv = ((sis_deletable_arg *)arg)->inv;
701 if ((a = array_at(inv, tid))) {
702 if (*a) {
703 array_unref(inv, tid);
704 return 0;
705 }
706 array_unref(inv, tid);
707 }
708 return sym_deletable(tid, h);
709 }
710
711 inline static void
sym_delete(sen_inv * inv,uint32_t tid,sen_set * h)712 sym_delete(sen_inv *inv, uint32_t tid, sen_set *h)
713 {
714 sis_deletable_arg arg = {inv, h};
715 if (inv->lexicon->flags & SEN_SYM_WITH_SIS) {
716 sen_sym_del_with_sis(inv->lexicon, tid, sis_deletable, &arg);
717 /*
718 uint32_t *a;
719 while ((tid = sen_sym_del_with_sis(inv->lexicon, tid))) {
720 if ((a = array_at(inv, tid))) {
721 if (*a) {
722 array_unref(inv, tid);
723 break;
724 }
725 array_unref(inv, tid);
726 }
727 if (!sym_deletable(tid, h)) { break; }
728 }
729 */
730 } else {
731 if (sym_deletable(tid, h)) {
732 sen_sym_del(inv->lexicon, _sen_sym_key(inv->lexicon, tid));
733 }
734 }
735 }
736
737 inline static sen_rc
buffer_flush(sen_inv * inv,uint32_t seg,sen_set * h)738 buffer_flush(sen_inv *inv, uint32_t seg, sen_set *h)
739 {
740 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
741 buffer *sb, *db = NULL;
742 sen_rc rc = sen_success;
743 sen_io_win sw, dw;
744 uint8_t *dc, *sc = NULL;
745 uint16_t ss, ds;
746 uint32_t scn, dcn, max_dest_chunk_size;
747 ss = inv->binfo[seg];
748 if (ss == SEG_NOT_ASSIGNED) { return sen_invalid_format; }
749 if (buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &sb)) {
750 return sen_memory_exhausted;
751 }
752 for (ds = 0; inv->header->segments[ds];) {
753 if (++ds == SEN_INV_MAX_SEGMENT) {
754 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
755 return sen_memory_exhausted;
756 }
757 }
758 SEN_IO_SEG_REF(inv->seg, ds, db);
759 if (!db) {
760 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
761 return sen_memory_exhausted;
762 }
763 memset(db, 0, SEN_INV_SEGMENT_SIZE);
764
765 max_dest_chunk_size = sb->header.chunk_size + SEN_INV_SEGMENT_SIZE;
766 if (chunk_new(inv, &dcn, max_dest_chunk_size)) {
767 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
768 SEN_IO_SEG_UNREF(inv->seg, ds);
769 return sen_memory_exhausted;
770 }
771 // sen_log("db=%p ds=%d sb=%p seg=%d", db, ds, sb, seg);
772 if ((scn = sb->header.chunk) != CHUNK_NOT_ASSIGNED) {
773 sc = sen_io_win_map(inv->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, sen_io_rdwr);
774 if (!sc) {
775 SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!!", scn, sb->header.chunk_size);
776 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
777 SEN_IO_SEG_UNREF(inv->seg, ds);
778 chunk_free(inv, dcn, max_dest_chunk_size);
779 return sen_memory_exhausted;
780 }
781 }
782 // dc = sen_io_win_map(inv->chunk, &dw, dcn, 0, max_dest_chunk_size, sen_io_wronly);
783 dc = sen_io_win_map(inv->chunk, ctx, &dw, dcn, 0, max_dest_chunk_size, sen_io_rdwr);
784 if (!dc) {
785 SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!!", dcn, max_dest_chunk_size);
786 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
787 SEN_IO_SEG_UNREF(inv->seg, ds);
788 chunk_free(inv, dcn, max_dest_chunk_size);
789 if (scn != CHUNK_NOT_ASSIGNED) { sen_io_win_unmap(&sw); }
790 return sen_memory_exhausted;
791 }
792 {
793 uint8_t *bp = NULL, *cp = NULL, *cpe = NULL, *dp = dc;
794 uint16_t nextb;
795 buffer_rec *br;
796 buffer_term *bt;
797 int n = sb->header.nterms;
798 int nterms_void = 0;
799 memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
800 // sen_log(" scn=%d, dcn=%d, nterms=%d", sb->header.chunk, dcn, n);
801 for (bt = db->terms; n; n--, bt++) {
802 docid cid = {0, 0}, lid = {0, 0}, bid = {0, 0};
803 uint32_t ndf = 0, tf2, ltf2 = 0, gap;
804 if (!bt->tid) {
805 nterms_void++;
806 continue;
807 }
808 if (sc) {
809 cp = sc + bt->pos_in_chunk;
810 cpe = cp + bt->size_in_chunk;
811 }
812 nextb = bt->pos_in_buffer;
813 bt->pos_in_chunk = (uint32_t)(dp - dc);
814 bt->size_in_buffer = 0;
815 bt->pos_in_buffer = 0;
816
817 // sen_log("db=%p n=%d, bt=%p tid=%d bdf=%d", db, n, bt, bt->tid, bdf);
818
819 #define GETNEXTC() { \
820 if (cp < cpe && cid.rid) { \
821 SEN_B_DEC(tf2, cp); \
822 if (tf2 & 1) { SEN_B_SKIP(cp); } \
823 tf2 >>= 1; \
824 while (cp < cpe && tf2--) { SEN_B_SKIP(cp); } \
825 } \
826 if (cp < cpe) { \
827 SEN_B_DEC(gap, cp); \
828 cid.rid += gap; \
829 if (gap) { cid.sid = 0; } \
830 SEN_B_DEC(gap, cp); \
831 cid.sid += gap; \
832 } else { \
833 cid.rid = 0; \
834 } \
835 }
836 #define PUTNEXTC() { \
837 if (cid.rid) { \
838 /* sen_log("srid=%d", srid); */ \
839 SEN_B_DEC(tf2, cp); \
840 if (tf2) { \
841 if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) { \
842 SEN_LOG(sen_log_crit, "brokenc!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid); \
843 rc = sen_invalid_format;\
844 break; \
845 } \
846 ndf++; \
847 gap = cid.rid - lid.rid; \
848 SEN_B_ENC(gap, dp); \
849 if (gap) { SEN_B_ENC(cid.sid, dp); } else { SEN_B_ENC(cid.sid - lid.sid, dp); } \
850 SEN_B_ENC(tf2, dp); \
851 if (tf2 & 1) { SEN_B_COPY(dp, cp); } \
852 ltf2 = tf2; \
853 tf2 >>= 1; \
854 while (tf2--) { SEN_B_COPY(dp, cp); } \
855 lid.rid = cid.rid; \
856 lid.sid = cid.sid; \
857 } else { \
858 SEN_LOG(sen_log_crit, "invalid chunk(%d,%d)", bt->tid, cid.rid);\
859 rc = sen_invalid_format;\
860 break; \
861 } \
862 } \
863 if (cp < cpe) { \
864 SEN_B_DEC(gap, cp); \
865 cid.rid += gap; \
866 if (gap) { cid.sid = 0; } \
867 SEN_B_DEC(gap, cp); \
868 cid.sid += gap; \
869 } else { \
870 cid.rid = 0; \
871 } \
872 /* sen_log("gap=%d srid=%d", gap, srid); */ \
873 }
874 #define GETNEXTB() { \
875 if (nextb) { \
876 uint32_t lrid = bid.rid, lsid = bid.sid; \
877 br = BUFFER_REC_AT(sb, nextb); \
878 bp = NEXT_ADDR(br); \
879 SEN_B_DEC(bid.rid, bp); \
880 SEN_B_DEC(bid.sid, bp); \
881 if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) { \
882 SEN_LOG(sen_log_crit, "brokeng!! (%d:%d) -> (%d:%d)", lrid, lsid, bid.rid, bid.sid); \
883 rc = sen_invalid_format;\
884 break; \
885 } \
886 nextb = br->step; \
887 } else { \
888 bid.rid = 0; \
889 } \
890 }
891 #define PUTNEXTB() { \
892 if (bid.rid && bid.sid) { \
893 SEN_B_DEC(tf2, bp); \
894 if (tf2) { \
895 /* sen_log("brid=%d", bid.rid); */ \
896 if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) { \
897 SEN_LOG(sen_log_crit, "brokenb!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid); \
898 rc = sen_invalid_format;\
899 break; \
900 } \
901 ndf++; \
902 gap = bid.rid - lid.rid; \
903 SEN_B_ENC(gap, dp); \
904 if (gap) { SEN_B_ENC(bid.sid, dp); } else { SEN_B_ENC(bid.sid - lid.sid, dp); } \
905 SEN_B_ENC(tf2, dp); \
906 if (tf2 & 1) { SEN_B_COPY(dp, bp); } \
907 ltf2 = tf2; \
908 tf2 >>= 1; \
909 while (tf2--) { SEN_B_COPY(dp, bp); } \
910 lid.rid = bid.rid; \
911 lid.sid = bid.sid; \
912 } \
913 } \
914 GETNEXTB(); \
915 }
916
917 GETNEXTC();
918 GETNEXTB();
919 for (;;) {
920 if (bid.rid) {
921 if (cid.rid) {
922 if (cid.rid < bid.rid) {
923 PUTNEXTC();
924 } else {
925 if (bid.rid < cid.rid) {
926 PUTNEXTB();
927 } else {
928 if (bid.sid) {
929 if (cid.sid < bid.sid) {
930 PUTNEXTC();
931 } else {
932 if (bid.sid == cid.sid) { GETNEXTC(); }
933 PUTNEXTB();
934 }
935 } else {
936 GETNEXTC();
937 }
938 }
939 }
940 } else {
941 PUTNEXTB();
942 }
943 } else {
944 if (cid.rid) {
945 PUTNEXTC();
946 } else {
947 break;
948 }
949 }
950 }
951 // sen_log("break: dp=%p cp=%p", dp, cp);
952
953 bt->size_in_chunk = (uint32_t)((dp - dc) - bt->pos_in_chunk);
954
955 if (!ndf) {
956 uint32_t *a;
957 if ((a = array_at(inv, bt->tid))) {
958 sen_sym_pocket_set(inv->lexicon, bt->tid, 0);
959 *a = 0;
960 sym_delete(inv, bt->tid, h);
961 array_unref(inv, bt->tid);
962 }
963 bt->tid = 0;
964 bt->pos_in_chunk = 0;
965 bt->size_in_chunk = 0;
966 nterms_void++;
967 } else if (ndf == 1 && lid.rid < 0x100000 && lid.sid < 0x800 && ltf2 == 2) {
968 uint32_t rid_, sid_, tf_, pos_;
969 uint8_t *dp_ = dc + bt->pos_in_chunk;
970 SEN_B_DEC(rid_, dp_);
971 if (rid_ < 0x100000) {
972 SEN_B_DEC(sid_, dp_);
973 if (sid_ < 0x800) {
974 SEN_B_DEC(tf_, dp_);
975 if (tf_ == 2) {
976 SEN_B_DEC(pos_, dp_);
977 if (pos_ < 0x4000) {
978 uint32_t *a;
979 if ((a = array_at(inv, bt->tid))) {
980 sen_sym_pocket_set(inv->lexicon, bt->tid, pos_);
981 *a = (rid_ << 12) + (sid_ << 1) + 1;
982 array_unref(inv, bt->tid);
983 }
984 dp = dc + bt->pos_in_chunk;
985 bt->tid = 0;
986 bt->pos_in_chunk = 0;
987 bt->size_in_chunk = 0;
988 nterms_void++;
989 }
990 }
991 }
992 }
993 }
994 // sen_log("db=%p df=%d size=%d", db, ndf, (dp - dc) - bt->pos_in_chunk);
995 }
996 db->header.chunk_size = (uint32_t)(dp - dc);
997 db->header.nterms_void = nterms_void;
998 inv->total_chunk_size += db->header.chunk_size >> 10;
999 }
1000 db->header.chunk = dcn;
1001 db->header.buffer_free = SEN_INV_SEGMENT_SIZE
1002 - sizeof(buffer_header) - sb->header.nterms * sizeof(buffer_term);
1003 db->header.nterms = sb->header.nterms;
1004
1005 {
1006 uint32_t mc, ec;
1007 mc = max_dest_chunk_size / SEN_INV_CHUNK_SIZE;
1008 if (mc * SEN_INV_CHUNK_SIZE < max_dest_chunk_size) { mc++; }
1009 ec = db->header.chunk_size / SEN_INV_CHUNK_SIZE;
1010 if (ec * SEN_INV_CHUNK_SIZE < db->header.chunk_size) { ec++; }
1011 // sen_log(" ss=%d ds=%d inv->binfo[%d]=%p max_size=%d(%d) chunk_size=%d(%d)", ss, ds, seg, db, max_dest_chunk_size, mc, db->header.chunk_size, ec);
1012 while (ec < mc) {
1013 // sen_log("chunk[%d]=0(%d)", ec, mc);
1014 inv->header->chunks[db->header.chunk + ec++] = 0;
1015 }
1016 }
1017 buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
1018 SEN_IO_SEG_UNREF(inv->seg, ds);
1019 inv->binfo[seg] = ds;
1020 inv->header->segments[ss] = 0;
1021 inv->header->segments[ds] = SEGMENT_BUFFER | seg;
1022 if (scn != CHUNK_NOT_ASSIGNED) {
1023 sen_io_win_unmap(&sw);
1024 chunk_free(inv, scn, sb->header.chunk_size);
1025 inv->total_chunk_size -= sb->header.chunk_size >> 10;
1026 }
1027 sen_io_win_unmap(&dw);
1028 return rc;
1029 }
1030
1031 /* inv */
1032
1033 sen_inv *
sen_inv_create08(const char * path,sen_sym * lexicon,uint32_t initial_n_segments)1034 sen_inv_create08(const char *path, sen_sym *lexicon, uint32_t initial_n_segments)
1035 {
1036 int i, max_chunk;
1037 sen_io *seg, *chunk;
1038 sen_inv *inv;
1039 char path2[PATH_MAX];
1040 struct sen_inv_header *header;
1041 if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1042 strcpy(path2, path);
1043 strcat(path2, ".c");
1044 if (!initial_n_segments) { initial_n_segments = SEN_INV_INITIAL_N_SEGMENTS; }
1045 if (initial_n_segments > SEN_INV_MAX_SEGMENT) {
1046 initial_n_segments = SEN_INV_MAX_SEGMENT;
1047 }
1048 max_chunk = initial_n_segments * MAX_CHUNK_RATIO;
1049 seg = sen_io_create(path, sizeof(struct sen_inv_header) + max_chunk,
1050 SEN_INV_SEGMENT_SIZE, SEN_INV_MAX_SEGMENT,
1051 sen_io_auto, SEN_INV_MAX_SEGMENT);
1052 if (!seg) { return NULL; }
1053 chunk = sen_io_create(path2, 0, SEN_INV_CHUNK_SIZE,
1054 max_chunk, sen_io_auto, max_chunk);
1055 if (!chunk) {
1056 sen_io_close(seg);
1057 return NULL;
1058 }
1059 header = sen_io_header(seg);
1060 memcpy(header->idstr, SEN_INV_IDSTR, 16);
1061 for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) { header->segments[i] = 0; }
1062 header->initial_n_segments = initial_n_segments;
1063 if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1064 sen_io_close(seg);
1065 sen_io_close(chunk);
1066 return NULL;
1067 }
1068 inv->v08p = 1;
1069 inv->seg = seg;
1070 inv->chunk = chunk;
1071 inv->header = header;
1072 inv->lexicon = lexicon;
1073 #ifdef USE_QUERY_ABORT
1074 inv->check_abort = NULL;
1075 inv->check_abort_arg = NULL;
1076 #endif /* USE_QUERY_ABORT */
1077 inv->total_chunk_size = 0;
1078 load_all_segments(inv);
1079 return inv;
1080 }
1081
1082 sen_inv *
sen_inv_open08(const char * path,sen_sym * lexicon)1083 sen_inv_open08(const char *path, sen_sym *lexicon)
1084 {
1085 sen_io *seg, *chunk;
1086 sen_inv *inv;
1087 char path2[PATH_MAX];
1088 struct sen_inv_header *header;
1089 if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1090 strcpy(path2, path);
1091 strcat(path2, ".c");
1092 seg = sen_io_open(path, sen_io_auto, SEN_INV_MAX_SEGMENT);
1093 if (!seg) { return NULL; }
1094 chunk = sen_io_open(path2, sen_io_auto, SEN_INV_MAX_SEGMENT);
1095 if (!chunk) {
1096 sen_io_close(seg);
1097 return NULL;
1098 }
1099 header = sen_io_header(seg);
1100 if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1101 sen_io_close(seg);
1102 sen_io_close(chunk);
1103 return NULL;
1104 }
1105 inv->v08p = 1;
1106 inv->seg = seg;
1107 inv->chunk = chunk;
1108 inv->header = header;
1109 inv->lexicon = lexicon;
1110 {
1111 off_t size = 0;
1112 sen_io_size(inv->chunk, &size);
1113 inv->total_chunk_size = (uint32_t) (size >> 10);
1114 }
1115 load_all_segments(inv);
1116 return inv;
1117 }
1118
1119 sen_rc
sen_inv_update_one08(sen_inv * inv,uint32_t key,sen_inv_updspec * u,sen_set * h,int hint)1120 sen_inv_update_one08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h, int hint)
1121 {
1122 sen_rc r = sen_success;
1123 buffer *b;
1124 uint8_t *bs;
1125 buffer_rec *br = NULL;
1126 buffer_term *bt;
1127 uint32_t pos = 0, size, *a;
1128 if (!u->tf || !u->sid) { return sen_inv_delete_one(inv, key, u, h); }
1129 if (!(a = array_get(inv, key))) { return sen_memory_exhausted; }
1130 if (!(bs = encode_rec(u, &size, 0))) { r = sen_memory_exhausted; goto exit; }
1131 for (;;) {
1132 if (*a) {
1133 if (!(*a & 1)) {
1134 pos = *a;
1135 if ((r = buffer_open(inv, pos, &bt, &b))) { goto exit; }
1136 if (b->header.buffer_free < size) {
1137 int bfb = b->header.buffer_free;
1138 SEN_LOG(sen_log_debug, "flushing *a=%d seg=%d(%p) free=%d",
1139 *a, *a >> W_OF_SEGMENT, b, b->header.buffer_free);
1140 buffer_close(inv, pos);
1141 if ((r = buffer_flush(inv, pos >> W_OF_SEGMENT, h))) { goto exit; }
1142 if (*a != pos) {
1143 SEN_LOG(sen_log_debug, "sen_inv_update_one: *a changed %d->%d", *a, pos);
1144 continue;
1145 }
1146 if ((r = buffer_open(inv, pos, &bt, &b))) {
1147 SEN_LOG(sen_log_crit, "buffer not found *a=%d", *a);
1148 goto exit;
1149 }
1150 SEN_LOG(sen_log_debug, "flushed *a=%d seg=%d(%p) free=%d->%d nterms=%d v=%d",
1151 *a, *a >> W_OF_SEGMENT, b, bfb, b->header.buffer_free,
1152 b->header.nterms, b->header.nterms_void);
1153 if (b->header.buffer_free < size) {
1154 buffer_close(inv, pos);
1155 SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_update_one",
1156 *a, b->header.buffer_free, size);
1157 /* todo: must be splitted */
1158 r = sen_memory_exhausted;
1159 goto exit;
1160 }
1161 }
1162 b->header.buffer_free -= size;
1163 br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms])
1164 + b->header.buffer_free);
1165 } else {
1166 sen_inv_updspec u2;
1167 uint32_t size2 = 0, v = *a;
1168 struct _sen_inv_pos pos2;
1169 pos2.pos = sen_sym_pocket_get(inv->lexicon, key);
1170 pos2.next = NULL;
1171 u2.pos = &pos2;
1172 u2.rid = BIT31_12(v);
1173 u2.sid = BIT11_01(v);
1174 u2.tf = 1;
1175 u2.score = 0;
1176 if (u2.rid != u->rid || u2.sid != u->sid) {
1177 uint8_t *bs2 = encode_rec(&u2, &size2, 0);
1178 if (!bs2) {
1179 SEN_LOG(sen_log_alert, "encode_rec on sen_inv_update_one failed !");
1180 r = sen_memory_exhausted;
1181 goto exit;
1182 }
1183 if ((r = buffer_new(inv, size + size2, &pos, &bt, &br, &b, hint))) {
1184 SEN_GFREE(bs2);
1185 goto exit;
1186 }
1187 bt->tid = key;
1188 bt->size_in_chunk = 0;
1189 bt->pos_in_chunk = 0;
1190 bt->size_in_buffer = 0;
1191 bt->pos_in_buffer = 0;
1192 if ((r = buffer_put(b, bt, br, bs2, &u2, size2))) {
1193 SEN_GFREE(bs2);
1194 buffer_close(inv, pos);
1195 goto exit;
1196 }
1197 br = (buffer_rec *)(((byte *)br) + size2);
1198 SEN_GFREE(bs2);
1199 }
1200 }
1201 }
1202 break;
1203 }
1204 if (!br) {
1205 if (u->rid < 0x100000 && u->sid < 0x800 &&
1206 u->tf == 1 && u->score == 0 && u->pos->pos < 0x4000) {
1207 sen_sym_pocket_set(inv->lexicon, key, u->pos->pos);
1208 *a = (u->rid << 12) + (u->sid << 1) + 1;
1209 goto exit;
1210 } else {
1211 if ((r = buffer_new(inv, size, &pos, &bt, &br, &b, hint))) { goto exit; }
1212 bt->tid = key;
1213 bt->size_in_chunk = 0;
1214 bt->pos_in_chunk = 0;
1215 bt->size_in_buffer = 0;
1216 bt->pos_in_buffer = 0;
1217 }
1218 }
1219 r = buffer_put(b, bt, br, bs, u, size);
1220 buffer_close(inv, pos);
1221 if (!*a || (*a & 1)) {
1222 *a = pos;
1223 sen_sym_pocket_set(inv->lexicon, key, 0);
1224 }
1225 exit :
1226 array_unref(inv, key);
1227 if (bs) { SEN_GFREE(bs); }
1228 return r;
1229 }
1230
1231 sen_rc
sen_inv_delete_one08(sen_inv * inv,uint32_t key,sen_inv_updspec * u,sen_set * h)1232 sen_inv_delete_one08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h)
1233 {
1234 sen_rc r = sen_success;
1235 buffer *b;
1236 uint8_t *bs = NULL;
1237 buffer_rec *br;
1238 buffer_term *bt;
1239 uint32_t size, *a = array_at(inv, key);
1240 if (!a) { return sen_invalid_argument; }
1241 for (;;) {
1242 if (!*a) { goto exit; }
1243 if (*a & 1) {
1244 uint32_t rid = BIT31_12(*a);
1245 uint32_t sid = BIT11_01(*a);
1246 if (u->rid == rid && (!u->sid || u->sid == sid)) {
1247 *a = 0;
1248 sym_delete(inv, key, h);
1249 }
1250 goto exit;
1251 }
1252 if (!(bs = encode_rec(u, &size, 1))) {
1253 r = sen_memory_exhausted;
1254 goto exit;
1255 }
1256 if ((r = buffer_open(inv, *a, &bt, &b))) { goto exit; }
1257 // sen_log("b->header.buffer_free=%d size=%d", b->header.buffer_free, size);
1258 if (b->header.buffer_free < size) {
1259 uint32_t _a = *a;
1260 SEN_LOG(sen_log_debug, "flushing! b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
1261 buffer_close(inv, *a);
1262 if ((r = buffer_flush(inv, *a >> W_OF_SEGMENT, h))) { goto exit; }
1263 if (*a != _a) {
1264 SEN_LOG(sen_log_debug, "sen_inv_delete_one: *a changed %d->%d)", *a, _a);
1265 continue;
1266 }
1267 if ((r = buffer_open(inv, *a, &bt, &b))) { goto exit; }
1268 SEN_LOG(sen_log_debug, "flushed! b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
1269 if (b->header.buffer_free < size) {
1270 /* todo: must be splitted ? */
1271 SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_delete_one",
1272 *a, b->header.buffer_free, size);
1273 r = sen_memory_exhausted;
1274 buffer_close(inv, *a);
1275 goto exit;
1276 }
1277 }
1278
1279 b->header.buffer_free -= size;
1280 br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
1281 r = buffer_put(b, bt, br, bs, u, size);
1282 buffer_close(inv, *a);
1283 break;
1284 }
1285 exit :
1286 array_unref(inv, key);
1287 if (bs) { SEN_GFREE(bs); }
1288 return r;
1289 }
1290
/* Bit flags kept in sen_inv_cursor08.stat. */
#define CHUNK_USED    (1 << 0)  /* tested before decoding the next chunk posting */
#define BUFFER_USED   (1 << 1)  /* tested before decoding the next buffer record */
#define SOLE_DOC_USED (1 << 2)  /* set once an inline posting has been returned */
#define SOLE_POS_USED (1 << 3)  /* set once an inline posting's position has been returned */
1295
1296 sen_inv_cursor *
sen_inv_cursor_open08(sen_inv * inv,uint32_t key)1297 sen_inv_cursor_open08(sen_inv *inv, uint32_t key)
1298 {
1299 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1300 sen_inv_cursor08 *c = NULL;
1301 uint32_t pos, *a = array_at(inv, key);
1302 if (!a) { return NULL; }
1303 if (!(pos = *a)) { goto exit; }
1304 if (!(c = SEN_GMALLOC(sizeof(sen_inv_cursor08)))) { goto exit; }
1305 memset(c, 0, sizeof(sen_inv_cursor08));
1306 c->inv = inv;
1307 if (pos & 1) {
1308 c->stat = 0;
1309 c->pb.rid = BIT31_12(pos);
1310 c->pb.sid = BIT11_01(pos);
1311 c->pb.tf = 1;
1312 c->pb.score = 0;
1313 c->pb.pos = sen_sym_pocket_get(inv->lexicon, key);
1314 } else {
1315 uint32_t chunk;
1316 buffer_term *bt;
1317 c->pb.rid = 0; c->pb.sid = 0; /* for check */
1318 c->buffer_pos = pos;
1319 if (buffer_open(inv, pos, &bt, &c->buf)) {
1320 SEN_GFREE(c);
1321 c = NULL;
1322 goto exit;
1323 }
1324 if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != CHUNK_NOT_ASSIGNED) {
1325 c->cp = sen_io_win_map(inv->chunk, ctx, &c->iw,
1326 chunk, bt->pos_in_chunk, bt->size_in_chunk, sen_io_rdonly);
1327 if (!c->cp) {
1328 buffer_close(inv, pos);
1329 SEN_GFREE(c);
1330 c = NULL;
1331 goto exit;
1332 }
1333 c->cpe = c->cp + bt->size_in_chunk;
1334 c->pc.rid = 0;
1335 c->pc.sid = 0;
1336 }
1337 c->nextb = bt->pos_in_buffer;
1338 c->stat = CHUNK_USED|BUFFER_USED;
1339 }
1340 exit :
1341 array_unref(inv, key);
1342 return (sen_inv_cursor *) c;
1343 }
1344
1345 sen_rc
sen_inv_cursor_next08(sen_inv_cursor * c1)1346 sen_inv_cursor_next08(sen_inv_cursor *c1)
1347 {
1348 sen_inv_cursor08 *c = (sen_inv_cursor08 *)c1;
1349 if (c->buf) {
1350 for (;;) {
1351 if (c->stat & CHUNK_USED) {
1352 while (c->cp < c->cpe && c->pc.rest--) { SEN_B_SKIP(c->cp); }
1353 if (c->cp < c->cpe) {
1354 uint32_t gap;
1355 SEN_B_DEC(gap, c->cp);
1356 c->pc.rid += gap;
1357 if (gap) { c->pc.sid = 0; }
1358 SEN_B_DEC(gap, c->cp);
1359 c->pc.sid += gap;
1360 SEN_B_DEC(c->pc.tf, c->cp);
1361 if (c->pc.tf & 1) { SEN_B_DEC(c->pc.score, c->cp); } else { c->pc.score = 0; }
1362 c->pc.rest = c->pc.tf >>= 1;
1363 c->pc.pos = 0;
1364 } else {
1365 c->pc.rid = 0;
1366 }
1367 }
1368 if (c->stat & BUFFER_USED) {
1369 if (c->nextb) {
1370 uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */
1371 buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb);
1372 c->bp = NEXT_ADDR(br);
1373 SEN_B_DEC(c->pb.rid, c->bp);
1374 SEN_B_DEC(c->pb.sid, c->bp);
1375 if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) {
1376 SEN_LOG(sen_log_crit, "brokend!! (%d:%d) -> (%d:%d)", lrid, lsid, c->pb.rid, c->pb.sid);
1377 return sen_other_error;
1378 }
1379 c->nextb = br->step;
1380 SEN_B_DEC(c->pb.tf, c->bp);
1381 if (c->pb.tf & 1) { SEN_B_DEC(c->pb.score, c->bp); } else { c->pb.score = 0; }
1382 c->pb.rest = c->pb.tf >>= 1;
1383 c->pb.pos = 0;
1384 } else {
1385 c->pb.rid = 0;
1386 }
1387 }
1388 if (c->pb.rid) {
1389 if (c->pc.rid) {
1390 if (c->pc.rid < c->pb.rid) {
1391 c->stat = CHUNK_USED;
1392 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1393 } else {
1394 if (c->pb.rid < c->pc.rid) {
1395 c->stat = BUFFER_USED;
1396 if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
1397 } else {
1398 if (c->pb.sid) {
1399 if (c->pc.sid < c->pb.sid) {
1400 c->stat = CHUNK_USED;
1401 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1402 } else {
1403 c->stat = BUFFER_USED;
1404 if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; }
1405 if (c->pb.tf) { c->post = &c->pb; break; }
1406 }
1407 } else {
1408 c->stat = CHUNK_USED;
1409 }
1410 }
1411 }
1412 } else {
1413 c->stat = BUFFER_USED;
1414 if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
1415 }
1416 } else {
1417 if (c->pc.rid) {
1418 c->stat = CHUNK_USED;
1419 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1420 } else {
1421 c->post = NULL;
1422 return sen_other_error;
1423 }
1424 }
1425 }
1426 } else {
1427 if (c->stat & SOLE_DOC_USED) {
1428 c->post = NULL;
1429 return sen_other_error;
1430 } else {
1431 c->post = &c->pb;
1432 c->stat |= SOLE_DOC_USED;
1433 }
1434 }
1435 return sen_success;
1436 }
1437
1438 sen_rc
sen_inv_cursor_next_pos08(sen_inv_cursor * c1)1439 sen_inv_cursor_next_pos08(sen_inv_cursor *c1)
1440 {
1441 sen_inv_cursor08 *c = (sen_inv_cursor08 *)c1;
1442 uint32_t gap;
1443 sen_rc rc = sen_success;
1444 if (c->buf) {
1445 if (c->post == &c->pc) {
1446 if (c->pc.rest) {
1447 c->pc.rest--;
1448 SEN_B_DEC(gap, c->cp);
1449 c->pc.pos += gap;
1450 } else {
1451 rc = sen_other_error;
1452 }
1453 } else if (c->post == &c->pb) {
1454 if (c->pb.rest) {
1455 c->pb.rest--;
1456 SEN_B_DEC(gap, c->bp);
1457 c->pb.pos += gap;
1458 } else {
1459 rc = sen_other_error;
1460 }
1461 } else {
1462 rc = sen_other_error;
1463 }
1464 } else {
1465 if (c->stat & SOLE_POS_USED) {
1466 rc = sen_other_error;
1467 } else {
1468 c->stat |= SOLE_POS_USED;
1469 }
1470 }
1471 return rc;
1472 }
1473
1474 sen_rc
sen_inv_cursor_close08(sen_inv_cursor * c1)1475 sen_inv_cursor_close08(sen_inv_cursor *c1)
1476 {
1477 sen_inv_cursor08 *c = (sen_inv_cursor08 *) c1;
1478 if (!c) { return sen_invalid_argument; }
1479 if (c->cp) { sen_io_win_unmap(&c->iw); }
1480 if (c->buf) { buffer_close(c->inv, c->buffer_pos); }
1481 SEN_GFREE(c);
1482 return sen_success;
1483 }
1484
1485 uint32_t
sen_inv_estimate_size08(sen_inv * inv,uint32_t key)1486 sen_inv_estimate_size08(sen_inv *inv, uint32_t key)
1487 {
1488 uint32_t res, pos, *a = array_at(inv, key);
1489 if (!a) { return 0; }
1490 if ((pos = *a)) {
1491 if (pos & 1) {
1492 res = 1;
1493 } else {
1494 buffer *buf;
1495 buffer_term *bt;
1496 if (buffer_open(inv, pos, &bt, &buf)) {
1497 res = 0;
1498 } else {
1499 res = (bt->size_in_chunk >> 2) + bt->size_in_buffer + 2;
1500 buffer_close(inv, pos);
1501 }
1502 }
1503 } else {
1504 res = 0;
1505 }
1506 array_unref(inv, key);
1507 return res;
1508 }
1509
1510 int
sen_inv_entry_info08(sen_inv * inv,unsigned key,unsigned * a,unsigned * pocket,unsigned * chunk,unsigned * chunk_size,unsigned * buffer_free,unsigned * nterms,unsigned * nterms_void,unsigned * tid,unsigned * size_in_chunk,unsigned * pos_in_chunk,unsigned * size_in_buffer,unsigned * pos_in_buffer)1511 sen_inv_entry_info08(sen_inv *inv, unsigned key, unsigned *a, unsigned *pocket,
1512 unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
1513 unsigned *nterms, unsigned *nterms_void, unsigned *tid,
1514 unsigned *size_in_chunk, unsigned *pos_in_chunk,
1515 unsigned *size_in_buffer, unsigned *pos_in_buffer)
1516 {
1517 buffer *b;
1518 buffer_term *bt;
1519 uint32_t *ap = array_at(inv, key);
1520 *pocket = sen_sym_pocket_get(inv->lexicon, key);
1521 if (!ap) { return 0; }
1522 *a = *ap;
1523 array_unref(inv, key);
1524 if (!*a) { return 1; }
1525 if (*a & 1) { return 2; }
1526 if (buffer_open(inv, *a, &bt, &b)) { return 3; }
1527 *chunk = b->header.chunk;
1528 *chunk_size = b->header.chunk_size;
1529 *buffer_free = b->header.buffer_free;
1530 *nterms = b->header.nterms;
1531 *tid = bt->tid;
1532 *size_in_chunk = bt->size_in_chunk;
1533 *pos_in_chunk = bt->pos_in_chunk;
1534 *size_in_buffer = bt->size_in_buffer;
1535 *pos_in_buffer = bt->pos_in_buffer;
1536 buffer_close(inv, *a);
1537 return 4;
1538 }
1539