1 /* Copyright(C) 2004 Brazil
2 
3   This library is free software; you can redistribute it and/or
4   modify it under the terms of the GNU Lesser General Public
5   License as published by the Free Software Foundation; either
6   version 2.1 of the License, or (at your option) any later version.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 #include "senna_in.h"
18 #include <fcntl.h>
19 #include <string.h>
20 #include <sys/stat.h>
21 
22 #include "str.h"
23 #include "sym.h"
24 #include "inv.h"
25 
/* In-memory handle of one inverted index. */
struct _sen_inv {
  uint8_t v08p;   /* NOTE(review): presumably marks the 0.8 file format -- confirm */
  sen_io *seg;    /* io object whose segments hold the array and buffer pages */
  sen_io *chunk;  /* io object that chunk windows are mapped from */
  sen_sym *lexicon; /* symbol table of the indexed terms */
  struct sen_inv_header *header; /* segment table and chunk usage table */
#ifdef USE_QUERY_ABORT
  int (*check_abort)(void*);
  void *check_abort_arg; /* NOTE(review): presumably passed to check_abort -- confirm */
#endif /* USE_QUERY_ABORT */
  uint32_t total_chunk_size; /* read by the buffer selection heuristics in buffer_new() */
  /* logical array segment number -> physical segment, or SEG_NOT_ASSIGNED */
  uint16_t ainfo[SEN_INV_MAX_SEGMENT];
  /* logical buffer segment number -> physical segment, or SEG_NOT_ASSIGNED */
  uint16_t binfo[SEN_INV_MAX_SEGMENT];
  uint16_t amax;  /* highest logical array segment number known to be assigned */
  uint16_t bmax;  /* highest logical buffer segment number known to be assigned */
};
42 
/*
 * Cursor state for the 0.8 index layout.
 * None of the members are used in the part of the file reviewed here, so the
 * per-member notes are reviewer assumptions to be confirmed against the
 * cursor functions.
 */
typedef struct {
  sen_inv *inv;
  sen_inv_posting pc;     /* NOTE(review): presumably the current posting read from the chunk */
  sen_inv_posting pb;     /* NOTE(review): presumably the current posting read from the buffer */
  sen_inv_posting *post;  /* NOTE(review): presumably points at pc or pb -- confirm */
  uint8_t *cp;            /* NOTE(review): presumably read pointer into chunk data */
  uint8_t *cpe;           /* NOTE(review): presumably end of chunk data */
  uint8_t *bp;            /* NOTE(review): presumably read pointer into a buffer record */
  sen_io_win iw;
  struct sen_inv_buffer *buf;
  uint16_t stat;
  uint16_t nextb;         /* NOTE(review): presumably position of the next buffer record */
  uint32_t buffer_pos;
} sen_inv_cursor08;
57 
/*
 * Index header reached through sen_inv.header.
 * The member order is part of the index format (see the todo notes below),
 * so it must not be rearranged.
 */
struct sen_inv_header {
  char idstr[16];  /* NOTE(review): presumably holds SEN_INV_IDSTR -- confirm */
  uint32_t initial_n_segments;
  // todo: initial_n_segments should be uint16_t
  // uint32_t total_chunk_size; todo: should be added when index format changed
  /* physical segment -> (SEGMENT_ARRAY or SEGMENT_BUFFER) | logical segment
     number; 0 means the physical segment is free */
  uint16_t segments[SEN_INV_MAX_SEGMENT];
  // todo: exchange segments and ainfo,binfo.
  /* one byte per chunk, non-zero when the chunk is in use; indexed well past
     the declared length by chunk_new()/chunk_free() */
  uint8_t chunks[1]; /* dummy */
};
67 
/* NOTE(review): presumably the signature stored in sen_inv_header.idstr -- confirm */
#define SEN_INV_IDSTR "SENNA:INV:00.00"
/* size in bytes of one segment (array page or buffer page) */
#define SEN_INV_SEGMENT_SIZE 0x40000
/* SEN_INV_MAX_SEGMENT == 0x10000 >> 2 */
/* allocation unit, in bytes, of the chunk file */
#define SEN_INV_CHUNK_SIZE   0x40000
#define N_CHUNKS_PER_FILE (SEN_IO_FILE_SIZE / SEN_INV_CHUNK_SIZE)
/* log2(SEN_INV_SEGMENT_SIZE): a buffer position >> W_OF_SEGMENT is its logical segment */
#define W_OF_SEGMENT 18
/* log2 of the number of uint32_t slots in an array segment: id >> W_OF_ARRAY is its logical segment */
#define W_OF_ARRAY (W_OF_SEGMENT - 2)
/* mask selecting the uint32_t slot index inside an array segment */
#define ARRAY_MASK_IN_A_SEGMENT ((SEN_INV_SEGMENT_SIZE >> 2) - 1)
/* mask selecting the byte offset inside a buffer segment */
#define BUFFER_MASK_IN_A_SEGMENT (SEN_INV_SEGMENT_SIZE - 1)
/* buffer_header.chunk value of a buffer that owns no chunk */
#define CHUNK_NOT_ASSIGNED 0xffffffff
/* ainfo[]/binfo[] value of a logical segment without a physical segment */
#define SEG_NOT_ASSIGNED 0xffff

/* type flags OR'ed with the logical segment number in sen_inv_header.segments[] */
#define SEGMENT_ARRAY 0x8000
#define SEGMENT_BUFFER 0x4000
/* extracts the logical segment number from a segments[] entry */
#define SEGMENT_MASK (SEN_INV_MAX_SEGMENT - 1)
83 
/*
 * Field extractors for a packed 32-bit value: BIT11_01 yields bits 1..11,
 * BIT31_12 yields bits 12..31.  The argument is parenthesized so that
 * expressions built from low-precedence operators (e.g. `a | b`) are
 * shifted as a whole.
 */
#define BIT11_01(x) (((x) >> 1) & 0x7ff)
#define BIT31_12(x) ((x) >> 12)
86 
/* NOTE(review): presumably the default for sen_inv_header.initial_n_segments -- confirm */
#define SEN_INV_INITIAL_N_SEGMENTS 512
/* chunk_new() scans initial_n_segments * MAX_CHUNK_RATIO chunk slots */
#define MAX_CHUNK_RATIO 64

/* address of the first byte after the object p points to */
#define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p))
91 
92 /* segment */
93 
94 inline static sen_rc
segment_get(sen_inv * inv,uint16_t type,uint16_t segno,uint16_t * pseg)95 segment_get(sen_inv *inv, uint16_t type, uint16_t segno, uint16_t *pseg)
96 {
97   uint16_t s, i, empty = SEN_INV_MAX_SEGMENT;
98   for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
99     if ((s = inv->header->segments[i])) {
100       if (s == (type | segno)) { break; }
101     } else {
102       if (empty == SEN_INV_MAX_SEGMENT) { empty = i; }
103     }
104   }
105   if (i == SEN_INV_MAX_SEGMENT) {
106     void *p = NULL;
107     if (empty == SEN_INV_MAX_SEGMENT) { return sen_memory_exhausted; }
108     inv->header->segments[empty] = type | segno;
109     SEN_IO_SEG_REF(inv->seg, empty, p);
110     if (!p) { return sen_memory_exhausted; }
111     memset(p, 0, SEN_INV_SEGMENT_SIZE);
112     SEN_IO_SEG_UNREF(inv->seg, empty);
113     *pseg = empty;
114   } else {
115     *pseg = i;
116   }
117   return sen_success;
118 }
119 
120 inline static sen_rc
segment_new(sen_inv * inv,uint16_t type,uint16_t * segno)121 segment_new(sen_inv *inv, uint16_t type, uint16_t *segno)
122 {
123   sen_rc rc = sen_success;
124   uint16_t s, i, seg, empty = SEN_INV_MAX_SEGMENT;
125   char used[SEN_INV_MAX_SEGMENT];
126   memset(used, 0, SEN_INV_MAX_SEGMENT);
127   for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
128     if ((s = inv->header->segments[i])) {
129       if (s & type) { used[s & SEGMENT_MASK]++; }
130     } else {
131       if (empty == SEN_INV_MAX_SEGMENT) { empty = i; }
132     }
133   }
134   if (empty == SEN_INV_MAX_SEGMENT) { return sen_memory_exhausted; }
135   if (segno && *segno < SEN_INV_MAX_SEGMENT) {
136     if (used[*segno]) { return sen_invalid_argument; }
137     seg = *segno;
138   } else {
139     for (seg = 0; used[seg]; seg++) ;
140   }
141   inv->header->segments[empty] = type | seg;
142   switch (type) {
143   case SEGMENT_ARRAY :
144     inv->ainfo[seg] = empty;
145     if (seg > inv->amax) { inv->amax = seg; }
146     break;
147   case SEGMENT_BUFFER :
148     inv->binfo[seg] = empty;
149     if (seg > inv->bmax) { inv->bmax = seg; }
150     break;
151   }
152   if (segno) { *segno = seg; }
153   return rc;
154 }
155 
156 inline static sen_rc
load_all_segments(sen_inv * inv)157 load_all_segments(sen_inv *inv)
158 {
159   sen_rc rc = sen_success;
160   uint16_t s, seg, amax = 0, bmax = 0;
161   char used[SEN_INV_MAX_SEGMENT];
162   memset(used, 0, SEN_INV_MAX_SEGMENT);
163   for (seg = 0; seg < SEN_INV_MAX_SEGMENT; seg++) {
164     if (!(s = inv->header->segments[seg])) { continue; }
165     if (s & SEGMENT_ARRAY) {
166       used[s & SEGMENT_MASK] |= 2;
167       inv->ainfo[s & SEGMENT_MASK] = seg;
168     }
169     if (s & SEGMENT_BUFFER) {
170       used[s & SEGMENT_MASK] |= 1;
171       inv->binfo[s & SEGMENT_MASK] = seg;
172     }
173   }
174   for (seg = 0; seg < SEN_INV_MAX_SEGMENT; seg++) {
175     if ((used[seg] & 2)) { amax = seg; } else { inv->ainfo[seg] = SEG_NOT_ASSIGNED; }
176     if ((used[seg] & 1)) { bmax = seg; } else { inv->binfo[seg] = SEG_NOT_ASSIGNED; }
177   }
178   inv->amax = amax;
179   inv->bmax = bmax;
180   return rc;
181 }
182 
183 void
sen_inv_seg_expire08(sen_inv * inv)184 sen_inv_seg_expire08(sen_inv *inv)
185 {
186   uint32_t expire_threshold = inv->header->initial_n_segments * 2;
187   if (inv->seg->nmaps > expire_threshold) {
188     uint16_t seg;
189     for (seg = inv->bmax; seg; seg--) {
190       uint16_t pseg = inv->binfo[seg];
191       if (pseg != SEG_NOT_ASSIGNED) {
192         sen_io_mapinfo *info = &inv->seg->maps[pseg];
193         uint32_t *pnref = &inv->seg->nrefs[pseg];
194         if (info->map && !*pnref) {
195           sen_io_seg_expire(inv->seg, pseg, 100);
196           if (inv->seg->nmaps <= expire_threshold) { return; }
197         }
198       }
199     }
200     for (seg = inv->amax; seg; seg--) {
201       uint16_t pseg = inv->ainfo[seg];
202       if (pseg != SEG_NOT_ASSIGNED) {
203         sen_io_mapinfo *info = &inv->seg->maps[pseg];
204         uint32_t *pnref = &inv->seg->nrefs[pseg];
205         if (info->map && !*pnref) {
206           sen_io_seg_expire(inv->seg, pseg, 100);
207           if (inv->seg->nmaps <= expire_threshold) { return; }
208         }
209       }
210     }
211   }
212 }
213 
214 /* chunk */
215 
216 inline static sen_rc
chunk_new(sen_inv * inv,uint32_t * res,uint32_t size)217 chunk_new(sen_inv *inv, uint32_t *res, uint32_t size)
218 {
219   int i, j;
220   uint32_t n = size / SEN_INV_CHUNK_SIZE;
221   int max_chunk = inv->header->initial_n_segments * MAX_CHUNK_RATIO;
222   uint32_t base_seg = sen_io_base_seg(inv->chunk);
223   if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
224   for (i = 0, j = -1; i < max_chunk; i++) {
225     if (inv->header->chunks[i]) {
226       j = i;
227     } else {
228       if (i - j == n) {
229         if (res) { *res = j + 1; }
230         while (j < i) {
231           inv->header->chunks[++j] = 1;
232         }
233         return sen_success;
234       }
235       if ((i + base_seg)/ N_CHUNKS_PER_FILE !=
236           (i + base_seg + 1) / N_CHUNKS_PER_FILE) { j = i; }
237     }
238   }
239   SEN_LOG(sen_log_crit, "index full. set bigger value to initial_n_segments. current value = %d",
240           inv->header->initial_n_segments);
241   return sen_memory_exhausted;
242 }
243 
244 inline static sen_rc
chunk_free(sen_inv * inv,int start,uint32_t size)245 chunk_free(sen_inv *inv, int start, uint32_t size)
246 {
247   uint32_t i, n = size / SEN_INV_CHUNK_SIZE;
248   if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
249   // sen_log("chunk_free start=%d size=%d(%d)", start, size, n);
250   for (i = 0; i < n; i++) {
251     inv->header->chunks[start + i] = 0;
252   }
253   return sen_success;
254 }
255 
256 /* buffer */
257 
/* Per-term entry of a buffer segment. */
typedef struct {
  uint32_t tid;            /* term id; 0 marks an unused (void) entry */
  uint32_t size_in_chunk;  /* byte length of the term's data inside the buffer's chunk */
  uint32_t pos_in_chunk;   /* byte offset of the term's data inside the buffer's chunk */
  uint16_t size_in_buffer; /* incremented by buffer_put() when a record is appended to the list */
  uint16_t pos_in_buffer;  /* position (see BUFFER_REC_AT) of the first record; 0 = empty list */
} buffer_term;

/* Link header placed in front of every encoded record in a buffer segment. */
typedef struct {
  uint16_t step;  /* position of the next record in the list; 0 ends the list */
  uint16_t jump;  /* position of a record further down the list; 0 = none, 1 = record deleted */
} buffer_rec;

/* Header at the start of every buffer segment. */
typedef struct {
  uint32_t chunk;        /* first chunk owned by this buffer, or CHUNK_NOT_ASSIGNED */
  uint32_t chunk_size;   /* byte size of the data stored in that chunk */
  uint32_t buffer_free;  /* bytes still unallocated in this segment */
  uint16_t nterms;       /* number of entries of terms[] handed out so far */
  uint16_t nterms_void;  /* how many of those entries are void (tid == 0) */
} buffer_header;

/* A buffer segment: the header followed by the term table; records are
   addressed inside the same segment via BUFFER_REC_AT. */
struct sen_inv_buffer {
  buffer_header header;
  buffer_term terms[(SEN_INV_SEGMENT_SIZE - sizeof(buffer_header))/sizeof(buffer_term)];
};

typedef struct sen_inv_buffer buffer;
285 
286 inline static sen_rc
buffer_open(sen_inv * inv,uint32_t pos,buffer_term ** bt,buffer ** b)287 buffer_open(sen_inv *inv, uint32_t pos, buffer_term **bt, buffer **b)
288 {
289   byte *p = NULL;
290   uint16_t lseg = (uint16_t) (pos >> W_OF_SEGMENT);
291   uint16_t pseg = inv->binfo[lseg];
292   if (pseg == SEG_NOT_ASSIGNED ||
293       inv->header->segments[pseg] != (SEGMENT_BUFFER|lseg)) {
294     load_all_segments(inv);
295     pseg = inv->binfo[lseg];
296     if (pseg == SEG_NOT_ASSIGNED ||
297         inv->header->segments[pseg] != (SEGMENT_BUFFER|lseg)) {
298       return sen_invalid_argument;
299     }
300   }
301   SEN_IO_SEG_REF(inv->seg, pseg, p);
302   if (!p) { return sen_memory_exhausted; }
303   if (b) { *b = (buffer *)p; }
304   if (bt) { *bt = (buffer_term *)(p + (pos & BUFFER_MASK_IN_A_SEGMENT)); }
305   return sen_success;
306 }
307 
308 inline static sen_rc
buffer_close(sen_inv * inv,uint32_t pos)309 buffer_close(sen_inv *inv, uint32_t pos)
310 {
311   uint16_t pseg = inv->binfo[pos >> W_OF_SEGMENT];
312   if (pseg >= SEN_INV_MAX_SEGMENT) { return sen_invalid_argument; }
313   SEN_IO_SEG_UNREF(inv->seg, pseg);
314   return sen_success;
315 }
316 
317 inline static int
buffer_open_if_capable(sen_inv * inv,int32_t seg,int size,buffer ** b)318 buffer_open_if_capable(sen_inv *inv, int32_t seg, int size, buffer **b)
319 {
320   int res, nterms;
321   uint32_t pos = ((uint32_t) seg) * SEN_INV_SEGMENT_SIZE;
322   if (buffer_open(inv, pos, NULL, b)) { return 0; }
323   nterms = (*b)->header.nterms - (*b)->header.nterms_void;
324   res = ((nterms < 4096 ||
325           (inv->total_chunk_size >> ((nterms >> 8) - 6)) > (*b)->header.chunk_size) &&
326          ((*b)->header.buffer_free >= size + sizeof(buffer_term)));
327   if (!res) { buffer_close(inv, pos); }
328   return res;
329 }
330 
/*
 * Reserve a term entry plus `size` bytes of record space in some buffer
 * segment, creating a new buffer segment when no existing one qualifies.
 *
 * inv   index handle
 * size  number of record bytes to reserve
 * pos   receives the buffer position of the reserved buffer_term
 * bt    receives the address of the reserved buffer_term
 * br    receives the address computed from the end of the remaining free area
 * bp    receives the buffer; it stays referenced, the caller must close it
 * hint  selects the starting segment (hint % initial_n_segments)
 *
 * Returns sen_invalid_argument when the request can never fit in a segment,
 * sen_memory_exhausted when no segment can be created or opened.
 */
inline static sen_rc
buffer_new(sen_inv *inv, int size, uint32_t *pos,
           buffer_term **bt, buffer_rec **br, buffer **bp, int hint)
{
  buffer *b;
  uint16_t nseg0 = inv->header->initial_n_segments;
  uint16_t seg, offset, seg0 = hint % nseg0;
  uint16_t segmax = (uint16_t) (inv->total_chunk_size >> 7) + nseg0;
  if (size + sizeof(buffer_header) + sizeof(buffer_term) > SEN_INV_SEGMENT_SIZE) {
    return sen_invalid_argument;
  }
  // load_all_segments(inv); todo: ainfo and binfo should be inside the header
  /* probe 1: seg0, seg0 + nseg0, seg0 + 2 * nseg0, ... below segmax */
  for (seg = seg0; seg < segmax; seg += nseg0) {
    if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
    if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
  }
  if (seg >= segmax) {
    /* probe 2: the other segments below nseg0, cyclically after seg0 */
    for (seg = (seg0 + 1) % nseg0; seg != seg0; seg = (seg + 1) % nseg0) {
      if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
      if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
    }
    if (seg == seg0) {
      /* probe 3: every segment from nseg0 upwards */
      for (seg = nseg0; seg < SEN_INV_MAX_SEGMENT; seg++) {
        if (inv->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
        if (buffer_open_if_capable(inv, seg, size, &b)) { goto exit; }
      }
    }
  }
  /* no existing buffer qualified: seg is the logical number where probing
     stopped and is passed to segment_new() as the requested number */
  SEN_LOG(sen_log_debug, "inv=%p new seg=%d", inv, seg);
  if (segment_new(inv, SEGMENT_BUFFER, &seg) ||
      buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &b)) {
    return sen_memory_exhausted;
  }
  memset(b, 0, SEN_INV_SEGMENT_SIZE);
  b->header.buffer_free = SEN_INV_SEGMENT_SIZE - sizeof(buffer_header);
  b->header.chunk = CHUNK_NOT_ASSIGNED;
  b->header.chunk_size = 0;
exit :
  if (b->header.nterms_void) {
    /* reuse a void entry (tid == 0) instead of growing the term table */
    for (offset = 0; offset < b->header.nterms; offset++) {
      if (!b->terms[offset].tid) { break; }
    }
    if (offset == b->header.nterms) {
      /* the counter claimed a void entry but none exists: reset the counter
         and append a fresh entry instead */
      SEN_LOG(sen_log_notice, "inconsistent buffer(%d)", seg);
      b->header.nterms_void = 0;
      b->header.nterms++;
      b->header.buffer_free -= size + sizeof(buffer_term);
    } else {
      b->header.nterms_void--;
      /* the entry already exists, so only the record bytes are charged */
      b->header.buffer_free -= size;
    }
  } else {
    offset = b->header.nterms++;
    b->header.buffer_free -= size + sizeof(buffer_term);
  }
  *pos = seg * SEN_INV_SEGMENT_SIZE
    + sizeof(buffer_header) + sizeof(buffer_term) * offset;
  *bt = &b->terms[offset];
  /* end of the term table plus the remaining free bytes, i.e. the first byte
     after the free gap */
  *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
  *bp = b;
  return sen_success;
}
393 
/* Decoded (rid, sid) pair; records in a buffer list are ordered by it. */
typedef struct {
  uint32_t rid;
  uint32_t sid;
} docid;

/* a jump value of 1 is the "deleted" marker of a buffer record */
#define BUFFER_REC_DEL(r)  ((r)->jump = 1)
#define BUFFER_REC_DELETED(r) ((r)->jump == 1)

/* record positions count in units of sizeof(buffer_rec) from the start of
   the buffer segment */
#define BUFFER_REC_AT(b,pos) ((buffer_rec *)(b) + (pos))
#define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b)))
404 
405 inline static void
buffer_term_dump(buffer * b,buffer_term * bt)406 buffer_term_dump(buffer *b, buffer_term *bt)
407 {
408   int pos, rid, sid;
409   uint8_t *p;
410   buffer_rec *r;
411   SEN_LOG(sen_log_debug,
412           "b=(%x %u %u %u)", b->header.chunk, b->header.chunk_size, b->header.buffer_free, b->header.nterms);
413   SEN_LOG(sen_log_debug,
414           "bt=(%u %u %u %u %u)", bt->tid, bt->size_in_chunk, bt->pos_in_chunk, bt->size_in_buffer, bt->pos_in_buffer);
415   for (pos = bt->pos_in_buffer; pos; pos = r->step) {
416     r = BUFFER_REC_AT(b, pos);
417     p = NEXT_ADDR(r);
418     SEN_B_DEC(rid, p);
419     SEN_B_DEC(sid, p);
420     SEN_LOG(sen_log_debug, "%d=(%d:%d),(%d:%d)", pos, r->jump, r->step, rid, sid);
421   }
422 }
423 
/* Debug aid: buffer_put() stores the term it is working on here so that
   check_jump() can dump that term's list when it detects a cycle.
   File-scope state, shared by every caller in this translation unit. */
static buffer_term *tmp_bt;
425 
426 inline static sen_rc
check_jump(buffer * b,buffer_rec * r,int j)427 check_jump(buffer *b, buffer_rec *r, int j)
428 {
429   uint16_t i = BUFFER_REC_POS(b, r);
430   uint8_t *p;
431   buffer_rec *r2;
432   docid id, id2;
433   if (!j) { return sen_success; }
434   p = NEXT_ADDR(r);
435   SEN_B_DEC(id.rid, p);
436   SEN_B_DEC(id.sid, p);
437   if (j == 1) {
438     SEN_LOG(sen_log_debug, "deleting! %d(%d:%d)", i, id.rid, id.sid);
439     return sen_success;
440   }
441   r2 = BUFFER_REC_AT(b, j);
442   p = NEXT_ADDR(r2);
443   SEN_B_DEC(id2.rid, p);
444   SEN_B_DEC(id2.sid, p);
445   if (r2->step == i) {
446     SEN_LOG(sen_log_emerg, "cycle! %d(%d:%d)<->%d(%d:%d)", i, id.rid, id.sid, j, id2.rid, id2.sid);
447     buffer_term_dump(b, tmp_bt);
448     return sen_other_error;
449   }
450   if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) {
451     SEN_LOG(sen_log_crit, "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)", i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step, id2.rid, id2.sid);
452     return sen_other_error;
453   }
454   return sen_success;
455 }
456 
457 inline static sen_rc
set_jump_r(buffer * b,buffer_rec * from,int to)458 set_jump_r(buffer *b, buffer_rec *from, int to)
459 {
460   int i, j, max_jump = 100;
461   buffer_rec *r, *r2;
462   for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
463     r2 = BUFFER_REC_AT(b, j);
464     if (r == r2) { break; }
465     if (BUFFER_REC_DELETED(r2)) { break; }
466     if (j == (i = r->jump)) { break; }
467     if (j == r->step) { break; }
468     if (check_jump(b, r, j)) { return sen_other_error; }
469     r->jump = j;
470     j = i;
471     if (!r->step) { return sen_other_error; }
472   }
473   return sen_success;
474 }
475 
/*
 * Store in n the number of set bits among the low 32 bits of x, by summing
 * adjacent bit fields of width 1, 2, 4, 8 and 16 in turn.  n must be an
 * lvalue; it is used as scratch while counting.  The macro expands to a
 * brace-enclosed block.
 */
#define GET_NUM_BITS(x,n) { \
  n = x; \
  n = (n & 0x55555555) + ((n >> 1) & 0x55555555); \
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333); \
  n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F); \
  n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF); \
  n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF); \
}
484 
/*
 * Link the new record `rnew` into the record list of term `bt` inside
 * buffer `b`, keeping the list ordered by (rid, sid).
 *
 * b     buffer segment holding the list
 * bt    term entry whose list is updated
 * rnew  location of the new record inside `b`
 * bs    encoded record body, copied right behind the buffer_rec header
 * u     update spec; u->rid / u->sid give the sort key of the new record
 * size  total record size including the buffer_rec header
 *
 * Existing records with the same rid are marked deleted when u->sid is 0;
 * an existing record with the same (rid, sid) is marked deleted otherwise.
 * While walking the list the function also maintains the jump links used to
 * skip ahead.
 *
 * Returns sen_success, or sen_invalid_format when the list was found to be
 * out of order; in that case the old list is abandoned and `rnew` becomes
 * the only record of the term.
 */
inline static sen_rc
buffer_put(buffer *b, buffer_term *bt, buffer_rec *rnew, uint8_t *bs,
           sen_inv_updspec *u, int size)
{
  uint8_t *p;
  sen_rc rc = sen_success;
  docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
  buffer_rec *r_curr, *r_start = NULL;
  /* lastp points at the link (term head or some record's step) that leads to
     the record examined next; last is the position of the record owning it */
  uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
  int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1;

  /* publish the term for check_jump()'s diagnostic dump */
  tmp_bt = bt; // test

  memcpy(NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
  //  sen_log("tid=%d u->rid=%d u->sid=%d", bt->tid, u->rid, u->sid);
  for (;;) {
    //    sen_log("*lastp=%d", *lastp);
    if (!*lastp) {
      /* reached the end of the list: append the new record */
      rnew->step = 0;
      rnew->jump = 0;
      *lastp = pos;
      if (bt->size_in_buffer++ > 1) {
        /* the head record's jump link tracks the newest tail */
        buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
        rhead->jump = pos;
        if (!(bt->size_in_buffer & 1)) {
          /* on even sizes: follow up to popcount(size) jump links starting
             at the second record, then link that record to the former tail */
          int n;
          buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
          GET_NUM_BITS(bt->size_in_buffer, n);
          while (n-- && (r->jump > 1)) {
            r2 = BUFFER_REC_AT(b, r->jump);
            if (BUFFER_REC_DELETED(r2)) { break; }
            r = r2;
          }
          if (r != rnew) { set_jump_r(b, r, last); }
        }
      }
      break;
    }
    r_curr = BUFFER_REC_AT(b, *lastp);
    p = NEXT_ADDR(r_curr);
    SEN_B_DEC(id_curr.rid, p);
    SEN_B_DEC(id_curr.sid, p);
    /* ids must never decrease along the walk; if they do the list is broken */
    if (id_curr.rid < id_post.rid ||
        (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
      SEN_LOG(sen_log_emerg, "loop found!!! (%d:%d)->(%d:%d)",
              id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
      buffer_term_dump(b, bt);
      /* abandon corrupt list */
      bt->pos_in_buffer = 0;
      bt->size_in_buffer = 0;
      lastp = &bt->pos_in_buffer;
      rc = sen_invalid_format;
      /* the next iteration sees an empty list and appends rnew */
      continue;
    }
    id_post.rid = id_curr.rid;
    id_post.sid = id_curr.sid;
    if (u->rid < id_curr.rid || (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
      /* the new record sorts at or before the current one: insert here */
      uint16_t step = *lastp, jump = r_curr->jump;
      if (u->rid == id_curr.rid) {
        if (u->sid == 0) {
          /* sid 0: mark every following record of this rid as deleted */
          while (id_curr.rid == u->rid) {
            BUFFER_REC_DEL(r_curr);
            if (!(step = r_curr->step)) { break; }
            r_curr = BUFFER_REC_AT(b, step);
            p = NEXT_ADDR(r_curr);
            SEN_B_DEC(id_curr.rid, p);
            SEN_B_DEC(id_curr.sid, p);
          }
        } else if (u->sid == id_curr.sid) {
          /* same (rid, sid): the new record supersedes the current one */
          BUFFER_REC_DEL(r_curr);
          step = r_curr->step;
        }
      }
      rnew->step = step;
      /* inherit the replaced record's jump link only if it is still valid */
      rnew->jump = check_jump(b, rnew, jump) ? 0 : jump;
      *lastp = pos;
      break;
    }

    /* bookkeeping that decides where jump links get (re)installed;
       NOTE(review): reads like a heuristic that spreads links by halving the
       remaining id distance (delta0 >> vhops) -- confirm the intent */
    if (reset) {
      r_start = r_curr;
      id_start.rid = id_curr.rid;
      id_start.sid = id_curr.sid;
      if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; }
      nhops = 0;
      vhops = 1;
      vdelta = delta0 >> 1;
    } else {
      if (!(delta = id_curr.rid - id_start.rid)) { delta = id_curr.sid - id_start.sid; }
      if (vdelta < delta) {
        vdelta += (delta0 >> ++vhops);
        r_start = r_curr;
      }
      if (nhops > vhops) {
        set_jump_r(b, r_start, *lastp);
      } else {
        nhops++;
      }
    }

    /* advance: by default follow the step link of the current record */
    last = *lastp;
    lastp = &r_curr->step;
    reset = 0;
    {
      uint16_t posj = r_curr->jump;
      if (posj > 1) {
        buffer_rec *rj = BUFFER_REC_AT(b, posj);
        if (!BUFFER_REC_DELETED(rj)) {
          docid idj;
          p = NEXT_ADDR(rj);
          SEN_B_DEC(idj.rid, p);
          SEN_B_DEC(idj.sid, p);
          if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) {
            /* the jump target still sorts before the new record: skip to it */
            last = posj;
            lastp = &rj->step;
          } else {
            /* jump would overshoot: keep stepping and restart the bookkeeping */
            reset = 1;
          }
        }
      }
    }
  }
  return rc;
}
609 
610 /* array */
611 
612 inline static uint32_t *
array_at(sen_inv * inv,uint32_t id)613 array_at(sen_inv *inv, uint32_t id)
614 {
615   byte *p = NULL;
616   uint16_t seg, pseg;
617   if (id > SEN_SYM_MAX_ID) { return NULL; }
618   seg = id >> W_OF_ARRAY;
619   if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
620     load_all_segments(inv);
621     if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
622       return NULL;
623     }
624   }
625   SEN_IO_SEG_REF(inv->seg, pseg, p);
626   if (!p) { return NULL; }
627   return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
628 }
629 
630 inline static uint32_t *
array_get(sen_inv * inv,uint32_t id)631 array_get(sen_inv *inv, uint32_t id)
632 {
633   byte *p = NULL;
634   uint16_t seg, pseg;
635   if (id > SEN_SYM_MAX_ID) { return NULL; }
636   seg = id >> W_OF_ARRAY;
637   if ((pseg = inv->ainfo[seg]) == SEG_NOT_ASSIGNED) {
638     if (segment_get(inv, SEGMENT_ARRAY, seg, &pseg)) { return NULL; }
639     inv->ainfo[seg] = pseg;
640     if (seg > inv->amax) { inv->amax = seg; }
641   }
642   SEN_IO_SEG_REF(inv->seg, pseg, p)
643   if (!p) { return NULL; }
644   return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
645 }
646 
647 inline static void
array_unref(sen_inv * inv,uint32_t id)648 array_unref(sen_inv *inv, uint32_t id)
649 {
650   SEN_IO_SEG_UNREF(inv->seg, inv->ainfo[id >> W_OF_ARRAY]);
651 }
652 
653 inline static uint8_t *
encode_rec(sen_inv_updspec * u,unsigned int * size,int deletep)654 encode_rec(sen_inv_updspec *u, unsigned int *size, int deletep)
655 {
656   intptr_t s;
657   uint8_t *br, *p;
658   struct _sen_inv_pos *pp;
659   uint32_t lpos, tf = deletep ? 0 : u->tf;
660   if (!(br = SEN_GMALLOC((u->tf + 4) * 5))) {
661     return NULL;
662   }
663   p = br;
664   SEN_B_ENC(u->rid, p);
665   SEN_B_ENC(u->sid, p);
666   if (!u->score) {
667     SEN_B_ENC(tf * 2, p);
668   } else {
669     SEN_B_ENC(tf * 2 + 1, p);
670     SEN_B_ENC(u->score, p);
671   }
672   for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) {
673     SEN_B_ENC(pp->pos - lpos, p);
674   }
675   s = (p - br) + sizeof(buffer_rec);
676   *size = (unsigned int) ((s + 0x03) & ~0x03);
677   return br;
678 }
679 
680 inline static int
sym_deletable(uint32_t tid,sen_set * h)681 sym_deletable(uint32_t tid, sen_set *h)
682 {
683   sen_inv_updspec **u;
684   if (!h) { return 1; }
685   if (!sen_set_at(h, &tid, (void **) &u)) { return 1; }
686   if (!(*u)->tf || !(*u)->sid) { return 1; }
687   return 0;
688 }
689 
/* Argument bundle handed to sis_deletable() through its void* parameter. */
typedef struct {
  sen_inv *inv;  /* index whose array slots are consulted */
  sen_set *h;    /* update set passed on to sym_deletable(); may be NULL */
} sis_deletable_arg;
694 
695 static int
sis_deletable(sen_id tid,void * arg)696 sis_deletable(sen_id tid, void *arg)
697 {
698   uint32_t *a;
699   sen_set *h = ((sis_deletable_arg *)arg)->h;
700   sen_inv *inv = ((sis_deletable_arg *)arg)->inv;
701   if ((a = array_at(inv, tid))) {
702     if (*a) {
703       array_unref(inv, tid);
704       return 0;
705     }
706     array_unref(inv, tid);
707   }
708   return sym_deletable(tid, h);
709 }
710 
711 inline static void
sym_delete(sen_inv * inv,uint32_t tid,sen_set * h)712 sym_delete(sen_inv *inv, uint32_t tid, sen_set *h)
713 {
714   sis_deletable_arg arg = {inv, h};
715   if (inv->lexicon->flags & SEN_SYM_WITH_SIS) {
716     sen_sym_del_with_sis(inv->lexicon, tid, sis_deletable, &arg);
717     /*
718     uint32_t *a;
719     while ((tid = sen_sym_del_with_sis(inv->lexicon, tid))) {
720       if ((a = array_at(inv, tid))) {
721         if (*a) {
722           array_unref(inv, tid);
723           break;
724         }
725         array_unref(inv, tid);
726       }
727       if (!sym_deletable(tid, h)) { break; }
728     }
729     */
730   } else {
731     if (sym_deletable(tid, h)) {
732       sen_sym_del(inv->lexicon, _sen_sym_key(inv->lexicon, tid));
733     }
734   }
735 }
736 
737 inline static sen_rc
buffer_flush(sen_inv * inv,uint32_t seg,sen_set * h)738 buffer_flush(sen_inv *inv, uint32_t seg, sen_set *h)
739 {
740   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
741   buffer *sb, *db = NULL;
742   sen_rc rc = sen_success;
743   sen_io_win sw, dw;
744   uint8_t *dc, *sc = NULL;
745   uint16_t ss, ds;
746   uint32_t scn, dcn, max_dest_chunk_size;
747   ss = inv->binfo[seg];
748   if (ss == SEG_NOT_ASSIGNED) { return sen_invalid_format; }
749   if (buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &sb)) {
750     return sen_memory_exhausted;
751   }
752   for (ds = 0; inv->header->segments[ds];) {
753     if (++ds == SEN_INV_MAX_SEGMENT) {
754       buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
755       return sen_memory_exhausted;
756     }
757   }
758   SEN_IO_SEG_REF(inv->seg, ds, db);
759   if (!db) {
760     buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
761     return sen_memory_exhausted;
762   }
763   memset(db, 0, SEN_INV_SEGMENT_SIZE);
764 
765   max_dest_chunk_size = sb->header.chunk_size + SEN_INV_SEGMENT_SIZE;
766   if (chunk_new(inv, &dcn, max_dest_chunk_size)) {
767     buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
768     SEN_IO_SEG_UNREF(inv->seg, ds);
769     return sen_memory_exhausted;
770   }
771   //  sen_log("db=%p ds=%d sb=%p seg=%d", db, ds, sb, seg);
772   if ((scn = sb->header.chunk) != CHUNK_NOT_ASSIGNED) {
773     sc = sen_io_win_map(inv->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, sen_io_rdwr);
774     if (!sc) {
775       SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!!", scn, sb->header.chunk_size);
776       buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
777       SEN_IO_SEG_UNREF(inv->seg, ds);
778       chunk_free(inv, dcn, max_dest_chunk_size);
779       return sen_memory_exhausted;
780     }
781   }
782   // dc = sen_io_win_map(inv->chunk, &dw, dcn, 0, max_dest_chunk_size, sen_io_wronly);
783   dc = sen_io_win_map(inv->chunk, ctx, &dw, dcn, 0, max_dest_chunk_size, sen_io_rdwr);
784   if (!dc) {
785     SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!!", dcn, max_dest_chunk_size);
786     buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
787     SEN_IO_SEG_UNREF(inv->seg, ds);
788     chunk_free(inv, dcn, max_dest_chunk_size);
789     if (scn != CHUNK_NOT_ASSIGNED) { sen_io_win_unmap(&sw); }
790     return sen_memory_exhausted;
791   }
792   {
793     uint8_t *bp = NULL, *cp = NULL, *cpe = NULL, *dp = dc;
794     uint16_t nextb;
795     buffer_rec *br;
796     buffer_term *bt;
797     int n = sb->header.nterms;
798     int nterms_void = 0;
799     memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
800     // sen_log(" scn=%d, dcn=%d, nterms=%d", sb->header.chunk, dcn, n);
801     for (bt = db->terms; n; n--, bt++) {
802       docid cid = {0, 0}, lid = {0, 0}, bid = {0, 0};
803       uint32_t ndf = 0, tf2, ltf2 = 0, gap;
804       if (!bt->tid) {
805         nterms_void++;
806         continue;
807       }
808       if (sc) {
809         cp = sc + bt->pos_in_chunk;
810         cpe = cp + bt->size_in_chunk;
811       }
812       nextb = bt->pos_in_buffer;
813       bt->pos_in_chunk = (uint32_t)(dp - dc);
814       bt->size_in_buffer = 0;
815       bt->pos_in_buffer = 0;
816 
817       // sen_log("db=%p n=%d, bt=%p tid=%d bdf=%d", db, n, bt, bt->tid, bdf);
818 
819 #define GETNEXTC() { \
820   if (cp < cpe && cid.rid) { \
821     SEN_B_DEC(tf2, cp); \
822     if (tf2 & 1) { SEN_B_SKIP(cp); } \
823     tf2 >>= 1; \
824     while (cp < cpe && tf2--) { SEN_B_SKIP(cp); } \
825   } \
826   if (cp < cpe) { \
827     SEN_B_DEC(gap, cp); \
828     cid.rid += gap; \
829     if (gap) { cid.sid = 0; } \
830     SEN_B_DEC(gap, cp); \
831     cid.sid += gap; \
832   } else { \
833     cid.rid = 0; \
834   } \
835 }
836 #define PUTNEXTC() { \
837   if (cid.rid) { \
838     /* sen_log("srid=%d", srid); */ \
839     SEN_B_DEC(tf2, cp); \
840     if (tf2) { \
841       if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) { \
842         SEN_LOG(sen_log_crit, "brokenc!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid); \
843         rc = sen_invalid_format;\
844         break; \
845       } \
846       ndf++; \
847       gap = cid.rid - lid.rid; \
848       SEN_B_ENC(gap, dp); \
849       if (gap) { SEN_B_ENC(cid.sid, dp); } else { SEN_B_ENC(cid.sid - lid.sid, dp); } \
850       SEN_B_ENC(tf2, dp); \
851       if (tf2 & 1) { SEN_B_COPY(dp, cp); } \
852       ltf2 = tf2; \
853       tf2 >>= 1; \
854       while (tf2--) { SEN_B_COPY(dp, cp); } \
855       lid.rid = cid.rid; \
856       lid.sid = cid.sid; \
857     } else { \
858       SEN_LOG(sen_log_crit, "invalid chunk(%d,%d)", bt->tid, cid.rid);\
859       rc = sen_invalid_format;\
860       break; \
861     } \
862   } \
863   if (cp < cpe) { \
864     SEN_B_DEC(gap, cp); \
865     cid.rid += gap; \
866     if (gap) { cid.sid = 0; } \
867     SEN_B_DEC(gap, cp); \
868     cid.sid += gap; \
869   } else { \
870     cid.rid = 0; \
871   } \
872   /* sen_log("gap=%d srid=%d", gap, srid); */ \
873 }
874 #define GETNEXTB() { \
875   if (nextb) { \
876     uint32_t lrid = bid.rid, lsid = bid.sid; \
877     br = BUFFER_REC_AT(sb, nextb); \
878     bp = NEXT_ADDR(br); \
879     SEN_B_DEC(bid.rid, bp); \
880     SEN_B_DEC(bid.sid, bp); \
881     if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) { \
882       SEN_LOG(sen_log_crit, "brokeng!! (%d:%d) -> (%d:%d)", lrid, lsid, bid.rid, bid.sid); \
883       rc = sen_invalid_format;\
884       break; \
885     } \
886     nextb = br->step; \
887   } else { \
888     bid.rid = 0; \
889   } \
890 }
891 #define PUTNEXTB() { \
892   if (bid.rid && bid.sid) { \
893     SEN_B_DEC(tf2, bp); \
894     if (tf2) { \
895       /* sen_log("brid=%d", bid.rid); */ \
896       if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) { \
897         SEN_LOG(sen_log_crit, "brokenb!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid); \
898         rc = sen_invalid_format;\
899         break; \
900       } \
901       ndf++; \
902       gap = bid.rid - lid.rid; \
903       SEN_B_ENC(gap, dp); \
904       if (gap) { SEN_B_ENC(bid.sid, dp); } else { SEN_B_ENC(bid.sid - lid.sid, dp); } \
905       SEN_B_ENC(tf2, dp); \
906       if (tf2 & 1) { SEN_B_COPY(dp, bp); } \
907       ltf2 = tf2; \
908       tf2 >>= 1; \
909       while (tf2--) { SEN_B_COPY(dp, bp); } \
910       lid.rid = bid.rid; \
911       lid.sid = bid.sid; \
912     } \
913   } \
914   GETNEXTB(); \
915 }
916 
917       GETNEXTC();
918       GETNEXTB();
919       for (;;) {
920         if (bid.rid) {
921           if (cid.rid) {
922             if (cid.rid < bid.rid) {
923               PUTNEXTC();
924             } else {
925               if (bid.rid < cid.rid) {
926                 PUTNEXTB();
927               } else {
928                 if (bid.sid) {
929                   if (cid.sid < bid.sid) {
930                     PUTNEXTC();
931                   } else {
932                     if (bid.sid == cid.sid) { GETNEXTC(); }
933                     PUTNEXTB();
934                   }
935                 } else {
936                   GETNEXTC();
937                 }
938               }
939             }
940           } else {
941             PUTNEXTB();
942           }
943         } else {
944           if (cid.rid) {
945             PUTNEXTC();
946           } else {
947             break;
948           }
949         }
950       }
951       // sen_log("break: dp=%p cp=%p", dp, cp);
952 
953       bt->size_in_chunk = (uint32_t)((dp - dc) - bt->pos_in_chunk);
954 
955       if (!ndf) {
956         uint32_t *a;
957         if ((a = array_at(inv, bt->tid))) {
958           sen_sym_pocket_set(inv->lexicon, bt->tid, 0);
959           *a = 0;
960           sym_delete(inv, bt->tid, h);
961           array_unref(inv, bt->tid);
962         }
963         bt->tid = 0;
964         bt->pos_in_chunk = 0;
965         bt->size_in_chunk = 0;
966         nterms_void++;
967       } else if (ndf == 1 && lid.rid < 0x100000 && lid.sid < 0x800 && ltf2 == 2) {
968         uint32_t rid_, sid_, tf_, pos_;
969         uint8_t *dp_ = dc + bt->pos_in_chunk;
970         SEN_B_DEC(rid_, dp_);
971         if (rid_ < 0x100000) {
972           SEN_B_DEC(sid_, dp_);
973           if (sid_ < 0x800) {
974             SEN_B_DEC(tf_, dp_);
975             if (tf_ == 2) {
976               SEN_B_DEC(pos_, dp_);
977               if (pos_ < 0x4000) {
978                 uint32_t *a;
979                 if ((a = array_at(inv, bt->tid))) {
980                   sen_sym_pocket_set(inv->lexicon, bt->tid, pos_);
981                   *a = (rid_ << 12) + (sid_ << 1) + 1;
982                   array_unref(inv, bt->tid);
983                 }
984                 dp = dc + bt->pos_in_chunk;
985                 bt->tid = 0;
986                 bt->pos_in_chunk = 0;
987                 bt->size_in_chunk = 0;
988                 nterms_void++;
989               }
990             }
991           }
992         }
993       }
994       // sen_log("db=%p df=%d size=%d", db, ndf, (dp - dc) - bt->pos_in_chunk);
995     }
996     db->header.chunk_size = (uint32_t)(dp - dc);
997     db->header.nterms_void = nterms_void;
998     inv->total_chunk_size += db->header.chunk_size >> 10;
999   }
1000   db->header.chunk = dcn;
1001   db->header.buffer_free = SEN_INV_SEGMENT_SIZE
1002     - sizeof(buffer_header) - sb->header.nterms * sizeof(buffer_term);
1003   db->header.nterms = sb->header.nterms;
1004 
1005   {
1006     uint32_t mc, ec;
1007     mc = max_dest_chunk_size / SEN_INV_CHUNK_SIZE;
1008     if (mc * SEN_INV_CHUNK_SIZE < max_dest_chunk_size) { mc++; }
1009     ec = db->header.chunk_size / SEN_INV_CHUNK_SIZE;
1010     if (ec * SEN_INV_CHUNK_SIZE < db->header.chunk_size) { ec++; }
1011     // sen_log(" ss=%d ds=%d inv->binfo[%d]=%p max_size=%d(%d) chunk_size=%d(%d)", ss, ds, seg, db, max_dest_chunk_size, mc, db->header.chunk_size, ec);
1012     while (ec < mc) {
1013       // sen_log("chunk[%d]=0(%d)", ec, mc);
1014       inv->header->chunks[db->header.chunk + ec++] = 0;
1015     }
1016   }
1017   buffer_close(inv, seg * SEN_INV_SEGMENT_SIZE);
1018   SEN_IO_SEG_UNREF(inv->seg, ds);
1019   inv->binfo[seg] = ds;
1020   inv->header->segments[ss] = 0;
1021   inv->header->segments[ds] = SEGMENT_BUFFER | seg;
1022   if (scn != CHUNK_NOT_ASSIGNED) {
1023     sen_io_win_unmap(&sw);
1024     chunk_free(inv, scn, sb->header.chunk_size);
1025     inv->total_chunk_size -= sb->header.chunk_size >> 10;
1026   }
1027   sen_io_win_unmap(&dw);
1028   return rc;
1029 }
1030 
1031 /* inv */
1032 
1033 sen_inv *
sen_inv_create08(const char * path,sen_sym * lexicon,uint32_t initial_n_segments)1034 sen_inv_create08(const char *path, sen_sym *lexicon, uint32_t initial_n_segments)
1035 {
1036   int i, max_chunk;
1037   sen_io *seg, *chunk;
1038   sen_inv *inv;
1039   char path2[PATH_MAX];
1040   struct sen_inv_header *header;
1041   if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1042   strcpy(path2, path);
1043   strcat(path2, ".c");
1044   if (!initial_n_segments) { initial_n_segments = SEN_INV_INITIAL_N_SEGMENTS; }
1045   if (initial_n_segments > SEN_INV_MAX_SEGMENT) {
1046     initial_n_segments = SEN_INV_MAX_SEGMENT;
1047   }
1048   max_chunk = initial_n_segments * MAX_CHUNK_RATIO;
1049   seg = sen_io_create(path, sizeof(struct sen_inv_header) + max_chunk,
1050                       SEN_INV_SEGMENT_SIZE, SEN_INV_MAX_SEGMENT,
1051                       sen_io_auto, SEN_INV_MAX_SEGMENT);
1052   if (!seg) { return NULL; }
1053   chunk = sen_io_create(path2, 0, SEN_INV_CHUNK_SIZE,
1054                         max_chunk, sen_io_auto, max_chunk);
1055   if (!chunk) {
1056     sen_io_close(seg);
1057     return NULL;
1058   }
1059   header = sen_io_header(seg);
1060   memcpy(header->idstr, SEN_INV_IDSTR, 16);
1061   for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) { header->segments[i] = 0; }
1062   header->initial_n_segments = initial_n_segments;
1063   if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1064     sen_io_close(seg);
1065     sen_io_close(chunk);
1066     return NULL;
1067   }
1068   inv->v08p = 1;
1069   inv->seg = seg;
1070   inv->chunk = chunk;
1071   inv->header = header;
1072   inv->lexicon = lexicon;
1073 #ifdef USE_QUERY_ABORT
1074   inv->check_abort = NULL;
1075   inv->check_abort_arg = NULL;
1076 #endif /* USE_QUERY_ABORT */
1077   inv->total_chunk_size = 0;
1078   load_all_segments(inv);
1079   return inv;
1080 }
1081 
1082 sen_inv *
sen_inv_open08(const char * path,sen_sym * lexicon)1083 sen_inv_open08(const char *path, sen_sym *lexicon)
1084 {
1085   sen_io *seg, *chunk;
1086   sen_inv *inv;
1087   char path2[PATH_MAX];
1088   struct sen_inv_header *header;
1089   if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1090   strcpy(path2, path);
1091   strcat(path2, ".c");
1092   seg = sen_io_open(path, sen_io_auto, SEN_INV_MAX_SEGMENT);
1093   if (!seg) { return NULL; }
1094   chunk = sen_io_open(path2, sen_io_auto, SEN_INV_MAX_SEGMENT);
1095   if (!chunk) {
1096     sen_io_close(seg);
1097     return NULL;
1098   }
1099   header = sen_io_header(seg);
1100   if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1101     sen_io_close(seg);
1102     sen_io_close(chunk);
1103     return NULL;
1104   }
1105   inv->v08p = 1;
1106   inv->seg = seg;
1107   inv->chunk = chunk;
1108   inv->header = header;
1109   inv->lexicon = lexicon;
1110   {
1111     off_t size = 0;
1112     sen_io_size(inv->chunk, &size);
1113     inv->total_chunk_size = (uint32_t) (size >> 10);
1114   }
1115   load_all_segments(inv);
1116   return inv;
1117 }
1118 
1119 sen_rc
sen_inv_update_one08(sen_inv * inv,uint32_t key,sen_inv_updspec * u,sen_set * h,int hint)1120 sen_inv_update_one08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h, int hint)
1121 {
1122   sen_rc r = sen_success;
1123   buffer *b;
1124   uint8_t *bs;
1125   buffer_rec *br = NULL;
1126   buffer_term *bt;
1127   uint32_t pos = 0, size, *a;
1128   if (!u->tf || !u->sid) { return sen_inv_delete_one(inv, key, u, h); }
1129   if (!(a = array_get(inv, key))) { return sen_memory_exhausted; }
1130   if (!(bs = encode_rec(u, &size, 0))) { r = sen_memory_exhausted; goto exit; }
1131   for (;;) {
1132     if (*a) {
1133       if (!(*a & 1)) {
1134         pos = *a;
1135         if ((r = buffer_open(inv, pos, &bt, &b))) { goto exit; }
1136         if (b->header.buffer_free < size) {
1137           int bfb = b->header.buffer_free;
1138           SEN_LOG(sen_log_debug, "flushing *a=%d seg=%d(%p) free=%d",
1139                   *a, *a >> W_OF_SEGMENT, b, b->header.buffer_free);
1140           buffer_close(inv, pos);
1141           if ((r = buffer_flush(inv, pos >> W_OF_SEGMENT, h))) { goto exit; }
1142           if (*a != pos) {
1143             SEN_LOG(sen_log_debug, "sen_inv_update_one: *a changed %d->%d", *a, pos);
1144             continue;
1145           }
1146           if ((r = buffer_open(inv, pos, &bt, &b))) {
1147             SEN_LOG(sen_log_crit, "buffer not found *a=%d", *a);
1148             goto exit;
1149           }
1150           SEN_LOG(sen_log_debug, "flushed  *a=%d seg=%d(%p) free=%d->%d nterms=%d v=%d",
1151                   *a, *a >> W_OF_SEGMENT, b, bfb, b->header.buffer_free,
1152                   b->header.nterms, b->header.nterms_void);
1153           if (b->header.buffer_free < size) {
1154             buffer_close(inv, pos);
1155             SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_update_one",
1156                     *a, b->header.buffer_free, size);
1157             /* todo: must be splitted */
1158             r = sen_memory_exhausted;
1159             goto exit;
1160           }
1161         }
1162         b->header.buffer_free -= size;
1163         br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms])
1164                             + b->header.buffer_free);
1165       } else {
1166         sen_inv_updspec u2;
1167         uint32_t size2 = 0, v = *a;
1168         struct _sen_inv_pos pos2;
1169         pos2.pos = sen_sym_pocket_get(inv->lexicon, key);
1170         pos2.next = NULL;
1171         u2.pos = &pos2;
1172         u2.rid = BIT31_12(v);
1173         u2.sid = BIT11_01(v);
1174         u2.tf = 1;
1175         u2.score = 0;
1176         if (u2.rid != u->rid || u2.sid != u->sid) {
1177           uint8_t *bs2 = encode_rec(&u2, &size2, 0);
1178     if (!bs2) {
1179       SEN_LOG(sen_log_alert, "encode_rec on sen_inv_update_one failed !");
1180       r = sen_memory_exhausted;
1181       goto exit;
1182     }
1183           if ((r = buffer_new(inv, size + size2, &pos, &bt, &br, &b, hint))) {
1184             SEN_GFREE(bs2);
1185             goto exit;
1186           }
1187           bt->tid = key;
1188           bt->size_in_chunk = 0;
1189           bt->pos_in_chunk = 0;
1190           bt->size_in_buffer = 0;
1191           bt->pos_in_buffer = 0;
1192           if ((r = buffer_put(b, bt, br, bs2, &u2, size2))) {
1193             SEN_GFREE(bs2);
1194             buffer_close(inv, pos);
1195             goto exit;
1196           }
1197           br = (buffer_rec *)(((byte *)br) + size2);
1198           SEN_GFREE(bs2);
1199         }
1200       }
1201     }
1202     break;
1203   }
1204   if (!br) {
1205     if (u->rid < 0x100000 && u->sid < 0x800 &&
1206         u->tf == 1 && u->score == 0 && u->pos->pos < 0x4000) {
1207       sen_sym_pocket_set(inv->lexicon, key, u->pos->pos);
1208       *a = (u->rid << 12) + (u->sid << 1) + 1;
1209       goto exit;
1210     } else {
1211       if ((r = buffer_new(inv, size, &pos, &bt, &br, &b, hint))) { goto exit; }
1212       bt->tid = key;
1213       bt->size_in_chunk = 0;
1214       bt->pos_in_chunk = 0;
1215       bt->size_in_buffer = 0;
1216       bt->pos_in_buffer = 0;
1217     }
1218   }
1219   r = buffer_put(b, bt, br, bs, u, size);
1220   buffer_close(inv, pos);
1221   if (!*a || (*a & 1)) {
1222     *a = pos;
1223     sen_sym_pocket_set(inv->lexicon, key, 0);
1224   }
1225 exit :
1226   array_unref(inv, key);
1227   if (bs) { SEN_GFREE(bs); }
1228   return r;
1229 }
1230 
1231 sen_rc
sen_inv_delete_one08(sen_inv * inv,uint32_t key,sen_inv_updspec * u,sen_set * h)1232 sen_inv_delete_one08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h)
1233 {
1234   sen_rc r = sen_success;
1235   buffer *b;
1236   uint8_t *bs = NULL;
1237   buffer_rec *br;
1238   buffer_term *bt;
1239   uint32_t size, *a = array_at(inv, key);
1240   if (!a) { return sen_invalid_argument; }
1241   for (;;) {
1242     if (!*a) { goto exit; }
1243     if (*a & 1) {
1244       uint32_t rid = BIT31_12(*a);
1245       uint32_t sid = BIT11_01(*a);
1246       if (u->rid == rid && (!u->sid || u->sid == sid)) {
1247         *a = 0;
1248         sym_delete(inv, key, h);
1249       }
1250       goto exit;
1251     }
1252     if (!(bs = encode_rec(u, &size, 1))) {
1253       r = sen_memory_exhausted;
1254       goto exit;
1255     }
1256     if ((r = buffer_open(inv, *a, &bt, &b))) { goto exit; }
1257     //  sen_log("b->header.buffer_free=%d size=%d", b->header.buffer_free, size);
1258     if (b->header.buffer_free < size) {
1259       uint32_t _a = *a;
1260       SEN_LOG(sen_log_debug, "flushing! b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
1261       buffer_close(inv, *a);
1262       if ((r = buffer_flush(inv, *a >> W_OF_SEGMENT, h))) { goto exit; }
1263       if (*a != _a) {
1264         SEN_LOG(sen_log_debug, "sen_inv_delete_one: *a changed %d->%d)", *a, _a);
1265         continue;
1266       }
1267       if ((r = buffer_open(inv, *a, &bt, &b))) { goto exit; }
1268       SEN_LOG(sen_log_debug, "flushed!  b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
1269       if (b->header.buffer_free < size) {
1270         /* todo: must be splitted ? */
1271         SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_delete_one",
1272                 *a, b->header.buffer_free, size);
1273         r = sen_memory_exhausted;
1274         buffer_close(inv, *a);
1275         goto exit;
1276       }
1277     }
1278 
1279     b->header.buffer_free -= size;
1280     br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
1281     r = buffer_put(b, bt, br, bs, u, size);
1282     buffer_close(inv, *a);
1283     break;
1284   }
1285 exit :
1286   array_unref(inv, key);
1287   if (bs) { SEN_GFREE(bs); }
1288   return r;
1289 }
1290 
/*
 * Bits stored in sen_inv_cursor08.stat.
 * CHUNK_USED / BUFFER_USED tell sen_inv_cursor_next08() which of the two
 * posting streams must be advanced on the next call.
 * SOLE_DOC_USED / SOLE_POS_USED record that the single embedded posting,
 * respectively its single position, has already been handed out.
 */
#define CHUNK_USED    (1 << 0)
#define BUFFER_USED   (1 << 1)
#define SOLE_DOC_USED (1 << 2)
#define SOLE_POS_USED (1 << 3)
1295 
1296 sen_inv_cursor *
sen_inv_cursor_open08(sen_inv * inv,uint32_t key)1297 sen_inv_cursor_open08(sen_inv *inv, uint32_t key)
1298 {
1299   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1300   sen_inv_cursor08 *c  = NULL;
1301   uint32_t pos, *a = array_at(inv, key);
1302   if (!a) { return NULL; }
1303   if (!(pos = *a)) { goto exit; }
1304   if (!(c = SEN_GMALLOC(sizeof(sen_inv_cursor08)))) { goto exit; }
1305   memset(c, 0, sizeof(sen_inv_cursor08));
1306   c->inv = inv;
1307   if (pos & 1) {
1308     c->stat = 0;
1309     c->pb.rid = BIT31_12(pos);
1310     c->pb.sid = BIT11_01(pos);
1311     c->pb.tf = 1;
1312     c->pb.score = 0;
1313     c->pb.pos = sen_sym_pocket_get(inv->lexicon, key);
1314   } else {
1315     uint32_t chunk;
1316     buffer_term *bt;
1317     c->pb.rid = 0; c->pb.sid = 0; /* for check */
1318     c->buffer_pos = pos;
1319     if (buffer_open(inv, pos, &bt, &c->buf)) {
1320       SEN_GFREE(c);
1321       c = NULL;
1322       goto exit;
1323     }
1324     if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != CHUNK_NOT_ASSIGNED) {
1325       c->cp = sen_io_win_map(inv->chunk, ctx, &c->iw,
1326                              chunk, bt->pos_in_chunk, bt->size_in_chunk, sen_io_rdonly);
1327       if (!c->cp) {
1328         buffer_close(inv, pos);
1329         SEN_GFREE(c);
1330         c = NULL;
1331         goto exit;
1332       }
1333       c->cpe = c->cp + bt->size_in_chunk;
1334       c->pc.rid = 0;
1335       c->pc.sid = 0;
1336     }
1337     c->nextb = bt->pos_in_buffer;
1338     c->stat = CHUNK_USED|BUFFER_USED;
1339   }
1340 exit :
1341   array_unref(inv, key);
1342   return (sen_inv_cursor *) c;
1343 }
1344 
1345 sen_rc
sen_inv_cursor_next08(sen_inv_cursor * c1)1346 sen_inv_cursor_next08(sen_inv_cursor *c1)
1347 {
1348   sen_inv_cursor08 *c = (sen_inv_cursor08 *)c1;
1349   if (c->buf) {
1350     for (;;) {
1351       if (c->stat & CHUNK_USED) {
1352         while (c->cp < c->cpe && c->pc.rest--) { SEN_B_SKIP(c->cp); }
1353         if (c->cp < c->cpe) {
1354           uint32_t gap;
1355           SEN_B_DEC(gap, c->cp);
1356           c->pc.rid += gap;
1357           if (gap) { c->pc.sid = 0; }
1358           SEN_B_DEC(gap, c->cp);
1359           c->pc.sid += gap;
1360           SEN_B_DEC(c->pc.tf, c->cp);
1361           if (c->pc.tf & 1) { SEN_B_DEC(c->pc.score, c->cp); } else { c->pc.score = 0; }
1362           c->pc.rest = c->pc.tf >>= 1;
1363           c->pc.pos = 0;
1364         } else {
1365           c->pc.rid = 0;
1366         }
1367       }
1368       if (c->stat & BUFFER_USED) {
1369         if (c->nextb) {
1370           uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */
1371           buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb);
1372           c->bp = NEXT_ADDR(br);
1373           SEN_B_DEC(c->pb.rid, c->bp);
1374           SEN_B_DEC(c->pb.sid, c->bp);
1375           if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) {
1376             SEN_LOG(sen_log_crit, "brokend!! (%d:%d) -> (%d:%d)", lrid, lsid, c->pb.rid, c->pb.sid);
1377             return sen_other_error;
1378           }
1379           c->nextb = br->step;
1380           SEN_B_DEC(c->pb.tf, c->bp);
1381           if (c->pb.tf & 1) { SEN_B_DEC(c->pb.score, c->bp); } else { c->pb.score = 0; }
1382           c->pb.rest = c->pb.tf >>= 1;
1383           c->pb.pos = 0;
1384         } else {
1385           c->pb.rid = 0;
1386         }
1387       }
1388       if (c->pb.rid) {
1389         if (c->pc.rid) {
1390           if (c->pc.rid < c->pb.rid) {
1391             c->stat = CHUNK_USED;
1392             if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1393           } else {
1394             if (c->pb.rid < c->pc.rid) {
1395               c->stat = BUFFER_USED;
1396               if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
1397             } else {
1398               if (c->pb.sid) {
1399                 if (c->pc.sid < c->pb.sid) {
1400                   c->stat = CHUNK_USED;
1401                   if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1402                 } else {
1403                   c->stat = BUFFER_USED;
1404                   if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; }
1405                   if (c->pb.tf) { c->post = &c->pb; break; }
1406                 }
1407               } else {
1408                 c->stat = CHUNK_USED;
1409               }
1410             }
1411           }
1412         } else {
1413           c->stat = BUFFER_USED;
1414           if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
1415         }
1416       } else {
1417         if (c->pc.rid) {
1418           c->stat = CHUNK_USED;
1419           if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1420         } else {
1421           c->post = NULL;
1422           return sen_other_error;
1423         }
1424       }
1425     }
1426   } else {
1427     if (c->stat & SOLE_DOC_USED) {
1428       c->post = NULL;
1429       return sen_other_error;
1430     } else {
1431       c->post = &c->pb;
1432       c->stat |= SOLE_DOC_USED;
1433     }
1434   }
1435   return sen_success;
1436 }
1437 
1438 sen_rc
sen_inv_cursor_next_pos08(sen_inv_cursor * c1)1439 sen_inv_cursor_next_pos08(sen_inv_cursor *c1)
1440 {
1441   sen_inv_cursor08 *c = (sen_inv_cursor08 *)c1;
1442   uint32_t gap;
1443   sen_rc rc = sen_success;
1444   if (c->buf) {
1445     if (c->post == &c->pc) {
1446       if (c->pc.rest) {
1447         c->pc.rest--;
1448         SEN_B_DEC(gap, c->cp);
1449         c->pc.pos += gap;
1450       } else {
1451         rc = sen_other_error;
1452       }
1453     } else if (c->post == &c->pb) {
1454       if (c->pb.rest) {
1455         c->pb.rest--;
1456         SEN_B_DEC(gap, c->bp);
1457         c->pb.pos += gap;
1458       } else {
1459         rc = sen_other_error;
1460       }
1461     } else {
1462       rc = sen_other_error;
1463     }
1464   } else {
1465     if (c->stat & SOLE_POS_USED) {
1466       rc = sen_other_error;
1467     } else {
1468       c->stat |= SOLE_POS_USED;
1469     }
1470   }
1471   return rc;
1472 }
1473 
1474 sen_rc
sen_inv_cursor_close08(sen_inv_cursor * c1)1475 sen_inv_cursor_close08(sen_inv_cursor *c1)
1476 {
1477   sen_inv_cursor08 *c = (sen_inv_cursor08 *) c1;
1478   if (!c) { return sen_invalid_argument; }
1479   if (c->cp) { sen_io_win_unmap(&c->iw); }
1480   if (c->buf) { buffer_close(c->inv, c->buffer_pos); }
1481   SEN_GFREE(c);
1482   return sen_success;
1483 }
1484 
1485 uint32_t
sen_inv_estimate_size08(sen_inv * inv,uint32_t key)1486 sen_inv_estimate_size08(sen_inv *inv, uint32_t key)
1487 {
1488   uint32_t res, pos, *a = array_at(inv, key);
1489   if (!a) { return 0; }
1490   if ((pos = *a)) {
1491     if (pos & 1) {
1492       res = 1;
1493     } else {
1494       buffer *buf;
1495       buffer_term *bt;
1496       if (buffer_open(inv, pos, &bt, &buf)) {
1497         res = 0;
1498       } else {
1499         res = (bt->size_in_chunk >> 2) + bt->size_in_buffer + 2;
1500         buffer_close(inv, pos);
1501       }
1502     }
1503   } else {
1504     res = 0;
1505   }
1506   array_unref(inv, key);
1507   return res;
1508 }
1509 
1510 int
sen_inv_entry_info08(sen_inv * inv,unsigned key,unsigned * a,unsigned * pocket,unsigned * chunk,unsigned * chunk_size,unsigned * buffer_free,unsigned * nterms,unsigned * nterms_void,unsigned * tid,unsigned * size_in_chunk,unsigned * pos_in_chunk,unsigned * size_in_buffer,unsigned * pos_in_buffer)1511 sen_inv_entry_info08(sen_inv *inv, unsigned key, unsigned *a, unsigned *pocket,
1512                    unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
1513                    unsigned *nterms, unsigned *nterms_void, unsigned *tid,
1514                    unsigned *size_in_chunk, unsigned *pos_in_chunk,
1515                    unsigned *size_in_buffer, unsigned *pos_in_buffer)
1516 {
1517   buffer *b;
1518   buffer_term *bt;
1519   uint32_t *ap = array_at(inv, key);
1520   *pocket = sen_sym_pocket_get(inv->lexicon, key);
1521   if (!ap) { return 0; }
1522   *a = *ap;
1523   array_unref(inv, key);
1524   if (!*a) { return 1; }
1525   if (*a & 1) { return 2; }
1526   if (buffer_open(inv, *a, &bt, &b)) { return 3; }
1527   *chunk = b->header.chunk;
1528   *chunk_size = b->header.chunk_size;
1529   *buffer_free = b->header.buffer_free;
1530   *nterms = b->header.nterms;
1531   *tid = bt->tid;
1532   *size_in_chunk = bt->size_in_chunk;
1533   *pos_in_chunk = bt->pos_in_chunk;
1534   *size_in_buffer = bt->size_in_buffer;
1535   *pos_in_buffer = bt->pos_in_buffer;
1536   buffer_close(inv, *a);
1537   return 4;
1538 }
1539