1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <math.h>
23
24 #include "udm_common.h"
25 #include "udm_utils.h"
26 #include "udm_db.h"
27 #include "udm_db_int.h"
28 #include "udm_vars.h"
29 #include "udm_coords.h"
30 #include "udm_log.h"
31 #include "udm_hash.h"
32 #include "udm_word.h"
33 #include "udm_crc32.h"
34 #include "udm_http.h"
35 #include "udm_contentencoding.h"
36 #include "udm_parsehtml.h"
37 #include "udm_store.h"
38 #include "udm_textlist.h"
39 #include "udm_indexcache.h"
40 #include "udm_result.h"
41 #include "udm_agent.h"
42
43 #ifdef HAVE_SQL
44
45
/*
  Compile-time defaults for the zint4 and deflate settings:
  both are UDM_TRUE on WIN32 builds and UDM_FALSE on all other builds.
  NOTE(review): the code that reads these defaults is outside this
  part of the file.
*/
#ifdef WIN32
#define UDM_DEFAULT_ZINT4   UDM_TRUE
#define UDM_DEFAULT_DEFLATE UDM_TRUE
#else
#define UDM_DEFAULT_ZINT4   UDM_FALSE
#define UDM_DEFAULT_DEFLATE UDM_FALSE
#endif
53
54
/********** Record encoding and compression ************/
/*
  Record compression types:
  - 0x00 = No compression
  - 0x01 = Deflate
  - 0x02 = Zint4
  - 0x03 = Zint4+Deflate
  - 0x04 = Single URL_ID range
  - 0x05 = URL_ID deltas, variable length encoding
  - 0x06 = URL_ID deltas, 2 bytes per delta
  - 0x07 = URL_ID deltas, 2 bytes per delta, with a start offset in the header
  - 0x08 = URL_ID deltas, 3 bytes per delta
  - 0x09 = URL_ID deltas, 1 byte per delta, with a start offset in the header
  - 0x0A = Multiple URL_ID ranges
*/

#define UDM_BLOB_COMP_NONE                           0x00
#define UDM_BLOB_COMP_DEFLATE                        0x01
#define UDM_BLOB_COMP_ZINT4                          0x02
#define UDM_BLOB_COMP_ZINT4_DEFLATE                  0x03
#define UDM_BLOB_COMP_SINGLE_RANGE                   0x04
#define UDM_BLOB_COMP_URLID_DELTA_VARIABLE           0x05
#define UDM_BLOB_COMP_URLID_DELTA_2BYTES             0x06
#define UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET 0x07
#define UDM_BLOB_COMP_URLID_DELTA_3BYTES             0x08
#define UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET 0x09
#define UDM_BLOB_COMP_URLID_RANGE_MULTI              0x0A
75
76
77 static inline int
UdmDSTRAppendCompressionType(UDM_DSTR * dstr,int compression_type)78 UdmDSTRAppendCompressionType(UDM_DSTR *dstr, int compression_type)
79 {
80 if (!UdmDSTRAppendINT4(dstr, 0xFFFFFFFF))
81 return 0;
82 return (compression_type <= UDM_BLOB_COMP_ZINT4_DEFLATE) ?
83 UdmDSTRAppendINT4(dstr, compression_type) :
84 UdmDSTRAppendINT2BE(dstr, compression_type);
85 }
86
87
88 /*******************************************/
89
90 /*
91 If can do "indexer -Eblob" using RENAME TABLE.
92 */
93 static int
UdmBlobCanDoRename(UDM_DB * db)94 UdmBlobCanDoRename(UDM_DB *db)
95 {
96 return
97 (UdmSQLDBFlags(db) & UDM_SQL_HAVE_RENAME) &&
98 (UdmSQLDBFlags(db) & UDM_SQL_HAVE_CREATE_LIKE) &&
99 /* PgSQL can do RENAME only when "DROP TABLE IF EXISTS" is supported */
100 (UdmSQLDBType(db)!= UDM_DB_PGSQL || UdmSQLDBFlags(db) & UDM_SQL_HAVE_DROP_IF_EXISTS);
101 }
102
103
104 static udm_rc_t
UdmBlobCreateIndexRandomName(UDM_AGENT * A,UDM_DB * db,const char * table_name)105 UdmBlobCreateIndexRandomName(UDM_AGENT *A, UDM_DB *db, const char *table_name)
106 {
107 char qbuf[128];
108 /* Create index with an unique name */
109 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
110 {
111 udm_snprintf(qbuf, sizeof(qbuf),
112 "ALTER TABLE %s ADD KEY (word)", table_name);
113 }
114 else
115 {
116 udm_snprintf(qbuf, sizeof(qbuf),
117 "CREATE INDEX bdict_%d_%d ON %s (word)",
118 (int) time(0), (int) (UdmStartTimer() % 0xFFFF), table_name);
119 }
120 return UdmDBSQLQuery(A, db, NULL, qbuf);
121 }
122
123
124 static int
UdmBlobGetTable(UDM_AGENT * A,UDM_DB * db)125 UdmBlobGetTable(UDM_AGENT *A, UDM_DB *db)
126 {
127 UDM_SQLRES SQLRes;
128 int rc;
129 const char *val;
130
131 return(1);
132
133 if (UDM_OK != UdmDBSQLQuery(A, db, &SQLRes, "SELECT n FROM bdictsw"))
134 return(1);
135
136 if (! UdmSQLNumRows(&SQLRes) || ! (val = UdmSQLValue(&SQLRes, 0, 0))) rc = 2;
137 else if (*val != '1') rc = 3;
138 else rc = 4;
139
140 UdmSQLFree(&SQLRes);
141 return(rc);
142 }
143
144
145 /*
146 This function returns "bdict" by default,
147 or the "bdict" parameter from DBAddr, if exists.
148 */
149 static const char *
UdmBlobGetTableNamePrefix(UDM_DB * db)150 UdmBlobGetTableNamePrefix(UDM_DB *db)
151 {
152 return UdmVarListFindStr(UdmSQLDBVars(db), "bdict", "bdict");
153 }
154
155
156 /*
157 This function is used when "indexer -Erewritelimit"
158 or "indexer -Erewriteurl" is called.
159 */
160 static size_t
UdmBlobGetTableForRewrite(UDM_AGENT * A,UDM_DB * db,char * dst,size_t dstlen)161 UdmBlobGetTableForRewrite(UDM_AGENT *A, UDM_DB *db, char *dst, size_t dstlen)
162 {
163 const char *prefix= UdmBlobGetTableNamePrefix(db);
164 return udm_snprintf(dst, dstlen, "%s", prefix);
165 }
166
167
168 static size_t
UdmBlobGetRTable(UDM_AGENT * A,UDM_DB * db,char * dst,size_t dstlen)169 UdmBlobGetRTable(UDM_AGENT *A, UDM_DB *db, char *dst, size_t dstlen)
170 {
171 const char *prefix= UdmBlobGetTableNamePrefix(db);
172 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
173 return udm_snprintf(dst, dstlen, "%s", prefix);
174 if (UdmBlobGetTable(A, db) == 3)
175 return udm_snprintf(dst, dstlen, "%s00", prefix);
176 return udm_snprintf(dst, dstlen, "%s", prefix);
177 }
178
179
180 static udm_rc_t
UdmBlobGetWTable(UDM_AGENT * A,UDM_DB * db,char * name,size_t namelen)181 UdmBlobGetWTable(UDM_AGENT *A, UDM_DB *db, char *name, size_t namelen)
182 {
183 udm_rc_t rc;
184
185 if (UdmBlobCanDoRename(db))
186 {
187 if ((UDM_OK != (rc= UdmDBSQLDropTableIfExists(A, db, "bdict_tmp"))) ||
188 (UDM_OK != (rc= UdmDBSQLCopyStructure(A, db, "bdict", "bdict_tmp"))) ||
189 (UDM_OK != (rc= UdmBlobCreateIndexRandomName(A, db, "bdict_tmp"))))
190 return rc;
191 udm_snprintf(name, namelen, "bdict_tmp");
192 return UDM_OK;
193 }
194
195 udm_snprintf(name, namelen, "%s", UdmBlobGetTableNamePrefix(db));
196 if (UdmBlobGetTable(A, db) == 4)
197 udm_snprintf(name, namelen, "%s00", UdmBlobGetTableNamePrefix(db));
198 return UDM_OK;
199 }
200
201
202 static udm_rc_t
UdmBlobSetTable(UDM_AGENT * A,UDM_DB * db)203 UdmBlobSetTable(UDM_AGENT *A, UDM_DB *db)
204 {
205 char qbuf[128];
206 udm_rc_t rc;
207 int t, n;
208 const char *table_name= UdmVarListFindBool(&A->Conf->Vars, "delta", UDM_FALSE) ?
209 "bdict_delta" :
210 UdmBlobGetTableNamePrefix(db);
211
212 if (UdmBlobCanDoRename(db))
213 {
214 if (UDM_OK == (rc= UdmDBSQLDropTableIfExists(A, db, table_name)))
215 rc= UdmDBSQLRenameTable(A, db, "bdict_tmp", table_name);
216 return rc;
217 }
218
219 t= UdmBlobGetTable(A, db);
220 if (t == 1) return(UDM_OK);
221 else if (t == 4) n = 0;
222 else n = 1;
223
224 rc= UdmDBSQLQuery(A, db, NULL, "DELETE FROM bdictsw");
225 if (rc != UDM_OK) return(UDM_OK);
226 udm_snprintf(qbuf, sizeof(qbuf), "INSERT INTO bdictsw VALUES(%d)", n);
227 rc = UdmDBSQLQuery(A, db, NULL, qbuf);
228 if (rc != UDM_OK) return(UDM_OK);
229 return(UDM_OK);
230 }
231
232
/*
  URL id decoding state of one blob record.
  Initialized by UdmBlobCoordsGetCompressionType() and advanced
  by UdmSectionListBlobCoordsUnpack() while sections are decoded.
*/
typedef struct udm_blob_cache_stat_st
{
  urlid_t cur_url_id;       /* Running URL id; deltas are added to it      */
  urlid_t min_url_id;       /* First URL id of a single-range record       */
  urlid_t max_url_id;       /* Last URL id of a single-range record        */
  urlid_t max_url_id_delta; /* Not referenced in this part of the file     */
  urlid_t range_min_url_id; /* Start of the current range (multi-range)    */
  urlid_t range_max_url_id; /* End of the current range (multi-range)      */
  size_t ndistinct_url_ids_minus_one; /* Not referenced in this part of the file */
  size_t compression_type;  /* One of UDM_BLOB_COMP_xxx                    */
  size_t range_offset;      /* Offset of the range table from record start */
  size_t nranges;           /* Number of ranges not loaded yet (multi-range) */
} UDM_BLOB_CACHE_WORD_STAT;
246
247
/*
  Store the two low bytes of i into dst, low byte first.
*/
static inline void
udm_put_int2(int i, unsigned char *dst)
{
  const unsigned char lo= (unsigned char) (i & 0xFF);
  const unsigned char hi= (unsigned char) (i >> 8);
  dst[0]= lo;
  dst[1]= hi;
}
254
/*
  Read a two-byte integer from src, low byte first.
*/
static inline int
udm_get_int2(const unsigned char *src)
{
  const int lo= (int) src[0];
  const int hi= (int) src[1];
  return lo | (hi << 8);
}
260
261
/*
  Store the three low bytes of i into dst, low byte first.
*/
static inline void
udm_put_int3(int i, unsigned char *dst)
{
  int k;
  for (k= 0; k < 3; k++)
    dst[k]= (unsigned char) ((i >> (8 * k)) & 0xFF);
}
269
/*
  Read a three-byte integer from src, low byte first.
*/
static inline int
udm_get_int3(const unsigned char *src)
{
  int value= 0;
  int k;
  for (k= 2; k >= 0; k--)
    value= (value << 8) | (int) src[k];
  return value;
}
275
276
/*
  Pack an integer into dst using the given format:
    0 = don't put anything
    1 = one byte
    2 = two bytes
    3 = three bytes
    4 = four bytes
    5 = variable length encoding

  dst must have enough space to store the packed integer.
  Returns the number of bytes written for formats 0..4,
  and the result of udm_coord_put() for format 5.
*/
static inline size_t
udm_put_int_with_format(int i, unsigned char *dst, int format)
{
  switch (format)
  {
    case 1:
      UDM_ASSERT((unsigned int) i < 256);
      dst[0]= (unsigned char) i;
      return 1;

    case 2:
      udm_put_int2(i, dst);
      return 2;

    case 3:
      udm_put_int3(i, dst);
      return 3;

    case 4:
      udm_put_int4(i, dst);
      return 4;

    case 5:
      return udm_coord_put(i, dst, dst + 4);

    default:
      break;
  }

  /* Any other format is expected to be 0: nothing to store */
  UDM_ASSERT(format == 0);
  return 0;
}
319
320
321 /*
322 Unpack coords when only minpos, maxpos and seclen is of interest
323 */
324 static UDM_COORD2 *
UdmBlobPackedCoordsUnpackMinMaxLen(const unsigned char * s,const unsigned char * e,size_t nrecs,UDM_COORD2 * C,UDM_COORD2 * Coord,const unsigned char ** end,int save_section_size,UDM_SEARCHSECTION * Section)325 UdmBlobPackedCoordsUnpackMinMaxLen(const unsigned char *s,
326 const unsigned char *e,
327 size_t nrecs,
328 UDM_COORD2 *C,
329 UDM_COORD2 *Coord,
330 const unsigned char **end,
331 int save_section_size,
332 UDM_SEARCHSECTION *Section)
333 {
334 size_t crd, nbytes;
335 if (save_section_size)
336 {
337 if (nrecs > 1)
338 {
339 s= udm_coord_sum(&crd, s, e, nrecs - 1); /* Sum middle coords */
340 C->pos+= crd;
341 Section->maxpos= C->pos;
342 }
343 else
344 Section->maxpos= C->pos; /* One coord, minpos=maxpos */
345
346 if ((nbytes= udm_coord_get(&crd, s, e))) /* Get seclen */
347 {
348 s+= nbytes;
349 C->pos+= crd;
350 Section->seclen= C->pos;
351 Section->ncoords= nrecs;
352 Coord+= nrecs;
353 }
354 else
355 {
356 Section->seclen= 0;
357 Section->ncoords= 0;
358 }
359 }
360 else
361 {
362 s= udm_coord_sum(&crd, s, e, nrecs); /* Sum middle coords */
363 C->pos+= crd;
364 Section->maxpos= C->pos;
365 Section->seclen= 0;
366 Section->ncoords= nrecs + 1;
367 Coord+= nrecs + 1;
368 }
369 *end= s;
370 return Coord;
371 }
372
373
374
375 static size_t
UdmBlobCoordsGetCompressionType(UDM_AGENT * A,UDM_BLOB_CACHE_WORD_STAT * Stat,const unsigned char * s,size_t length)376 UdmBlobCoordsGetCompressionType(UDM_AGENT *A,
377 UDM_BLOB_CACHE_WORD_STAT *Stat,
378 const unsigned char *s,
379 size_t length)
380 {
381 size_t header_size;
382 bzero((void*) Stat, sizeof(*Stat));
383 Stat->compression_type= (length > 10 && udm_get_int4(s) == 0xFFFFFFFF) ?
384 udm_get_int4(s + 4) : 0;
385 header_size= (Stat->compression_type & 0xFFFF0000) ? 6 : 8;
386 Stat->compression_type&= 0x0000FFFF;
387 /*
388 fprintf(stderr, "Hdr size=%d cmpr=%08X len=%d\n", header_size, Stat->compression_type, length);
389 */
390 if (Stat->compression_type == UDM_BLOB_COMP_SINGLE_RANGE)
391 {
392 Stat->min_url_id= udm_get_int4(s + header_size);
393 Stat->max_url_id= udm_get_int4(s + header_size + 4);
394 Stat->cur_url_id= Stat->min_url_id;
395 UdmLog(A, UDM_LOG_DEBUG,
396 "Single-URLID-Range compression: %d-%d (%d docs)",
397 Stat->min_url_id, Stat->max_url_id,
398 Stat->max_url_id - Stat->min_url_id + 1);
399 return header_size + 8;
400 }
401 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_VARIABLE)
402 {
403 UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta compression");
404 return header_size;
405 }
406 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES)
407 {
408 UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta-2bytes compression");
409 return header_size;
410 }
411 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET)
412 {
413 Stat->cur_url_id= udm_get_int2(s + header_size) << 16;
414 UdmLog(A, UDM_LOG_DEBUG,
415 "URLID-Delta-2bytes-offset compression, offs=%d", Stat->cur_url_id);
416 return header_size + 2;
417 }
418 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET)
419 {
420 Stat->cur_url_id= udm_get_int4(s + header_size);
421 UdmLog(A, UDM_LOG_DEBUG,
422 "URLID-Delta-1byte-offset compression, offs=%d", Stat->cur_url_id);
423 return header_size + 4;
424 }
425 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_3BYTES)
426 {
427 UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta-2bytes compression");
428 return header_size;
429 }
430 else if (Stat->compression_type == UDM_BLOB_COMP_URLID_RANGE_MULTI)
431 {
432 Stat->nranges= udm_get_int4(s + header_size);
433 Stat->range_offset= header_size + 4;
434 UdmLog(A, UDM_LOG_DEBUG,
435 "URLID-Range-Multi compression, nranges=%d", (int) Stat->nranges);
436 return header_size + 4 + 8 * Stat->nranges;
437 }
438 else if (Stat->compression_type != UDM_BLOB_COMP_NONE)
439 {
440 UdmLog(A, UDM_LOG_DEBUG,
441 "Unknown coompression type: %08X", (int) Stat->compression_type);
442 }
443 return 0;
444 }
445
446
447 static size_t
UdmExpectedSectionCount(size_t compression_type,size_t length)448 UdmExpectedSectionCount(size_t compression_type, size_t length)
449 {
450 switch (compression_type)
451 {
452 case UDM_BLOB_COMP_NONE:
453 /*
454 Shortest section with no compression is 6 bytes:
455 - 4 bytes for URL id
456 - 1 byte for "ncoords"
457 - 1 byte for coord
458 */
459 return length / 6;
460 case UDM_BLOB_COMP_URLID_DELTA_VARIABLE:
461 /*
462 - 1..4 bytes URL id delta
463 - 1 byte for "ncoordss"
464 - 1 byte for coord
465 */
466 return length / 3;
467
468 case UDM_BLOB_COMP_SINGLE_RANGE:
469 case UDM_BLOB_COMP_URLID_RANGE_MULTI:
470 /*
471 Shortest section with single-range compression is 2 byest:
472 - 1 byte for ncoords
473 - 1 byte for coord
474 */
475 return length / 2;
476 case UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET:
477 /* 1 byte url_id, 1 byte ncoorrds, 1 byte coords */
478 return length / 3;
479 case UDM_BLOB_COMP_URLID_DELTA_2BYTES:
480 case UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET:
481 /* 2 bytes url_id, 1 byte ncoords, 1 byte coord */
482 return length / 4;
483 case UDM_BLOB_COMP_URLID_DELTA_3BYTES:
484 /* 3 bytes url_id, 1 bbyte ncoords, 1 byte coord */
485 return length / 5;
486 }
487
488 UDM_ASSERT(0); /* Should not normally get here */
489 return length;
490 }
491
492
493 static udm_rc_t
UdmSectionListBlobCoordsUnpack(UDM_AGENT * A,UDM_SEARCHSECTIONLIST * SectionList,UDM_URLID_LIST * urls,UDM_SEARCHSECTION * SectionTemplate,const unsigned char * s,size_t length,int save_section_size,int need_coords)494 UdmSectionListBlobCoordsUnpack(UDM_AGENT *A,
495 UDM_SEARCHSECTIONLIST *SectionList,
496 UDM_URLID_LIST *urls,
497 UDM_SEARCHSECTION *SectionTemplate,
498 const unsigned char *s,
499 size_t length,
500 int save_section_size,
501 int need_coords)
502 {
503 size_t ncoords= 0, nurls= urls->nurls;
504 const unsigned char *s0= s;
505 const unsigned char *e= s + length;
506 const unsigned char *last_urlid_start= e - sizeof(urlid_t) - 1;
507 UDM_COORD2 C, *Coord;
508 unsigned char secno= SectionTemplate->secno;
509 unsigned char wordnum= SectionTemplate->wordnum;
510 unsigned char order= SectionTemplate->order;
511 UDM_SEARCHSECTION *Section;
512 UDM_BLOB_CACHE_WORD_STAT Stat;
513 const unsigned char *range_ptr;
514 int single_range, range_multi, compr_urlid_delta;
515 int compr_urlid_delta_1bytes, compr_urlid_delta_2bytes, compr_urlid_delta_3bytes;
516 size_t coords_alloced, sections_alloced;
517 s+= UdmBlobCoordsGetCompressionType(A, &Stat, s, length);
518 range_ptr= s0 + Stat.range_offset;
519
520 UdmLog(A, UDM_LOG_DEBUG+1, "Secno=%d len=%d",
521 SectionTemplate->secno, (int) length);
522 single_range= (Stat.compression_type == UDM_BLOB_COMP_SINGLE_RANGE);
523 range_multi= (Stat.compression_type == UDM_BLOB_COMP_URLID_RANGE_MULTI);
524 compr_urlid_delta= (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_VARIABLE);
525 compr_urlid_delta_2bytes= (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES) ||
526 (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET);
527 compr_urlid_delta_3bytes= Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_3BYTES;
528 compr_urlid_delta_1bytes= Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET;
529
530 if (single_range || range_multi)
531 last_urlid_start= e - 1; /* TODO: check other modes */
532
533 coords_alloced= length;
534 sections_alloced= UdmExpectedSectionCount(Stat.compression_type, length);
535 UdmSearchSectionListAlloc(SectionList, coords_alloced, sections_alloced);
536 Coord= SectionList->Coord;
537 Section= SectionList->Section;
538
539 /*
540 A non-compressed chunk consists of:
541 - sizeof(urlid_t)
542 - at least one byte for length
543 */
544 for (C.order= order ; s < last_urlid_start; )
545 {
546 int active= 1;
547 size_t nrecs;
548
549 Section->Coord= Coord;
550 Section->secno= secno;
551 if (Stat.compression_type)
552 {
553 if (single_range)
554 {
555 Section->url_id= Stat.cur_url_id++;
556 }
557 else if (range_multi)
558 {
559 if (Stat.cur_url_id == Stat.range_max_url_id && Stat.nranges--)
560 {
561 Stat.range_min_url_id= udm_get_int4(range_ptr); range_ptr+= 4;
562 Stat.range_max_url_id= udm_get_int4(range_ptr); range_ptr+= 4;
563 Stat.cur_url_id= Stat.range_min_url_id;
564 /*
565 fprintf(stderr, "loading range: %d-%d nranges remain: %d\n",
566 Stat.range_min_url_id, Stat.range_max_url_id, Stat.nranges);
567 */
568 }
569 else
570 {
571 Stat.cur_url_id++;
572 }
573 Section->url_id= Stat.cur_url_id;
574 }
575 else if (compr_urlid_delta)
576 {
577 size_t delta, nbytes= udm_coord_get(&delta, s, e);
578 s+= nbytes;
579 if (!nbytes)
580 break;
581 Stat.cur_url_id+= delta;
582 Section->url_id= Stat.cur_url_id;
583 }
584 else if (compr_urlid_delta_1bytes)
585 {
586 size_t delta= (unsigned char) *s;
587 s++;
588 Stat.cur_url_id+= delta;
589 Section->url_id= Stat.cur_url_id;
590 /*
591 fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
592 */
593 }
594 else if (compr_urlid_delta_2bytes)
595 {
596 size_t delta= udm_get_int2(s);
597 s+= 2;
598 Stat.cur_url_id+= delta;
599 Section->url_id= Stat.cur_url_id;
600 /*
601 fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
602 */
603 }
604 else if (compr_urlid_delta_3bytes)
605 {
606 size_t delta= udm_get_int3(s);
607 s+= 3;
608 Stat.cur_url_id+= delta;
609 Section->url_id= Stat.cur_url_id;
610 /*
611 fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
612 */
613 }
614 }
615 else
616 {
617 Section->url_id= (urlid_t) udm_get_int4(s);
618 s+= 4;
619 }
620 Section->wordnum= wordnum;
621 Section->order= order;
622
623 if (nurls)
624 {
625 void *found= UdmBSearch(&Section->url_id, urls->urls, urls->nurls,
626 sizeof(urlid_t), (udm_qsort_cmp)UdmCmpURLID);
627 if (found && urls->exclude)
628 active= 0;
629 if (!found && !urls->exclude)
630 active= 0;
631 }
632
633 /* Get number of coords */
634 if (*s < 128)
635 {
636 nrecs= *s++;
637 }
638 else
639 {
640 size_t nbytes= udm_coord_get(&nrecs, s, e);
641 if (!nbytes) break;
642 s+= nbytes;
643 }
644
645 if (!nrecs) /* extra safety */
646 break;
647
648 if (!active)
649 {
650 s= udm_coord_skip(s, e, nrecs);
651 continue;
652 }
653
654 ncoords+= nrecs;
655
656 if (save_section_size && nrecs > 1)
657 ncoords--;
658
659 Section->PackedCoord= s;
660
661 /* Get first coord and put into S->minpos */
662 if (*s < 128)
663 {
664 C.pos= *s++;
665 }
666 else
667 {
668 size_t crd;
669 size_t nbytes= udm_coord_get(&crd, s, e);
670 if (!nbytes) break;
671 s+= nbytes;
672 C.pos= crd;
673 }
674
675 Section->minpos= C.pos;
676
677 /*
678 If no coords anymore.
679 Maybe the "section length" record didn't fit
680 into "64K words per section" limit.
681 Add section with seclen=0.
682 */
683 if (!--nrecs)
684 {
685 Section->seclen= C.pos;
686 Section->ncoords= 1;
687 Section->maxpos= C.pos;
688 if (need_coords)
689 *Coord= C;
690 Coord++;
691 Section++;
692 continue;
693 }
694
695 if (!need_coords) /* Does not need coords, e.g. one word search */
696 {
697 Coord= UdmBlobPackedCoordsUnpackMinMaxLen(s, e, nrecs, &C, Coord, &s,
698 save_section_size, Section);
699 Section++;
700 continue;
701 }
702
703 *Coord++= C; /* Add first coord */
704
705 /* Unpack the other coordinates */
706 Coord= UdmPackedCoordsToUnpackedCoords(s, e, nrecs, &C, Coord, &s);
707
708 /* Set section length */
709 nrecs= Coord - Section->Coord;
710 if (save_section_size)
711 {
712 /*
713 We need to check whether Coord > Coord0 in the above
714 condition: URL could be skipped because of limit
715 */
716 Section->seclen= ((--Coord)->pos);
717 Section->ncoords= nrecs - 1;
718 Section->maxpos= Coord[-1].pos;;
719 }
720 else
721 {
722 Section->seclen= 0;
723 Section->ncoords= nrecs;
724 Section->maxpos= C.pos;
725 }
726 Section++;
727 }
728
729 SectionList->ncoords= ncoords;
730 SectionList->nsections= Section - SectionList->Section;
731
732 UDM_ASSERT(SectionList->ncoords <= coords_alloced);
733 UDM_ASSERT(SectionList->nsections <= sections_alloced);
734
735 return UDM_OK;
736 }
737
738
739 static udm_rc_t
UdmDeleteWordsFromURLBlob(UDM_AGENT * Indexer,UDM_DB * db,urlid_t url_id)740 UdmDeleteWordsFromURLBlob(UDM_AGENT *Indexer, UDM_DB *db, urlid_t url_id)
741 {
742 return UDM_OK;
743 }
744
745
746 static udm_rc_t
UdmStoreWordsBlob(UDM_AGENT * Indexer,UDM_DB * db,UDM_DOCUMENT * Doc)747 UdmStoreWordsBlob(UDM_AGENT *Indexer, UDM_DB *db, UDM_DOCUMENT *Doc)
748 {
749 return UDM_OK;
750 }
751
752
753 static const char*
UdmBlobModeInflateOrSelf(UDM_AGENT * A,UDM_DSTR * buf,const char * name,const char * src,size_t * len)754 UdmBlobModeInflateOrSelf(UDM_AGENT *A,
755 UDM_DSTR *buf, const char *name,
756 const char *src, size_t *len)
757 {
758 int use_zint4;
759 int use_deflate;
760 /* fprintf(stderr, "here, len=%d src=%p\n", *len, src); */
761 if (!src || *len < 8 ||
762 (unsigned char)src[0] != 0xFF ||
763 (unsigned char)src[1] != 0xFF ||
764 (unsigned char)src[2] != 0xFF ||
765 (unsigned char)src[3] != 0xFF ||
766 (src[4] != 1 && src[4] != 2 && src[4] != 3) ||
767 src[5] != 0x00 ||
768 src[6] != 0x00 ||
769 src[7] != 0x00)
770 return src;
771 use_zint4= src[4] == 2 || src[4] == 3;
772 use_deflate= src[4] == 1 || src[4] == 3;
773 src+= 8;
774 *len-= 8;
775 if (name)
776 UdmLog(A, UDM_LOG_DEBUG, "Unpacking '%s'", name);
777 if (use_deflate)
778 {
779 udm_timer_t ticks= UdmStartTimer();
780 size_t len0= len[0];
781 UdmLog(A,UDM_LOG_DEBUG, "Deflate header detected");
782 UdmDSTRReset(buf);
783 if (UDM_OK == UdmDSTRAppendInflate(buf, src, *len))
784 {
785 src= UdmDSTRPtr(buf);
786 len[0]= UdmDSTRLength(buf);
787 UdmLog(A, UDM_LOG_DEBUG, "%d to %d bytes inflated",
788 (int) len0, (int) len[0]);
789 }
790 UdmLog(A, UDM_LOG_DEBUG, "Inflating done: %.2f", UdmStopTimer(&ticks));
791 }
792 if (*len >= 5 && use_zint4)
793 {
794 udm_timer_t ticks= UdmStartTimer();
795 char *zint4_buf= (char*) UdmMalloc(*len);
796 UdmLog(A, UDM_LOG_DEBUG, "zint4 header detected (zint4ed data length: %d)",
797 (int) (*len));
798 if (! zint4_buf)
799 {
800 UdmLog(A, UDM_LOG_ERROR, "Malloc failed. Requested %u bytes",
801 (int) (*len));
802 return(NULL);
803 }
804 memcpy(zint4_buf, src, *len);
805 if (buf->size_alloced < *len * 7 && UdmDSTRRealloc(buf, *len * 7) != UDM_OK)
806 {
807 UdmFree(zint4_buf);
808 UdmLog(A, UDM_LOG_ERROR, "UdmDSTRRealloc failed. Requested %u bytes",
809 (int) (*len * 7));
810 return(NULL);
811 }
812 *len= udm_dezint4(zint4_buf, (int4 *) buf->Val.str, *len) * 4;
813 src= UdmDSTRPtr(buf);
814 UdmFree(zint4_buf);
815 UdmLog(A, UDM_LOG_ERROR, "dezint4ed data length: %d", (int) (*len));
816 UdmLog(A, UDM_LOG_ERROR, "dezint4 done: %.2f", UdmStopTimer(&ticks));
817 }
818 return src;
819 }
820
821
822 static void
UdmBlobModeInflateOrAlloc(UDM_AGENT * A,UDM_DSTR * buf,const char * name,UDM_STR * row,UDM_CONST_STR * dst)823 UdmBlobModeInflateOrAlloc(UDM_AGENT *A, UDM_DSTR *buf, const char *name,
824 UDM_STR *row, UDM_CONST_STR *dst)
825 {
826 dst->str= UdmBlobModeInflateOrSelf(A, buf, name, row->str, &row->length);
827 if (dst->str == row->str)
828 {
829 UdmDSTRRealloc(buf, row->length);
830 memcpy(buf->Val.str, row->str, row->length);
831 buf->Val.length= row->length;
832 dst->str= UdmDSTRPtr(buf);
833 }
834 dst->length= row->length;
835 }
836
837
838 static udm_rc_t
UdmInflateBlobModeSQLRes(UDM_AGENT * A,UDM_SQLRES * src)839 UdmInflateBlobModeSQLRes(UDM_AGENT *A, UDM_SQLRES *src)
840 {
841 UDM_DSTR ibuf;
842 size_t row;
843 UdmDSTRInit(&ibuf, 1024);
844 for (row= 0; row < src->nRows; row++)
845 {
846 size_t len= UdmSQLLen(src, row, 1);
847 const char *val= UdmSQLValue(src, row, 1);
848 const char *iflt;
849 iflt= UdmBlobModeInflateOrSelf(A, &ibuf, NULL, val, &len);
850 if (iflt != val)
851 {
852 size_t offs= src->nCols*row + 1;
853 UdmFree(src->Items[offs].str);
854 src->Items[offs].str= (char*) UdmMalloc(len + 1);
855 memcpy(src->Items[offs].str, iflt, len);
856 src->Items[offs].length= len;
857 src->Items[offs].str[len]= '\0';
858 }
859 }
860 UdmDSTRFree(&ibuf);
861 return UDM_OK;
862 }
863
864
865 static udm_rc_t
UdmAddCollationMatch(UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,const char * word,size_t count)866 UdmAddCollationMatch(UDM_QUERY *Query, UDM_FINDWORD_ARGS *args,
867 const char *word, size_t count)
868 {
869 /*
870 If match is not full, then we don't know whether
871 the word is a substring or a collation match.
872 Let's assume it is a substring, to avoid long
873 word lists in $(WE).
874 */
875 if (args->Word.Param.match_mode == UDM_MATCH_FULL ||
876 args->Word.Param.match_mode == UDM_MATCH_RANGE)
877 {
878 UDM_WIDEWORD_PARAM Param= Query->Res.WWList.Word[args->Word.Param.order].Param;
879 Param.origin= UDM_WORD_ORIGIN_COLLATION;
880 Param.count= count;
881 UdmWideWordListAddLike(&args->CollationMatches, &Param, word);
882 }
883 return UDM_OK;
884 }
885
886
887 static udm_rc_t
UdmBlobAddCoords(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,UDM_SQLRES * SQLRes)888 UdmBlobAddCoords(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
889 UDM_FINDWORD_ARGS *args,
890 UDM_SQLRES *SQLRes)
891 {
892 size_t numrows= UdmSQLNumRows(SQLRes);
893 size_t i;
894 char *wf= args->query_param.wf;
895 UDM_URLID_LIST *urls= &args->urls;
896 int need_coords= args->query_param.NewVersion ?
897 args->need_coords : (Query->Res.WWList.nwords > 1);
898 udm_bool_t save_section_size= args->query_param.SaveSectionSize;
899 UDM_SEARCHSECTION Section;
900
901 bzero((void*) &Section, sizeof(Section));
902 Section.wordnum= args->Word.Param.order & 0xFF;
903 Section.order= Query->Res.WWList.Word[Section.wordnum].Param.order;
904
905 for (i= 0; i < numrows; i++)
906 {
907 const unsigned char *s= (const unsigned char *)UdmSQLValue(SQLRes, i, 1);
908 size_t length= UdmSQLLen(SQLRes, i, 1);
909 unsigned char secno= UDM_ATOI(UdmSQLValue(SQLRes, i, 0));
910 const char *cmatch= UdmSQLValue(SQLRes, i, 2);
911 UDM_SEARCHSECTIONLIST SectionList;
912
913 if (!wf[secno])
914 continue;
915
916 Section.secno= secno;
917
918 UdmSectionListBlobCoordsUnpack(A, &SectionList,
919 urls, &Section,
920 s, length,
921 save_section_size,
922 need_coords);
923
924 #ifdef HAVE_DEBUG
925 if (UdmVarListFindBool(UdmSQLDBVars(db), "DebugSectionList", UDM_FALSE))
926 UdmSearchSectionListPrint(&SectionList);
927 #endif
928
929 if (SectionList.nsections && SectionList.ncoords)
930 {
931 UdmSearchSectionListListAdd(&args->SearchSectionListList, &SectionList);
932 args->Word.Param.count+= SectionList.ncoords;
933 UdmAddCollationMatch(Query, args, cmatch, SectionList.ncoords);
934 }
935 else
936 {
937 UdmSearchSectionListFree(&SectionList);
938 }
939 }
940
941 return UDM_OK;
942 }
943
944
945 static udm_rc_t
UdmFindWordBlobFromTable(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,const char * table_name)946 UdmFindWordBlobFromTable(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
947 UDM_FINDWORD_ARGS *args, const char *table_name)
948 {
949 char qbuf[4096];
950 char secno[32]= "";
951 char special[32]= "";
952 udm_timer_t ticks;
953 UDM_SQLRES SQLRes;
954 udm_rc_t rc;
955
956 if (args->urls.empty)
957 {
958 UdmLog(A, UDM_LOG_DEBUG,
959 "Not searching '%s': Base URL limit is empty", table_name);
960 return UDM_OK;
961 }
962
963 ticks= UdmStartTimer();
964 UdmLog(A, UDM_LOG_DEBUG, "Start fetching");
965 if (args->Word.Param.secno)
966 udm_snprintf(secno, sizeof(secno), " AND secno=%d", (int) args->Word.Param.secno);
967 /*
968 When performing substring or number search,
969 don't include special data, like '##last_mod_time' or '##rec_id'
970 */
971 if (args->Word.Param.match_mode != UDM_MATCH_FULL)
972 udm_snprintf(special, sizeof(special), " AND word NOT LIKE '##%%'");
973 udm_snprintf(qbuf, sizeof(qbuf),
974 "SELECT secno,coords,word FROM %s WHERE %s%s%s",
975 table_name, args->cmparg, secno, special);
976 if (UDM_OK != (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)))
977 return rc;
978 UdmLog(A, UDM_LOG_DEBUG, "%-30s%.2f", "Stop fetching", UdmStopTimer(&ticks));
979
980 ticks= UdmStartTimer();
981 UdmLog(A, UDM_LOG_DEBUG, "Start BlobAddCoords");
982 UdmInflateBlobModeSQLRes(A, &SQLRes);
983 UdmBlobAddCoords(A, db, Query, args, &SQLRes);
984 UdmLog(A, UDM_LOG_DEBUG, "%-30s%.2f", "Stop BlobAddCoords", UdmStopTimer(&ticks));
985 if (args->query_param.NewVersion)
986 {
987 UdmSQLResListAdd(&args->SQLResults, &SQLRes);
988 }
989 else
990 {
991 UdmSQLFree(&SQLRes);
992 }
993 return(UDM_OK);
994 }
995
996
997 static udm_rc_t
UdmFindWordBlobSimple(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)998 UdmFindWordBlobSimple(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
999 UDM_FINDWORD_ARGS *args)
1000 {
1001 char tablename[64];
1002 udm_rc_t rc;
1003 int delta= UdmVarListFindBool(&A->Conf->Vars, "UseDelta", UDM_FALSE);
1004 UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
1005 if (UDM_OK != (rc= UdmFindWordBlobFromTable(A, db, Query, args, tablename)))
1006 return rc;
1007 if (delta && UDM_OK != (rc= UdmFindWordBlobFromTable(A, db, Query,
1008 args, "bdict_delta")))
1009 return rc;
1010 return UDM_OK;
1011 }
1012
1013
1014 static udm_rc_t
UdmLoadSlowLimitWithSort(UDM_AGENT * A,UDM_DB * db,UDM_URLID_LIST * list,const char * q)1015 UdmLoadSlowLimitWithSort(UDM_AGENT *A, UDM_DB *db, UDM_URLID_LIST *list, const char *q)
1016 {
1017 udm_rc_t rc= UdmLoadSlowLimit(A, db, list, q);
1018 if (rc == UDM_OK)
1019 UdmURLIdListSort(list);
1020 return rc;
1021 }
1022
1023
1024 static udm_rc_t
UdmBlobLoadLiveUpdateLimitLoad(UDM_AGENT * A,UDM_DB * db,UDM_FINDWORD_ARGS * args)1025 UdmBlobLoadLiveUpdateLimitLoad(UDM_AGENT *A, UDM_DB *db, UDM_FINDWORD_ARGS *args)
1026 {
1027 udm_rc_t rc;
1028 int ts= 0;
1029 udm_timer_t ticks;
1030 char qbuf[128];
1031 UDM_ASSERT(UdmSQLDBMode(db) == UDM_SQLDBMODE_BLOB);
1032 ticks= UdmStartTimer();
1033 UdmLog(A, UDM_LOG_DEBUG, "Start loading LiveUpdate url_id list");
1034 if (UDM_OK != (rc= UdmBlobReadTimestamp(A, db, &ts, 0)))
1035 return rc;
1036 args->live_updates_ts= ts;
1037 udm_snprintf(qbuf, sizeof(qbuf),
1038 "SELECT url_id FROM cachedcopy WHERE ts>=%d", ts);
1039 if (UDM_OK != (rc= UdmLoadSlowLimitWithSort(A, db,
1040 &args->live_update_deleted_urls,
1041 qbuf)))
1042 return rc;
1043 UdmLog(A, UDM_LOG_DEBUG,
1044 "Stop loading LiveUpdate url_id list: %.02f, %d updated docs found",
1045 UdmStopTimer(&ticks), (int) args->live_update_deleted_urls.nurls);
1046 args->live_update_deleted_urls.exclude= 1;
1047 UdmURLIdListCopy(&args->live_update_active_urls, &args->urls);
1048 UdmURLIdListMerge(&args->urls, &args->live_update_deleted_urls);
1049 return UDM_OK;
1050 }
1051
1052
1053 static udm_rc_t
UdmFindWordBlobLiveUpdates(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)1054 UdmFindWordBlobLiveUpdates(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1055 UDM_FINDWORD_ARGS *args)
1056 {
1057 udm_rc_t rc;
1058 if (!(UDM_OK == (rc= UdmFindWordBlobSimple(A, db, Query, args))) ||
1059 !(UDM_OK == (rc= UdmFindWordRawBlobDelta(A, db, Query, args))))
1060 goto ret;
1061
1062 ret:
1063 return rc;
1064 }
1065
1066
1067 static udm_rc_t
UdmFindWordBlob(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)1068 UdmFindWordBlob(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1069 UDM_FINDWORD_ARGS *args)
1070 {
1071 return args->live_updates ?
1072 UdmFindWordBlobLiveUpdates(A, db, Query, args) :
1073 UdmFindWordBlobSimple(A, db, Query, args);
1074 }
1075
1076
1077 udm_rc_t
UdmBlobReadTimestamp(UDM_AGENT * A,UDM_DB * db,int * ts,int def)1078 UdmBlobReadTimestamp(UDM_AGENT *A, UDM_DB *db, int *ts, int def)
1079 {
1080 udm_rc_t rc;
1081 char lname[]= "##ts";
1082 char qbuf[64], tablename[64];
1083 UDM_SQLRES SQLRes;
1084
1085 UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
1086 udm_snprintf(qbuf, sizeof(qbuf), "SELECT coords FROM %s WHERE word='%s'",
1087 tablename, lname);
1088 if (UDM_OK == (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)) &&
1089 UdmSQLNumRows(&SQLRes) > 0)
1090 *ts= atoi(UdmSQLValue(&SQLRes, 0,0));
1091 else
1092 *ts= def;
1093 UdmSQLFree(&SQLRes);
1094 return rc;
1095 }
1096
1097
1098 static udm_rc_t
UdmBlobWriteWordPrepare(UDM_AGENT * A,UDM_DB * db,const char * table)1099 UdmBlobWriteWordPrepare(UDM_AGENT *A, UDM_DB *db, const char *table)
1100 {
1101 udm_rc_t rc;
1102 char qbuf[128];
1103 const char *int_cast= UdmSQLDBType(db) == UDM_DB_PGSQL ? "::integer" : "";
1104 udm_snprintf(qbuf, sizeof(qbuf),
1105 "INSERT INTO %s (word,secno,coords) "
1106 "VALUES(%s, %s%s, %s)",
1107 table,
1108 UdmDBSQLParamPlaceHolder(db, 1),
1109 UdmDBSQLParamPlaceHolder(db, 2),
1110 int_cast,
1111 UdmDBSQLParamPlaceHolder(db, 3));
1112 rc= UdmDBSQLPrepare(A, db, qbuf);
1113 return rc;
1114 }
1115
1116
#if 0
/*
  Disabled helper (compiled out by "#if 0").

  Asks the local character set for the length of the well-formed
  prefix of "str". If that prefix is shorter than "length", the
  remaining bytes are logged in hex at debug level and UDM_TRUE is
  returned. Otherwise UDM_FALSE is returned.

  Note that UDM_TRUE therefore means "NOT well formed".
*/
static udm_bool_t
check_well_formed_length_with_warn(UDM_AGENT *A,
                                   const char *str, size_t length)
{
  size_t wflength= A->Conf->lcs->cset->well_formed_length(A->Conf->lcs,
                                                          str, length,
                                                          UDM_RECODE_HTML);
  if (wflength < length)
  {
    /* Dump only the tail that follows the well-formed prefix */
    UDM_DSTR tmp;
    UdmDSTRInit(&tmp, 128);
    UdmDSTRAppendHex(&tmp, str + wflength, length - wflength);
    UdmLog(A, UDM_LOG_DEBUG, "Not a well formed word: '%s'", tmp.Val.str);
    UdmDSTRFree(&tmp);
    return UDM_TRUE;
  }
  return UDM_FALSE;
}
#endif
1137
1138
1139 static udm_rc_t
UdmBlobWriteWordUsingBind(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,uint4 secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1140 UdmBlobWriteWordUsingBind(UDM_AGENT *A, UDM_DB *db, const char *table,
1141 const char *word, uint4 secno,
1142 const char *data, size_t len, UDM_DSTR *buf,
1143 int auto_prepare)
1144 {
1145 udm_rc_t rc;
1146 size_t wordlen= strlen(word);
1147 if ((auto_prepare &&
1148 UDM_OK != (rc= UdmBlobWriteWordPrepare(A, db, table))) ||
1149 UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 1, word, (int) wordlen, UDM_SQLTYPE_VARCHAR)) ||
1150 UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 2, &secno, (int) sizeof(secno), UDM_SQLTYPE_INT32)) ||
1151 UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 3, data, (int) len, UDM_SQLTYPE_LONGVARBINARY) ) ||
1152 UDM_OK != (rc= UdmDBSQLExecute(A, db)) ||
1153 (auto_prepare && UDM_OK != (rc= UdmDBSQLStmtFree(A, db))))
1154 return rc;
1155
1156 return UDM_OK;
1157 }
1158
1159
1160 static void
UdmDSTREncodeForDB(UDM_AGENT * A,UDM_DB * db,UDM_DSTR * buf,const char * src,size_t length)1161 UdmDSTREncodeForDB(UDM_AGENT *A, UDM_DB *db, UDM_DSTR *buf,
1162 const char *src, size_t length)
1163 {
1164 if (UdmSQLDBType(db) == UDM_DB_PGSQL)
1165 {
1166 char *s= buf->Val.str + buf->Val.length;
1167 buf->Val.length+= UdmDBSQLBinEscStr(A, db, s, buf->size_alloced, src, length);
1168 }
1169 else
1170 UdmDSTRAppendHex(buf, src, length);
1171 }
1172
1173
1174 static udm_rc_t
UdmBlobWriteWordUsingEncoding(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1175 UdmBlobWriteWordUsingEncoding(UDM_AGENT *A, UDM_DB *db, const char *table,
1176 const char *word, size_t secno,
1177 const char *data, size_t len, UDM_DSTR *buf,
1178 int auto_prepare)
1179 {
1180 udm_rc_t rc;
1181 size_t escape_factor= UdmSQLDBType(db) == UDM_DB_PGSQL ? 5 : 2;
1182 const char *pf= UdmSQLDBType(db) == UDM_DB_PGSQL ? "'" : "0x";
1183 const char *sf= UdmSQLDBType(db) == UDM_DB_PGSQL ? "'" : "";
1184 const char *E= (UdmSQLDBDriver(db) == UDM_DBAPI_PGSQL && UdmSQLDBVersion(db) >= 80101) ? "E" : "";
1185 size_t nbytes= 100 + len * escape_factor + 1;
1186
1187 if (UdmSQLDBFlags(db) & UDM_SQL_HAVE_STDHEX) /* X'AABBCC' syntax */
1188 {
1189 pf= "X'";
1190 sf= "'";
1191 }
1192 else if (UdmSQLDBFlags(db) & UDM_SQL_HAVE_BLOB_AS_HEX) /* 'AABBCC' syntax */
1193 {
1194 pf= "'";
1195 sf= "'";
1196 }
1197
1198 UdmDSTRReset(buf);
1199
1200 if (UdmDSTRAlloc(buf, nbytes))
1201 {
1202 UdmLog(A, UDM_LOG_ERROR,
1203 "BlobWriteWordUsingEncoding: DSTRAlloc(%d) failed: "
1204 "word='%s' secno=%d length=%d",
1205 (int) nbytes, word, (int) secno, (int) len);
1206 return UDM_OK; /* Skip this word - try to continue */
1207 }
1208 UdmDSTRAppendf(buf, "INSERT INTO %s VALUES('%s', %d, %s%s",
1209 table, word, (int) secno, E, pf);
1210 UdmDSTREncodeForDB(A, db, buf, data, len);
1211 UdmDSTRAppendf(buf, "%s)", sf);
1212 if (UDM_OK != (rc= UdmDBSQLQuery(A, db, NULL, UdmDSTRPtr(buf))))
1213 return rc;
1214
1215 UdmDSTRReset(buf);
1216
1217 return UDM_OK;
1218 }
1219
1220
1221 static udm_rc_t
UdmBlobWriteWordUsingMultiInsert(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1222 UdmBlobWriteWordUsingMultiInsert(UDM_AGENT *A, UDM_DB *db, const char *table,
1223 const char *word, size_t secno,
1224 const char *data, size_t len, UDM_DSTR *buf,
1225 int auto_prepare)
1226 {
1227 const char *comma= ",";
1228 size_t escape_factor= 2;
1229 size_t nbytes= UdmDSTRLength(buf) + 100 + len * escape_factor + 1;
1230
1231 if (UdmDSTRRealloc(buf, nbytes))
1232 {
1233 UdmLog(A, UDM_LOG_ERROR, "DSTRAlloc(%d) failed: word='%s' secno=%d len=%d",
1234 (int) nbytes, word, (int) secno, (int) len);
1235 return UDM_ERROR;
1236 }
1237
1238 if (!UdmDSTRLength(buf))
1239 {
1240 UdmDSTRAppendf(buf, "INSERT INTO %s VALUES ", table);
1241 comma= "";
1242 }
1243
1244 UdmDSTRAppendf(buf, "%s('%s',%d,0x", comma, word, (int) secno);
1245 UdmDSTRAppendHex(buf, data, len);
1246 UdmDSTRAppendf(buf, ")");
1247 return UDM_OK;
1248 }
1249
1250
1251 static udm_rc_t
UdmBlobWriteWord(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare,int use_multi_insert)1252 UdmBlobWriteWord(UDM_AGENT *A, UDM_DB *db, const char *table,
1253 const char *word, size_t secno,
1254 const char *data, size_t len, UDM_DSTR *buf,
1255 int auto_prepare, int use_multi_insert)
1256 {
1257 udm_rc_t rc;
1258 int use_bind= UdmSQLDBFlags(db) & UDM_SQL_HAVE_BIND_BINARY;
1259
1260 rc= use_multi_insert ?
1261 UdmBlobWriteWordUsingMultiInsert(A, db, table, word, secno, data, len,
1262 buf, auto_prepare) :
1263 use_bind ?
1264 UdmBlobWriteWordUsingBind(A, db, table, word, secno, data, len,
1265 buf, auto_prepare):
1266 UdmBlobWriteWordUsingEncoding(A, db, table, word, secno, data, len,
1267 buf, auto_prepare);
1268 return rc;
1269 }
1270
1271
1272 static udm_rc_t
UdmBlobWriteWordCmpr(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,UDM_DSTR * z,int use_zint4,int auto_prepare,int allow_multi_insert)1273 UdmBlobWriteWordCmpr(UDM_AGENT *A, UDM_DB *db, const char *table,
1274 const char *word, size_t secno,
1275 const char *data, size_t len,
1276 UDM_DSTR *buf, UDM_DSTR *z,
1277 int use_zint4,
1278 int auto_prepare,
1279 int allow_multi_insert)
1280 {
1281 #ifdef HAVE_ZLIB
1282 if (z && len > 256)
1283 {
1284 UdmDSTRReset(z);
1285 UdmDSTRRealloc(z, len + 8 + 1); /* 8 for two INTs */
1286 /* Append Format version */
1287 #if 0
1288 if (use_zint4)
1289 {
1290 udm_rc_t dummy;
1291 /* Something is wrong here: why UdmDeflate? */
1292 UdmDSTRAppendCompressionType(z, UDM_BLOB_COMP_ZINT4_DEFLATE);
1293 UdmDSTRAppendDeflate(z, data + 8, len - 8);
1294 }
1295 else
1296 #endif
1297 {
1298 UdmDSTRAppendCompressionType(z, UDM_BLOB_COMP_DEFLATE);
1299 UdmDSTRAppendDeflate(z, data, len);
1300 }
1301 if (UdmDSTRLength(z) < len)
1302 {
1303 data= UdmDSTRPtr(z);
1304 len= UdmDSTRLength(z);
1305 }
1306 }
1307 #endif
1308 return UdmBlobWriteWord(A, db, table, word, secno, data, len, buf,
1309 auto_prepare, allow_multi_insert);
1310 }
1311
1312
1313 /*
1314 The word must not require escaping!
1315 */
1316 static udm_rc_t
UdmBlobDeleteWordFromTable(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word)1317 UdmBlobDeleteWordFromTable(UDM_AGENT *A, UDM_DB *db,
1318 const char *table, const char *word)
1319 {
1320 char qbuf[64];
1321 udm_snprintf(qbuf, sizeof(qbuf),
1322 "DELETE FROM %s WHERE word='%s'", table, word);
1323 return UdmDBSQLQuery(A, db, NULL, qbuf);
1324 }
1325
1326
1327 static udm_rc_t
UdmBlobRewriteIntWord(UDM_AGENT * A,UDM_DB * db,UDM_DSTR * buf,const char * table,const char * name,int value,udm_bool_t rewrite)1328 UdmBlobRewriteIntWord(UDM_AGENT *A, UDM_DB *db, UDM_DSTR *buf,
1329 const char *table, const char *name, int value,
1330 udm_bool_t rewrite)
1331 {
1332 udm_rc_t rc;
1333 char data[64];
1334 size_t size_data;
1335 UdmLog(A, UDM_LOG_DEBUG, "Writing '%s'", name);
1336 if (rewrite &&
1337 UDM_OK != (rc= UdmBlobDeleteWordFromTable(A, db, table, name)))
1338 return rc;
1339 size_data= udm_snprintf(data, sizeof(data), "%d", value);
1340 return UdmBlobWriteWord(A, db, table, name, 0, data, size_data, buf, 1, 0);
1341 }
1342
1343
1344 static udm_rc_t
UdmBlobWriteTimestamp(UDM_AGENT * A,UDM_DB * db,const char * table,udm_bool_t rewrite)1345 UdmBlobWriteTimestamp(UDM_AGENT *A, UDM_DB *db,
1346 const char *table, udm_bool_t rewrite)
1347 {
1348 udm_rc_t rc;
1349 UDM_DSTR buf;
1350 UdmDSTRInit(&buf, 128);
1351 if (UDM_OK != (rc= UdmBlobRewriteIntWord(A, db, &buf, table, "##ts",
1352 (int) time(0), rewrite)))
1353 goto ex;
1354 rc= UdmBlobRewriteIntWord(A, db, &buf, table, "##version",
1355 UDM_VERSION_ID, rewrite);
1356 ex:
1357 UdmDSTRFree(&buf);
1358 return rc;
1359 }
1360
1361
1362
1363 static udm_rc_t
UdmTruncateDictBlob(UDM_AGENT * Indexer,UDM_DB * db)1364 UdmTruncateDictBlob(UDM_AGENT *Indexer, UDM_DB *db)
1365 {
1366 return UdmDBSQLTableTruncateOrDelete(Indexer, db, "bdict");
1367 }
1368
1369 /************************************************/
1370
/*
  Reusable working state for writing special records
  (see UdmWriteWordWithHelper).
*/
typedef struct
{
  UDM_DSTR buf;            /* Scratch buffer passed to the record writer   */
  UDM_DSTR compress;       /* Buffer for deflated data                     */
  udm_bool_t use_deflate;  /* Pass the compression buffer to the writer?   */
} UDM_WRITE_HELPER;
1377
1378
1379 static udm_rc_t
UdmWriteHelperInit(UDM_WRITE_HELPER * Helper,UDM_DB * db)1380 UdmWriteHelperInit(UDM_WRITE_HELPER *Helper, UDM_DB *db)
1381 {
1382 bzero((void*) Helper, sizeof(*Helper));
1383 #ifdef HAVE_ZLIB
1384 Helper->use_deflate= UdmVarListFindBool(UdmSQLDBVars(db), "deflate", UDM_FALSE);
1385 #endif
1386 if (UDM_OK != UdmDSTRInit(&Helper->buf, 8 * 1024))
1387 return UDM_ERROR;
1388 if (UDM_OK != UdmDSTRInit(&Helper->compress, 8 * 1024))
1389 {
1390 UdmDSTRFree(&Helper->buf);
1391 return UDM_ERROR;
1392 }
1393 return UDM_OK;
1394 }
1395
1396
1397 static void
UdmWriteHelperFree(UDM_WRITE_HELPER * Helper)1398 UdmWriteHelperFree(UDM_WRITE_HELPER *Helper)
1399 {
1400 UdmDSTRFree(&Helper->buf);
1401 UdmDSTRFree(&Helper->compress);
1402 }
1403
1404
1405 static udm_rc_t
UdmWriteWordWithHelper(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,const UDM_DSTR * data,UDM_WRITE_HELPER * Helper,int use_zint4)1406 UdmWriteWordWithHelper(UDM_AGENT *A, UDM_DB *db, const char *table,
1407 const char *word, const UDM_DSTR *data,
1408 UDM_WRITE_HELPER *Helper, int use_zint4)
1409 {
1410 return UdmBlobWriteWordCmpr(A, db, table, word, /*secno*/0,
1411 UdmDSTRPtr(data), UdmDSTRLength(data),
1412 &Helper->buf,
1413 Helper->use_deflate ? &Helper->compress : NULL,
1414 use_zint4, 1, 0);
1415 }
1416 /*****************************************************/
1417
1418 /*
1419 Write limits, but don't COMMIT and don't write timestamp
1420 */
1421 static udm_rc_t
UdmBlobWriteLimitsInternal(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_WRITE_HELPER * Helper)1422 UdmBlobWriteLimitsInternal(UDM_AGENT *A, UDM_DB *db,
1423 const char *table, UDM_WRITE_HELPER *Helper)
1424 {
1425 UDM_VARLIST *Vars= &A->Conf->Vars;
1426 UDM_DSTR l;
1427 udm_rc_t rc= UDM_OK;
1428 size_t nvar;
1429
1430 UdmDSTRInit(&l, 8192);
1431 for (nvar= 0; nvar < Vars->nvars; nvar++)
1432 {
1433 UDM_VAR *Var= UdmVarListFindByIndex(Vars, nvar);
1434 size_t i, ndocs;
1435 char qbuf[128];
1436 char lname[64];
1437 UDM_URLID_LIST list;
1438 UDM_URL_INT4_LIST UserScore;
1439 int is_score= 0;
1440 udm_timer_t ticks;
1441
1442 if (!strncasecmp(UdmVarName(Var), "Limit.", 6))
1443 udm_snprintf(lname, sizeof(lname), "##limit#%s", UdmVarName(Var) + 6);
1444 else if (!strncasecmp(UdmVarName(Var), "Order.", 6))
1445 udm_snprintf(lname, sizeof(lname), "##order#%s", UdmVarName(Var) + 6);
1446 else if ((is_score= !strncasecmp(UdmVarName(Var), "Score.", 6)))
1447 udm_snprintf(lname, sizeof(lname), "##score#%s", UdmVarName(Var) + 6);
1448 else
1449 continue;
1450 UdmLog(A, UDM_LOG_DEBUG, "Writing '%s'", lname);
1451
1452 bzero((void*) &list, sizeof(list));
1453 bzero((void*) &UserScore, sizeof(UserScore));
1454
1455 if (UDM_OK != (rc= is_score ?
1456 UdmUserScoreListLoad(A, db, &UserScore, UdmVarStr(Var)) :
1457 UdmLoadSlowLimit(A, db, &list, UdmVarStr(Var))))
1458 goto ret;
1459
1460 ticks= UdmStartTimer();
1461
1462 if (!strncasecmp(UdmVarName(Var), "Limit.", 6))
1463 UdmURLIdListSort(&list);
1464
1465 UdmDSTRReset(&Helper->buf);
1466 UdmDSTRReset(&l);
1467 ndocs= is_score ? UserScore.nitems : list.nurls;
1468 for (i= 0; i < ndocs; i++)
1469 {
1470 if (is_score)
1471 {
1472 UDM_URL_INT4 *item= &UserScore.Item[i];
1473 char ch= item->param;
1474 UdmDSTRAppendINT4(&l, item->url_id);
1475 UdmDSTRAppend(&l, &ch, 1);
1476 }
1477 else
1478 {
1479 /* Limit */
1480 UdmDSTRAppendINT4(&l, list.urls[i]);
1481 }
1482 }
1483
1484 udm_snprintf(qbuf, sizeof(qbuf), "DELETE FROM %s WHERE word=('%s')", table, lname);
1485 if (UDM_OK != (rc= UdmDBSQLQuery(A, db, NULL, qbuf)))
1486 goto ret;
1487
1488 if (UdmDSTRLength(&l))
1489 {
1490 if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, lname, &l,
1491 Helper, 0)))
1492 goto ret;
1493 }
1494
1495 UDM_FREE(list.urls);
1496 UDM_FREE(UserScore.Item);
1497 UdmLog(A, UDM_LOG_DEBUG, "%d documents written to '%s': %.2f",
1498 (int) ndocs, lname, UdmStopTimer(&ticks));
1499 }
1500 ret:
1501 UdmDSTRFree(&l);
1502 return rc;
1503 }
1504
1505
1506 /*******************************************/
1507
1508 static udm_rc_t
UdmURLDataListStorePopularityBdict(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_URLDATALIST * List,UDM_WRITE_HELPER * Helper)1509 UdmURLDataListStorePopularityBdict(UDM_AGENT *A, UDM_DB *db, const char *table,
1510 UDM_URLDATALIST *List,
1511 UDM_WRITE_HELPER *Helper)
1512 {
1513 udm_rc_t rc= UDM_OK;
1514 UDM_DSTR pop;
1515
1516 UdmLog(A, UDM_LOG_DEBUG, "Writing '##pop'");
1517
1518 /* TODO34: add "rewrite" parameter */
1519 if (UDM_OK != (rc= UdmBlobDeleteWordFromTable(A, db, table, "##pop")))
1520 return rc;
1521
1522 if (UDM_OK != (rc= UdmDSTRInit(&pop, 8192)))
1523 return UDM_ERROR;
1524
1525 if (UDM_OK != (rc= UdmURLDataListPackPopularity(A, List, &pop)))
1526 {
1527 UdmLog(A, UDM_LOG_ERROR, "Packing popularity failed");
1528 goto ex;
1529 }
1530
1531 if (UdmDSTRLength(&pop))
1532 rc= UdmWriteWordWithHelper(A, db, table, "##pop", &pop, Helper, 0);
1533 ex:
1534 UdmDSTRFree(&pop);
1535 return rc;
1536 }
1537
1538
1539 static udm_rc_t
UdmURLDataListStorePopularityTable(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * List)1540 UdmURLDataListStorePopularityTable(UDM_AGENT *A, UDM_DB *db,
1541 UDM_URLDATALIST *List)
1542 {
1543 udm_timer_t timer= UdmStartTimer();
1544 char qbuf[128];
1545 size_t i;
1546 const char *table= UdmVarListFindStr(&A->Conf->Vars, "SQLExportPopularityTable", NULL);
1547
1548 if (!table)
1549 return UDM_OK;
1550
1551 /* TODO34: allow only safe table names? */
1552 UdmLog(A, UDM_LOG_DEBUG, "Writing popularity table '%s'", table);
1553 if (UDM_OK != UdmDBSQLTableTruncateOrDelete(A, db, table))
1554 return UDM_ERROR;
1555 udm_snprintf(qbuf, sizeof(qbuf), "%s WRITE", table);
1556 if (UDM_OK != UdmDBSQLLockOrBegin(A, db, qbuf))
1557 return UDM_ERROR;
1558 for (i= 0; i < List->nitems; i++)
1559 {
1560 UDM_URLDATA *Item= &List->Item[i];
1561 udm_snprintf(qbuf, sizeof(qbuf),
1562 "INSERT INTO %s (url_id,inlinks,poprank) VALUES (%d,%d,%f)",
1563 table, Item->url_id, Item->per_site, Item->pop_rank);
1564 if (UDM_OK != UdmDBSQLQuery(A, db, NULL, qbuf))
1565 return UDM_ERROR;
1566 }
1567 if (UDM_OK != UdmDBSQLUnlockOrCommit(A, db))
1568 return UDM_ERROR;
1569 UdmLog(A, UDM_LOG_DEBUG, "Writing popularity table done: %.2f", UdmStopTimer(&timer));
1570 return UDM_OK;
1571 }
1572
1573
1574 static udm_rc_t
UdmWritePopularityBdictAndTable(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * URLData,const char * table,UDM_WRITE_HELPER * Helper)1575 UdmWritePopularityBdictAndTable(UDM_AGENT *A, UDM_DB *db,
1576 UDM_URLDATALIST *URLData,
1577 const char *table,
1578 UDM_WRITE_HELPER *Helper)
1579 {
1580 udm_rc_t rc;
1581 /* Write pop_rank only if we have some non-empty value */
1582 if (UDM_OK != (rc= UdmURLDataListStorePopularityBdict(A, db, table, URLData,
1583 Helper)))
1584 return rc;
1585 if (UDM_OK != (rc= UdmURLDataListStorePopularityTable(A, db, URLData)))
1586 return rc;
1587 return UDM_OK;
1588 }
1589
1590
1591 static udm_rc_t
UdmBlobWriteURLData(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,const char * table,UDM_WRITE_HELPER * Helper)1592 UdmBlobWriteURLData(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1593 const char *table,
1594 UDM_WRITE_HELPER *Helper)
1595 {
1596 udm_rc_t rc= UDM_OK;
1597 int use_zint4= UdmVarListFindBool(UdmSQLDBVars(db), "zint4", UDM_DEFAULT_ZINT4);
1598 size_t i;
1599 UDM_DSTR url_id, site, last_mod_time;
1600 UDM_URLDATALIST *List= &Query->URLData;
1601
1602 UdmDSTRInit(&url_id, 8192);
1603 UdmDSTRInit(&site, 1024);
1604 UdmDSTRInit(&last_mod_time, 8192);
1605
1606 for (i= 0; i < List->nitems; i++)
1607 {
1608 UDM_URLDATA *Item= &List->Item[i];
1609 UdmDSTRAppendINT4(&url_id, Item->url_id);
1610 UdmDSTRAppendINT4(&last_mod_time, Item->last_mod_time);
1611 }
1612
1613 UdmLog(A, UDM_LOG_DEBUG, "Writing '##rec_id'");
1614 if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##rec_id",
1615 &url_id, Helper, use_zint4)))
1616 goto ex;
1617
1618 UdmLog(A, UDM_LOG_DEBUG, "Writing '##last_mod_time'");
1619 if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##last_mod_time",
1620 &last_mod_time, Helper, 0)))
1621 goto ex;
1622
1623 if (UDM_OK != (rc= UdmURLDataListPackSite(List, &site)))
1624 goto ex;
1625
1626 UdmLog(A, UDM_LOG_DEBUG, "Writing '##site'");
1627 if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##site",
1628 &site, Helper, 0)))
1629 goto ex;
1630
1631 ex:
1632 UdmDSTRFree(&url_id);
1633 UdmDSTRFree(&site);
1634 UdmDSTRFree(&last_mod_time);
1635 return rc;
1636 }
1637
1638
1639 /*
1640 Unpack URL data from all packed records:
1641 '##rec_id'
1642 '##last_mod_time'
1643 */
1644 static size_t
UdmURLDataUnpackFull(UDM_URLDATALIST * DataList,size_t nrows,const char * rec_id_str,const char * last_mod_time_str)1645 UdmURLDataUnpackFull(UDM_URLDATALIST *DataList, size_t nrows,
1646 const char *rec_id_str,
1647 const char *last_mod_time_str)
1648 {
1649 size_t i, j;
1650 for (j= 0, i= 0; i < nrows; i++)
1651 {
1652 urlid_t rec_id= udm_get_int4(rec_id_str);
1653 rec_id_str+= 4;
1654
1655 if (rec_id == DataList->Item[j].url_id)
1656 {
1657 UDM_URLDATA *D= &DataList->Item[j];
1658 if (last_mod_time_str)
1659 D->last_mod_time= udm_get_int4(&last_mod_time_str[i*4]);
1660 j++;
1661 if (j == DataList->nitems)
1662 break;
1663 }
1664 }
1665 return j;
1666 }
1667
1668
1669 /*
1670 Unpack rec_id from '##rec_id' record.
1671 */
1672 static size_t
UdmURLDataUnpackRecID(UDM_AGENT * A,UDM_URLDATALIST * DataList,size_t nrows,const char * rec_id_str)1673 UdmURLDataUnpackRecID(UDM_AGENT *A, UDM_URLDATALIST *DataList, size_t nrows,
1674 const char *rec_id_str)
1675 {
1676 /* Need only rec_id */
1677 UDM_URLDATA *Data= DataList->Item;
1678 size_t j, i, skip= 0, ncoords= DataList->nitems;
1679
1680 for (j = 0, i = 0; i < nrows; i++)
1681 {
1682 urlid_t rec_id= udm_get_int4(rec_id_str);
1683 while (rec_id > Data[j].url_id && j < ncoords)
1684 {
1685 skip++;
1686 j++;
1687 }
1688
1689 if (rec_id == Data[j].url_id)
1690 {
1691 j++;
1692 if (j == ncoords) break;
1693 }
1694 rec_id_str+= 4;
1695 }
1696 if (j < ncoords)
1697 {
1698 skip+= (ncoords - j);
1699 UdmLog(A, UDM_LOG_DEBUG,
1700 "Warning: %d out of %d coords didn't have URL data",
1701 (int) skip, (int) DataList->nitems);
1702 j= DataList->nitems;
1703 }
1704 return j;
1705 }
1706
1707
1708 static size_t
UdmDSTRAppendWithComma(UDM_DSTR * dstr,const char * s,size_t length)1709 UdmDSTRAppendWithComma(UDM_DSTR *dstr, const char *s, size_t length)
1710 {
1711 if (UdmDSTRLength(dstr))
1712 UdmDSTRAppend(dstr, ",", 1);
1713 return UdmDSTRAppend(dstr, s, length);
1714 }
1715
1716
1717 udm_rc_t
UdmLoadURLDataFromBdict(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * DataList,int flags)1718 UdmLoadURLDataFromBdict(UDM_AGENT *A, UDM_DB *db,
1719 UDM_URLDATALIST *DataList,
1720 int flags)
1721 {
1722 udm_rc_t rc;
1723 char qbuf[4*1024], table[64];
1724 UDM_SQLRES SQLres;
1725 UDM_DSTR inbuf, rec_id_buf, site_buf, pop_rank_buf, last_mod_time_buf;
1726 UDM_STR row[2];
1727 udm_timer_t ticks= UdmStartTimer();
1728 int need_pop_rank= (flags & UDM_URLDATA_POP);
1729 int need_last_mod_time= (flags & UDM_URLDATA_LM);
1730 int need_site= (flags & (UDM_URLDATA_SITE | UDM_URLDATA_SITE_RANK));
1731 int need_id= need_last_mod_time; /* TODO34: pack last_mode_time toghether with id? */
1732 UDM_CONST_STR site, id, pop, lm;
1733
1734 if (!flags)
1735 return UDM_NOTARGET;
1736
1737 UdmConstStrInit(&site);
1738 UdmConstStrInit(&id);
1739 UdmConstStrInit(&pop);
1740 UdmConstStrInit(&lm);
1741
1742 UdmBlobGetRTable(A, db, table, sizeof(table));
1743
1744 UdmDSTRInit(&inbuf, 64);
1745 if (need_id)
1746 UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##rec_id'"));
1747 if (need_pop_rank)
1748 UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##pop'"));
1749 if (need_last_mod_time)
1750 UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##last_mod_time'"));
1751 if (need_site)
1752 UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##site'"));
1753
1754 /* Check that DataList is not empty and is sorted by url_id */
1755 UDM_ASSERT(DataList->nitems);
1756 UDM_ASSERT(DataList->Item[0].url_id <= DataList->Item[DataList->nitems-1].url_id);
1757
1758 UdmLog(A,UDM_LOG_DEBUG,"Loading URL data from bdict");
1759 udm_snprintf(qbuf, sizeof(qbuf),
1760 "SELECT word,coords FROM %s WHERE word IN (%s)",
1761 table, UdmDSTRPtr(&inbuf));
1762 UdmDSTRFree(&inbuf);
1763
1764 if (UDM_OK != (rc= UdmDBSQLExecDirect(A, db, &SQLres, qbuf)))
1765 {
1766 UdmLog(A,UDM_LOG_DEBUG,"Couldn't run a query on bdict");
1767 return(rc);
1768 }
1769
1770 UdmDSTRInit(&rec_id_buf, 4096);
1771 UdmDSTRInit(&site_buf, 4096);
1772 UdmDSTRInit(&pop_rank_buf, 4096);
1773 UdmDSTRInit(&last_mod_time_buf, 4096);
1774
1775 while (UdmDBSQLFetchRow(A, db, &SQLres, row) == UDM_OK)
1776 {
1777 if (!strcmp(row[0].str, "##rec_id"))
1778 UdmBlobModeInflateOrAlloc(A, &rec_id_buf, "##rec_id", &row[1], &id);
1779 else if (!strcmp(row[0].str, "##site"))
1780 UdmBlobModeInflateOrAlloc(A, &site_buf, "##site", &row[1], &site);
1781 else if (!strcmp(row[0].str, "##last_mod_time"))
1782 UdmBlobModeInflateOrAlloc(A, &last_mod_time_buf, "##last_mod_time",
1783 &row[1], &lm);
1784 else if (!strcmp(row[0].str, "##pop"))
1785 UdmBlobModeInflateOrAlloc(A, &pop_rank_buf, "##pop", &row[1], &pop);
1786 }
1787
1788 UdmLog(A, UDM_LOG_DEBUG, "Fetch from bdict done: %.2f", UdmStopTimer(&ticks));
1789
1790 if (need_pop_rank)
1791 {
1792 if (pop.str)
1793 {
1794 ticks= UdmStartTimer();
1795 UdmURLDataListUnpackPopularity(A, DataList, &pop);
1796 UdmLog(A, UDM_LOG_DEBUG, "Unpacking popularity done: %.02f", UdmStopTimer(&ticks));
1797 }
1798 else
1799 {
1800 /*
1801 All pop_rank values were 0 at "indexer -Eblob" time.
1802 Use 0 as pop_rank values.
1803 */
1804 UdmLog(A, UDM_LOG_DEBUG, "Warning: s=P is requested, but '##pop' record not found");
1805 UdmLog(A, UDM_LOG_DEBUG, "Perhaps you forgot to run 'indexer -n0 -R' before running 'indexer --index'");
1806 need_pop_rank= 0;
1807 }
1808 }
1809
1810 if (need_site)
1811 {
1812 if (site.str)
1813 {
1814 ticks= UdmStartTimer();
1815 UdmURLDataListUnpackSite(A, DataList, &site);
1816 UdmLog(A, UDM_LOG_DEBUG, "Unpacking site done: %.02f", UdmStopTimer(&ticks));
1817 }
1818 else
1819 {
1820 UdmLog(A, UDM_LOG_DEBUG, "No '##site' record found");
1821 }
1822 }
1823
1824 if (!need_id && !need_last_mod_time)
1825 {
1826 rc= UDM_OK;
1827 goto ret;
1828 }
1829
1830 if (id.str && id.length &&
1831 (lm.str || ! need_last_mod_time))
1832 {
1833 size_t j, nrows= id.length / 4;
1834
1835 ticks= UdmStartTimer();
1836 UdmLog(A, UDM_LOG_DEBUG, "Unpacking URL Data %d rows", (int) nrows);
1837 if (need_last_mod_time)
1838 {
1839 /* Need pop_rank or last_mod_time */
1840 j= UdmURLDataUnpackFull(DataList, nrows, id.str,
1841 need_last_mod_time ? lm.str : NULL);
1842 }
1843 else
1844 {
1845 /* Need only rec_id */
1846 j= UdmURLDataUnpackRecID(A, DataList, nrows, id.str);
1847 }
1848
1849 UdmLog(A, UDM_LOG_DEBUG, "Unpacking URL Data done: %.02f", UdmStopTimer(&ticks));
1850
1851 if (j != DataList->nitems)
1852 {
1853 UdmLog(A,UDM_LOG_DEBUG, "Expected to load %d URLs, loaded %d",
1854 (int) DataList->nitems, (int) j);
1855 UdmLog(A,UDM_LOG_DEBUG,"Couldn't load URL data from bdict");
1856 goto load_from_url;
1857 }
1858 }
1859 else
1860 {
1861 UdmLog(A,UDM_LOG_DEBUG,"There is no URL data in bdict");
1862 }
1863
1864
1865 load_from_url:
1866 rc= UDM_NOTARGET;
1867
1868 ret:
1869 UdmSQLFree(&SQLres);
1870 UdmDSTRFree(&rec_id_buf);
1871 UdmDSTRFree(&site_buf);
1872 UdmDSTRFree(&pop_rank_buf);
1873 UdmDSTRFree(&last_mod_time_buf);
1874 return rc;
1875 }
1876
1877
1878
1879 /*******************************************/
1880
1881 static udm_rc_t
UdmRewriteURL(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)1882 UdmRewriteURL(UDM_AGENT *Indexer, UDM_DB *db, UDM_QUERY *Query)
1883 {
1884 udm_rc_t rc;
1885 int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
1886 char tablename[64];
1887 UDM_WRITE_HELPER Helper;
1888
1889 if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
1890 return rc;
1891 UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
1892 if ((tr && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
1893 UDM_OK != (rc= UdmBlobWriteTimestamp(Indexer, db, tablename, UDM_TRUE)) ||
1894 UDM_OK != (rc= UdmBlobWriteURLData(Indexer, db, Query, tablename, &Helper)) ||
1895 UDM_OK != (rc= UdmBlobWriteLimitsInternal(Indexer, db, tablename, &Helper)) ||
1896 (tr && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
1897 {
1898 }
1899 UdmWriteHelperFree(&Helper);
1900 return rc;
1901 }
1902
1903
1904 /*
1905 Write limits with COMMIT and timestamp
1906 */
1907 static udm_rc_t
UdmBlobWriteLimits(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_WRITE_HELPER * Helper)1908 UdmBlobWriteLimits(UDM_AGENT *A, UDM_DB *db, const char *table,
1909 UDM_WRITE_HELPER *Helper)
1910 {
1911 udm_rc_t rc;
1912 if (UDM_OK != (rc= UdmDBSQLBegin(A, db)) ||
1913 UDM_OK != (rc= UdmBlobWriteLimitsInternal(A, db, table, Helper)) ||
1914 UDM_OK != (rc= UdmBlobWriteTimestamp(A, db, table, UDM_TRUE)) ||
1915 UDM_OK != (rc= UdmDBSQLCommit(A, db)))
1916 return rc;
1917 return UDM_OK;
1918 }
1919
1920
1921 static udm_rc_t
UdmRewriteLimits(UDM_AGENT * Indexer,UDM_DB * db)1922 UdmRewriteLimits(UDM_AGENT *Indexer, UDM_DB *db)
1923 {
1924 udm_rc_t rc;
1925 char tablename[64];
1926 UDM_WRITE_HELPER Helper;
1927 if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
1928 return rc;
1929 UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
1930 rc= UdmBlobWriteLimits(Indexer, db, tablename, &Helper);
1931 UdmWriteHelperFree(&Helper);
1932 return rc;
1933 }
1934
1935
1936
1937
1938 /************************************************/
1939
1940 #include "udm_doc.h"
1941 #include "udm_parsehtml.h"
1942 #include "udm_parsexml.h"
1943 #include "udm_server.h" /* UdmSpiderParamInit */
1944 #include "udm_url.h"
1945
1946 /* TODO34: merge with word.c */
1947 /*
1948 static int wlcmp_sort(UDM_WORD *w1, UDM_WORD *w2)
1949 {
1950 register int _;
1951 if ((_= strcmp(w1->word, w2->word)))
1952 return _;
1953 if (w1->secno != (int) w2->secno)
1954 return w1->secno < w2->secno ? -1 : 1;
1955 if (w1->pos != w2->pos)
1956 return w1->pos < w2->pos ? -1 : 1;
1957 return 0;
1958 }
1959 */
/*
  Compare two words by word text, then by section number.
  Word position is not taken into account.
  Returns the strcmp() result when the texts differ,
  otherwise -1, 0 or 1 according to the section numbers.
*/
static int wlcmp_search(UDM_WORD *w1, UDM_WORD *w2)
{
  int diff= strcmp(w1->word, w2->word);

  if (diff != 0)
    return diff;
  if (w1->coord.secno == (int) w2->coord.secno)
    return 0;
  return w1->coord.secno < w2->coord.secno ? -1 : 1;
}
1969 /*
1970 static void
1971 UdmWordListSort(UDM_WORDLIST *WL)
1972 {
1973 if (WL->nwords)
1974 UdmSort(WL->Word, WL->nwords, sizeof(UDM_WORD), (udm_qsort_cmp) wlcmp_sort);
1975 }
1976 */
1977
1978 static udm_rc_t
UdmWordList2InvertedIndexCache(UDM_AGENT * A,UDM_WORDLIST * WL,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_CACHE * cache)1979 UdmWordList2InvertedIndexCache(UDM_AGENT *A, UDM_WORDLIST *WL,
1980 UDM_CONSTWORD_HASH_DATA *data,
1981 UDM_INVERTED_INDEX_CACHE *cache)
1982 {
1983 size_t i;
1984 UDM_WORD *prev;
1985 if (!WL->nwords)
1986 return UDM_OK;
1987 for (prev= &WL->Word[0], i= 0; i <= WL->nwords; i++)
1988 {
1989 UDM_WORD *W= &WL->Word[i];
1990 if (i == WL->nwords || wlcmp_search(prev, W))
1991 {
1992 if (UDM_OK != UdmInvertedIndexCacheAdd(A, data, cache, prev, W - prev))
1993 return UDM_ERROR;
1994 prev= W;
1995 }
1996 }
1997 return UDM_OK;
1998 }
1999
2000
2001
2002 static udm_rc_t
UdmConstWordListToPairWordList(UDM_INVERTED_INDEX_CACHE * cache,UDM_WORDLIST * WL,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_COORD_LIST * CL)2003 UdmConstWordListToPairWordList(UDM_INVERTED_INDEX_CACHE *cache,
2004 UDM_WORDLIST *WL,
2005 UDM_CONSTWORD_HASH_DATA *data,
2006 UDM_INVERTED_INDEX_COORD_LIST *CL)
2007 {
2008 size_t i;
2009
2010 for (i= 1; i < CL->nitems; i++)
2011 {
2012 UDM_INVERTED_INDEX_COORD *C1= &CL->Item[i-1];
2013 UDM_INVERTED_INDEX_COORD *C2= &CL->Item[i];
2014 UDM_INVERTED_INDEX_WORD *W1= &cache->Words.list.Item[C1->word_id];
2015 UDM_INVERTED_INDEX_WORD *W2= &cache->Words.list.Item[C2->word_id];
2016 /*
2017 printf("[%d][%d][%d:%d][%d:%d] '%.*s'-'%.*s' (%d-%d)\n",
2018 i, (int) data->url_id,
2019 C1->secno, C1->pos, C2->secno, C2->pos,
2020 (int) W1->str.length, W1->str.str,
2021 (int) W2->str.length, W2->str.str,
2022 W1->last_url_id_count, W2->last_url_id_count);
2023 */
2024 /* TODO34: Check max_word_len and min_word_len */
2025 if (C1->coord.secno == C2->coord.secno &&
2026 C1->coord.pos + 1 == C2->coord.pos &&
2027 W1->last_url_id_count > cache->param.pair_limit &&
2028 W2->last_url_id_count > cache->param.pair_limit)
2029 {
2030 char word[256]= "";
2031 UDM_ASSERT(W1->last_url_id == W2->last_url_id);
2032 udm_snprintf(word, sizeof(word), "##%.*s-%.*s",
2033 (int) W1->length, W1->str,
2034 (int) W2->length, W2->str);
2035 /*
2036 fprintf(stderr, "[%d:%d:%d] %d-%d '%s'\n",
2037 data->url_id, C1->secno, C1->pos, W1->last_url_id_count, W2->last_url_id_count,
2038 word);
2039 */
2040 UdmWordListAddEx(WL, word, C1->coord.secno, C1->coord.pos, 0);
2041 }
2042 }
2043 return UDM_OK;
2044 }
2045
2046
2047 static udm_rc_t
UdmConvertPairs(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,UDM_INVERTED_INDEX_COORD_LIST * CL,UDM_CONSTWORD_HASH_DATA * hash_data)2048 UdmConvertPairs(UDM_AGENT *A,
2049 UDM_INVERTED_INDEX_CACHE *cache,
2050 UDM_INVERTED_INDEX_COORD_LIST *CL,
2051 UDM_CONSTWORD_HASH_DATA *hash_data)
2052 {
2053 udm_rc_t rc= UDM_OK;
2054 UDM_WORDLIST WL;
2055 UdmWordListInit(&WL);
2056 UdmConstWordListToPairWordList(cache, &WL, hash_data, CL);
2057 if (WL.nwords)
2058 rc= UdmWordList2InvertedIndexCache(A, &WL, hash_data, cache);
2059 UdmWordListFree(&WL);
2060 return rc;
2061 }
2062
2063
2064 static udm_rc_t
UdmWordListSortAndConvert(UDM_AGENT * A,UDM_CONSTWORDLIST * CWL,UDM_INVERTED_INDEX_CACHE * cache,urlid_t url_id,UDM_CHARSET * doccs)2065 UdmWordListSortAndConvert(UDM_AGENT *A,
2066 UDM_CONSTWORDLIST *CWL,
2067 UDM_INVERTED_INDEX_CACHE *cache,
2068 urlid_t url_id,
2069 UDM_CHARSET *doccs)
2070 {
2071 udm_rc_t rc= UDM_OK;
2072 UDM_INVERTED_INDEX_COORD_LIST CL;
2073 UDM_CONSTWORD_HASH_DATA hash_data;
2074 udm_timer_t ticks= UdmStartTimer();
2075
2076 if (!CWL->nitems)
2077 return UDM_OK;
2078
2079 UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2080 bzero(&CL, sizeof(CL));
2081 UdmConvInit(&hash_data.cnv, doccs, A->Conf->lcs);
2082 hash_data.url_id= url_id;
2083 hash_data.cache= cache;
2084 cache->Words.hash.user_data= &hash_data;
2085 cache->Stats.conv+= UdmStartTimer() - ticks;
2086
2087 ticks= UdmStartTimer();
2088 rc= UdmConstWordListToInvertedIndexCoordList(cache, &CL, CWL);
2089 cache->Stats.prepare_words+= UdmStartTimer() - ticks;
2090 UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2091
2092 if (rc != UDM_OK)
2093 goto ret;
2094
2095 UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2096 if (hash_data.cache->param.pair_limit)
2097 {
2098 ticks= UdmStartTimer();
2099 rc= UdmConvertPairs(A, cache, &CL, &hash_data);
2100 cache->Stats.pairs+= UdmStartTimer() - ticks;
2101 }
2102 UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2103
2104 ticks= UdmStartTimer();
2105 UdmInvertedIndexCoordListSort(&CL);
2106
2107 UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2108 cache->Stats.sort_wordlist+= UdmStartTimer() - ticks;
2109 ticks= UdmStartTimer();
2110 rc= UdmInvertedIndexCoordList2InvertedIndexCache(A, &CL, CWL, &hash_data, cache);
2111 cache->Stats.conv+= UdmStartTimer() - ticks;
2112 UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2113
2114 if (rc != UDM_OK)
2115 goto ret;
2116
2117 ret:
2118 ticks= UdmStartTimer();
2119 UdmInvertedIndexCoordListFree(&CL);
2120 cache->Stats.conv+= UdmStartTimer() - ticks;
2121 return rc;
2122 }
2123
2124
2125 static udm_rc_t
UdmInvertedIndexCacheDocParse(UDM_AGENT * Indexer,urlid_t url_id,UDM_DOCUMENT * Doc,UDM_INVERTED_INDEX_CACHE * cache)2126 UdmInvertedIndexCacheDocParse(UDM_AGENT *Indexer,
2127 urlid_t url_id,
2128 UDM_DOCUMENT *Doc,
2129 UDM_INVERTED_INDEX_CACHE *cache)
2130 {
2131 UDM_CONSTWORDLIST CWL;
2132 UDM_CHARSET *doccs= UdmDocDetectCachedCharset(Indexer, Doc, url_id);
2133 UDM_CHARSET *metacs;
2134 udm_rc_t rc;
2135
2136 UdmConstWordListInit(&CWL);
2137 UdmDocToConstWordList(Indexer, Doc, &CWL, doccs,
2138 cache->param.cnvflags,
2139 cache->param.aggregate_section_flags,
2140 &cache->Stats.parse,
2141 &cache->Stats.prepare_words);
2142 metacs= UdmVarListFindCharset(&Doc->Sections, "Strong-Meta-Charset", doccs);
2143 rc= UdmWordListSortAndConvert(Indexer, &CWL, cache, url_id, metacs);
2144 UdmConstWordListFree(&CWL);
2145 return rc;
2146 }
2147
2148
2149
/*
  Multi-row INSERT tuning.
  The replacement lists are parenthesized so that the macros expand
  correctly inside any expression (e.g. "x % SIZE" or "x / SIZE").
*/
/* Initial allocation size of the multi-insert buffer */
#define UDM_BLOB2_MULTI_INSERT_ALLOC_SIZE (32*1024*1024)
/* The multi-insert buffer is sent to the database when it grows above this */
#define UDM_BLOB2_MULTI_INSERT_FLUSH_SIZE (32*1024*1024)
/* Only packed records shorter than this go through multi-insert */
#define UDM_BLOB2_MILTI_INSERT_WORD_SIZE  (1024)
2153
2154 static udm_rc_t
UdmSendMultiInsertQuery(UDM_AGENT * A,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_DSTR * multi)2155 UdmSendMultiInsertQuery(UDM_AGENT *A, UDM_DB *db,
2156 UDM_INVERTED_INDEX_STATS *Stats, UDM_DSTR *multi)
2157 {
2158 udm_timer_t ticks= UdmStartTimer();
2159 udm_rc_t rc= UdmDBSQLQuery(A, db, NULL, UdmDSTRPtr(multi));
2160 Stats->send_multi+= UdmStartTimer() - ticks;
2161 return rc;
2162 }
2163
2164
2165 /* TODO34: add compression */
2166 static udm_rc_t
UdmInvertedIndexCacheStoreWord(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_DSTR * dstr,UDM_DSTR * multi,UDM_INVERTED_INDEX_CACHE_ITEM * Items,size_t nitems)2167 UdmInvertedIndexCacheStoreWord(UDM_AGENT *Indexer, UDM_DB *db,
2168 UDM_INVERTED_INDEX_STATS *Stats,
2169 const char *wtable,
2170 UDM_DSTR *dstr,
2171 UDM_DSTR *multi,
2172 UDM_INVERTED_INDEX_CACHE_ITEM *Items,
2173 size_t nitems)
2174 {
2175 size_t i;
2176 udm_timer_t ticks= UdmStartTimer();
2177
2178 UdmDSTRReset(dstr);
2179 for (i= 0; i < nitems; i++)
2180 {
2181 UDM_INVERTED_INDEX_CACHE_ITEM *Item= &Items[i];
2182 uint4 wordlen= strlen(Item->ptr) + 1;
2183 if (!UdmDSTRAppendINT4(dstr, Item->url_id) ||
2184 !UdmDSTRAppend(dstr, Item->ptr + wordlen,
2185 Item->length - wordlen))
2186 {
2187 UdmLog(Indexer, UDM_LOG_ERROR, "UdmDSTRAppend failed");
2188 return UDM_ERROR;
2189 }
2190 }
2191 Stats->pack+= UdmStartTimer() - ticks;
2192
2193 /*
2194 fprintf(stderr, "secno=%d ndocs=%-6d ncoords=%-6d datalen=%-6d %s\n",
2195 Items->secno, (int) nitems, (int) ncoords, (int) dstr->size_data, Items->word);
2196 */
2197 /*
2198 TODO: Multi-insert for MyODBC. Needs proper escaping.
2199 */
2200 if (UdmSQLDBDriver(db) == UDM_DBAPI_MYSQL &&
2201 UdmDSTRLength(dstr) < UDM_BLOB2_MILTI_INSERT_WORD_SIZE)
2202 {
2203 ticks= UdmStartTimer();
2204 if (!UdmDSTRLength(multi))
2205 UdmDSTRAppendf(multi, "INSERT INTO %s VALUES('", wtable);
2206 else
2207 UdmDSTRAppend(multi, UDM_CSTR_WITH_LEN(",('"));
2208 UdmDSTRAppend(multi, Items->ptr, strlen(Items->ptr));
2209 UdmDSTRAppendf(multi,"',%d,", Items->secno);
2210 if (0)
2211 {
2212 UdmDSTRAppend(multi, "0x", 2);
2213 UdmDSTRAppendHex(multi, UdmDSTRPtr(dstr), UdmDSTRLength(dstr));
2214 }
2215 else
2216 {
2217 size_t esclen;
2218 UdmDSTRAppend(multi, "'", 1);
2219 if (UDM_OK != UdmDSTRRealloc(multi, UdmDSTRLength(multi) +
2220 2 * UdmDSTRLength(dstr)))
2221 return UDM_ERROR;
2222 esclen= UdmDBSQLEscStr(Indexer, db, multi->Val.str + multi->Val.length,
2223 UdmDSTRPtr(dstr), UdmDSTRLength(dstr));
2224 multi->Val.length+= esclen;
2225 UdmDSTRAppend(multi, "'", 1);
2226 }
2227 UdmDSTRAppend(multi, UDM_CSTR_WITH_LEN(")"));
2228 Stats->send_multi+= UdmStartTimer() - ticks;
2229 if (UdmDSTRLength(multi) > UDM_BLOB2_MULTI_INSERT_FLUSH_SIZE)
2230 {
2231 if (UDM_OK != UdmSendMultiInsertQuery(Indexer, db, Stats, multi))
2232 return UDM_ERROR;
2233 UdmDSTRReset(multi);
2234 }
2235 }
2236 else
2237 {
2238 ticks= UdmStartTimer();
2239 if (UDM_OK != UdmBlobWriteWordUsingBind(Indexer, db, wtable,
2240 Items->ptr, Items->secno,
2241 UdmDSTRPtr(dstr),
2242 UdmDSTRLength(dstr),
2243 NULL, 0))
2244 {
2245 UdmLog(Indexer, UDM_LOG_ERROR, "%s", UdmDBSQLError(db));
2246 return UDM_ERROR;
2247 }
2248 Stats->send+= UdmStartTimer() - ticks;
2249 }
2250
2251 return UDM_OK;
2252 }
2253
2254
2255 static udm_rc_t
UdmInvertedIndexCacheStoreUsingBind(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_INVERTED_INDEX_CACHE_PART * cache,UDM_DSTR * dstr,UDM_DSTR * multi)2256 UdmInvertedIndexCacheStoreUsingBind(UDM_AGENT *Indexer, UDM_DB *db,
2257 UDM_INVERTED_INDEX_STATS *Stats,
2258 const char *wtable,
2259 UDM_INVERTED_INDEX_CACHE_PART *cache,
2260 UDM_DSTR *dstr, UDM_DSTR *multi)
2261 {
2262 UDM_INVERTED_INDEX_CACHE_ITEM *prev;
2263 udm_rc_t rc;
2264 size_t i;
2265
2266 if (cache->nitems == 0)
2267 return UDM_OK;
2268
2269 if (UDM_OK != (rc= UdmBlobWriteWordPrepare(Indexer, db, wtable)))
2270 return rc;
2271
2272 for (prev= &cache->Item[0], i= 0; i <= cache->nitems; i++)
2273 {
2274 UDM_INVERTED_INDEX_CACHE_ITEM *Item= &cache->Item[i];
2275 if (i == cache->nitems ||
2276 strcmp(prev->ptr, Item->ptr) ||
2277 prev->secno != Item->secno)
2278 {
2279 if (UDM_OK != (rc= UdmInvertedIndexCacheStoreWord(Indexer, db,
2280 Stats,
2281 wtable,
2282 dstr, multi,
2283 prev, Item - prev)))
2284 break;
2285 prev= Item;
2286 }
2287 }
2288 UdmDBSQLStmtFree(Indexer, db);
2289 if (UdmDSTRLength(multi) &&
2290 UDM_OK != UdmSendMultiInsertQuery(Indexer, db, Stats, multi))
2291 return UDM_ERROR;
2292 return rc;
2293 }
2294
2295
2296 /**
2297 Store a sorted part
2298 */
2299 static udm_rc_t
UdmInvertedIndexCachePartStore(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_INVERTED_INDEX_CACHE_PART * part)2300 UdmInvertedIndexCachePartStore(UDM_AGENT *Indexer, UDM_DB *db,
2301 UDM_INVERTED_INDEX_STATS *Stats,
2302 const char *wtable,
2303 UDM_INVERTED_INDEX_CACHE_PART *part)
2304 {
2305 udm_rc_t rc= UDM_OK;
2306 UDM_DSTR dstr, multi;
2307 UdmDSTRInit(&dstr, 1024);
2308 UdmDSTRInit(&multi, UDM_BLOB2_MULTI_INSERT_ALLOC_SIZE);
2309 UdmLog(Indexer, UDM_LOG_DEBUG, "Writing word index cache part");
2310 rc= UdmInvertedIndexCacheStoreUsingBind(Indexer, db, Stats,
2311 wtable, part, &dstr, &multi);
2312 UdmDSTRFree(&multi);
2313 UdmDSTRFree(&dstr);
2314 return rc;
2315 }
2316
2317
2318 /**
2319 Sort one cache part
2320 */
2321 static void
UdmInvertedIndexCacheSortPart(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,size_t partno)2322 UdmInvertedIndexCacheSortPart(UDM_AGENT *A,
2323 UDM_INVERTED_INDEX_CACHE *cache,
2324 size_t partno)
2325 {
2326 udm_timer_t ticks= UdmStartTimer();
2327 UDM_INVERTED_INDEX_CACHE_PART *part= &cache->Item[partno];
2328 UdmLog(A, UDM_LOG_DEBUG,
2329 "Sorting word index cache part%02d: %d records",
2330 (int) partno, (int) part->nitems);
2331 UdmInvertedIndexCachePartSort(part);
2332 cache->Stats.sort+= UdmStartTimer() - ticks;
2333 }
2334
2335
/*
  State shared by all cache sorting threads.
*/
typedef struct
{
  size_t partno;                   /* Next part to hand out; threads read and
                                      increment it under UDM_LOCK_DOC_CACHE */
  UDM_INVERTED_INDEX_CACHE *cache; /* The cache whose parts are being sorted  */
} UDM_INDEXCACHESORT_SHARE;
2341
2342
/*
  Per-thread argument of IndexCacheSortThread().
*/
typedef struct
{
  UDM_AGENT Agent;                 /* Agent owned by the thread               */
  void *thd;                       /* Thread handle: set by ThreadCreate,
                                      passed to ThreadJoin                    */
  UDM_INDEXCACHESORT_SHARE *share; /* State shared between all sort threads   */
} UDM_INDEXCACHESORT_PARAM;
2349
2350
2351
2352 static
IndexCacheSortThread(void * arg)2353 void *IndexCacheSortThread(void *arg)
2354 {
2355 UDM_INDEXCACHESORT_PARAM *param= (UDM_INDEXCACHESORT_PARAM *) arg;
2356 udm_bool_t done= UDM_FALSE;
2357 for ( ; ; )
2358 {
2359 size_t partno;
2360 UDM_GETLOCK(¶m->Agent, UDM_LOCK_DOC_CACHE);
2361 partno= param->share->partno;
2362 if (param->share->partno < param->share->cache->nitems)
2363 param->share->partno++;
2364 else
2365 done= UDM_TRUE;
2366 UDM_RELEASELOCK(¶m->Agent, UDM_LOCK_DOC_CACHE);
2367 if (done)
2368 break;
2369 UdmInvertedIndexCacheSortPart(¶m->Agent, param->share->cache, partno);
2370 }
2371 return NULL;
2372 }
2373
2374
2375 static udm_rc_t
UdmInvertedIndexCacheSortThreaded(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,size_t nthreads)2376 UdmInvertedIndexCacheSortThreaded(UDM_AGENT *A,
2377 UDM_INVERTED_INDEX_CACHE *cache,
2378 size_t nthreads)
2379 {
2380 size_t i;
2381 udm_timer_t ticks= UdmStartTimer();
2382 UDM_INDEXCACHESORT_SHARE share;
2383 UDM_INDEXCACHESORT_PARAM params[UDM_INDEXER_THREADS_MAX];
2384
2385 share.cache= cache;
2386 share.partno= 0;
2387 if (nthreads > cache->nitems)
2388 nthreads= cache->nitems;
2389
2390 UdmLog(A, UDM_LOG_DEBUG, "Sorting cache");
2391 for (i= 0; i < nthreads; i++)
2392 {
2393 UDM_INDEXCACHESORT_PARAM *param= ¶ms[i];
2394 UdmAgentInit(¶m->Agent, A->Conf, i + 1);
2395 param->thd= NULL;
2396 param->share= &share;
2397 A->Conf->THDHandler.ThreadCreate(¶m->thd, IndexCacheSortThread, (void*) param);
2398 }
2399 for (i= 0; i < nthreads; i++)
2400 {
2401 void *thd= params[i].thd;
2402 A->Conf->THDHandler.ThreadJoin(thd);
2403 }
2404
2405 UdmLog(A, UDM_LOG_DEBUG, "Sorting threads done: %.2f", UdmStopTimer(&ticks));
2406 for (i= 0 ; i < nthreads; i++)
2407 {
2408 UDM_INDEXCACHESORT_PARAM *param= ¶ms[i];
2409 UdmAgentFree(¶m->Agent);
2410 }
2411 return UDM_OK;
2412 }
2413
2414
2415 /**
2416 Sort all cache parts
2417 */
2418 static void
UdmInvertedIndexCacheSort(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache)2419 UdmInvertedIndexCacheSort(UDM_AGENT *A, UDM_INVERTED_INDEX_CACHE *cache)
2420 {
2421 size_t nthreads= UdmVarListFindInt(&A->Conf->Vars, "IndexerThreads", 0);
2422 if (nthreads && A->Conf->THDHandler.ThreadCreate)
2423 {
2424 UdmInvertedIndexCacheSortThreaded(A, cache, nthreads);
2425 }
2426 else
2427 {
2428 size_t i;
2429 for (i= 0; i < cache->nitems; i++)
2430 UdmInvertedIndexCacheSortPart(A, cache, i);
2431 }
2432 }
2433
2434
2435 static udm_rc_t
UdmInvertedIndexCacheStore(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_INVERTED_INDEX_CACHE * cache)2436 UdmInvertedIndexCacheStore(UDM_AGENT *Indexer,
2437 UDM_DB *db, const char *wtable,
2438 UDM_INVERTED_INDEX_CACHE *cache)
2439 {
2440 int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
2441 size_t i;
2442 UdmLog(Indexer, UDM_LOG_EXTRA,
2443 "Storing word index cache: %lld bytes",
2444 (unsigned long long) UdmInvertedIndexCacheEstimateUsedMemory(cache));
2445 UdmInvertedIndexCacheSort(Indexer, cache);
2446 for (i= 0; i < cache->nitems; i++)
2447 {
2448 udm_rc_t rc;
2449 UDM_INVERTED_INDEX_CACHE_PART *Item= &cache->Item[i];
2450 if (!Item->nitems)
2451 continue;
2452 if (tr && (UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))))
2453 return rc;
2454 if (UDM_OK != (rc= UdmInvertedIndexCachePartStore(Indexer, db,
2455 &cache->Stats,
2456 wtable, Item)))
2457 return rc;
2458 if (tr && (UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
2459 return rc;
2460 }
2461 return UDM_OK;
2462 }
2463
2464
2465 static void
UdmInvertedIndexCacheReportStatistics(UDM_AGENT * Indexer,UDM_INVERTED_INDEX_STATS * Stats)2466 UdmInvertedIndexCacheReportStatistics(UDM_AGENT *Indexer,
2467 UDM_INVERTED_INDEX_STATS *Stats)
2468 {
2469 UdmLog(Indexer, UDM_LOG_INFO, "Indexing statistics:");
2470 UdmLog(Indexer, UDM_LOG_INFO, "- Loading cached copies: %.2f (%llu bytes)", (double) Stats->load / 1000, Stats->bytes_loaded);
2471 if (Stats->unpack_cached_copy)
2472 UdmLog(Indexer, UDM_LOG_INFO, "- Unpacking cached copies: %.2f", (double) Stats->unpack_cached_copy / 1000);
2473 UdmLog(Indexer, UDM_LOG_INFO, "- Parsing documents: %.2f", (double) Stats->parse / 1000);
2474 UdmLog(Indexer, UDM_LOG_INFO, "- Breaking sections to words: %.2f", (double) Stats->prepare_words / 1000);
2475 UdmLog(Indexer, UDM_LOG_INFO, "- Sorting word list: %.2f", (double) Stats->sort_wordlist / 1000);
2476 UdmLog(Indexer, UDM_LOG_INFO, "- Groupping words: %.2f", (double) Stats->conv / 1000);
2477 if (Stats->pairs)
2478 UdmLog(Indexer, UDM_LOG_INFO, "- Making pairs: %.2f", (double) Stats->pairs / 1000);
2479 UdmLog(Indexer, UDM_LOG_INFO, "- Sorting words: %.2f", (double) Stats->sort / 1000);
2480 UdmLog(Indexer, UDM_LOG_INFO, "- Packing words: %.2f", (double) Stats->pack / 1000);
2481 UdmLog(Indexer, UDM_LOG_INFO, "- Sending words: %.2f", (double) (Stats->send + Stats->send_multi) / 1000);
2482 if (Stats->send_multi)
2483 {
2484 UdmLog(Indexer, UDM_LOG_EXTRA, " +- Sending words: %.2f (MultiInsert)", (double) Stats->send_multi / 1000);
2485 UdmLog(Indexer, UDM_LOG_EXTRA, " +- Sending words: %.2f (SQLExec)", (double) Stats->send / 1000);
2486 }
2487 }
2488
2489
2490 static udm_rc_t
UdmSQLFetchRowCachedCopy(UDM_AGENT * A,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_SQLRES * SQLRes,UDM_STR * row)2491 UdmSQLFetchRowCachedCopy(UDM_AGENT *A, UDM_DB *db,
2492 UDM_INVERTED_INDEX_STATS *Stats,
2493 UDM_SQLRES *SQLRes, UDM_STR *row)
2494 {
2495 udm_timer_t ticks= UdmStartTimer();
2496 udm_rc_t rc= UdmDBSQLFetchRow(A, db, SQLRes, row);
2497 Stats->load+= UdmStartTimer() - ticks;
2498 return rc;
2499 }
2500
2501
2502 static udm_rc_t
UdmParseRowCachedCopy(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,UDM_STR * row,UDM_URLDATALIST * URLList,size_t max_doc_size)2503 UdmParseRowCachedCopy(UDM_AGENT *Indexer, UDM_DB *db,
2504 UDM_INVERTED_INDEX_CACHE *cache,
2505 UDM_STR *row, UDM_URLDATALIST *URLList,
2506 size_t max_doc_size)
2507 {
2508 udm_rc_t rc;
2509 UDM_DOCUMENT Doc;
2510 urlid_t url_id= atoi(row[0].str);
2511 UDM_URLDATA *urldata;
2512 if (!(urldata= UdmURLDataListSearch(URLList, url_id)))
2513 {
2514 UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCreate: url_id=%d not found", url_id);
2515 return UDM_OK;
2516 }
2517 urldata->score++;
2518 UdmDocInit(&Doc);
2519 Doc.lcs= Indexer->Conf->lcs;
2520 if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc,
2521 row[1].str, row[1].length,
2522 max_doc_size,
2523 &cache->Stats.unpack_cached_copy)))
2524 {
2525 UdmLog(Indexer, UDM_LOG_DEBUG,
2526 "UdmDocSetFromCachedHTTPResponse() failed: url_id=%d len=%d",
2527 url_id, (int) row[1].length);
2528 rc= UDM_OK;
2529 goto end;
2530 }
2531 cache->Stats.bytes_loaded+= UdmHTTPBufSize(&Doc.Buf);
2532 if (UDM_OK != (rc= UdmInvertedIndexCacheDocParse(Indexer, urldata->url_id,
2533 &Doc, cache)))
2534 UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCacheDocParse() failed");
2535
2536 end:
2537 UdmDocFree(&Doc);
2538 return rc;
2539 }
2540
2541
2542 /***** Cache for Cached copies *********************************************/
/*
  One cached copy waiting to be indexed.
*/
typedef struct
{
  UDM_STR content; /* Heap copy of the cached copy bytes, made by
                      UdmDocCacheItemInit(), released by UdmDocCacheItemFree() */
  urlid_t url_id;  /* ID of the document the content belongs to                */
} UDM_DOCCACHEITEM;
2548
2549
2550 static udm_rc_t
UdmDocCacheItemInit(UDM_DOCCACHEITEM * Item,const UDM_CONST_STR * content,urlid_t url_id)2551 UdmDocCacheItemInit(UDM_DOCCACHEITEM *Item,
2552 const UDM_CONST_STR *content,
2553 urlid_t url_id)
2554 {
2555 UDM_ASSERT(content->length > 0);
2556 if (!(Item->content.str= UdmMalloc(content->length)))
2557 return UDM_ERROR;
2558 Item->content.length= content->length;
2559 memcpy(Item->content.str, content->str, content->length);
2560 Item->url_id= url_id;
2561 return UDM_OK;
2562 }
2563
2564
2565 static void
UdmDocCacheItemFree(UDM_DOCCACHEITEM * Item)2566 UdmDocCacheItemFree(UDM_DOCCACHEITEM *Item)
2567 {
2568 UdmFree(Item->content.str);
2569 }
2570
2571
/*
  Growable array of document cache items.
*/
typedef struct
{
  size_t nitems;          /* Number of items in use                     */
  size_t mitems;          /* Number of items the array has room for     */
  UDM_DOCCACHEITEM *Item; /* The array; grown by UdmDocCacheListRealloc */
} UDM_DOCCACHELIST;
2578
2579
2580 static void
UdmDocCacheListInit(UDM_DOCCACHELIST * List)2581 UdmDocCacheListInit(UDM_DOCCACHELIST *List)
2582 {
2583 bzero((void*) List, sizeof(*List));
2584 }
2585
2586
2587 static udm_rc_t
UdmDocCacheListRealloc(UDM_DOCCACHELIST * List)2588 UdmDocCacheListRealloc(UDM_DOCCACHELIST *List)
2589 {
2590 if (List->nitems >= List->mitems)
2591 {
2592 size_t mitems2= List->nitems + 256;
2593 if (!(List->Item= UdmRealloc(List->Item, mitems2 * sizeof(List->Item[0]))))
2594 return UDM_ERROR;
2595 List->mitems= mitems2;
2596 }
2597 return UDM_OK;
2598 }
2599
2600
2601 static udm_rc_t
UdmDocCacheListAdd(UDM_DOCCACHELIST * List,UDM_DOCCACHEITEM * Item)2602 UdmDocCacheListAdd(UDM_DOCCACHELIST *List, UDM_DOCCACHEITEM *Item)
2603 {
2604 if (UDM_OK != UdmDocCacheListRealloc(List))
2605 return UDM_ERROR;
2606 List->Item[List->nitems++]= *Item;
2607 return UDM_OK;
2608 }
2609
2610
2611 static void
UdmDocCacheListFree(UDM_DOCCACHELIST * List)2612 UdmDocCacheListFree(UDM_DOCCACHELIST *List)
2613 {
2614 size_t i;
2615 for (i= 0; i < List->nitems; i++)
2616 UdmDocCacheItemFree(&List->Item[i]);
2617 UdmFree(List->Item);
2618 }
2619
2620
/*
  Cached copies collected for threaded indexing.
*/
typedef struct
{
  UDM_DOCCACHELIST DocList; /* The collected documents                       */
  udm_uint8 nbytes;         /* Total content length of the collected
                               documents                                     */
  size_t cur_item;          /* Index of the next document to be indexed;
                               advanced under UDM_LOCK_DOC_CACHE by
                               UdmDocCacheIndexOneDocument()                 */
} UDM_DOCCACHE;
2627
2628
/**
  Initialize an empty document cache.
*/
static void UdmDocCacheInit(UDM_DOCCACHE *Cache)
{
  Cache->cur_item= 0;
  Cache->nbytes= 0;
  UdmDocCacheListInit(&Cache->DocList);
}
2635
2636
/**
  Free all documents of the cache.
  The counters (nbytes, cur_item) are left unchanged.
*/
static void UdmDocCacheFree(UDM_DOCCACHE *Cache)
{
  UdmDocCacheListFree(&Cache->DocList);
}
2641
2642
/**
  Drop all documents and bring the cache back to the empty state,
  ready to collect the next batch.
*/
static void UdmDocCacheReset(UDM_DOCCACHE *Cache)
{
  /* Free the document list, then re-initialize the list and the counters */
  UdmDocCacheFree(Cache);
  UdmDocCacheInit(Cache);
}
2650
2651 /**********************************************************************/
2652
2653 static udm_rc_t
UdmRowCachedCopyToDocCache(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_STR * row,UDM_DOCCACHE * DocCache,UDM_URLDATALIST * URLList)2654 UdmRowCachedCopyToDocCache(UDM_AGENT *Indexer, UDM_DB *db,
2655 UDM_INVERTED_INDEX_STATS *Stats,
2656 UDM_STR *row,
2657 UDM_DOCCACHE *DocCache,
2658 UDM_URLDATALIST *URLList)
2659 {
2660 urlid_t url_id= atoi(row[0].str);
2661 UDM_URLDATA *urldata;
2662 UDM_CONST_STR content;
2663 UDM_DOCCACHEITEM DocCacheItem;
2664 if (!(urldata= UdmURLDataListSearch(URLList, url_id)))
2665 {
2666 UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCreate: url_id=%d not found", url_id);
2667 return UDM_OK;
2668 }
2669 urldata->score++;
2670 UdmConstStrSet(&content, row[1].str, row[1].length);
2671 if (UDM_OK != UdmDocCacheItemInit(&DocCacheItem, &content, url_id))
2672 return UDM_ERROR;
2673 if (UdmDocCacheListAdd(&DocCache->DocList, &DocCacheItem))
2674 return UDM_ERROR;
2675 DocCache->nbytes+= content.length;
2676 return UDM_OK;
2677 }
2678
2679
2680 static udm_rc_t
UdmCreateContentQueryUsingURLIdLoop(UDM_DSTR * query,const char * fmt,UDM_URLDATALIST * URLList,size_t offs,size_t nrows,const UDM_CONST_STR ihint)2681 UdmCreateContentQueryUsingURLIdLoop(UDM_DSTR *query, const char *fmt,
2682 UDM_URLDATALIST *URLList, size_t offs,
2683 size_t nrows,
2684 const UDM_CONST_STR ihint)
2685 {
2686 UdmDSTRReset(query);
2687 for ( ; *fmt; )
2688 {
2689 if (!strncasecmp(fmt, UDM_CSTR_WITH_LEN("${where}")))
2690 {
2691 size_t j;
2692 fmt+= 8;
2693 UdmDSTRAppend(query, UDM_CSTR_WITH_LEN(" WHERE url_id IN("));
2694 for (j=0; j < nrows; j++)
2695 {
2696 if (j > 0)
2697 UdmDSTRAppend(query, ",", 1);
2698 UdmDSTRAppendf(query, "%d", URLList->Item[offs + j].url_id);
2699 }
2700 UdmDSTRAppend(query, UDM_CSTR_WITH_LEN(")"));
2701 }
2702 else if (!strncasecmp(fmt, UDM_CSTR_WITH_LEN("${ihint}")))
2703 {
2704 fmt+= 8;
2705 UdmDSTRAppend(query, ihint.str, ihint.length);
2706 }
2707 else
2708 UdmDSTRAppend(query, fmt++, 1);
2709 }
2710 return UDM_OK;
2711 }
2712
2713
/*
  Template of the query loading cached copies.
  "${ihint}" and "${where}" are the placeholders expanded by
  UdmCreateContentQueryUsingURLIdLoop().
*/
#define DEFAULT_CONTENT_FMT "SELECT url_id,content FROM cachedcopy${ihint}${where}"
2715
2716
2717 static size_t
in_limit(UDM_DB * db)2718 in_limit(UDM_DB *db)
2719 {
2720 if (UdmSQLDBType(db)== UDM_DB_ORACLE8)
2721 return 1000; /* TODO34: move to UDM_SQLDB_DRIVER */
2722 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
2723 return 10*1024;
2724 if (UdmSQLDBType(db) == UDM_DB_IBASE)
2725 return 1500;
2726 return 2000;
2727 }
2728
2729
2730 static udm_rc_t
UdmDocCacheIndexOneDocument(UDM_AGENT * Indexer,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t max_doc_size)2731 UdmDocCacheIndexOneDocument(UDM_AGENT *Indexer,
2732 UDM_DOCCACHE *Docs,
2733 UDM_INVERTED_INDEX_CACHE *cache,
2734 size_t max_doc_size)
2735 {
2736 udm_rc_t rc;
2737 UDM_DOCCACHEITEM *Doc;
2738 UDM_DOCUMENT Doc2;
2739
2740 UDM_GETLOCK(Indexer, UDM_LOCK_DOC_CACHE);
2741 if (Docs->cur_item < Docs->DocList.nitems)
2742 {
2743 Doc= &Docs->DocList.Item[Docs->cur_item];
2744 Docs->cur_item++;
2745 }
2746 else
2747 {
2748 Doc= NULL;
2749 }
2750 UDM_RELEASELOCK(Indexer, UDM_LOCK_DOC_CACHE);
2751 if (!Doc)
2752 return UDM_NOTARGET;
2753
2754 UdmDocInit(&Doc2);
2755 Doc2.lcs= Indexer->Conf->lcs;
2756 if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc2,
2757 Doc->content.str,
2758 Doc->content.length,
2759 max_doc_size,
2760 &cache->Stats.unpack_cached_copy)))
2761 {
2762 UdmLog(Indexer, UDM_LOG_DEBUG,
2763 "UdmDocSetFromCachedHTTPResponse() failed: url_id=%d len=%d",
2764 Doc->url_id, (int) Doc->content.length);
2765 rc= UDM_OK;
2766 goto end;
2767 }
2768 cache->Stats.bytes_loaded+= UdmHTTPBufSize(&Doc2.Buf);
2769
2770 if (UDM_OK != (rc= UdmInvertedIndexCacheDocParse(Indexer, Doc->url_id,
2771 &Doc2, cache)))
2772 UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCacheDocParse() failed");
2773
2774 end:
2775 UdmDocFree(&Doc2);
2776 return rc;
2777 }
2778
2779
2780 static udm_rc_t
UdmInvertedIndexCacheParse(UDM_AGENT * Indexer,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t max_doc_size)2781 UdmInvertedIndexCacheParse(UDM_AGENT *Indexer,
2782 UDM_DOCCACHE *Docs,
2783 UDM_INVERTED_INDEX_CACHE *cache,
2784 size_t max_doc_size)
2785 {
2786 int rc;
2787 while (UDM_OK == (rc= UdmDocCacheIndexOneDocument(Indexer, Docs, cache,
2788 max_doc_size)))
2789 { }
2790 return rc == UDM_NOTARGET ? UDM_OK : rc;
2791 }
2792
2793
/*
  Per-thread argument of IndexerThread().
*/
typedef struct indexer_param_st
{
  UDM_AGENT Agent;                 /* Agent owned by the thread                */
  UDM_DOCCACHE *Docs;              /* Documents to index, shared by threads    */
  UDM_INVERTED_INDEX_CACHE *cache; /* Destination cache, shared by threads     */
  size_t max_doc_size;             /* Passed down to the document unpacker     */
  udm_rc_t rc;                     /* Result of UdmInvertedIndexCacheParse(),
                                      stored by the thread                     */
  void *thd;                       /* Thread handle: set by ThreadCreate,
                                      passed to ThreadJoin                     */
} UDM_INDEXER_PARAM;
2803
2804
2805 static
IndexerThread(void * arg)2806 void *IndexerThread(void *arg)
2807 {
2808 UDM_INDEXER_PARAM *param= (UDM_INDEXER_PARAM *) arg;
2809 param->rc= UdmInvertedIndexCacheParse(¶m->Agent, param->Docs,
2810 param->cache,
2811 param->max_doc_size);
2812 return NULL;
2813 }
2814
2815
2816 static udm_rc_t
UdmInvertedIndexCacheParseThreaded(UDM_AGENT * A,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t nthreads)2817 UdmInvertedIndexCacheParseThreaded(UDM_AGENT *A,
2818 UDM_DOCCACHE *Docs,
2819 UDM_INVERTED_INDEX_CACHE *cache,
2820 size_t nthreads)
2821 {
2822 size_t i;
2823 udm_timer_t ticks= UdmStartTimer();
2824 size_t max_doc_size= UdmVarListFindInt(&A->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
2825 UDM_INDEXER_PARAM params[UDM_INDEXER_THREADS_MAX];
2826
2827 UdmLog(A, UDM_LOG_DEBUG, " Indexing %d docs", (int) Docs->DocList.nitems);
2828 if (!Docs->DocList.nitems)
2829 return UDM_OK;
2830 if (nthreads > Docs->DocList.nitems)
2831 nthreads= Docs->DocList.nitems;
2832 for (i= 0; i < nthreads; i++)
2833 {
2834 UDM_INDEXER_PARAM *param= ¶ms[i];
2835 UdmAgentInit(¶m->Agent, A->Conf, i + 1);
2836 param->Docs= Docs;
2837 param->cache= cache;
2838 param->thd= NULL;
2839 param->max_doc_size= max_doc_size;
2840 A->Conf->THDHandler.ThreadCreate(¶m->thd, IndexerThread, (void*) param);
2841 }
2842 for (i= 0; i < nthreads; i++)
2843 {
2844 void *thd= params[i].thd;
2845 A->Conf->THDHandler.ThreadJoin(thd);
2846 }
2847
2848 UdmLog(A, UDM_LOG_DEBUG, " Threads finished: %.2f", UdmStopTimer(&ticks));
2849 for (i= 0 ; i < nthreads; i++)
2850 {
2851 UDM_INDEXER_PARAM *param= ¶ms[i];
2852 UdmAgentFree(¶m->Agent);
2853 }
2854 return UDM_OK;
2855 }
2856
2857
2858 static udm_rc_t
UdmIndertedIndexExecContentQuery(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,UDM_QUERY * Query,const char * qbuf)2859 UdmIndertedIndexExecContentQuery(UDM_AGENT *Indexer, UDM_DB *db,
2860 UDM_INVERTED_INDEX_CACHE *cache,
2861 UDM_QUERY *Query,
2862 const char *qbuf)
2863 {
2864 udm_rc_t rc= UDM_OK;
2865 udm_timer_t ticks1= UdmStartTimer();
2866 size_t max_doc_size= UdmVarListFindInt(&Indexer->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
2867 size_t nthreads= UdmVarListFindInt(&Indexer->Conf->Vars, "IndexerThreads", 0);
2868 UDM_SQLRES SQLRes;
2869 UDM_STR row[2];
2870 UDM_DOCCACHE DocCache;
2871
2872 if (UDM_OK != (rc= UdmDBSQLExecDirect(Indexer, db, &SQLRes, qbuf)))
2873 goto end;
2874 cache->Stats.load+= UdmStartTimer() - ticks1;
2875
2876 UdmDocCacheInit(&DocCache);
2877 while (UDM_OK == UdmSQLFetchRowCachedCopy(Indexer, db, &cache->Stats,
2878 &SQLRes, row))
2879 {
2880 if (!nthreads || !Indexer->Conf->THDHandler.ThreadCreate)
2881 {
2882 if (UDM_OK != (rc= UdmParseRowCachedCopy(Indexer, db, cache,
2883 row, &Query->URLData,
2884 max_doc_size)))
2885 break;
2886 }
2887 else
2888 {
2889 if (UDM_OK != (rc= UdmRowCachedCopyToDocCache(Indexer, db, &cache->Stats,
2890 row, &DocCache,
2891 &Query->URLData)))
2892 break;
2893 /*fprintf(stderr, "DOCCACHE=%.3f\n", (double) DocCache.nbytes / 1024 / 1024);*/
2894 if (DocCache.nbytes >= 32 * 1024 * 1024)
2895 {
2896 UdmInvertedIndexCacheParseThreaded(Indexer, &DocCache,
2897 cache, nthreads);
2898 UdmDocCacheReset(&DocCache);
2899 }
2900 }
2901 }
2902 UdmSQLFree(&SQLRes);
2903 if (DocCache.DocList.nitems)
2904 UdmInvertedIndexCacheParseThreaded(Indexer, &DocCache, cache, nthreads);
2905 UdmDocCacheFree(&DocCache);
2906
2907 end:
2908 return rc;
2909 }
2910
2911
2912 static udm_rc_t
UdmInvertedIndexCreateUsingURLIdLoop(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_QUERY * Query)2913 UdmInvertedIndexCreateUsingURLIdLoop(UDM_AGENT *Indexer,
2914 UDM_DB *db,
2915 UDM_INVERTED_INDEX_CACHE *cache,
2916 const char *wtable,
2917 UDM_QUERY *Query)
2918 {
2919 udm_rc_t rc= UDM_OK;
2920 UDM_DSTR query;
2921 UDM_CONST_STR ihint;
2922 UDM_URLDATALIST *URLList= &Query->URLData;
2923 size_t i;
2924 size_t ndocs_at_time= in_limit(db);
2925 udm_uint8 cache_size_limit= (udm_uint8) UdmVarListFindUnsigned(&Indexer->Conf->Vars,
2926 "IndexCacheSize",
2927 128*1024*1024);
2928 UdmConstStrInit(&ihint);
2929 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
2930 {
2931 UDM_SQLRES SQLRes;
2932 UdmDBSQLQuery(Indexer, db, &SQLRes, "SHOW CREATE TABLE cachedcopy");
2933 if (UdmSQLNumCols(&SQLRes) == 2 && UdmSQLNumRows(&SQLRes) == 1)
2934 {
2935 if (strstr(UdmSQLValue(&SQLRes, 0, 1), "PARTITION BY HASH (url_id"))
2936 UdmConstStrSet(&ihint, UDM_CSTR_WITH_LEN(" IGNORE INDEX (url_id)"));
2937 }
2938 UdmSQLFree(&SQLRes);
2939 }
2940 UdmDSTRInit(&query, ndocs_at_time * 8);
2941 for (i= 0; i < URLList->nitems; )
2942 {
2943 udm_uint8 cache_size;
2944 size_t nrows= UDM_MIN(ndocs_at_time, URLList->nitems - i);
2945 UdmLog(Indexer, UDM_LOG_DEBUG, "-- IDs %d-%d",
2946 URLList->Item[i].url_id, URLList->Item[i+nrows-1].url_id);
2947 if (UDM_OK != (rc= UdmCreateContentQueryUsingURLIdLoop(&query,
2948 DEFAULT_CONTENT_FMT,
2949 URLList, i, nrows,
2950 ihint)))
2951 goto end;
2952 if (UDM_OK != (rc= UdmIndertedIndexExecContentQuery(Indexer, db,
2953 cache, Query,
2954 UdmDSTRPtr(&query))))
2955 goto end;
2956 i+= nrows;
2957
2958 if ((cache_size= UdmInvertedIndexCacheEstimateUsedMemory(cache)) >
2959 cache_size_limit)
2960 {
2961 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
2962 UdmInvertedIndexCacheReset(cache);
2963 }
2964 }
2965
2966 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
2967 end:
2968 UdmDSTRFree(&query);
2969 return rc;
2970 }
2971
2972
2973 static udm_rc_t
UdmInvertedIndexCreateUsingGenericLoop(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_QUERY * Query,const char * field,size_t from,size_t to)2974 UdmInvertedIndexCreateUsingGenericLoop(UDM_AGENT *Indexer,
2975 UDM_DB *db,
2976 UDM_INVERTED_INDEX_CACHE *cache,
2977 const char *wtable,
2978 UDM_QUERY *Query,
2979 const char *field,
2980 size_t from, size_t to)
2981 {
2982 udm_rc_t rc= UDM_OK;
2983 size_t i;
2984 udm_uint8 cache_size_limit= (udm_uint8) UdmVarListFindInt(&Indexer->Conf->Vars,
2985 "IndexCacheSize",
2986 128*1024*1024);
2987 for (i= from; i < to; i++)
2988 {
2989 char qbuf[256];
2990 udm_uint8 cache_size;
2991 if (from + 1 != to)
2992 UdmLog(Indexer, UDM_LOG_DEBUG, "-- Part %d", (int) i);
2993 udm_snprintf(qbuf, sizeof(qbuf),
2994 "SELECT url_id,content FROM cachedcopy WHERE %s=%d",
2995 field, (int) i);
2996 if (UDM_OK != (rc= UdmIndertedIndexExecContentQuery(Indexer, db,
2997 cache, Query,
2998 qbuf)))
2999 goto end;
3000 if ((cache_size= UdmInvertedIndexCacheEstimateUsedMemory(cache)) >
3001 cache_size_limit)
3002 {
3003 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
3004 UdmInvertedIndexCacheReset(cache);
3005 }
3006 }
3007
3008 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
3009 end:
3010 return rc;
3011 }
3012
3013
3014 /*
3015 TODO34: Character set detection (crawl or index time?)
3016 TODO34: Segmenting (crawl or index time)?
3017 TODO34: CRC32 (crawl or index time?)
3018 TODO34: what to do with urlinfo sections?
3019 TODO34: Raw sections
3020 TODO34: Pairs
3021 TODO34: remove HAVE_ZLIB
3022 TODO34: In case of 'Mime application/msword "text/plain; charset=utf-8"',
3023 the charset= part is stored twice in urlinfob.
3024 */
3025 static udm_rc_t
UdmInvertedIndexCreate(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_QUERY * Query)3026 UdmInvertedIndexCreate(UDM_AGENT *Indexer, UDM_DB *db, const char *wtable,
3027 UDM_QUERY *Query)
3028 {
3029 udm_timer_t ticks= UdmStartTimer();
3030 udm_rc_t rc= UDM_OK;
3031 UDM_INVERTED_INDEX_CACHE cache;
3032
3033 UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3034 if (UDM_OK != UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3035 INVERTED_INDEX_CACHE_PARTS))
3036 return UDM_ERROR;
3037 UdmLog(Indexer, UDM_LOG_INFO, "Indexing document contents");
3038
3039 if (1)
3040 rc= UdmInvertedIndexCreateUsingURLIdLoop(Indexer, db, &cache, wtable, Query);
3041 else
3042 rc= UdmInvertedIndexCreateUsingGenericLoop(Indexer, db, &cache, wtable,
3043 Query, "0", 0, 1);
3044
3045 UdmLog(Indexer, UDM_LOG_INFO, "Freeing cache");
3046 {
3047 udm_timer_t ticks1= UdmStartTimer();
3048 UdmInvertedIndexCacheFree(&cache);
3049 UdmLog(Indexer, UDM_LOG_INFO, "Freeing cache done: %.2f", UdmStopTimer(&ticks1));
3050 }
3051 UdmLog(Indexer, UDM_LOG_INFO, "Indexing document contents done: %.2f", UdmStopTimer(&ticks));
3052 UdmInvertedIndexCacheReportStatistics(Indexer, &cache.Stats);
3053 return rc;
3054 }
3055
3056
/*
  Per-document state used while calculating link popularity.
*/
typedef struct
{
  size_t outgoing_link_count; /* Number of loaded links starting at this document */
  size_t incoming_link_count; /* Number of loaded links pointing to this document */
  double popularity0;         /* Current popularity value                         */
  double popularity1;         /* Accumulator for the next popularity value        */
  double server_weight;       /* Weight of the matching server, 1 if none found   */
  urlid_t url_id;             /* Document id, used as the binary search key       */
  uint4 score;                /* Copied from the UDM_URLDATA record               */
} UDM_URLPOPINFO;
3067
3068
/*
  Array of UDM_URLPOPINFO records.
  UdmURLPopInfoListSearch() does a binary search by url_id over it.
*/
typedef struct
{
  size_t mitems;          /* Number of allocated items */
  size_t nitems;          /* Number of used items      */
  UDM_URLPOPINFO *Item;   /* The items                 */
} UDM_URLPOPINFOLIST;
3075
3076
3077 static void
UdmURLPopInfoListInit(UDM_URLPOPINFOLIST * List)3078 UdmURLPopInfoListInit(UDM_URLPOPINFOLIST *List)
3079 {
3080 bzero((void*) List, sizeof(*List));
3081 }
3082
3083
3084 static udm_rc_t
UdmURLPopInfoListAlloc(UDM_URLPOPINFOLIST * List,size_t mitems)3085 UdmURLPopInfoListAlloc(UDM_URLPOPINFOLIST *List, size_t mitems)
3086 {
3087 if (!(List->Item= (UDM_URLPOPINFO *) UdmMalloc(mitems * sizeof(UDM_URLPOPINFO))))
3088 return UDM_ERROR;
3089 List->mitems= mitems;
3090 return UDM_OK;
3091 }
3092
3093
3094 static void
UdmURLPopInfoListFree(UDM_URLPOPINFOLIST * List)3095 UdmURLPopInfoListFree(UDM_URLPOPINFOLIST *List)
3096 {
3097 UdmFree(List->Item);
3098 }
3099
3100
3101 static int
cmp_data_urlpopinfo(UDM_URLPOPINFO * d1,UDM_URLPOPINFO * d2)3102 cmp_data_urlpopinfo(UDM_URLPOPINFO *d1, UDM_URLPOPINFO *d2)
3103 {
3104 if (d1->url_id > d2->url_id) return 1;
3105 if (d1->url_id < d2->url_id) return -1;
3106 return 0;
3107 }
3108
3109
3110 static UDM_URLPOPINFO *
UdmURLPopInfoListSearch(UDM_URLPOPINFOLIST * List,urlid_t id)3111 UdmURLPopInfoListSearch(UDM_URLPOPINFOLIST *List, urlid_t id)
3112 {
3113 UDM_URLPOPINFO d;
3114 void *found;
3115 if (!List->nitems)
3116 return 0;
3117 d.url_id= id;
3118 found= UdmBSearch(&d, List->Item, List->nitems, sizeof(UDM_URLPOPINFO),
3119 (udm_qsort_cmp) cmp_data_urlpopinfo);
3120 return (UDM_URLPOPINFO*) found;
3121 }
3122
3123
3124 /*
3125 Init URLPopInfoList from a sorted URLDataList.
3126 */
3127 static udm_rc_t
UdmURLPopInfoListInitFromURLDataList(UDM_AGENT * A,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_URLDATALIST * URLDataList)3128 UdmURLPopInfoListInitFromURLDataList(UDM_AGENT *A,
3129 UDM_URLPOPINFOLIST *URLPopInfoList,
3130 UDM_URLDATALIST *URLDataList)
3131 {
3132 size_t i;
3133 double r= 1e0 / (double) URLDataList->nitems;
3134 if (UDM_OK != UdmURLPopInfoListAlloc(URLPopInfoList, URLDataList->nitems))
3135 return UDM_ERROR;
3136 for (i= 0 ; i < URLDataList->nitems; i++)
3137 {
3138 UDM_URLPOPINFO *dst= &URLPopInfoList->Item[i];
3139 UDM_URLDATA *src= &URLDataList->Item[i];
3140 UDM_SERVER *Server= UdmServerFind(A, &A->Conf->Servers, src->url, NULL);
3141 dst->url_id= src->url_id;
3142 dst->score= src->score;
3143 dst->popularity0= r;
3144 dst->server_weight= (Server ? Server->weight : 1);
3145 dst->popularity1= 0;
3146 dst->outgoing_link_count= 0;
3147 dst->incoming_link_count= 0;
3148 /*fprintf(stderr, "[%.5f]%s\n", dst->popularity0, Server ? Server->Match.pattern : NULL);*/
3149 }
3150 URLPopInfoList->nitems= URLDataList->nitems;
3151 return UDM_OK;
3152 }
3153
3154
/*
  A link between two documents.
  Both members point to records of a UDM_URLPOPINFOLIST.
*/
typedef struct
{
  UDM_URLPOPINFO *from;  /* The document containing the link */
  UDM_URLPOPINFO *to;    /* The document the link points to  */
} UDM_LINKINFO;
3160
3161
/*
  Array of links with a fixed capacity.
*/
typedef struct
{
  size_t nitems;        /* Number of used items      */
  size_t mitems;        /* Number of allocated items */
  UDM_LINKINFO *Item;   /* The items                 */
} UDM_LINKINFOLIST;
3168
3169 static void
UdmLinkInfoListInit(UDM_LINKINFOLIST * List)3170 UdmLinkInfoListInit(UDM_LINKINFOLIST *List)
3171 {
3172 bzero((void*) List, sizeof(*List));
3173 }
3174
3175 static void
UdmLinkInfoListFree(UDM_LINKINFOLIST * List)3176 UdmLinkInfoListFree(UDM_LINKINFOLIST *List)
3177 {
3178 UdmFree(List->Item);
3179 }
3180
3181 static udm_rc_t
UdmLinkInfoListAlloc(UDM_LINKINFOLIST * List,size_t mitems)3182 UdmLinkInfoListAlloc(UDM_LINKINFOLIST *List, size_t mitems)
3183 {
3184 if (!(List->Item= (UDM_LINKINFO*) UdmMalloc(mitems * sizeof(UDM_LINKINFO))))
3185 return UDM_ERROR;
3186 List->mitems= mitems;
3187 return UDM_OK;
3188 }
3189
3190 static void
UdmLinkInfoListAdd(UDM_LINKINFOLIST * List,const UDM_LINKINFO * Item)3191 UdmLinkInfoListAdd(UDM_LINKINFOLIST *List, const UDM_LINKINFO *Item)
3192 {
3193 UDM_ASSERT(List->nitems < List->mitems);
3194 List->Item[List->nitems++]= *Item;
3195 }
3196
3197
/*
  Growable array of link lists.
*/
typedef struct
{
  size_t mitems;            /* Number of allocated items */
  size_t nitems;            /* Number of used items      */
  UDM_LINKINFOLIST *Item;   /* The items                 */
} UDM_LINKINFOLISTLIST;
3204
3205
3206 static void
UdmLinkInfoListListInit(UDM_LINKINFOLISTLIST * List)3207 UdmLinkInfoListListInit(UDM_LINKINFOLISTLIST *List)
3208 {
3209 bzero((void*) List, sizeof(*List));
3210 }
3211
3212 static void
UdmLinkInfoListListFree(UDM_LINKINFOLISTLIST * List)3213 UdmLinkInfoListListFree(UDM_LINKINFOLISTLIST *List)
3214 {
3215 size_t i;
3216 for (i= 0; i < List->nitems; i++)
3217 UdmLinkInfoListFree(&List->Item[i]);
3218 UdmFree(List->Item);
3219 }
3220
3221 static udm_rc_t
UdmLinkInfoListListRealloc(UDM_LINKINFOLISTLIST * List)3222 UdmLinkInfoListListRealloc(UDM_LINKINFOLISTLIST *List)
3223 {
3224 if (List->nitems >= List->mitems)
3225 {
3226 size_t mitems= List->mitems + 256;
3227 size_t nbytes= mitems * sizeof(UDM_LINKINFOLIST);
3228 if (!(List->Item= (UDM_LINKINFOLIST*) UdmRealloc(List->Item, nbytes)))
3229 return UDM_ERROR;
3230 List->mitems= mitems;
3231 }
3232 return UDM_OK;
3233 }
3234
3235 static udm_rc_t
UdmLinkInfoListListAdd(UDM_LINKINFOLISTLIST * List,const UDM_LINKINFOLIST * Item)3236 UdmLinkInfoListListAdd(UDM_LINKINFOLISTLIST *List, const UDM_LINKINFOLIST *Item)
3237 {
3238 if (UDM_OK != UdmLinkInfoListListRealloc(List))
3239 return UDM_ERROR;
3240 List->Item[List->nitems++]= *Item;
3241 return UDM_OK;
3242 }
3243
3244
/*
  A link together with its text.
*/
typedef struct
{
  UDM_CONST_STR text;  /* Link text: value and length of the "linktext" column */
  UDM_LINKINFO link;   /* Source and destination documents of the link         */
} UDM_LINKTEXT;
3250
/*
  Array of link texts.
*/
typedef struct
{
  size_t nitems;        /* Number of used items                  */
  size_t mitems;        /* Set by UdmLinkTextListAlloc()         */
  UDM_LINKTEXT *Item;   /* The items                             */
} UDM_LINKTEXTLIST;
3257
3258 static int
linktextcmp(const UDM_LINKTEXT * a,const UDM_LINKTEXT * b)3259 linktextcmp(const UDM_LINKTEXT *a, const UDM_LINKTEXT *b)
3260 {
3261 if (a->link.to->url_id != b->link.to->url_id)
3262 return a->link.to->url_id < b->link.to->url_id ? -1 : 1;
3263 return 0;
3264 }
3265
3266 static void
UdmLinkTextListInit(UDM_LINKTEXTLIST * List)3267 UdmLinkTextListInit(UDM_LINKTEXTLIST *List)
3268 {
3269 bzero((void*) List, sizeof(*List));
3270 }
3271
3272 static void
UdmLinkTextListFree(UDM_LINKTEXTLIST * List)3273 UdmLinkTextListFree(UDM_LINKTEXTLIST *List)
3274 {
3275 UdmFree(List->Item);
3276 }
3277
3278 static udm_rc_t
UdmLinkTextListAlloc(UDM_LINKTEXTLIST * List,size_t nitems)3279 UdmLinkTextListAlloc(UDM_LINKTEXTLIST *List, size_t nitems)
3280 {
3281 if (!(List->Item= (UDM_LINKTEXT *) UdmMalloc(nitems * sizeof(UDM_LINKTEXT))))
3282 return UDM_ERROR;
3283 List->mitems= 0;
3284 return UDM_OK;
3285 }
3286
/*
  A hash record mapping a URL to a document id.
*/
typedef struct
{
  UDM_CONST_STR url;  /* The URL, the key of the record                          */
  udmcrc32_t hash;    /* CRC32 of the URL                                        */
  urlid_t id;         /* Document id; 0 in records built by UdmURLIdHashItemInit */
} UDM_URLIDHASH;
3293
3294
3295 static void
UdmURLIdHashItemInitFromURLData(UDM_URLIDHASH * dst,UDM_URLDATA * src)3296 UdmURLIdHashItemInitFromURLData(UDM_URLIDHASH *dst, UDM_URLDATA *src)
3297 {
3298 UdmConstStrSetStr(&dst->url, src->url);
3299 dst->hash= UdmStrCRC32(src->url);
3300 dst->id= src->url_id;
3301 }
3302
3303
3304 static void
UdmURLIdHashItemInit(UDM_URLIDHASH * dst,const char * url)3305 UdmURLIdHashItemInit(UDM_URLIDHASH *dst, const char *url)
3306 {
3307 UdmConstStrSetStr(&dst->url, url);
3308 dst->hash= UdmStrCRC32(url);
3309 dst->id= 0;
3310 }
3311
3312
3313 static udm_rc_t
UdmURLIdHashStore(UDM_HASH * hash,void * ofs,void * item)3314 UdmURLIdHashStore(UDM_HASH *hash, void *ofs, void *item)
3315 {
3316 memcpy(ofs, item, sizeof(UDM_URLIDHASH));
3317 return UDM_OK;
3318 }
3319
3320
3321 static udm_rc_t
UdmURLIdHashJoin(UDM_HASH * hash,void * ofs,void * b)3322 UdmURLIdHashJoin(UDM_HASH *hash, void *ofs, void *b)
3323 {
3324 return UDM_OK;
3325 }
3326
3327
3328 static int
UdmConstStrEQ(const UDM_CONST_STR * s1,const UDM_CONST_STR * s2)3329 UdmConstStrEQ(const UDM_CONST_STR *s1, const UDM_CONST_STR *s2)
3330 {
3331 if (s1->length != s2->length)
3332 return 1;
3333 return memcmp(s1->str, s2->str, s1->length);
3334 }
3335
3336
3337 /*
3338 Returns 0 (on equal)
3339 Returns 1 (on non equal).
3340 Note: not siutable for sorting!
3341 */
3342 static int
UdmURLIdHashCmp(UDM_HASH * hash,void * w1,void * w2)3343 UdmURLIdHashCmp(UDM_HASH *hash, void *w1, void *w2)
3344 {
3345 if (((UDM_URLIDHASH*)w1)->hash != ((UDM_URLIDHASH*)w2)->hash)
3346 return 1;
3347 return UdmConstStrEQ(&(((UDM_URLIDHASH*)w1)->url),
3348 &(((UDM_URLIDHASH*)w2)->url));
3349 }
3350
3351
3352 static udmcrc32_t
UdmURLIdHashCalculateHash(UDM_HASH * hash,const void * item)3353 UdmURLIdHashCalculateHash(UDM_HASH *hash, const void *item)
3354 {
3355 return ((const UDM_URLIDHASH*)item)->hash;
3356 }
3357
3358
/*
  Handlers for a hash of UDM_URLIDHASH records.
  The same function is used for both hash code callbacks,
  because every record carries its precalculated CRC32.
*/
static UDM_HASH_HANDLER UdmURLIdHashHandler=
{
  UdmURLIdHashStore,         /* store  */
  UdmURLIdHashJoin,          /* join   */
  UdmURLIdHashCmp,           /* cmp    */
  UdmURLIdHashCalculateHash, /* keykey */
  UdmURLIdHashCalculateHash  /* reckey */
};
3367
3368
3369 static udm_rc_t
UdmURLIdHashInitFromURLDataList(UDM_AGENT * Indexer,UDM_HASH * urlidhash,const UDM_URLDATALIST * URLList)3370 UdmURLIdHashInitFromURLDataList(UDM_AGENT *Indexer,
3371 UDM_HASH *urlidhash,
3372 const UDM_URLDATALIST *URLList)
3373 {
3374 size_t i;
3375 if (UDM_OK != UdmHashInit(urlidhash, &UdmURLIdHashHandler,
3376 NULL, URLList->nitems + URLList->nitems/10 + 100,
3377 sizeof(UDM_URLIDHASH)))
3378 {
3379 UdmLog(Indexer, UDM_LOG_ERROR, "UdmHashInit failed");
3380 return UDM_ERROR;
3381 }
3382 for (i= 0; i < URLList->nitems; i++)
3383 {
3384 UDM_URLIDHASH item;
3385 UdmURLIdHashItemInitFromURLData(&item, &URLList->Item[i]);
3386 if (NULL == UdmHashPut(urlidhash, &item))
3387 {
3388 UdmLog(Indexer, UDM_LOG_ERROR, "UdmHashPut failed");
3389 return UDM_ERROR;
3390 }
3391 }
3392 return UDM_OK;
3393 }
3394
3395
3396 static udm_rc_t
UdmInvertedIndexAddURLText(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_URLDATALIST * URLList)3397 UdmInvertedIndexAddURLText(UDM_AGENT *Indexer,
3398 UDM_DB *db,
3399 const char *wtable,
3400 UDM_URLDATALIST *URLList)
3401 {
3402 udm_rc_t rc= UDM_OK;
3403 size_t i;
3404 UDM_INVERTED_INDEX_CACHE cache;
3405
3406 if (!UdmVarListFindByPrefix(&Indexer->Conf->Sections, "url.", 4))
3407 return UDM_OK;
3408
3409 UdmLog(Indexer, UDM_LOG_INFO, "Indexing URL text");
3410 UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3411 if (UDM_OK != UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3412 INVERTED_INDEX_CACHE_PARTS))
3413 return UDM_ERROR;
3414
3415 for (i=0; i < URLList->nitems && rc == UDM_OK; i++)
3416 {
3417 UDM_URLDATA *Item= &URLList->Item[i];
3418 UDM_CONSTWORDLIST CWL;
3419 UDM_DOCUMENT Doc;
3420 if (!Item->score)
3421 continue; /* Does not have urlinfob record, e.g. HrefOnly */
3422 UdmConstWordListInit(&CWL);
3423 UdmDocInit(&Doc);
3424 UdmURLParse(&Doc.CurURL, Item->url);
3425 UdmVarListAddLst(&Doc.Sections, &Indexer->Conf->Sections, NULL, "*");
3426 UdmParseURLText(Indexer, &Doc);
3427 /* TODO34: RemoteCharset, RemoteFileNameCharset */
3428 UdmTextListToConstWordList(&Doc.TextList,
3429 Indexer->Conf->unidata, Indexer->Conf->lcs,
3430 UDM_RECODE_HTML, &CWL);
3431 rc= UdmWordListSortAndConvert(Indexer, &CWL, &cache,
3432 Item->url_id, Indexer->Conf->lcs);
3433 UdmConstWordListFree(&CWL);
3434 UdmDocFree(&Doc);
3435 }
3436 if (rc == UDM_OK)
3437 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, &cache);
3438
3439 UdmInvertedIndexCacheFree(&cache);
3440 return rc;
3441 }
3442
3443
3444 static udm_rc_t
UdmLinkTextListToInvertedIndexCache(UDM_AGENT * Indexer,UDM_LINKTEXTLIST * LinkTextList,UDM_INVERTED_INDEX_CACHE * cache,udm_secno_t ilinktext)3445 UdmLinkTextListToInvertedIndexCache(UDM_AGENT *Indexer,
3446 UDM_LINKTEXTLIST *LinkTextList,
3447 UDM_INVERTED_INDEX_CACHE *cache,
3448 udm_secno_t ilinktext)
3449 {
3450 size_t i;
3451 udm_rc_t rc= UDM_OK;
3452 UDM_WORD_SCANNER scanner;
3453 UDM_CONSTWORDLIST CWL;
3454
3455 UdmWordScannerInit(&scanner, Indexer->Conf->unidata, Indexer->Conf->lcs);
3456 UdmConstWordListInit(&CWL);
3457
3458 /* Sort Linktext items by url_id */
3459 UdmSort(LinkTextList->Item, LinkTextList->nitems, sizeof(UDM_LINKTEXT), (udm_qsort_cmp) linktextcmp);
3460 for (i= 0; i < LinkTextList->nitems; i++)
3461 {
3462 UDM_LINKTEXT *Item= &LinkTextList->Item[i];
3463 UDM_URLPOPINFO *popinfo= Item->link.to;
3464 if (!popinfo->score) /* URL does not have urlinfob record. e.g. HrefOnly */
3465 continue;
3466 UdmConstWordListAddString(&scanner,
3467 UDM_RECODE_HTML, &CWL, /* TODO34: StripAccents */
3468 ilinktext,
3469 Item->text.str, Item->text.length);
3470 CWL.wordpos[ilinktext]+= 8; /* TODO34: check overflow */
3471 if (i + 1 == LinkTextList->nitems ||
3472 LinkTextList->Item[i+1].link.to->url_id != Item->link.to->url_id)
3473 {
3474 if (UDM_OK != (rc= UdmWordListSortAndConvert(Indexer, &CWL, cache,
3475 Item->link.to->url_id,
3476 Indexer->Conf->lcs)))
3477 goto ex;
3478 CWL.nitems= 0;
3479 CWL.wordpos[ilinktext]= 0;
3480 }
3481 }
3482 ex:
3483 UdmConstWordListFree(&CWL);
3484 return rc;
3485 }
3486
3487
/*
  Order of the columns in the result of the query
  "SELECT url_id,url[,linktext] FROM links".
*/
#define LINKTEXT_ID   0  /* url_id   */
#define LINKTEXT_URL  1  /* url      */
#define LINKTEXT_TEXT 2  /* linktext */
3492
3493 static udm_rc_t
UdmInvertedIndexProcessLinksResult(UDM_AGENT * Indexer,UDM_HASH * urlidhash,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_INVERTED_INDEX_CACHE * cache,UDM_LINKINFOLISTLIST * LinkInfoListList,UDM_SQLRES * SQLRes,udm_secno_t ilinktext)3494 UdmInvertedIndexProcessLinksResult(UDM_AGENT *Indexer,
3495 UDM_HASH *urlidhash,
3496 UDM_URLPOPINFOLIST *URLPopInfoList,
3497 UDM_INVERTED_INDEX_CACHE *cache,
3498 UDM_LINKINFOLISTLIST *LinkInfoListList,
3499 UDM_SQLRES *SQLRes,
3500 udm_secno_t ilinktext)
3501 {
3502 udm_rc_t rc;
3503 size_t i, nrows;
3504 UDM_LINKTEXTLIST LinkTextList;
3505 UDM_LINKINFOLIST LinkInfoList;
3506
3507 if (!(nrows= UdmSQLNumRows(SQLRes)))
3508 return UDM_OK;
3509
3510 UdmLinkInfoListInit(&LinkInfoList);
3511 UdmLinkTextListInit(&LinkTextList);
3512
3513 if (UDM_OK != (rc= UdmLinkTextListAlloc(&LinkTextList, nrows)))
3514 return rc;
3515
3516 if (UDM_OK != (rc= UdmLinkInfoListAlloc(&LinkInfoList, nrows)))
3517 goto ex;
3518
3519 for (i=0; i < nrows; i++)
3520 {
3521 UDM_LINKTEXT *Item= &LinkTextList.Item[LinkTextList.nitems];
3522 const char *url= UdmSQLValue(SQLRes, i, LINKTEXT_URL);
3523 urlid_t fromid= atoi(UdmSQLValue(SQLRes, i, LINKTEXT_ID));
3524 UDM_URLIDHASH *u, search_item;
3525 UdmURLIdHashItemInit(&search_item, url);
3526 if (!(u= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &search_item)) || !u->id)
3527 continue;
3528 if (!(Item->link.to= UdmURLPopInfoListSearch(URLPopInfoList, u->id)))
3529 {
3530 UdmLog(Indexer, UDM_LOG_WARN,
3531 "UdmInvertedIndexAddLinkText: to_id=%d not found", u->id);
3532 continue;
3533 }
3534 if (!(Item->link.from= UdmURLPopInfoListSearch(URLPopInfoList, fromid)))
3535 {
3536 UdmLog(Indexer, UDM_LOG_WARN,
3537 "UdmInvertedIndexAddLinkText: from_id=%s not found",
3538 UdmSQLValue(SQLRes, i, LINKTEXT_ID));
3539 continue;
3540 }
3541 if (cache)
3542 {
3543 Item->text.str= UdmSQLValue(SQLRes, i, LINKTEXT_TEXT);
3544 Item->text.length= UdmSQLLen(SQLRes, i, LINKTEXT_TEXT);
3545 LinkTextList.nitems++;
3546 }
3547 UdmLinkInfoListAdd(&LinkInfoList, &Item->link);
3548 }
3549 if (!LinkInfoList.nitems)
3550 {
3551 /* It will not be added into LinkInfoListList, so free it now */
3552 UdmLinkInfoListFree(&LinkInfoList);
3553 goto ex;
3554 }
3555
3556 if (UDM_OK != UdmLinkInfoListListAdd(LinkInfoListList, &LinkInfoList))
3557 return UDM_ERROR;
3558
3559 if (LinkTextList.nitems)
3560 rc= UdmLinkTextListToInvertedIndexCache(Indexer, &LinkTextList, cache,
3561 ilinktext);
3562 ex:
3563 UdmLinkTextListFree(&LinkTextList);
3564 return rc;
3565 }
3566
3567
3568 static void
UdmLinkInfoListCalcOutgoingLinks(UDM_LINKINFOLIST * List)3569 UdmLinkInfoListCalcOutgoingLinks(UDM_LINKINFOLIST *List)
3570 {
3571 size_t i;
3572 for (i= 0; i < List->nitems; i++)
3573 {
3574 UDM_LINKINFO *Item= &List->Item[i];
3575 Item->from->outgoing_link_count++;
3576 /*fprintf(stderr, "%d->%d\n", Item->from->url_id, Item->to->url_id);*/
3577 }
3578 }
3579
3580
3581 static void
UdmLinkInfoListCalcIncomingLinks(UDM_LINKINFOLIST * List)3582 UdmLinkInfoListCalcIncomingLinks(UDM_LINKINFOLIST *List)
3583 {
3584 size_t i;
3585 for (i= 0; i < List->nitems; i++)
3586 {
3587 UDM_LINKINFO *Item= &List->Item[i];
3588 Item->to->incoming_link_count++;
3589 }
3590 }
3591
3592
3593 static void
UdmLinkInfoListCalcSumWeightsIncomingLinks2(UDM_LINKINFOLIST * List)3594 UdmLinkInfoListCalcSumWeightsIncomingLinks2(UDM_LINKINFOLIST *List)
3595 {
3596 size_t i;
3597 for (i= 0; i < List->nitems; i++)
3598 {
3599 UDM_LINKINFO *Item= &List->Item[i];
3600 /*Item->to->incoming_link_count++;*/
3601 if (Item->from->outgoing_link_count)
3602 Item->to->popularity1+= (Item->from->popularity0 /
3603 (double) Item->from->outgoing_link_count) *
3604 Item->from->server_weight;
3605 /*fprintf(stderr, "%d->%d\n", Item->from->url_id, Item->to->url_id);*/
3606 }
3607 }
3608
3609
3610 static void
UdmURLDataListNormalizePopRank(UDM_URLPOPINFOLIST * URLPopInfoList,UDM_URLDATALIST * URLDataList)3611 UdmURLDataListNormalizePopRank(UDM_URLPOPINFOLIST *URLPopInfoList,
3612 UDM_URLDATALIST *URLDataList)
3613 {
3614 size_t i;
3615 UDM_ASSERT(URLDataList->nitems == URLPopInfoList->nitems);
3616 for (i= 0; i < URLPopInfoList->nitems; i++)
3617 {
3618 UDM_URLPOPINFO *src= &URLPopInfoList->Item[i];
3619 UDM_URLDATA *dst= &URLDataList->Item[i];
3620 double tmp= src->popularity0 * 1000000000;
3621 UDM_ASSERT(src->url_id == dst->url_id);
3622 if (tmp < 1)
3623 tmp= 1;
3624 dst->pop_rank= log(tmp)/log(1000000000);
3625 dst->per_site= (uint4) src->incoming_link_count;
3626 /*fprintf(stderr, "[%d] %f %f %s\n", dst->url_id, src->popularity0, dst->pop_rank, dst->url);*/
3627 }
3628 }
3629
3630
3631 static udm_rc_t
UdmLinkInfoListListCalcPopRankOnce(UDM_URLPOPINFOLIST * URLPopInfoList,UDM_LINKINFOLISTLIST * LinkInfoListList)3632 UdmLinkInfoListListCalcPopRankOnce(UDM_URLPOPINFOLIST *URLPopInfoList,
3633 UDM_LINKINFOLISTLIST *LinkInfoListList)
3634 {
3635 size_t i;
3636 UDM_ASSERT(URLPopInfoList->nitems);
3637 for (i= 0; i < LinkInfoListList->nitems; i++)
3638 UdmLinkInfoListCalcSumWeightsIncomingLinks2(&LinkInfoListList->Item[i]);
3639 for (i= 0; i < URLPopInfoList->nitems; i++)
3640 {
3641 UDM_URLPOPINFO *Item= &URLPopInfoList->Item[i];
3642 Item->popularity0= (0.15 / URLPopInfoList->nitems +
3643 0.85 * Item->popularity1);
3644 /*
3645 fprintf(stderr, "[%d] norm=%.6f old=%.6f new=%.6f ilinks=%d olinks=%d Wsrv=%.2f\n",
3646 Item->url_id, 0.15 / URLPopInfoList->nitems,
3647 Item->popularity1, Item->popularity0,
3648 (int) Item->incoming_link_count,
3649 (int) Item->outgoing_link_count,
3650 Item->server_weight);
3651 */
3652 Item->popularity1= 0; /* Prepare for the next iteration */
3653 }
3654 return UDM_OK;
3655 }
3656
3657
3658 static size_t
UdmLinkInfoListListCalcTotalLinkCount(UDM_LINKINFOLISTLIST * LinkInfoListList)3659 UdmLinkInfoListListCalcTotalLinkCount(UDM_LINKINFOLISTLIST *LinkInfoListList)
3660 {
3661 size_t i, count;
3662 for (count=0, i= 0; i < LinkInfoListList->nitems; i++)
3663 count+= LinkInfoListList->Item[i].nitems;
3664 return count;
3665 }
3666
3667
3668 static udm_rc_t
UdmLinkInfoListListCalcPopRank(UDM_AGENT * Indexer,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_LINKINFOLISTLIST * LinkInfoListList)3669 UdmLinkInfoListListCalcPopRank(UDM_AGENT *Indexer,
3670 UDM_URLPOPINFOLIST *URLPopInfoList,
3671 UDM_LINKINFOLISTLIST *LinkInfoListList)
3672 {
3673 size_t i;
3674 if (!URLPopInfoList->nitems)
3675 return UDM_OK;
3676 UdmLog(Indexer, UDM_LOG_INFO,
3677 "Calculating popularity: %d documents, %d links",
3678 (int) URLPopInfoList->nitems,
3679 (int) UdmLinkInfoListListCalcTotalLinkCount(LinkInfoListList));
3680 for (i= 0; i < LinkInfoListList->nitems; i++)
3681 {
3682 UdmLinkInfoListCalcOutgoingLinks(&LinkInfoListList->Item[i]);
3683 UdmLinkInfoListCalcIncomingLinks(&LinkInfoListList->Item[i]);
3684 }
3685 for (i= 0; i < 3; i++)
3686 {
3687 udm_rc_t rc;
3688 if (UDM_OK != (rc= UdmLinkInfoListListCalcPopRankOnce(URLPopInfoList,
3689 LinkInfoListList)))
3690 return rc;
3691 }
3692 return UDM_OK;
3693 }
3694
3695
/*
  Link processing parameters.
*/
typedef struct
{
  udm_secno_t ilinktext;      /* Number of the "ilinktext" section, 0 if not defined */
  udm_bool_t use_popularity;  /* Value of the "UsePopularity" variable               */
} UDM_ADD_LINKS_PARAM;
3701
3702
/**
  Link processing parameters for "indexer --rewritepop":
  link words are not indexed, popularity is always calculated.
*/
static const UDM_ADD_LINKS_PARAM links_param_for_popularity=
{
  0,       /* ilinktext: don't touch link words in the index             */
  UDM_TRUE /* use_popularity: force it, as it was explicitly asked for  */
};
3711
3712
3713 static void
UdmAddLinksParamInitFromEnv(UDM_ADD_LINKS_PARAM * param,const UDM_ENV * Env)3714 UdmAddLinksParamInitFromEnv(UDM_ADD_LINKS_PARAM *param, const UDM_ENV *Env)
3715 {
3716 const UDM_VAR *ilinktext= UdmVarListFind(&Env->Sections, "ilinktext");
3717 param->ilinktext= ilinktext ? UdmVarSecno(ilinktext) : 0;
3718 param->use_popularity= UdmVarListFindBool(&Env->Vars, "UsePopularity", UDM_TRUE);
3719 }
3720
3721
3722 static udm_rc_t
UdmLoadRedirectLinks(UDM_AGENT * Indexer,UDM_DB * db,UDM_URLDATALIST * URLList,UDM_HASH * urlidhash)3723 UdmLoadRedirectLinks(UDM_AGENT *Indexer, UDM_DB *db,
3724 UDM_URLDATALIST *URLList,
3725 UDM_HASH *urlidhash)
3726 {
3727 udm_timer_t ticks= UdmStartTimer();
3728 char qbuf[128];
3729 UDM_SQLRES SQLRes;
3730 udm_rc_t rc;
3731 size_t i;
3732
3733 UdmLog(Indexer, UDM_LOG_INFO, "Loading redirects");
3734 udm_snprintf(qbuf, sizeof(qbuf), "SELECT url_id,url FROM redirect");
3735 if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, &SQLRes, qbuf)))
3736 return rc;
3737
3738 for (i= 0; i < UdmSQLNumRows(&SQLRes); i++)
3739 {
3740 /*
3741 There is a redirect from (src_id,src_url) to (dst_id,dst_url).
3742 We associate src_url with dst_id, so all links coming to src_url
3743 are considered as belonging to dst_id instead.
3744 */
3745 urlid_t srcid= atoi(UdmSQLValue(&SQLRes, i, 0));
3746 UDM_URLIDHASH *src, *dst, key;
3747 UDM_URLDATA *srcdata= UdmURLDataListSearch(URLList, srcid);
3748 if (!srcdata)
3749 {
3750 /* TODO34: src can be not in URLData, if "indexer -s200 --index" is give */
3751 continue;
3752 }
3753
3754 UdmURLIdHashItemInit(&key, srcdata->url);
3755 if (!(src= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &key)) || !src->id)
3756 continue; /* The link source is out of the indexing space */
3757
3758 UdmURLIdHashItemInit(&key, UdmSQLValue(&SQLRes, i, 1));
3759 if (!(dst= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &key)) || !dst->id)
3760 continue; /* The link destination is out of the indexing space */
3761
3762 /*printf("FROM [%d:%s]\nTO [%d:%s]\n",
3763 src->id, src->url.str, dst->id, dst->url.str);*/
3764 src->id= dst->id;
3765 }
3766 UdmSQLFree(&SQLRes);
3767 UdmLog(Indexer, UDM_LOG_INFO,
3768 "Loading redirects done: %d links, %.2f sec",
3769 (int) UdmSQLNumRows(&SQLRes), UdmStopTimer(&ticks));
3770 return UDM_OK;
3771 }
3772
3773
3774 /**
3775 Calculate popularity and add link words into the index cache.
3776 cache can be NULL, which means don't add link words and
3777 calculate popularity only.
3778 */
3779 static udm_rc_t
UdmInvertedIndexAddLinkText(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_URLDATALIST * URLList,const UDM_ADD_LINKS_PARAM param)3780 UdmInvertedIndexAddLinkText(UDM_AGENT *Indexer,
3781 UDM_DB *db,
3782 UDM_INVERTED_INDEX_CACHE *cache,
3783 const char *wtable,
3784 UDM_URLDATALIST *URLList,
3785 const UDM_ADD_LINKS_PARAM param)
3786 {
3787 int i;
3788 udm_rc_t rc= UDM_OK;
3789 udm_timer_t ticks;
3790 UDM_HASH urlidhash;
3791 UDM_LINKINFOLISTLIST LinkInfoListList;
3792 UDM_URLPOPINFOLIST URLPopInfoList;
3793
3794 UdmURLPopInfoListInit(&URLPopInfoList);
3795 UdmLinkInfoListListInit(&LinkInfoListList);
3796 if (UDM_OK != (rc= UdmURLPopInfoListInitFromURLDataList(Indexer,
3797 &URLPopInfoList,
3798 URLList)))
3799 goto ex;
3800
3801 if (UDM_OK != (rc= UdmURLIdHashInitFromURLDataList(Indexer,
3802 &urlidhash, URLList)))
3803 goto ex;
3804
3805 if (UdmVarListFindBool(&Indexer->Conf->Vars, "ResolveRedirect", UDM_TRUE) &&
3806 UDM_OK != (rc= UdmLoadRedirectLinks(Indexer, db, URLList, &urlidhash)))
3807 goto ex;
3808
3809 ticks= UdmStartTimer();
3810 UdmLog(Indexer, UDM_LOG_INFO, "Loading links");
3811 for (i= 0; i < 256; i++)
3812 {
3813 char qbuf[128];
3814 UDM_SQLRES SQLRes;
3815 if ((i % 16) == 0)
3816 UdmLog(Indexer, UDM_LOG_EXTRA, "- Loading links (part %d..%d)", i, i + 16 - 1);
3817 /*
3818 Add the "ilinktext" column only if cache is not NULL. Otherwise,
3819 we're running "indexer --rewritepop", so ilinktext is not needed.
3820 */
3821 udm_snprintf(qbuf, sizeof(qbuf),
3822 "SELECT url_id,url%s FROM links WHERE seed=%d",
3823 cache ? ",linktext" : "", i);
3824 if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, &SQLRes, qbuf)))
3825 goto ex;
3826
3827 rc= UdmInvertedIndexProcessLinksResult(Indexer, &urlidhash, &URLPopInfoList,
3828 cache, &LinkInfoListList,
3829 &SQLRes, param.ilinktext);
3830 UdmSQLFree(&SQLRes);
3831 if (rc != UDM_OK)
3832 goto ex;
3833 }
3834 UdmLog(Indexer, UDM_LOG_INFO,
3835 "Loading links done: %.2f sec", UdmStopTimer(&ticks));
3836
3837 rc= UdmLinkInfoListListCalcPopRank(Indexer, &URLPopInfoList, &LinkInfoListList);
3838 UdmURLDataListNormalizePopRank(&URLPopInfoList, URLList);
3839
3840 ex:
3841 UdmURLPopInfoListFree(&URLPopInfoList);
3842 UdmLinkInfoListListFree(&LinkInfoListList);
3843 UdmHashFree(&urlidhash);
3844 return rc;
3845 }
3846
3847
3848 static udm_rc_t
UdmInvertedIndexRewritePopularityOneDB(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,const char * wtable)3849 UdmInvertedIndexRewritePopularityOneDB(UDM_AGENT *A,
3850 UDM_DB *db,
3851 UDM_QUERY *Query,
3852 const char *wtable)
3853 {
3854 udm_rc_t rc;
3855 udm_bool_t tr= UDM_TEST(UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT);
3856 UDM_WRITE_HELPER Helper;
3857
3858 if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
3859 goto ex2;
3860
3861 if (UDM_OK != (rc= UdmInvertedIndexAddLinkText(A, db, NULL, wtable,
3862 &Query->URLData,
3863 links_param_for_popularity)))
3864 goto ex;
3865
3866 if (tr && UDM_OK != (rc= UdmDBSQLBegin(A, db)))
3867 goto ex;
3868
3869 if (UDM_OK != (rc= UdmWritePopularityBdictAndTable(A, db, &Query->URLData,
3870 wtable, &Helper)))
3871 goto ex;
3872
3873 if (tr && UDM_OK != (rc= UdmDBSQLCommit(A, db)))
3874 goto ex;
3875
3876 ex:
3877 UdmWriteHelperFree(&Helper);
3878 ex2:
3879 UdmURLDataListFree(&Query->URLData);
3880 return rc;
3881 }
3882
3883
3884 static udm_rc_t
UdmRewritePopularity(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)3885 UdmRewritePopularity(UDM_AGENT *Indexer, UDM_DB *db, UDM_QUERY *Query)
3886 {
3887
3888 char tablename[64];
3889 UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
3890 return UdmInvertedIndexRewritePopularityOneDB(Indexer, db, Query, tablename);
3891 }
3892
3893
3894 /**
3895 Load links from the database.
3896 Add link words into the cache, and calculate popularity.
3897 Store cached link words into the index.
3898 TODO34: don't calculate popularity if param.use_popularity is UDM_FALSE.
3899 TODO34: add tests for all "UsePopularity" and "Section ilinktext" combinations
3900 for "indexer --index" and "indexer --rewritepop"
3901 */
3902 static udm_rc_t
UdmInvertedIndexAddLinks(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_URLDATALIST * URLList,UDM_ADD_LINKS_PARAM param)3903 UdmInvertedIndexAddLinks(UDM_AGENT *Indexer, UDM_DB *db,
3904 const char *wtable,
3905 UDM_URLDATALIST *URLList,
3906 UDM_ADD_LINKS_PARAM param)
3907 {
3908 udm_rc_t rc;
3909 UDM_INVERTED_INDEX_CACHE cache;
3910 UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3911 if (UDM_OK != (rc= UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3912 INVERTED_INDEX_CACHE_PARTS)))
3913 return rc;
3914
3915 if (UDM_OK != (rc= UdmInvertedIndexAddLinkText(Indexer, db, &cache,
3916 wtable, URLList, param)))
3917 goto ex;
3918
3919 rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, &cache);
3920
3921 ex:
3922 UdmInvertedIndexCacheFree(&cache);
3923 return rc;
3924 }
3925
3926 /***************************************************************/
3927 static udm_rc_t
UdmCreateInvertedWordIndexFromCachedCopySQL(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)3928 UdmCreateInvertedWordIndexFromCachedCopySQL(UDM_AGENT *Indexer,
3929 UDM_DB *db,
3930 UDM_QUERY *Query)
3931 {
3932 udm_rc_t rc;
3933 char buf[128], wtable[64];
3934 int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
3935 int tr_truncate= tr && (UdmSQLDBType(db) != UDM_DB_SYBASE);
3936 udm_timer_t ticks;
3937 udm_bool_t disable_keys= UdmVarListFindBool(UdmSQLDBVars(db), "DisableKeys", UDM_TRUE);
3938 UDM_WRITE_HELPER Helper;
3939 UDM_ADD_LINKS_PARAM param;
3940
3941 UdmAddLinksParamInitFromEnv(¶m, Indexer->Conf);
3942 if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
3943 return UDM_ERROR;
3944
3945 UdmLog(Indexer, UDM_LOG_DEBUG, Helper.use_deflate ? "Using deflate" : "Not using deflate");
3946
3947 /* Get table to write to */
3948 if (UDM_OK != (rc= UdmBlobGetWTable(Indexer, db, wtable, sizeof(wtable))))
3949 goto ret2;
3950 /* Lock tables for MySQL */
3951 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
3952 {
3953 if (UdmSQLDBVersion(db) >= 40000 && disable_keys)
3954 {
3955 sprintf(buf, "ALTER TABLE %s DISABLE KEYS", wtable);
3956 if (UDM_OK != UdmDBSQLQuery(Indexer, db, NULL, buf))
3957 goto ret;
3958 }
3959 udm_snprintf(buf, sizeof(buf),
3960 "LOCK TABLES "
3961 "url READ,urlinfo READ,cachedcopy READ,"
3962 "redirect READ,links READ,"
3963 "%s WRITE",
3964 wtable);
3965 if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, buf)))
3966 goto ret2;
3967 }
3968
3969 /* Delete old words from bdict */
3970 if ((tr_truncate && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
3971 UDM_OK != (rc= UdmDBSQLTableTruncateOrDelete(Indexer, db, wtable)) ||
3972 (tr_truncate && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
3973 goto ret;
3974
3975 /* Convert words */
3976
3977 ticks= UdmStartTimer();
3978 if (UDM_OK != (rc= UdmInvertedIndexCreate(Indexer, db, wtable, Query)))
3979 goto ret;
3980 if (UDM_OK != (rc= UdmInvertedIndexAddURLText(Indexer, db, wtable, &Query->URLData)))
3981 goto ret;
3982 if (param.ilinktext || param.use_popularity)
3983 {
3984 rc= param.ilinktext ?
3985 UdmInvertedIndexAddLinks(Indexer, db, wtable, &Query->URLData, param):
3986 UdmInvertedIndexAddLinkText(Indexer, db, NULL, wtable, &Query->URLData, param);
3987 if (rc != UDM_OK)
3988 goto ret;
3989 }
3990 UdmLog(Indexer, UDM_LOG_DEBUG,
3991 "UdmInvertedIndexCreate done: %.2f sec", UdmStopTimer(&ticks));
3992
3993 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
3994 {
3995 ticks= UdmStartTimer();
3996 if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES")))
3997 goto ret2;
3998 udm_snprintf(buf, sizeof(buf), "LOCK TABLES %s WRITE", wtable);
3999 if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, buf)))
4000 goto ret2;
4001 UdmLog(Indexer, UDM_LOG_DEBUG,
4002 "Unlocking tables: %.2f sec", UdmStopTimer(&ticks));
4003 }
4004
4005 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4006 {
4007 if (UdmSQLDBVersion(db) >= 40000 && disable_keys)
4008 {
4009 ticks= UdmStartTimer();
4010 UdmLog(Indexer, UDM_LOG_INFO, "Enabling SQL indexes");
4011 sprintf(buf, "ALTER TABLE %s ENABLE KEYS", wtable);
4012 UdmDBSQLQuery(Indexer, db, NULL, buf);
4013 UdmLog(Indexer, UDM_LOG_INFO,
4014 "Enabling SQL indexes done, %.2f sec", UdmStopTimer(&ticks));
4015 }
4016 }
4017
4018 /* Put timestamp: note, the indexes must be already enabled here! */
4019 if ((tr && (UDM_OK != (rc= UdmDBSQLBegin(Indexer, db)))) ||
4020 (UDM_OK != (rc= UdmBlobWriteTimestamp(Indexer, db, wtable, UDM_FALSE))) ||
4021 (tr && (UDM_OK != (rc= UdmDBSQLCommit(Indexer, db)))))
4022 goto ret;
4023
4024 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4025 UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES");
4026
4027 /* Convert URL */
4028 ticks= UdmStartTimer();
4029 UdmLog(Indexer, UDM_LOG_ERROR, "Writing url data");
4030 if ((tr && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
4031 (param.use_popularity &&
4032 UDM_OK != (rc= UdmWritePopularityBdictAndTable(Indexer, db, &Query->URLData, wtable, &Helper))) ||
4033 UDM_OK != (rc= UdmBlobWriteURLData(Indexer, db, Query, wtable, &Helper)) ||
4034 UDM_OK != (rc= UdmBlobWriteLimitsInternal(Indexer, db, wtable, &Helper)) ||
4035 (tr && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
4036 goto ret2;
4037
4038 UdmLog(Indexer, UDM_LOG_DEBUG,
4039 "Writing URL data done: %.2f sec", UdmStopTimer(&ticks));
4040
4041 /* Switch to new table */
4042 UdmLog(Indexer, UDM_LOG_ERROR, "Rotating table");
4043 rc= UdmBlobSetTable(Indexer, db);
4044 goto ret2;
4045
4046 ret:
4047 if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4048 UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES");
4049 ret2:
4050 UdmWriteHelperFree(&Helper);
4051 return rc;
4052 }
4053
4054
4055 /*
4056 Name is already escaped here, using UdmSQLEscStrSimple().
4057 */
4058 static udm_rc_t
UdmBlobLoadFastURLLimitByFullName(UDM_AGENT * A,UDM_DB * db,const char * ename,UDM_URLID_LIST * buf)4059 UdmBlobLoadFastURLLimitByFullName(UDM_AGENT *A,
4060 UDM_DB *db,
4061 const char *ename,
4062 UDM_URLID_LIST *buf)
4063 {
4064 udm_rc_t rc= UDM_OK;
4065 UDM_SQLRES SQLRes;
4066 char qbuf[256], tablename[64], exclude;
4067 size_t nrows, nurls, i, row;
4068
4069 exclude= buf->exclude;
4070 bzero((void*)buf, sizeof(*buf));
4071 buf->exclude= exclude;
4072
4073 UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
4074 udm_snprintf(qbuf, sizeof(qbuf),
4075 "SELECT coords FROM %s WHERE word LIKE '%s'", tablename, ename);
4076 if (UDM_OK != (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)))
4077 goto ret;
4078
4079 if (! (nrows= UdmSQLNumRows(&SQLRes)))
4080 {
4081 buf->empty= 1;
4082 goto ret;
4083 }
4084 nurls= 0;
4085 for (row= 0; row < nrows; row++)
4086 nurls+= UdmSQLLen(&SQLRes, row, 0) / 4;
4087
4088 if (!(buf->urls= (urlid_t*) UdmMalloc(sizeof(urlid_t) * nurls)))
4089 goto ret;
4090
4091 for (row= 0; row < nrows; row++)
4092 {
4093 const char *src= UdmSQLValue(&SQLRes, row, 0);
4094 nurls= UdmSQLLen(&SQLRes, row, 0) / 4;
4095 if (src && nurls)
4096 for (i = 0; i < nurls; i++, src+= 4)
4097 buf->urls[buf->nurls++]= (urlid_t) udm_get_int4(src);
4098 }
4099 UdmURLIdListSort(buf);
4100
4101 ret:
4102 UdmSQLFree(&SQLRes);
4103 return rc;
4104 }
4105
4106
4107 udm_rc_t
UdmBlobLoadFastURLLimit(UDM_AGENT * A,UDM_DB * db,const char * name,UDM_URLID_LIST * buf)4108 UdmBlobLoadFastURLLimit(UDM_AGENT *A, UDM_DB *db,
4109 const char *name, UDM_URLID_LIST *buf)
4110 {
4111 char ename[130], ename2[130];
4112 size_t namelen= strlen(name);
4113 if (namelen > 64)
4114 return UDM_OK;
4115 UdmDBSQLEscStrSimple(A, db, ename, name, namelen);
4116 udm_snprintf(ename2, sizeof(ename2), "##limit#%s", ename);
4117 return UdmBlobLoadFastURLLimitByFullName(A, db, ename2, buf);
4118 }
4119
4120
4121 static udm_rc_t
UdmBlobLoadFastOrderOrFastScore(UDM_AGENT * A,UDM_DB * db,UDM_SQLRES * SQLRes,const char * prefix,const char * name)4122 UdmBlobLoadFastOrderOrFastScore(UDM_AGENT *A, UDM_DB *db, UDM_SQLRES *SQLRes,
4123 const char *prefix, const char *name)
4124 {
4125 char qbuf[256], ename[256], tablename[64];
4126 size_t namelen= strlen(name);
4127 bzero((void*) SQLRes, sizeof(*SQLRes));
4128 if (namelen > 64)
4129 return UDM_OK;
4130 UdmDBSQLEscStrSimple(A, db, ename, name, namelen); /* Escape order name */
4131 UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
4132 udm_snprintf(qbuf, sizeof(qbuf),
4133 "SELECT coords FROM %s WHERE word LIKE '##%s#%s'",
4134 tablename, prefix, ename);
4135 return UdmDBSQLQuery(A, db, SQLRes, qbuf);
4136 }
4137
4138
4139 static udm_rc_t
UdmBlobUnpackFastOrder(UDM_URL_INT4_LIST * List,UDM_SQLRES * SQLRes,size_t record_size)4140 UdmBlobUnpackFastOrder(UDM_URL_INT4_LIST *List,
4141 UDM_SQLRES *SQLRes,
4142 size_t record_size)
4143 {
4144 size_t nrows, nurls, row, param;
4145 udm_rc_t rc= UDM_OK;
4146
4147 bzero((void*)List, sizeof(*List));
4148
4149 if (!(nrows= UdmSQLNumRows(SQLRes)))
4150 goto ret;
4151
4152 nurls= 0;
4153 for (row= 0; row < nrows; row++)
4154 nurls+= UdmSQLLen(SQLRes, row, 0) / record_size;
4155
4156 if (!(List->Item= (UDM_URL_INT4*) UdmMalloc(sizeof(UDM_URL_INT4) * nurls)))
4157 {
4158 rc= UDM_ERROR;
4159 goto ret;
4160 }
4161
4162 for (param= 0x7FFFFFFF, row= 0; row < nrows; row++)
4163 {
4164 const char *src= UdmSQLValue(SQLRes, row, 0);
4165 nurls= UdmSQLLen(SQLRes, row, 0) / record_size;
4166 if (src && nurls)
4167 {
4168 size_t i;
4169 for (i= 0; i < nurls; i++, src+= record_size)
4170 {
4171 UDM_URL_INT4 *Item= &List->Item[List->nitems++];
4172 Item->url_id= (urlid_t) udm_get_int4(src);
4173 if (record_size == 5)
4174 Item->param= src[4];
4175 else
4176 Item->param= --param;
4177 }
4178 }
4179 }
4180 if (List->nitems > 1)
4181 UdmSort(List->Item, List->nitems, sizeof(UDM_URL_INT4), (udm_qsort_cmp) UdmCmpURLID);
4182 ret:
4183 return rc;
4184 }
4185
4186
4187 udm_rc_t
UdmBlobLoadFastOrder(UDM_AGENT * A,UDM_DB * db,UDM_URL_INT4_LIST * List,const char * name)4188 UdmBlobLoadFastOrder(UDM_AGENT *A, UDM_DB *db,
4189 UDM_URL_INT4_LIST *List, const char *name)
4190 {
4191 udm_rc_t rc= UDM_OK;
4192 UDM_SQLRES SQLRes;
4193
4194 if (UDM_OK != (rc= UdmBlobLoadFastOrderOrFastScore(A, db, &SQLRes, "order", name)) ||
4195 UDM_OK != (rc= UdmBlobUnpackFastOrder(List, &SQLRes, 4)))
4196 goto ret;
4197
4198 ret:
4199 UdmSQLFree(&SQLRes);
4200 return rc;
4201 }
4202
4203
4204 udm_rc_t
UdmBlobLoadFastScore(UDM_AGENT * A,UDM_DB * db,UDM_URL_INT4_LIST * List,const char * name)4205 UdmBlobLoadFastScore(UDM_AGENT *A, UDM_DB *db,
4206 UDM_URL_INT4_LIST *List, const char *name)
4207 {
4208 udm_rc_t rc= UDM_OK;
4209 UDM_SQLRES SQLRes;
4210
4211 if (UDM_OK != (rc= UdmBlobLoadFastOrderOrFastScore(A, db, &SQLRes, "score", name)) ||
4212 UDM_OK != (rc= UdmBlobUnpackFastOrder(List, &SQLRes, 5)))
4213 goto ret;
4214
4215 ret:
4216 UdmSQLFree(&SQLRes);
4217 return rc;
4218 }
4219
4220
4221 static udm_rc_t
UdmWordStatCreateBlob(UDM_AGENT * A,UDM_DB * db)4222 UdmWordStatCreateBlob(UDM_AGENT *A, UDM_DB *db)
4223 {
4224 char qbuf[128], tablename[64], expr[64];
4225 UdmBlobGetTableForRewrite(A, db, tablename, sizeof(tablename));
4226 switch(UdmSQLDBType(db))
4227 {
4228 case UDM_DB_ORACLE8:
4229 udm_snprintf(expr, sizeof(expr), "lengthb(coords)");
4230 break;
4231 case UDM_DB_SQLITE3:
4232 udm_snprintf(expr, sizeof(expr), "length(coords)");
4233 break;
4234 case UDM_DB_MONETDB:
4235 /* Div by 2, to convert hex digits to real length */
4236 udm_snprintf(expr, sizeof(expr), "length(cast(coords as text))/2");
4237 break;
4238 case UDM_DB_MSSQL:
4239 udm_snprintf(expr, sizeof(expr), "datalength(coords)");
4240 break;
4241 default:
4242 udm_snprintf(expr, sizeof(expr), "octet_length(coords)");
4243 }
4244 udm_snprintf(qbuf, sizeof(qbuf),
4245 "SELECT word, sum(%s) FROM %s WHERE word NOT LIKE '##%%' GROUP BY word",
4246 expr, tablename);
4247 return UdmWordStatQuery(A, db, qbuf);
4248 }
4249
4250
4251 /*
4252 Dump word information to stdout.
4253 */
4254 static udm_rc_t
UdmDumpWordInfoOneDocBlob(UDM_AGENT * A,UDM_DB * db,UDM_DOCUMENT * Doc)4255 UdmDumpWordInfoOneDocBlob(UDM_AGENT *A, UDM_DB *db, UDM_DOCUMENT *Doc)
4256 {
4257 return UDM_OK;
4258 }
4259
4260
4261 static udm_rc_t
UdmBlobInitSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)4262 UdmBlobInitSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
4263 UDM_FINDWORD_ARGS *args)
4264 {
4265 udm_rc_t rc;
4266 if (!args->live_updates)
4267 return UDM_OK;
4268 if ((UDM_OK != (rc= UdmBlobLoadLiveUpdateLimitLoad(A, db, args))))
4269 return rc;
4270 return udm_dbmode_handler_rawblob.InitSearch(A, db, Query, args);
4271 }
4272
4273
4274 static udm_rc_t
UdmQueryActionBlob(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,udm_querycmd_t cmd)4275 UdmQueryActionBlob(UDM_AGENT *A, UDM_DB *db,
4276 UDM_QUERY *Query, udm_querycmd_t cmd)
4277 {
4278 switch (cmd)
4279 {
4280 case UDM_QUERYCMD_CLEAR: return UdmTruncateDictBlob(A, db);
4281 case UDM_QUERYCMD_WORDSTAT: return UdmWordStatCreateBlob(A, db);
4282 case UDM_QUERYCMD_REWRITE_URLDATA: return UdmRewriteURL(A, db, Query);
4283 case UDM_QUERYCMD_REWRITE_LIMITS: return UdmRewriteLimits(A, db);
4284 case UDM_QUERYCMD_REWRITE_POPULARITY: return UdmRewritePopularity(A, db, Query);
4285 case UDM_QUERYCMD_INDEX:
4286 return UdmCreateInvertedWordIndexFromCachedCopySQL(A, db, Query);
4287 default: break;
4288 }
4289 return UDM_NOTARGET;
4290 }
4291
4292
4293 const UDM_DBMODE_HANDLER udm_dbmode_handler_blob=
4294 {
4295 "blob",
4296 UdmStoreWordsBlob,
4297 UdmQueryActionBlob,
4298 UdmDeleteWordsFromURLBlob,
4299 UdmFindWordBlob,
4300 UdmDumpWordInfoOneDocBlob,
4301 UdmBlobInitSearch,
4302 };
4303
4304 #endif /* HAVE_SQL */
4305