1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <math.h>
23 
24 #include "udm_common.h"
25 #include "udm_utils.h"
26 #include "udm_db.h"
27 #include "udm_db_int.h"
28 #include "udm_vars.h"
29 #include "udm_coords.h"
30 #include "udm_log.h"
31 #include "udm_hash.h"
32 #include "udm_word.h"
33 #include "udm_crc32.h"
34 #include "udm_http.h"
35 #include "udm_contentencoding.h"
36 #include "udm_parsehtml.h"
37 #include "udm_store.h"
38 #include "udm_textlist.h"
39 #include "udm_indexcache.h"
40 #include "udm_result.h"
41 #include "udm_agent.h"
42 
43 #ifdef HAVE_SQL
44 
45 
/*
  Platform dependent default values: both constants are
  UDM_TRUE on WIN32 and UDM_FALSE on all other platforms.
  NOTE(review): the code that consumes these defaults is not
  in this part of the file; presumably they are the defaults
  for the "zint4" and "deflate" record compression options -
  confirm against the blob writer.
*/
#ifdef WIN32
#define UDM_DEFAULT_ZINT4   UDM_TRUE
#define UDM_DEFAULT_DEFLATE UDM_TRUE
#else
#define UDM_DEFAULT_ZINT4   UDM_FALSE
#define UDM_DEFAULT_DEFLATE UDM_FALSE
#endif
53 
54 
/********** Record encoding and compression ************/
/*
  Record compression types.
  A compressed record starts with the 0xFFFFFFFF marker
  followed by one of the type codes below.

  - 0x00 = No compression: every section starts with a 4-byte URL_ID
  - 0x01 = Deflate
  - 0x02 = Zint4
  - 0x03 = Zint4+Deflate
  - 0x04 = Single URL_ID range: the header carries the first and the
           last URL_ID (4 bytes each), sections carry no URL_ID
  - 0x05 = URL_ID delta, variable length encoding
  - 0x06 = URL_ID delta, 2 bytes per section
  - 0x07 = URL_ID delta, 2 bytes per section; the header carries
           a 2-byte value giving the high 16 bits of the start URL_ID
  - 0x08 = URL_ID delta, 3 bytes per section
  - 0x09 = URL_ID delta, 1 byte per section; the header carries
           a 4-byte start URL_ID
  - 0x0A = Multiple URL_ID ranges: the header carries the number of
           ranges followed by (first,last) URL_ID pairs, 8 bytes each
*/

#define UDM_BLOB_COMP_NONE                            0x00
#define UDM_BLOB_COMP_DEFLATE                         0x01
#define UDM_BLOB_COMP_ZINT4                           0x02
#define UDM_BLOB_COMP_ZINT4_DEFLATE                   0x03
#define UDM_BLOB_COMP_SINGLE_RANGE                    0x04
#define UDM_BLOB_COMP_URLID_DELTA_VARIABLE            0x05
#define UDM_BLOB_COMP_URLID_DELTA_2BYTES              0x06
#define UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET  0x07
#define UDM_BLOB_COMP_URLID_DELTA_3BYTES              0x08
#define UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET  0x09
#define UDM_BLOB_COMP_URLID_RANGE_MULTI               0x0A
75 
76 
77 static inline int
UdmDSTRAppendCompressionType(UDM_DSTR * dstr,int compression_type)78 UdmDSTRAppendCompressionType(UDM_DSTR *dstr, int compression_type)
79 {
80   if (!UdmDSTRAppendINT4(dstr, 0xFFFFFFFF))
81     return 0;
82   return (compression_type <= UDM_BLOB_COMP_ZINT4_DEFLATE) ?
83          UdmDSTRAppendINT4(dstr, compression_type) :
84          UdmDSTRAppendINT2BE(dstr, compression_type);
85 }
86 
87 
88 /*******************************************/
89 
90 /*
91   If can do "indexer -Eblob" using RENAME TABLE.
92 */
93 static int
UdmBlobCanDoRename(UDM_DB * db)94 UdmBlobCanDoRename(UDM_DB *db)
95 {
96   return
97     (UdmSQLDBFlags(db) & UDM_SQL_HAVE_RENAME) &&
98     (UdmSQLDBFlags(db) & UDM_SQL_HAVE_CREATE_LIKE) &&
99     /* PgSQL can do RENAME only when "DROP TABLE IF EXISTS" is supported */
100     (UdmSQLDBType(db)!= UDM_DB_PGSQL || UdmSQLDBFlags(db) & UDM_SQL_HAVE_DROP_IF_EXISTS);
101 }
102 
103 
104 static udm_rc_t
UdmBlobCreateIndexRandomName(UDM_AGENT * A,UDM_DB * db,const char * table_name)105 UdmBlobCreateIndexRandomName(UDM_AGENT *A, UDM_DB *db, const char *table_name)
106 {
107   char qbuf[128];
108   /* Create index with an unique name */
109   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
110   {
111     udm_snprintf(qbuf, sizeof(qbuf),
112                  "ALTER TABLE %s ADD KEY (word)", table_name);
113   }
114   else
115   {
116     udm_snprintf(qbuf, sizeof(qbuf),
117                 "CREATE INDEX bdict_%d_%d ON %s (word)",
118                  (int) time(0), (int) (UdmStartTimer() % 0xFFFF), table_name);
119   }
120   return UdmDBSQLQuery(A, db, NULL, qbuf);
121 }
122 
123 
124 static int
UdmBlobGetTable(UDM_AGENT * A,UDM_DB * db)125 UdmBlobGetTable(UDM_AGENT *A, UDM_DB *db)
126 {
127   UDM_SQLRES SQLRes;
128   int rc;
129   const char *val;
130 
131   return(1);
132 
133   if (UDM_OK != UdmDBSQLQuery(A, db, &SQLRes, "SELECT n FROM bdictsw"))
134     return(1);
135 
136   if (! UdmSQLNumRows(&SQLRes) || ! (val = UdmSQLValue(&SQLRes, 0, 0))) rc = 2;
137   else if (*val != '1') rc = 3;
138   else rc = 4;
139 
140   UdmSQLFree(&SQLRes);
141   return(rc);
142 }
143 
144 
145 /*
146   This function returns "bdict" by default,
147   or the "bdict" parameter from DBAddr, if exists.
148 */
149 static const char *
UdmBlobGetTableNamePrefix(UDM_DB * db)150 UdmBlobGetTableNamePrefix(UDM_DB *db)
151 {
152   return UdmVarListFindStr(UdmSQLDBVars(db), "bdict", "bdict");
153 }
154 
155 
156 /*
157   This function is used when "indexer -Erewritelimit"
158   or "indexer -Erewriteurl" is called.
159 */
160 static size_t
UdmBlobGetTableForRewrite(UDM_AGENT * A,UDM_DB * db,char * dst,size_t dstlen)161 UdmBlobGetTableForRewrite(UDM_AGENT *A, UDM_DB *db, char *dst, size_t dstlen)
162 {
163   const char *prefix= UdmBlobGetTableNamePrefix(db);
164   return udm_snprintf(dst, dstlen, "%s", prefix);
165 }
166 
167 
168 static size_t
UdmBlobGetRTable(UDM_AGENT * A,UDM_DB * db,char * dst,size_t dstlen)169 UdmBlobGetRTable(UDM_AGENT *A, UDM_DB *db, char *dst, size_t dstlen)
170 {
171   const char *prefix= UdmBlobGetTableNamePrefix(db);
172   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
173     return udm_snprintf(dst, dstlen, "%s", prefix);
174   if (UdmBlobGetTable(A, db) == 3)
175     return udm_snprintf(dst, dstlen, "%s00", prefix);
176   return udm_snprintf(dst, dstlen, "%s", prefix);
177 }
178 
179 
180 static udm_rc_t
UdmBlobGetWTable(UDM_AGENT * A,UDM_DB * db,char * name,size_t namelen)181 UdmBlobGetWTable(UDM_AGENT *A, UDM_DB *db, char *name, size_t namelen)
182 {
183   udm_rc_t rc;
184 
185   if (UdmBlobCanDoRename(db))
186   {
187     if ((UDM_OK != (rc= UdmDBSQLDropTableIfExists(A, db, "bdict_tmp"))) ||
188         (UDM_OK != (rc= UdmDBSQLCopyStructure(A, db, "bdict", "bdict_tmp"))) ||
189         (UDM_OK != (rc= UdmBlobCreateIndexRandomName(A, db, "bdict_tmp"))))
190       return rc;
191     udm_snprintf(name, namelen, "bdict_tmp");
192     return UDM_OK;
193   }
194 
195   udm_snprintf(name, namelen, "%s", UdmBlobGetTableNamePrefix(db));
196   if (UdmBlobGetTable(A, db) == 4)
197     udm_snprintf(name, namelen, "%s00", UdmBlobGetTableNamePrefix(db));
198   return UDM_OK;
199 }
200 
201 
202 static udm_rc_t
UdmBlobSetTable(UDM_AGENT * A,UDM_DB * db)203 UdmBlobSetTable(UDM_AGENT *A, UDM_DB *db)
204 {
205   char qbuf[128];
206   udm_rc_t rc;
207   int t, n;
208   const char *table_name= UdmVarListFindBool(&A->Conf->Vars, "delta", UDM_FALSE) ?
209                           "bdict_delta" :
210                           UdmBlobGetTableNamePrefix(db);
211 
212   if (UdmBlobCanDoRename(db))
213   {
214     if (UDM_OK == (rc= UdmDBSQLDropTableIfExists(A, db, table_name)))
215       rc= UdmDBSQLRenameTable(A, db, "bdict_tmp", table_name);
216     return rc;
217   }
218 
219   t= UdmBlobGetTable(A, db);
220   if (t == 1) return(UDM_OK);
221   else if (t == 4) n = 0;
222   else n = 1;
223 
224   rc= UdmDBSQLQuery(A, db, NULL, "DELETE FROM bdictsw");
225   if (rc != UDM_OK) return(UDM_OK);
226   udm_snprintf(qbuf, sizeof(qbuf), "INSERT INTO bdictsw VALUES(%d)", n);
227   rc = UdmDBSQLQuery(A, db, NULL, qbuf);
228   if (rc != UDM_OK) return(UDM_OK);
229   return(UDM_OK);
230 }
231 
232 
/*
  URL_ID decoding state of one packed blob record.
  It is zeroed and filled from the record header by
  UdmBlobCoordsGetCompressionType(), then advanced by the
  unpacker while it walks through the sections.
*/
typedef struct udm_blob_cache_stat_st
{
  urlid_t cur_url_id;       /* Running URL_ID; start value comes from the header in some modes */
  urlid_t min_url_id;       /* Single-range mode: first URL_ID, read from the header */
  urlid_t max_url_id;       /* Single-range mode: last URL_ID, read from the header  */
  urlid_t max_url_id_delta; /* Not referenced in this part of the file - TODO confirm its use */
  urlid_t range_min_url_id; /* Multi-range mode: first URL_ID of the current range */
  urlid_t range_max_url_id; /* Multi-range mode: last URL_ID of the current range  */
  size_t ndistinct_url_ids_minus_one; /* Not referenced in this part of the file - TODO confirm its use */
  size_t compression_type;  /* One of UDM_BLOB_COMP_xxx (low 16 bits of the header value) */
  size_t range_offset;      /* Multi-range mode: offset of the range table from the record start */
  size_t nranges;           /* Multi-range mode: number of ranges not loaded yet */
} UDM_BLOB_CACHE_WORD_STAT;
246 
247 
/*
  Store the low 16 bits of i into dst[0..1],
  least significant byte first.
*/
static inline void
udm_put_int2(int i, unsigned char *dst)
{
  unsigned char *p= dst;
  *p++= (unsigned char) (i & 0xFF);
  *p= (unsigned char) (i >> 8);
}
254 
/*
  Read a 16-bit unsigned value from src[0..1],
  least significant byte first.
*/
static inline int
udm_get_int2(const unsigned char *src)
{
  int lo= (int) src[0];
  int hi= (int) src[1];
  return lo | (hi << 8);
}
260 
261 
/*
  Store the low 24 bits of i into dst[0..2],
  least significant byte first.
*/
static inline void
udm_put_int3(int i, unsigned char *dst)
{
  int n, shift;
  for (n= 0, shift= 0; n < 3; n++, shift+= 8)
    dst[n]= (unsigned char) ((i >> shift) & 0xFF);
}
269 
/*
  Read a 24-bit unsigned value from src[0..2],
  least significant byte first.
*/
static inline int
udm_get_int3(const unsigned char *src)
{
  int b0= (int) src[0];
  int b1= (int) src[1];
  int b2= (int) src[2];
  return b0 | (b1 << 8) | (b2 << 16);
}
275 
276 
/*
  Pack an integer into dst using one of the formats:
  0 = don't put anything
  1 = one byte (the value must be less than 256)
  2 = two bytes
  3 = three bytes
  4 = four bytes
  5 = variable length encoding, done by udm_coord_put()

  dst must have enough space to store the packed integer.
  Returns the number of bytes written. For format 5 this is
  whatever udm_coord_put() returns. Any format other than
  0..5 trips an assertion and returns 0.
*/
static inline size_t
udm_put_int_with_format(int i, unsigned char *dst, int format)
{
  switch (format)
  {
    case 1:
      UDM_ASSERT((unsigned int) i < 256);
      dst[0]= (char) (unsigned char) i;
      return 1;

    case 2:
      udm_put_int2(i, (unsigned char*) dst);
      return 2;

    case 3:
      udm_put_int3(i, (unsigned char*) dst);
      return 3;

    case 4:
      udm_put_int4(i, (unsigned char*) dst);
      return 4;

    case 5:
      return udm_coord_put(i, dst, dst + 4);

    default:
      break;
  }
  UDM_ASSERT(format == 0);
  return 0;
}
319 
320 
321 /*
322   Unpack coords when only minpos, maxpos and seclen is of interest
323 */
324 static UDM_COORD2 *
UdmBlobPackedCoordsUnpackMinMaxLen(const unsigned char * s,const unsigned char * e,size_t nrecs,UDM_COORD2 * C,UDM_COORD2 * Coord,const unsigned char ** end,int save_section_size,UDM_SEARCHSECTION * Section)325 UdmBlobPackedCoordsUnpackMinMaxLen(const unsigned char *s,
326                                    const unsigned char *e,
327                                    size_t nrecs,
328                                    UDM_COORD2 *C,
329                                    UDM_COORD2 *Coord,
330                                    const unsigned char **end,
331                                    int save_section_size,
332                                    UDM_SEARCHSECTION *Section)
333 {
334   size_t crd, nbytes;
335   if (save_section_size)
336   {
337     if (nrecs > 1)
338     {
339       s= udm_coord_sum(&crd, s, e, nrecs - 1); /* Sum middle coords */
340       C->pos+= crd;
341       Section->maxpos= C->pos;
342     }
343     else
344       Section->maxpos= C->pos; /* One coord, minpos=maxpos */
345 
346     if ((nbytes= udm_coord_get(&crd, s, e))) /* Get seclen */
347     {
348       s+= nbytes;
349       C->pos+= crd;
350       Section->seclen= C->pos;
351       Section->ncoords= nrecs;
352       Coord+= nrecs;
353     }
354     else
355     {
356       Section->seclen= 0;
357       Section->ncoords= 0;
358     }
359   }
360   else
361   {
362     s= udm_coord_sum(&crd, s, e, nrecs);      /* Sum middle coords  */
363     C->pos+= crd;
364     Section->maxpos= C->pos;
365     Section->seclen= 0;
366     Section->ncoords= nrecs + 1;
367     Coord+= nrecs + 1;
368   }
369   *end= s;
370   return Coord;
371 }
372 
373 
374 
375 static size_t
UdmBlobCoordsGetCompressionType(UDM_AGENT * A,UDM_BLOB_CACHE_WORD_STAT * Stat,const unsigned char * s,size_t length)376 UdmBlobCoordsGetCompressionType(UDM_AGENT *A,
377                                 UDM_BLOB_CACHE_WORD_STAT *Stat,
378                                 const unsigned char *s,
379                                 size_t length)
380 {
381   size_t header_size;
382   bzero((void*) Stat, sizeof(*Stat));
383   Stat->compression_type= (length > 10 && udm_get_int4(s) == 0xFFFFFFFF) ?
384                            udm_get_int4(s + 4) : 0;
385   header_size= (Stat->compression_type & 0xFFFF0000) ? 6 : 8;
386   Stat->compression_type&= 0x0000FFFF;
387   /*
388   fprintf(stderr, "Hdr size=%d cmpr=%08X len=%d\n", header_size, Stat->compression_type, length);
389   */
390   if (Stat->compression_type == UDM_BLOB_COMP_SINGLE_RANGE)
391   {
392     Stat->min_url_id= udm_get_int4(s + header_size);
393     Stat->max_url_id= udm_get_int4(s + header_size + 4);
394     Stat->cur_url_id= Stat->min_url_id;
395     UdmLog(A, UDM_LOG_DEBUG,
396            "Single-URLID-Range compression: %d-%d (%d docs)",
397            Stat->min_url_id, Stat->max_url_id,
398            Stat->max_url_id - Stat->min_url_id + 1);
399     return header_size + 8;
400   }
401   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_VARIABLE)
402   {
403     UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta compression");
404     return header_size;
405   }
406   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES)
407   {
408     UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta-2bytes compression");
409     return header_size;
410   }
411   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET)
412   {
413     Stat->cur_url_id= udm_get_int2(s + header_size) << 16;
414     UdmLog(A, UDM_LOG_DEBUG,
415            "URLID-Delta-2bytes-offset compression, offs=%d", Stat->cur_url_id);
416     return header_size + 2;
417   }
418   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET)
419   {
420     Stat->cur_url_id= udm_get_int4(s + header_size);
421     UdmLog(A, UDM_LOG_DEBUG,
422            "URLID-Delta-1byte-offset compression, offs=%d", Stat->cur_url_id);
423     return header_size + 4;
424   }
425   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_DELTA_3BYTES)
426   {
427     UdmLog(A, UDM_LOG_DEBUG, "URLID-Delta-2bytes compression");
428     return header_size;
429   }
430   else if (Stat->compression_type == UDM_BLOB_COMP_URLID_RANGE_MULTI)
431   {
432     Stat->nranges= udm_get_int4(s + header_size);
433     Stat->range_offset= header_size + 4;
434     UdmLog(A, UDM_LOG_DEBUG,
435            "URLID-Range-Multi compression, nranges=%d", (int) Stat->nranges);
436     return header_size + 4 + 8 * Stat->nranges;
437   }
438   else if (Stat->compression_type != UDM_BLOB_COMP_NONE)
439   {
440     UdmLog(A, UDM_LOG_DEBUG,
441            "Unknown coompression type: %08X", (int) Stat->compression_type);
442   }
443   return 0;
444 }
445 
446 
447 static size_t
UdmExpectedSectionCount(size_t compression_type,size_t length)448 UdmExpectedSectionCount(size_t compression_type, size_t length)
449 {
450   switch (compression_type)
451   {
452     case UDM_BLOB_COMP_NONE:
453       /*
454         Shortest section with no compression is 6 bytes:
455         - 4 bytes for URL id
456         - 1 byte for "ncoords"
457         - 1 byte for coord
458       */
459       return length / 6;
460     case UDM_BLOB_COMP_URLID_DELTA_VARIABLE:
461       /*
462         - 1..4 bytes URL id delta
463         - 1 byte for "ncoordss"
464         - 1 byte for coord
465       */
466       return length / 3;
467 
468     case UDM_BLOB_COMP_SINGLE_RANGE:
469     case UDM_BLOB_COMP_URLID_RANGE_MULTI:
470       /*
471         Shortest section with single-range compression is 2 byest:
472         - 1 byte for ncoords
473         - 1 byte for coord
474       */
475       return length / 2;
476     case UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET:
477       /* 1 byte url_id, 1 byte ncoorrds, 1 byte coords */
478       return length / 3;
479     case UDM_BLOB_COMP_URLID_DELTA_2BYTES:
480     case UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET:
481       /* 2 bytes url_id, 1 byte ncoords, 1 byte coord */
482       return length / 4;
483     case UDM_BLOB_COMP_URLID_DELTA_3BYTES:
484       /* 3 bytes url_id, 1 bbyte ncoords, 1 byte coord */
485       return length / 5;
486   }
487 
488   UDM_ASSERT(0); /* Should not normally get here */
489   return length;
490 }
491 
492 
/*
  Unpack one packed blob record into a list of search sections.

  A                 - agent, used for logging
  SectionList       - out: allocated here, filled with one section
                      per accepted URL, and with unpacked coords
  urls              - URL_ID limit; when urls->nurls is non-zero,
                      a section is accepted only if its URL_ID is in
                      the list (or is not in the list, if urls->exclude)
  SectionTemplate   - supplies secno, wordnum and order for all sections
  s, length         - the packed record, starting with an optional
                      compression header
  save_section_size - non-zero if the last packed record of a section
                      is the section length rather than a coord
  need_coords       - zero if individual coords are not needed;
                      only minpos, maxpos, seclen, ncoords are set then

  Each packed section consists of the URL_ID (encoded according to
  the compression type, or absent in the range modes), the number
  of records, and the records themselves.

  The function always returns UDM_OK; decoding simply stops
  when the data ends or looks broken.
*/
static udm_rc_t
UdmSectionListBlobCoordsUnpack(UDM_AGENT *A,
                               UDM_SEARCHSECTIONLIST *SectionList,
                               UDM_URLID_LIST *urls,
                               UDM_SEARCHSECTION *SectionTemplate,
                               const unsigned char *s,
                               size_t length,
                               int save_section_size,
                               int need_coords)
{
  size_t ncoords= 0, nurls= urls->nurls;
  const unsigned char *s0= s;
  const unsigned char *e= s + length;
  /* A section is decoded only if it starts before this point */
  const unsigned char *last_urlid_start= e - sizeof(urlid_t) - 1;
  UDM_COORD2 C, *Coord;
  unsigned char secno= SectionTemplate->secno;
  unsigned char wordnum= SectionTemplate->wordnum;
  unsigned char order= SectionTemplate->order;
  UDM_SEARCHSECTION *Section;
  UDM_BLOB_CACHE_WORD_STAT Stat;
  const unsigned char *range_ptr;
  int single_range, range_multi, compr_urlid_delta;
  int compr_urlid_delta_1bytes, compr_urlid_delta_2bytes, compr_urlid_delta_3bytes;
  size_t coords_alloced, sections_alloced;
  /* Parse the compression header, if any, and skip it */
  s+= UdmBlobCoordsGetCompressionType(A, &Stat, s, length);
  /* Range table of the multi-range mode (range_offset is 0 in other modes) */
  range_ptr= s0 + Stat.range_offset;

  UdmLog(A, UDM_LOG_DEBUG+1, "Secno=%d len=%d",
         SectionTemplate->secno, (int) length);
  single_range= (Stat.compression_type == UDM_BLOB_COMP_SINGLE_RANGE);
  range_multi= (Stat.compression_type == UDM_BLOB_COMP_URLID_RANGE_MULTI);
  compr_urlid_delta= (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_VARIABLE);
  compr_urlid_delta_2bytes= (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES) ||
                            (Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_2BYTES_WITH_OFFSET);
  compr_urlid_delta_3bytes= Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_3BYTES;
  compr_urlid_delta_1bytes= Stat.compression_type == UDM_BLOB_COMP_URLID_DELTA_1BYTES_WITH_OFFSET;

  /*
    In the range modes sections carry no URL_ID,
    so a section can start at the very last but one byte.
    NOTE(review): the delta modes keep the limit computed for
    a 4-byte URL_ID although their URL_ID takes 1..3 bytes, so
    short trailing sections may be left undecoded - confirm.
  */
  if (single_range || range_multi)
    last_urlid_start= e - 1; /* TODO: check other modes */

  /* Upper estimates: one coord per byte, shortest possible sections */
  coords_alloced= length;
  sections_alloced= UdmExpectedSectionCount(Stat.compression_type, length);
  UdmSearchSectionListAlloc(SectionList, coords_alloced, sections_alloced);
  Coord= SectionList->Coord;
  Section= SectionList->Section;

  /*
    A non-compressed chunk consists of:
    - sizeof(urlid_t)
    - at least one byte for length
  */
  for (C.order= order ; s < last_urlid_start; )
  {
    int active= 1; /* Cleared if the URL_ID is rejected by the limit */
    size_t nrecs;

    Section->Coord= Coord;
    Section->secno= secno;
    /* Step 1: get the URL_ID of the section */
    if (Stat.compression_type)
    {
      if (single_range)
      {
        /* URL_IDs are consecutive, starting from the header value */
        Section->url_id= Stat.cur_url_id++;
      }
      else if (range_multi)
      {
        /*
          Load the next range when the current one is exhausted
          and there are ranges left, otherwise step to the next URL_ID.
          Both cur_url_id and range_max_url_id are 0 on the first
          iteration, so the first range is loaded from the table.
        */
        if (Stat.cur_url_id == Stat.range_max_url_id && Stat.nranges--)
        {
          Stat.range_min_url_id= udm_get_int4(range_ptr); range_ptr+= 4;
          Stat.range_max_url_id= udm_get_int4(range_ptr); range_ptr+= 4;
          Stat.cur_url_id= Stat.range_min_url_id;
          /*
          fprintf(stderr, "loading range: %d-%d nranges remain: %d\n",
                  Stat.range_min_url_id, Stat.range_max_url_id, Stat.nranges);
          */
        }
        else
        {
          Stat.cur_url_id++;
        }
        Section->url_id= Stat.cur_url_id;
      }
      else if (compr_urlid_delta)
      {
        /* Variable length delta from the previous URL_ID */
        size_t delta, nbytes= udm_coord_get(&delta, s, e);
        s+= nbytes;
        if (!nbytes)
          break;
        Stat.cur_url_id+= delta;
        Section->url_id= Stat.cur_url_id;
      }
      else if (compr_urlid_delta_1bytes)
      {
        /* One byte delta from the previous URL_ID */
        size_t delta= (unsigned char) *s;
        s++;
        Stat.cur_url_id+= delta;
        Section->url_id= Stat.cur_url_id;
        /*
        fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
        */
      }
      else if (compr_urlid_delta_2bytes)
      {
        /* Two byte delta from the previous URL_ID */
        size_t delta= udm_get_int2(s);
        s+= 2;
        Stat.cur_url_id+= delta;
        Section->url_id= Stat.cur_url_id;
        /*
        fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
        */
      }
      else if (compr_urlid_delta_3bytes)
      {
        /* Three byte delta from the previous URL_ID */
        size_t delta= udm_get_int3(s);
        s+= 3;
        Stat.cur_url_id+= delta;
        Section->url_id= Stat.cur_url_id;
        /*
        fprintf(stderr, "url_id=%d delta=%d\n", Stat.cur_url_id, delta);
        */
      }
    }
    else
    {
      /* No compression: a full 4-byte URL_ID */
      Section->url_id= (urlid_t) udm_get_int4(s);
      s+= 4;
    }
    Section->wordnum= wordnum;
    Section->order= order;

    /* Step 2: apply the URL_ID limit, if any */
    if (nurls)
    {
      void *found= UdmBSearch(&Section->url_id, urls->urls, urls->nurls,
                              sizeof(urlid_t), (udm_qsort_cmp)UdmCmpURLID);
      if (found && urls->exclude)
        active= 0;
      if (!found && !urls->exclude)
        active= 0;
    }

    /* Get number of coords */
    if (*s < 128)
    {
      /* Fast path: values below 128 are stored as a single byte */
      nrecs= *s++;
    }
    else
    {
      size_t nbytes= udm_coord_get(&nrecs, s, e);
      if (!nbytes) break;
      s+= nbytes;
    }

    if (!nrecs) /* extra safety */
      break;

    if (!active)
    {
      /*
        Skip the records of a rejected URL_ID. "Section" is not
        advanced, so its slot is reused by the next section.
      */
      s= udm_coord_skip(s, e, nrecs);
      continue;
    }

    ncoords+= nrecs;

    /* The trailing "section length" record is not counted as a coord */
    if (save_section_size && nrecs > 1)
      ncoords--;

    Section->PackedCoord= s;

    /* Get first coord and put into S->minpos */
    if (*s < 128)
    {
      C.pos= *s++;
    }
    else
    {
      size_t crd;
      size_t nbytes= udm_coord_get(&crd, s, e);
      if (!nbytes) break;
      s+= nbytes;
      C.pos= crd;
    }

    Section->minpos= C.pos;

    /*
      If no coords anymore.
      Maybe the "section length" record didn't fit
      into "64K words per section" limit.
      Add a section with a single coord.
      NOTE(review): this comment used to say "seclen=0", but
      the code stores the position of the only record in seclen -
      confirm which one is intended.
    */
    if (!--nrecs)
    {
      Section->seclen= C.pos;
      Section->ncoords= 1;
      Section->maxpos= C.pos;
      if (need_coords)
        *Coord= C;
      Coord++;
      Section++;
      continue;
    }

    if (!need_coords) /* Does not need coords, e.g. one word search */
    {
      Coord= UdmBlobPackedCoordsUnpackMinMaxLen(s, e, nrecs, &C, Coord, &s,
                                                save_section_size, Section);
      Section++;
      continue;
    }

    *Coord++= C; /* Add first coord */

    /* Unpack the other coordinates */
    Coord= UdmPackedCoordsToUnpackedCoords(s, e, nrecs, &C, Coord, &s);

    /* Set section length */
    nrecs= Coord - Section->Coord; /* Number of records actually unpacked */
    if (save_section_size)
    {
      /*
        The last unpacked record is the section length: take it
        back from the coord array and use the one before it as maxpos.
        NOTE(review): an earlier version of this comment referred to
        a "Coord > Coord0" check (URL could be skipped because of
        limit); no such check is present here.
      */
      Section->seclen= ((--Coord)->pos);
      Section->ncoords= nrecs - 1;
      Section->maxpos= Coord[-1].pos;;
    }
    else
    {
      Section->seclen= 0;
      Section->ncoords= nrecs;
      Section->maxpos= C.pos;
    }
    Section++;
  }

  SectionList->ncoords= ncoords;
  SectionList->nsections= Section - SectionList->Section;

  /* The estimates made before the allocation must hold */
  UDM_ASSERT(SectionList->ncoords <= coords_alloced);
  UDM_ASSERT(SectionList->nsections <= sections_alloced);

  return UDM_OK;
}
737 
738 
739 static udm_rc_t
UdmDeleteWordsFromURLBlob(UDM_AGENT * Indexer,UDM_DB * db,urlid_t url_id)740 UdmDeleteWordsFromURLBlob(UDM_AGENT *Indexer, UDM_DB *db, urlid_t url_id)
741 {
742   return UDM_OK;
743 }
744 
745 
746 static udm_rc_t
UdmStoreWordsBlob(UDM_AGENT * Indexer,UDM_DB * db,UDM_DOCUMENT * Doc)747 UdmStoreWordsBlob(UDM_AGENT *Indexer, UDM_DB *db, UDM_DOCUMENT *Doc)
748 {
749   return UDM_OK;
750 }
751 
752 
753 static const char*
UdmBlobModeInflateOrSelf(UDM_AGENT * A,UDM_DSTR * buf,const char * name,const char * src,size_t * len)754 UdmBlobModeInflateOrSelf(UDM_AGENT *A,
755                          UDM_DSTR *buf, const char *name,
756                          const char *src, size_t *len)
757 {
758   int use_zint4;
759   int use_deflate;
760   /* fprintf(stderr, "here, len=%d src=%p\n", *len, src); */
761   if (!src || *len < 8 ||
762       (unsigned char)src[0] != 0xFF ||
763       (unsigned char)src[1] != 0xFF ||
764       (unsigned char)src[2] != 0xFF ||
765       (unsigned char)src[3] != 0xFF ||
766       (src[4] != 1 && src[4] != 2 && src[4] != 3) ||
767       src[5] != 0x00 ||
768       src[6] != 0x00 ||
769       src[7] != 0x00)
770     return src;
771   use_zint4= src[4] == 2 || src[4] == 3;
772   use_deflate= src[4] == 1 || src[4] == 3;
773   src+= 8;
774   *len-= 8;
775   if (name)
776     UdmLog(A, UDM_LOG_DEBUG, "Unpacking '%s'", name);
777   if (use_deflate)
778   {
779     udm_timer_t ticks= UdmStartTimer();
780     size_t len0= len[0];
781     UdmLog(A,UDM_LOG_DEBUG, "Deflate header detected");
782     UdmDSTRReset(buf);
783     if (UDM_OK == UdmDSTRAppendInflate(buf, src, *len))
784     {
785       src= UdmDSTRPtr(buf);
786       len[0]= UdmDSTRLength(buf);
787       UdmLog(A, UDM_LOG_DEBUG, "%d to %d bytes inflated",
788                                  (int) len0, (int) len[0]);
789     }
790     UdmLog(A, UDM_LOG_DEBUG, "Inflating done: %.2f", UdmStopTimer(&ticks));
791   }
792   if (*len >= 5 && use_zint4)
793   {
794     udm_timer_t ticks= UdmStartTimer();
795     char *zint4_buf= (char*) UdmMalloc(*len);
796     UdmLog(A, UDM_LOG_DEBUG, "zint4 header detected (zint4ed data length: %d)",
797                              (int) (*len));
798     if (! zint4_buf)
799     {
800       UdmLog(A, UDM_LOG_ERROR, "Malloc failed. Requested %u bytes",
801                                (int) (*len));
802       return(NULL);
803     }
804     memcpy(zint4_buf, src, *len);
805     if (buf->size_alloced < *len * 7 && UdmDSTRRealloc(buf, *len * 7) != UDM_OK)
806     {
807       UdmFree(zint4_buf);
808       UdmLog(A, UDM_LOG_ERROR, "UdmDSTRRealloc failed. Requested %u bytes",
809              (int) (*len * 7));
810       return(NULL);
811     }
812     *len= udm_dezint4(zint4_buf, (int4 *) buf->Val.str, *len) * 4;
813     src= UdmDSTRPtr(buf);
814     UdmFree(zint4_buf);
815     UdmLog(A, UDM_LOG_ERROR, "dezint4ed data length: %d", (int) (*len));
816     UdmLog(A, UDM_LOG_ERROR, "dezint4 done: %.2f", UdmStopTimer(&ticks));
817   }
818   return src;
819 }
820 
821 
822 static void
UdmBlobModeInflateOrAlloc(UDM_AGENT * A,UDM_DSTR * buf,const char * name,UDM_STR * row,UDM_CONST_STR * dst)823 UdmBlobModeInflateOrAlloc(UDM_AGENT *A, UDM_DSTR *buf, const char *name,
824                           UDM_STR *row, UDM_CONST_STR *dst)
825 {
826   dst->str= UdmBlobModeInflateOrSelf(A, buf, name, row->str, &row->length);
827   if (dst->str == row->str)
828   {
829     UdmDSTRRealloc(buf, row->length);
830     memcpy(buf->Val.str, row->str, row->length);
831     buf->Val.length= row->length;
832     dst->str= UdmDSTRPtr(buf);
833   }
834   dst->length= row->length;
835 }
836 
837 
838 static udm_rc_t
UdmInflateBlobModeSQLRes(UDM_AGENT * A,UDM_SQLRES * src)839 UdmInflateBlobModeSQLRes(UDM_AGENT *A, UDM_SQLRES *src)
840 {
841   UDM_DSTR ibuf;
842   size_t row;
843   UdmDSTRInit(&ibuf, 1024);
844   for (row= 0; row < src->nRows; row++)
845   {
846     size_t len= UdmSQLLen(src, row, 1);
847     const char *val= UdmSQLValue(src, row, 1);
848     const char *iflt;
849     iflt= UdmBlobModeInflateOrSelf(A, &ibuf, NULL, val, &len);
850     if (iflt != val)
851     {
852       size_t offs= src->nCols*row + 1;
853       UdmFree(src->Items[offs].str);
854       src->Items[offs].str= (char*) UdmMalloc(len + 1);
855       memcpy(src->Items[offs].str, iflt, len);
856       src->Items[offs].length= len;
857       src->Items[offs].str[len]= '\0';
858     }
859   }
860   UdmDSTRFree(&ibuf);
861   return UDM_OK;
862 }
863 
864 
865 static udm_rc_t
UdmAddCollationMatch(UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,const char * word,size_t count)866 UdmAddCollationMatch(UDM_QUERY *Query, UDM_FINDWORD_ARGS *args,
867                      const char *word, size_t count)
868 {
869   /*
870     If match is not full, then we don't know whether
871     the word is a substring or a collation match.
872     Let's assume it is a substring, to avoid long
873     word lists in $(WE).
874   */
875   if (args->Word.Param.match_mode == UDM_MATCH_FULL ||
876       args->Word.Param.match_mode == UDM_MATCH_RANGE)
877   {
878     UDM_WIDEWORD_PARAM Param= Query->Res.WWList.Word[args->Word.Param.order].Param;
879     Param.origin= UDM_WORD_ORIGIN_COLLATION;
880     Param.count= count;
881     UdmWideWordListAddLike(&args->CollationMatches, &Param, word);
882   }
883   return UDM_OK;
884 }
885 
886 
887 static udm_rc_t
UdmBlobAddCoords(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,UDM_SQLRES * SQLRes)888 UdmBlobAddCoords(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
889                  UDM_FINDWORD_ARGS *args,
890                  UDM_SQLRES *SQLRes)
891 {
892   size_t numrows= UdmSQLNumRows(SQLRes);
893   size_t i;
894   char *wf= args->query_param.wf;
895   UDM_URLID_LIST *urls= &args->urls;
896   int need_coords= args->query_param.NewVersion ?
897                    args->need_coords : (Query->Res.WWList.nwords > 1);
898   udm_bool_t save_section_size= args->query_param.SaveSectionSize;
899   UDM_SEARCHSECTION Section;
900 
901   bzero((void*) &Section, sizeof(Section));
902   Section.wordnum= args->Word.Param.order & 0xFF;
903   Section.order= Query->Res.WWList.Word[Section.wordnum].Param.order;
904 
905   for (i= 0; i < numrows; i++)
906   {
907     const unsigned char *s= (const unsigned char *)UdmSQLValue(SQLRes, i, 1);
908     size_t length= UdmSQLLen(SQLRes, i, 1);
909     unsigned char secno= UDM_ATOI(UdmSQLValue(SQLRes, i, 0));
910     const char *cmatch= UdmSQLValue(SQLRes, i, 2);
911     UDM_SEARCHSECTIONLIST SectionList;
912 
913     if (!wf[secno])
914       continue;
915 
916     Section.secno= secno;
917 
918     UdmSectionListBlobCoordsUnpack(A, &SectionList,
919                                    urls, &Section,
920                                    s, length,
921                                    save_section_size,
922                                    need_coords);
923 
924 #ifdef HAVE_DEBUG
925     if (UdmVarListFindBool(UdmSQLDBVars(db), "DebugSectionList", UDM_FALSE))
926       UdmSearchSectionListPrint(&SectionList);
927 #endif
928 
929     if (SectionList.nsections && SectionList.ncoords)
930     {
931       UdmSearchSectionListListAdd(&args->SearchSectionListList, &SectionList);
932       args->Word.Param.count+= SectionList.ncoords;
933       UdmAddCollationMatch(Query, args, cmatch, SectionList.ncoords);
934     }
935     else
936     {
937       UdmSearchSectionListFree(&SectionList);
938     }
939   }
940 
941   return UDM_OK;
942 }
943 
944 
945 static udm_rc_t
UdmFindWordBlobFromTable(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,const char * table_name)946 UdmFindWordBlobFromTable(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
947                          UDM_FINDWORD_ARGS *args, const char *table_name)
948 {
949   char qbuf[4096];
950   char secno[32]= "";
951   char special[32]= "";
952   udm_timer_t ticks;
953   UDM_SQLRES SQLRes;
954   udm_rc_t rc;
955 
956   if (args->urls.empty)
957   {
958     UdmLog(A, UDM_LOG_DEBUG,
959            "Not searching '%s': Base URL limit is empty", table_name);
960     return UDM_OK;
961   }
962 
963   ticks= UdmStartTimer();
964   UdmLog(A, UDM_LOG_DEBUG, "Start fetching");
965   if (args->Word.Param.secno)
966     udm_snprintf(secno, sizeof(secno), " AND secno=%d", (int) args->Word.Param.secno);
967   /*
968     When performing substring or number search,
969     don't include special data, like '##last_mod_time' or '##rec_id'
970   */
971   if (args->Word.Param.match_mode != UDM_MATCH_FULL)
972     udm_snprintf(special, sizeof(special), " AND word NOT LIKE '##%%'");
973   udm_snprintf(qbuf, sizeof(qbuf),
974                "SELECT secno,coords,word FROM %s WHERE %s%s%s",
975                table_name, args->cmparg, secno, special);
976   if (UDM_OK != (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)))
977     return rc;
978   UdmLog(A, UDM_LOG_DEBUG, "%-30s%.2f", "Stop  fetching", UdmStopTimer(&ticks));
979 
980   ticks= UdmStartTimer();
981   UdmLog(A, UDM_LOG_DEBUG, "Start BlobAddCoords");
982   UdmInflateBlobModeSQLRes(A, &SQLRes);
983   UdmBlobAddCoords(A, db, Query, args, &SQLRes);
984   UdmLog(A, UDM_LOG_DEBUG, "%-30s%.2f", "Stop  BlobAddCoords", UdmStopTimer(&ticks));
985   if (args->query_param.NewVersion)
986   {
987     UdmSQLResListAdd(&args->SQLResults, &SQLRes);
988   }
989   else
990   {
991     UdmSQLFree(&SQLRes);
992   }
993   return(UDM_OK);
994 }
995 
996 
997 static udm_rc_t
UdmFindWordBlobSimple(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)998 UdmFindWordBlobSimple(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
999                       UDM_FINDWORD_ARGS *args)
1000 {
1001   char tablename[64];
1002   udm_rc_t rc;
1003   int delta= UdmVarListFindBool(&A->Conf->Vars, "UseDelta", UDM_FALSE);
1004   UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
1005   if (UDM_OK != (rc= UdmFindWordBlobFromTable(A, db, Query, args, tablename)))
1006     return rc;
1007   if (delta && UDM_OK != (rc= UdmFindWordBlobFromTable(A, db, Query,
1008                                                        args, "bdict_delta")))
1009     return rc;
1010   return UDM_OK;
1011 }
1012 
1013 
1014 static udm_rc_t
UdmLoadSlowLimitWithSort(UDM_AGENT * A,UDM_DB * db,UDM_URLID_LIST * list,const char * q)1015 UdmLoadSlowLimitWithSort(UDM_AGENT *A, UDM_DB *db, UDM_URLID_LIST *list, const char *q)
1016 {
1017   udm_rc_t rc= UdmLoadSlowLimit(A, db, list, q);
1018   if (rc == UDM_OK)
1019     UdmURLIdListSort(list);
1020   return rc;
1021 }
1022 
1023 
1024 static udm_rc_t
UdmBlobLoadLiveUpdateLimitLoad(UDM_AGENT * A,UDM_DB * db,UDM_FINDWORD_ARGS * args)1025 UdmBlobLoadLiveUpdateLimitLoad(UDM_AGENT *A, UDM_DB *db, UDM_FINDWORD_ARGS *args)
1026 {
1027   udm_rc_t rc;
1028   int ts= 0;
1029   udm_timer_t ticks;
1030   char qbuf[128];
1031   UDM_ASSERT(UdmSQLDBMode(db) == UDM_SQLDBMODE_BLOB);
1032   ticks= UdmStartTimer();
1033   UdmLog(A, UDM_LOG_DEBUG, "Start loading LiveUpdate url_id list");
1034   if (UDM_OK != (rc= UdmBlobReadTimestamp(A, db, &ts, 0)))
1035     return rc;
1036   args->live_updates_ts= ts;
1037   udm_snprintf(qbuf, sizeof(qbuf),
1038                "SELECT url_id FROM cachedcopy WHERE ts>=%d", ts);
1039   if (UDM_OK != (rc= UdmLoadSlowLimitWithSort(A, db,
1040                                               &args->live_update_deleted_urls,
1041                                               qbuf)))
1042     return rc;
1043   UdmLog(A, UDM_LOG_DEBUG,
1044          "Stop loading LiveUpdate url_id list: %.02f, %d updated docs found",
1045          UdmStopTimer(&ticks), (int) args->live_update_deleted_urls.nurls);
1046   args->live_update_deleted_urls.exclude= 1;
1047   UdmURLIdListCopy(&args->live_update_active_urls, &args->urls);
1048   UdmURLIdListMerge(&args->urls, &args->live_update_deleted_urls);
1049   return UDM_OK;
1050 }
1051 
1052 
1053 static udm_rc_t
UdmFindWordBlobLiveUpdates(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)1054 UdmFindWordBlobLiveUpdates(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1055                            UDM_FINDWORD_ARGS *args)
1056 {
1057   udm_rc_t rc;
1058   if (!(UDM_OK == (rc= UdmFindWordBlobSimple(A, db, Query, args))) ||
1059       !(UDM_OK == (rc= UdmFindWordRawBlobDelta(A, db, Query, args))))
1060     goto ret;
1061 
1062 ret:
1063   return rc;
1064 }
1065 
1066 
1067 static udm_rc_t
UdmFindWordBlob(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)1068 UdmFindWordBlob(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1069                 UDM_FINDWORD_ARGS *args)
1070 {
1071   return args->live_updates ?
1072          UdmFindWordBlobLiveUpdates(A, db, Query, args) :
1073          UdmFindWordBlobSimple(A, db, Query, args);
1074 }
1075 
1076 
1077 udm_rc_t
UdmBlobReadTimestamp(UDM_AGENT * A,UDM_DB * db,int * ts,int def)1078 UdmBlobReadTimestamp(UDM_AGENT *A, UDM_DB *db, int *ts, int def)
1079 {
1080   udm_rc_t rc;
1081   char lname[]= "##ts";
1082   char qbuf[64], tablename[64];
1083   UDM_SQLRES SQLRes;
1084 
1085   UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
1086   udm_snprintf(qbuf, sizeof(qbuf), "SELECT coords FROM %s WHERE word='%s'",
1087                tablename, lname);
1088   if (UDM_OK == (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)) &&
1089       UdmSQLNumRows(&SQLRes) > 0)
1090     *ts= atoi(UdmSQLValue(&SQLRes, 0,0));
1091   else
1092     *ts= def;
1093   UdmSQLFree(&SQLRes);
1094   return rc;
1095 }
1096 
1097 
1098 static udm_rc_t
UdmBlobWriteWordPrepare(UDM_AGENT * A,UDM_DB * db,const char * table)1099 UdmBlobWriteWordPrepare(UDM_AGENT *A, UDM_DB *db, const char *table)
1100 {
1101   udm_rc_t rc;
1102   char qbuf[128];
1103   const char *int_cast= UdmSQLDBType(db) == UDM_DB_PGSQL ? "::integer" : "";
1104   udm_snprintf(qbuf, sizeof(qbuf),
1105                "INSERT INTO %s (word,secno,coords) "
1106                "VALUES(%s, %s%s, %s)",
1107                table,
1108                UdmDBSQLParamPlaceHolder(db, 1),
1109                UdmDBSQLParamPlaceHolder(db, 2),
1110                int_cast,
1111                UdmDBSQLParamPlaceHolder(db, 3));
1112   rc= UdmDBSQLPrepare(A, db, qbuf);
1113   return rc;
1114 }
1115 
1116 
#if 0
/*
  Disabled code.
  Check whether "str" is well formed in the local character set.
  When only a prefix of the string is well formed, the remaining
  bytes are logged in hex form at debug level.

  Returns UDM_TRUE if a warning was logged (the string is NOT
  entirely well formed), UDM_FALSE otherwise.
*/
static udm_bool_t
check_well_formed_length_with_warn(UDM_AGENT *A,
                                   const char *str, size_t length)
{
  UDM_DSTR hex;
  size_t good= A->Conf->lcs->cset->well_formed_length(A->Conf->lcs,
                                                      str, length,
                                                      UDM_RECODE_HTML);
  if (good >= length)
    return UDM_FALSE;

  UdmDSTRInit(&hex, 128);
  UdmDSTRAppendHex(&hex, str + good, length - good);
  UdmLog(A, UDM_LOG_DEBUG, "Not a well formed word: '%s'", hex.Val.str);
  UdmDSTRFree(&hex);
  return UDM_TRUE;
}
#endif
1137 
1138 
1139 static udm_rc_t
UdmBlobWriteWordUsingBind(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,uint4 secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1140 UdmBlobWriteWordUsingBind(UDM_AGENT *A, UDM_DB *db,  const char *table,
1141                           const char *word, uint4 secno,
1142                           const char *data, size_t len, UDM_DSTR *buf,
1143                           int auto_prepare)
1144 {
1145   udm_rc_t rc;
1146   size_t wordlen= strlen(word);
1147   if ((auto_prepare &&
1148        UDM_OK != (rc= UdmBlobWriteWordPrepare(A, db, table))) ||
1149        UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 1, word, (int) wordlen, UDM_SQLTYPE_VARCHAR)) ||
1150        UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 2, &secno, (int) sizeof(secno), UDM_SQLTYPE_INT32)) ||
1151        UDM_OK != (rc= UdmDBSQLBindParameter(A, db, 3, data, (int) len, UDM_SQLTYPE_LONGVARBINARY) ) ||
1152        UDM_OK != (rc= UdmDBSQLExecute(A, db)) ||
1153        (auto_prepare && UDM_OK != (rc= UdmDBSQLStmtFree(A, db))))
1154     return rc;
1155 
1156   return UDM_OK;
1157 }
1158 
1159 
1160 static void
UdmDSTREncodeForDB(UDM_AGENT * A,UDM_DB * db,UDM_DSTR * buf,const char * src,size_t length)1161 UdmDSTREncodeForDB(UDM_AGENT *A, UDM_DB *db, UDM_DSTR *buf,
1162                    const char *src, size_t length)
1163 {
1164   if (UdmSQLDBType(db) == UDM_DB_PGSQL)
1165   {
1166     char *s= buf->Val.str + buf->Val.length;
1167     buf->Val.length+= UdmDBSQLBinEscStr(A, db, s, buf->size_alloced, src, length);
1168   }
1169   else
1170     UdmDSTRAppendHex(buf, src, length);
1171 }
1172 
1173 
1174 static udm_rc_t
UdmBlobWriteWordUsingEncoding(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1175 UdmBlobWriteWordUsingEncoding(UDM_AGENT *A, UDM_DB *db,  const char *table,
1176                               const char *word, size_t secno,
1177                               const char *data, size_t len, UDM_DSTR *buf,
1178                               int auto_prepare)
1179 {
1180   udm_rc_t rc;
1181   size_t escape_factor= UdmSQLDBType(db) == UDM_DB_PGSQL ? 5 : 2;
1182   const char *pf= UdmSQLDBType(db) == UDM_DB_PGSQL ? "'" : "0x";
1183   const char *sf= UdmSQLDBType(db) == UDM_DB_PGSQL ? "'" : "";
1184   const char *E= (UdmSQLDBDriver(db) == UDM_DBAPI_PGSQL && UdmSQLDBVersion(db) >= 80101) ? "E" : "";
1185   size_t nbytes= 100 + len * escape_factor + 1;
1186 
1187   if (UdmSQLDBFlags(db) & UDM_SQL_HAVE_STDHEX) /* X'AABBCC' syntax */
1188   {
1189     pf= "X'";
1190     sf= "'";
1191   }
1192   else if (UdmSQLDBFlags(db) & UDM_SQL_HAVE_BLOB_AS_HEX) /* 'AABBCC' syntax */
1193   {
1194     pf= "'";
1195     sf= "'";
1196   }
1197 
1198   UdmDSTRReset(buf);
1199 
1200   if (UdmDSTRAlloc(buf, nbytes))
1201   {
1202     UdmLog(A, UDM_LOG_ERROR,
1203            "BlobWriteWordUsingEncoding: DSTRAlloc(%d) failed: "
1204            "word='%s' secno=%d length=%d",
1205            (int) nbytes, word, (int) secno, (int) len);
1206     return UDM_OK; /* Skip this word - try to continue */
1207   }
1208   UdmDSTRAppendf(buf, "INSERT INTO %s VALUES('%s', %d, %s%s",
1209                  table, word, (int) secno, E, pf);
1210   UdmDSTREncodeForDB(A, db, buf, data, len);
1211   UdmDSTRAppendf(buf, "%s)", sf);
1212   if (UDM_OK != (rc= UdmDBSQLQuery(A, db, NULL, UdmDSTRPtr(buf))))
1213     return rc;
1214 
1215   UdmDSTRReset(buf);
1216 
1217   return UDM_OK;
1218 }
1219 
1220 
1221 static udm_rc_t
UdmBlobWriteWordUsingMultiInsert(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare)1222 UdmBlobWriteWordUsingMultiInsert(UDM_AGENT *A, UDM_DB *db,  const char *table,
1223                                  const char *word, size_t secno,
1224                                  const char *data, size_t len, UDM_DSTR *buf,
1225                                  int auto_prepare)
1226 {
1227   const char *comma= ",";
1228   size_t escape_factor= 2;
1229   size_t nbytes= UdmDSTRLength(buf) + 100 + len * escape_factor + 1;
1230 
1231   if (UdmDSTRRealloc(buf, nbytes))
1232   {
1233     UdmLog(A, UDM_LOG_ERROR, "DSTRAlloc(%d) failed: word='%s' secno=%d len=%d",
1234             (int) nbytes, word, (int) secno, (int) len);
1235     return UDM_ERROR;
1236   }
1237 
1238   if (!UdmDSTRLength(buf))
1239   {
1240     UdmDSTRAppendf(buf, "INSERT INTO %s VALUES ", table);
1241     comma= "";
1242   }
1243 
1244   UdmDSTRAppendf(buf, "%s('%s',%d,0x", comma, word, (int) secno);
1245   UdmDSTRAppendHex(buf, data, len);
1246   UdmDSTRAppendf(buf, ")");
1247   return UDM_OK;
1248 }
1249 
1250 
1251 static udm_rc_t
UdmBlobWriteWord(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,int auto_prepare,int use_multi_insert)1252 UdmBlobWriteWord(UDM_AGENT *A, UDM_DB *db, const char *table,
1253                  const char *word, size_t secno,
1254                  const char *data, size_t len, UDM_DSTR *buf,
1255                  int auto_prepare, int use_multi_insert)
1256 {
1257   udm_rc_t rc;
1258   int use_bind= UdmSQLDBFlags(db) & UDM_SQL_HAVE_BIND_BINARY;
1259 
1260   rc= use_multi_insert ?
1261       UdmBlobWriteWordUsingMultiInsert(A, db, table, word, secno, data, len,
1262                                        buf, auto_prepare) :
1263       use_bind ?
1264       UdmBlobWriteWordUsingBind(A, db, table, word, secno, data, len,
1265                                 buf, auto_prepare):
1266       UdmBlobWriteWordUsingEncoding(A, db, table, word, secno, data, len,
1267                                     buf, auto_prepare);
1268   return rc;
1269 }
1270 
1271 
1272 static udm_rc_t
UdmBlobWriteWordCmpr(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,size_t secno,const char * data,size_t len,UDM_DSTR * buf,UDM_DSTR * z,int use_zint4,int auto_prepare,int allow_multi_insert)1273 UdmBlobWriteWordCmpr(UDM_AGENT *A, UDM_DB *db, const char *table,
1274                      const char *word, size_t secno,
1275                      const char *data, size_t len,
1276                      UDM_DSTR *buf, UDM_DSTR *z,
1277                      int use_zint4,
1278                      int auto_prepare,
1279                      int allow_multi_insert)
1280 {
1281 #ifdef HAVE_ZLIB
1282   if (z && len > 256)
1283   {
1284     UdmDSTRReset(z);
1285     UdmDSTRRealloc(z, len + 8 + 1); /* 8 for two INTs */
1286     /* Append Format version */
1287 #if 0
1288     if (use_zint4)
1289     {
1290       udm_rc_t dummy;
1291       /* Something is wrong here: why UdmDeflate? */
1292       UdmDSTRAppendCompressionType(z, UDM_BLOB_COMP_ZINT4_DEFLATE);
1293       UdmDSTRAppendDeflate(z, data + 8, len - 8);
1294     }
1295     else
1296 #endif
1297     {
1298       UdmDSTRAppendCompressionType(z, UDM_BLOB_COMP_DEFLATE);
1299       UdmDSTRAppendDeflate(z, data, len);
1300     }
1301     if (UdmDSTRLength(z) < len)
1302     {
1303       data= UdmDSTRPtr(z);
1304       len= UdmDSTRLength(z);
1305     }
1306   }
1307 #endif
1308   return UdmBlobWriteWord(A, db, table, word, secno, data, len, buf,
1309                           auto_prepare, allow_multi_insert);
1310 }
1311 
1312 
1313 /*
1314   The word must not require escaping!
1315 */
1316 static udm_rc_t
UdmBlobDeleteWordFromTable(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word)1317 UdmBlobDeleteWordFromTable(UDM_AGENT *A, UDM_DB *db,
1318                            const char *table, const char *word)
1319 {
1320   char qbuf[64];
1321   udm_snprintf(qbuf, sizeof(qbuf),
1322                "DELETE FROM %s WHERE word='%s'", table, word);
1323   return UdmDBSQLQuery(A, db, NULL, qbuf);
1324 }
1325 
1326 
1327 static udm_rc_t
UdmBlobRewriteIntWord(UDM_AGENT * A,UDM_DB * db,UDM_DSTR * buf,const char * table,const char * name,int value,udm_bool_t rewrite)1328 UdmBlobRewriteIntWord(UDM_AGENT *A, UDM_DB *db, UDM_DSTR *buf,
1329                       const char *table, const char *name, int value,
1330                       udm_bool_t rewrite)
1331 {
1332   udm_rc_t rc;
1333   char data[64];
1334   size_t size_data;
1335   UdmLog(A, UDM_LOG_DEBUG, "Writing '%s'", name);
1336   if (rewrite &&
1337       UDM_OK != (rc= UdmBlobDeleteWordFromTable(A, db, table, name)))
1338     return rc;
1339   size_data= udm_snprintf(data, sizeof(data), "%d", value);
1340   return UdmBlobWriteWord(A, db, table, name, 0, data, size_data, buf, 1, 0);
1341 }
1342 
1343 
1344 static udm_rc_t
UdmBlobWriteTimestamp(UDM_AGENT * A,UDM_DB * db,const char * table,udm_bool_t rewrite)1345 UdmBlobWriteTimestamp(UDM_AGENT *A, UDM_DB *db,
1346                       const char *table, udm_bool_t rewrite)
1347 {
1348   udm_rc_t rc;
1349   UDM_DSTR buf;
1350   UdmDSTRInit(&buf, 128);
1351   if (UDM_OK != (rc= UdmBlobRewriteIntWord(A, db, &buf, table, "##ts",
1352                                            (int) time(0), rewrite)))
1353     goto ex;
1354   rc= UdmBlobRewriteIntWord(A, db, &buf, table, "##version",
1355                             UDM_VERSION_ID, rewrite);
1356 ex:
1357   UdmDSTRFree(&buf);
1358   return rc;
1359 }
1360 
1361 
1362 
1363 static udm_rc_t
UdmTruncateDictBlob(UDM_AGENT * Indexer,UDM_DB * db)1364 UdmTruncateDictBlob(UDM_AGENT *Indexer, UDM_DB *db)
1365 {
1366   return UdmDBSQLTableTruncateOrDelete(Indexer, db, "bdict");
1367 }
1368 
1369 /************************************************/
1370 
1371 typedef struct
1372 {
1373   UDM_DSTR buf;
1374   UDM_DSTR compress;
1375   udm_bool_t use_deflate;
1376 } UDM_WRITE_HELPER;
1377 
1378 
1379 static udm_rc_t
UdmWriteHelperInit(UDM_WRITE_HELPER * Helper,UDM_DB * db)1380 UdmWriteHelperInit(UDM_WRITE_HELPER *Helper, UDM_DB *db)
1381 {
1382   bzero((void*) Helper, sizeof(*Helper));
1383 #ifdef HAVE_ZLIB
1384   Helper->use_deflate= UdmVarListFindBool(UdmSQLDBVars(db), "deflate", UDM_FALSE);
1385 #endif
1386   if (UDM_OK != UdmDSTRInit(&Helper->buf, 8 * 1024))
1387     return UDM_ERROR;
1388   if (UDM_OK != UdmDSTRInit(&Helper->compress, 8 * 1024))
1389   {
1390     UdmDSTRFree(&Helper->buf);
1391     return UDM_ERROR;
1392   }
1393   return UDM_OK;
1394 }
1395 
1396 
1397 static void
UdmWriteHelperFree(UDM_WRITE_HELPER * Helper)1398 UdmWriteHelperFree(UDM_WRITE_HELPER *Helper)
1399 {
1400   UdmDSTRFree(&Helper->buf);
1401   UdmDSTRFree(&Helper->compress);
1402 }
1403 
1404 
1405 static udm_rc_t
UdmWriteWordWithHelper(UDM_AGENT * A,UDM_DB * db,const char * table,const char * word,const UDM_DSTR * data,UDM_WRITE_HELPER * Helper,int use_zint4)1406 UdmWriteWordWithHelper(UDM_AGENT *A, UDM_DB *db, const char *table,
1407                        const char *word, const UDM_DSTR *data,
1408                        UDM_WRITE_HELPER *Helper, int use_zint4)
1409 {
1410   return UdmBlobWriteWordCmpr(A, db, table, word, /*secno*/0,
1411                               UdmDSTRPtr(data), UdmDSTRLength(data),
1412                               &Helper->buf,
1413                               Helper->use_deflate ? &Helper->compress : NULL,
1414                               use_zint4, 1, 0);
1415 }
1416 /*****************************************************/
1417 
1418 /*
1419   Write limits, but don't COMMIT and don't write timestamp
1420 */
1421 static udm_rc_t
UdmBlobWriteLimitsInternal(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_WRITE_HELPER * Helper)1422 UdmBlobWriteLimitsInternal(UDM_AGENT *A, UDM_DB *db,
1423                            const char *table, UDM_WRITE_HELPER *Helper)
1424 {
1425   UDM_VARLIST *Vars= &A->Conf->Vars;
1426   UDM_DSTR l;
1427   udm_rc_t rc= UDM_OK;
1428   size_t nvar;
1429 
1430   UdmDSTRInit(&l, 8192);
1431   for (nvar= 0; nvar < Vars->nvars; nvar++)
1432   {
1433     UDM_VAR *Var= UdmVarListFindByIndex(Vars, nvar);
1434     size_t i, ndocs;
1435     char qbuf[128];
1436     char lname[64];
1437     UDM_URLID_LIST list;
1438     UDM_URL_INT4_LIST UserScore;
1439     int is_score= 0;
1440     udm_timer_t ticks;
1441 
1442     if (!strncasecmp(UdmVarName(Var), "Limit.", 6))
1443       udm_snprintf(lname, sizeof(lname), "##limit#%s", UdmVarName(Var) + 6);
1444     else if (!strncasecmp(UdmVarName(Var), "Order.", 6))
1445       udm_snprintf(lname, sizeof(lname), "##order#%s", UdmVarName(Var) + 6);
1446     else if ((is_score= !strncasecmp(UdmVarName(Var), "Score.", 6)))
1447       udm_snprintf(lname, sizeof(lname), "##score#%s", UdmVarName(Var) + 6);
1448     else
1449       continue;
1450     UdmLog(A, UDM_LOG_DEBUG, "Writing '%s'", lname);
1451 
1452     bzero((void*) &list, sizeof(list));
1453     bzero((void*) &UserScore, sizeof(UserScore));
1454 
1455     if (UDM_OK != (rc= is_score ?
1456                        UdmUserScoreListLoad(A, db, &UserScore, UdmVarStr(Var)) :
1457                        UdmLoadSlowLimit(A, db, &list, UdmVarStr(Var))))
1458       goto ret;
1459 
1460     ticks= UdmStartTimer();
1461 
1462     if (!strncasecmp(UdmVarName(Var), "Limit.", 6))
1463       UdmURLIdListSort(&list);
1464 
1465     UdmDSTRReset(&Helper->buf);
1466     UdmDSTRReset(&l);
1467     ndocs= is_score ? UserScore.nitems : list.nurls;
1468     for (i= 0; i < ndocs; i++)
1469     {
1470       if (is_score)
1471       {
1472         UDM_URL_INT4 *item= &UserScore.Item[i];
1473         char ch= item->param;
1474         UdmDSTRAppendINT4(&l, item->url_id);
1475         UdmDSTRAppend(&l, &ch, 1);
1476       }
1477       else
1478       {
1479         /* Limit */
1480         UdmDSTRAppendINT4(&l, list.urls[i]);
1481       }
1482     }
1483 
1484     udm_snprintf(qbuf, sizeof(qbuf), "DELETE FROM %s WHERE word=('%s')", table, lname);
1485     if (UDM_OK != (rc= UdmDBSQLQuery(A, db, NULL, qbuf)))
1486       goto ret;
1487 
1488     if (UdmDSTRLength(&l))
1489     {
1490       if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, lname, &l,
1491                                                 Helper, 0)))
1492         goto ret;
1493     }
1494 
1495     UDM_FREE(list.urls);
1496     UDM_FREE(UserScore.Item);
1497     UdmLog(A, UDM_LOG_DEBUG, "%d documents written to '%s': %.2f",
1498                               (int) ndocs, lname, UdmStopTimer(&ticks));
1499   }
1500 ret:
1501   UdmDSTRFree(&l);
1502   return rc;
1503 }
1504 
1505 
1506 /*******************************************/
1507 
1508 static udm_rc_t
UdmURLDataListStorePopularityBdict(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_URLDATALIST * List,UDM_WRITE_HELPER * Helper)1509 UdmURLDataListStorePopularityBdict(UDM_AGENT *A, UDM_DB *db, const char *table,
1510                                    UDM_URLDATALIST *List,
1511                                    UDM_WRITE_HELPER *Helper)
1512 {
1513   udm_rc_t rc= UDM_OK;
1514   UDM_DSTR pop;
1515 
1516   UdmLog(A, UDM_LOG_DEBUG, "Writing '##pop'");
1517 
1518   /* TODO34: add "rewrite" parameter */
1519   if (UDM_OK != (rc= UdmBlobDeleteWordFromTable(A, db, table, "##pop")))
1520     return rc;
1521 
1522   if (UDM_OK != (rc= UdmDSTRInit(&pop, 8192)))
1523     return UDM_ERROR;
1524 
1525   if (UDM_OK != (rc= UdmURLDataListPackPopularity(A, List, &pop)))
1526   {
1527     UdmLog(A, UDM_LOG_ERROR, "Packing popularity failed");
1528     goto ex;
1529   }
1530 
1531   if (UdmDSTRLength(&pop))
1532     rc= UdmWriteWordWithHelper(A, db, table, "##pop", &pop, Helper, 0);
1533 ex:
1534   UdmDSTRFree(&pop);
1535   return rc;
1536 }
1537 
1538 
1539 static udm_rc_t
UdmURLDataListStorePopularityTable(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * List)1540 UdmURLDataListStorePopularityTable(UDM_AGENT *A, UDM_DB *db,
1541                                    UDM_URLDATALIST *List)
1542 {
1543   udm_timer_t timer= UdmStartTimer();
1544   char qbuf[128];
1545   size_t i;
1546   const char *table= UdmVarListFindStr(&A->Conf->Vars, "SQLExportPopularityTable", NULL);
1547 
1548   if (!table)
1549     return UDM_OK;
1550 
1551   /* TODO34: allow only safe table names? */
1552   UdmLog(A, UDM_LOG_DEBUG, "Writing popularity table '%s'", table);
1553   if (UDM_OK != UdmDBSQLTableTruncateOrDelete(A, db, table))
1554     return UDM_ERROR;
1555   udm_snprintf(qbuf, sizeof(qbuf), "%s WRITE", table);
1556   if (UDM_OK != UdmDBSQLLockOrBegin(A, db, qbuf))
1557     return UDM_ERROR;
1558   for (i= 0; i < List->nitems; i++)
1559   {
1560     UDM_URLDATA *Item= &List->Item[i];
1561     udm_snprintf(qbuf, sizeof(qbuf),
1562                  "INSERT INTO %s (url_id,inlinks,poprank) VALUES (%d,%d,%f)",
1563                  table, Item->url_id, Item->per_site, Item->pop_rank);
1564     if (UDM_OK != UdmDBSQLQuery(A, db, NULL, qbuf))
1565       return UDM_ERROR;
1566   }
1567   if (UDM_OK != UdmDBSQLUnlockOrCommit(A, db))
1568     return UDM_ERROR;
1569   UdmLog(A, UDM_LOG_DEBUG, "Writing popularity table done: %.2f", UdmStopTimer(&timer));
1570   return UDM_OK;
1571 }
1572 
1573 
1574 static udm_rc_t
UdmWritePopularityBdictAndTable(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * URLData,const char * table,UDM_WRITE_HELPER * Helper)1575 UdmWritePopularityBdictAndTable(UDM_AGENT *A, UDM_DB *db,
1576                                 UDM_URLDATALIST *URLData,
1577                                 const char *table,
1578                                 UDM_WRITE_HELPER *Helper)
1579 {
1580   udm_rc_t rc;
1581   /* Write pop_rank only if we have some non-empty value */
1582   if (UDM_OK != (rc= UdmURLDataListStorePopularityBdict(A, db, table, URLData,
1583                                                         Helper)))
1584     return rc;
1585   if (UDM_OK != (rc= UdmURLDataListStorePopularityTable(A, db, URLData)))
1586     return rc;
1587   return UDM_OK;
1588 }
1589 
1590 
1591 static udm_rc_t
UdmBlobWriteURLData(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,const char * table,UDM_WRITE_HELPER * Helper)1592 UdmBlobWriteURLData(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
1593                     const char *table,
1594                     UDM_WRITE_HELPER *Helper)
1595 {
1596   udm_rc_t rc= UDM_OK;
1597   int use_zint4= UdmVarListFindBool(UdmSQLDBVars(db), "zint4", UDM_DEFAULT_ZINT4);
1598   size_t i;
1599   UDM_DSTR url_id, site, last_mod_time;
1600   UDM_URLDATALIST *List= &Query->URLData;
1601 
1602   UdmDSTRInit(&url_id, 8192);
1603   UdmDSTRInit(&site, 1024);
1604   UdmDSTRInit(&last_mod_time, 8192);
1605 
1606   for (i= 0; i < List->nitems; i++)
1607   {
1608     UDM_URLDATA *Item= &List->Item[i];
1609     UdmDSTRAppendINT4(&url_id, Item->url_id);
1610     UdmDSTRAppendINT4(&last_mod_time, Item->last_mod_time);
1611   }
1612 
1613   UdmLog(A, UDM_LOG_DEBUG, "Writing '##rec_id'");
1614   if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##rec_id",
1615                                             &url_id, Helper, use_zint4)))
1616     goto ex;
1617 
1618   UdmLog(A, UDM_LOG_DEBUG, "Writing '##last_mod_time'");
1619   if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##last_mod_time",
1620                                             &last_mod_time, Helper, 0)))
1621     goto ex;
1622 
1623   if (UDM_OK != (rc= UdmURLDataListPackSite(List, &site)))
1624     goto ex;
1625 
1626   UdmLog(A, UDM_LOG_DEBUG, "Writing '##site'");
1627   if (UDM_OK != (rc= UdmWriteWordWithHelper(A, db, table, "##site",
1628                                             &site, Helper, 0)))
1629     goto ex;
1630 
1631 ex:
1632   UdmDSTRFree(&url_id);
1633   UdmDSTRFree(&site);
1634   UdmDSTRFree(&last_mod_time);
1635   return rc;
1636 }
1637 
1638 
1639 /*
1640   Unpack URL data from all packed records:
1641   '##rec_id'
1642   '##last_mod_time'
1643 */
1644 static size_t
UdmURLDataUnpackFull(UDM_URLDATALIST * DataList,size_t nrows,const char * rec_id_str,const char * last_mod_time_str)1645 UdmURLDataUnpackFull(UDM_URLDATALIST *DataList, size_t nrows,
1646                      const char *rec_id_str,
1647                      const char *last_mod_time_str)
1648 {
1649   size_t i, j;
1650   for (j= 0, i= 0; i < nrows; i++)
1651   {
1652     urlid_t rec_id= udm_get_int4(rec_id_str);
1653     rec_id_str+= 4;
1654 
1655     if (rec_id == DataList->Item[j].url_id)
1656     {
1657       UDM_URLDATA *D= &DataList->Item[j];
1658       if (last_mod_time_str)
1659         D->last_mod_time= udm_get_int4(&last_mod_time_str[i*4]);
1660       j++;
1661       if (j == DataList->nitems)
1662         break;
1663     }
1664   }
1665   return j;
1666 }
1667 
1668 
1669 /*
1670   Unpack rec_id from '##rec_id' record.
1671 */
1672 static size_t
UdmURLDataUnpackRecID(UDM_AGENT * A,UDM_URLDATALIST * DataList,size_t nrows,const char * rec_id_str)1673 UdmURLDataUnpackRecID(UDM_AGENT *A, UDM_URLDATALIST *DataList, size_t nrows,
1674                       const char *rec_id_str)
1675 {
1676   /* Need only rec_id */
1677   UDM_URLDATA *Data= DataList->Item;
1678   size_t j, i, skip= 0, ncoords= DataList->nitems;
1679 
1680   for (j = 0, i = 0; i < nrows; i++)
1681   {
1682     urlid_t rec_id= udm_get_int4(rec_id_str);
1683     while (rec_id > Data[j].url_id && j < ncoords)
1684     {
1685       skip++;
1686       j++;
1687     }
1688 
1689     if (rec_id == Data[j].url_id)
1690     {
1691       j++;
1692       if (j == ncoords) break;
1693     }
1694     rec_id_str+= 4;
1695   }
1696   if (j < ncoords)
1697   {
1698     skip+= (ncoords - j);
1699     UdmLog(A, UDM_LOG_DEBUG,
1700            "Warning: %d out of %d coords didn't have URL data",
1701            (int) skip, (int) DataList->nitems);
1702     j= DataList->nitems;
1703   }
1704   return j;
1705 }
1706 
1707 
1708 static size_t
UdmDSTRAppendWithComma(UDM_DSTR * dstr,const char * s,size_t length)1709 UdmDSTRAppendWithComma(UDM_DSTR *dstr, const char *s, size_t length)
1710 {
1711   if (UdmDSTRLength(dstr))
1712     UdmDSTRAppend(dstr, ",", 1);
1713   return UdmDSTRAppend(dstr, s, length);
1714 }
1715 
1716 
1717 udm_rc_t
UdmLoadURLDataFromBdict(UDM_AGENT * A,UDM_DB * db,UDM_URLDATALIST * DataList,int flags)1718 UdmLoadURLDataFromBdict(UDM_AGENT *A, UDM_DB *db,
1719                         UDM_URLDATALIST *DataList,
1720                         int flags)
1721 {
1722   udm_rc_t rc;
1723   char qbuf[4*1024], table[64];
1724   UDM_SQLRES SQLres;
1725   UDM_DSTR inbuf, rec_id_buf, site_buf, pop_rank_buf, last_mod_time_buf;
1726   UDM_STR row[2];
1727   udm_timer_t ticks= UdmStartTimer();
1728   int need_pop_rank= (flags & UDM_URLDATA_POP);
1729   int need_last_mod_time= (flags & UDM_URLDATA_LM);
1730   int need_site= (flags & (UDM_URLDATA_SITE | UDM_URLDATA_SITE_RANK));
1731   int need_id= need_last_mod_time; /* TODO34: pack last_mode_time toghether with id? */
1732   UDM_CONST_STR site, id, pop, lm;
1733 
1734   if (!flags)
1735     return UDM_NOTARGET;
1736 
1737   UdmConstStrInit(&site);
1738   UdmConstStrInit(&id);
1739   UdmConstStrInit(&pop);
1740   UdmConstStrInit(&lm);
1741 
1742   UdmBlobGetRTable(A, db, table, sizeof(table));
1743 
1744   UdmDSTRInit(&inbuf, 64);
1745   if (need_id)
1746     UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##rec_id'"));
1747   if (need_pop_rank)
1748     UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##pop'"));
1749   if (need_last_mod_time)
1750     UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##last_mod_time'"));
1751   if (need_site)
1752     UdmDSTRAppendWithComma(&inbuf, UDM_CSTR_WITH_LEN("'##site'"));
1753 
1754   /* Check that DataList is not empty and is sorted by url_id */
1755   UDM_ASSERT(DataList->nitems);
1756   UDM_ASSERT(DataList->Item[0].url_id <= DataList->Item[DataList->nitems-1].url_id);
1757 
1758   UdmLog(A,UDM_LOG_DEBUG,"Loading URL data from bdict");
1759   udm_snprintf(qbuf, sizeof(qbuf),
1760                "SELECT word,coords FROM %s WHERE word IN (%s)",
1761                table, UdmDSTRPtr(&inbuf));
1762   UdmDSTRFree(&inbuf);
1763 
1764   if (UDM_OK != (rc= UdmDBSQLExecDirect(A, db, &SQLres, qbuf)))
1765   {
1766     UdmLog(A,UDM_LOG_DEBUG,"Couldn't run a query on bdict");
1767     return(rc);
1768   }
1769 
1770   UdmDSTRInit(&rec_id_buf, 4096);
1771   UdmDSTRInit(&site_buf, 4096);
1772   UdmDSTRInit(&pop_rank_buf, 4096);
1773   UdmDSTRInit(&last_mod_time_buf, 4096);
1774 
1775   while (UdmDBSQLFetchRow(A, db, &SQLres, row) == UDM_OK)
1776   {
1777     if (!strcmp(row[0].str, "##rec_id"))
1778       UdmBlobModeInflateOrAlloc(A, &rec_id_buf, "##rec_id", &row[1], &id);
1779     else if (!strcmp(row[0].str, "##site"))
1780       UdmBlobModeInflateOrAlloc(A, &site_buf, "##site", &row[1], &site);
1781     else if (!strcmp(row[0].str, "##last_mod_time"))
1782       UdmBlobModeInflateOrAlloc(A, &last_mod_time_buf, "##last_mod_time",
1783                                 &row[1], &lm);
1784     else if (!strcmp(row[0].str, "##pop"))
1785       UdmBlobModeInflateOrAlloc(A, &pop_rank_buf, "##pop", &row[1], &pop);
1786   }
1787 
1788   UdmLog(A, UDM_LOG_DEBUG, "Fetch from bdict done:        %.2f", UdmStopTimer(&ticks));
1789 
1790   if (need_pop_rank)
1791   {
1792     if (pop.str)
1793     {
1794       ticks= UdmStartTimer();
1795       UdmURLDataListUnpackPopularity(A, DataList, &pop);
1796       UdmLog(A, UDM_LOG_DEBUG, "Unpacking popularity done:    %.02f", UdmStopTimer(&ticks));
1797     }
1798     else
1799     {
1800       /*
1801         All pop_rank values were 0 at "indexer -Eblob" time.
1802         Use 0 as pop_rank values.
1803       */
1804       UdmLog(A, UDM_LOG_DEBUG, "Warning: s=P is requested, but '##pop' record not found");
1805       UdmLog(A, UDM_LOG_DEBUG, "Perhaps you forgot to run 'indexer -n0 -R' before running 'indexer --index'");
1806       need_pop_rank= 0;
1807     }
1808   }
1809 
1810   if (need_site)
1811   {
1812     if (site.str)
1813     {
1814       ticks= UdmStartTimer();
1815       UdmURLDataListUnpackSite(A, DataList, &site);
1816       UdmLog(A, UDM_LOG_DEBUG, "Unpacking site done: %.02f", UdmStopTimer(&ticks));
1817     }
1818     else
1819     {
1820       UdmLog(A, UDM_LOG_DEBUG, "No '##site' record found");
1821     }
1822   }
1823 
1824   if (!need_id && !need_last_mod_time)
1825   {
1826     rc= UDM_OK;
1827     goto ret;
1828   }
1829 
1830   if (id.str && id.length &&
1831       (lm.str || ! need_last_mod_time))
1832   {
1833     size_t j, nrows= id.length / 4;
1834 
1835     ticks= UdmStartTimer();
1836     UdmLog(A, UDM_LOG_DEBUG, "Unpacking URL Data %d rows", (int) nrows);
1837     if (need_last_mod_time)
1838     {
1839       /* Need pop_rank or last_mod_time */
1840       j= UdmURLDataUnpackFull(DataList, nrows, id.str,
1841                               need_last_mod_time ? lm.str : NULL);
1842     }
1843     else
1844     {
1845       /* Need only rec_id */
1846       j= UdmURLDataUnpackRecID(A, DataList, nrows, id.str);
1847     }
1848 
1849     UdmLog(A, UDM_LOG_DEBUG, "Unpacking URL Data done: %.02f", UdmStopTimer(&ticks));
1850 
1851     if (j != DataList->nitems)
1852     {
1853       UdmLog(A,UDM_LOG_DEBUG, "Expected to load %d URLs, loaded %d",
1854                               (int) DataList->nitems, (int) j);
1855       UdmLog(A,UDM_LOG_DEBUG,"Couldn't load URL data from bdict");
1856       goto load_from_url;
1857     }
1858   }
1859   else
1860   {
1861     UdmLog(A,UDM_LOG_DEBUG,"There is no URL data in bdict");
1862   }
1863 
1864 
1865 load_from_url:
1866   rc= UDM_NOTARGET;
1867 
1868 ret:
1869   UdmSQLFree(&SQLres);
1870   UdmDSTRFree(&rec_id_buf);
1871   UdmDSTRFree(&site_buf);
1872   UdmDSTRFree(&pop_rank_buf);
1873   UdmDSTRFree(&last_mod_time_buf);
1874   return rc;
1875 }
1876 
1877 
1878 
1879 /*******************************************/
1880 
1881 static udm_rc_t
UdmRewriteURL(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)1882 UdmRewriteURL(UDM_AGENT *Indexer, UDM_DB *db, UDM_QUERY *Query)
1883 {
1884   udm_rc_t rc;
1885   int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
1886   char tablename[64];
1887   UDM_WRITE_HELPER Helper;
1888 
1889   if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
1890     return rc;
1891   UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
1892   if ((tr && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
1893       UDM_OK != (rc= UdmBlobWriteTimestamp(Indexer, db, tablename, UDM_TRUE)) ||
1894       UDM_OK != (rc= UdmBlobWriteURLData(Indexer, db, Query, tablename, &Helper)) ||
1895       UDM_OK != (rc= UdmBlobWriteLimitsInternal(Indexer, db, tablename, &Helper)) ||
1896       (tr && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
1897   {
1898   }
1899   UdmWriteHelperFree(&Helper);
1900   return rc;
1901 }
1902 
1903 
1904 /*
1905   Write limits with COMMIT and timestamp
1906 */
1907 static udm_rc_t
UdmBlobWriteLimits(UDM_AGENT * A,UDM_DB * db,const char * table,UDM_WRITE_HELPER * Helper)1908 UdmBlobWriteLimits(UDM_AGENT *A, UDM_DB *db, const char *table,
1909                    UDM_WRITE_HELPER *Helper)
1910 {
1911   udm_rc_t rc;
1912   if (UDM_OK != (rc= UdmDBSQLBegin(A, db)) ||
1913       UDM_OK != (rc= UdmBlobWriteLimitsInternal(A, db, table, Helper)) ||
1914       UDM_OK != (rc= UdmBlobWriteTimestamp(A, db, table, UDM_TRUE)) ||
1915       UDM_OK != (rc= UdmDBSQLCommit(A, db)))
1916     return rc;
1917   return UDM_OK;
1918 }
1919 
1920 
1921 static udm_rc_t
UdmRewriteLimits(UDM_AGENT * Indexer,UDM_DB * db)1922 UdmRewriteLimits(UDM_AGENT *Indexer, UDM_DB *db)
1923 {
1924   udm_rc_t rc;
1925   char tablename[64];
1926   UDM_WRITE_HELPER Helper;
1927   if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
1928     return rc;
1929   UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
1930   rc= UdmBlobWriteLimits(Indexer, db, tablename, &Helper);
1931   UdmWriteHelperFree(&Helper);
1932   return rc;
1933 }
1934 
1935 
1936 
1937 
1938 /************************************************/
1939 
1940 #include "udm_doc.h"
1941 #include "udm_parsehtml.h"
1942 #include "udm_parsexml.h"
1943 #include "udm_server.h"  /* UdmSpiderParamInit */
1944 #include "udm_url.h"
1945 
1946 /* TODO34: merge with word.c */
1947 /*
1948 static int wlcmp_sort(UDM_WORD *w1, UDM_WORD *w2)
1949 {
1950   register int _;
1951   if ((_= strcmp(w1->word, w2->word)))
1952     return _;
1953   if (w1->secno != (int) w2->secno)
1954     return w1->secno < w2->secno ? -1 : 1;
1955   if (w1->pos != w2->pos)
1956     return w1->pos < w2->pos ? -1 : 1;
1957   return 0;
1958 }
1959 */
/*
  Compare two words by text first and by section number second.
  Positions are not compared, so all occurrences of a word inside
  one section compare as equal.
  Returns the strcmp() result when the texts differ, otherwise -1, 0 or 1.
*/
static int wlcmp_search(UDM_WORD *w1, UDM_WORD *w2)
{
  const int text_order= strcmp(w1->word, w2->word);

  if (text_order != 0)
    return text_order;
  if (w1->coord.secno == (int) w2->coord.secno)
    return 0;
  return (w1->coord.secno < w2->coord.secno) ? -1 : 1;
}
1969 /*
1970 static void
1971 UdmWordListSort(UDM_WORDLIST *WL)
1972 {
1973   if (WL->nwords)
1974     UdmSort(WL->Word, WL->nwords, sizeof(UDM_WORD), (udm_qsort_cmp) wlcmp_sort);
1975 }
1976 */
1977 
1978 static udm_rc_t
UdmWordList2InvertedIndexCache(UDM_AGENT * A,UDM_WORDLIST * WL,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_CACHE * cache)1979 UdmWordList2InvertedIndexCache(UDM_AGENT *A, UDM_WORDLIST *WL,
1980                                UDM_CONSTWORD_HASH_DATA *data,
1981                                UDM_INVERTED_INDEX_CACHE *cache)
1982 {
1983   size_t i;
1984   UDM_WORD *prev;
1985   if (!WL->nwords)
1986     return UDM_OK;
1987   for (prev= &WL->Word[0], i= 0; i <= WL->nwords; i++)
1988   {
1989     UDM_WORD *W= &WL->Word[i];
1990     if (i == WL->nwords || wlcmp_search(prev, W))
1991     {
1992       if (UDM_OK != UdmInvertedIndexCacheAdd(A, data, cache, prev, W - prev))
1993         return UDM_ERROR;
1994       prev= W;
1995     }
1996   }
1997   return UDM_OK;
1998 }
1999 
2000 
2001 
2002 static udm_rc_t
UdmConstWordListToPairWordList(UDM_INVERTED_INDEX_CACHE * cache,UDM_WORDLIST * WL,UDM_CONSTWORD_HASH_DATA * data,UDM_INVERTED_INDEX_COORD_LIST * CL)2003 UdmConstWordListToPairWordList(UDM_INVERTED_INDEX_CACHE *cache,
2004                                UDM_WORDLIST *WL,
2005                                UDM_CONSTWORD_HASH_DATA *data,
2006                                UDM_INVERTED_INDEX_COORD_LIST *CL)
2007 {
2008   size_t i;
2009 
2010   for (i= 1; i < CL->nitems; i++)
2011   {
2012     UDM_INVERTED_INDEX_COORD *C1= &CL->Item[i-1];
2013     UDM_INVERTED_INDEX_COORD *C2= &CL->Item[i];
2014     UDM_INVERTED_INDEX_WORD *W1= &cache->Words.list.Item[C1->word_id];
2015     UDM_INVERTED_INDEX_WORD *W2= &cache->Words.list.Item[C2->word_id];
2016     /*
2017       printf("[%d][%d][%d:%d][%d:%d] '%.*s'-'%.*s' (%d-%d)\n",
2018              i, (int) data->url_id,
2019              C1->secno, C1->pos, C2->secno, C2->pos,
2020              (int) W1->str.length, W1->str.str,
2021              (int) W2->str.length, W2->str.str,
2022              W1->last_url_id_count, W2->last_url_id_count);
2023     */
2024     /* TODO34: Check max_word_len and min_word_len */
2025     if (C1->coord.secno == C2->coord.secno &&
2026         C1->coord.pos + 1 == C2->coord.pos &&
2027         W1->last_url_id_count > cache->param.pair_limit &&
2028         W2->last_url_id_count > cache->param.pair_limit)
2029     {
2030       char word[256]= "";
2031       UDM_ASSERT(W1->last_url_id == W2->last_url_id);
2032       udm_snprintf(word, sizeof(word), "##%.*s-%.*s",
2033                    (int) W1->length, W1->str,
2034                    (int) W2->length, W2->str);
2035       /*
2036       fprintf(stderr, "[%d:%d:%d] %d-%d '%s'\n",
2037               data->url_id, C1->secno, C1->pos, W1->last_url_id_count, W2->last_url_id_count,
2038               word);
2039       */
2040       UdmWordListAddEx(WL, word, C1->coord.secno, C1->coord.pos, 0);
2041     }
2042   }
2043   return UDM_OK;
2044 }
2045 
2046 
2047 static udm_rc_t
UdmConvertPairs(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,UDM_INVERTED_INDEX_COORD_LIST * CL,UDM_CONSTWORD_HASH_DATA * hash_data)2048 UdmConvertPairs(UDM_AGENT *A,
2049                 UDM_INVERTED_INDEX_CACHE *cache,
2050                 UDM_INVERTED_INDEX_COORD_LIST *CL,
2051                 UDM_CONSTWORD_HASH_DATA *hash_data)
2052 {
2053   udm_rc_t rc= UDM_OK;
2054   UDM_WORDLIST WL;
2055   UdmWordListInit(&WL);
2056   UdmConstWordListToPairWordList(cache, &WL, hash_data, CL);
2057   if (WL.nwords)
2058     rc= UdmWordList2InvertedIndexCache(A, &WL, hash_data, cache);
2059   UdmWordListFree(&WL);
2060   return rc;
2061 }
2062 
2063 
2064 static udm_rc_t
UdmWordListSortAndConvert(UDM_AGENT * A,UDM_CONSTWORDLIST * CWL,UDM_INVERTED_INDEX_CACHE * cache,urlid_t url_id,UDM_CHARSET * doccs)2065 UdmWordListSortAndConvert(UDM_AGENT *A,
2066                           UDM_CONSTWORDLIST *CWL,
2067                           UDM_INVERTED_INDEX_CACHE *cache,
2068                           urlid_t url_id,
2069                           UDM_CHARSET *doccs)
2070 {
2071   udm_rc_t rc= UDM_OK;
2072   UDM_INVERTED_INDEX_COORD_LIST CL;
2073   UDM_CONSTWORD_HASH_DATA hash_data;
2074   udm_timer_t ticks= UdmStartTimer();
2075 
2076   if (!CWL->nitems)
2077     return UDM_OK;
2078 
2079   UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2080   bzero(&CL, sizeof(CL));
2081   UdmConvInit(&hash_data.cnv, doccs, A->Conf->lcs);
2082   hash_data.url_id= url_id;
2083   hash_data.cache= cache;
2084   cache->Words.hash.user_data= &hash_data;
2085   cache->Stats.conv+= UdmStartTimer() - ticks;
2086 
2087   ticks= UdmStartTimer();
2088   rc= UdmConstWordListToInvertedIndexCoordList(cache, &CL, CWL);
2089   cache->Stats.prepare_words+= UdmStartTimer() - ticks;
2090   UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2091 
2092   if (rc != UDM_OK)
2093     goto ret;
2094 
2095   UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2096   if (hash_data.cache->param.pair_limit)
2097   {
2098     ticks= UdmStartTimer();
2099     rc= UdmConvertPairs(A, cache, &CL, &hash_data);
2100     cache->Stats.pairs+=  UdmStartTimer() - ticks;
2101   }
2102   UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2103 
2104   ticks= UdmStartTimer();
2105   UdmInvertedIndexCoordListSort(&CL);
2106 
2107   UDM_GETLOCK(A, UDM_LOCK_DOC_CACHE);
2108   cache->Stats.sort_wordlist+= UdmStartTimer() - ticks;
2109   ticks= UdmStartTimer();
2110   rc= UdmInvertedIndexCoordList2InvertedIndexCache(A, &CL, CWL, &hash_data, cache);
2111   cache->Stats.conv+= UdmStartTimer() - ticks;
2112   UDM_RELEASELOCK(A, UDM_LOCK_DOC_CACHE);
2113 
2114   if (rc != UDM_OK)
2115     goto ret;
2116 
2117 ret:
2118   ticks= UdmStartTimer();
2119   UdmInvertedIndexCoordListFree(&CL);
2120   cache->Stats.conv+= UdmStartTimer() - ticks;
2121   return rc;
2122 }
2123 
2124 
2125 static udm_rc_t
UdmInvertedIndexCacheDocParse(UDM_AGENT * Indexer,urlid_t url_id,UDM_DOCUMENT * Doc,UDM_INVERTED_INDEX_CACHE * cache)2126 UdmInvertedIndexCacheDocParse(UDM_AGENT *Indexer,
2127                               urlid_t url_id,
2128                               UDM_DOCUMENT *Doc,
2129                               UDM_INVERTED_INDEX_CACHE *cache)
2130 {
2131   UDM_CONSTWORDLIST CWL;
2132   UDM_CHARSET *doccs= UdmDocDetectCachedCharset(Indexer, Doc, url_id);
2133   UDM_CHARSET *metacs;
2134   udm_rc_t rc;
2135 
2136   UdmConstWordListInit(&CWL);
2137   UdmDocToConstWordList(Indexer, Doc, &CWL, doccs,
2138                         cache->param.cnvflags,
2139                         cache->param.aggregate_section_flags,
2140                         &cache->Stats.parse,
2141                         &cache->Stats.prepare_words);
2142   metacs= UdmVarListFindCharset(&Doc->Sections, "Strong-Meta-Charset", doccs);
2143   rc= UdmWordListSortAndConvert(Indexer, &CWL, cache, url_id, metacs);
2144   UdmConstWordListFree(&CWL);
2145   return rc;
2146 }
2147 
2148 
2149 
/*
  Multi-row INSERT tuning.
  The expansions are parenthesized so that they stay a single value
  inside any expression (e.g. "x % SIZE" or "x / SIZE").
*/
/* Initial allocation size of the multi-insert query buffer */
#define UDM_BLOB2_MULTI_INSERT_ALLOC_SIZE  (32*1024*1024)
/* The multi-insert query is sent when it grows above this size */
#define UDM_BLOB2_MULTI_INSERT_FLUSH_SIZE  (32*1024*1024)
/*
  Packed records shorter than this go into the multi-insert query.
  The name is misspelled ("MILTI"); it is kept as is because the
  code refers to it under this name.
*/
#define UDM_BLOB2_MILTI_INSERT_WORD_SIZE   (1024)
2153 
2154 static udm_rc_t
UdmSendMultiInsertQuery(UDM_AGENT * A,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_DSTR * multi)2155 UdmSendMultiInsertQuery(UDM_AGENT *A, UDM_DB *db,
2156                         UDM_INVERTED_INDEX_STATS *Stats, UDM_DSTR *multi)
2157 {
2158   udm_timer_t ticks= UdmStartTimer();
2159   udm_rc_t rc= UdmDBSQLQuery(A, db, NULL, UdmDSTRPtr(multi));
2160   Stats->send_multi+= UdmStartTimer() - ticks;
2161   return rc;
2162 }
2163 
2164 
2165 /* TODO34: add compression */
2166 static udm_rc_t
UdmInvertedIndexCacheStoreWord(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_DSTR * dstr,UDM_DSTR * multi,UDM_INVERTED_INDEX_CACHE_ITEM * Items,size_t nitems)2167 UdmInvertedIndexCacheStoreWord(UDM_AGENT *Indexer, UDM_DB *db,
2168                                UDM_INVERTED_INDEX_STATS *Stats,
2169                                const char *wtable,
2170                                UDM_DSTR *dstr,
2171                                UDM_DSTR *multi,
2172                                UDM_INVERTED_INDEX_CACHE_ITEM *Items,
2173                                size_t nitems)
2174 {
2175   size_t i;
2176   udm_timer_t ticks= UdmStartTimer();
2177 
2178   UdmDSTRReset(dstr);
2179   for (i= 0; i < nitems; i++)
2180   {
2181     UDM_INVERTED_INDEX_CACHE_ITEM *Item= &Items[i];
2182     uint4 wordlen= strlen(Item->ptr) + 1;
2183     if (!UdmDSTRAppendINT4(dstr, Item->url_id) ||
2184         !UdmDSTRAppend(dstr, Item->ptr + wordlen,
2185                              Item->length - wordlen))
2186     {
2187       UdmLog(Indexer, UDM_LOG_ERROR, "UdmDSTRAppend failed");
2188       return UDM_ERROR;
2189     }
2190   }
2191   Stats->pack+= UdmStartTimer() - ticks;
2192 
2193   /*
2194   fprintf(stderr, "secno=%d ndocs=%-6d ncoords=%-6d datalen=%-6d %s\n",
2195           Items->secno, (int) nitems, (int) ncoords, (int) dstr->size_data, Items->word);
2196   */
2197   /*
2198     TODO: Multi-insert for MyODBC. Needs proper escaping.
2199   */
2200   if (UdmSQLDBDriver(db) == UDM_DBAPI_MYSQL &&
2201       UdmDSTRLength(dstr) < UDM_BLOB2_MILTI_INSERT_WORD_SIZE)
2202   {
2203     ticks= UdmStartTimer();
2204     if (!UdmDSTRLength(multi))
2205       UdmDSTRAppendf(multi, "INSERT INTO %s VALUES('", wtable);
2206     else
2207       UdmDSTRAppend(multi, UDM_CSTR_WITH_LEN(",('"));
2208     UdmDSTRAppend(multi, Items->ptr, strlen(Items->ptr));
2209     UdmDSTRAppendf(multi,"',%d,", Items->secno);
2210     if (0)
2211     {
2212       UdmDSTRAppend(multi, "0x", 2);
2213       UdmDSTRAppendHex(multi, UdmDSTRPtr(dstr), UdmDSTRLength(dstr));
2214     }
2215     else
2216     {
2217       size_t esclen;
2218       UdmDSTRAppend(multi, "'", 1);
2219       if (UDM_OK != UdmDSTRRealloc(multi, UdmDSTRLength(multi) +
2220                                           2 * UdmDSTRLength(dstr)))
2221         return UDM_ERROR;
2222       esclen= UdmDBSQLEscStr(Indexer, db, multi->Val.str + multi->Val.length,
2223                              UdmDSTRPtr(dstr), UdmDSTRLength(dstr));
2224       multi->Val.length+= esclen;
2225       UdmDSTRAppend(multi, "'", 1);
2226     }
2227     UdmDSTRAppend(multi, UDM_CSTR_WITH_LEN(")"));
2228     Stats->send_multi+= UdmStartTimer() - ticks;
2229     if (UdmDSTRLength(multi) > UDM_BLOB2_MULTI_INSERT_FLUSH_SIZE)
2230     {
2231       if (UDM_OK != UdmSendMultiInsertQuery(Indexer, db, Stats, multi))
2232         return UDM_ERROR;
2233       UdmDSTRReset(multi);
2234     }
2235   }
2236   else
2237   {
2238     ticks= UdmStartTimer();
2239     if (UDM_OK != UdmBlobWriteWordUsingBind(Indexer, db, wtable,
2240                                             Items->ptr, Items->secno,
2241                                             UdmDSTRPtr(dstr),
2242                                             UdmDSTRLength(dstr),
2243                                             NULL, 0))
2244     {
2245       UdmLog(Indexer, UDM_LOG_ERROR, "%s", UdmDBSQLError(db));
2246       return UDM_ERROR;
2247     }
2248     Stats->send+= UdmStartTimer() - ticks;
2249   }
2250 
2251   return UDM_OK;
2252 }
2253 
2254 
2255 static udm_rc_t
UdmInvertedIndexCacheStoreUsingBind(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_INVERTED_INDEX_CACHE_PART * cache,UDM_DSTR * dstr,UDM_DSTR * multi)2256 UdmInvertedIndexCacheStoreUsingBind(UDM_AGENT *Indexer, UDM_DB *db,
2257                                     UDM_INVERTED_INDEX_STATS *Stats,
2258                                     const char *wtable,
2259                                     UDM_INVERTED_INDEX_CACHE_PART *cache,
2260                                     UDM_DSTR *dstr, UDM_DSTR *multi)
2261 {
2262   UDM_INVERTED_INDEX_CACHE_ITEM *prev;
2263   udm_rc_t rc;
2264   size_t i;
2265 
2266   if (cache->nitems == 0)
2267     return UDM_OK;
2268 
2269   if (UDM_OK != (rc= UdmBlobWriteWordPrepare(Indexer, db, wtable)))
2270     return rc;
2271 
2272   for (prev= &cache->Item[0], i= 0; i <= cache->nitems; i++)
2273   {
2274     UDM_INVERTED_INDEX_CACHE_ITEM *Item= &cache->Item[i];
2275     if (i == cache->nitems ||
2276         strcmp(prev->ptr, Item->ptr) ||
2277         prev->secno != Item->secno)
2278     {
2279       if (UDM_OK != (rc= UdmInvertedIndexCacheStoreWord(Indexer, db,
2280                                                         Stats,
2281                                                         wtable,
2282                                                         dstr, multi,
2283                                                         prev, Item - prev)))
2284         break;
2285       prev= Item;
2286     }
2287   }
2288   UdmDBSQLStmtFree(Indexer, db);
2289   if (UdmDSTRLength(multi) &&
2290       UDM_OK != UdmSendMultiInsertQuery(Indexer, db, Stats, multi))
2291     return UDM_ERROR;
2292   return rc;
2293 }
2294 
2295 
2296 /**
2297   Store a sorted part
2298 */
2299 static udm_rc_t
UdmInvertedIndexCachePartStore(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,const char * wtable,UDM_INVERTED_INDEX_CACHE_PART * part)2300 UdmInvertedIndexCachePartStore(UDM_AGENT *Indexer, UDM_DB *db,
2301                                UDM_INVERTED_INDEX_STATS *Stats,
2302                                const char *wtable,
2303                                UDM_INVERTED_INDEX_CACHE_PART *part)
2304 {
2305   udm_rc_t rc= UDM_OK;
2306   UDM_DSTR dstr, multi;
2307   UdmDSTRInit(&dstr, 1024);
2308   UdmDSTRInit(&multi, UDM_BLOB2_MULTI_INSERT_ALLOC_SIZE);
2309   UdmLog(Indexer, UDM_LOG_DEBUG, "Writing word index cache part");
2310   rc= UdmInvertedIndexCacheStoreUsingBind(Indexer, db, Stats,
2311                                           wtable, part, &dstr, &multi);
2312   UdmDSTRFree(&multi);
2313   UdmDSTRFree(&dstr);
2314   return rc;
2315 }
2316 
2317 
2318 /**
2319   Sort one cache part
2320 */
2321 static void
UdmInvertedIndexCacheSortPart(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,size_t partno)2322 UdmInvertedIndexCacheSortPart(UDM_AGENT *A,
2323                               UDM_INVERTED_INDEX_CACHE *cache,
2324                               size_t partno)
2325 {
2326   udm_timer_t ticks= UdmStartTimer();
2327   UDM_INVERTED_INDEX_CACHE_PART *part= &cache->Item[partno];
2328   UdmLog(A, UDM_LOG_DEBUG,
2329          "Sorting word index cache part%02d: %d records",
2330          (int) partno, (int) part->nitems);
2331   UdmInvertedIndexCachePartSort(part);
2332   cache->Stats.sort+= UdmStartTimer() - ticks;
2333 }
2334 
2335 
/*
  State shared by all sorting threads.
  IndexCacheSortThread() reads and advances partno while holding
  UDM_LOCK_DOC_CACHE.
*/
typedef struct
{
  size_t partno;                    /* Number of the next part to sort    */
  UDM_INVERTED_INDEX_CACHE *cache;  /* Cache whose parts are being sorted */
} UDM_INDEXCACHESORT_SHARE;
2341 
2342 
/*
  Per-thread parameters of a sorting thread, passed to
  IndexCacheSortThread() as its argument.
*/
typedef struct
{
  UDM_AGENT Agent;                  /* Agent used by this thread for locking and logging */
  void *thd;                        /* Thread handle, passed to ThreadCreate/ThreadJoin  */
  UDM_INDEXCACHESORT_SHARE *share;  /* State shared with the other sorting threads       */
} UDM_INDEXCACHESORT_PARAM;
2349 
2350 
2351 
2352 static
IndexCacheSortThread(void * arg)2353 void *IndexCacheSortThread(void *arg)
2354 {
2355   UDM_INDEXCACHESORT_PARAM *param= (UDM_INDEXCACHESORT_PARAM *) arg;
2356   udm_bool_t done= UDM_FALSE;
2357   for ( ; ; )
2358   {
2359     size_t partno;
2360     UDM_GETLOCK(&param->Agent, UDM_LOCK_DOC_CACHE);
2361     partno= param->share->partno;
2362     if (param->share->partno < param->share->cache->nitems)
2363       param->share->partno++;
2364     else
2365       done= UDM_TRUE;
2366     UDM_RELEASELOCK(&param->Agent, UDM_LOCK_DOC_CACHE);
2367     if (done)
2368       break;
2369     UdmInvertedIndexCacheSortPart(&param->Agent, param->share->cache, partno);
2370   }
2371   return NULL;
2372 }
2373 
2374 
2375 static udm_rc_t
UdmInvertedIndexCacheSortThreaded(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache,size_t nthreads)2376 UdmInvertedIndexCacheSortThreaded(UDM_AGENT *A,
2377                                   UDM_INVERTED_INDEX_CACHE *cache,
2378                                   size_t nthreads)
2379 {
2380   size_t i;
2381   udm_timer_t ticks= UdmStartTimer();
2382   UDM_INDEXCACHESORT_SHARE share;
2383   UDM_INDEXCACHESORT_PARAM params[UDM_INDEXER_THREADS_MAX];
2384 
2385   share.cache= cache;
2386   share.partno= 0;
2387   if (nthreads > cache->nitems)
2388     nthreads= cache->nitems;
2389 
2390   UdmLog(A, UDM_LOG_DEBUG, "Sorting cache");
2391   for (i= 0; i < nthreads; i++)
2392   {
2393     UDM_INDEXCACHESORT_PARAM *param= &params[i];
2394     UdmAgentInit(&param->Agent, A->Conf, i + 1);
2395     param->thd= NULL;
2396     param->share= &share;
2397     A->Conf->THDHandler.ThreadCreate(&param->thd, IndexCacheSortThread, (void*) param);
2398   }
2399   for (i= 0; i < nthreads; i++)
2400   {
2401     void *thd= params[i].thd;
2402     A->Conf->THDHandler.ThreadJoin(thd);
2403   }
2404 
2405   UdmLog(A, UDM_LOG_DEBUG, "Sorting threads done: %.2f", UdmStopTimer(&ticks));
2406   for (i= 0 ; i < nthreads; i++)
2407   {
2408     UDM_INDEXCACHESORT_PARAM *param= &params[i];
2409     UdmAgentFree(&param->Agent);
2410   }
2411   return UDM_OK;
2412 }
2413 
2414 
2415 /**
2416   Sort all cache parts
2417 */
2418 static void
UdmInvertedIndexCacheSort(UDM_AGENT * A,UDM_INVERTED_INDEX_CACHE * cache)2419 UdmInvertedIndexCacheSort(UDM_AGENT *A, UDM_INVERTED_INDEX_CACHE *cache)
2420 {
2421   size_t nthreads= UdmVarListFindInt(&A->Conf->Vars, "IndexerThreads", 0);
2422   if (nthreads && A->Conf->THDHandler.ThreadCreate)
2423   {
2424     UdmInvertedIndexCacheSortThreaded(A, cache, nthreads);
2425   }
2426   else
2427   {
2428     size_t i;
2429     for (i= 0; i < cache->nitems; i++)
2430       UdmInvertedIndexCacheSortPart(A, cache, i);
2431   }
2432 }
2433 
2434 
2435 static udm_rc_t
UdmInvertedIndexCacheStore(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_INVERTED_INDEX_CACHE * cache)2436 UdmInvertedIndexCacheStore(UDM_AGENT *Indexer,
2437                            UDM_DB *db, const char *wtable,
2438                            UDM_INVERTED_INDEX_CACHE *cache)
2439 {
2440   int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
2441   size_t i;
2442   UdmLog(Indexer, UDM_LOG_EXTRA,
2443          "Storing word index cache: %lld bytes",
2444           (unsigned long long) UdmInvertedIndexCacheEstimateUsedMemory(cache));
2445   UdmInvertedIndexCacheSort(Indexer, cache);
2446   for (i= 0; i < cache->nitems; i++)
2447   {
2448     udm_rc_t rc;
2449     UDM_INVERTED_INDEX_CACHE_PART *Item= &cache->Item[i];
2450     if (!Item->nitems)
2451       continue;
2452     if (tr && (UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))))
2453       return rc;
2454     if (UDM_OK != (rc= UdmInvertedIndexCachePartStore(Indexer, db,
2455                                                       &cache->Stats,
2456                                                       wtable, Item)))
2457       return rc;
2458     if (tr && (UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
2459       return rc;
2460   }
2461   return UDM_OK;
2462 }
2463 
2464 
2465 static void
UdmInvertedIndexCacheReportStatistics(UDM_AGENT * Indexer,UDM_INVERTED_INDEX_STATS * Stats)2466 UdmInvertedIndexCacheReportStatistics(UDM_AGENT *Indexer,
2467                                       UDM_INVERTED_INDEX_STATS *Stats)
2468 {
2469   UdmLog(Indexer, UDM_LOG_INFO, "Indexing statistics:");
2470   UdmLog(Indexer, UDM_LOG_INFO, "- Loading cached copies:      %.2f (%llu bytes)", (double) Stats->load / 1000, Stats->bytes_loaded);
2471   if (Stats->unpack_cached_copy)
2472     UdmLog(Indexer, UDM_LOG_INFO, "- Unpacking cached copies:    %.2f", (double) Stats->unpack_cached_copy / 1000);
2473   UdmLog(Indexer, UDM_LOG_INFO, "- Parsing documents:          %.2f", (double) Stats->parse / 1000);
2474   UdmLog(Indexer, UDM_LOG_INFO, "- Breaking sections to words: %.2f", (double) Stats->prepare_words / 1000);
2475   UdmLog(Indexer, UDM_LOG_INFO, "- Sorting word list:          %.2f", (double) Stats->sort_wordlist / 1000);
2476   UdmLog(Indexer, UDM_LOG_INFO, "- Groupping words:            %.2f", (double) Stats->conv / 1000);
2477   if (Stats->pairs)
2478     UdmLog(Indexer, UDM_LOG_INFO, "- Making pairs:               %.2f", (double) Stats->pairs / 1000);
2479   UdmLog(Indexer, UDM_LOG_INFO, "- Sorting words: %.2f", (double) Stats->sort / 1000);
2480   UdmLog(Indexer, UDM_LOG_INFO, "- Packing words: %.2f", (double) Stats->pack / 1000);
2481   UdmLog(Indexer, UDM_LOG_INFO, "- Sending words: %.2f", (double) (Stats->send + Stats->send_multi) / 1000);
2482   if (Stats->send_multi)
2483   {
2484     UdmLog(Indexer, UDM_LOG_EXTRA, "  +- Sending words: %.2f (MultiInsert)", (double) Stats->send_multi / 1000);
2485     UdmLog(Indexer, UDM_LOG_EXTRA, "  +- Sending words: %.2f (SQLExec)", (double) Stats->send / 1000);
2486   }
2487 }
2488 
2489 
2490 static udm_rc_t
UdmSQLFetchRowCachedCopy(UDM_AGENT * A,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_SQLRES * SQLRes,UDM_STR * row)2491 UdmSQLFetchRowCachedCopy(UDM_AGENT *A, UDM_DB *db,
2492                          UDM_INVERTED_INDEX_STATS *Stats,
2493                          UDM_SQLRES *SQLRes, UDM_STR *row)
2494 {
2495   udm_timer_t ticks= UdmStartTimer();
2496   udm_rc_t rc= UdmDBSQLFetchRow(A, db, SQLRes, row);
2497   Stats->load+= UdmStartTimer() - ticks;
2498   return rc;
2499 }
2500 
2501 
2502 static udm_rc_t
UdmParseRowCachedCopy(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,UDM_STR * row,UDM_URLDATALIST * URLList,size_t max_doc_size)2503 UdmParseRowCachedCopy(UDM_AGENT *Indexer, UDM_DB *db,
2504                       UDM_INVERTED_INDEX_CACHE *cache,
2505                       UDM_STR *row, UDM_URLDATALIST *URLList,
2506                       size_t max_doc_size)
2507 {
2508   udm_rc_t rc;
2509   UDM_DOCUMENT Doc;
2510   urlid_t url_id= atoi(row[0].str);
2511   UDM_URLDATA *urldata;
2512   if (!(urldata= UdmURLDataListSearch(URLList, url_id)))
2513   {
2514     UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCreate: url_id=%d not found", url_id);
2515     return UDM_OK;
2516   }
2517   urldata->score++;
2518   UdmDocInit(&Doc);
2519   Doc.lcs= Indexer->Conf->lcs;
2520   if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc,
2521                                                      row[1].str, row[1].length,
2522                                                      max_doc_size,
2523                                                      &cache->Stats.unpack_cached_copy)))
2524   {
2525     UdmLog(Indexer, UDM_LOG_DEBUG,
2526            "UdmDocSetFromCachedHTTPResponse() failed: url_id=%d len=%d",
2527            url_id, (int) row[1].length);
2528     rc= UDM_OK;
2529     goto end;
2530   }
2531   cache->Stats.bytes_loaded+= UdmHTTPBufSize(&Doc.Buf);
2532   if (UDM_OK != (rc= UdmInvertedIndexCacheDocParse(Indexer, urldata->url_id,
2533                                                    &Doc, cache)))
2534     UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCacheDocParse() failed");
2535 
2536 end:
2537   UdmDocFree(&Doc);
2538   return rc;
2539 }
2540 
2541 
2542 /***** Cache for Cached copies *********************************************/
/*
  One cached copy kept in memory.
  The content buffer is allocated by UdmDocCacheItemInit()
  and released by UdmDocCacheItemFree().
*/
typedef struct
{
  UDM_STR content;  /* Private copy of the cached copy bytes   */
  urlid_t url_id;   /* ID of the URL the content was loaded for */
} UDM_DOCCACHEITEM;
2548 
2549 
2550 static udm_rc_t
UdmDocCacheItemInit(UDM_DOCCACHEITEM * Item,const UDM_CONST_STR * content,urlid_t url_id)2551 UdmDocCacheItemInit(UDM_DOCCACHEITEM *Item,
2552                     const UDM_CONST_STR *content,
2553                     urlid_t url_id)
2554 {
2555   UDM_ASSERT(content->length > 0);
2556   if (!(Item->content.str= UdmMalloc(content->length)))
2557     return UDM_ERROR;
2558   Item->content.length= content->length;
2559   memcpy(Item->content.str, content->str, content->length);
2560   Item->url_id= url_id;
2561   return UDM_OK;
2562 }
2563 
2564 
2565 static void
UdmDocCacheItemFree(UDM_DOCCACHEITEM * Item)2566 UdmDocCacheItemFree(UDM_DOCCACHEITEM *Item)
2567 {
2568   UdmFree(Item->content.str);
2569 }
2570 
2571 
/*
  Growable array of cached copies.
*/
typedef struct
{
  size_t nitems;           /* Number of items in use                */
  size_t mitems;           /* Number of items the array has room for */
  UDM_DOCCACHEITEM *Item;  /* The array itself                       */
} UDM_DOCCACHELIST;
2578 
2579 
2580 static void
UdmDocCacheListInit(UDM_DOCCACHELIST * List)2581 UdmDocCacheListInit(UDM_DOCCACHELIST *List)
2582 {
2583   bzero((void*) List, sizeof(*List));
2584 }
2585 
2586 
2587 static udm_rc_t
UdmDocCacheListRealloc(UDM_DOCCACHELIST * List)2588 UdmDocCacheListRealloc(UDM_DOCCACHELIST *List)
2589 {
2590   if (List->nitems >= List->mitems)
2591   {
2592     size_t mitems2= List->nitems + 256;
2593     if (!(List->Item= UdmRealloc(List->Item, mitems2 * sizeof(List->Item[0]))))
2594       return UDM_ERROR;
2595     List->mitems= mitems2;
2596   }
2597   return UDM_OK;
2598 }
2599 
2600 
2601 static udm_rc_t
UdmDocCacheListAdd(UDM_DOCCACHELIST * List,UDM_DOCCACHEITEM * Item)2602 UdmDocCacheListAdd(UDM_DOCCACHELIST *List, UDM_DOCCACHEITEM *Item)
2603 {
2604   if (UDM_OK != UdmDocCacheListRealloc(List))
2605     return UDM_ERROR;
2606   List->Item[List->nitems++]= *Item;
2607   return UDM_OK;
2608 }
2609 
2610 
2611 static void
UdmDocCacheListFree(UDM_DOCCACHELIST * List)2612 UdmDocCacheListFree(UDM_DOCCACHELIST *List)
2613 {
2614   size_t i;
2615   for (i= 0; i < List->nitems; i++)
2616     UdmDocCacheItemFree(&List->Item[i]);
2617   UdmFree(List->Item);
2618 }
2619 
2620 
/*
  Cache of cached copies together with its counters.
*/
typedef struct
{
  UDM_DOCCACHELIST DocList;  /* The cached copies                                   */
  udm_uint8 nbytes;          /* Sum of the content lengths added to DocList         */
  size_t cur_item;           /* Presumably the next DocList item to be processed -
                                TODO confirm against the code that consumes it      */
} UDM_DOCCACHE;
2627 
2628 
/* Put a document cache into the empty state: empty list, zero counters */
static void UdmDocCacheInit(UDM_DOCCACHE *Cache)
{
  Cache->cur_item= 0;
  Cache->nbytes= 0;
  UdmDocCacheListInit(&Cache->DocList);
}
2635 
2636 
/* Release everything stored in the document list of the cache */
static void UdmDocCacheFree(UDM_DOCCACHE *Cache)
{
  UDM_DOCCACHELIST *docs= &Cache->DocList;
  UdmDocCacheListFree(docs);
}
2641 
2642 
/*
  Drop all cached documents and return the cache to the empty state,
  so that it can be filled again.
*/
static void UdmDocCacheReset(UDM_DOCCACHE *Cache)
{
  /* Release the stored documents, then start over */
  UdmDocCacheFree(Cache);
  UdmDocCacheInit(Cache);
}
2650 
2651 /**********************************************************************/
2652 
2653 static udm_rc_t
UdmRowCachedCopyToDocCache(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_STATS * Stats,UDM_STR * row,UDM_DOCCACHE * DocCache,UDM_URLDATALIST * URLList)2654 UdmRowCachedCopyToDocCache(UDM_AGENT *Indexer, UDM_DB *db,
2655                            UDM_INVERTED_INDEX_STATS *Stats,
2656                            UDM_STR *row,
2657                            UDM_DOCCACHE *DocCache,
2658                            UDM_URLDATALIST *URLList)
2659 {
2660   urlid_t url_id= atoi(row[0].str);
2661   UDM_URLDATA *urldata;
2662   UDM_CONST_STR content;
2663   UDM_DOCCACHEITEM DocCacheItem;
2664   if (!(urldata= UdmURLDataListSearch(URLList, url_id)))
2665   {
2666     UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCreate: url_id=%d not found", url_id);
2667     return UDM_OK;
2668   }
2669   urldata->score++;
2670   UdmConstStrSet(&content, row[1].str, row[1].length);
2671   if (UDM_OK != UdmDocCacheItemInit(&DocCacheItem, &content, url_id))
2672     return UDM_ERROR;
2673   if (UdmDocCacheListAdd(&DocCache->DocList, &DocCacheItem))
2674     return UDM_ERROR;
2675   DocCache->nbytes+= content.length;
2676   return UDM_OK;
2677 }
2678 
2679 
2680 static udm_rc_t
UdmCreateContentQueryUsingURLIdLoop(UDM_DSTR * query,const char * fmt,UDM_URLDATALIST * URLList,size_t offs,size_t nrows,const UDM_CONST_STR ihint)2681 UdmCreateContentQueryUsingURLIdLoop(UDM_DSTR *query, const char *fmt,
2682                                     UDM_URLDATALIST *URLList, size_t offs,
2683                                     size_t nrows,
2684                                     const UDM_CONST_STR ihint)
2685 {
2686   UdmDSTRReset(query);
2687   for ( ; *fmt; )
2688   {
2689     if (!strncasecmp(fmt, UDM_CSTR_WITH_LEN("${where}")))
2690     {
2691       size_t j;
2692       fmt+= 8;
2693       UdmDSTRAppend(query, UDM_CSTR_WITH_LEN(" WHERE url_id IN("));
2694       for (j=0; j < nrows; j++)
2695       {
2696         if (j > 0)
2697           UdmDSTRAppend(query, ",", 1);
2698         UdmDSTRAppendf(query, "%d", URLList->Item[offs + j].url_id);
2699       }
2700       UdmDSTRAppend(query, UDM_CSTR_WITH_LEN(")"));
2701     }
2702     else if (!strncasecmp(fmt, UDM_CSTR_WITH_LEN("${ihint}")))
2703     {
2704       fmt+= 8;
2705       UdmDSTRAppend(query, ihint.str, ihint.length);
2706     }
2707     else
2708       UdmDSTRAppend(query, fmt++, 1);
2709   }
2710   return UDM_OK;
2711 }
2712 
2713 
2714 #define DEFAULT_CONTENT_FMT "SELECT url_id,content FROM cachedcopy${ihint}${where}"
2715 
2716 
2717 static size_t
in_limit(UDM_DB * db)2718 in_limit(UDM_DB *db)
2719 {
2720   if (UdmSQLDBType(db)== UDM_DB_ORACLE8)
2721     return 1000; /* TODO34: move to UDM_SQLDB_DRIVER */
2722   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
2723     return 10*1024;
2724   if (UdmSQLDBType(db) == UDM_DB_IBASE)
2725     return 1500;
2726   return 2000;
2727 }
2728 
2729 
2730 static udm_rc_t
UdmDocCacheIndexOneDocument(UDM_AGENT * Indexer,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t max_doc_size)2731 UdmDocCacheIndexOneDocument(UDM_AGENT *Indexer,
2732                             UDM_DOCCACHE *Docs,
2733                             UDM_INVERTED_INDEX_CACHE *cache,
2734                             size_t max_doc_size)
2735 {
2736   udm_rc_t rc;
2737   UDM_DOCCACHEITEM *Doc;
2738   UDM_DOCUMENT Doc2;
2739 
2740   UDM_GETLOCK(Indexer, UDM_LOCK_DOC_CACHE);
2741   if (Docs->cur_item < Docs->DocList.nitems)
2742   {
2743     Doc= &Docs->DocList.Item[Docs->cur_item];
2744     Docs->cur_item++;
2745   }
2746   else
2747   {
2748     Doc= NULL;
2749   }
2750   UDM_RELEASELOCK(Indexer, UDM_LOCK_DOC_CACHE);
2751   if (!Doc)
2752     return UDM_NOTARGET;
2753 
2754   UdmDocInit(&Doc2);
2755   Doc2.lcs= Indexer->Conf->lcs;
2756   if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc2,
2757                                                      Doc->content.str,
2758                                                      Doc->content.length,
2759                                                      max_doc_size,
2760                                                      &cache->Stats.unpack_cached_copy)))
2761   {
2762     UdmLog(Indexer, UDM_LOG_DEBUG,
2763            "UdmDocSetFromCachedHTTPResponse() failed: url_id=%d len=%d",
2764            Doc->url_id, (int) Doc->content.length);
2765     rc= UDM_OK;
2766     goto end;
2767   }
2768   cache->Stats.bytes_loaded+= UdmHTTPBufSize(&Doc2.Buf);
2769 
2770   if (UDM_OK != (rc= UdmInvertedIndexCacheDocParse(Indexer, Doc->url_id,
2771                                                    &Doc2, cache)))
2772     UdmLog(Indexer, UDM_LOG_DEBUG, "UdmInvertedIndexCacheDocParse() failed");
2773 
2774 end:
2775   UdmDocFree(&Doc2);
2776   return rc;
2777 }
2778 
2779 
2780 static udm_rc_t
UdmInvertedIndexCacheParse(UDM_AGENT * Indexer,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t max_doc_size)2781 UdmInvertedIndexCacheParse(UDM_AGENT *Indexer,
2782                            UDM_DOCCACHE *Docs,
2783                            UDM_INVERTED_INDEX_CACHE *cache,
2784                            size_t max_doc_size)
2785 {
2786   int rc;
2787   while (UDM_OK == (rc= UdmDocCacheIndexOneDocument(Indexer, Docs, cache,
2788                                                     max_doc_size)))
2789   { }
2790   return rc == UDM_NOTARGET ? UDM_OK : rc;
2791 }
2792 
2793 
2794 typedef struct indexer_param_st
2795 {
2796   UDM_AGENT Agent;
2797   UDM_DOCCACHE *Docs;
2798   UDM_INVERTED_INDEX_CACHE *cache;
2799   size_t max_doc_size;
2800   udm_rc_t rc;
2801   void *thd;
2802 } UDM_INDEXER_PARAM;
2803 
2804 
2805 static
IndexerThread(void * arg)2806 void *IndexerThread(void *arg)
2807 {
2808   UDM_INDEXER_PARAM *param= (UDM_INDEXER_PARAM *) arg;
2809   param->rc= UdmInvertedIndexCacheParse(&param->Agent, param->Docs,
2810                                         param->cache,
2811                                         param->max_doc_size);
2812   return NULL;
2813 }
2814 
2815 
2816 static udm_rc_t
UdmInvertedIndexCacheParseThreaded(UDM_AGENT * A,UDM_DOCCACHE * Docs,UDM_INVERTED_INDEX_CACHE * cache,size_t nthreads)2817 UdmInvertedIndexCacheParseThreaded(UDM_AGENT *A,
2818                                    UDM_DOCCACHE *Docs,
2819                                    UDM_INVERTED_INDEX_CACHE *cache,
2820                                    size_t nthreads)
2821 {
2822   size_t i;
2823   udm_timer_t ticks= UdmStartTimer();
2824   size_t max_doc_size= UdmVarListFindInt(&A->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
2825   UDM_INDEXER_PARAM params[UDM_INDEXER_THREADS_MAX];
2826 
2827   UdmLog(A, UDM_LOG_DEBUG, "   Indexing %d docs", (int) Docs->DocList.nitems);
2828   if (!Docs->DocList.nitems)
2829     return UDM_OK;
2830   if (nthreads > Docs->DocList.nitems)
2831     nthreads= Docs->DocList.nitems;
2832   for (i= 0; i < nthreads; i++)
2833   {
2834     UDM_INDEXER_PARAM *param= &params[i];
2835     UdmAgentInit(&param->Agent, A->Conf, i + 1);
2836     param->Docs= Docs;
2837     param->cache= cache;
2838     param->thd= NULL;
2839     param->max_doc_size= max_doc_size;
2840     A->Conf->THDHandler.ThreadCreate(&param->thd, IndexerThread, (void*) param);
2841   }
2842   for (i= 0; i < nthreads; i++)
2843   {
2844     void *thd= params[i].thd;
2845     A->Conf->THDHandler.ThreadJoin(thd);
2846   }
2847 
2848   UdmLog(A, UDM_LOG_DEBUG, "   Threads finished: %.2f", UdmStopTimer(&ticks));
2849   for (i= 0 ; i < nthreads; i++)
2850   {
2851     UDM_INDEXER_PARAM *param= &params[i];
2852     UdmAgentFree(&param->Agent);
2853   }
2854   return UDM_OK;
2855 }
2856 
2857 
2858 static udm_rc_t
UdmIndertedIndexExecContentQuery(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,UDM_QUERY * Query,const char * qbuf)2859 UdmIndertedIndexExecContentQuery(UDM_AGENT *Indexer, UDM_DB *db,
2860                                  UDM_INVERTED_INDEX_CACHE *cache,
2861                                  UDM_QUERY *Query,
2862                                  const char *qbuf)
2863 {
2864   udm_rc_t rc= UDM_OK;
2865   udm_timer_t ticks1= UdmStartTimer();
2866   size_t max_doc_size= UdmVarListFindInt(&Indexer->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
2867   size_t nthreads= UdmVarListFindInt(&Indexer->Conf->Vars, "IndexerThreads", 0);
2868   UDM_SQLRES SQLRes;
2869   UDM_STR row[2];
2870   UDM_DOCCACHE DocCache;
2871 
2872   if (UDM_OK != (rc= UdmDBSQLExecDirect(Indexer, db, &SQLRes, qbuf)))
2873     goto end;
2874   cache->Stats.load+= UdmStartTimer() - ticks1;
2875 
2876   UdmDocCacheInit(&DocCache);
2877   while (UDM_OK == UdmSQLFetchRowCachedCopy(Indexer, db, &cache->Stats,
2878                                             &SQLRes, row))
2879   {
2880     if (!nthreads || !Indexer->Conf->THDHandler.ThreadCreate)
2881     {
2882       if (UDM_OK != (rc= UdmParseRowCachedCopy(Indexer, db, cache,
2883                                                row, &Query->URLData,
2884                                                max_doc_size)))
2885         break;
2886     }
2887     else
2888     {
2889       if (UDM_OK != (rc= UdmRowCachedCopyToDocCache(Indexer, db, &cache->Stats,
2890                                                     row, &DocCache,
2891                                                     &Query->URLData)))
2892         break;
2893       /*fprintf(stderr, "DOCCACHE=%.3f\n", (double) DocCache.nbytes / 1024 / 1024);*/
2894       if (DocCache.nbytes >= 32 * 1024 * 1024)
2895       {
2896         UdmInvertedIndexCacheParseThreaded(Indexer, &DocCache,
2897                                            cache, nthreads);
2898         UdmDocCacheReset(&DocCache);
2899       }
2900     }
2901   }
2902   UdmSQLFree(&SQLRes);
2903   if (DocCache.DocList.nitems)
2904     UdmInvertedIndexCacheParseThreaded(Indexer, &DocCache, cache, nthreads);
2905   UdmDocCacheFree(&DocCache);
2906 
2907 end:
2908   return rc;
2909 }
2910 
2911 
2912 static udm_rc_t
UdmInvertedIndexCreateUsingURLIdLoop(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_QUERY * Query)2913 UdmInvertedIndexCreateUsingURLIdLoop(UDM_AGENT *Indexer,
2914                                      UDM_DB *db,
2915                                      UDM_INVERTED_INDEX_CACHE *cache,
2916                                      const char *wtable,
2917                                      UDM_QUERY *Query)
2918 {
2919   udm_rc_t rc= UDM_OK;
2920   UDM_DSTR query;
2921   UDM_CONST_STR ihint;
2922   UDM_URLDATALIST *URLList= &Query->URLData;
2923   size_t i;
2924   size_t ndocs_at_time= in_limit(db);
2925   udm_uint8 cache_size_limit= (udm_uint8) UdmVarListFindUnsigned(&Indexer->Conf->Vars,
2926                                                                  "IndexCacheSize",
2927                                                                  128*1024*1024);
2928   UdmConstStrInit(&ihint);
2929   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
2930   {
2931     UDM_SQLRES SQLRes;
2932     UdmDBSQLQuery(Indexer, db, &SQLRes, "SHOW CREATE TABLE cachedcopy");
2933     if (UdmSQLNumCols(&SQLRes) == 2 && UdmSQLNumRows(&SQLRes) == 1)
2934     {
2935       if (strstr(UdmSQLValue(&SQLRes, 0, 1), "PARTITION BY HASH (url_id"))
2936         UdmConstStrSet(&ihint, UDM_CSTR_WITH_LEN(" IGNORE INDEX (url_id)"));
2937     }
2938     UdmSQLFree(&SQLRes);
2939   }
2940   UdmDSTRInit(&query, ndocs_at_time * 8);
2941   for (i= 0; i < URLList->nitems; )
2942   {
2943     udm_uint8 cache_size;
2944     size_t nrows= UDM_MIN(ndocs_at_time, URLList->nitems - i);
2945     UdmLog(Indexer, UDM_LOG_DEBUG, "-- IDs %d-%d",
2946            URLList->Item[i].url_id, URLList->Item[i+nrows-1].url_id);
2947     if (UDM_OK != (rc= UdmCreateContentQueryUsingURLIdLoop(&query,
2948                                                            DEFAULT_CONTENT_FMT,
2949                                                            URLList, i, nrows,
2950                                                            ihint)))
2951       goto end;
2952     if (UDM_OK != (rc= UdmIndertedIndexExecContentQuery(Indexer, db,
2953                                                         cache, Query,
2954                                                         UdmDSTRPtr(&query))))
2955       goto end;
2956     i+= nrows;
2957 
2958     if ((cache_size= UdmInvertedIndexCacheEstimateUsedMemory(cache)) >
2959         cache_size_limit)
2960     {
2961       rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
2962       UdmInvertedIndexCacheReset(cache);
2963     }
2964   }
2965 
2966   rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
2967 end:
2968   UdmDSTRFree(&query);
2969   return rc;
2970 }
2971 
2972 
2973 static udm_rc_t
UdmInvertedIndexCreateUsingGenericLoop(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_QUERY * Query,const char * field,size_t from,size_t to)2974 UdmInvertedIndexCreateUsingGenericLoop(UDM_AGENT *Indexer,
2975                                        UDM_DB *db,
2976                                        UDM_INVERTED_INDEX_CACHE *cache,
2977                                        const char *wtable,
2978                                        UDM_QUERY *Query,
2979                                        const char *field,
2980                                        size_t from, size_t to)
2981 {
2982   udm_rc_t rc= UDM_OK;
2983   size_t i;
2984   udm_uint8 cache_size_limit= (udm_uint8) UdmVarListFindInt(&Indexer->Conf->Vars,
2985                                                             "IndexCacheSize",
2986                                                             128*1024*1024);
2987   for (i= from; i < to; i++)
2988   {
2989     char qbuf[256];
2990     udm_uint8 cache_size;
2991     if (from + 1 != to)
2992       UdmLog(Indexer, UDM_LOG_DEBUG, "-- Part %d", (int) i);
2993     udm_snprintf(qbuf, sizeof(qbuf),
2994                  "SELECT url_id,content FROM cachedcopy WHERE %s=%d",
2995                  field, (int) i);
2996     if (UDM_OK != (rc= UdmIndertedIndexExecContentQuery(Indexer, db,
2997                                                         cache, Query,
2998                                                         qbuf)))
2999       goto end;
3000     if ((cache_size= UdmInvertedIndexCacheEstimateUsedMemory(cache)) >
3001         cache_size_limit)
3002     {
3003       rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
3004       UdmInvertedIndexCacheReset(cache);
3005     }
3006   }
3007 
3008   rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, cache);
3009 end:
3010   return rc;
3011 }
3012 
3013 
3014 /*
3015   TODO34: Character set detection (crawl or index time?)
3016   TODO34: Segmenting (crawl or index time)?
3017   TODO34: CRC32 (crawl or index time?)
3018   TODO34: what to do with urlinfo sections?
3019   TODO34: Raw sections
3020   TODO34: Pairs
3021   TODO34: remove HAVE_ZLIB
3022   TODO34: In case of 'Mime application/msword "text/plain; charset=utf-8"',
3023           the charset= part is stored twice in urlinfob.
3024 */
3025 static udm_rc_t
UdmInvertedIndexCreate(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_QUERY * Query)3026 UdmInvertedIndexCreate(UDM_AGENT *Indexer, UDM_DB *db, const char *wtable,
3027                        UDM_QUERY *Query)
3028 {
3029   udm_timer_t ticks= UdmStartTimer();
3030   udm_rc_t rc= UDM_OK;
3031   UDM_INVERTED_INDEX_CACHE cache;
3032 
3033   UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3034   if (UDM_OK != UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3035                                                 INVERTED_INDEX_CACHE_PARTS))
3036     return UDM_ERROR;
3037   UdmLog(Indexer, UDM_LOG_INFO, "Indexing document contents");
3038 
3039   if (1)
3040     rc= UdmInvertedIndexCreateUsingURLIdLoop(Indexer, db, &cache, wtable, Query);
3041   else
3042     rc= UdmInvertedIndexCreateUsingGenericLoop(Indexer, db, &cache, wtable,
3043                                                Query, "0", 0, 1);
3044 
3045   UdmLog(Indexer, UDM_LOG_INFO, "Freeing cache");
3046   {
3047     udm_timer_t ticks1= UdmStartTimer();
3048     UdmInvertedIndexCacheFree(&cache);
3049     UdmLog(Indexer, UDM_LOG_INFO, "Freeing cache done: %.2f", UdmStopTimer(&ticks1));
3050   }
3051   UdmLog(Indexer, UDM_LOG_INFO, "Indexing document contents done: %.2f", UdmStopTimer(&ticks));
3052   UdmInvertedIndexCacheReportStatistics(Indexer, &cache.Stats);
3053   return rc;
3054 }
3055 
3056 
3057 typedef struct
3058 {
3059   size_t outgoing_link_count;
3060   size_t incoming_link_count;
3061   double popularity0;
3062   double popularity1;
3063   double server_weight;
3064   urlid_t url_id;
3065   uint4 score;
3066 } UDM_URLPOPINFO;
3067 
3068 
3069 typedef struct
3070 {
3071   size_t mitems;
3072   size_t nitems;
3073   UDM_URLPOPINFO *Item;
3074 } UDM_URLPOPINFOLIST;
3075 
3076 
3077 static void
UdmURLPopInfoListInit(UDM_URLPOPINFOLIST * List)3078 UdmURLPopInfoListInit(UDM_URLPOPINFOLIST *List)
3079 {
3080   bzero((void*) List, sizeof(*List));
3081 }
3082 
3083 
3084 static udm_rc_t
UdmURLPopInfoListAlloc(UDM_URLPOPINFOLIST * List,size_t mitems)3085 UdmURLPopInfoListAlloc(UDM_URLPOPINFOLIST *List, size_t mitems)
3086 {
3087   if (!(List->Item= (UDM_URLPOPINFO *) UdmMalloc(mitems * sizeof(UDM_URLPOPINFO))))
3088     return UDM_ERROR;
3089   List->mitems= mitems;
3090   return UDM_OK;
3091 }
3092 
3093 
3094 static void
UdmURLPopInfoListFree(UDM_URLPOPINFOLIST * List)3095 UdmURLPopInfoListFree(UDM_URLPOPINFOLIST *List)
3096 {
3097   UdmFree(List->Item);
3098 }
3099 
3100 
3101 static int
cmp_data_urlpopinfo(UDM_URLPOPINFO * d1,UDM_URLPOPINFO * d2)3102 cmp_data_urlpopinfo(UDM_URLPOPINFO *d1, UDM_URLPOPINFO *d2)
3103 {
3104   if (d1->url_id > d2->url_id) return 1;
3105   if (d1->url_id < d2->url_id) return -1;
3106   return 0;
3107 }
3108 
3109 
3110 static UDM_URLPOPINFO *
UdmURLPopInfoListSearch(UDM_URLPOPINFOLIST * List,urlid_t id)3111 UdmURLPopInfoListSearch(UDM_URLPOPINFOLIST *List, urlid_t id)
3112 {
3113   UDM_URLPOPINFO d;
3114   void *found;
3115   if (!List->nitems)
3116     return 0;
3117   d.url_id= id;
3118   found= UdmBSearch(&d, List->Item, List->nitems, sizeof(UDM_URLPOPINFO),
3119                     (udm_qsort_cmp) cmp_data_urlpopinfo);
3120   return (UDM_URLPOPINFO*) found;
3121 }
3122 
3123 
3124 /*
3125   Init URLPopInfoList from a sorted URLDataList.
3126 */
3127 static udm_rc_t
UdmURLPopInfoListInitFromURLDataList(UDM_AGENT * A,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_URLDATALIST * URLDataList)3128 UdmURLPopInfoListInitFromURLDataList(UDM_AGENT *A,
3129                                      UDM_URLPOPINFOLIST *URLPopInfoList,
3130                                      UDM_URLDATALIST *URLDataList)
3131 {
3132   size_t i;
3133   double r= 1e0 / (double) URLDataList->nitems;
3134   if (UDM_OK != UdmURLPopInfoListAlloc(URLPopInfoList, URLDataList->nitems))
3135     return UDM_ERROR;
3136   for (i= 0 ; i < URLDataList->nitems; i++)
3137   {
3138     UDM_URLPOPINFO *dst= &URLPopInfoList->Item[i];
3139     UDM_URLDATA *src= &URLDataList->Item[i];
3140     UDM_SERVER *Server= UdmServerFind(A, &A->Conf->Servers, src->url, NULL);
3141     dst->url_id= src->url_id;
3142     dst->score= src->score;
3143     dst->popularity0= r;
3144     dst->server_weight= (Server ? Server->weight : 1);
3145     dst->popularity1= 0;
3146     dst->outgoing_link_count= 0;
3147     dst->incoming_link_count= 0;
3148     /*fprintf(stderr, "[%.5f]%s\n", dst->popularity0, Server ?  Server->Match.pattern : NULL);*/
3149   }
3150   URLPopInfoList->nitems= URLDataList->nitems;
3151   return UDM_OK;
3152 }
3153 
3154 
3155 typedef struct
3156 {
3157   UDM_URLPOPINFO *from;
3158   UDM_URLPOPINFO *to;
3159 } UDM_LINKINFO;
3160 
3161 
3162 typedef struct
3163 {
3164   size_t nitems;
3165   size_t mitems;
3166   UDM_LINKINFO *Item;
3167 } UDM_LINKINFOLIST;
3168 
3169 static void
UdmLinkInfoListInit(UDM_LINKINFOLIST * List)3170 UdmLinkInfoListInit(UDM_LINKINFOLIST *List)
3171 {
3172   bzero((void*) List, sizeof(*List));
3173 }
3174 
3175 static void
UdmLinkInfoListFree(UDM_LINKINFOLIST * List)3176 UdmLinkInfoListFree(UDM_LINKINFOLIST *List)
3177 {
3178   UdmFree(List->Item);
3179 }
3180 
3181 static udm_rc_t
UdmLinkInfoListAlloc(UDM_LINKINFOLIST * List,size_t mitems)3182 UdmLinkInfoListAlloc(UDM_LINKINFOLIST *List, size_t mitems)
3183 {
3184   if (!(List->Item= (UDM_LINKINFO*) UdmMalloc(mitems * sizeof(UDM_LINKINFO))))
3185     return UDM_ERROR;
3186   List->mitems= mitems;
3187   return UDM_OK;
3188 }
3189 
3190 static void
UdmLinkInfoListAdd(UDM_LINKINFOLIST * List,const UDM_LINKINFO * Item)3191 UdmLinkInfoListAdd(UDM_LINKINFOLIST *List, const UDM_LINKINFO *Item)
3192 {
3193   UDM_ASSERT(List->nitems < List->mitems);
3194   List->Item[List->nitems++]= *Item;
3195 }
3196 
3197 
3198 typedef struct
3199 {
3200   size_t mitems;
3201   size_t nitems;
3202   UDM_LINKINFOLIST *Item;
3203 } UDM_LINKINFOLISTLIST;
3204 
3205 
3206 static void
UdmLinkInfoListListInit(UDM_LINKINFOLISTLIST * List)3207 UdmLinkInfoListListInit(UDM_LINKINFOLISTLIST *List)
3208 {
3209   bzero((void*) List, sizeof(*List));
3210 }
3211 
3212 static void
UdmLinkInfoListListFree(UDM_LINKINFOLISTLIST * List)3213 UdmLinkInfoListListFree(UDM_LINKINFOLISTLIST *List)
3214 {
3215   size_t i;
3216   for (i= 0; i < List->nitems; i++)
3217     UdmLinkInfoListFree(&List->Item[i]);
3218   UdmFree(List->Item);
3219 }
3220 
3221 static udm_rc_t
UdmLinkInfoListListRealloc(UDM_LINKINFOLISTLIST * List)3222 UdmLinkInfoListListRealloc(UDM_LINKINFOLISTLIST *List)
3223 {
3224   if (List->nitems >= List->mitems)
3225   {
3226     size_t mitems= List->mitems + 256;
3227     size_t nbytes= mitems * sizeof(UDM_LINKINFOLIST);
3228     if (!(List->Item= (UDM_LINKINFOLIST*) UdmRealloc(List->Item, nbytes)))
3229       return UDM_ERROR;
3230     List->mitems= mitems;
3231   }
3232   return UDM_OK;
3233 }
3234 
3235 static udm_rc_t
UdmLinkInfoListListAdd(UDM_LINKINFOLISTLIST * List,const UDM_LINKINFOLIST * Item)3236 UdmLinkInfoListListAdd(UDM_LINKINFOLISTLIST *List, const UDM_LINKINFOLIST *Item)
3237 {
3238   if (UDM_OK != UdmLinkInfoListListRealloc(List))
3239     return UDM_ERROR;
3240   List->Item[List->nitems++]= *Item;
3241   return UDM_OK;
3242 }
3243 
3244 
3245 typedef struct
3246 {
3247   UDM_CONST_STR text;
3248   UDM_LINKINFO link;
3249 } UDM_LINKTEXT;
3250 
3251 typedef struct
3252 {
3253   size_t nitems;
3254   size_t mitems;
3255   UDM_LINKTEXT *Item;
3256 } UDM_LINKTEXTLIST;
3257 
3258 static int
linktextcmp(const UDM_LINKTEXT * a,const UDM_LINKTEXT * b)3259 linktextcmp(const UDM_LINKTEXT *a, const UDM_LINKTEXT *b)
3260 {
3261   if (a->link.to->url_id != b->link.to->url_id)
3262     return a->link.to->url_id < b->link.to->url_id ? -1 : 1;
3263   return 0;
3264 }
3265 
3266 static void
UdmLinkTextListInit(UDM_LINKTEXTLIST * List)3267 UdmLinkTextListInit(UDM_LINKTEXTLIST *List)
3268 {
3269   bzero((void*) List, sizeof(*List));
3270 }
3271 
3272 static void
UdmLinkTextListFree(UDM_LINKTEXTLIST * List)3273 UdmLinkTextListFree(UDM_LINKTEXTLIST *List)
3274 {
3275   UdmFree(List->Item);
3276 }
3277 
3278 static udm_rc_t
UdmLinkTextListAlloc(UDM_LINKTEXTLIST * List,size_t nitems)3279 UdmLinkTextListAlloc(UDM_LINKTEXTLIST *List, size_t nitems)
3280 {
3281   if (!(List->Item= (UDM_LINKTEXT *) UdmMalloc(nitems * sizeof(UDM_LINKTEXT))))
3282     return UDM_ERROR;
3283   List->mitems= 0;
3284   return UDM_OK;
3285 }
3286 
3287 typedef struct
3288 {
3289   UDM_CONST_STR url;
3290   udmcrc32_t hash;
3291   urlid_t id;
3292 } UDM_URLIDHASH;
3293 
3294 
3295 static void
UdmURLIdHashItemInitFromURLData(UDM_URLIDHASH * dst,UDM_URLDATA * src)3296 UdmURLIdHashItemInitFromURLData(UDM_URLIDHASH *dst, UDM_URLDATA *src)
3297 {
3298   UdmConstStrSetStr(&dst->url, src->url);
3299   dst->hash= UdmStrCRC32(src->url);
3300   dst->id= src->url_id;
3301 }
3302 
3303 
3304 static void
UdmURLIdHashItemInit(UDM_URLIDHASH * dst,const char * url)3305 UdmURLIdHashItemInit(UDM_URLIDHASH *dst, const char *url)
3306 {
3307   UdmConstStrSetStr(&dst->url, url);
3308   dst->hash= UdmStrCRC32(url);
3309   dst->id= 0;
3310 }
3311 
3312 
3313 static udm_rc_t
UdmURLIdHashStore(UDM_HASH * hash,void * ofs,void * item)3314 UdmURLIdHashStore(UDM_HASH *hash, void *ofs, void *item)
3315 {
3316   memcpy(ofs, item, sizeof(UDM_URLIDHASH));
3317   return UDM_OK;
3318 }
3319 
3320 
3321 static udm_rc_t
UdmURLIdHashJoin(UDM_HASH * hash,void * ofs,void * b)3322 UdmURLIdHashJoin(UDM_HASH *hash, void *ofs, void *b)
3323 {
3324   return UDM_OK;
3325 }
3326 
3327 
3328 static int
UdmConstStrEQ(const UDM_CONST_STR * s1,const UDM_CONST_STR * s2)3329 UdmConstStrEQ(const UDM_CONST_STR *s1, const UDM_CONST_STR *s2)
3330 {
3331   if (s1->length != s2->length)
3332     return 1;
3333   return memcmp(s1->str, s2->str, s1->length);
3334 }
3335 
3336 
3337 /*
3338   Returns 0 (on equal)
3339   Returns 1 (on non equal).
3340   Note: not siutable for sorting!
3341 */
3342 static int
UdmURLIdHashCmp(UDM_HASH * hash,void * w1,void * w2)3343 UdmURLIdHashCmp(UDM_HASH *hash, void *w1, void *w2)
3344 {
3345   if (((UDM_URLIDHASH*)w1)->hash != ((UDM_URLIDHASH*)w2)->hash)
3346     return 1;
3347   return UdmConstStrEQ(&(((UDM_URLIDHASH*)w1)->url),
3348                        &(((UDM_URLIDHASH*)w2)->url));
3349 }
3350 
3351 
3352 static udmcrc32_t
UdmURLIdHashCalculateHash(UDM_HASH * hash,const void * item)3353 UdmURLIdHashCalculateHash(UDM_HASH *hash, const void *item)
3354 {
3355   return ((const UDM_URLIDHASH*)item)->hash;
3356 }
3357 
3358 
3359 static UDM_HASH_HANDLER UdmURLIdHashHandler=
3360 {
3361   UdmURLIdHashStore,         /* store  */
3362   UdmURLIdHashJoin,          /* join   */
3363   UdmURLIdHashCmp,           /* cmp    */
3364   UdmURLIdHashCalculateHash, /* keykey */
3365   UdmURLIdHashCalculateHash  /* reckey */
3366 };
3367 
3368 
3369 static udm_rc_t
UdmURLIdHashInitFromURLDataList(UDM_AGENT * Indexer,UDM_HASH * urlidhash,const UDM_URLDATALIST * URLList)3370 UdmURLIdHashInitFromURLDataList(UDM_AGENT *Indexer,
3371                                 UDM_HASH *urlidhash,
3372                                 const UDM_URLDATALIST *URLList)
3373 {
3374   size_t i;
3375   if (UDM_OK != UdmHashInit(urlidhash, &UdmURLIdHashHandler,
3376                             NULL, URLList->nitems + URLList->nitems/10 + 100,
3377                             sizeof(UDM_URLIDHASH)))
3378   {
3379     UdmLog(Indexer, UDM_LOG_ERROR, "UdmHashInit failed");
3380     return UDM_ERROR;
3381   }
3382   for (i= 0; i < URLList->nitems; i++)
3383   {
3384     UDM_URLIDHASH item;
3385     UdmURLIdHashItemInitFromURLData(&item, &URLList->Item[i]);
3386     if (NULL == UdmHashPut(urlidhash, &item))
3387     {
3388       UdmLog(Indexer, UDM_LOG_ERROR, "UdmHashPut failed");
3389       return UDM_ERROR;
3390     }
3391   }
3392   return UDM_OK;
3393 }
3394 
3395 
3396 static udm_rc_t
UdmInvertedIndexAddURLText(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_URLDATALIST * URLList)3397 UdmInvertedIndexAddURLText(UDM_AGENT *Indexer,
3398                              UDM_DB *db,
3399                              const char *wtable,
3400                              UDM_URLDATALIST *URLList)
3401 {
3402   udm_rc_t rc= UDM_OK;
3403   size_t i;
3404   UDM_INVERTED_INDEX_CACHE cache;
3405 
3406   if (!UdmVarListFindByPrefix(&Indexer->Conf->Sections, "url.", 4))
3407     return UDM_OK;
3408 
3409   UdmLog(Indexer, UDM_LOG_INFO, "Indexing URL text");
3410   UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3411   if (UDM_OK != UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3412                                                 INVERTED_INDEX_CACHE_PARTS))
3413     return UDM_ERROR;
3414 
3415   for (i=0; i < URLList->nitems && rc == UDM_OK; i++)
3416   {
3417     UDM_URLDATA *Item= &URLList->Item[i];
3418     UDM_CONSTWORDLIST CWL;
3419     UDM_DOCUMENT Doc;
3420     if (!Item->score)
3421       continue; /* Does not have urlinfob record, e.g. HrefOnly */
3422     UdmConstWordListInit(&CWL);
3423     UdmDocInit(&Doc);
3424     UdmURLParse(&Doc.CurURL, Item->url);
3425     UdmVarListAddLst(&Doc.Sections, &Indexer->Conf->Sections, NULL, "*");
3426     UdmParseURLText(Indexer, &Doc);
3427     /* TODO34: RemoteCharset, RemoteFileNameCharset */
3428     UdmTextListToConstWordList(&Doc.TextList,
3429                                Indexer->Conf->unidata, Indexer->Conf->lcs,
3430                                UDM_RECODE_HTML, &CWL);
3431     rc= UdmWordListSortAndConvert(Indexer, &CWL, &cache,
3432                                   Item->url_id, Indexer->Conf->lcs);
3433     UdmConstWordListFree(&CWL);
3434     UdmDocFree(&Doc);
3435   }
3436   if (rc == UDM_OK)
3437     rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, &cache);
3438 
3439   UdmInvertedIndexCacheFree(&cache);
3440   return rc;
3441 }
3442 
3443 
3444 static udm_rc_t
UdmLinkTextListToInvertedIndexCache(UDM_AGENT * Indexer,UDM_LINKTEXTLIST * LinkTextList,UDM_INVERTED_INDEX_CACHE * cache,udm_secno_t ilinktext)3445 UdmLinkTextListToInvertedIndexCache(UDM_AGENT *Indexer,
3446                                     UDM_LINKTEXTLIST *LinkTextList,
3447                                     UDM_INVERTED_INDEX_CACHE *cache,
3448                                     udm_secno_t ilinktext)
3449 {
3450   size_t i;
3451   udm_rc_t rc= UDM_OK;
3452   UDM_WORD_SCANNER scanner;
3453   UDM_CONSTWORDLIST CWL;
3454 
3455   UdmWordScannerInit(&scanner, Indexer->Conf->unidata, Indexer->Conf->lcs);
3456   UdmConstWordListInit(&CWL);
3457 
3458   /* Sort Linktext items by url_id */
3459   UdmSort(LinkTextList->Item, LinkTextList->nitems, sizeof(UDM_LINKTEXT), (udm_qsort_cmp) linktextcmp);
3460   for (i= 0; i < LinkTextList->nitems; i++)
3461   {
3462     UDM_LINKTEXT *Item= &LinkTextList->Item[i];
3463     UDM_URLPOPINFO *popinfo= Item->link.to;
3464     if (!popinfo->score) /* URL does not have urlinfob record. e.g. HrefOnly */
3465       continue;
3466     UdmConstWordListAddString(&scanner,
3467                               UDM_RECODE_HTML, &CWL, /* TODO34: StripAccents */
3468                               ilinktext,
3469                               Item->text.str, Item->text.length);
3470     CWL.wordpos[ilinktext]+= 8; /* TODO34: check overflow */
3471     if (i + 1 == LinkTextList->nitems ||
3472         LinkTextList->Item[i+1].link.to->url_id != Item->link.to->url_id)
3473     {
3474       if (UDM_OK != (rc= UdmWordListSortAndConvert(Indexer, &CWL, cache,
3475                                                    Item->link.to->url_id,
3476                                                    Indexer->Conf->lcs)))
3477         goto ex;
3478       CWL.nitems= 0;
3479       CWL.wordpos[ilinktext]= 0;
3480     }
3481   }
3482 ex:
3483   UdmConstWordListFree(&CWL);
3484   return rc;
3485 }
3486 
3487 
3488 /* Order of the columns */
3489 #define LINKTEXT_ID   0
3490 #define LINKTEXT_URL  1
3491 #define LINKTEXT_TEXT 2
3492 
3493 static udm_rc_t
UdmInvertedIndexProcessLinksResult(UDM_AGENT * Indexer,UDM_HASH * urlidhash,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_INVERTED_INDEX_CACHE * cache,UDM_LINKINFOLISTLIST * LinkInfoListList,UDM_SQLRES * SQLRes,udm_secno_t ilinktext)3494 UdmInvertedIndexProcessLinksResult(UDM_AGENT *Indexer,
3495                                    UDM_HASH *urlidhash,
3496                                    UDM_URLPOPINFOLIST *URLPopInfoList,
3497                                    UDM_INVERTED_INDEX_CACHE *cache,
3498                                    UDM_LINKINFOLISTLIST *LinkInfoListList,
3499                                    UDM_SQLRES *SQLRes,
3500                                    udm_secno_t ilinktext)
3501 {
3502   udm_rc_t rc;
3503   size_t i, nrows;
3504   UDM_LINKTEXTLIST LinkTextList;
3505   UDM_LINKINFOLIST LinkInfoList;
3506 
3507   if (!(nrows= UdmSQLNumRows(SQLRes)))
3508     return UDM_OK;
3509 
3510   UdmLinkInfoListInit(&LinkInfoList);
3511   UdmLinkTextListInit(&LinkTextList);
3512 
3513   if (UDM_OK != (rc= UdmLinkTextListAlloc(&LinkTextList, nrows)))
3514     return rc;
3515 
3516   if (UDM_OK != (rc= UdmLinkInfoListAlloc(&LinkInfoList, nrows)))
3517     goto ex;
3518 
3519   for (i=0; i < nrows; i++)
3520   {
3521     UDM_LINKTEXT *Item= &LinkTextList.Item[LinkTextList.nitems];
3522     const char *url= UdmSQLValue(SQLRes, i, LINKTEXT_URL);
3523     urlid_t fromid= atoi(UdmSQLValue(SQLRes, i, LINKTEXT_ID));
3524     UDM_URLIDHASH *u, search_item;
3525     UdmURLIdHashItemInit(&search_item, url);
3526     if (!(u= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &search_item)) || !u->id)
3527       continue;
3528     if (!(Item->link.to= UdmURLPopInfoListSearch(URLPopInfoList, u->id)))
3529     {
3530       UdmLog(Indexer, UDM_LOG_WARN,
3531              "UdmInvertedIndexAddLinkText: to_id=%d not found", u->id);
3532       continue;
3533     }
3534     if (!(Item->link.from= UdmURLPopInfoListSearch(URLPopInfoList, fromid)))
3535     {
3536       UdmLog(Indexer, UDM_LOG_WARN,
3537              "UdmInvertedIndexAddLinkText: from_id=%s not found",
3538              UdmSQLValue(SQLRes, i, LINKTEXT_ID));
3539       continue;
3540     }
3541     if (cache)
3542     {
3543       Item->text.str= UdmSQLValue(SQLRes, i, LINKTEXT_TEXT);
3544       Item->text.length= UdmSQLLen(SQLRes, i, LINKTEXT_TEXT);
3545       LinkTextList.nitems++;
3546     }
3547     UdmLinkInfoListAdd(&LinkInfoList, &Item->link);
3548   }
3549   if (!LinkInfoList.nitems)
3550   {
3551     /* It will not be added into LinkInfoListList, so free it now */
3552     UdmLinkInfoListFree(&LinkInfoList);
3553     goto ex;
3554   }
3555 
3556   if (UDM_OK != UdmLinkInfoListListAdd(LinkInfoListList, &LinkInfoList))
3557     return UDM_ERROR;
3558 
3559   if (LinkTextList.nitems)
3560     rc= UdmLinkTextListToInvertedIndexCache(Indexer, &LinkTextList, cache,
3561                                             ilinktext);
3562 ex:
3563   UdmLinkTextListFree(&LinkTextList);
3564   return rc;
3565 }
3566 
3567 
3568 static void
UdmLinkInfoListCalcOutgoingLinks(UDM_LINKINFOLIST * List)3569 UdmLinkInfoListCalcOutgoingLinks(UDM_LINKINFOLIST *List)
3570 {
3571   size_t i;
3572   for (i= 0; i < List->nitems; i++)
3573   {
3574     UDM_LINKINFO *Item= &List->Item[i];
3575     Item->from->outgoing_link_count++;
3576     /*fprintf(stderr, "%d->%d\n", Item->from->url_id, Item->to->url_id);*/
3577   }
3578 }
3579 
3580 
3581 static void
UdmLinkInfoListCalcIncomingLinks(UDM_LINKINFOLIST * List)3582 UdmLinkInfoListCalcIncomingLinks(UDM_LINKINFOLIST *List)
3583 {
3584   size_t i;
3585   for (i= 0; i < List->nitems; i++)
3586   {
3587     UDM_LINKINFO *Item= &List->Item[i];
3588     Item->to->incoming_link_count++;
3589   }
3590 }
3591 
3592 
3593 static void
UdmLinkInfoListCalcSumWeightsIncomingLinks2(UDM_LINKINFOLIST * List)3594 UdmLinkInfoListCalcSumWeightsIncomingLinks2(UDM_LINKINFOLIST *List)
3595 {
3596   size_t i;
3597   for (i= 0; i < List->nitems; i++)
3598   {
3599     UDM_LINKINFO *Item= &List->Item[i];
3600     /*Item->to->incoming_link_count++;*/
3601     if (Item->from->outgoing_link_count)
3602       Item->to->popularity1+= (Item->from->popularity0 /
3603                                (double) Item->from->outgoing_link_count) *
3604                               Item->from->server_weight;
3605     /*fprintf(stderr, "%d->%d\n", Item->from->url_id, Item->to->url_id);*/
3606   }
3607 }
3608 
3609 
3610 static void
UdmURLDataListNormalizePopRank(UDM_URLPOPINFOLIST * URLPopInfoList,UDM_URLDATALIST * URLDataList)3611 UdmURLDataListNormalizePopRank(UDM_URLPOPINFOLIST *URLPopInfoList,
3612                                UDM_URLDATALIST *URLDataList)
3613 {
3614   size_t i;
3615   UDM_ASSERT(URLDataList->nitems == URLPopInfoList->nitems);
3616   for (i= 0; i < URLPopInfoList->nitems; i++)
3617   {
3618     UDM_URLPOPINFO *src= &URLPopInfoList->Item[i];
3619     UDM_URLDATA *dst= &URLDataList->Item[i];
3620     double tmp= src->popularity0 * 1000000000;
3621     UDM_ASSERT(src->url_id == dst->url_id);
3622     if (tmp < 1)
3623       tmp= 1;
3624     dst->pop_rank= log(tmp)/log(1000000000);
3625     dst->per_site= (uint4) src->incoming_link_count;
3626     /*fprintf(stderr, "[%d] %f %f %s\n", dst->url_id, src->popularity0, dst->pop_rank, dst->url);*/
3627   }
3628 }
3629 
3630 
3631 static udm_rc_t
UdmLinkInfoListListCalcPopRankOnce(UDM_URLPOPINFOLIST * URLPopInfoList,UDM_LINKINFOLISTLIST * LinkInfoListList)3632 UdmLinkInfoListListCalcPopRankOnce(UDM_URLPOPINFOLIST *URLPopInfoList,
3633                                   UDM_LINKINFOLISTLIST *LinkInfoListList)
3634 {
3635   size_t i;
3636   UDM_ASSERT(URLPopInfoList->nitems);
3637   for (i= 0; i < LinkInfoListList->nitems; i++)
3638     UdmLinkInfoListCalcSumWeightsIncomingLinks2(&LinkInfoListList->Item[i]);
3639   for (i= 0; i < URLPopInfoList->nitems; i++)
3640   {
3641     UDM_URLPOPINFO *Item= &URLPopInfoList->Item[i];
3642     Item->popularity0= (0.15 / URLPopInfoList->nitems +
3643                         0.85 * Item->popularity1);
3644     /*
3645     fprintf(stderr, "[%d] norm=%.6f old=%.6f new=%.6f ilinks=%d olinks=%d Wsrv=%.2f\n",
3646             Item->url_id, 0.15 / URLPopInfoList->nitems,
3647             Item->popularity1, Item->popularity0,
3648             (int) Item->incoming_link_count,
3649             (int) Item->outgoing_link_count,
3650             Item->server_weight);
3651     */
3652     Item->popularity1= 0; /* Prepare for the next iteration */
3653   }
3654   return UDM_OK;
3655 }
3656 
3657 
3658 static size_t
UdmLinkInfoListListCalcTotalLinkCount(UDM_LINKINFOLISTLIST * LinkInfoListList)3659 UdmLinkInfoListListCalcTotalLinkCount(UDM_LINKINFOLISTLIST *LinkInfoListList)
3660 {
3661   size_t i, count;
3662   for (count=0, i= 0; i < LinkInfoListList->nitems; i++)
3663     count+= LinkInfoListList->Item[i].nitems;
3664   return count;
3665 }
3666 
3667 
3668 static udm_rc_t
UdmLinkInfoListListCalcPopRank(UDM_AGENT * Indexer,UDM_URLPOPINFOLIST * URLPopInfoList,UDM_LINKINFOLISTLIST * LinkInfoListList)3669 UdmLinkInfoListListCalcPopRank(UDM_AGENT *Indexer,
3670                                UDM_URLPOPINFOLIST *URLPopInfoList,
3671                                UDM_LINKINFOLISTLIST *LinkInfoListList)
3672 {
3673   size_t i;
3674   if (!URLPopInfoList->nitems)
3675     return UDM_OK;
3676   UdmLog(Indexer, UDM_LOG_INFO,
3677          "Calculating popularity: %d documents, %d links",
3678          (int) URLPopInfoList->nitems,
3679          (int) UdmLinkInfoListListCalcTotalLinkCount(LinkInfoListList));
3680   for (i= 0; i < LinkInfoListList->nitems; i++)
3681   {
3682     UdmLinkInfoListCalcOutgoingLinks(&LinkInfoListList->Item[i]);
3683     UdmLinkInfoListCalcIncomingLinks(&LinkInfoListList->Item[i]);
3684   }
3685   for (i= 0; i < 3; i++)
3686   {
3687     udm_rc_t rc;
3688     if (UDM_OK != (rc= UdmLinkInfoListListCalcPopRankOnce(URLPopInfoList,
3689                                                           LinkInfoListList)))
3690       return rc;
3691   }
3692   return UDM_OK;
3693 }
3694 
3695 
/*
  Parameters of link processing.
  Filled either from the configuration by UdmAddLinksParamInitFromEnv(),
  or taken from the constant links_param_for_popularity.
*/
typedef struct
{
  /* Section number of the "ilinktext" section, 0 if it is not defined */
  udm_secno_t ilinktext;
  /* Value of the "UsePopularity" variable (UDM_TRUE when it is not set) */
  udm_bool_t  use_popularity;
} UDM_ADD_LINKS_PARAM;
3701 
3702 
/**
  Link processing parameters for "indexer --rewritepop".
  The initializer is positional: ilinktext first, then use_popularity.
*/
static const UDM_ADD_LINKS_PARAM links_param_for_popularity=
{
  0,       /* ilinktext: don't touch indexer link words */
  UDM_TRUE /* use_popularity: force UsePopularity if explicitly asked for */
};
3711 
3712 
3713 static void
UdmAddLinksParamInitFromEnv(UDM_ADD_LINKS_PARAM * param,const UDM_ENV * Env)3714 UdmAddLinksParamInitFromEnv(UDM_ADD_LINKS_PARAM *param, const UDM_ENV *Env)
3715 {
3716   const UDM_VAR *ilinktext= UdmVarListFind(&Env->Sections, "ilinktext");
3717   param->ilinktext= ilinktext ? UdmVarSecno(ilinktext) : 0;
3718   param->use_popularity= UdmVarListFindBool(&Env->Vars, "UsePopularity", UDM_TRUE);
3719 }
3720 
3721 
3722 static udm_rc_t
UdmLoadRedirectLinks(UDM_AGENT * Indexer,UDM_DB * db,UDM_URLDATALIST * URLList,UDM_HASH * urlidhash)3723 UdmLoadRedirectLinks(UDM_AGENT *Indexer, UDM_DB *db,
3724                      UDM_URLDATALIST *URLList,
3725                      UDM_HASH *urlidhash)
3726 {
3727   udm_timer_t ticks= UdmStartTimer();
3728   char qbuf[128];
3729   UDM_SQLRES SQLRes;
3730   udm_rc_t rc;
3731   size_t i;
3732 
3733   UdmLog(Indexer, UDM_LOG_INFO, "Loading redirects");
3734   udm_snprintf(qbuf, sizeof(qbuf), "SELECT url_id,url FROM redirect");
3735   if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, &SQLRes, qbuf)))
3736    return rc;
3737 
3738   for (i= 0; i < UdmSQLNumRows(&SQLRes); i++)
3739   {
3740     /*
3741       There is a redirect from (src_id,src_url) to (dst_id,dst_url).
3742       We associate src_url with dst_id, so all links coming to src_url
3743       are considered as belonging to dst_id instead.
3744     */
3745     urlid_t srcid= atoi(UdmSQLValue(&SQLRes, i, 0));
3746     UDM_URLIDHASH *src, *dst, key;
3747     UDM_URLDATA *srcdata= UdmURLDataListSearch(URLList, srcid);
3748     if (!srcdata)
3749     {
3750       /* TODO34: src can be not in URLData, if "indexer -s200 --index" is give */
3751       continue;
3752     }
3753 
3754     UdmURLIdHashItemInit(&key, srcdata->url);
3755     if (!(src= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &key)) || !src->id)
3756       continue; /* The link source is out of the indexing space */
3757 
3758     UdmURLIdHashItemInit(&key, UdmSQLValue(&SQLRes, i, 1));
3759     if (!(dst= (UDM_URLIDHASH*) UdmHashFind(urlidhash, &key)) || !dst->id)
3760       continue; /* The link destination is out of the indexing space */
3761 
3762     /*printf("FROM [%d:%s]\nTO   [%d:%s]\n",
3763              src->id, src->url.str, dst->id, dst->url.str);*/
3764     src->id= dst->id;
3765   }
3766   UdmSQLFree(&SQLRes);
3767   UdmLog(Indexer, UDM_LOG_INFO,
3768          "Loading redirects done: %d links, %.2f sec",
3769          (int) UdmSQLNumRows(&SQLRes), UdmStopTimer(&ticks));
3770   return UDM_OK;
3771 }
3772 
3773 
3774 /**
3775   Calculate popularity and add link words into the index cache.
3776   cache can be NULL, which means don't add link words and
3777   calculate popularity only.
3778 */
3779 static udm_rc_t
UdmInvertedIndexAddLinkText(UDM_AGENT * Indexer,UDM_DB * db,UDM_INVERTED_INDEX_CACHE * cache,const char * wtable,UDM_URLDATALIST * URLList,const UDM_ADD_LINKS_PARAM param)3780 UdmInvertedIndexAddLinkText(UDM_AGENT *Indexer,
3781                             UDM_DB *db,
3782                             UDM_INVERTED_INDEX_CACHE *cache,
3783                             const char *wtable,
3784                             UDM_URLDATALIST *URLList,
3785                             const UDM_ADD_LINKS_PARAM param)
3786 {
3787   int i;
3788   udm_rc_t rc= UDM_OK;
3789   udm_timer_t ticks;
3790   UDM_HASH urlidhash;
3791   UDM_LINKINFOLISTLIST LinkInfoListList;
3792   UDM_URLPOPINFOLIST URLPopInfoList;
3793 
3794   UdmURLPopInfoListInit(&URLPopInfoList);
3795   UdmLinkInfoListListInit(&LinkInfoListList);
3796   if (UDM_OK != (rc= UdmURLPopInfoListInitFromURLDataList(Indexer,
3797                                                           &URLPopInfoList,
3798                                                           URLList)))
3799     goto ex;
3800 
3801   if (UDM_OK != (rc= UdmURLIdHashInitFromURLDataList(Indexer,
3802                                                      &urlidhash, URLList)))
3803     goto ex;
3804 
3805   if (UdmVarListFindBool(&Indexer->Conf->Vars, "ResolveRedirect", UDM_TRUE) &&
3806       UDM_OK != (rc= UdmLoadRedirectLinks(Indexer, db, URLList, &urlidhash)))
3807     goto ex;
3808 
3809   ticks= UdmStartTimer();
3810   UdmLog(Indexer, UDM_LOG_INFO, "Loading links");
3811   for (i= 0; i < 256; i++)
3812   {
3813     char qbuf[128];
3814     UDM_SQLRES SQLRes;
3815     if ((i % 16) == 0)
3816       UdmLog(Indexer, UDM_LOG_EXTRA, "- Loading links (part %d..%d)", i, i + 16 - 1);
3817     /*
3818       Add the "ilinktext" column only if cache is not NULL. Otherwise,
3819       we're running "indexer --rewritepop", so ilinktext is not needed.
3820     */
3821     udm_snprintf(qbuf, sizeof(qbuf),
3822                  "SELECT url_id,url%s FROM links WHERE seed=%d",
3823                  cache ? ",linktext" : "", i);
3824     if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, &SQLRes, qbuf)))
3825       goto ex;
3826 
3827     rc= UdmInvertedIndexProcessLinksResult(Indexer, &urlidhash, &URLPopInfoList,
3828                                             cache, &LinkInfoListList,
3829                                             &SQLRes, param.ilinktext);
3830     UdmSQLFree(&SQLRes);
3831     if (rc != UDM_OK)
3832       goto ex;
3833   }
3834   UdmLog(Indexer, UDM_LOG_INFO,
3835          "Loading links done: %.2f sec", UdmStopTimer(&ticks));
3836 
3837   rc= UdmLinkInfoListListCalcPopRank(Indexer, &URLPopInfoList, &LinkInfoListList);
3838   UdmURLDataListNormalizePopRank(&URLPopInfoList, URLList);
3839 
3840 ex:
3841   UdmURLPopInfoListFree(&URLPopInfoList);
3842   UdmLinkInfoListListFree(&LinkInfoListList);
3843   UdmHashFree(&urlidhash);
3844   return rc;
3845 }
3846 
3847 
3848 static udm_rc_t
UdmInvertedIndexRewritePopularityOneDB(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,const char * wtable)3849 UdmInvertedIndexRewritePopularityOneDB(UDM_AGENT *A,
3850                                        UDM_DB *db,
3851                                        UDM_QUERY *Query,
3852                                        const char *wtable)
3853 {
3854   udm_rc_t rc;
3855   udm_bool_t tr= UDM_TEST(UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT);
3856   UDM_WRITE_HELPER Helper;
3857 
3858   if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
3859     goto ex2;
3860 
3861   if (UDM_OK != (rc= UdmInvertedIndexAddLinkText(A, db, NULL, wtable,
3862                                                  &Query->URLData,
3863                                                  links_param_for_popularity)))
3864     goto ex;
3865 
3866   if (tr && UDM_OK != (rc= UdmDBSQLBegin(A, db)))
3867     goto ex;
3868 
3869   if (UDM_OK != (rc= UdmWritePopularityBdictAndTable(A, db, &Query->URLData,
3870                                                      wtable, &Helper)))
3871     goto ex;
3872 
3873   if (tr && UDM_OK != (rc= UdmDBSQLCommit(A, db)))
3874     goto ex;
3875 
3876 ex:
3877   UdmWriteHelperFree(&Helper);
3878 ex2:
3879   UdmURLDataListFree(&Query->URLData);
3880   return rc;
3881 }
3882 
3883 
3884 static udm_rc_t
UdmRewritePopularity(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)3885 UdmRewritePopularity(UDM_AGENT *Indexer, UDM_DB *db, UDM_QUERY *Query)
3886 {
3887 
3888   char tablename[64];
3889   UdmBlobGetTableForRewrite(Indexer, db, tablename, sizeof(tablename));
3890   return UdmInvertedIndexRewritePopularityOneDB(Indexer, db, Query, tablename);
3891 }
3892 
3893 
3894 /**
3895   Load links from the database.
3896   Add link words into the cache, and calculate popularity.
3897   Store cached link words into the index.
3898   TODO34: don't calculate popularity if param.use_popularity is UDM_FALSE.
3899   TODO34: add tests for all "UsePopularity" and "Section ilinktext" combinations
3900           for "indexer --index" and "indexer --rewritepop"
3901 */
3902 static udm_rc_t
UdmInvertedIndexAddLinks(UDM_AGENT * Indexer,UDM_DB * db,const char * wtable,UDM_URLDATALIST * URLList,UDM_ADD_LINKS_PARAM param)3903 UdmInvertedIndexAddLinks(UDM_AGENT *Indexer, UDM_DB *db,
3904                          const char *wtable,
3905                          UDM_URLDATALIST *URLList,
3906                          UDM_ADD_LINKS_PARAM param)
3907 {
3908   udm_rc_t rc;
3909   UDM_INVERTED_INDEX_CACHE cache;
3910   UdmInvertedIndexCacheInit(&cache, Indexer->Conf);
3911   if (UDM_OK != (rc= UdmInvertedIndexCacheAllocParts(Indexer, &cache,
3912                                                      INVERTED_INDEX_CACHE_PARTS)))
3913     return rc;
3914 
3915   if (UDM_OK != (rc= UdmInvertedIndexAddLinkText(Indexer, db, &cache,
3916                                                  wtable, URLList, param)))
3917     goto ex;
3918 
3919   rc= UdmInvertedIndexCacheStore(Indexer, db, wtable, &cache);
3920 
3921 ex:
3922   UdmInvertedIndexCacheFree(&cache);
3923   return rc;
3924 }
3925 
3926 /***************************************************************/
3927 static udm_rc_t
UdmCreateInvertedWordIndexFromCachedCopySQL(UDM_AGENT * Indexer,UDM_DB * db,UDM_QUERY * Query)3928 UdmCreateInvertedWordIndexFromCachedCopySQL(UDM_AGENT *Indexer,
3929                                             UDM_DB *db,
3930                                             UDM_QUERY *Query)
3931 {
3932   udm_rc_t rc;
3933   char buf[128], wtable[64];
3934   int tr= (UdmSQLDBFlags(db) & UDM_SQL_HAVE_TRANSACT) ? 1 : 0;
3935   int tr_truncate= tr && (UdmSQLDBType(db) != UDM_DB_SYBASE);
3936   udm_timer_t ticks;
3937   udm_bool_t disable_keys= UdmVarListFindBool(UdmSQLDBVars(db), "DisableKeys", UDM_TRUE);
3938   UDM_WRITE_HELPER Helper;
3939   UDM_ADD_LINKS_PARAM param;
3940 
3941   UdmAddLinksParamInitFromEnv(&param, Indexer->Conf);
3942   if (UDM_OK != (rc= UdmWriteHelperInit(&Helper, db)))
3943     return UDM_ERROR;
3944 
3945   UdmLog(Indexer, UDM_LOG_DEBUG, Helper.use_deflate ? "Using deflate" : "Not using deflate");
3946 
3947   /* Get table to write to */
3948   if (UDM_OK != (rc= UdmBlobGetWTable(Indexer, db, wtable, sizeof(wtable))))
3949     goto ret2;
3950   /* Lock tables for MySQL */
3951   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
3952   {
3953     if (UdmSQLDBVersion(db) >= 40000 && disable_keys)
3954     {
3955       sprintf(buf, "ALTER TABLE %s DISABLE KEYS", wtable);
3956       if (UDM_OK != UdmDBSQLQuery(Indexer, db, NULL, buf))
3957         goto ret;
3958     }
3959     udm_snprintf(buf, sizeof(buf),
3960                  "LOCK TABLES "
3961                  "url READ,urlinfo READ,cachedcopy READ,"
3962                  "redirect READ,links READ,"
3963                  "%s WRITE",
3964                  wtable);
3965     if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, buf)))
3966       goto ret2;
3967   }
3968 
3969   /* Delete old words from bdict */
3970   if ((tr_truncate && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
3971       UDM_OK != (rc= UdmDBSQLTableTruncateOrDelete(Indexer, db, wtable)) ||
3972       (tr_truncate && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
3973     goto ret;
3974 
3975   /* Convert words */
3976 
3977   ticks= UdmStartTimer();
3978   if (UDM_OK != (rc= UdmInvertedIndexCreate(Indexer, db, wtable, Query)))
3979     goto ret;
3980   if (UDM_OK != (rc= UdmInvertedIndexAddURLText(Indexer, db, wtable, &Query->URLData)))
3981     goto ret;
3982   if (param.ilinktext || param.use_popularity)
3983   {
3984     rc= param.ilinktext ?
3985         UdmInvertedIndexAddLinks(Indexer, db, wtable, &Query->URLData, param):
3986         UdmInvertedIndexAddLinkText(Indexer, db, NULL, wtable, &Query->URLData, param);
3987     if (rc != UDM_OK)
3988       goto ret;
3989   }
3990   UdmLog(Indexer, UDM_LOG_DEBUG,
3991          "UdmInvertedIndexCreate done: %.2f sec", UdmStopTimer(&ticks));
3992 
3993   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
3994   {
3995     ticks= UdmStartTimer();
3996     if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES")))
3997       goto ret2;
3998     udm_snprintf(buf, sizeof(buf), "LOCK TABLES %s WRITE", wtable);
3999     if (UDM_OK != (rc= UdmDBSQLQuery(Indexer, db, NULL, buf)))
4000       goto ret2;
4001     UdmLog(Indexer, UDM_LOG_DEBUG,
4002            "Unlocking tables: %.2f sec", UdmStopTimer(&ticks));
4003   }
4004 
4005   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4006   {
4007     if (UdmSQLDBVersion(db) >= 40000 && disable_keys)
4008     {
4009       ticks= UdmStartTimer();
4010       UdmLog(Indexer, UDM_LOG_INFO, "Enabling SQL indexes");
4011       sprintf(buf, "ALTER TABLE %s ENABLE KEYS", wtable);
4012       UdmDBSQLQuery(Indexer, db, NULL, buf);
4013       UdmLog(Indexer, UDM_LOG_INFO,
4014              "Enabling SQL indexes done, %.2f sec", UdmStopTimer(&ticks));
4015     }
4016   }
4017 
4018   /* Put timestamp: note, the indexes must be already enabled here! */
4019   if ((tr && (UDM_OK != (rc= UdmDBSQLBegin(Indexer, db)))) ||
4020       (UDM_OK != (rc= UdmBlobWriteTimestamp(Indexer, db, wtable, UDM_FALSE))) ||
4021       (tr && (UDM_OK != (rc= UdmDBSQLCommit(Indexer, db)))))
4022     goto ret;
4023 
4024   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4025     UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES");
4026 
4027   /* Convert URL */
4028   ticks= UdmStartTimer();
4029   UdmLog(Indexer, UDM_LOG_ERROR, "Writing url data");
4030   if ((tr && UDM_OK != (rc= UdmDBSQLBegin(Indexer, db))) ||
4031       (param.use_popularity &&
4032        UDM_OK != (rc= UdmWritePopularityBdictAndTable(Indexer, db, &Query->URLData, wtable, &Helper))) ||
4033       UDM_OK != (rc= UdmBlobWriteURLData(Indexer, db, Query, wtable, &Helper)) ||
4034       UDM_OK != (rc= UdmBlobWriteLimitsInternal(Indexer, db, wtable, &Helper)) ||
4035       (tr && UDM_OK != (rc= UdmDBSQLCommit(Indexer, db))))
4036     goto ret2;
4037 
4038   UdmLog(Indexer, UDM_LOG_DEBUG,
4039          "Writing URL data done: %.2f sec", UdmStopTimer(&ticks));
4040 
4041   /* Switch to new table */
4042   UdmLog(Indexer, UDM_LOG_ERROR, "Rotating table");
4043   rc= UdmBlobSetTable(Indexer, db);
4044   goto ret2;
4045 
4046 ret:
4047   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
4048     UdmDBSQLQuery(Indexer, db, NULL, "UNLOCK TABLES");
4049 ret2:
4050   UdmWriteHelperFree(&Helper);
4051   return rc;
4052 }
4053 
4054 
4055 /*
4056   Name is already escaped here, using UdmSQLEscStrSimple().
4057 */
4058 static udm_rc_t
UdmBlobLoadFastURLLimitByFullName(UDM_AGENT * A,UDM_DB * db,const char * ename,UDM_URLID_LIST * buf)4059 UdmBlobLoadFastURLLimitByFullName(UDM_AGENT *A,
4060                                   UDM_DB *db,
4061                                   const char *ename,
4062                                   UDM_URLID_LIST *buf)
4063 {
4064   udm_rc_t rc= UDM_OK;
4065   UDM_SQLRES SQLRes;
4066   char qbuf[256], tablename[64], exclude;
4067   size_t nrows, nurls, i, row;
4068 
4069   exclude= buf->exclude;
4070   bzero((void*)buf, sizeof(*buf));
4071   buf->exclude= exclude;
4072 
4073   UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
4074   udm_snprintf(qbuf, sizeof(qbuf),
4075                "SELECT coords FROM %s WHERE word LIKE '%s'", tablename, ename);
4076   if (UDM_OK != (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)))
4077    goto ret;
4078 
4079   if (! (nrows= UdmSQLNumRows(&SQLRes)))
4080   {
4081     buf->empty= 1;
4082     goto ret;
4083   }
4084   nurls= 0;
4085   for (row= 0; row < nrows; row++)
4086     nurls+= UdmSQLLen(&SQLRes, row, 0) / 4;
4087 
4088   if (!(buf->urls= (urlid_t*) UdmMalloc(sizeof(urlid_t) * nurls)))
4089     goto ret;
4090 
4091   for (row= 0; row < nrows; row++)
4092   {
4093     const char *src= UdmSQLValue(&SQLRes, row, 0);
4094     nurls= UdmSQLLen(&SQLRes, row, 0) / 4;
4095     if (src && nurls)
4096       for (i = 0; i < nurls; i++, src+= 4)
4097         buf->urls[buf->nurls++]= (urlid_t) udm_get_int4(src);
4098   }
4099   UdmURLIdListSort(buf);
4100 
4101 ret:
4102   UdmSQLFree(&SQLRes);
4103   return rc;
4104 }
4105 
4106 
4107 udm_rc_t
UdmBlobLoadFastURLLimit(UDM_AGENT * A,UDM_DB * db,const char * name,UDM_URLID_LIST * buf)4108 UdmBlobLoadFastURLLimit(UDM_AGENT *A, UDM_DB *db,
4109                         const char *name, UDM_URLID_LIST *buf)
4110 {
4111   char ename[130], ename2[130];
4112   size_t namelen= strlen(name);
4113   if (namelen > 64)
4114     return UDM_OK;
4115   UdmDBSQLEscStrSimple(A, db, ename, name, namelen);
4116   udm_snprintf(ename2, sizeof(ename2), "##limit#%s", ename);
4117   return UdmBlobLoadFastURLLimitByFullName(A, db, ename2, buf);
4118 }
4119 
4120 
4121 static udm_rc_t
UdmBlobLoadFastOrderOrFastScore(UDM_AGENT * A,UDM_DB * db,UDM_SQLRES * SQLRes,const char * prefix,const char * name)4122 UdmBlobLoadFastOrderOrFastScore(UDM_AGENT *A, UDM_DB *db, UDM_SQLRES *SQLRes,
4123                                 const char *prefix, const char *name)
4124 {
4125   char qbuf[256], ename[256], tablename[64];
4126   size_t namelen= strlen(name);
4127   bzero((void*) SQLRes, sizeof(*SQLRes));
4128   if (namelen > 64)
4129     return UDM_OK;
4130   UdmDBSQLEscStrSimple(A, db, ename, name, namelen); /* Escape order name */
4131   UdmBlobGetRTable(A, db, tablename, sizeof(tablename));
4132   udm_snprintf(qbuf, sizeof(qbuf),
4133                "SELECT coords FROM %s WHERE word LIKE '##%s#%s'",
4134                tablename, prefix, ename);
4135   return UdmDBSQLQuery(A, db, SQLRes, qbuf);
4136 }
4137 
4138 
4139 static udm_rc_t
UdmBlobUnpackFastOrder(UDM_URL_INT4_LIST * List,UDM_SQLRES * SQLRes,size_t record_size)4140 UdmBlobUnpackFastOrder(UDM_URL_INT4_LIST *List,
4141                        UDM_SQLRES *SQLRes,
4142                        size_t record_size)
4143 {
4144   size_t nrows, nurls, row, param;
4145   udm_rc_t rc= UDM_OK;
4146 
4147   bzero((void*)List, sizeof(*List));
4148 
4149   if (!(nrows= UdmSQLNumRows(SQLRes)))
4150     goto ret;
4151 
4152   nurls= 0;
4153   for (row= 0; row < nrows; row++)
4154     nurls+= UdmSQLLen(SQLRes, row, 0) / record_size;
4155 
4156   if (!(List->Item= (UDM_URL_INT4*) UdmMalloc(sizeof(UDM_URL_INT4) * nurls)))
4157   {
4158     rc= UDM_ERROR;
4159     goto ret;
4160   }
4161 
4162   for (param= 0x7FFFFFFF, row= 0; row < nrows; row++)
4163   {
4164     const char *src= UdmSQLValue(SQLRes, row, 0);
4165     nurls= UdmSQLLen(SQLRes, row, 0) / record_size;
4166     if (src && nurls)
4167     {
4168       size_t i;
4169       for (i= 0; i < nurls; i++, src+= record_size)
4170       {
4171         UDM_URL_INT4 *Item= &List->Item[List->nitems++];
4172         Item->url_id= (urlid_t) udm_get_int4(src);
4173         if (record_size == 5)
4174           Item->param= src[4];
4175         else
4176           Item->param= --param;
4177       }
4178     }
4179   }
4180   if (List->nitems > 1)
4181     UdmSort(List->Item, List->nitems, sizeof(UDM_URL_INT4), (udm_qsort_cmp) UdmCmpURLID);
4182 ret:
4183   return rc;
4184 }
4185 
4186 
4187 udm_rc_t
UdmBlobLoadFastOrder(UDM_AGENT * A,UDM_DB * db,UDM_URL_INT4_LIST * List,const char * name)4188 UdmBlobLoadFastOrder(UDM_AGENT *A, UDM_DB *db,
4189                      UDM_URL_INT4_LIST *List, const char *name)
4190 {
4191   udm_rc_t rc= UDM_OK;
4192   UDM_SQLRES SQLRes;
4193 
4194   if (UDM_OK != (rc= UdmBlobLoadFastOrderOrFastScore(A, db, &SQLRes, "order", name)) ||
4195       UDM_OK != (rc= UdmBlobUnpackFastOrder(List, &SQLRes, 4)))
4196     goto ret;
4197 
4198 ret:
4199   UdmSQLFree(&SQLRes);
4200   return rc;
4201 }
4202 
4203 
4204 udm_rc_t
UdmBlobLoadFastScore(UDM_AGENT * A,UDM_DB * db,UDM_URL_INT4_LIST * List,const char * name)4205 UdmBlobLoadFastScore(UDM_AGENT *A, UDM_DB *db,
4206                      UDM_URL_INT4_LIST *List, const char *name)
4207 {
4208   udm_rc_t rc= UDM_OK;
4209   UDM_SQLRES SQLRes;
4210 
4211   if (UDM_OK != (rc= UdmBlobLoadFastOrderOrFastScore(A, db, &SQLRes, "score", name)) ||
4212       UDM_OK != (rc= UdmBlobUnpackFastOrder(List, &SQLRes, 5)))
4213     goto ret;
4214 
4215 ret:
4216   UdmSQLFree(&SQLRes);
4217   return rc;
4218 }
4219 
4220 
4221 static udm_rc_t
UdmWordStatCreateBlob(UDM_AGENT * A,UDM_DB * db)4222 UdmWordStatCreateBlob(UDM_AGENT *A, UDM_DB *db)
4223 {
4224   char qbuf[128], tablename[64], expr[64];
4225   UdmBlobGetTableForRewrite(A, db, tablename, sizeof(tablename));
4226   switch(UdmSQLDBType(db))
4227   {
4228     case UDM_DB_ORACLE8:
4229       udm_snprintf(expr, sizeof(expr), "lengthb(coords)");
4230       break;
4231     case UDM_DB_SQLITE3:
4232       udm_snprintf(expr, sizeof(expr), "length(coords)");
4233       break;
4234     case UDM_DB_MONETDB:
4235       /* Div by 2, to convert hex digits to real length */
4236       udm_snprintf(expr, sizeof(expr), "length(cast(coords as text))/2");
4237       break;
4238     case UDM_DB_MSSQL:
4239       udm_snprintf(expr, sizeof(expr), "datalength(coords)");
4240       break;
4241     default:
4242       udm_snprintf(expr, sizeof(expr), "octet_length(coords)");
4243   }
4244   udm_snprintf(qbuf, sizeof(qbuf),
4245                "SELECT word, sum(%s) FROM %s WHERE word NOT LIKE '##%%' GROUP BY word",
4246                expr, tablename);
4247   return UdmWordStatQuery(A, db, qbuf);
4248 }
4249 
4250 
4251 /*
4252   Dump word information to stdout.
4253 */
4254 static udm_rc_t
UdmDumpWordInfoOneDocBlob(UDM_AGENT * A,UDM_DB * db,UDM_DOCUMENT * Doc)4255 UdmDumpWordInfoOneDocBlob(UDM_AGENT *A, UDM_DB *db, UDM_DOCUMENT *Doc)
4256 {
4257   return UDM_OK;
4258 }
4259 
4260 
4261 static udm_rc_t
UdmBlobInitSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)4262 UdmBlobInitSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
4263                   UDM_FINDWORD_ARGS *args)
4264 {
4265   udm_rc_t rc;
4266   if (!args->live_updates)
4267     return UDM_OK;
4268   if ((UDM_OK != (rc= UdmBlobLoadLiveUpdateLimitLoad(A, db, args))))
4269     return rc;
4270   return udm_dbmode_handler_rawblob.InitSearch(A, db, Query, args);
4271 }
4272 
4273 
4274 static udm_rc_t
UdmQueryActionBlob(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,udm_querycmd_t cmd)4275 UdmQueryActionBlob(UDM_AGENT *A, UDM_DB *db,
4276                    UDM_QUERY *Query, udm_querycmd_t cmd)
4277 {
4278   switch (cmd)
4279   {
4280     case UDM_QUERYCMD_CLEAR:              return UdmTruncateDictBlob(A, db);
4281     case UDM_QUERYCMD_WORDSTAT:           return UdmWordStatCreateBlob(A, db);
4282     case UDM_QUERYCMD_REWRITE_URLDATA:    return UdmRewriteURL(A, db, Query);
4283     case UDM_QUERYCMD_REWRITE_LIMITS:     return UdmRewriteLimits(A, db);
4284     case UDM_QUERYCMD_REWRITE_POPULARITY: return UdmRewritePopularity(A, db, Query);
4285     case UDM_QUERYCMD_INDEX:
4286       return UdmCreateInvertedWordIndexFromCachedCopySQL(A, db, Query);
4287     default: break;
4288   }
4289   return UDM_NOTARGET;
4290 }
4291 
4292 
/*
  Handler table of the "blob" mode: the mode name followed by
  the functions implementing it. The initializer is positional, so
  the order must follow the members of UDM_DBMODE_HANDLER
  (declared outside of this file part - verify when changing it).
*/
const UDM_DBMODE_HANDLER udm_dbmode_handler_blob=
{
  "blob",                    /* Mode name */
  UdmStoreWordsBlob,
  UdmQueryActionBlob,        /* Query command dispatcher */
  UdmDeleteWordsFromURLBlob,
  UdmFindWordBlob,
  UdmDumpWordInfoOneDocBlob, /* Prints nothing, returns UDM_OK */
  UdmBlobInitSearch,
};
4303 
4304 #endif /* HAVE_SQL */
4305