1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #ifdef HAVE_SQL
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #include "udm_common.h"
27 #include "udm_utils.h"
28 #include "udm_db.h"
29 #include "udm_log.h"
30 #include "udm_hash.h"
31 #include "udm_coords.h"
32 #include "udm_word.h"
33 #include "udm_db_int.h"
34 #include "udm_doc.h"
35 #include "udm_server.h"
36 #include "udm_vars.h"
37 #include "udm_store.h"
38 #include "udm_indexcache.h"
39 
40 
41 static udm_rc_t
UdmFindWordInIndexCachePart(UDM_AGENT * A,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,UDM_CONV * cnv,const UDM_INVERTED_INDEX_CACHE_PART * part,int flags)42 UdmFindWordInIndexCachePart(UDM_AGENT *A, UDM_QUERY *Query,
43                             UDM_FINDWORD_ARGS *args,
44                             UDM_CONV *cnv,
45                             const UDM_INVERTED_INDEX_CACHE_PART *part,
46                             int flags)
47 {
48   size_t i;
49   UDM_UNIDATA *unidata= A->Conf->unidata;
50   const UDM_WIDEWORD *WW= &Query->Res.WWList.Word[args->Word.Param.order];
51   UDM_URLCRDLIST CoordList;
52   UDM_URL_CRD CoordTemplate;
53   UDM_URLID_LIST *urls= (flags & UDM_RAWBLOB_DELTA) ?
54                          &args->live_update_active_urls : &args->urls;
55 
56   bzero((void*)&CoordList, sizeof(CoordList));
57   bzero((void*)&CoordTemplate, sizeof(CoordTemplate));
58   CoordTemplate.num= args->Word.Param.order;
59 
60   for (i= 0; i < part->nitems; i++)
61   {
62     UDM_INVERTED_INDEX_CACHE_ITEM *Item= &part->Item[i];
63     uint4 word_length= strlen(Item->ptr);
64     /* TODO34: fuzzy search: accents, substrings, etc */
65     if (!UdmStrCaseCmp2(unidata, cnv,
66                        Item->ptr, word_length,
67                        WW->Word.str, WW->Word.length,
68                        UDM_RECODE_HTML))
69     {
70       CoordList.acoords+= Item->length - word_length - 1;
71     }
72   }
73   if (!(CoordList.Coords= (UDM_URL_CRD *) UdmMalloc((CoordList.acoords) *
74                                                     sizeof(UDM_URL_CRD))))
75   {
76     UdmLog(A, UDM_LOG_ERROR,
77            "UdmFindWordInIndexCachePart: UdmMalloc failed");
78     return UDM_ERROR;
79   }
80   for (i= 0; i < part->nitems; i++)
81   {
82     UDM_INVERTED_INDEX_CACHE_ITEM *Item= &part->Item[i];
83     uint word_length= strlen(Item->ptr);
84     unsigned const char *src= (unsigned char *) Item->ptr + word_length + 1;
85     unsigned const char *srcend= (unsigned char*) Item->ptr + Item->length;
86     size_t ncoords, nbytes= udm_coord_get(&ncoords, src, srcend);
87     if (!nbytes || nbytes > srcend - src)
88     {
89       UdmLog(A, UDM_LOG_ERROR,
90              "UdmFindWordInIndexCachePart: unexpected ncoords: %d/%d",
91              (int) ncoords, (int) (srcend - src));
92       return UDM_ERROR;
93     }
94     src+= nbytes;
95     /* TODO34: fuzzy search: accents, substrings, etc */
96     if (!UdmStrCaseCmp2(unidata, cnv,
97                         Item->ptr, word_length,
98                         WW->Word.str, WW->Word.length,
99                         UDM_RECODE_HTML))
100     {
101       CoordTemplate.urlid_coord.url_id= Item->url_id;
102       CoordTemplate.urlid_coord.coord.secno= Item->secno;
103       /*
104       fprintf(stderr, "[%d:%d]%s length=%d ncoords=%d\n",
105               Item->url_id, Item->secno, Item->data.str,
106               (int) Item->data.length, Item->data.str[word_length+1]);
107       */
108       UdmCoordListMultiUnpack(&CoordList, &CoordTemplate,
109                               (const unsigned char*) src, srcend - src,
110                               args->query_param.SaveSectionSize);
111     }
112   }
113   if (urls->nurls)
114     UdmApplyFastLimit(&CoordList, urls);
115   UdmLog(A, UDM_LOG_DEBUG, "Raw cache: %d coords found", (int) CoordList.ncoords);
116   if (CoordList.ncoords)
117   {
118     /*
119       We have to sort here, because DBMode=multi
120       returns data unsorted.
121     */
122     UdmURLCRDListSortByURLThenSecnoThenPos(&CoordList);
123     UdmURLCRDListListAddWithSort2(&args->SearchSectionListList,
124                                   &Query->Res.WWList,
125                                   &args->Word, &CoordList);
126   }
127   else
128   {
129     UdmFree(CoordList.Coords);
130   }
131   args->Word.Param.count+= CoordList.ncoords;
132 
133   return UDM_OK;
134 }
135 
136 
137 static udm_rc_t
UdmFindWordRawBlobInternal(UDM_AGENT * A,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,int flags)138 UdmFindWordRawBlobInternal(UDM_AGENT *A, UDM_QUERY *Query,
139                            UDM_FINDWORD_ARGS *args, int flags)
140 {
141   UDM_CONV cnv;
142   UDM_UNIDATA *unidata= A->Conf->unidata;
143   unsigned int i, tmin, tmax;
144   const UDM_WIDEWORD *WW= &Query->Res.WWList.Word[args->Word.Param.order];
145 
146   if (!args->IndexCache.nitems)
147     return UDM_OK;
148 
149   UdmConvInit(&cnv, A->Conf->lcs, A->Conf->lcs);
150 
151   if (args->Word.Param.match_mode != UDM_MATCH_FULL)
152   {
153     /* Scan all cache parts for substring search */
154     tmin= 0;
155     tmax= args->IndexCache.nitems;
156   }
157   else
158   {
159     /* TODO34: StripAccents */
160     udmhash32_t crc= cnv.to->cset->crc32lcase(unidata, cnv.to,
161                                               WW->Word.str,
162                                               WW->Word.length,
163                                               UDM_RECODE_HTML);
164     tmin= crc % args->IndexCache.nitems;
165     tmax= tmin + 1;
166   }
167 
168   for (i= tmin;  i < tmax; i++)
169   {
170     UDM_INVERTED_INDEX_CACHE_PART *part= &args->IndexCache.Item[i];
171     UdmFindWordInIndexCachePart(A, Query, args, &cnv, part, flags);
172   }
173   return UDM_OK;
174 }
175 
176 
177 static udm_rc_t
UdmRawBlobInitSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)178 UdmRawBlobInitSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
179                      UDM_FINDWORD_ARGS *args)
180 {
181   udm_rc_t rc;
182   int ndocs= 0, nskip= 0;
183   size_t i;
184   char qbuf[4096];
185   UDM_SQLRES SQLRes;
186   UDM_STR row[2];
187   UDM_INVERTED_INDEX_CACHE *cache= &args->IndexCache;
188   size_t max_doc_size= UdmVarListFindInt(&A->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
189 
190   UdmInvertedIndexCacheInit(cache, A->Conf);
191   if (UDM_OK != UdmInvertedIndexCacheAllocParts(A, cache,
192                                                 INVERTED_INDEX_CACHE_PARTS))
193     return UDM_ERROR;
194 
195   if (args->where[0])
196   {
197     udm_snprintf(qbuf, sizeof(qbuf),
198                  "SELECT d.url_id,d.content FROM cachedcopy d, url%s "
199                  "WHERE url.rec_id=d.url_id "
200                  "AND ts>=%d "
201                  "AND %s",
202                  UdmSQLDBQueryFrom(Query),
203                  (int) args->live_updates_ts, args->where);
204   }
205   else
206   {
207     udm_snprintf(qbuf, sizeof(qbuf),
208                  "SELECT url_id,content FROM cachedcopy "
209                  "WHERE ts>=%d",
210                  (int) args->live_updates_ts);
211   }
212   if (UDM_OK != (rc= UdmDBSQLExecDirect(A, db, &SQLRes, qbuf)))
213     goto end;
214   for (ndocs= nskip= 0;
215        UDM_OK == UdmDBSQLFetchRow(A, db, &SQLRes, row);
216        ndocs++)
217   {
218     UDM_DOCUMENT Doc;
219     UDM_CONSTWORDLIST CWL;
220     UDM_CONSTWORD_HASH_DATA data;
221     UDM_CHARSET *doccs;
222     udm_timer_t timer_unpack= 0;
223     udm_timer_t timer_parse= 0;
224     udm_timer_t timer_words= 0;
225 
226     if (UdmMemrootUsedMemory(&cache->coord_root) >= 10*1024*1024) /* TODO34: template variable */
227     {
228       nskip++;
229       continue;
230     }
231     UdmDocInit(&Doc);
232     UdmSpiderParamInit(&Doc.Spider);
233     Doc.Spider.robots.follow= UDM_FALSE;
234     UdmVarListReplaceLst(&Doc.Sections, &A->Conf->Sections, NULL, "*");
235 
236     if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc,
237                                                        row[1].str,
238                                                        row[1].length,
239                                                        max_doc_size,
240                                                        &timer_unpack)))
241       goto end2;
242 
243     doccs= UdmDocDetectCachedCharset(A, &Doc, data.url_id);
244     bzero(&data, sizeof(data));
245     data.url_id= atoi(row[0].str);
246     UdmConvInit(&data.cnv, doccs, A->Conf->lcs);
247     data.cache= cache;
248     cache->Words.hash.user_data= &data;
249     UdmConstWordListInit(&CWL);
250     if (UDM_OK != (rc= UdmDocToConstWordList(A, &Doc, &CWL,
251                                              doccs,
252                                              cache->param.cnvflags,
253                                              cache->param.aggregate_section_flags,
254                                              &timer_parse, &timer_words)))
255       goto end2;
256 
257     if (CWL.nitems)
258     {
259       UDM_INVERTED_INDEX_COORD_LIST CL;
260       if (UDM_OK != (rc= UdmConstWordListToInvertedIndexCoordList(cache, &CL, &CWL)))
261         goto end2;
262       UdmInvertedIndexCoordListSort(&CL);
263       rc= UdmInvertedIndexCoordList2InvertedIndexCache(A, &CL, &CWL, &data, cache);
264       UdmInvertedIndexCoordListFree(&CL);
265     }
266 end2:
267     UdmConstWordListFree(&CWL);
268     UdmDocFree(&Doc);
269     if (rc != UDM_OK)
270       break;
271   }
272   UdmSQLFree(&SQLRes);
273 end:
274   UdmLog(A, UDM_LOG_DEBUG,
275          "RawBlob: %d cached documents loaded (%lld bytes)",
276           ndocs, (long long int) UdmMemrootUsedMemory(&cache->coord_root));
277   if (nskip)
278     UdmLog(A, UDM_LOG_DEBUG,
279            "RawBlob: %d documents were skipped", nskip);
280   for (i= 0; i < cache->nitems; i++)
281     UdmInvertedIndexCachePartSort(&cache->Item[i]);
282   return rc;
283 }
284 
285 
286 udm_rc_t
UdmFindWordRawBlobDelta(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)287 UdmFindWordRawBlobDelta(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
288                         UDM_FINDWORD_ARGS *args)
289 {
290   return UdmFindWordRawBlobInternal(A, Query, args, UDM_RAWBLOB_DELTA);
291 }
292 
293 
294 static udm_rc_t
UdmFindWordRawBlobSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)295 UdmFindWordRawBlobSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
296                          UDM_FINDWORD_ARGS *args)
297 {
298   return UdmFindWordRawBlobInternal(A, Query, args, UDM_RAWBLOB_SEARCH);
299 }
300 
301 
302 
303 const UDM_DBMODE_HANDLER udm_dbmode_handler_rawblob=
304 {
305   "rawblob",
306   NULL,                      /* StoreWords        */
307   NULL,                      /* QueryAction       */
308   NULL,                      /* DeleteWordFromURL */
309   UdmFindWordRawBlobSearch,  /* FindWord          */
310   NULL,                      /* DumpWordInfo      */
311   UdmRawBlobInitSearch,
312 };
313 
314 #endif /* HAVE_SQL */
315