1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #ifdef HAVE_SQL
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "udm_common.h"
27 #include "udm_utils.h"
28 #include "udm_db.h"
29 #include "udm_log.h"
30 #include "udm_hash.h"
31 #include "udm_coords.h"
32 #include "udm_word.h"
33 #include "udm_db_int.h"
34 #include "udm_doc.h"
35 #include "udm_server.h"
36 #include "udm_vars.h"
37 #include "udm_store.h"
38 #include "udm_indexcache.h"
39
40
41 static udm_rc_t
UdmFindWordInIndexCachePart(UDM_AGENT * A,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,UDM_CONV * cnv,const UDM_INVERTED_INDEX_CACHE_PART * part,int flags)42 UdmFindWordInIndexCachePart(UDM_AGENT *A, UDM_QUERY *Query,
43 UDM_FINDWORD_ARGS *args,
44 UDM_CONV *cnv,
45 const UDM_INVERTED_INDEX_CACHE_PART *part,
46 int flags)
47 {
48 size_t i;
49 UDM_UNIDATA *unidata= A->Conf->unidata;
50 const UDM_WIDEWORD *WW= &Query->Res.WWList.Word[args->Word.Param.order];
51 UDM_URLCRDLIST CoordList;
52 UDM_URL_CRD CoordTemplate;
53 UDM_URLID_LIST *urls= (flags & UDM_RAWBLOB_DELTA) ?
54 &args->live_update_active_urls : &args->urls;
55
56 bzero((void*)&CoordList, sizeof(CoordList));
57 bzero((void*)&CoordTemplate, sizeof(CoordTemplate));
58 CoordTemplate.num= args->Word.Param.order;
59
60 for (i= 0; i < part->nitems; i++)
61 {
62 UDM_INVERTED_INDEX_CACHE_ITEM *Item= &part->Item[i];
63 uint4 word_length= strlen(Item->ptr);
64 /* TODO34: fuzzy search: accents, substrings, etc */
65 if (!UdmStrCaseCmp2(unidata, cnv,
66 Item->ptr, word_length,
67 WW->Word.str, WW->Word.length,
68 UDM_RECODE_HTML))
69 {
70 CoordList.acoords+= Item->length - word_length - 1;
71 }
72 }
73 if (!(CoordList.Coords= (UDM_URL_CRD *) UdmMalloc((CoordList.acoords) *
74 sizeof(UDM_URL_CRD))))
75 {
76 UdmLog(A, UDM_LOG_ERROR,
77 "UdmFindWordInIndexCachePart: UdmMalloc failed");
78 return UDM_ERROR;
79 }
80 for (i= 0; i < part->nitems; i++)
81 {
82 UDM_INVERTED_INDEX_CACHE_ITEM *Item= &part->Item[i];
83 uint word_length= strlen(Item->ptr);
84 unsigned const char *src= (unsigned char *) Item->ptr + word_length + 1;
85 unsigned const char *srcend= (unsigned char*) Item->ptr + Item->length;
86 size_t ncoords, nbytes= udm_coord_get(&ncoords, src, srcend);
87 if (!nbytes || nbytes > srcend - src)
88 {
89 UdmLog(A, UDM_LOG_ERROR,
90 "UdmFindWordInIndexCachePart: unexpected ncoords: %d/%d",
91 (int) ncoords, (int) (srcend - src));
92 return UDM_ERROR;
93 }
94 src+= nbytes;
95 /* TODO34: fuzzy search: accents, substrings, etc */
96 if (!UdmStrCaseCmp2(unidata, cnv,
97 Item->ptr, word_length,
98 WW->Word.str, WW->Word.length,
99 UDM_RECODE_HTML))
100 {
101 CoordTemplate.urlid_coord.url_id= Item->url_id;
102 CoordTemplate.urlid_coord.coord.secno= Item->secno;
103 /*
104 fprintf(stderr, "[%d:%d]%s length=%d ncoords=%d\n",
105 Item->url_id, Item->secno, Item->data.str,
106 (int) Item->data.length, Item->data.str[word_length+1]);
107 */
108 UdmCoordListMultiUnpack(&CoordList, &CoordTemplate,
109 (const unsigned char*) src, srcend - src,
110 args->query_param.SaveSectionSize);
111 }
112 }
113 if (urls->nurls)
114 UdmApplyFastLimit(&CoordList, urls);
115 UdmLog(A, UDM_LOG_DEBUG, "Raw cache: %d coords found", (int) CoordList.ncoords);
116 if (CoordList.ncoords)
117 {
118 /*
119 We have to sort here, because DBMode=multi
120 returns data unsorted.
121 */
122 UdmURLCRDListSortByURLThenSecnoThenPos(&CoordList);
123 UdmURLCRDListListAddWithSort2(&args->SearchSectionListList,
124 &Query->Res.WWList,
125 &args->Word, &CoordList);
126 }
127 else
128 {
129 UdmFree(CoordList.Coords);
130 }
131 args->Word.Param.count+= CoordList.ncoords;
132
133 return UDM_OK;
134 }
135
136
137 static udm_rc_t
UdmFindWordRawBlobInternal(UDM_AGENT * A,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,int flags)138 UdmFindWordRawBlobInternal(UDM_AGENT *A, UDM_QUERY *Query,
139 UDM_FINDWORD_ARGS *args, int flags)
140 {
141 UDM_CONV cnv;
142 UDM_UNIDATA *unidata= A->Conf->unidata;
143 unsigned int i, tmin, tmax;
144 const UDM_WIDEWORD *WW= &Query->Res.WWList.Word[args->Word.Param.order];
145
146 if (!args->IndexCache.nitems)
147 return UDM_OK;
148
149 UdmConvInit(&cnv, A->Conf->lcs, A->Conf->lcs);
150
151 if (args->Word.Param.match_mode != UDM_MATCH_FULL)
152 {
153 /* Scan all cache parts for substring search */
154 tmin= 0;
155 tmax= args->IndexCache.nitems;
156 }
157 else
158 {
159 /* TODO34: StripAccents */
160 udmhash32_t crc= cnv.to->cset->crc32lcase(unidata, cnv.to,
161 WW->Word.str,
162 WW->Word.length,
163 UDM_RECODE_HTML);
164 tmin= crc % args->IndexCache.nitems;
165 tmax= tmin + 1;
166 }
167
168 for (i= tmin; i < tmax; i++)
169 {
170 UDM_INVERTED_INDEX_CACHE_PART *part= &args->IndexCache.Item[i];
171 UdmFindWordInIndexCachePart(A, Query, args, &cnv, part, flags);
172 }
173 return UDM_OK;
174 }
175
176
177 static udm_rc_t
UdmRawBlobInitSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)178 UdmRawBlobInitSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
179 UDM_FINDWORD_ARGS *args)
180 {
181 udm_rc_t rc;
182 int ndocs= 0, nskip= 0;
183 size_t i;
184 char qbuf[4096];
185 UDM_SQLRES SQLRes;
186 UDM_STR row[2];
187 UDM_INVERTED_INDEX_CACHE *cache= &args->IndexCache;
188 size_t max_doc_size= UdmVarListFindInt(&A->Conf->Vars, "MaxDocSize", UDM_MAXDOCSIZE);
189
190 UdmInvertedIndexCacheInit(cache, A->Conf);
191 if (UDM_OK != UdmInvertedIndexCacheAllocParts(A, cache,
192 INVERTED_INDEX_CACHE_PARTS))
193 return UDM_ERROR;
194
195 if (args->where[0])
196 {
197 udm_snprintf(qbuf, sizeof(qbuf),
198 "SELECT d.url_id,d.content FROM cachedcopy d, url%s "
199 "WHERE url.rec_id=d.url_id "
200 "AND ts>=%d "
201 "AND %s",
202 UdmSQLDBQueryFrom(Query),
203 (int) args->live_updates_ts, args->where);
204 }
205 else
206 {
207 udm_snprintf(qbuf, sizeof(qbuf),
208 "SELECT url_id,content FROM cachedcopy "
209 "WHERE ts>=%d",
210 (int) args->live_updates_ts);
211 }
212 if (UDM_OK != (rc= UdmDBSQLExecDirect(A, db, &SQLRes, qbuf)))
213 goto end;
214 for (ndocs= nskip= 0;
215 UDM_OK == UdmDBSQLFetchRow(A, db, &SQLRes, row);
216 ndocs++)
217 {
218 UDM_DOCUMENT Doc;
219 UDM_CONSTWORDLIST CWL;
220 UDM_CONSTWORD_HASH_DATA data;
221 UDM_CHARSET *doccs;
222 udm_timer_t timer_unpack= 0;
223 udm_timer_t timer_parse= 0;
224 udm_timer_t timer_words= 0;
225
226 if (UdmMemrootUsedMemory(&cache->coord_root) >= 10*1024*1024) /* TODO34: template variable */
227 {
228 nskip++;
229 continue;
230 }
231 UdmDocInit(&Doc);
232 UdmSpiderParamInit(&Doc.Spider);
233 Doc.Spider.robots.follow= UDM_FALSE;
234 UdmVarListReplaceLst(&Doc.Sections, &A->Conf->Sections, NULL, "*");
235
236 if (UDM_OK != (rc= UdmDocSetFromCachedHTTPResponse(&Doc,
237 row[1].str,
238 row[1].length,
239 max_doc_size,
240 &timer_unpack)))
241 goto end2;
242
243 doccs= UdmDocDetectCachedCharset(A, &Doc, data.url_id);
244 bzero(&data, sizeof(data));
245 data.url_id= atoi(row[0].str);
246 UdmConvInit(&data.cnv, doccs, A->Conf->lcs);
247 data.cache= cache;
248 cache->Words.hash.user_data= &data;
249 UdmConstWordListInit(&CWL);
250 if (UDM_OK != (rc= UdmDocToConstWordList(A, &Doc, &CWL,
251 doccs,
252 cache->param.cnvflags,
253 cache->param.aggregate_section_flags,
254 &timer_parse, &timer_words)))
255 goto end2;
256
257 if (CWL.nitems)
258 {
259 UDM_INVERTED_INDEX_COORD_LIST CL;
260 if (UDM_OK != (rc= UdmConstWordListToInvertedIndexCoordList(cache, &CL, &CWL)))
261 goto end2;
262 UdmInvertedIndexCoordListSort(&CL);
263 rc= UdmInvertedIndexCoordList2InvertedIndexCache(A, &CL, &CWL, &data, cache);
264 UdmInvertedIndexCoordListFree(&CL);
265 }
266 end2:
267 UdmConstWordListFree(&CWL);
268 UdmDocFree(&Doc);
269 if (rc != UDM_OK)
270 break;
271 }
272 UdmSQLFree(&SQLRes);
273 end:
274 UdmLog(A, UDM_LOG_DEBUG,
275 "RawBlob: %d cached documents loaded (%lld bytes)",
276 ndocs, (long long int) UdmMemrootUsedMemory(&cache->coord_root));
277 if (nskip)
278 UdmLog(A, UDM_LOG_DEBUG,
279 "RawBlob: %d documents were skipped", nskip);
280 for (i= 0; i < cache->nitems; i++)
281 UdmInvertedIndexCachePartSort(&cache->Item[i]);
282 return rc;
283 }
284
285
286 udm_rc_t
UdmFindWordRawBlobDelta(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)287 UdmFindWordRawBlobDelta(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
288 UDM_FINDWORD_ARGS *args)
289 {
290 return UdmFindWordRawBlobInternal(A, Query, args, UDM_RAWBLOB_DELTA);
291 }
292
293
294 static udm_rc_t
UdmFindWordRawBlobSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)295 UdmFindWordRawBlobSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
296 UDM_FINDWORD_ARGS *args)
297 {
298 return UdmFindWordRawBlobInternal(A, Query, args, UDM_RAWBLOB_SEARCH);
299 }
300
301
302
303 const UDM_DBMODE_HANDLER udm_dbmode_handler_rawblob=
304 {
305 "rawblob",
306 NULL, /* StoreWords */
307 NULL, /* QueryAction */
308 NULL, /* DeleteWordFromURL */
309 UdmFindWordRawBlobSearch, /* FindWord */
310 NULL, /* DumpWordInfo */
311 UdmRawBlobInitSearch,
312 };
313
314 #endif /* HAVE_SQL */
315