1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "udm_common.h"
26 #include "udm_word.h"
27 #include "udm_doc.h"
28 #include "udm_utils.h"
29 #include "udm_result.h"
30 #include "udm_parsehtml.h"
31 #include "udm_parsexml.h"
32 #include "udm_vars.h"
33 #include "udm_searchtool.h"
34 #include "udm_boolean.h"
35
36 #include "udm_db.h" /* for UdmDocAction */
37
38
/*
  Put a result object into the all-zero state.
  Every byte of *Res is cleared; nothing that *Res may have
  referenced before the call is released here.
*/
void UdmResultInit(UDM_RESULT *Res)
{
  memset(Res, 0, sizeof(*Res));
}
43
44
45 UDM_API(void)
UdmResultFree(UDM_RESULT * Res)46 UdmResultFree(UDM_RESULT * Res)
47 {
48 size_t i;
49 if(!Res)return;
50 UdmVarListFree(&Res->Vars);
51 UdmWideWordListFree(&Res->WWList);
52 if(Res->Doc)
53 {
54 for(i=0;i<Res->num_rows;i++)
55 {
56 UdmDocFree(&Res->Doc[i]);
57 }
58 UdmFree(Res->Doc);
59 }
60 bzero((void*)Res, sizeof(*Res));
61 }
62
63
UdmResultNumRows(UDM_RESULT * Res)64 UDM_API(size_t) UdmResultNumRows(UDM_RESULT *Res)
65 {
66 return Res->num_rows;
67 }
68
69
70 UDM_API(size_t)
UdmResultFirst(UDM_RESULT * Res)71 UdmResultFirst(UDM_RESULT *Res)
72 {
73 return (size_t) UdmVarListFindInt(&Res->Vars, "first", 1) - 1;
74 }
75
76
77 UDM_API(size_t)
UdmResultLast(UDM_RESULT * Res)78 UdmResultLast(UDM_RESULT *Res)
79 {
80 return (size_t) UdmVarListFindInt(&Res->Vars, "last", 1) - 1;
81 }
82
83
84 UDM_API(size_t)
UdmResultTotalFound(UDM_RESULT * Res)85 UdmResultTotalFound(UDM_RESULT *Res)
86 {
87 return (size_t) UdmVarListFindInt(&Res->Vars, "total", 0);
88 }
89
90
91 UDM_API(udm_rc_t)
UdmResultToVarList(UDM_VARLIST * Vars,const UDM_RESULT * Res)92 UdmResultToVarList(UDM_VARLIST *Vars, const UDM_RESULT *Res)
93 {
94 char search_time[100];
95 /* Convert milliseconds to seconds */
96 udm_snprintf(search_time, sizeof(search_time), "%.3f",
97 ((double) UdmVarListFindInt(&Res->Vars, "SearchTime", 0))/1000);
98 if (UDM_OK != UdmVarListReplaceStr(Vars, "SearchTime", search_time))
99 return UDM_ERROR;
100 return UdmVarListReplaceLst(Vars, &Res->Vars, NULL, "*");
101 }
102
103 /***************************************************************/
104 udm_rc_t
UdmResultAppendAndInit(UDM_RESULT * Res)105 UdmResultAppendAndInit(UDM_RESULT *Res)
106 {
107 size_t nbytes= (Res->num_rows + 1) * sizeof(UDM_DOCUMENT);
108 UDM_DOCUMENT *tmp;
109 if (!(tmp= UdmRealloc(Res->Doc, nbytes)))
110 return UDM_ERROR;
111 Res->Doc= tmp;
112 UdmDocInit(&Res->Doc[Res->num_rows]);
113 Res->num_rows++;
114 return UDM_OK;
115 }
116
117
118 udm_rc_t
UdmResultAppendDoc(UDM_RESULT * Res,UDM_DOCUMENT * Doc)119 UdmResultAppendDoc(UDM_RESULT *Res, UDM_DOCUMENT *Doc)
120 {
121 size_t nbytes= (Res->num_rows + 1) * sizeof(UDM_DOCUMENT);
122 UDM_DOCUMENT *tmp;
123 if (!(tmp= UdmRealloc(Res->Doc, nbytes)))
124 return UDM_ERROR;
125 Res->Doc= tmp;
126 Res->Doc[Res->num_rows]= Doc[0];
127 Res->num_rows++;
128 return UDM_OK;
129 }
130
131
132 /*****************************************************************/
133 #define UDM_DF_SIZE 64
134
135 typedef struct
136 {
137 int state;
138 UDM_AGENT *Agent;
139 UDM_WIDEWORD WW;
140 UDM_DOCUMENT Doc;
141 UDM_QUERY *Query;
142 UDM_CHARSET *cs;
143 uint4 score;
144 uint4 per_site;
145 urlid_t site_id;
146 char date_format[UDM_DF_SIZE];
147 } RES_PARSER_DATA;
148
149
150 static udm_rc_t
PD_ReplaceOrAppendStrn_hl0(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)151 PD_ReplaceOrAppendStrn_hl0(RES_PARSER_DATA *D,
152 const char *name,
153 const char *val, size_t len)
154 {
155 return UdmVarListReplaceOrAppendStrn(&D->Doc.Sections, name, val, len, 0);
156 }
157
158
159 static udm_rc_t
PD_ReplaceOrAppendStrn_hl1(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)160 PD_ReplaceOrAppendStrn_hl1(RES_PARSER_DATA *D,
161 const char *name,
162 const char *val, size_t len)
163 {
164 return UdmVarListReplaceOrAppendStrn(&D->Doc.Sections, name, val, len, 1);
165 }
166
167
168 static udm_rc_t
PD_ReplaceStrn(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)169 PD_ReplaceStrn(RES_PARSER_DATA *D,
170 const char *name,
171 const char *val, size_t len)
172 {
173 return UdmVarListReplaceStrn(&D->Doc.Sections, name, val, len) ? UDM_OK : UDM_ERROR;
174 }
175
176
177 static udm_rc_t
PD_ReplaceLastModified(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)178 PD_ReplaceLastModified(RES_PARSER_DATA *D,
179 const char *name,
180 const char *val, size_t len)
181 {
182 time_t last_mod_time;
183 char tmp[UDM_MAXTIMESTRLEN];
184 len= len >= sizeof(tmp) ? sizeof(tmp) - 1 : len;
185 memcpy(tmp, val, len);
186 tmp[len]= '\0';
187 last_mod_time= UdmHttpDate2Time_t(tmp);
188 UdmVarListReplaceInt(&D->Doc.Sections, "Last-Modified-Timestamp", (int) last_mod_time);
189 if ((len= strftime(tmp, sizeof(tmp)-1, D->date_format, localtime(&last_mod_time))))
190 tmp[len]= '\0';
191 else
192 UdmTime_t2HttpStr(last_mod_time, tmp, sizeof(tmp));
193 UdmVarListReplaceStr(&D->Doc.Sections, "Last-Modified", tmp);
194 return UDM_OK;
195 }
196
197
198 typedef udm_rc_t
199 (*section_handler)(RES_PARSER_DATA *D,
200 const char *name,
201 const char *val, size_t len);
202
203
204
205
206 #define RES_MISC 0
207
208 struct udm_res_section_st
209 {
210 int state;
211 size_t length;
212 const char *str;
213 const char *section_name;
214 section_handler handler;
215 };
216
/*
  Section ids used as parser states.
  1000..1010 describe a word statistics entry and its attributes,
  2000..2014 describe a result item and its sub-elements.
*/
enum
{
  UDM_RESSEC_TOTAL_RESULTS= 100,

  UDM_RESSEC_WORD=          1000,
  UDM_RESSEC_WORD_ID=       1001,
  UDM_RESSEC_WORD_ORDER=    1002,
  UDM_RESSEC_WORD_COUNT=    1003,
  UDM_RESSEC_WORD_ORIGIN=   1004,
  UDM_RESSEC_WORD_WEIGHT=   1005,
  UDM_RESSEC_WORD_MATCH=    1006,
  UDM_RESSEC_WORD_SECNO=    1007,
  UDM_RESSEC_WORD_PHRLEN=   1008,
  UDM_RESSEC_WORD_PHRPOS=   1009,
  UDM_RESSEC_WORD_WORD=     1010,

  UDM_RESSEC_ITEM=          2000,
  UDM_RESSEC_ITEM_DESCR=    2002,
  UDM_RESSEC_ITEM_SCORE=    2005,

  UDM_RESSEC_ITEM_PERSITE=  2011,
  UDM_RESSEC_ITEM_DESCR_B=  2014
};
237
238 #define SECDEF(x) ((size_t) (sizeof(x) - 1)), (x)
239 #define WSL "/rss/channel/mnoGoSearch:WordStatList"
240 #define WSI "/rss/channel/mnoGoSearch:WordStatList/mnoGoSearch:WordStatItem"
241 #define RCI(x) SECDEF("/rss/channel/item/" x)
242
243 static struct udm_res_section_st res_sec[]=
244 {
245 {UDM_RESSEC_WORD, SECDEF("/result/wordinfo/word"), NULL, NULL},
246 {UDM_RESSEC_WORD_ID, SECDEF("/result/wordinfo/word@id"), NULL, NULL},
247 {UDM_RESSEC_WORD_ORDER, SECDEF("/result/wordinfo/word@order"), NULL, NULL},
248 {UDM_RESSEC_WORD_COUNT, SECDEF("/result/wordinfo/word@count"), NULL, NULL},
249 {UDM_RESSEC_WORD_ORIGIN, SECDEF("/result/wordinfo/word@origin"), NULL, NULL},
250 {UDM_RESSEC_WORD_WEIGHT, SECDEF("/result/wordinfo/word@weight"), NULL, NULL},
251 {UDM_RESSEC_WORD_MATCH, SECDEF("/result/wordinfo/word@match"), NULL, NULL},
252 {UDM_RESSEC_WORD_SECNO, SECDEF("/result/wordinfo/word@secno"), NULL, NULL},
253 {UDM_RESSEC_WORD_PHRLEN, SECDEF("/result/wordinfo/word@phrlen"), NULL, NULL},
254 {UDM_RESSEC_WORD_PHRPOS, SECDEF("/result/wordinfo/word@phrpos"), NULL, NULL},
255 {UDM_RESSEC_TOTAL_RESULTS, SECDEF("/result/totalResults"), NULL, NULL},
256
257 {UDM_RESSEC_WORD, SECDEF(WSI), NULL, NULL},
258 {UDM_RESSEC_WORD_ID, SECDEF(WSI "@id"), NULL, NULL},
259 {UDM_RESSEC_WORD_WORD, SECDEF(WSI "@word"), NULL, NULL},
260 {UDM_RESSEC_WORD_ORDER, SECDEF(WSI "@order"), NULL, NULL},
261 {UDM_RESSEC_WORD_COUNT, SECDEF(WSI "@count"), NULL, NULL},
262 {UDM_RESSEC_WORD_ORIGIN, SECDEF(WSI "@origin"), NULL, NULL},
263 {UDM_RESSEC_WORD_WEIGHT, SECDEF(WSI "@weight"), NULL, NULL},
264 {UDM_RESSEC_WORD_MATCH, SECDEF(WSI "@match"), NULL, NULL},
265 {UDM_RESSEC_WORD_SECNO, SECDEF(WSI "@secno"), NULL, NULL},
266 {UDM_RESSEC_WORD_PHRLEN, SECDEF(WSI "@phrlen"), NULL, NULL},
267 {UDM_RESSEC_WORD_PHRPOS, SECDEF(WSI "@phrpos"), NULL, NULL},
268
269 {UDM_RESSEC_TOTAL_RESULTS, SECDEF("/rss/channel/openSearch:totalResults"), NULL, NULL},
270
271 {UDM_RESSEC_ITEM, SECDEF("/rss/channel/item"), NULL, NULL},
272 {UDM_RESSEC_ITEM_SCORE, SECDEF("/rss/channel/item/score"), NULL, NULL},
273 {UDM_RESSEC_ITEM_PERSITE, SECDEF("/rss/channel/item/persite"), NULL, NULL},
274
275 /* Duplicate: */
276 {0, RCI("updated"), "Last-Modified", PD_ReplaceLastModified},
277 {0, RCI("pubDate"), "Last-Modified", PD_ReplaceLastModified},
278 {0, RCI("last-modified"), "Last-Modified", PD_ReplaceLastModified},
279
280 {0, RCI("title"), "title", PD_ReplaceOrAppendStrn_hl0},
281 {0, RCI("title/b"), "title", PD_ReplaceOrAppendStrn_hl1},
282
283 /* Duplicate: */
284 {0, RCI("body"), "body", PD_ReplaceOrAppendStrn_hl0},
285 {0, RCI("description"), "body", PD_ReplaceOrAppendStrn_hl0},
286 {0, RCI("description/b"), "body", PD_ReplaceOrAppendStrn_hl1},
287
288 /* Duplicate: */
289 {0, RCI("link"), "url", PD_ReplaceStrn},
290 {0, RCI("url"), "url", PD_ReplaceStrn},
291
292 {0, RCI("id"), "id", PD_ReplaceStrn},
293 {0, RCI("content-length"), "Content-Length", PD_ReplaceStrn},
294 {0, RCI("content-type"), "Content-Type", PD_ReplaceStrn},
295 {0, RCI("cached"), "stored_href", PD_ReplaceStrn},
296
297 /* Duplicate: */
298 {0, RCI("CachedCopyBase64"),"CachedCopyBase64", PD_ReplaceStrn},
299 {0, RCI("cached-content"), "CachedCopyBase64", PD_ReplaceStrn},
300
301 {0, RCI("tag"), "tag", PD_ReplaceStrn},
302 {0, RCI("crc32"), "crc32", PD_ReplaceStrn},
303 {0, RCI("charset"), "charset", PD_ReplaceStrn},
304 {0, RCI("status"), "status", PD_ReplaceStrn},
305
306 {0, 0, NULL, NULL, NULL}
307 };
308
309
res_sec_find(const char * attr,size_t len)310 static struct udm_res_section_st *res_sec_find(const char *attr, size_t len)
311 {
312 struct udm_res_section_st *s;
313 for (s= res_sec; s->str; s++)
314 {
315 if (len == s->length && !strncasecmp(attr, s->str, len))
316 return s;
317 }
318 return NULL;
319 }
320
321
322 static udm_rc_t
ResFromXMLEnter(UDM_XML_PARSER * parser,const char * name,size_t l)323 ResFromXMLEnter(UDM_XML_PARSER *parser, const char *name, size_t l)
324 {
325 RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
326 struct udm_res_section_st *st= res_sec_find(parser->attr,
327 parser->attrend - parser->attr);
328 D->state= st ? st->state : 0;
329 if (D->state == UDM_RESSEC_WORD)
330 {
331 UdmWideWordInit(&D->WW);
332 D->WW.Param.origin= UDM_WORD_ORIGIN_QUERY;
333 }
334 if (D->state == UDM_RESSEC_ITEM)
335 {
336 char dbuf[128];
337 UdmDocInit(&D->Doc);
338 snprintf(dbuf, 128, "%.5f", (float) 0);
339 UdmVarListReplaceStr(&D->Doc.Sections, "Pop_Rank", dbuf);
340 }
341 return UDM_OK;
342 }
343
344
345 static udm_rc_t
ResFromXMLAddDocHook(UDM_XML_PARSER * parser)346 ResFromXMLAddDocHook(UDM_XML_PARSER *parser)
347 {
348 RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
349 size_t nbytes;
350 D->Query->URLData.nitems++;
351 D->Query->Res.num_rows++;
352 nbytes= D->Query->Res.num_rows * sizeof(UDM_DOCUMENT);
353 D->Query->Res.Doc= (UDM_DOCUMENT*) UdmRealloc(D->Query->Res.Doc, nbytes);
354 D->Query->Res.Doc[D->Query->Res.num_rows-1]= D->Doc;
355 bzero((void*)&D->Doc, sizeof(UDM_DOCUMENT));
356
357 nbytes= D->Query->Res.num_rows * sizeof(UDM_URLDATA);
358 D->Query->URLData.Item= (UDM_URLDATA*) UdmRealloc(D->Query->URLData.Item, nbytes);
359 bzero((void*)&D->Query->URLData.Item[D->Query->Res.num_rows-1], sizeof(UDM_URLDATA));
360 D->Query->URLData.Item[D->Query->Res.num_rows-1].url_id= D->Query->URLData.nitems-1;
361 D->Query->URLData.Item[D->Query->Res.num_rows-1].score= D->score;
362 D->Query->URLData.Item[D->Query->Res.num_rows-1].per_site= D->per_site;
363 D->Query->URLData.Item[D->Query->Res.num_rows-1].site_id= D->site_id;
364
365 D->score= 0;
366 D->per_site= 0;
367 D->site_id= 0;
368 return UDM_OK;
369 }
370
371 static udm_rc_t
ResFromXMLAddDocHookImport(UDM_XML_PARSER * parser)372 ResFromXMLAddDocHookImport(UDM_XML_PARSER *parser)
373 {
374 RES_PARSER_DATA *Data= (RES_PARSER_DATA*) parser->user_data;
375 UDM_DOCUMENT *D= &Data->Doc;
376 UdmDocAction(Data->Agent, D, UDM_DOCCMD_RESTOREDATA);
377 UdmVarListFree(&D->Sections);
378 return UDM_OK;
379 }
380
381 static udm_rc_t
ResFromXMLLeave(UDM_XML_PARSER * parser,const char * name,size_t l)382 ResFromXMLLeave(UDM_XML_PARSER *parser, const char *name, size_t l)
383 {
384 RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
385 struct udm_res_section_st *st= res_sec_find(parser->attr,
386 parser->attrend - parser->attr);
387 D->state= st ? st->state : 0;
388
389 if (D->state == UDM_RESSEC_WORD)
390 {
391 if (!D->WW.Word.str)
392 {
393 D->WW.Word.str= UdmStrdup("<empty>");
394 D->WW.Word.length= 7;
395 }
396 UdmWideWordListAddForStat(&D->Query->Res.WWList, &D->WW);
397 UdmWideWordFree(&D->WW);
398 }
399 if (D->state == UDM_RESSEC_ITEM)
400 {
401 if (D->Query)
402 ResFromXMLAddDocHook(parser);
403 else
404 ResFromXMLAddDocHookImport(parser);
405 }
406 /* fprintf(stderr, "leave: len=%d '%s'\n", l, name);*/
407 return(UDM_OK);
408 }
409
410
411 static udm_rc_t
ResFromXMLValue(UDM_XML_PARSER * parser,const char * s,size_t len)412 ResFromXMLValue(UDM_XML_PARSER *parser, const char *s, size_t len)
413 {
414 RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
415 struct udm_res_section_st *st= res_sec_find(parser->attr,
416 parser->attrend - parser->attr);
417 if (!st)
418 {
419 /* Add user defined tags */
420 if (!strncasecmp(parser->attr, UDM_CSTR_WITH_LEN("/rss/channel/item/")))
421 UdmVarListReplaceStrn(&D->Doc.Sections, parser->attr + 18, s, len);
422 return UDM_OK;
423 }
424
425 if (st->handler)
426 {
427 st->handler(D, st->section_name, s, len);
428 return UDM_OK;
429 }
430 D->state= st->state;
431 switch (D->state)
432 {
433 case UDM_RESSEC_WORD :
434 UdmFree(D->WW.Word.str);
435 D->WW.Word.str= udm_strndup(s, len);
436 D->WW.Word.length= len;
437 break;
438 case UDM_RESSEC_WORD_WORD :
439 UdmFree(D->WW.Word.str);
440 D->WW.Word.str= udm_strndup(s, len);
441 D->WW.Word.length= len;
442 break;
443 case UDM_RESSEC_WORD_ID :
444 break;
445 case UDM_RESSEC_WORD_ORDER :
446 D->WW.Param.order= atoi(s);
447 break;
448 case UDM_RESSEC_WORD_COUNT :
449 D->WW.Param.count= atoi(s);
450 break;
451 case UDM_RESSEC_WORD_ORIGIN :
452 D->WW.Param.origin= atoi(s); /* TODO34: check if valid */
453 break;
454 case UDM_RESSEC_WORD_WEIGHT :
455 D->WW.Param.weight= atoi(s);
456 break;
457 case UDM_RESSEC_WORD_MATCH :
458 D->WW.Param.match_mode= atoi(s); /* TODO34: check if valid */
459 break;
460 case UDM_RESSEC_WORD_SECNO :
461 D->WW.Param.secno= atoi(s);
462 break;
463 case UDM_RESSEC_WORD_PHRLEN :
464 D->WW.Param.phrlen= atoi(s);
465 break;
466 case UDM_RESSEC_WORD_PHRPOS :
467 D->WW.Param.phrpos= atoi(s);
468 break;
469 case UDM_RESSEC_ITEM_PERSITE:
470 D->per_site= udm_strntod(s, len);
471 break;
472 case UDM_RESSEC_ITEM_SCORE:
473 D->score= udm_strntod(s, len) * 1000 + 0.5;
474 break;
475 case UDM_RESSEC_TOTAL_RESULTS:
476 D->Query->stats.total_found= atoi(s);
477 break;
478 }
479 /*fprintf(stderr, "UdmXMLValue: st=%d '%.*s' name='%s'\n", D->state, len, s, parser->attr);*/
480 return UDM_OK;
481 }
482
483
484 udm_rc_t
UdmQueryFromXML(UDM_AGENT * A,UDM_QUERY * Query,const char * str,size_t length,UDM_CHARSET * cs)485 UdmQueryFromXML(UDM_AGENT *A, UDM_QUERY *Query,
486 const char *str, size_t length, UDM_CHARSET *cs)
487 {
488 udm_rc_t rc= UDM_OK;
489 RES_PARSER_DATA Data;
490 UDM_XML_PARSER parser;
491 const char *date_format= UdmVarListFindStr(&A->Conf->Vars, "DateFormat",
492 "%a, %d %b %Y, %X %Z");
493 UdmXMLParserCreate(&parser);
494 parser.flags |= UDM_XML_SKIP_TEXT_NORMALIZATION;
495 bzero(&Data, sizeof(Data));
496 Data.Agent= A;
497 Data.Query= Query;
498 Data.cs= cs;
499 udm_snprintf(Data.date_format, UDM_DF_SIZE, "%s", date_format);
500
501 UdmXMLSetUserData(&parser, &Data);
502 UdmXMLSetEnterHandler(&parser, ResFromXMLEnter);
503 UdmXMLSetLeaveHandler(&parser, ResFromXMLLeave);
504 UdmXMLSetValueHandler(&parser, ResFromXMLValue);
505
506 if (UDM_OK != (rc= UdmXMLParserExec(&parser, str, length)))
507 {
508 char err[256];
509 udm_snprintf(err, sizeof(err),
510 "XML parsing error: %s at line %d pos %d",
511 UdmXMLErrorString(&parser),
512 (int) UdmXMLErrorLineno(&parser),
513 (int) UdmXMLErrorPos(&parser));
514 }
515
516 UdmXMLParserFree(&parser);
517 return rc;
518 }
519