1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <stdlib.h>
23 #include <string.h>
24 
25 #include "udm_common.h"
26 #include "udm_word.h"
27 #include "udm_doc.h"
28 #include "udm_utils.h"
29 #include "udm_result.h"
30 #include "udm_parsehtml.h"
31 #include "udm_parsexml.h"
32 #include "udm_vars.h"
33 #include "udm_searchtool.h"
34 #include "udm_boolean.h"
35 
36 #include "udm_db.h" /* for UdmDocAction */
37 
38 
/*
  Put a result object into the all-zero state.
  The previous contents of *Res are overwritten, not released.
*/
void UdmResultInit(UDM_RESULT *Res)
{
  memset(Res, 0, sizeof(*Res));
}
43 
44 
45 UDM_API(void)
UdmResultFree(UDM_RESULT * Res)46 UdmResultFree(UDM_RESULT * Res)
47 {
48   size_t i;
49   if(!Res)return;
50   UdmVarListFree(&Res->Vars);
51   UdmWideWordListFree(&Res->WWList);
52   if(Res->Doc)
53   {
54     for(i=0;i<Res->num_rows;i++)
55     {
56       UdmDocFree(&Res->Doc[i]);
57     }
58     UdmFree(Res->Doc);
59   }
60   bzero((void*)Res, sizeof(*Res));
61 }
62 
63 
UdmResultNumRows(UDM_RESULT * Res)64 UDM_API(size_t) UdmResultNumRows(UDM_RESULT *Res)
65 {
66   return Res->num_rows;
67 }
68 
69 
70 UDM_API(size_t)
UdmResultFirst(UDM_RESULT * Res)71 UdmResultFirst(UDM_RESULT *Res)
72 {
73   return (size_t) UdmVarListFindInt(&Res->Vars, "first", 1) - 1;
74 }
75 
76 
77 UDM_API(size_t)
UdmResultLast(UDM_RESULT * Res)78 UdmResultLast(UDM_RESULT *Res)
79 {
80   return (size_t) UdmVarListFindInt(&Res->Vars, "last", 1) - 1;
81 }
82 
83 
84 UDM_API(size_t)
UdmResultTotalFound(UDM_RESULT * Res)85 UdmResultTotalFound(UDM_RESULT *Res)
86 {
87   return (size_t) UdmVarListFindInt(&Res->Vars, "total", 0);
88 }
89 
90 
91 UDM_API(udm_rc_t)
UdmResultToVarList(UDM_VARLIST * Vars,const UDM_RESULT * Res)92 UdmResultToVarList(UDM_VARLIST *Vars, const UDM_RESULT *Res)
93 {
94   char search_time[100];
95   /* Convert milliseconds to seconds */
96   udm_snprintf(search_time, sizeof(search_time), "%.3f",
97                ((double) UdmVarListFindInt(&Res->Vars, "SearchTime", 0))/1000);
98   if (UDM_OK != UdmVarListReplaceStr(Vars, "SearchTime", search_time))
99     return UDM_ERROR;
100   return UdmVarListReplaceLst(Vars, &Res->Vars, NULL, "*");
101 }
102 
103 /***************************************************************/
104 udm_rc_t
UdmResultAppendAndInit(UDM_RESULT * Res)105 UdmResultAppendAndInit(UDM_RESULT *Res)
106 {
107   size_t nbytes= (Res->num_rows + 1) * sizeof(UDM_DOCUMENT);
108   UDM_DOCUMENT *tmp;
109   if (!(tmp= UdmRealloc(Res->Doc, nbytes)))
110     return UDM_ERROR;
111   Res->Doc= tmp;
112   UdmDocInit(&Res->Doc[Res->num_rows]);
113   Res->num_rows++;
114   return UDM_OK;
115 }
116 
117 
118 udm_rc_t
UdmResultAppendDoc(UDM_RESULT * Res,UDM_DOCUMENT * Doc)119 UdmResultAppendDoc(UDM_RESULT *Res, UDM_DOCUMENT *Doc)
120 {
121   size_t nbytes= (Res->num_rows + 1) * sizeof(UDM_DOCUMENT);
122   UDM_DOCUMENT *tmp;
123   if (!(tmp= UdmRealloc(Res->Doc, nbytes)))
124     return UDM_ERROR;
125   Res->Doc= tmp;
126   Res->Doc[Res->num_rows]= Doc[0];
127   Res->num_rows++;
128   return UDM_OK;
129 }
130 
131 
132 /*****************************************************************/
133 #define UDM_DF_SIZE 64
134 
135 typedef struct
136 {
137   int state;
138   UDM_AGENT *Agent;
139   UDM_WIDEWORD WW;
140   UDM_DOCUMENT Doc;
141   UDM_QUERY *Query;
142   UDM_CHARSET *cs;
143   uint4       score;
144   uint4       per_site;
145   urlid_t     site_id;
146   char date_format[UDM_DF_SIZE];
147 } RES_PARSER_DATA;
148 
149 
150 static udm_rc_t
PD_ReplaceOrAppendStrn_hl0(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)151 PD_ReplaceOrAppendStrn_hl0(RES_PARSER_DATA *D,
152                         const char *name,
153                         const char *val, size_t len)
154 {
155   return UdmVarListReplaceOrAppendStrn(&D->Doc.Sections, name, val, len, 0);
156 }
157 
158 
159 static udm_rc_t
PD_ReplaceOrAppendStrn_hl1(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)160 PD_ReplaceOrAppendStrn_hl1(RES_PARSER_DATA *D,
161                            const char *name,
162                            const char *val, size_t len)
163 {
164   return UdmVarListReplaceOrAppendStrn(&D->Doc.Sections, name, val, len, 1);
165 }
166 
167 
168 static udm_rc_t
PD_ReplaceStrn(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)169 PD_ReplaceStrn(RES_PARSER_DATA *D,
170                const char *name,
171                const char *val, size_t len)
172 {
173   return UdmVarListReplaceStrn(&D->Doc.Sections, name, val, len) ? UDM_OK : UDM_ERROR;
174 }
175 
176 
177 static udm_rc_t
PD_ReplaceLastModified(RES_PARSER_DATA * D,const char * name,const char * val,size_t len)178 PD_ReplaceLastModified(RES_PARSER_DATA *D,
179                        const char *name,
180                        const char *val, size_t len)
181 {
182   time_t last_mod_time;
183   char tmp[UDM_MAXTIMESTRLEN];
184   len= len >= sizeof(tmp) ? sizeof(tmp) - 1 : len;
185   memcpy(tmp, val, len);
186   tmp[len]= '\0';
187   last_mod_time= UdmHttpDate2Time_t(tmp);
188   UdmVarListReplaceInt(&D->Doc.Sections, "Last-Modified-Timestamp", (int) last_mod_time);
189   if ((len= strftime(tmp, sizeof(tmp)-1, D->date_format, localtime(&last_mod_time))))
190     tmp[len]= '\0';
191   else
192     UdmTime_t2HttpStr(last_mod_time, tmp, sizeof(tmp));
193   UdmVarListReplaceStr(&D->Doc.Sections, "Last-Modified", tmp);
194   return UDM_OK;
195 }
196 
197 
198 typedef udm_rc_t
199 (*section_handler)(RES_PARSER_DATA *D,
200                    const char *name,
201                    const char *val, size_t len);
202 
203 
204 
205 
206 #define RES_MISC 0
207 
208 struct udm_res_section_st
209 {
210   int        state;
211   size_t     length;
212   const char *str;
213   const char *section_name;
214   section_handler handler;
215 };
216 
/* State codes: result totals */
#define UDM_RESSEC_TOTAL_RESULTS       100

/* State codes: word statistics element and its attributes */
#define UDM_RESSEC_WORD                1000
#define UDM_RESSEC_WORD_ID             1001
#define UDM_RESSEC_WORD_ORDER          1002
#define UDM_RESSEC_WORD_COUNT          1003
#define UDM_RESSEC_WORD_ORIGIN         1004
#define UDM_RESSEC_WORD_WEIGHT         1005
#define UDM_RESSEC_WORD_MATCH          1006
#define UDM_RESSEC_WORD_SECNO          1007
#define UDM_RESSEC_WORD_PHRLEN         1008
#define UDM_RESSEC_WORD_PHRPOS         1009
#define UDM_RESSEC_WORD_WORD           1010

/* State codes: result item and its children */
#define UDM_RESSEC_ITEM                2000
#define UDM_RESSEC_ITEM_DESCR          2002
#define UDM_RESSEC_ITEM_SCORE          2005

#define UDM_RESSEC_ITEM_PERSITE        2011
#define UDM_RESSEC_ITEM_DESCR_B        2014

/* Expands to two initializers: the length of literal x, then x itself */
#define SECDEF(x)  ((size_t) (sizeof(x) - 1)), (x)
/* Path prefixes used by the section table */
#define WSL "/rss/channel/mnoGoSearch:WordStatList"
#define WSI "/rss/channel/mnoGoSearch:WordStatList/mnoGoSearch:WordStatItem"
/* SECDEF for a child path of /rss/channel/item/ */
#define RCI(x) SECDEF("/rss/channel/item/" x)
242 
243 static struct udm_res_section_st res_sec[]=
244 {
245   {UDM_RESSEC_WORD,           SECDEF("/result/wordinfo/word"),        NULL, NULL},
246   {UDM_RESSEC_WORD_ID,        SECDEF("/result/wordinfo/word@id"),     NULL, NULL},
247   {UDM_RESSEC_WORD_ORDER,     SECDEF("/result/wordinfo/word@order"),  NULL, NULL},
248   {UDM_RESSEC_WORD_COUNT,     SECDEF("/result/wordinfo/word@count"),  NULL, NULL},
249   {UDM_RESSEC_WORD_ORIGIN,    SECDEF("/result/wordinfo/word@origin"), NULL, NULL},
250   {UDM_RESSEC_WORD_WEIGHT,    SECDEF("/result/wordinfo/word@weight"), NULL, NULL},
251   {UDM_RESSEC_WORD_MATCH,     SECDEF("/result/wordinfo/word@match"),  NULL, NULL},
252   {UDM_RESSEC_WORD_SECNO,     SECDEF("/result/wordinfo/word@secno"),  NULL, NULL},
253   {UDM_RESSEC_WORD_PHRLEN,    SECDEF("/result/wordinfo/word@phrlen"), NULL, NULL},
254   {UDM_RESSEC_WORD_PHRPOS,    SECDEF("/result/wordinfo/word@phrpos"), NULL, NULL},
255   {UDM_RESSEC_TOTAL_RESULTS,  SECDEF("/result/totalResults"),         NULL, NULL},
256 
257   {UDM_RESSEC_WORD,           SECDEF(WSI),           NULL, NULL},
258   {UDM_RESSEC_WORD_ID,        SECDEF(WSI "@id"),     NULL, NULL},
259   {UDM_RESSEC_WORD_WORD,      SECDEF(WSI "@word"),   NULL, NULL},
260   {UDM_RESSEC_WORD_ORDER,     SECDEF(WSI "@order"),  NULL, NULL},
261   {UDM_RESSEC_WORD_COUNT,     SECDEF(WSI "@count"),  NULL, NULL},
262   {UDM_RESSEC_WORD_ORIGIN,    SECDEF(WSI "@origin"), NULL, NULL},
263   {UDM_RESSEC_WORD_WEIGHT,    SECDEF(WSI "@weight"), NULL, NULL},
264   {UDM_RESSEC_WORD_MATCH,     SECDEF(WSI "@match"),  NULL, NULL},
265   {UDM_RESSEC_WORD_SECNO,     SECDEF(WSI "@secno"),  NULL, NULL},
266   {UDM_RESSEC_WORD_PHRLEN,    SECDEF(WSI "@phrlen"), NULL, NULL},
267   {UDM_RESSEC_WORD_PHRPOS,    SECDEF(WSI "@phrpos"), NULL, NULL},
268 
269   {UDM_RESSEC_TOTAL_RESULTS,  SECDEF("/rss/channel/openSearch:totalResults"), NULL, NULL},
270 
271   {UDM_RESSEC_ITEM,           SECDEF("/rss/channel/item"),         NULL, NULL},
272   {UDM_RESSEC_ITEM_SCORE,     SECDEF("/rss/channel/item/score"),   NULL, NULL},
273   {UDM_RESSEC_ITEM_PERSITE,   SECDEF("/rss/channel/item/persite"), NULL, NULL},
274 
275   /* Duplicate: */
276   {0, RCI("updated"),         "Last-Modified",    PD_ReplaceLastModified},
277   {0, RCI("pubDate"),         "Last-Modified",    PD_ReplaceLastModified},
278   {0, RCI("last-modified"),   "Last-Modified",    PD_ReplaceLastModified},
279 
280   {0, RCI("title"),           "title",            PD_ReplaceOrAppendStrn_hl0},
281   {0, RCI("title/b"),         "title",            PD_ReplaceOrAppendStrn_hl1},
282 
283   /* Duplicate: */
284   {0, RCI("body"),            "body",             PD_ReplaceOrAppendStrn_hl0},
285   {0, RCI("description"),     "body",             PD_ReplaceOrAppendStrn_hl0},
286   {0, RCI("description/b"),   "body",             PD_ReplaceOrAppendStrn_hl1},
287 
288   /* Duplicate: */
289   {0, RCI("link"),            "url",              PD_ReplaceStrn},
290   {0, RCI("url"),             "url",              PD_ReplaceStrn},
291 
292   {0, RCI("id"),              "id",               PD_ReplaceStrn},
293   {0, RCI("content-length"),  "Content-Length",   PD_ReplaceStrn},
294   {0, RCI("content-type"),    "Content-Type",     PD_ReplaceStrn},
295   {0, RCI("cached"),          "stored_href",      PD_ReplaceStrn},
296 
297   /* Duplicate: */
298   {0, RCI("CachedCopyBase64"),"CachedCopyBase64", PD_ReplaceStrn},
299   {0, RCI("cached-content"),  "CachedCopyBase64", PD_ReplaceStrn},
300 
301   {0, RCI("tag"),             "tag",              PD_ReplaceStrn},
302   {0, RCI("crc32"),           "crc32",            PD_ReplaceStrn},
303   {0, RCI("charset"),         "charset",          PD_ReplaceStrn},
304   {0, RCI("status"),          "status",           PD_ReplaceStrn},
305 
306   {0, 0, NULL, NULL, NULL}
307 };
308 
309 
res_sec_find(const char * attr,size_t len)310 static struct udm_res_section_st *res_sec_find(const char *attr, size_t len)
311 {
312   struct udm_res_section_st *s;
313   for (s= res_sec; s->str; s++)
314   {
315     if (len == s->length && !strncasecmp(attr, s->str, len))
316       return s;
317   }
318   return NULL;
319 }
320 
321 
322 static udm_rc_t
ResFromXMLEnter(UDM_XML_PARSER * parser,const char * name,size_t l)323 ResFromXMLEnter(UDM_XML_PARSER *parser, const char *name, size_t l)
324 {
325   RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
326   struct udm_res_section_st *st= res_sec_find(parser->attr,
327                                               parser->attrend - parser->attr);
328   D->state= st ? st->state : 0;
329   if (D->state == UDM_RESSEC_WORD)
330   {
331     UdmWideWordInit(&D->WW);
332     D->WW.Param.origin= UDM_WORD_ORIGIN_QUERY;
333   }
334   if (D->state == UDM_RESSEC_ITEM)
335   {
336     char dbuf[128];
337     UdmDocInit(&D->Doc);
338     snprintf(dbuf, 128, "%.5f", (float) 0);
339     UdmVarListReplaceStr(&D->Doc.Sections, "Pop_Rank", dbuf);
340   }
341   return UDM_OK;
342 }
343 
344 
345 static udm_rc_t
ResFromXMLAddDocHook(UDM_XML_PARSER * parser)346 ResFromXMLAddDocHook(UDM_XML_PARSER *parser)
347 {
348   RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
349   size_t nbytes;
350   D->Query->URLData.nitems++;
351   D->Query->Res.num_rows++;
352   nbytes= D->Query->Res.num_rows * sizeof(UDM_DOCUMENT);
353   D->Query->Res.Doc= (UDM_DOCUMENT*) UdmRealloc(D->Query->Res.Doc, nbytes);
354   D->Query->Res.Doc[D->Query->Res.num_rows-1]= D->Doc;
355   bzero((void*)&D->Doc, sizeof(UDM_DOCUMENT));
356 
357   nbytes= D->Query->Res.num_rows * sizeof(UDM_URLDATA);
358   D->Query->URLData.Item= (UDM_URLDATA*) UdmRealloc(D->Query->URLData.Item, nbytes);
359   bzero((void*)&D->Query->URLData.Item[D->Query->Res.num_rows-1], sizeof(UDM_URLDATA));
360   D->Query->URLData.Item[D->Query->Res.num_rows-1].url_id= D->Query->URLData.nitems-1;
361   D->Query->URLData.Item[D->Query->Res.num_rows-1].score= D->score;
362   D->Query->URLData.Item[D->Query->Res.num_rows-1].per_site= D->per_site;
363   D->Query->URLData.Item[D->Query->Res.num_rows-1].site_id= D->site_id;
364 
365   D->score= 0;
366   D->per_site= 0;
367   D->site_id= 0;
368   return UDM_OK;
369 }
370 
371 static udm_rc_t
ResFromXMLAddDocHookImport(UDM_XML_PARSER * parser)372 ResFromXMLAddDocHookImport(UDM_XML_PARSER *parser)
373 {
374   RES_PARSER_DATA *Data= (RES_PARSER_DATA*) parser->user_data;
375   UDM_DOCUMENT *D= &Data->Doc;
376   UdmDocAction(Data->Agent, D, UDM_DOCCMD_RESTOREDATA);
377   UdmVarListFree(&D->Sections);
378   return UDM_OK;
379 }
380 
381 static udm_rc_t
ResFromXMLLeave(UDM_XML_PARSER * parser,const char * name,size_t l)382 ResFromXMLLeave(UDM_XML_PARSER *parser, const char *name, size_t l)
383 {
384   RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
385   struct udm_res_section_st *st= res_sec_find(parser->attr,
386                                               parser->attrend - parser->attr);
387   D->state= st ? st->state : 0;
388 
389   if (D->state == UDM_RESSEC_WORD)
390   {
391     if (!D->WW.Word.str)
392     {
393       D->WW.Word.str= UdmStrdup("<empty>");
394       D->WW.Word.length= 7;
395     }
396     UdmWideWordListAddForStat(&D->Query->Res.WWList, &D->WW);
397     UdmWideWordFree(&D->WW);
398   }
399   if (D->state == UDM_RESSEC_ITEM)
400   {
401     if (D->Query)
402       ResFromXMLAddDocHook(parser);
403     else
404       ResFromXMLAddDocHookImport(parser);
405   }
406   /* fprintf(stderr, "leave: len=%d '%s'\n", l, name);*/
407   return(UDM_OK);
408 }
409 
410 
411 static udm_rc_t
ResFromXMLValue(UDM_XML_PARSER * parser,const char * s,size_t len)412 ResFromXMLValue(UDM_XML_PARSER *parser, const char *s, size_t len)
413 {
414   RES_PARSER_DATA *D= (RES_PARSER_DATA*) parser->user_data;
415   struct udm_res_section_st *st= res_sec_find(parser->attr,
416                                               parser->attrend - parser->attr);
417   if (!st)
418   {
419     /* Add user defined tags */
420     if (!strncasecmp(parser->attr, UDM_CSTR_WITH_LEN("/rss/channel/item/")))
421       UdmVarListReplaceStrn(&D->Doc.Sections, parser->attr + 18, s, len);
422     return UDM_OK;
423   }
424 
425   if (st->handler)
426   {
427     st->handler(D, st->section_name, s, len);
428     return UDM_OK;
429   }
430   D->state= st->state;
431   switch (D->state)
432   {
433     case  UDM_RESSEC_WORD         :
434       UdmFree(D->WW.Word.str);
435       D->WW.Word.str= udm_strndup(s, len);
436       D->WW.Word.length= len;
437       break;
438     case  UDM_RESSEC_WORD_WORD    :
439       UdmFree(D->WW.Word.str);
440       D->WW.Word.str= udm_strndup(s, len);
441       D->WW.Word.length= len;
442       break;
443     case  UDM_RESSEC_WORD_ID      :
444       break;
445     case  UDM_RESSEC_WORD_ORDER   :
446       D->WW.Param.order= atoi(s);
447       break;
448     case  UDM_RESSEC_WORD_COUNT   :
449       D->WW.Param.count= atoi(s);
450       break;
451     case  UDM_RESSEC_WORD_ORIGIN  :
452       D->WW.Param.origin= atoi(s); /* TODO34: check if valid */
453       break;
454     case  UDM_RESSEC_WORD_WEIGHT  :
455       D->WW.Param.weight= atoi(s);
456       break;
457     case  UDM_RESSEC_WORD_MATCH   :
458       D->WW.Param.match_mode= atoi(s); /* TODO34: check if valid */
459       break;
460     case  UDM_RESSEC_WORD_SECNO   :
461       D->WW.Param.secno= atoi(s);
462       break;
463     case  UDM_RESSEC_WORD_PHRLEN  :
464       D->WW.Param.phrlen= atoi(s);
465       break;
466     case  UDM_RESSEC_WORD_PHRPOS  :
467       D->WW.Param.phrpos= atoi(s);
468       break;
469     case UDM_RESSEC_ITEM_PERSITE:
470       D->per_site= udm_strntod(s, len);
471       break;
472     case UDM_RESSEC_ITEM_SCORE:
473       D->score= udm_strntod(s, len) * 1000 + 0.5;
474       break;
475     case UDM_RESSEC_TOTAL_RESULTS:
476       D->Query->stats.total_found= atoi(s);
477       break;
478   }
479   /*fprintf(stderr, "UdmXMLValue: st=%d '%.*s' name='%s'\n", D->state, len, s, parser->attr);*/
480   return UDM_OK;
481 }
482 
483 
484 udm_rc_t
UdmQueryFromXML(UDM_AGENT * A,UDM_QUERY * Query,const char * str,size_t length,UDM_CHARSET * cs)485 UdmQueryFromXML(UDM_AGENT *A, UDM_QUERY *Query,
486                 const char *str, size_t length, UDM_CHARSET *cs)
487 {
488   udm_rc_t rc= UDM_OK;
489   RES_PARSER_DATA Data;
490   UDM_XML_PARSER parser;
491   const char *date_format= UdmVarListFindStr(&A->Conf->Vars, "DateFormat",
492                                              "%a, %d %b %Y, %X %Z");
493   UdmXMLParserCreate(&parser);
494   parser.flags |= UDM_XML_SKIP_TEXT_NORMALIZATION;
495   bzero(&Data, sizeof(Data));
496   Data.Agent= A;
497   Data.Query= Query;
498   Data.cs= cs;
499   udm_snprintf(Data.date_format, UDM_DF_SIZE, "%s", date_format);
500 
501   UdmXMLSetUserData(&parser, &Data);
502   UdmXMLSetEnterHandler(&parser, ResFromXMLEnter);
503   UdmXMLSetLeaveHandler(&parser, ResFromXMLLeave);
504   UdmXMLSetValueHandler(&parser, ResFromXMLValue);
505 
506   if (UDM_OK != (rc= UdmXMLParserExec(&parser, str, length)))
507   {
508     char err[256];
509     udm_snprintf(err, sizeof(err),
510                  "XML parsing error: %s at line %d pos %d",
511                   UdmXMLErrorString(&parser),
512                   (int) UdmXMLErrorLineno(&parser),
513                   (int) UdmXMLErrorPos(&parser));
514   }
515 
516   UdmXMLParserFree(&parser);
517   return rc;
518 }
519