1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #include "udm_common.h"
24 #include "udm_utils.h"
25 #include "udm_db.h"
26 #include "udm_db_int.h"
27 #include "udm_url.h"
28 #include "udm_vars.h"
29 #include "udm_coords.h"
30 #include "udm_log.h"
31 #include "udm_hash.h"
32 #include "udm_doc.h"
33 #include "udm_word.h"
34 #include "udm_indexer.h"
35 #include "udm_parsehtml.h"
36 
37 #ifdef HAVE_SQL
38 
39 
/*
  Packed word coordinate, as stored in the "coord" column of "dict".

  UDM_WRDCOORD(p,s) adds position p to section number s shifted left
                    by 24 bits.
  UDM_WRDSEC(c)     returns everything above bit 23 of c.
  UDM_WRDPOS(c)     returns the low 21 bits of c (mask 0x001FFFFF).

  Bits 21..23 are never returned by UDM_WRDPOS(), and UDM_WRDCOORD()
  does not mask p, so a position wider than 24 bits would spill into
  the section number.
*/
#define UDM_WRDCOORD(p,s)  ((unsigned int)(p)+(((unsigned int)(s))<<24))
#define UDM_WRDSEC(c)      ((unsigned int)(c)>>24)
#define UDM_WRDPOS(c)      (0x001FFFFF&((unsigned int)(c)))
43 
44 
45 static udm_rc_t
UdmDeleteWordsFromURLSingle(UDM_AGENT * Indexer,UDM_DB * db,urlid_t url_id)46 UdmDeleteWordsFromURLSingle(UDM_AGENT *Indexer, UDM_DB *db, urlid_t url_id)
47 {
48   char qbuf[512];
49   udm_snprintf(qbuf, sizeof(qbuf), "DELETE FROM dict WHERE url_id=%d", url_id);
50   return UdmDBSQLQuery(Indexer, db, NULL, qbuf);
51 }
52 
53 
54 static udm_rc_t
StoreWordsMySQL(UDM_AGENT * A,UDM_DB * db,UDM_WORDLIST * Words,urlid_t url_id)55 StoreWordsMySQL(UDM_AGENT *A, UDM_DB *db, UDM_WORDLIST *Words, urlid_t url_id)
56 {
57   udm_rc_t rc;
58   size_t nstored= 0, i;
59 
60   if (!Words->nwords)
61     return UDM_OK;
62 
63   for(nstored= 0; nstored < Words->nwords; )
64   {
65     char * qb,*qe;
66     size_t step=1024;
67     size_t mlen=1024;
68     size_t rstored = 0;
69 
70     qb=(char*)UdmMalloc(mlen);
71     strcpy(qb,"INSERT INTO dict (word,url_id,coord) VALUES ");
72     qe=qb+strlen(qb);
73 
74     for(i= nstored; i< Words->nwords;i++)
75     {
76       size_t len=qe-qb;
77       if (!Words->Word[i].coord.secno)
78       {
79         nstored++;
80         continue;
81       }
82       rstored++;
83 
84       /* UDM_MAXWORDSIZE+100 should be enough */
85       if((len + A->Conf->WordParam.max_word_len + 100) >= mlen)
86       {
87         mlen+=step;
88         qb=(char*)UdmRealloc(qb,mlen);
89         qe=qb+len;
90       }
91 
92       if(i>nstored)*qe++=',';
93 
94       if (UdmSQLDBMode(db) == UDM_SQLDBMODE_SINGLE)
95       {
96         *qe++='(';
97         *qe++='\'';
98         strcpy(qe, Words->Word[i].word);
99         while(*qe)qe++;
100         *qe++='\'';
101         *qe++=',';
102         qe+=sprintf(qe,"%d,%d",url_id,
103                     UDM_WRDCOORD(Words->Word[i].coord.pos,
104                                  Words->Word[i].coord.secno) /*+
105                                  Words->Word[i].seclen_marker*/);
106         *qe++=')';
107         *qe='\0';
108       }
109       if(qe>qb+UDM_MAX_MULTI_INSERT_QSIZE)
110         break;
111     }
112     nstored = i + 1;
113     rc = (rstored > 0) ? UdmDBSQLQuery(A, db, NULL, qb) : UDM_OK;
114     UDM_FREE(qb);
115     if(rc!=UDM_OK)
116       break;
117   }
118   return rc;
119 }
120 
121 
122 static udm_rc_t
StoreWordsSingleGeneric(UDM_AGENT * A,UDM_DB * db,UDM_WORDLIST * Words,urlid_t url_id)123 StoreWordsSingleGeneric(UDM_AGENT *A, UDM_DB *db,
124                         UDM_WORDLIST *Words, urlid_t url_id)
125 {
126   size_t i;
127   for(i= 0; i < Words->nwords; i++)
128   {
129     udm_rc_t rc;
130     char qbuf[256];
131     if (!Words->Word[i].coord.secno)
132       continue;
133 
134     if (UdmSQLDBMode(db) == UDM_SQLDBMODE_SINGLE)
135     {
136       sprintf(qbuf,"INSERT INTO dict (url_id,word,coord) VALUES(%d,'%s',%d)",
137               url_id,
138               Words->Word[i].word,
139               UDM_WRDCOORD(Words->Word[i].coord.pos,
140               Words->Word[i].coord.secno));
141       if (UDM_OK!= (rc= UdmDBSQLQuery(A, db, NULL, qbuf)))
142         return rc;
143     }
144   }
145   return UDM_OK;
146 }
147 
148 
149 static udm_rc_t
StoreWordsSingle(UDM_AGENT * Indexer,UDM_DB * db,UDM_DOCUMENT * Doc)150 StoreWordsSingle(UDM_AGENT *Indexer, UDM_DB *db, UDM_DOCUMENT *Doc)
151 {
152   udm_rc_t rc= UDM_OK;
153   urlid_t  url_id = UdmVarListFindInt(&Doc->Sections, "ID", 0);
154   UDM_WORDLIST Words;
155 
156   UdmWordListInit(&Words);
157   if (UDM_OK != (rc= UdmPrepareWords(Indexer, Doc, &Words)))
158     goto ret;
159 
160   if (UDM_OK != (rc= UdmDeleteWordsFromURLSingle(Indexer, db, url_id)))
161     goto ret;
162 
163   if (UdmVarListFindInt(&Indexer->Conf->Vars, "SaveSectionSize", 1))
164   {
165     if (UDM_OK!= (rc= UdmWordListSaveSectionSize(&Words)))
166       goto ret;
167   }
168 
169   /*
170     Don't need to delete words here,
171     they are deleted in UdmStoreWords().
172   */
173 
174   /* Insert new words */
175   if (UdmSQLDBType(db) == UDM_DB_MYSQL)
176   {
177     rc= StoreWordsMySQL(Indexer, db, &Words, url_id);
178   }
179   else
180   {
181     rc= StoreWordsSingleGeneric(Indexer, db, &Words, url_id);
182   }
183 ret:
184   UdmWordListFree(&Words);
185   return rc;
186 }
187 
188 
189 static udm_rc_t
UdmFindWordSingleInternal(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args,UDM_URLCRDLIST * CoordList,const char * table,int join)190 UdmFindWordSingleInternal(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
191                           UDM_FINDWORD_ARGS *args,
192                           UDM_URLCRDLIST *CoordList,
193                           const char *table, int join)
194 {
195   char qbuf[4096];
196   UDM_SQLRES SQLRes;
197   size_t numrows, i, curnum;
198   udm_rc_t rc;
199 
200   if (*args->where)
201   {
202     udm_snprintf(qbuf, sizeof(qbuf) - 1,"\
203 SELECT %s.url_id,%s.coord FROM %s, url%s \
204 WHERE %s.%s AND url.rec_id=%s.url_id AND %s",
205     table, table, table, UdmSQLDBQueryFrom(Query),
206     table, args->cmparg, table, args->where);
207   }
208   else if (!join)
209   {
210     udm_snprintf(qbuf, sizeof(qbuf) - 1,
211       "SELECT url_id,coord FROM %s WHERE %s", table, args->cmparg);
212   }
213   else
214   {
215 
216     udm_snprintf(qbuf,sizeof(qbuf)-1,"\
217 SELECT url_id,coord FROM %s,url WHERE %s.%s AND url.rec_id=%s.url_id",
218        table, table, args->cmparg, table);
219   }
220 
221   if (UDM_OK != (rc= UdmDBSQLQuery(A, db, &SQLRes, qbuf)))
222     return rc;
223 
224   if ((numrows= UdmSQLNumRows(&SQLRes)))
225   {
226     CoordList->Coords= (UDM_URL_CRD*) UdmMalloc(numrows*sizeof(UDM_URL_CRD));
227     CoordList->acoords= numrows;
228   }
229 
230   /* Add new found word to the list */
231   for(curnum= 0, i= 0; i < numrows; i++)
232   {
233     uint4 coord= atoi(UdmSQLValue(&SQLRes, i, 1));
234     uint4 section= UDM_WRDSEC(coord);
235     uint4 weight= args->query_param.wf[section];
236 
237     if(weight && (!args->Word.Param.secno || args->Word.Param.secno == section))
238     {
239       UDM_URL_CRD *Coord= &CoordList->Coords[curnum];
240       Coord->urlid_coord.url_id= UDM_ATOI(UdmSQLValue(&SQLRes, i, 0));
241       Coord->urlid_coord.coord.pos= UDM_WRDPOS(coord);
242       Coord->urlid_coord.coord.secno= UDM_WRDSEC(coord);
243       Coord->num= args->Word.Param.order & 0xFF;
244       Coord->seclen= 0;
245       curnum++;
246     }
247   }
248   CoordList->ncoords= curnum;
249   UdmSQLFree(&SQLRes);
250   UdmURLCRDListSortByURLThenSecnoThenPos(CoordList);
251   return rc;
252 }
253 
254 
255 static udm_rc_t
UdmFindWordSingle(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)256 UdmFindWordSingle(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
257                   UDM_FINDWORD_ARGS *args)
258 {
259   udm_rc_t rc;
260   UDM_URLCRDLIST CoordList;
261 
262   bzero(&CoordList, sizeof(CoordList));
263 
264   if (UDM_OK != (rc= UdmFindWordSingleInternal(A, db, Query, args,
265                                                &CoordList, "dict", 0)))
266     return rc;
267 
268   if (args->query_param.SaveSectionSize && CoordList.ncoords)
269   {
270     UDM_URL_CRD *Crd= CoordList.Coords;
271     UDM_URL_CRD *End= CoordList.Coords + CoordList.ncoords;
272     UDM_URL_CRD *To= CoordList.Coords;
273     UDM_URL_CRD *Prev= CoordList.Coords;
274     urlid_t prev_url_id;
275     udm_secno_t prev_secno;
276 
277     for (prev_url_id= Crd->urlid_coord.url_id,
278          prev_secno= Crd->urlid_coord.coord.secno ;
279          Crd < End; Crd++)
280     {
281       UDM_URL_CRD *Next= Crd + 1;
282       if (Next == End ||
283           Next->urlid_coord.url_id != prev_url_id ||
284           Next->urlid_coord.coord.secno  != prev_secno)
285       {
286         for ( ; Prev < To; Prev++)
287         {
288           Prev->seclen= Crd->urlid_coord.coord.pos;
289         }
290         if (Next < End)
291         {
292           prev_url_id= Next->urlid_coord.url_id;
293           prev_secno= Next->urlid_coord.coord.secno;
294           Prev= To;
295         }
296       }
297       else
298         *To++= *Crd;
299     }
300     CoordList.ncoords= To - CoordList.Coords;
301   }
302   if (args->urls.nurls)
303     UdmApplyFastLimit(&CoordList, &args->urls);
304   if (CoordList.ncoords)
305   {
306     args->Word.Param.count= CoordList.ncoords;
307     UdmURLCRDListListAddWithSort2(&args->SearchSectionListList,
308                                   &Query->Res.WWList,
309                                   &args->Word, &CoordList);
310   }
311   else
312   {
313     UdmFree(CoordList.Coords);
314   }
315   return(UDM_OK);
316 }
317 
318 
319 static udm_rc_t
UdmTruncateDictSingle(UDM_AGENT * A,UDM_DB * db)320 UdmTruncateDictSingle(UDM_AGENT *A, UDM_DB *db)
321 {
322   return UdmDBSQLTableTruncateOrDelete(A, db, "dict");
323 }
324 
325 
326 static udm_rc_t
UdmWordStatCreateSingle(UDM_AGENT * A,UDM_DB * db)327 UdmWordStatCreateSingle(UDM_AGENT *A, UDM_DB *db)
328 {
329   char qbuf[128];
330   sprintf(qbuf, "SELECT word, count(*) FROM dict GROUP BY word");
331   return UdmWordStatQuery(A, db, qbuf);
332 }
333 
334 
335 static udm_rc_t
UdmDumpWordInfoOneDocSingle(UDM_AGENT * A,UDM_DB * db,UDM_DOCUMENT * Doc)336 UdmDumpWordInfoOneDocSingle(UDM_AGENT *A, UDM_DB *db, UDM_DOCUMENT *Doc)
337 {
338   return UDM_OK;
339 }
340 
341 
342 static udm_rc_t
UdmSingleInitSearch(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,UDM_FINDWORD_ARGS * args)343 UdmSingleInitSearch(UDM_AGENT *A, UDM_DB *db, UDM_QUERY *Query,
344                     UDM_FINDWORD_ARGS *args)
345 {
346   return UDM_OK;
347 }
348 
349 
350 static udm_rc_t
UdmQueryActionSingle(UDM_AGENT * A,UDM_DB * db,UDM_QUERY * Query,udm_querycmd_t cmd)351 UdmQueryActionSingle(UDM_AGENT *A, UDM_DB *db,
352                      UDM_QUERY *Query, udm_querycmd_t cmd)
353 {
354   switch (cmd)
355   {
356     case UDM_QUERYCMD_CLEAR:    return UdmTruncateDictSingle(A, db);
357     case UDM_QUERYCMD_WORDSTAT: return UdmWordStatCreateSingle(A, db);
358     case UDM_QUERYCMD_INDEX:
359       return udm_dbmode_handler_blob.QueryAction(A, db, Query, cmd);
360     default: break;
361   }
362   return UDM_NOTARGET;
363 }
364 
365 
366 const UDM_DBMODE_HANDLER udm_dbmode_handler_single=
367 {
368   "single",
369   StoreWordsSingle,
370   UdmQueryActionSingle,
371   UdmDeleteWordsFromURLSingle,
372   UdmFindWordSingle,
373   UdmDumpWordInfoOneDocSingle,
374   UdmSingleInitSearch,
375 };
376 
377 
378 #endif /* HAVE_SQL */
379