1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <string.h>
21 
22 #include "udm_common.h"
23 #include "udm_log.h"
24 #include "udm_utils.h"
25 #include "udm_searchtool.h"
26 #include "udm_coords.h"
27 #include "udm_db.h"
28 #include "udm_hash.h"
29 #include "udm_url.h"
30 
31 /*******************  GroupBySite  *********************************/
32 
33 
34 void
UdmURLDataApplySiteRank(UDM_AGENT * A,UDM_URLDATALIST * DataList,int is_aggregation_point)35 UdmURLDataApplySiteRank(UDM_AGENT *A, UDM_URLDATALIST *DataList,
36                         int is_aggregation_point)
37 {
38   size_t i, in_site_rank;
39   urlid_t prev_site_id;
40   for (i= 0, prev_site_id= in_site_rank= 1; i < DataList->nitems; i++)
41   {
42     UDM_URLDATA *D= &DataList->Item[i];
43     if (prev_site_id != D->site_id)
44     {
45       in_site_rank= 0;
46       prev_site_id= D->site_id;
47     }
48     in_site_rank++;
49     if (is_aggregation_point)
50     {
51       if (in_site_rank > 1)
52       {
53         /* Keep machine number in the low byte */
54         D->score= (D->score & 0xFF) + ((D->score / in_site_rank) & 0x7FFFFF00);
55       }
56     }
57     else
58       D->score/= in_site_rank;
59   }
60 }
61 
62 
63 udm_rc_t
UdmURLDataListGroupBySiteUsingSort(UDM_AGENT * A,UDM_URLDATALIST * R,UDM_DB * db)64 UdmURLDataListGroupBySiteUsingSort(UDM_AGENT *A,
65                                    UDM_URLDATALIST *R,
66                                    UDM_DB *db)
67 {
68   udm_timer_t ticks;
69   UDM_URLDATA *Dat, *End;
70 
71   /* Initialize per_site */
72   for (Dat= R->Item, End= R->Item + R->nitems;
73        Dat < End; Dat++)
74     Dat->per_site= 1;
75 
76   UdmLog(A,UDM_LOG_DEBUG,"Start sort by site_id %d docs", (int) R->nitems);
77   ticks=UdmStartTimer();
78   UdmURLDataSortBySite(R);
79   UdmLog(A,UDM_LOG_DEBUG,"Stop sort by site_id:\t%.2f",UdmStopTimer(&ticks));
80 
81   UdmLog(A,UDM_LOG_DEBUG,"Start group by site_id %d docs", (int) R->nitems);
82   ticks=UdmStartTimer();
83   UdmURLDataGroupBySite(R);
84   UdmLog(A,UDM_LOG_DEBUG,"Stop group by site_id:\t%.2f", UdmStopTimer(&ticks));
85 
86   return UDM_OK;
87 }
88 
89 
UdmURLDataGroupBySite(UDM_URLDATALIST * List)90 void UdmURLDataGroupBySite(UDM_URLDATALIST *List)
91 {
92   UDM_URLDATA *src= List->Item + 1;
93   UDM_URLDATA *dst= List->Item;
94   UDM_URLDATA *end= List->Item + List->nitems;
95   uint4 count;
96 
97   if (!List->nitems)
98     return;
99 
100   for(count= List->Item[0].per_site; src < end; src++)
101   {
102     UDM_ASSERT(src->url != dst->url || src->url == NULL);
103     /* Group by site_id */
104     if(dst->site_id == src->site_id)
105     {
106       count+= src->per_site;
107       if (dst->score > src->score)
108       {
109         UdmURLDataFree(src);
110         continue;
111       }
112       else if (dst->score == src->score)
113       {
114         if (dst->pop_rank > src->pop_rank)
115         {
116           UdmURLDataFree(src);
117           continue;
118         }
119         else if (dst->pop_rank == src->pop_rank)
120         {
121           if (dst->url_id < src->url_id)
122           {
123             UdmURLDataFree(src);
124             continue;
125           }
126         }
127       }
128       UdmURLDataFree(dst);
129       dst->url_id=        src->url_id;
130       dst->score=         src->score;
131       dst->last_mod_time= src->last_mod_time;
132       dst->pop_rank=      src->pop_rank;
133       dst->url=           src->url;
134       dst->section=       src->section;
135     }
136     else
137     {
138       /* Next site */
139       dst->per_site= count;
140       *++dst= *src;
141       count= src->per_site;
142     }
143   }
144   dst->per_site= count;
145   List->nitems= dst - List->Item + 1;
146   return;
147 }
148 
149 
150 typedef struct udm_str_with_url_id_st
151 {
152   char *str;
153   urlid_t url_id;
154 } UDM_STR_WITH_URL_ID;
155 
156 typedef struct udm_url_with_url_id_list
157 {
158   size_t nitems;
159   UDM_STR_WITH_URL_ID *Item;
160 } UDM_STR_WITH_URL_ID_LIST;
161 
162 
163 static void
UdmStrWithURLIdListInit(UDM_STR_WITH_URL_ID_LIST * List)164 UdmStrWithURLIdListInit(UDM_STR_WITH_URL_ID_LIST *List)
165 {
166   bzero((void*) List, sizeof(*List));
167 }
168 
169 static udm_rc_t
UdmStrWithURLIdListAlloc(UDM_STR_WITH_URL_ID_LIST * List,size_t n)170 UdmStrWithURLIdListAlloc(UDM_STR_WITH_URL_ID_LIST *List, size_t n)
171 {
172   if (!((List->Item= (UDM_STR_WITH_URL_ID*) UdmMalloc(n * sizeof(UDM_STR_WITH_URL_ID)))))
173     return UDM_ERROR;
174   return UDM_OK;
175 }
176 
177 static void
UdmStrWithURLIdListFree(UDM_STR_WITH_URL_ID_LIST * List)178 UdmStrWithURLIdListFree(UDM_STR_WITH_URL_ID_LIST *List)
179 {
180   size_t i;
181   for (i= 0; i < List->nitems; i++)
182     UdmFree(List->Item[i].str);
183   UdmFree(List->Item);
184 }
185 
186 static int
UdmStrWithURLIdCmp(const UDM_STR_WITH_URL_ID * a,const UDM_STR_WITH_URL_ID * b)187 UdmStrWithURLIdCmp(const UDM_STR_WITH_URL_ID *a, const UDM_STR_WITH_URL_ID *b)
188 {
189   int rc= strcmp(a->str, b->str);
190   if (rc)
191     return rc;
192   if (a->url_id < b->url_id)
193     return -1;
194   if (a->url_id > b->url_id)
195     return 1;
196   return 0;
197 }
198 
199 static void
UdmStrWithURLIdListSort(UDM_STR_WITH_URL_ID_LIST * List)200 UdmStrWithURLIdListSort(UDM_STR_WITH_URL_ID_LIST *List)
201 {
202   UdmSort(List->Item, List->nitems, sizeof(UDM_STR_WITH_URL_ID),
203           (udm_qsort_cmp) UdmStrWithURLIdCmp);
204 }
205 
206 static udm_rc_t
UdmStrWithURLIdListInitFromURLDataList(UDM_STR_WITH_URL_ID_LIST * Dst,UDM_URLDATALIST * Src)207 UdmStrWithURLIdListInitFromURLDataList(UDM_STR_WITH_URL_ID_LIST *Dst,
208                                        UDM_URLDATALIST *Src)
209 {
210   size_t i;
211   UdmStrWithURLIdListInit(Dst);
212   if (UDM_OK != UdmStrWithURLIdListAlloc(Dst, Src->nitems))
213     return UDM_ERROR;
214   for (i= 0; i < Src->nitems; i++)
215   {
216     UDM_URLDATA *Item= &Src->Item[i];
217     UDM_STR_WITH_URL_ID *S= &Dst->Item[i];
218     size_t sitelen= UdmAbsoluteURLSiteLength(Item->url);
219     S->url_id= Item->url_id;
220     if (!(S->str= udm_strndup(Item->url, sitelen)))
221       return UDM_ERROR;
222   }
223   Dst->nitems= Src->nitems;
224   return UDM_OK;
225 }
226 
227 
228 static size_t
UdmStrWithURLIdListSameSite(UDM_STR_WITH_URL_ID_LIST * List,size_t offs)229 UdmStrWithURLIdListSameSite(UDM_STR_WITH_URL_ID_LIST *List, size_t offs)
230 {
231   size_t i;
232   const char *first= List->Item[offs].str;
233   for (i= offs; i < List->nitems; i++)
234   {
235     if (strcmp(first, List->Item[i].str))
236       break;
237   }
238   return i - offs;
239 }
240 
241 
242 static udm_rc_t
UdmStrWithURLIdListToDSTR(UDM_STR_WITH_URL_ID_LIST * List,UDM_DSTR * dstr)243 UdmStrWithURLIdListToDSTR(UDM_STR_WITH_URL_ID_LIST *List, UDM_DSTR *dstr)
244 {
245   size_t i;
246   for (i= 0; i < List->nitems; )
247   {
248     UDM_STR_WITH_URL_ID *Item= &List->Item[i];
249     size_t sitelen= strlen(Item->str);
250     size_t j, n= UdmStrWithURLIdListSameSite(List, i);
251     if (!n)
252       break;
253     /*fprintf(stderr, "[%d][%d]curlen=%d URL=%s\n", Item->url_id, (int) n, (int) dstr->size_data, Item->str);*/
254     if (UDM_OK != UdmDSTRAppendCoord(dstr, sitelen) ||
255         !UdmDSTRAppend(dstr, Item->str, sitelen) ||
256         UDM_OK != UdmDSTRAppendCoord(dstr, n))
257       return UDM_ERROR;
258     for (j= 0; j < n; j++)
259     {
260       UDM_STR_WITH_URL_ID *Item2= &List->Item[i +  j];
261       size_t delta= (j == 0) ? Item2->url_id : (Item2->url_id - Item2[-1].url_id);
262       if (UDM_OK != UdmDSTRAppendCoord(dstr, delta))
263         return UDM_ERROR;
264       /*fprintf(stderr, "  [%d][%d][%d]\n", Item2->url_id, (int) delta, (int) dstr->size_data);*/
265     }
266     i+= n;
267   }
268   return UDM_OK;
269 }
270 
271 
272 /*
273   Pack '##site' record from URLDataList.
274   dstr must be initialized.
275 */
276 udm_rc_t
UdmURLDataListPackSite(UDM_URLDATALIST * List,UDM_DSTR * dstr)277 UdmURLDataListPackSite(UDM_URLDATALIST *List, UDM_DSTR *dstr)
278 {
279   udm_rc_t rc= UDM_OK;
280   UDM_STR_WITH_URL_ID_LIST Site;
281   bzero((void*) &Site, sizeof(Site));
282   if (UDM_OK != (rc= UdmStrWithURLIdListInitFromURLDataList(&Site, List)))
283    return UDM_ERROR;
284   if (Site.nitems)
285     UdmStrWithURLIdListSort(&Site);
286   rc= UdmStrWithURLIdListToDSTR(&Site, dstr);
287   UdmStrWithURLIdListFree(&Site);
288   return UDM_OK;
289 }
290 
291 
292 static inline size_t
udm_coord_get_s(size_t * coord,const char * str,const char * end)293 udm_coord_get_s(size_t *coord, const char *str, const char *end)
294 {
295   return udm_coord_get(coord, (const unsigned char *) str,
296                               (const unsigned char *) end);
297 }
298 
299 
300 static int
UdmURLDataComp(const UDM_URLDATA * a,const UDM_URLDATA * b)301 UdmURLDataComp(const UDM_URLDATA *a, const UDM_URLDATA *b)
302 {
303   if (a->url_id < b->url_id)
304     return -1;
305   if (a->url_id > b->url_id)
306     return 1;
307   return 0;
308 }
309 
310 
311 udm_rc_t
UdmURLDataListUnpackSite(UDM_AGENT * A,UDM_URLDATALIST * List,const UDM_CONST_STR * str_arg)312 UdmURLDataListUnpackSite(UDM_AGENT *A, UDM_URLDATALIST *List,
313                          const UDM_CONST_STR *str_arg)
314 {
315   const char *str= str_arg->str;
316   const char *end= str + str_arg->length;
317   UDM_URLDATA key;
318 
319   bzero((void*) &key, sizeof(key));
320   UDM_ASSERT(List->nitems);
321 
322   for ( ; str < end ; )
323   {
324     UDM_CONST_STR site;
325     size_t coord, nbytes, nurls;
326     /* Get site name length */
327     str+= (nbytes= udm_coord_get_s(&coord, str, end));
328     if (!nbytes || !coord || (((size_t)(end - str)) < coord))
329       goto err;
330     UdmConstStrSet(&site, str, coord); /* Set site name */
331     str+= coord;
332     /* Get number of URLs in this site */
333     str+= (nbytes= udm_coord_get_s(&nurls, str, end));
334     if (!nbytes || !nurls || (((size_t)(end - str)) < nurls))
335       goto err;
336     /* Get all URL ids */
337     for (key.url_id= 0; nurls; nurls--)
338     {
339       UDM_URLDATA *d;
340       str+= (nbytes= udm_coord_get_s(&coord, str, end));
341       if (!nbytes || !coord)
342         goto err;
343       key.url_id+= coord;
344       if ((d= (UDM_URLDATA*) UdmBSearch(&key, List->Item, List->nitems,
345                                         sizeof(UDM_URLDATA),
346                                         (udm_qsort_cmp) UdmURLDataComp)))
347       {
348         /* TODO34: change to in-memory hash to resolve collisions */
349         d->site_id= UdmHash32(site.str, site.length);
350       }
351     }
352   }
353   return UDM_OK;
354 err:
355   UdmLog(A, UDM_LOG_ERROR, "Bad format for '##site' string");
356   return UDM_ERROR;
357 }
358