1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <string.h>
21
22 #include "udm_common.h"
23 #include "udm_log.h"
24 #include "udm_utils.h"
25 #include "udm_searchtool.h"
26 #include "udm_coords.h"
27 #include "udm_db.h"
28 #include "udm_hash.h"
29 #include "udm_url.h"
30
31 /******************* GroupBySite *********************************/
32
33
34 void
UdmURLDataApplySiteRank(UDM_AGENT * A,UDM_URLDATALIST * DataList,int is_aggregation_point)35 UdmURLDataApplySiteRank(UDM_AGENT *A, UDM_URLDATALIST *DataList,
36 int is_aggregation_point)
37 {
38 size_t i, in_site_rank;
39 urlid_t prev_site_id;
40 for (i= 0, prev_site_id= in_site_rank= 1; i < DataList->nitems; i++)
41 {
42 UDM_URLDATA *D= &DataList->Item[i];
43 if (prev_site_id != D->site_id)
44 {
45 in_site_rank= 0;
46 prev_site_id= D->site_id;
47 }
48 in_site_rank++;
49 if (is_aggregation_point)
50 {
51 if (in_site_rank > 1)
52 {
53 /* Keep machine number in the low byte */
54 D->score= (D->score & 0xFF) + ((D->score / in_site_rank) & 0x7FFFFF00);
55 }
56 }
57 else
58 D->score/= in_site_rank;
59 }
60 }
61
62
63 udm_rc_t
UdmURLDataListGroupBySiteUsingSort(UDM_AGENT * A,UDM_URLDATALIST * R,UDM_DB * db)64 UdmURLDataListGroupBySiteUsingSort(UDM_AGENT *A,
65 UDM_URLDATALIST *R,
66 UDM_DB *db)
67 {
68 udm_timer_t ticks;
69 UDM_URLDATA *Dat, *End;
70
71 /* Initialize per_site */
72 for (Dat= R->Item, End= R->Item + R->nitems;
73 Dat < End; Dat++)
74 Dat->per_site= 1;
75
76 UdmLog(A,UDM_LOG_DEBUG,"Start sort by site_id %d docs", (int) R->nitems);
77 ticks=UdmStartTimer();
78 UdmURLDataSortBySite(R);
79 UdmLog(A,UDM_LOG_DEBUG,"Stop sort by site_id:\t%.2f",UdmStopTimer(&ticks));
80
81 UdmLog(A,UDM_LOG_DEBUG,"Start group by site_id %d docs", (int) R->nitems);
82 ticks=UdmStartTimer();
83 UdmURLDataGroupBySite(R);
84 UdmLog(A,UDM_LOG_DEBUG,"Stop group by site_id:\t%.2f", UdmStopTimer(&ticks));
85
86 return UDM_OK;
87 }
88
89
UdmURLDataGroupBySite(UDM_URLDATALIST * List)90 void UdmURLDataGroupBySite(UDM_URLDATALIST *List)
91 {
92 UDM_URLDATA *src= List->Item + 1;
93 UDM_URLDATA *dst= List->Item;
94 UDM_URLDATA *end= List->Item + List->nitems;
95 uint4 count;
96
97 if (!List->nitems)
98 return;
99
100 for(count= List->Item[0].per_site; src < end; src++)
101 {
102 UDM_ASSERT(src->url != dst->url || src->url == NULL);
103 /* Group by site_id */
104 if(dst->site_id == src->site_id)
105 {
106 count+= src->per_site;
107 if (dst->score > src->score)
108 {
109 UdmURLDataFree(src);
110 continue;
111 }
112 else if (dst->score == src->score)
113 {
114 if (dst->pop_rank > src->pop_rank)
115 {
116 UdmURLDataFree(src);
117 continue;
118 }
119 else if (dst->pop_rank == src->pop_rank)
120 {
121 if (dst->url_id < src->url_id)
122 {
123 UdmURLDataFree(src);
124 continue;
125 }
126 }
127 }
128 UdmURLDataFree(dst);
129 dst->url_id= src->url_id;
130 dst->score= src->score;
131 dst->last_mod_time= src->last_mod_time;
132 dst->pop_rank= src->pop_rank;
133 dst->url= src->url;
134 dst->section= src->section;
135 }
136 else
137 {
138 /* Next site */
139 dst->per_site= count;
140 *++dst= *src;
141 count= src->per_site;
142 }
143 }
144 dst->per_site= count;
145 List->nitems= dst - List->Item + 1;
146 return;
147 }
148
149
150 typedef struct udm_str_with_url_id_st
151 {
152 char *str;
153 urlid_t url_id;
154 } UDM_STR_WITH_URL_ID;
155
156 typedef struct udm_url_with_url_id_list
157 {
158 size_t nitems;
159 UDM_STR_WITH_URL_ID *Item;
160 } UDM_STR_WITH_URL_ID_LIST;
161
162
163 static void
UdmStrWithURLIdListInit(UDM_STR_WITH_URL_ID_LIST * List)164 UdmStrWithURLIdListInit(UDM_STR_WITH_URL_ID_LIST *List)
165 {
166 bzero((void*) List, sizeof(*List));
167 }
168
169 static udm_rc_t
UdmStrWithURLIdListAlloc(UDM_STR_WITH_URL_ID_LIST * List,size_t n)170 UdmStrWithURLIdListAlloc(UDM_STR_WITH_URL_ID_LIST *List, size_t n)
171 {
172 if (!((List->Item= (UDM_STR_WITH_URL_ID*) UdmMalloc(n * sizeof(UDM_STR_WITH_URL_ID)))))
173 return UDM_ERROR;
174 return UDM_OK;
175 }
176
177 static void
UdmStrWithURLIdListFree(UDM_STR_WITH_URL_ID_LIST * List)178 UdmStrWithURLIdListFree(UDM_STR_WITH_URL_ID_LIST *List)
179 {
180 size_t i;
181 for (i= 0; i < List->nitems; i++)
182 UdmFree(List->Item[i].str);
183 UdmFree(List->Item);
184 }
185
186 static int
UdmStrWithURLIdCmp(const UDM_STR_WITH_URL_ID * a,const UDM_STR_WITH_URL_ID * b)187 UdmStrWithURLIdCmp(const UDM_STR_WITH_URL_ID *a, const UDM_STR_WITH_URL_ID *b)
188 {
189 int rc= strcmp(a->str, b->str);
190 if (rc)
191 return rc;
192 if (a->url_id < b->url_id)
193 return -1;
194 if (a->url_id > b->url_id)
195 return 1;
196 return 0;
197 }
198
199 static void
UdmStrWithURLIdListSort(UDM_STR_WITH_URL_ID_LIST * List)200 UdmStrWithURLIdListSort(UDM_STR_WITH_URL_ID_LIST *List)
201 {
202 UdmSort(List->Item, List->nitems, sizeof(UDM_STR_WITH_URL_ID),
203 (udm_qsort_cmp) UdmStrWithURLIdCmp);
204 }
205
206 static udm_rc_t
UdmStrWithURLIdListInitFromURLDataList(UDM_STR_WITH_URL_ID_LIST * Dst,UDM_URLDATALIST * Src)207 UdmStrWithURLIdListInitFromURLDataList(UDM_STR_WITH_URL_ID_LIST *Dst,
208 UDM_URLDATALIST *Src)
209 {
210 size_t i;
211 UdmStrWithURLIdListInit(Dst);
212 if (UDM_OK != UdmStrWithURLIdListAlloc(Dst, Src->nitems))
213 return UDM_ERROR;
214 for (i= 0; i < Src->nitems; i++)
215 {
216 UDM_URLDATA *Item= &Src->Item[i];
217 UDM_STR_WITH_URL_ID *S= &Dst->Item[i];
218 size_t sitelen= UdmAbsoluteURLSiteLength(Item->url);
219 S->url_id= Item->url_id;
220 if (!(S->str= udm_strndup(Item->url, sitelen)))
221 return UDM_ERROR;
222 }
223 Dst->nitems= Src->nitems;
224 return UDM_OK;
225 }
226
227
228 static size_t
UdmStrWithURLIdListSameSite(UDM_STR_WITH_URL_ID_LIST * List,size_t offs)229 UdmStrWithURLIdListSameSite(UDM_STR_WITH_URL_ID_LIST *List, size_t offs)
230 {
231 size_t i;
232 const char *first= List->Item[offs].str;
233 for (i= offs; i < List->nitems; i++)
234 {
235 if (strcmp(first, List->Item[i].str))
236 break;
237 }
238 return i - offs;
239 }
240
241
242 static udm_rc_t
UdmStrWithURLIdListToDSTR(UDM_STR_WITH_URL_ID_LIST * List,UDM_DSTR * dstr)243 UdmStrWithURLIdListToDSTR(UDM_STR_WITH_URL_ID_LIST *List, UDM_DSTR *dstr)
244 {
245 size_t i;
246 for (i= 0; i < List->nitems; )
247 {
248 UDM_STR_WITH_URL_ID *Item= &List->Item[i];
249 size_t sitelen= strlen(Item->str);
250 size_t j, n= UdmStrWithURLIdListSameSite(List, i);
251 if (!n)
252 break;
253 /*fprintf(stderr, "[%d][%d]curlen=%d URL=%s\n", Item->url_id, (int) n, (int) dstr->size_data, Item->str);*/
254 if (UDM_OK != UdmDSTRAppendCoord(dstr, sitelen) ||
255 !UdmDSTRAppend(dstr, Item->str, sitelen) ||
256 UDM_OK != UdmDSTRAppendCoord(dstr, n))
257 return UDM_ERROR;
258 for (j= 0; j < n; j++)
259 {
260 UDM_STR_WITH_URL_ID *Item2= &List->Item[i + j];
261 size_t delta= (j == 0) ? Item2->url_id : (Item2->url_id - Item2[-1].url_id);
262 if (UDM_OK != UdmDSTRAppendCoord(dstr, delta))
263 return UDM_ERROR;
264 /*fprintf(stderr, " [%d][%d][%d]\n", Item2->url_id, (int) delta, (int) dstr->size_data);*/
265 }
266 i+= n;
267 }
268 return UDM_OK;
269 }
270
271
272 /*
273 Pack '##site' record from URLDataList.
274 dstr must be initialized.
275 */
276 udm_rc_t
UdmURLDataListPackSite(UDM_URLDATALIST * List,UDM_DSTR * dstr)277 UdmURLDataListPackSite(UDM_URLDATALIST *List, UDM_DSTR *dstr)
278 {
279 udm_rc_t rc= UDM_OK;
280 UDM_STR_WITH_URL_ID_LIST Site;
281 bzero((void*) &Site, sizeof(Site));
282 if (UDM_OK != (rc= UdmStrWithURLIdListInitFromURLDataList(&Site, List)))
283 return UDM_ERROR;
284 if (Site.nitems)
285 UdmStrWithURLIdListSort(&Site);
286 rc= UdmStrWithURLIdListToDSTR(&Site, dstr);
287 UdmStrWithURLIdListFree(&Site);
288 return UDM_OK;
289 }
290
291
292 static inline size_t
udm_coord_get_s(size_t * coord,const char * str,const char * end)293 udm_coord_get_s(size_t *coord, const char *str, const char *end)
294 {
295 return udm_coord_get(coord, (const unsigned char *) str,
296 (const unsigned char *) end);
297 }
298
299
300 static int
UdmURLDataComp(const UDM_URLDATA * a,const UDM_URLDATA * b)301 UdmURLDataComp(const UDM_URLDATA *a, const UDM_URLDATA *b)
302 {
303 if (a->url_id < b->url_id)
304 return -1;
305 if (a->url_id > b->url_id)
306 return 1;
307 return 0;
308 }
309
310
311 udm_rc_t
UdmURLDataListUnpackSite(UDM_AGENT * A,UDM_URLDATALIST * List,const UDM_CONST_STR * str_arg)312 UdmURLDataListUnpackSite(UDM_AGENT *A, UDM_URLDATALIST *List,
313 const UDM_CONST_STR *str_arg)
314 {
315 const char *str= str_arg->str;
316 const char *end= str + str_arg->length;
317 UDM_URLDATA key;
318
319 bzero((void*) &key, sizeof(key));
320 UDM_ASSERT(List->nitems);
321
322 for ( ; str < end ; )
323 {
324 UDM_CONST_STR site;
325 size_t coord, nbytes, nurls;
326 /* Get site name length */
327 str+= (nbytes= udm_coord_get_s(&coord, str, end));
328 if (!nbytes || !coord || (((size_t)(end - str)) < coord))
329 goto err;
330 UdmConstStrSet(&site, str, coord); /* Set site name */
331 str+= coord;
332 /* Get number of URLs in this site */
333 str+= (nbytes= udm_coord_get_s(&nurls, str, end));
334 if (!nbytes || !nurls || (((size_t)(end - str)) < nurls))
335 goto err;
336 /* Get all URL ids */
337 for (key.url_id= 0; nurls; nurls--)
338 {
339 UDM_URLDATA *d;
340 str+= (nbytes= udm_coord_get_s(&coord, str, end));
341 if (!nbytes || !coord)
342 goto err;
343 key.url_id+= coord;
344 if ((d= (UDM_URLDATA*) UdmBSearch(&key, List->Item, List->nitems,
345 sizeof(UDM_URLDATA),
346 (udm_qsort_cmp) UdmURLDataComp)))
347 {
348 /* TODO34: change to in-memory hash to resolve collisions */
349 d->site_id= UdmHash32(site.str, site.length);
350 }
351 }
352 }
353 return UDM_OK;
354 err:
355 UdmLog(A, UDM_LOG_ERROR, "Bad format for '##site' string");
356 return UDM_ERROR;
357 }
358