1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <math.h>
23 
24 #include "udm_common.h"
25 #include "udm_utils.h"
26 #include "udm_log.h"
27 #include "udm_searchtool.h"
28 #include "udm_coords.h"
29 #include "udm_db.h"
30 
31 
32 typedef struct
33 {
34   urlid_t id;
35   int popularity;
36 } UDM_URLIPOP;
37 
38 
39 typedef struct
40 {
41   size_t nitems;
42   size_t mitems;
43   UDM_URLIPOP *Item;
44 } UDM_URLIPOPLIST;
45 
46 
47 static void
UdmURLIPopListInit(UDM_URLIPOPLIST * List)48 UdmURLIPopListInit(UDM_URLIPOPLIST *List)
49 {
50   bzero((void*) List, sizeof(*List));
51 }
52 
53 
54 static void
UdmURLIPopListFree(UDM_URLIPOPLIST * List)55 UdmURLIPopListFree(UDM_URLIPOPLIST *List)
56 {
57   UdmFree(List->Item);
58 }
59 
60 
61 static udm_rc_t
UdmURLIPopListAlloc(UDM_URLIPOPLIST * List,size_t mitems)62 UdmURLIPopListAlloc(UDM_URLIPOPLIST *List, size_t mitems)
63 {
64   if (!(List->Item= (UDM_URLIPOP *) UdmMalloc(mitems * sizeof(UDM_URLIPOP))))
65     return UDM_ERROR;
66   List->mitems= mitems;
67   return UDM_OK;
68 }
69 
70 
71 static void
UdmURLIPopListAdd(UDM_URLIPOPLIST * List,const UDM_URLIPOP * Item)72 UdmURLIPopListAdd(UDM_URLIPOPLIST *List, const UDM_URLIPOP *Item)
73 {
74   List->Item[List->nitems++]= *Item;
75 }
76 
77 
78 static int
cmp_urlipop(const UDM_URLIPOP * a,const UDM_URLIPOP * b)79 cmp_urlipop(const UDM_URLIPOP *a, const UDM_URLIPOP *b)
80 {
81   if (a->popularity < b->popularity)
82     return 1;
83   if (a->popularity > b->popularity)
84     return -1;
85   if (a->id < b->id)
86     return -1;
87   if (a->id > b->id)
88     return 1;
89   return 0;
90 }
91 
92 static void
UdmURLIPopListSort(UDM_URLIPOPLIST * List)93 UdmURLIPopListSort(UDM_URLIPOPLIST *List)
94 {
95   UdmSort(List->Item, List->nitems, sizeof(UDM_URLIPOP), (udm_qsort_cmp) cmp_urlipop);
96 }
97 
98 
99 static size_t
UdmURLIPopListCountSame(UDM_URLIPOPLIST * List,size_t offs)100 UdmURLIPopListCountSame(UDM_URLIPOPLIST *List, size_t offs)
101 {
102   size_t offs0= offs++;
103   for ( ; offs < List->nitems; offs++)
104   {
105     if (List->Item[offs0].popularity != List->Item[offs].popularity)
106       break;
107   }
108   return offs - offs0;
109 }
110 
111 
112 static udm_rc_t
UdmURLIPopListEncode(UDM_AGENT * A,UDM_URLIPOPLIST * List,UDM_DSTR * dstr)113 UdmURLIPopListEncode(UDM_AGENT *A, UDM_URLIPOPLIST *List, UDM_DSTR *dstr)
114 {
115   size_t i;
116   for (i= 0; i < List->nitems; )
117   {
118     size_t n= UdmURLIPopListCountSame(List, i);
119     urlid_t id;
120     if (!UdmDSTRAppendINT2BE(dstr, List->Item[i].popularity))
121       return UDM_ERROR;
122     if (UDM_OK != UdmDSTRAppendCoord(dstr, n))
123     {
124       UdmLog(A, UDM_LOG_ERROR,
125              "URLIPopListEncode: DSTRAppendCoord failed: count=%d",
126              (int) n);
127       return UDM_ERROR;
128     }
129     for (id= 0; n ; n--)
130     {
131       UDM_URLIPOP *Item= &List->Item[i++];
132       if (UDM_OK != UdmDSTRAppendCoord(dstr, Item->id - id))
133       {
134         UdmLog(A, UDM_LOG_ERROR,
135                "URLIPopListEncode: DSTRAppendCoord failed: delta=%d",
136                Item->id - id);
137         return UDM_ERROR;
138       }
139       id= Item->id;
140     }
141   }
142   return UDM_OK;
143 }
144 
145 
146 #define ERROR_BYTES_TO_SHOW 4
147 
148 udm_rc_t
UdmURLDataListUnpackPopularity(UDM_AGENT * A,UDM_URLDATALIST * URLDataList,UDM_CONST_STR * cstr)149 UdmURLDataListUnpackPopularity(UDM_AGENT *A, UDM_URLDATALIST *URLDataList,
150                                UDM_CONST_STR *cstr)
151 {
152   const char *str= cstr->str;
153   const char *end= cstr->str + cstr->length;
154   size_t nfound;
155   char hex[ERROR_BYTES_TO_SHOW + 1];
156   for (nfound= 0; str + 3 < end ; )
157   {
158     urlid_t id;
159     size_t ndocs, nbytes, i;
160     uint4 pop= (((uint4) (unsigned char) str[0]) << 8) + (unsigned char) str[1];
161     str+= 2;
162     str+= (nbytes= udm_coord_get(&ndocs, (const unsigned char *) str,
163                                          (const unsigned char *) end));
164     if (!nbytes)
165       goto err;
166     /*fprintf(stderr, "pop=%d ndocs=%d\n", (int) pop, (int) ndocs);*/
167     for (id= 0, i= 0; i < ndocs; i++)
168     {
169       UDM_URLDATA *data;
170       size_t delta;
171       str+= (nbytes= udm_coord_get(&delta, (const unsigned char *) str,
172                                            (const unsigned char *) end));
173       if (!nbytes)
174         goto err;
175       id+= delta;
176       if ((data= UdmURLDataListSearch(URLDataList, id)))
177       {
178         data->pop_rank= (double) pop / (double) 0xFFFF;
179         nfound++;
180       }
181       /*fprintf(stderr, "  delta=%d id=%d %p\n", (int) delta, (int) id, (void*)data);*/
182     }
183   }
184   UdmLog(A, UDM_LOG_DEBUG, "Found %d documents in '##pop' record", (int) nfound);
185   return UDM_OK;
186 err:
187   UdmHexEncode(hex, str, UDM_MIN(end - str, ERROR_BYTES_TO_SHOW));
188   UdmLog(A, UDM_LOG_ERROR, "Bad data format in '##pop' record (%s)", hex);
189   return UDM_ERROR;
190 }
191 
192 
193 static udm_rc_t
UdmURLIPopListInitFromURLDataList(UDM_URLIPOPLIST * IPopList,UDM_URLDATALIST * List)194 UdmURLIPopListInitFromURLDataList(UDM_URLIPOPLIST *IPopList,
195                                   UDM_URLDATALIST *List)
196 {
197   size_t i;
198   UdmURLIPopListInit(IPopList);
199   if (UDM_OK != UdmURLIPopListAlloc(IPopList, List->nitems))
200     return UDM_ERROR;
201   for (i= 0; i < List->nitems; i++)
202   {
203     UDM_URLDATA *Item= &List->Item[i];
204     UDM_URLIPOP tmp;
205     if ((tmp.popularity= (int) (Item->pop_rank * 0xFFFF)))
206     {
207       if (tmp.popularity > 0xFFFF)
208         tmp.popularity= 0xFFFF;
209       tmp.id= Item->url_id;
210       UdmURLIPopListAdd(IPopList, &tmp);
211     }
212   }
213   return UDM_OK;
214 }
215 
216 
217 udm_rc_t
UdmURLDataListPackPopularity(UDM_AGENT * A,UDM_URLDATALIST * List,UDM_DSTR * pop)218 UdmURLDataListPackPopularity(UDM_AGENT *A, UDM_URLDATALIST *List, UDM_DSTR *pop)
219 {
220   UDM_URLIPOPLIST IPopList;
221 
222   if (UDM_OK != UdmURLIPopListInitFromURLDataList(&IPopList, List))
223     return UDM_ERROR;
224 
225   if (IPopList.nitems)
226   {
227     UdmURLIPopListSort(&IPopList);
228     if (UDM_OK != UdmURLIPopListEncode(A, &IPopList, pop))
229       return UDM_ERROR;
230     /*
231     for (i= 0; i < IPopList.nitems; i++)
232     {
233       UDM_URLIPOP *Item= &IPopList.Item[i];
234       fprintf(stderr, "[%d]=%d %04X\n", Item->id, Item->popularity, Item->popularity);
235     }
236     */
237   }
238   UdmURLIPopListFree(&IPopList);
239   return UDM_OK;
240 }
241