1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #ifndef __INDEXCACHE_H__
19 #define __INDEXCACHE_H__
20 
/* Upper limit on indexer threads; the code enforcing it lives outside this header. */
#define UDM_INDEXER_THREADS_MAX        64

/* Number of parts used when sizing a cache part (see the macro below). */
#define INVERTED_INDEX_CACHE_PARTS     32

/*
  Share of a 4 * 1024 * 1024 total that falls to one part when the total is
  split evenly between INVERTED_INDEX_CACHE_PARTS parts (131072 for 32 parts).
  NOTE(review): the unit (presumably bytes) is not stated in this header -
  confirm against the code that uses the macro.
*/
#define INVERTED_INDEX_CACHE_PART_SIZE ((4 * 1024 * 1024) / (INVERTED_INDEX_CACHE_PARTS))

/*
  Forward declaration, so that structures declared before the full cache
  definition can already hold a pointer to the cache.
*/
struct udm_inverted_index_cache_st;
26 
27 typedef struct
28 {
29   udm_timer_t load;
30   udm_timer_t sort;
31   udm_timer_t pack;
32   udm_timer_t send;
33   udm_timer_t send_multi;
34   udm_timer_t conv;
35   udm_timer_t unpack_cached_copy;
36   udm_timer_t parse;
37   udm_timer_t prepare_words;
38   udm_timer_t sort_wordlist;
39   udm_timer_t pairs;
40   unsigned long long bytes_loaded;
41 } UDM_INVERTED_INDEX_STATS;
42 
43 
44 typedef struct
45 {
46   UDM_UNIDATA *unidata;
47   UDM_ENV *Env;
48   int cnvflags; /* e.g. for StripAccents */
49   int pair_limit;
50   int save_section_size;
51   int aggregate_section_flags;
52 } UDM_INVERTED_INDEX_CACHE_PARAM;
53 
54 typedef struct
55 {
56   UDM_CONV cnv;
57   urlid_t url_id;
58   struct udm_inverted_index_cache_st *cache;
59 } UDM_CONSTWORD_HASH_DATA;
60 
61 
62 typedef struct
63 {
64   char *ptr;                  /* 8 */ /* Word followed by encoded positions */
65   urlid_t url_id;             /* 4 */
66   uint4 length:24;            /* 3 */
67   udm_secno_t secno;          /* 1 */
68 } UDM_INVERTED_INDEX_CACHE_ITEM;
69 
70 
71 typedef struct
72 {
73   size_t nitems;                       /* 8 */
74   size_t mitems;                       /* 8 */
75   UDM_INVERTED_INDEX_CACHE_ITEM *Item; /* 8 */
76 } UDM_INVERTED_INDEX_CACHE_PART;
77 
78 
79 typedef struct
80 {
81   char *str;                      /*  8 */
82   udmhash32_t crc;                /*  4 */
83   urlid_t last_url_id;            /*  4 */
84   udm_pos_t last_url_id_count:23; /*    */
85   int is_stopword:1;              /*    */
86   unsigned int length:8;          /*    */
87 } UDM_INVERTED_INDEX_WORD;
88 
89 typedef struct
90 {
91   size_t nitems;
92   size_t mitems;
93   UDM_INVERTED_INDEX_WORD *Item;
94 } UDM_INVERTED_INDEX_WORD_LIST;
95 
96 
97 typedef struct
98 {
99   UDM_MEMROOT root;
100   UDM_HASH hash;
101   UDM_INVERTED_INDEX_WORD_LIST list;
102 } UDM_INVERTED_INDEX_WORD_DICTIONARY;
103 
104 
105 typedef struct
106 {
107   uint4 word_id;   /* 4 */
108   UDM_COORD coord; /* 4 */
109 } UDM_INVERTED_INDEX_COORD;
110 
111 
112 typedef struct
113 {
114   size_t nitems;
115   size_t mitems;
116   UDM_INVERTED_INDEX_COORD *Item;
117 } UDM_INVERTED_INDEX_COORD_LIST;
118 
119 
120 typedef struct udm_inverted_index_cache_st
121 {
122   size_t nitems;
123   size_t mitems;
124   UDM_INVERTED_INDEX_CACHE_PART *Item;
125   UDM_MEMROOT coord_root;
126   UDM_INVERTED_INDEX_CACHE_PARAM param;
127   UDM_INVERTED_INDEX_WORD_DICTIONARY Words;
128   UDM_INVERTED_INDEX_STATS Stats;
129 } UDM_INVERTED_INDEX_CACHE;
130 
131 
132 void UdmInvertedIndexCachePartSort(UDM_INVERTED_INDEX_CACHE_PART *part);
133 void UdmInvertedIndexCachePartInit(UDM_INVERTED_INDEX_CACHE_PART *part);
134 void UdmInvertedIndexCachePartFree(UDM_INVERTED_INDEX_CACHE_PART *part);
135 udm_rc_t UdmInvertedIndexCacheAdd(UDM_AGENT *A,
136                                   UDM_CONSTWORD_HASH_DATA *data,
137                                   UDM_INVERTED_INDEX_CACHE *cache,
138                                   const UDM_WORD *W, size_t ncoords);
139 udm_rc_t UdmInvertedIndexCacheAllocParts(UDM_AGENT *A,
140                                          UDM_INVERTED_INDEX_CACHE *cache,
141                                          size_t n);
142 
143 void UdmInvertedIndexCacheInit(UDM_INVERTED_INDEX_CACHE *L, UDM_ENV *Env);
144 void UdmInvertedIndexCacheReset(UDM_INVERTED_INDEX_CACHE *L);
145 void UdmInvertedIndexCacheFree(UDM_INVERTED_INDEX_CACHE *L);
146 udm_rc_t UdmInvertedIndexCacheAddPart(UDM_AGENT *A, UDM_INVERTED_INDEX_CACHE *cache);
147 
148 udm_rc_t UdmConstWordListToInvertedIndexCoordList(UDM_INVERTED_INDEX_CACHE *cache,
149                                              UDM_INVERTED_INDEX_COORD_LIST *CL,
150                                              UDM_CONSTWORDLIST *CWL);
151 
152 udm_rc_t UdmInvertedIndexCoordListInit(UDM_INVERTED_INDEX_COORD_LIST *L, size_t mitems);
153 void UdmInvertedIndexCoordListFree(UDM_INVERTED_INDEX_COORD_LIST *L);
154 void UdmInvertedIndexCoordListSort(UDM_INVERTED_INDEX_COORD_LIST *L);
155 
156 udm_rc_t UdmInvertedIndexCoordList2InvertedIndexCache(UDM_AGENT *A,
157                                                      UDM_INVERTED_INDEX_COORD_LIST *CL,
158                                                      UDM_CONSTWORDLIST *CWLWithPos,
159                                                      UDM_CONSTWORD_HASH_DATA *data,
160                                                      UDM_INVERTED_INDEX_CACHE *L);
161 size_t UdmInvertedIndexCacheEstimateUsedMemory(const UDM_INVERTED_INDEX_CACHE *);
162 
163 #endif
164