1 /** @file
2 
3   A brief file description
4 
5   @section license License
6 
7   Licensed to the Apache Software Foundation (ASF) under one
8   or more contributor license agreements.  See the NOTICE file
9   distributed with this work for additional information
10   regarding copyright ownership.  The ASF licenses this file
11   to you under the Apache License, Version 2.0 (the
12   "License"); you may not use this file except in compliance
13   with the License.  You may obtain a copy of the License at
14 
15       http://www.apache.org/licenses/LICENSE-2.0
16 
17   Unless required by applicable law or agreed to in writing, software
18   distributed under the License is distributed on an "AS IS" BASIS,
19   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20   See the License for the specific language governing permissions and
21   limitations under the License.
22  */
23 
24 #pragma once
25 
26 #include "P_CacheHttp.h"
27 
28 struct Vol;
29 struct InterimCacheVol;
30 struct CacheVC;
31 
32 /*
33   Directory layout
34 */
35 
36 // Constants
37 
38 #define DIR_TAG_WIDTH 12
39 #define DIR_MASK_TAG(_t) ((_t) & ((1 << DIR_TAG_WIDTH) - 1))
40 #define SIZEOF_DIR 10
41 #define ESTIMATED_OBJECT_SIZE 8000
42 
43 #define MAX_DIR_SEGMENTS (32 * (1 << 16))
44 #define DIR_DEPTH 4
45 #define MAX_ENTRIES_PER_SEGMENT (1 << 16)
46 #define MAX_BUCKETS_PER_SEGMENT (MAX_ENTRIES_PER_SEGMENT / DIR_DEPTH)
47 #define DIR_SIZE_WIDTH 6
48 #define DIR_BLOCK_SIZES 4
49 #define DIR_BLOCK_SHIFT(_i) (3 * (_i))
50 #define DIR_BLOCK_SIZE(_i) (CACHE_BLOCK_SIZE << DIR_BLOCK_SHIFT(_i))
51 #define DIR_SIZE_WITH_BLOCK(_i) ((1 << DIR_SIZE_WIDTH) * DIR_BLOCK_SIZE(_i))
52 #define DIR_OFFSET_BITS 40
53 #define DIR_OFFSET_MAX ((((off_t)1) << DIR_OFFSET_BITS) - 1)
54 
55 #define SYNC_MAX_WRITE (2 * 1024 * 1024)
56 #define SYNC_DELAY HRTIME_MSECONDS(500)
57 #define DO_NOT_REMOVE_THIS 0
58 
59 // Debugging Options
60 
61 //#define DO_CHECK_DIR_FAST
62 //#define DO_CHECK_DIR
63 
64 // Macros
65 
66 #ifdef DO_CHECK_DIR
67 #define CHECK_DIR(_d) ink_assert(check_dir(_d))
68 #else
69 #define CHECK_DIR(_d) ((void)0)
70 #endif
71 
72 #define dir_index(_e, _i) ((Dir *)((char *)(_e)->dir + (SIZEOF_DIR * (_i))))
73 #define dir_assign(_e, _x)   \
74   do {                       \
75     (_e)->w[0] = (_x)->w[0]; \
76     (_e)->w[1] = (_x)->w[1]; \
77     (_e)->w[2] = (_x)->w[2]; \
78     (_e)->w[3] = (_x)->w[3]; \
79     (_e)->w[4] = (_x)->w[4]; \
80   } while (0)
81 #define dir_assign_data(_e, _x)         \
82   do {                                  \
83     unsigned short next = dir_next(_e); \
84     dir_assign(_e, _x);                 \
85     dir_set_next(_e, next);             \
86   } while (0)
87 // entry is valid
88 #define dir_valid(_d, _e) (_d->header->phase == dir_phase(_e) ? _d->vol_in_phase_valid(_e) : _d->vol_out_of_phase_valid(_e))
89 // entry is valid and outside of write aggregation region
90 #define dir_agg_valid(_d, _e) (_d->header->phase == dir_phase(_e) ? _d->vol_in_phase_valid(_e) : _d->vol_out_of_phase_agg_valid(_e))
91 // entry may be valid or overwritten in the last aggregated write
92 #define dir_write_valid(_d, _e) \
93   (_d->header->phase == dir_phase(_e) ? vol_in_phase_valid(_d, _e) : vol_out_of_phase_write_valid(_d, _e))
94 #define dir_agg_buf_valid(_d, _e) (_d->header->phase == dir_phase(_e) && _d->vol_in_phase_agg_buf_valid(_e))
95 #define dir_is_empty(_e) (!dir_offset(_e))
96 #define dir_clear(_e) \
97   do {                \
98     (_e)->w[0] = 0;   \
99     (_e)->w[1] = 0;   \
100     (_e)->w[2] = 0;   \
101     (_e)->w[3] = 0;   \
102     (_e)->w[4] = 0;   \
103   } while (0)
104 #define dir_clean(_e) dir_set_offset(_e, 0)
105 
106 // OpenDir
107 
108 #define OPEN_DIR_BUCKETS 256
109 
110 struct EvacuationBlock;
111 typedef uint32_t DirInfo;
112 
113 // Cache Directory
114 
115 // INTERNAL: do not access these members directly, use the
116 // accessors below (e.g. dir_offset, dir_set_offset).
117 // These structures are stored in memory 2 byte aligned.
118 // The accessors prevent unaligned memory access which
119 // is often either less efficient or unsupported depending
120 // on the processor.
121 struct Dir {
122 #if DO_NOT_REMOVE_THIS
123   // THE BIT-FIELD INTERPRETATION OF THIS STRUCT WHICH HAS TO
124   // USE MACROS TO PREVENT UNALIGNED LOADS
125   // bits are numbered from lowest in u16 to highest
126   // always index as u16 to avoid byte order issues
127   unsigned int offset : 24;      // (0,1:0-7) 16M * 512 = 8GB
128   unsigned int big : 2;          // (1:8-9) 512 << (3 * big)
129   unsigned int size : 6;         // (1:10-15) 6**2 = 64, 64*512 = 32768 .. 64*256=16MB
130   unsigned int tag : 12;         // (2:0-11) 2048 / 8 entries/bucket = .4%
131   unsigned int phase : 1;        // (2:12)
132   unsigned int head : 1;         // (2:13) first segment in a document
133   unsigned int pinned : 1;       // (2:14)
134   unsigned int token : 1;        // (2:15)
135   unsigned int next : 16;        // (3)
136   unsigned int offset_high : 16; // 8GB * 65k = 0.5PB (4)
137 #else
138   uint16_t w[5];
139   Dir() { dir_clear(this); }
140 #endif
141 };
142 
143 // INTERNAL: do not access these members directly, use the
144 // accessors below (e.g. dir_offset, dir_set_offset)
145 struct FreeDir {
146 #if DO_NOT_REMOVE_THIS
147   // THE BIT-FIELD INTERPRETATION OF THIS STRUCT WHICH HAS TO
148   // USE MACROS TO PREVENT UNALIGNED LOADS
149   unsigned int offset : 24; // 0: empty
150   unsigned int reserved : 8;
151   unsigned int prev : 16;        // (2)
152   unsigned int next : 16;        // (3)
153   unsigned int offset_high : 16; // 0: empty
154 #else
155   uint16_t w[5];
156   FreeDir() { dir_clear(this); }
157 #endif
158 };
159 
160 #define dir_offset(_e) \
161   ((int64_t)(((uint64_t)(_e)->w[0]) | (((uint64_t)((_e)->w[1] & 0xFF)) << 16) | (((uint64_t)(_e)->w[4]) << 24)))
162 #define dir_set_offset(_e, _o)                                              \
163   do {                                                                      \
164     (_e)->w[0] = (uint16_t)_o;                                              \
165     (_e)->w[1] = (uint16_t)((((_o) >> 16) & 0xFF) | ((_e)->w[1] & 0xFF00)); \
166     (_e)->w[4] = (uint16_t)((_o) >> 24);                                    \
167   } while (0)
168 #define dir_bit(_e, _w, _b) ((uint32_t)(((_e)->w[_w] >> (_b)) & 1))
169 #define dir_set_bit(_e, _w, _b, _v) (_e)->w[_w] = (uint16_t)(((_e)->w[_w] & ~(1 << (_b))) | (((_v) ? 1 : 0) << (_b)))
170 #define dir_big(_e) ((uint32_t)((((_e)->w[1]) >> 8) & 0x3))
171 #define dir_set_big(_e, _v) (_e)->w[1] = (uint16_t)(((_e)->w[1] & 0xFCFF) | (((uint16_t)(_v)) & 0x3) << 8)
172 #define dir_size(_e) ((uint32_t)(((_e)->w[1]) >> 10))
173 #define dir_set_size(_e, _v) (_e)->w[1] = (uint16_t)(((_e)->w[1] & ((1 << 10) - 1)) | ((_v) << 10))
174 #define dir_set_approx_size(_e, _s)                   \
175   do {                                                \
176     if ((_s) <= DIR_SIZE_WITH_BLOCK(0)) {             \
177       dir_set_big(_e, 0);                             \
178       dir_set_size(_e, ((_s)-1) / DIR_BLOCK_SIZE(0)); \
179     } else if ((_s) <= DIR_SIZE_WITH_BLOCK(1)) {      \
180       dir_set_big(_e, 1);                             \
181       dir_set_size(_e, ((_s)-1) / DIR_BLOCK_SIZE(1)); \
182     } else if ((_s) <= DIR_SIZE_WITH_BLOCK(2)) {      \
183       dir_set_big(_e, 2);                             \
184       dir_set_size(_e, ((_s)-1) / DIR_BLOCK_SIZE(2)); \
185     } else {                                          \
186       dir_set_big(_e, 3);                             \
187       dir_set_size(_e, ((_s)-1) / DIR_BLOCK_SIZE(3)); \
188     }                                                 \
189   } while (0)
190 #define dir_approx_size(_e) ((dir_size(_e) + 1) * DIR_BLOCK_SIZE(dir_big(_e)))
191 #define round_to_approx_dir_size(_s)      \
192   (_s <= DIR_SIZE_WITH_BLOCK(0) ?         \
193      ROUND_TO(_s, DIR_BLOCK_SIZE(0)) :    \
194      (_s <= DIR_SIZE_WITH_BLOCK(1) ?      \
195         ROUND_TO(_s, DIR_BLOCK_SIZE(1)) : \
196         (_s <= DIR_SIZE_WITH_BLOCK(2) ? ROUND_TO(_s, DIR_BLOCK_SIZE(2)) : ROUND_TO(_s, DIR_BLOCK_SIZE(3)))))
197 #define dir_tag(_e) ((uint32_t)((_e)->w[2] & ((1 << DIR_TAG_WIDTH) - 1)))
198 #define dir_set_tag(_e, _t) \
199   (_e)->w[2] = (uint16_t)(((_e)->w[2] & ~((1 << DIR_TAG_WIDTH) - 1)) | ((_t) & ((1 << DIR_TAG_WIDTH) - 1)))
200 #define dir_phase(_e) dir_bit(_e, 2, 12)
201 #define dir_set_phase(_e, _v) dir_set_bit(_e, 2, 12, _v)
202 #define dir_head(_e) dir_bit(_e, 2, 13)
203 #define dir_set_head(_e, _v) dir_set_bit(_e, 2, 13, _v)
204 #define dir_pinned(_e) dir_bit(_e, 2, 14)
205 #define dir_set_pinned(_e, _v) dir_set_bit(_e, 2, 14, _v)
206 #define dir_token(_e) dir_bit(_e, 2, 15)
207 #define dir_set_token(_e, _v) dir_set_bit(_e, 2, 15, _v)
208 #define dir_next(_e) (_e)->w[3]
209 #define dir_set_next(_e, _o) (_e)->w[3] = (uint16_t)(_o)
210 #define dir_prev(_e) (_e)->w[2]
211 #define dir_set_prev(_e, _o) (_e)->w[2] = (uint16_t)(_o)
212 
213 // INKqa11166 - Cache can not store 2 HTTP alternates simultaneously.
214 // To allow this, move the vector from the CacheVC to the OpenDirEntry.
215 // Each CacheVC now maintains a pointer to this vector. Adding/Deleting
216 // alternates from this vector is done under the Vol::lock. The alternate
217 // is deleted/inserted into the vector just before writing the vector disk
218 // (CacheVC::updateVector).
219 LINK_FORWARD_DECLARATION(CacheVC, opendir_link) // forward declaration
220 struct OpenDirEntry {
221   DLL<CacheVC, Link_CacheVC_opendir_link> writers; // list of all the current writers
222   DLL<CacheVC, Link_CacheVC_opendir_link> readers; // list of all the current readers - not used
223   CacheHTTPInfoVector vector;                      // Vector for the http document. Each writer
224                                                    // maintains a pointer to this vector and
225                                                    // writes it down to disk.
226   CacheKey single_doc_key;                         // Key for the resident alternate.
227   Dir single_doc_dir;                              // Directory for the resident alternate
228   Dir first_dir;                                   // Dir for the vector. If empty, a new dir is
229                                                    // inserted, otherwise this dir is overwritten
230   uint16_t num_writers;                            // num of current writers
231   uint16_t max_writers;                            // max number of simultaneous writers allowed
232   bool dont_update_directory;                      // if set, the first_dir is not updated.
233   bool move_resident_alt;                          // if set, single_doc_dir is inserted.
234   bool reading_vec;                                // somebody is currently reading the vector
235   bool writing_vec;                                // somebody is currently writing the vector
236 
237   LINK(OpenDirEntry, link);
238 
239   int wait(CacheVC *c, int msec);
240 
241   bool
has_multiple_writersOpenDirEntry242   has_multiple_writers()
243   {
244     return num_writers > 1;
245   }
246 };
247 
248 struct OpenDir : public Continuation {
249   Queue<CacheVC, Link_CacheVC_opendir_link> delayed_readers;
250   DLL<OpenDirEntry> bucket[OPEN_DIR_BUCKETS];
251 
252   int open_write(CacheVC *c, int allow_if_writers, int max_writers);
253   int close_write(CacheVC *c);
254   OpenDirEntry *open_read(const CryptoHash *key);
255   int signal_readers(int event, Event *e);
256 
257   OpenDir();
258 };
259 
260 struct CacheSync : public Continuation {
261   int vol_idx    = 0;
262   char *buf      = nullptr;
263   size_t buflen  = 0;
264   bool buf_huge  = false;
265   off_t writepos = 0;
266   AIOCallbackInternal io;
267   Event *trigger        = nullptr;
268   ink_hrtime start_time = 0;
269   int mainEvent(int event, Event *e);
270   void aio_write(int fd, char *b, int n, off_t o);
271 
CacheSyncCacheSync272   CacheSync() : Continuation(new_ProxyMutex()) { SET_HANDLER(&CacheSync::mainEvent); }
273 };
274 
275 // Global Functions
276 
277 void vol_init_dir(Vol *d);
278 int dir_token_probe(const CacheKey *, Vol *, Dir *);
279 int dir_probe(const CacheKey *, Vol *, Dir *, Dir **);
280 int dir_insert(const CacheKey *key, Vol *d, Dir *to_part);
281 int dir_overwrite(const CacheKey *key, Vol *d, Dir *to_part, Dir *overwrite, bool must_overwrite = true);
282 int dir_delete(const CacheKey *key, Vol *d, Dir *del);
283 int dir_lookaside_probe(const CacheKey *key, Vol *d, Dir *result, EvacuationBlock **eblock);
284 int dir_lookaside_insert(EvacuationBlock *b, Vol *d, Dir *to);
285 int dir_lookaside_fixup(const CacheKey *key, Vol *d);
286 void dir_lookaside_cleanup(Vol *d);
287 void dir_lookaside_remove(const CacheKey *key, Vol *d);
288 void dir_free_entry(Dir *e, int s, Vol *d);
289 void dir_sync_init();
290 int check_dir(Vol *d);
291 void dir_clean_vol(Vol *d);
292 void dir_clear_range(off_t start, off_t end, Vol *d);
293 int dir_segment_accounted(int s, Vol *d, int offby = 0, int *free = nullptr, int *used = nullptr, int *empty = nullptr,
294                           int *valid = nullptr, int *agg_valid = nullptr, int *avg_size = nullptr);
295 uint64_t dir_entries_used(Vol *d);
296 void sync_cache_dir_on_shutdown();
297 
298 // Global Data
299 
300 extern Dir empty_dir;
301 
302 // Inline Functions
303 
304 #define dir_in_seg(_s, _i) ((Dir *)(((char *)(_s)) + (SIZEOF_DIR * (_i))))
305 
306 TS_INLINE bool
dir_compare_tag(const Dir * e,const CacheKey * key)307 dir_compare_tag(const Dir *e, const CacheKey *key)
308 {
309   return (dir_tag(e) == DIR_MASK_TAG(key->slice32(2)));
310 }
311 
312 TS_INLINE Dir *
dir_from_offset(int64_t i,Dir * seg)313 dir_from_offset(int64_t i, Dir *seg)
314 {
315 #if DIR_DEPTH < 5
316   if (!i) {
317     return nullptr;
318   }
319   return dir_in_seg(seg, i);
320 #else
321   i = i + ((i - 1) / (DIR_DEPTH - 1));
322   return dir_in_seg(seg, i);
323 #endif
324 }
325 TS_INLINE Dir *
next_dir(Dir * d,Dir * seg)326 next_dir(Dir *d, Dir *seg)
327 {
328   int i = dir_next(d);
329   return dir_from_offset(i, seg);
330 }
331 TS_INLINE int64_t
dir_to_offset(const Dir * d,const Dir * seg)332 dir_to_offset(const Dir *d, const Dir *seg)
333 {
334 #if DIR_DEPTH < 5
335   return (((char *)d) - ((char *)seg)) / SIZEOF_DIR;
336 #else
337   int64_t i = (int64_t)((((char *)d) - ((char *)seg)) / SIZEOF_DIR);
338   i         = i - (i / DIR_DEPTH);
339   return i;
340 #endif
341 }
342 TS_INLINE Dir *
dir_bucket(int64_t b,Dir * seg)343 dir_bucket(int64_t b, Dir *seg)
344 {
345   return dir_in_seg(seg, b * DIR_DEPTH);
346 }
347 TS_INLINE Dir *
dir_bucket_row(Dir * b,int64_t i)348 dir_bucket_row(Dir *b, int64_t i)
349 {
350   return dir_in_seg(b, i);
351 }
352