1 typedef struct Config Config; 2 typedef struct AMap AMap; 3 typedef struct AMapN AMapN; 4 typedef struct Arena Arena; 5 typedef struct AState AState; 6 typedef struct ArenaCIG ArenaCIG; 7 typedef struct ArenaHead ArenaHead; 8 typedef struct ArenaPart ArenaPart; 9 typedef struct ArenaTail ArenaTail; 10 typedef struct ATailStats ATailStats; 11 typedef struct CIBlock CIBlock; 12 typedef struct Clump Clump; 13 typedef struct ClumpInfo ClumpInfo; 14 typedef struct Graph Graph; 15 typedef struct IAddr IAddr; 16 typedef struct IBucket IBucket; 17 typedef struct IEStream IEStream; 18 typedef struct IEntry IEntry; 19 typedef struct IFile IFile; 20 typedef struct ISect ISect; 21 typedef struct Index Index; 22 typedef struct Lump Lump; 23 typedef struct DBlock DBlock; 24 typedef struct Part Part; 25 typedef struct Statbin Statbin; 26 typedef struct Statdesc Statdesc; 27 typedef struct Stats Stats; 28 typedef struct ZBlock ZBlock; 29 typedef struct Round Round; 30 typedef struct Bloom Bloom; 31 32 #pragma incomplete IEStream 33 34 #define TWID32 ((u32int)~(u32int)0) 35 #define TWID64 ((u64int)~(u64int)0) 36 #define TWID8 ((u8int)~(u8int)0) 37 38 enum 39 { 40 /* 41 * formerly fundamental constant, 42 * now a server-imposed limitation. 43 */ 44 VtMaxLumpSize = 56*1024, 45 46 ABlockLog = 9, /* log2(512), the quantum for reading arenas */ 47 ANameSize = 64, 48 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */ 49 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */ 50 PartBlank = 256*1024, /* untouched section at beginning of partition */ 51 HeadSize = 512, /* size of a header after PartBlank */ 52 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */ 53 IndexBase = 1024*1024, /* initial address to use in an index */ 54 MaxIo = 64*1024, /* max size of a single read or write operation */ 55 ICacheBits = 16, /* default bits for indexing icache */ 56 MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */ 57 Unspecified = TWID32, 58 59 /* 60 * return codes from syncarena 61 */ 62 SyncDataErr = 1 << 0, /* problem reading the clump data */ 63 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */ 64 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */ 65 SyncFixErr = 1 << 3, /* error writing fixed data */ 66 SyncHeader = 1 << 4, /* altered header fields */ 67 68 /* 69 * error severity 70 */ 71 EOk = 0, /* error expected in normal operation */ 72 EStrange, /* strange error that should be logged */ 73 ECorrupt, /* corrupted data found in arenas */ 74 EICorrupt, /* corrupted data found in index */ 75 EAdmin, /* should be brought to administrators' attention */ 76 ECrash, /* really bad internal error */ 77 EBug, /* a limitation which should be fixed */ 78 EInconsist, /* inconsistencies between index and arena */ 79 EMax, 80 81 /* 82 * internal disk formats for the venti archival storage system 83 */ 84 /* 85 * magic numbers on disk 86 */ 87 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */ 88 ClumpFreeMagic = 0, /* free clump; terminates active clump log */ 89 90 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */ 91 ArenaMagic = 0xf2a14eadU, /* arena trailer */ 92 ArenaHeadMagic = 0xd15c4eadU, /* arena header */ 93 94 BloomMagic = 0xb1004eadU, /* bloom filter header */ 95 BloomMaxHash = 32, 96 97 ISectMagic = 0xd15c5ec7U, /* index header */ 98 99 ArenaPartVersion = 3, 100 ArenaVersion4 = 4, 101 ArenaVersion5 = 5, 102 BloomVersion = 1, 103 IndexVersion = 1, 104 ISectVersion1 = 1, 105 ISectVersion2 = 2, 106 107 /* 108 * encodings of clumps on disk 109 */ 110 ClumpEErr = 0, /* can't happen */ 111 ClumpENone, /* plain */ 112 ClumpECompress, /* compressed */ 113 ClumpEMax, 114 115 /* 116 * sizes in bytes on disk 117 */ 118 U8Size = 1, 119 U16Size = 2, 120 U32Size = 4, 121 U64Size = 8, 122 123 ArenaPartSize = 4 * U32Size, 124 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, 125 ArenaSize5 = ArenaSize4 + U32Size, 126 ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size, 127 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, 128 ArenaHeadSize5 = ArenaHeadSize4 + U32Size, 129 BloomHeadSize = 4 * U32Size, 130 ISectSize1 = 7 * U32Size + 2 * ANameSize, 131 ISectSize2 = ISectSize1 + U32Size, 132 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, 133 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, 134 MaxBloomSize = 1<<(32-3), /* 2^32 bits */ 135 MaxBloomHash = 32, /* bits per score */ 136 /* 137 * BUG - The various block copies that manipulate entry buckets 138 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40, 139 * so that everything is word-aligned. Buildindex is actually cpu-bound 140 * by the (byte at a time) copying in qsort. 141 */ 142 IBucketSize = U32Size + U16Size, 143 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, 144 IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, 145 IEntryAddrOff = VtScoreSize + U32Size + U16Size, 146 147 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, 148 149 IcacheFrac = 1000000, /* denominator */ 150 151 SleepForever = 1000000000, /* magic value for sleep time */ 152 /* 153 * dirty flags - order controls disk write order 154 */ 155 DirtyArena = 1, 156 DirtyArenaCib, 157 DirtyArenaTrailer, 158 DirtyMax, 159 160 ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry. 161 162 VentiZZZZZZZZ 163 }; 164 165 extern char TraceDisk[]; 166 extern char TraceLump[]; 167 extern char TraceBlock[]; 168 extern char TraceProc[]; 169 extern char TraceWork[]; 170 extern char TraceQuiet[]; 171 extern char TraceRpc[]; 172 173 /* 174 * results of parsing and initializing a config file 175 */ 176 struct Config 177 { 178 char *index; /* name of the index to initialize */ 179 int naparts; /* arena partitions initialized */ 180 ArenaPart **aparts; 181 int nsects; /* index sections initialized */ 182 ISect **sects; 183 Bloom *bloom; /* bloom filter */ 184 u32int bcmem; 185 u32int mem; 186 u32int icmem; 187 int queuewrites; 188 char* haddr; 189 char* vaddr; 190 char* webroot; 191 }; 192 193 /* 194 * a Part is the low level interface to files or disks. 195 * there are two main types of partitions 196 * arena paritions, which some number of arenas, each in a sub-partition. 197 * index partition, which only have one subpartition. 198 */ 199 struct Part 200 { 201 int fd; /* rock for accessing the disk */ 202 int mode; 203 u64int offset; 204 u64int size; /* size of the partiton */ 205 u32int blocksize; /* block size for reads and writes */ 206 u32int fsblocksize; /* minimum file system block size */ 207 char *name; 208 char *filename; 209 Channel *writechan; /* chan[dcache.nblock](DBlock*) */ 210 }; 211 212 /* 213 * a cached block from the partition 214 * yuck -- most of this is internal structure for the cache 215 * all other routines should only use data 216 */ 217 struct DBlock 218 { 219 u8int *data; 220 221 Part *part; /* partition in which cached */ 222 u64int addr; /* base address on the partition */ 223 u32int size; /* amount of data available, not amount allocated; should go away */ 224 u32int mode; 225 u32int dirty; 226 u32int dirtying; 227 DBlock *next; /* doubly linked hash chains */ 228 DBlock *prev; 229 u32int heap; /* index in heap table */ 230 u32int used; /* last reference times */ 231 u32int used2; 232 u32int ref; /* reference count */ 233 RWLock lock; /* for access to data only */ 234 Channel *writedonechan; 235 void* chanbuf[1]; /* buffer for the chan! */ 236 }; 237 238 /* 239 * a cached block from the partition 240 * yuck -- most of this is internal structure for the cache 241 * all other routines should only use data 242 * double yuck -- this is mostly the same as a DBlock 243 */ 244 struct Lump 245 { 246 Packet *data; 247 248 Part *part; /* partition in which cached */ 249 u8int score[VtScoreSize]; /* score of packet */ 250 u8int type; /* type of packet */ 251 u32int size; /* amount of data allocated to hold packet */ 252 Lump *next; /* doubly linked hash chains */ 253 Lump *prev; 254 u32int heap; /* index in heap table */ 255 u32int used; /* last reference times */ 256 u32int used2; 257 u32int ref; /* reference count */ 258 QLock lock; /* for access to data only */ 259 }; 260 261 /* 262 * mapping between names and address ranges 263 */ 264 struct AMap 265 { 266 u64int start; 267 u64int stop; 268 char name[ANameSize]; 269 }; 270 271 /* 272 * an AMap along with a length 273 */ 274 struct AMapN 275 { 276 int n; 277 AMap *map; 278 }; 279 280 /* 281 * an ArenaPart is a partition made up of Arenas 282 * it exists because most os's don't support many partitions, 283 * and we want to have many different Arenas 284 */ 285 struct ArenaPart 286 { 287 Part *part; 288 u64int size; /* size of underlying partition, rounded down to blocks */ 289 Arena **arenas; 290 u32int tabbase; /* base address of arena table on disk */ 291 u32int tabsize; /* max. bytes in arena table */ 292 293 /* 294 * fields stored on disk 295 */ 296 u32int version; 297 u32int blocksize; /* "optimal" block size for reads and writes */ 298 u32int arenabase; /* base address of first arena */ 299 300 /* 301 * stored in the arena mapping table on disk 302 */ 303 AMap *map; 304 int narenas; 305 }; 306 307 /* 308 * info about one block in the clump info cache 309 */ 310 struct CIBlock 311 { 312 u32int block; /* blocks in the directory */ 313 int offset; /* offsets of one clump in the data */ 314 DBlock *data; 315 }; 316 317 /* 318 * Statistics kept in the tail. 319 */ 320 struct ATailStats 321 { 322 u32int clumps; /* number of clumps */ 323 u32int cclumps; /* number of compressed clumps */ 324 u64int used; 325 u64int uncsize; 326 u8int sealed; 327 }; 328 329 /* 330 * Arena state - represents a point in the data log 331 */ 332 struct AState 333 { 334 Arena *arena; 335 u64int aa; /* index address */ 336 ATailStats stats; 337 }; 338 339 /* 340 * an Arena is a log of Clumps, preceeded by an ArenaHeader, 341 * and followed by a Arena, each in one disk block. 342 * struct on disk is not always up to date, but should be self-consistent. 343 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found. 344 * <struct name="Arena" type="Arena *"> 345 * <field name="name" val="s->name" type="AName"/> 346 * <field name="version" val="s->version" type="U32int"/> 347 * <field name="partition" val="s->part->name" type="AName"/> 348 * <field name="blocksize" val="s->blocksize" type="U32int"/> 349 * <field name="start" val="s->base" type="U64int"/> 350 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/> 351 * <field name="created" val="s->ctime" type="U32int"/> 352 * <field name="modified" val="s->wtime" type="U32int"/> 353 * <field name="sealed" val="s->sealed" type="Sealed"/> 354 * <field name="score" val="s->score" type="Score"/> 355 * <field name="clumps" val="s->clumps" type="U32int"/> 356 * <field name="compressedclumps" val="s->cclumps" type="U32int"/> 357 * <field name="data" val="s->uncsize" type="U64int"/> 358 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/> 359 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/> 360 * </struct> 361 */ 362 struct Arena 363 { 364 QLock lock; /* lock for arena fields, writing to disk */ 365 Part *part; /* partition in which arena lives */ 366 int blocksize; /* size of block to read or write */ 367 u64int base; /* base address on disk */ 368 u64int size; /* total space in the arena */ 369 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */ 370 371 int clumpmax; /* ClumpInfos per block */ 372 AState mem; 373 int inqueue; 374 375 /* 376 * fields stored on disk 377 */ 378 u32int version; 379 char name[ANameSize]; /* text label */ 380 ATailStats memstats; 381 ATailStats diskstats; 382 u32int ctime; /* first time a block was written */ 383 u32int wtime; /* last time a block was written */ 384 u32int clumpmagic; 385 386 ArenaCIG *cig; 387 int ncig; 388 }; 389 390 struct ArenaCIG 391 { 392 u64int offset; // from arena base 393 }; 394 395 /* 396 * redundant storage of some fields at the beginning of each arena 397 */ 398 struct ArenaHead 399 { 400 u32int version; 401 char name[ANameSize]; 402 u32int blocksize; 403 u64int size; 404 u32int clumpmagic; 405 }; 406 407 /* 408 * most interesting meta information for a clump. 409 * stored in each clump's header and in the Arena's directory, 410 * stored in reverse order just prior to the arena trailer 411 */ 412 struct ClumpInfo 413 { 414 u8int type; 415 u16int size; /* size of disk data, not including header */ 416 u16int uncsize; /* size of uncompressed data */ 417 u8int score[VtScoreSize]; /* score of the uncompressed data only */ 418 }; 419 420 /* 421 * header for an immutable clump of data 422 */ 423 struct Clump 424 { 425 ClumpInfo info; 426 u8int encoding; 427 u32int creator; /* initial client which wrote the block */ 428 u32int time; /* creation at gmt seconds since 1/1/1970 */ 429 }; 430 431 /* 432 * index of all clumps according to their score 433 * this is just a wrapper to tie together the index sections 434 * <struct name="Index" type="Index *"> 435 * <field name="name" val="s->name" type="AName"/> 436 * <field name="version" val="s->version" type="U32int"/> 437 * <field name="blocksize" val="s->blocksize" type="U32int"/> 438 * <field name="tabsize" val="s->tabsize" type="U32int"/> 439 * <field name="buckets" val="s->buckets" type="U32int"/> 440 * <field name="buckdiv" val="s->div" type="U32int"/> 441 * <field name="bitblocks" val="s->div" type="U32int"/> 442 * <field name="maxdepth" val="s->div" type="U32int"/> 443 * <field name="bitkeylog" val="s->div" type="U32int"/> 444 * <field name="bitkeymask" val="s->div" type="U32int"/> 445 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/> 446 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/> 447 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/> 448 * </struct> 449 * <struct name="Amap" type="AMap *"> 450 * <field name="name" val="s->name" type="AName"/> 451 * <field name="start" val="s->start" type="U64int"/> 452 * <field name="stop" val="s->stop" type="U64int"/> 453 * </struct> 454 */ 455 struct Index 456 { 457 u32int div; /* divisor for mapping score to bucket */ 458 u32int buckets; /* last bucket used in disk hash table */ 459 u32int blocksize; 460 u32int tabsize; /* max. bytes in index config */ 461 462 int mapalloc; /* first arena to check when adding a lump */ 463 Arena **arenas; /* arenas in the mapping */ 464 ISect **sects; /* sections which hold the buckets */ 465 Bloom *bloom; /* bloom filter */ 466 467 /* 468 * fields stored in config file 469 */ 470 u32int version; 471 char name[ANameSize]; /* text label */ 472 int nsects; 473 AMap *smap; /* mapping of buckets to index sections */ 474 int narenas; 475 AMap *amap; /* mapping from index addesses to arenas */ 476 477 QLock writing; 478 }; 479 480 /* 481 * one part of the bucket storage for an index. 482 * the index blocks are sequentially allocated 483 * across all of the sections. 484 */ 485 struct ISect 486 { 487 Part *part; 488 int blocklog; /* log2(blocksize) */ 489 int buckmax; /* max. entries in a index bucket */ 490 u32int tabbase; /* base address of index config table on disk */ 491 u32int tabsize; /* max. bytes in index config */ 492 Channel *writechan; 493 Channel *writedonechan; 494 void *ig; /* used by buildindex only */ 495 int ng; 496 497 /* 498 * fields stored on disk 499 */ 500 u32int version; 501 u32int bucketmagic; 502 char name[ANameSize]; /* text label */ 503 char index[ANameSize]; /* index owning the section */ 504 u32int blocksize; /* size of hash buckets in index */ 505 u32int blockbase; /* address of start of on disk index table */ 506 u32int blocks; /* total blocks on disk; some may be unused */ 507 u32int start; /* first bucket in this section */ 508 u32int stop; /* limit of buckets in this section */ 509 }; 510 511 /* 512 * externally interesting part of an IEntry 513 */ 514 struct IAddr 515 { 516 u64int addr; 517 u16int size; /* uncompressed size */ 518 u8int type; /* type of block */ 519 u8int blocks; /* arena io quanta for Clump + data */ 520 }; 521 522 /* 523 * entries in the index 524 * kept in IBuckets in the disk index table, 525 * cached in the memory ICache. 526 */ 527 struct IEntry 528 { 529 /* on disk data - 32 bytes*/ 530 u8int score[VtScoreSize]; 531 IAddr ia; 532 533 IEntry *nexthash; 534 IEntry *nextdirty; 535 IEntry *next; 536 IEntry *prev; 537 u8int state; 538 }; 539 enum { 540 IEClean = 0, 541 IEDirty = 1, 542 IESummary = 2, 543 }; 544 545 /* 546 * buckets in the on disk index table 547 */ 548 struct IBucket 549 { 550 u16int n; /* number of active indices */ 551 u32int buck; /* used by buildindex/checkindex only */ 552 u8int *data; 553 }; 554 555 /* 556 * temporary buffers used by individual threads 557 */ 558 struct ZBlock 559 { 560 u32int len; 561 u32int _size; 562 u8int *data; 563 u8int *free; 564 }; 565 566 /* 567 * simple input buffer for a '\0' terminated text file 568 */ 569 struct IFile 570 { 571 char *name; /* name of the file */ 572 ZBlock *b; /* entire contents of file */ 573 u32int pos; /* current position in the file */ 574 }; 575 576 struct Statdesc 577 { 578 char *name; 579 ulong max; 580 }; 581 582 /* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/ 583 enum 584 { 585 StatRpcTotal, 586 StatRpcRead, 587 StatRpcReadOk, 588 StatRpcReadFail, 589 StatRpcReadBytes, 590 StatRpcReadTime, 591 StatRpcReadCached, 592 StatRpcReadCachedTime, 593 StatRpcReadUncached, 594 StatRpcReadUncachedTime, 595 StatRpcWrite, 596 StatRpcWriteNew, 597 StatRpcWriteOld, 598 StatRpcWriteFail, 599 StatRpcWriteBytes, 600 StatRpcWriteTime, 601 StatRpcWriteNewTime, 602 StatRpcWriteOldTime, 603 604 StatLcacheHit, 605 StatLcacheMiss, 606 StatLcacheRead, 607 StatLcacheWrite, 608 StatLcacheSize, 609 StatLcacheStall, 610 StatLcacheReadTime, 611 612 StatDcacheHit, 613 StatDcacheMiss, 614 StatDcacheLookup, 615 StatDcacheRead, 616 StatDcacheWrite, 617 StatDcacheDirty, 618 StatDcacheSize, 619 StatDcacheFlush, 620 StatDcacheStall, 621 StatDcacheLookupTime, 622 623 StatDblockStall, 624 StatLumpStall, 625 626 StatIcacheHit, 627 StatIcacheMiss, 628 StatIcacheRead, 629 StatIcacheWrite, 630 StatIcacheFill, 631 StatIcachePrefetch, 632 StatIcacheDirty, 633 StatIcacheSize, 634 StatIcacheFlush, 635 StatIcacheStall, 636 StatIcacheReadTime, 637 StatIcacheLookup, 638 StatScacheHit, 639 StatScachePrefetch, 640 641 StatBloomHit, 642 StatBloomMiss, 643 StatBloomFalseMiss, 644 StatBloomLookup, 645 StatBloomOnes, 646 StatBloomBits, 647 648 StatApartRead, 649 StatApartReadBytes, 650 StatApartWrite, 651 StatApartWriteBytes, 652 653 StatIsectRead, 654 StatIsectReadBytes, 655 StatIsectWrite, 656 StatIsectWriteBytes, 657 658 StatSumRead, 659 StatSumReadBytes, 660 661 StatCigLoad, 662 StatCigLoadTime, 663 664 NStat 665 }; 666 667 extern Statdesc statdesc[NStat]; 668 669 /* 670 * statistics about the operation of the server 671 * mainly for performance monitoring and profiling. 672 */ 673 struct Stats 674 { 675 ulong now; 676 ulong n[NStat]; 677 }; 678 679 struct Statbin 680 { 681 uint nsamp; 682 uint min; 683 uint max; 684 uint avg; 685 }; 686 687 struct Graph 688 { 689 long (*fn)(Stats*, Stats*, void*); 690 void *arg; 691 long t0; 692 long t1; 693 long min; 694 long max; 695 long wid; 696 long ht; 697 int fill; 698 }; 699 700 /* 701 * for kicking background processes that run one round after another after another 702 */ 703 struct Round 704 { 705 QLock lock; 706 Rendez start; 707 Rendez finish; 708 Rendez delaywait; 709 int delaytime; 710 int delaykick; 711 char* name; 712 int last; 713 int current; 714 int next; 715 int doanother; 716 }; 717 718 /* 719 * Bloom filter of stored block hashes 720 */ 721 struct Bloom 722 { 723 RWLock lk; /* protects nhash, nbits, tab, mb */ 724 QLock mod; /* one marker at a time, protects nb */ 725 int nhash; 726 ulong size; /* bytes in tab */ 727 ulong bitmask; /* to produce bit index */ 728 u8int *data; 729 Part *part; 730 Channel *writechan; 731 Channel *writedonechan; 732 }; 733 734 extern Index *mainindex; 735 extern u32int maxblocksize; /* max. block size used by any partition */ 736 extern int paranoid; /* should verify hashes on disk read */ 737 extern int queuewrites; /* put all lump writes on a queue and finish later */ 738 extern int readonly; /* only allowed to read the disk data */ 739 extern Stats stats; 740 extern u8int zeroscore[VtScoreSize]; 741 extern int compressblocks; 742 extern int writestodevnull; /* dangerous - for performance debugging */ 743 extern int bootstrap; /* writes but does not index - cannot read */ 744 extern int collectstats; 745 extern QLock memdrawlock; 746 extern int icachesleeptime; 747 extern int minicachesleeptime; 748 extern int arenasumsleeptime; 749 extern int manualscheduling; 750 extern int l0quantum; 751 extern int l1quantum; 752 extern int ignorebloom; 753 extern int icacheprefetch; 754 extern int syncwrites; 755 extern int debugarena; /* print in arena error msgs; -1==unknown */ 756 757 extern Stats *stathist; 758 extern int nstathist; 759 extern ulong stattime; 760 761 #ifndef PLAN9PORT 762 #pragma varargck type "V" uchar* 763 #define ODIRECT 0 764 #endif 765