1
2 /* Web Polygraph http://www.web-polygraph.org/
3 * Copyright 2003-2011 The Measurement Factory
4 * Licensed under the Apache License, Version 2.0 */
5
6 #include "base/polygraph.h"
7
8 #include <limits.h>
9 #include "xstd/h/iostream.h"
10 #include "xstd/h/sstream.h"
11 #include "xstd/h/iomanip.h"
12 #include "xstd/ZLib.h"
13 #include <fstream>
14
15 #include "xstd/Rnd.h"
16 #include "xstd/StringIdentifier.h"
17 #include "xstd/gadgets.h"
18 #include "base/RndPermut.h"
19 #include "base/BStream.h"
20 #include "base/rndDistrStat.h"
21 #include "base/ContTypeStat.h"
22 #include "base/polyLogCats.h"
23
24 #include "runtime/IOBuf.h"
25 #include "runtime/LogComment.h"
26 #include "runtime/ErrorMgr.h"
27 #include "runtime/MimeHeadersCfg.h"
28 #include "runtime/polyErrors.h"
29 #include "runtime/httpHdrs.h"
30 #include "runtime/httpText.h"
31 #include "pgl/MimeSym.h"
32 #include "pgl/ContentSym.h"
33 #include "csm/EmbedContMdl.h"
34 #include "csm/RndBodyIter.h"
35 #include "csm/CdbBodyIter.h"
36 #include "csm/RamFileBodyIter.h"
37 #include "csm/ContainerBodyIter.h"
38 #include "csm/InjectIter.h"
39 #include "csm/GzipEncoder.h"
40 #include "csm/RangeBodyIter.h"
41 #include "csm/cdbEntries.h"
42 #include "csm/ContentDbase.h"
43 #include "csm/RamFiles.h"
44 #include "csm/TextDbase.h"
45 #include "csm/ObjLifeCycle.h"
46 #include "csm/ContentCfg.h"
47
48 static const String DefaultContentCfgKind = "some-content";
49
50 int TheForeignContentId = -1;
51 int TheBodilessContentId = -1;
52 int TheUnknownContentId = -1;
53
54
ContentCfg(int anId)55 ContentCfg::ContentCfg(int anId):
56 theObjLifeCycle(0), theChbRatio(-1), theChecksumRatio(-1),
57 theUniqueRatio(-1), theSize(0),
58 theEmbedCont(0), theCdb(0),
59 theRamFiles(0),
60 theInjectionAlgorithm(ialgNone),
61 theTdb(0), theInjGap(0), theInfProb(-1),
62 theExtSel(0), thePfxSel(0),
63 theId(anId),
64 theEncodings(new int[codingEnd]),
65 theMimeHeaders(0),
66 theClientBehaviorSym(0),
67 generateText(true) {
68 theObjLifeCycle = new ObjLifeCycle;
69 }
70
~ContentCfg()71 ContentCfg::~ContentCfg() {
72 delete theObjLifeCycle;
73 delete[] theEncodings;
74 delete theMimeHeaders;
75 delete theRamFiles;
76 }
77
configure(const ContentSym & cfg)78 void ContentCfg::configure(const ContentSym &cfg) {
79 theKind = cfg.kind();
80 if (!theKind)
81 theKind = DefaultContentCfgKind;
82
83 ContType::Record(id(), kind());
84
85 if (MimeSym *mime = cfg.mime()) {
86 theMimeType = mime->mimeType();
87 mime->extensions(theExtensions, theExtSel);
88 mime->prefixes(thePrefixes, thePfxSel);
89 mime->queries(theQueries, theQrySel);
90 }
91
92 theSize = cfg.size();
93 theObjLifeCycle->configure(cfg.objLifeCycle());
94 cfg.cachable(theChbRatio);
95 cfg.checksum(theChecksumRatio);
96
97 if (cfg.unique(theUniqueRatio) && theUniqueRatio >= 0) {
98 // form unique prefix to be used to make some content unique
99 const int sharedId = GlbPermut(rndSharedContent);
100 char buf[64];
101 ofixedstream os(buf, sizeof(buf));
102 os << 'u' << hex << setfill('0') <<
103 setw(8) << sharedId << '.' <<
104 setw(2) << theId << '/' << ends;
105 os.flush();
106 theCommonPrefix = buf;
107 const double commonRatio = 1 - theUniqueRatio;
108 Comment(1) << "fyi: " << (100*commonRatio) << "% of '" <<
109 theKind << "' content (id " << theId <<
110 ") will be identical; common prefix: " << theCommonPrefix <<
111 endc;
112 }
113
114 if (cfg.hasEmbed()) {
115 theEmbedCont = new EmbedContMdl;
116 theEmbedCont->configure(&cfg);
117 }
118
119 if (const String &cdbName = cfg.cdb()) {
120 ifstream f(cdbName.cstr());
121 IBStream is;
122 is.configure(&f, cdbName);
123 theCdb = new ContentDbase;
124 theCdb->load(is);
125 if (!is.good())
126 Comment << "error: cannot load content database from '"
127 << cdbName << "'; " << Error::Last() << endc << xexit;
128 if (!theCdb->count())
129 Comment << "error: no entries in '" << cdbName
130 << "' content database" << endc << xexit;
131 if (!theEmbedCont && theCdb->hasLinkOrPage())
132 Comment << "error: content cfg `" << theKind << "': " <<
133 "Content::may_contain is needed but missing" << endc << xexit;
134 }
135
136 if (const String &documentRoot = cfg.documentRoot()) {
137 if (theCdb)
138 Comment(0) << "error: Content document_root and content_db are mutually exclusive" << endc << xexit;
139
140 theRamFiles = new RamFiles(documentRoot);
141 theRamFiles->load();
142 if (!theRamFiles->count()) {
143 Comment(0) << "error: no valid files in Content " << theKind <<
144 " document_root: " << documentRoot << endc << xexit;
145 }
146 Comment(1) << "fyi: loaded " << theRamFiles->count() << " files from " <<
147 "Content " << theKind << " document_root " << documentRoot <<
148 "; approximate RAM used: " << theRamFiles->ramSize() << endc;
149 }
150
151 //if (theEmbedCont && theCdb)
152 // Comment << "error: content cfg `" << theKind << "': "
153 // << "cannot support containers together with content_db yet; "
154 // << "do not use may_contain with content_db"
155 // << endc << xexit;
156
157 if (!theSize && !theCdb && !theRamFiles)
158 Comment << "error: content cfg `" << theKind << "': "
159 << "has neither size distribution nor content_db; "
160 << "either one or both must be specified for "
161 << "Polygaph to know what object sizes this content type "
162 << "should generate"
163 << endc << xexit;
164
165 configureInjections(cfg);
166
167 configureEncodings(cfg);
168
169 configureRndGeneration(cfg);
170
171 theClientBehaviorSym = cfg.clientBehavior();
172 }
173
configureInjections(const ContentSym & cfg)174 void ContentCfg::configureInjections(const ContentSym &cfg) {
175
176 if (const String &objKind = cfg.injectObject()) {
177 // convert external inject_object into internal InjectionAlgorithm
178 static StringIdentifier knownKinds;
179 if (!knownKinds.count()) {
180 knownKinds.add("db_text", ialgTextBetweenMarkup);
181 knownKinds.add("request_uri", ialgUriAtEnd);
182 }
183
184 const int id = knownKinds.lookup(objKind);
185 if (id <= 0) {
186 cerr << cfg.loc() << "unknown inject_object '" << objKind <<
187 "'; known objects are:";
188 for (StringIdentifier::Iter i = knownKinds.iterator(); i; ++i)
189 cerr << ' ' << i.str();
190 cerr << endl;
191 exit(-2);
192 }
193
194 theInjectionAlgorithm = static_cast<InjectionAlgorithm>(id);
195 }
196
197 if (const String &tdbName = cfg.injectDb()) {
198 ifstream f(tdbName.cstr());
199 theTdb = new TextDbase;
200 theTdb->load(f);
201 if (f.bad())
202 Comment << "error: cannot load text database from `"
203 << tdbName << "; " << Error::Last() << endc << xexit;
204 if (!theTdb->count())
205 Comment << "error: text database `"
206 << tdbName << " appears to be empty" << endc << xexit;
207
208 if (!theInjectionAlgorithm)
209 theInjectionAlgorithm = ialgTextBetweenMarkup; // default for tdb
210 }
211
212 theInjGap = cfg.injectGap();
213
214 const bool explicitProb = cfg.infectProb(theInfProb);
215 configureMimeHeaders(cfg);
216
217 // XXX: we should put all inject* fields into one PGL object
218 // does it look like we need to inject?
219 const char *need = 0;
220 if (explicitProb)
221 need = "infect_prob";
222 if (theInjectionAlgorithm)
223 need = "inject_object";
224 if (theTdb)
225 need = "inject_db";
226 if (theInjGap)
227 need = "inject_gap";
228 if (!need)
229 return;
230
231 // do we have what it takes?
232 const char *error = 0;
233 if (!explicitProb)
234 error = "lacks infect_prob";
235 if (!theInjectionAlgorithm && !theTdb)
236 error = "lacks either inject_object or inject_db";
237 if (theTdb && theInjectionAlgorithm != ialgTextBetweenMarkup) {
238 need = "inject_db";
239 error = "uses a conflicting inject_object value";
240 }
241 if (theTdb && !theInjGap) {
242 need = "inject_db";
243 error = "lacks inject_gap";
244 }
245 if (theInjGap && !theTdb) {
246 need = "inject_gap";
247 error = "lacks inject_db";
248 }
249 if (!error)
250 return;
251
252 Comment << cfg.loc() << "error: content cfg '" << theKind << "' " <<
253 "has " << need << " but " << error << endc << xexit;
254 }
255
configureEncodings(const ContentSym & cfg)256 void ContentCfg::configureEncodings(const ContentSym &cfg) {
257 theEncodings[codingIdentity] = theEncodings[codingGzip] = -1;
258
259 Strings encodings;
260 if (cfg.encodings(encodings)) {
261 for (int i = 0; i < encodings.count(); ++i) {
262 const String &encoding = *encodings[i];
263 if (encoding == "identity")
264 theEncodings[codingIdentity] = 0;
265 else
266 if (encoding == "gzip") {
267 if (zlib::Supported) {
268 theEncodings[codingGzip] = 6;
269 } else {
270 Comment << "error: support for 'gzip' content encoding " <<
271 "(fount in content cfg '" << theKind << "') has been " <<
272 "disabled" << endc << xexit;
273 }
274 } else
275 Comment << "error: unknown content encoding '" << encoding <<
276 "' in content cfg '" << theKind << "'; known codings are " <<
277 "'identity' and 'gzip'" << endc << xexit;
278 }
279 } else {
280 theEncodings[codingIdentity] = 0;
281 }
282 }
283
configureMimeHeaders(const ContentSym & cfg)284 void ContentCfg::configureMimeHeaders(const ContentSym &cfg) {
285 if (const ArraySym *const a = cfg.mimeHeaders())
286 theMimeHeaders = new MimeHeadersCfg(*a);
287 }
288
configureRndGeneration(const ContentSym & cfg)289 void ContentCfg::configureRndGeneration(const ContentSym &cfg) {
290 const String generator = cfg.generator();
291 if (!generator.len() || generator == "random_text")
292 generateText = true;
293 else
294 if (generator == "random_data")
295 generateText = false;
296 else {
297 Comment << "error: unknown content generation method " << generator <<
298 " in content cfg '" << theKind << "'; known methods are " <<
299 "random_text and random_data" << endc << xexit;
300 }
301 }
302
url_ext(int seed) const303 const String &ContentCfg::url_ext(int seed) const {
304 return pickStr(theExtensions, theExtSel, seed);
305 }
306
url_pfx(int seed) const307 const String &ContentCfg::url_pfx(int seed) const {
308 return pickStr(thePrefixes, thePfxSel, seed);
309 }
310
url_qry(int seed) const311 const String &ContentCfg::url_qry(int seed) const {
312 return pickStr(theQueries, theQrySel, seed);
313 }
314
repSizeMean() const315 double ContentCfg::repSizeMean() const {
316 Assert(theSize || theCdb || theRamFiles);
317 if (theSize)
318 return RndDistrStat(theSize).mean();
319 else
320 if (theCdb)
321 return theCdb->entrySizeMean();
322 else
323 return theRamFiles->fileSizeMean();
324 }
325
multipleContentCodings() const326 bool ContentCfg::multipleContentCodings() const {
327 return theEncodings[codingIdentity] >= 0 && theEncodings[codingGzip] >= 0;
328 }
329
calcTimes(const ObjId & oid,ObjTimes & times) const330 void ContentCfg::calcTimes(const ObjId &oid, ObjTimes ×) const {
331 const int seed = GlbPermut(oid.hash(), rndRepOlc);
332 theObjLifeCycle->calcTimes(seed, times);
333 }
334
calcContentCoding(ObjId & oid,const ReqHdr & req) const335 bool ContentCfg::calcContentCoding(ObjId &oid, const ReqHdr &req) const {
336 if (theEncodings[codingGzip] >= 0 && req.acceptedEncoding(codingGzip))
337 oid.gzipContent(true);
338 else
339 if (theEncodings[codingIdentity] >= 0 && req.acceptedEncoding(codingIdentity))
340 oid.gzipContent(false);
341 else
342 return false;
343 return true;
344 }
345
calcRawRepSize(const ObjId & oid,Size * suffixSizePtr) const346 Size ContentCfg::calcRawRepSize(const ObjId &oid, Size *suffixSizePtr) const {
347 Assert(theSize || theCdb || theRamFiles);
348 // make sure both prefix and suffix fit
349 const Size suffixSize = calcContentSuffixSize(oid);
350 const Size extras = calcContentPrefixSize(oid) + suffixSize;
351 if (suffixSizePtr)
352 *suffixSizePtr = suffixSize;
353 if (theSize) {
354 const int seed = GlbPermut(contentHash(oid), rndRepSize);
355 theSize->rndGen()->seed(seed);
356 const double dh = theSize->trial();
357 // prevent int overflows and leave room for headers
358 Size sz = (int)MiniMax((double)extras,
359 ceil(dh), (double)INT_MAX - 100*1024);
360 // paranoid sanity checks
361 if (!Should(sz >= extras))
362 sz = extras;
363 if (!Should(sz >= 0))
364 sz = 0;
365 return sz;
366 } else
367 if (theCdb) {
368 const int start = selectCdbStart(oid);
369 CdbEntryPrnOpt opt;
370 // assume that buf, injector, and rng are not needed
371 opt.embed.model = theEmbedCont;
372 opt.embed.container = oid;
373 opt.sizeMax = Size(INT_MAX); // Size::Max();
374 opt.entryOff = 0;
375 return theCdb->entry(start)->size(opt) + extras;
376 } else {
377 const Size fileSize = ramFile(oid).body.len();
378 return fileSize + extras;
379 }
380 }
381
calcFullEntitySize(const ObjId & oid)382 Size ContentCfg::calcFullEntitySize(const ObjId &oid) {
383 BodyIter &i = *getBodyIter(oid);
384 const Size res = i.fullEntitySize();
385 i.putBack();
386 return res;
387 }
388
calcCachability(const ObjId & oid) const389 bool ContentCfg::calcCachability(const ObjId &oid) const {
390 const int seed = GlbPermut(oid.hash(), rndRepCach);
391 RndGen rng(seed);
392 return rng.event(theChbRatio);
393 }
394
calcChecksumNeed(const ObjId & oid) const395 bool ContentCfg::calcChecksumNeed(const ObjId &oid) const {
396 const int seed = GlbPermut(oid.hash(), rndRepCheckNeed);
397 RndGen rng(seed);
398 return rng.event(theChecksumRatio);
399 }
400
calcChecksum(const ObjId & oid)401 xstd::Checksum ContentCfg::calcChecksum(const ObjId &oid) {
402 WrBuf buf;
403 xstd::ChecksumAlg checkAlg;
404 BodyIter &i = *getBodyIter(oid);
405 i.start(&buf);
406 while (i) {
407 i.pour();
408 checkAlg.update(buf.content(), buf.contSize());
409 buf.reset();
410 }
411 i.putBack();
412 checkAlg.final();
413
414 return checkAlg.sum();
415 }
416
shouldInject(const ObjId & oid) const417 bool ContentCfg::shouldInject(const ObjId &oid) const {
418 if (theInfProb <= 0)
419 return false;
420 RndGen rng(GlbPermut(contentHash(oid), rndInjProb));
421 return rng.event(theInfProb);
422 }
423
calcContentPrefixSize(const ObjId & oid) const424 Size ContentCfg::calcContentPrefixSize(const ObjId &oid) const {
425 switch (contentUniqueness(oid)) {
426 case cuUnique: {
427 IOBuf buf;
428 return pourUniqueContentPrefix(oid, buf);
429 }
430 case cuCommon:
431 return theCommonPrefix.len();
432
433 case cuChance:
434 default:
435 return 0;
436 }
437 }
438
calcContentSuffixSize(const ObjId & oid) const439 Size ContentCfg::calcContentSuffixSize(const ObjId &oid) const {
440 if (theInjectionAlgorithm != ialgUriAtEnd)
441 return 0; // no suffix configured at all
442
443 if (!shouldInject(oid))
444 return 0; // this object does not need a suffix
445
446 WrBuf buf;
447 return pourContentSuffix(oid, buf);
448 }
449
pourContentPrefix(const ObjId & oid,IOBuf & buf) const450 Size ContentCfg::pourContentPrefix(const ObjId &oid, IOBuf &buf) const {
451 switch (contentUniqueness(oid)) {
452 case cuUnique:
453 return pourUniqueContentPrefix(oid, buf);
454
455 case cuCommon: {
456 buf.append(theCommonPrefix.data(), theCommonPrefix.len());
457 return theCommonPrefix.len();
458 }
459 case cuChance:
460 default:
461 return 0;
462 }
463 }
464
465 // A simpler implementation would overwrite the tail of the poured response,
466 // but that does not work for tiny responses, may malform some cdb entries,
467 // and may even overwrite already buffered headers.
pourContentSuffix(const ObjId & oid,IOBuf & buf) const468 Size ContentCfg::pourContentSuffix(const ObjId &oid, IOBuf &buf) const {
469 if (theInjectionAlgorithm != ialgUriAtEnd)
470 return 0; // OK, no suffix configured at all
471
472 if (!shouldInject(oid))
473 return 0; // OK, this object does not need a suffix
474
475 // will need to be larger if we want to accomodate "long" foreign URIs
476 char sfx[8*1024];
477 ofixedstream os(sfx, sizeof(sfx));
478 const Size size = pourUri(oid, os);
479 if (size <= buf.spaceSize()) {
480 buf.append(sfx, size);
481 return size;
482 }
483 Should(size <= buf.capacity());
484 return 0; // and wait for the buffer to drain
485 }
486
contentUniqueness(const ObjId & oid) const487 int ContentCfg::contentUniqueness(const ObjId &oid) const {
488 if (theUniqueRatio < 0)
489 return cuChance; // default: leave it to chance or other factors
490
491 // no sense in generating content [prefix] for foreign oids
492 if (!Should(!oid.foreignUrl() && !oid.foreignSrc()))
493 return cuChance;
494
495 const int seed = GlbPermut(oid.hash(), rndUniqueContent);
496 RndGen rng(seed);
497 return rng.event(theUniqueRatio) ? cuUnique : cuCommon;
498 }
499
500 // internal method, should be called only if uniqueContent()
pourUniqueContentPrefix(const ObjId & oid,IOBuf & buf) const501 Size ContentCfg::pourUniqueContentPrefix(const ObjId &oid, IOBuf &buf) const {
502 // mimic Oid2Url() but do not use TheViservs and such, just indeces
503 ofixedstream os(buf.space(), buf.spaceSize());
504 const Size size = pourUri(oid, os);
505 Should(size < buf.spaceSize()); // otherwise may be too big
506 buf.appended(size);
507 return size;
508 }
509
pourUri(const ObjId & oid,ostream & os) const510 Size ContentCfg::pourUri(const ObjId &oid, ostream &os) const {
511 // mimic Oid2Url() but do not use TheViservs and such, just indeces
512 // if we need to allow truncated URIs, we can start with oid.hash()
513
514 os.write(" u:", 3); // a prefix to ease debugging/tracing
515 if (oid.foreignUrl()) {
516 os << oid.foreignUrl();
517 } else {
518 os << hex << setfill('0');
519 if (oid.secure())
520 os.write("s/", 2);
521 else
522 os << oid.scheme() << '/';
523 os << 'v' << setw(3) << oid.viserv() << '/' <<
524 'w' << oid.world() << '/' <<
525 't' << setw(2) << oid.type() << '/' <<
526 '_' << setw(16) << oid.name();
527 }
528 os << ' ';
529 os.flush();
530 return static_cast<std::streamoff>(os.tellp()); // poured size
531 }
532
contentHash(const ObjId & oid) const533 int ContentCfg::contentHash(const ObjId &oid) const {
534 if (contentUniqueness(oid) == cuCommon)
535 return theId;
536 else
537 return oid.hash();
538 }
539
selectCdbStart(const ObjId & oid) const540 int ContentCfg::selectCdbStart(const ObjId &oid) const {
541 Assert(theCdb);
542 RndGen rng(GlbPermut(contentHash(oid), rndCdbStart));
543 return rng(0, theCdb->count());
544 }
545
ramFile(const ObjId & oid) const546 const RamFile &ContentCfg::ramFile(const ObjId &oid) const {
547 Assert(theRamFiles && oid);
548 const int index = (oid.name() - 1) % theRamFiles->count();
549 return theRamFiles->fileAt(index);
550 }
551
pickStr(const Strings & strings,RndDistr * sel,int seed) const552 const String &ContentCfg::pickStr(const Strings &strings, RndDistr *sel, int seed) const {
553 static String noStr = 0;
554 if (const int count = strings.count()) {
555 sel->rndGen()->seed(seed);
556 const int idx = (int)sel->trial();
557 Assert(0 <= idx && idx < count);
558 return *strings[idx];
559 }
560 return noStr;
561 }
562
compContPerCall(const ContentCfg * cc) const563 double ContentCfg::compContPerCall(const ContentCfg *cc) const {
564 if (cc->id() == id())
565 return 1.0;
566
567 if (theEmbedCont)
568 return theEmbedCont->compContPerCall(cc);
569
570 return 0.0;
571 }
572
rndBuf() const573 const RndBuf &ContentCfg::rndBuf() const {
574 return generateText ? RndText() : RndBinary();
575 }
576
577 // XXX: iterators should be farmed, but it is hard because they
578 // come in different types (perhaps somebody else should farm them?)
getBodyIter(const ObjId & oid,const RangeList * const ranges)579 BodyIter *ContentCfg::getBodyIter(const ObjId &oid, const RangeList *const ranges) {
580 BodyIter *res = 0;
581
582 if (theCdb) {
583 CdbBodyIter *i = new CdbBodyIter;
584 i->cdb(theCdb);
585 if (theEmbedCont)
586 i->embedContModel(theEmbedCont);
587 i->startPos(selectCdbStart(oid));
588
589 if (theTdb) {
590 if (shouldInject(oid)) {
591 InjectIter *inj = new InjectIter; // XXX: Farm these!
592 inj->creator(this);
593 inj->textDbase(theTdb);
594 inj->gap(theInjGap);
595 i->injector(inj);
596 }
597 }
598
599 res = i;
600 } else
601 if (theRamFiles) {
602 RamFileBodyIter *i = new RamFileBodyIter;
603 i->file(ramFile(oid));
604 res = i;
605 } else
606 if (theEmbedCont) {
607 ContainerBodyIter *i = new ContainerBodyIter;
608 i->embedContModel(theEmbedCont);
609 res = i;
610 } else {
611 RndBodyIter *i = new RndBodyIter;
612 res = i;
613 }
614
615 if (Should(res)) {
616 // keep in sync with GzipEncoder ctor
617 res->contentCfg(this);
618 res->oidCfg(oid, contentHash(oid));
619 Size suffixSize;
620 const Size rawRepSize = calcRawRepSize(oid, &suffixSize);
621 res->contentSize(rawRepSize, suffixSize);
622 if (oid.gzipContent())
623 res = new GzipEncoder(theEncodings[codingGzip], res);
624 if (oid.range() && ranges)
625 res = new RangeBodyIter(*ranges, res);
626 }
627
628 return res;
629 }
630
putBodyIter(BodyIter * i) const631 void ContentCfg::putBodyIter(BodyIter *i) const {
632 i->stop();
633 delete i;
634 }
635
putInjector(InjectIter * i) const636 void ContentCfg::putInjector(InjectIter *i) const {
637 delete i; // XXX: Farm these?
638 }
639