181ad6265SDimitry Andric //===--- SourceLocationEncoding.h - Small serialized locations --*- C++ -*-===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric //
981ad6265SDimitry Andric // Source locations are stored pervasively in the AST, making up a third of
1081ad6265SDimitry Andric // the size of typical serialized files. Storing them efficiently is important.
1181ad6265SDimitry Andric //
1281ad6265SDimitry Andric // We use integers optimized by VBR-encoding, because:
13*bdd1243dSDimitry Andric // - when abbreviations cannot be used, VBR6 encoding is our only choice
1481ad6265SDimitry Andric // - in the worst case a SourceLocation can be ~any 32-bit number, but in
1581ad6265SDimitry Andric // practice they are highly predictable
1681ad6265SDimitry Andric //
1781ad6265SDimitry Andric // We encode the integer so that likely values encode as small numbers that
1881ad6265SDimitry Andric // turn into few VBR chunks:
1981ad6265SDimitry Andric // - the invalid sentinel location is a very common value: it encodes as 0
2081ad6265SDimitry Andric // - the "macro or not" bit is stored at the bottom of the integer
2181ad6265SDimitry Andric // (rather than at the top, as in memory), so macro locations can have
2281ad6265SDimitry Andric // small representations.
2381ad6265SDimitry Andric // - related locations (e.g. of a left and right paren pair) are usually
2481ad6265SDimitry Andric // similar, so when encoding a sequence of locations we store only
2581ad6265SDimitry Andric // differences between successive elements.
2681ad6265SDimitry Andric //
2781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
2881ad6265SDimitry Andric
2981ad6265SDimitry Andric #include <climits>
3081ad6265SDimitry Andric #include "clang/Basic/SourceLocation.h"
3181ad6265SDimitry Andric
3281ad6265SDimitry Andric #ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
3381ad6265SDimitry Andric #define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
3481ad6265SDimitry Andric
3581ad6265SDimitry Andric namespace clang {
3681ad6265SDimitry Andric class SourceLocationSequence;
3781ad6265SDimitry Andric
3881ad6265SDimitry Andric /// Serialized encoding of SourceLocations without context.
3981ad6265SDimitry Andric /// Optimized to have small unsigned values (=> small after VBR encoding).
4081ad6265SDimitry Andric ///
4181ad6265SDimitry Andric // Macro locations have the top bit set, we rotate by one so it is the low bit.
4281ad6265SDimitry Andric class SourceLocationEncoding {
4381ad6265SDimitry Andric using UIntTy = SourceLocation::UIntTy;
4481ad6265SDimitry Andric constexpr static unsigned UIntBits = CHAR_BIT * sizeof(UIntTy);
4581ad6265SDimitry Andric
encodeRaw(UIntTy Raw)4681ad6265SDimitry Andric static UIntTy encodeRaw(UIntTy Raw) {
4781ad6265SDimitry Andric return (Raw << 1) | (Raw >> (UIntBits - 1));
4881ad6265SDimitry Andric }
decodeRaw(UIntTy Raw)4981ad6265SDimitry Andric static UIntTy decodeRaw(UIntTy Raw) {
5081ad6265SDimitry Andric return (Raw >> 1) | (Raw << (UIntBits - 1));
5181ad6265SDimitry Andric }
5281ad6265SDimitry Andric friend SourceLocationSequence;
5381ad6265SDimitry Andric
5481ad6265SDimitry Andric public:
5581ad6265SDimitry Andric static uint64_t encode(SourceLocation Loc,
5681ad6265SDimitry Andric SourceLocationSequence * = nullptr);
5781ad6265SDimitry Andric static SourceLocation decode(uint64_t, SourceLocationSequence * = nullptr);
5881ad6265SDimitry Andric };
5981ad6265SDimitry Andric
6081ad6265SDimitry Andric /// Serialized encoding of a sequence of SourceLocations.
6181ad6265SDimitry Andric ///
6281ad6265SDimitry Andric /// Optimized to produce small values when locations with the sequence are
6381ad6265SDimitry Andric /// similar. Each element can be delta-encoded against the last nonzero element.
6481ad6265SDimitry Andric ///
6581ad6265SDimitry Andric /// Sequences should be started by creating a SourceLocationSequence::State,
6681ad6265SDimitry Andric /// and then passed around as SourceLocationSequence*. Example:
6781ad6265SDimitry Andric ///
6881ad6265SDimitry Andric /// // establishes a sequence
6981ad6265SDimitry Andric /// void EmitTopLevelThing() {
7081ad6265SDimitry Andric /// SourceLocationSequence::State Seq;
7181ad6265SDimitry Andric /// EmitContainedThing(Seq);
7281ad6265SDimitry Andric /// EmitRecursiveThing(Seq);
7381ad6265SDimitry Andric /// }
7481ad6265SDimitry Andric ///
7581ad6265SDimitry Andric /// // optionally part of a sequence
7681ad6265SDimitry Andric /// void EmitContainedThing(SourceLocationSequence *Seq = nullptr) {
7781ad6265SDimitry Andric /// Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
7881ad6265SDimitry Andric /// }
7981ad6265SDimitry Andric ///
8081ad6265SDimitry Andric /// // establishes a sequence if there isn't one already
8181ad6265SDimitry Andric /// void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) {
8281ad6265SDimitry Andric /// SourceLocationSequence::State Seq(ParentSeq);
8381ad6265SDimitry Andric /// Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
8481ad6265SDimitry Andric /// EmitRecursiveThing(Seq);
8581ad6265SDimitry Andric /// }
8681ad6265SDimitry Andric ///
8781ad6265SDimitry Andric class SourceLocationSequence {
8881ad6265SDimitry Andric using UIntTy = SourceLocation::UIntTy;
8981ad6265SDimitry Andric using EncodedTy = uint64_t;
9081ad6265SDimitry Andric constexpr static auto UIntBits = SourceLocationEncoding::UIntBits;
9181ad6265SDimitry Andric static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!");
9281ad6265SDimitry Andric
9381ad6265SDimitry Andric // Prev stores the rotated last nonzero location.
9481ad6265SDimitry Andric UIntTy &Prev;
9581ad6265SDimitry Andric
9681ad6265SDimitry Andric // Zig-zag encoding turns small signed integers into small unsigned integers.
9781ad6265SDimitry Andric // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ...
zigZag(UIntTy V)9881ad6265SDimitry Andric static UIntTy zigZag(UIntTy V) {
9981ad6265SDimitry Andric UIntTy Sign = (V & (1 << (UIntBits - 1))) ? UIntTy(-1) : UIntTy(0);
10081ad6265SDimitry Andric return Sign ^ (V << 1);
10181ad6265SDimitry Andric }
zagZig(UIntTy V)10281ad6265SDimitry Andric static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ -(V & 1); }
10381ad6265SDimitry Andric
SourceLocationSequence(UIntTy & Prev)10481ad6265SDimitry Andric SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {}
10581ad6265SDimitry Andric
encodeRaw(UIntTy Raw)10681ad6265SDimitry Andric EncodedTy encodeRaw(UIntTy Raw) {
10781ad6265SDimitry Andric if (Raw == 0)
10881ad6265SDimitry Andric return 0;
10981ad6265SDimitry Andric UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw);
11081ad6265SDimitry Andric if (Prev == 0)
11181ad6265SDimitry Andric return Prev = Rotated;
11281ad6265SDimitry Andric UIntTy Delta = Rotated - Prev;
11381ad6265SDimitry Andric Prev = Rotated;
11481ad6265SDimitry Andric // Exactly one 33 bit value is possible! (1 << 32).
11581ad6265SDimitry Andric // This is because we have two representations of zero: trivial & relative.
11681ad6265SDimitry Andric return 1 + EncodedTy{zigZag(Delta)};
11781ad6265SDimitry Andric }
decodeRaw(EncodedTy Encoded)11881ad6265SDimitry Andric UIntTy decodeRaw(EncodedTy Encoded) {
11981ad6265SDimitry Andric if (Encoded == 0)
12081ad6265SDimitry Andric return 0;
12181ad6265SDimitry Andric if (Prev == 0)
12281ad6265SDimitry Andric return SourceLocationEncoding::decodeRaw(Prev = Encoded);
12381ad6265SDimitry Andric return SourceLocationEncoding::decodeRaw(Prev += zagZig(Encoded - 1));
12481ad6265SDimitry Andric }
12581ad6265SDimitry Andric
12681ad6265SDimitry Andric public:
decode(EncodedTy Encoded)12781ad6265SDimitry Andric SourceLocation decode(EncodedTy Encoded) {
12881ad6265SDimitry Andric return SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
12981ad6265SDimitry Andric }
encode(SourceLocation Loc)13081ad6265SDimitry Andric EncodedTy encode(SourceLocation Loc) {
13181ad6265SDimitry Andric return encodeRaw(Loc.getRawEncoding());
13281ad6265SDimitry Andric }
13381ad6265SDimitry Andric
13481ad6265SDimitry Andric class State;
13581ad6265SDimitry Andric };
13681ad6265SDimitry Andric
13781ad6265SDimitry Andric /// This object establishes a SourceLocationSequence.
13881ad6265SDimitry Andric class SourceLocationSequence::State {
13981ad6265SDimitry Andric UIntTy Prev = 0;
14081ad6265SDimitry Andric SourceLocationSequence Seq;
14181ad6265SDimitry Andric
14281ad6265SDimitry Andric public:
14381ad6265SDimitry Andric // If Parent is provided and non-null, then this root becomes part of that
14481ad6265SDimitry Andric // enclosing sequence instead of establishing a new one.
14581ad6265SDimitry Andric State(SourceLocationSequence *Parent = nullptr)
14681ad6265SDimitry Andric : Seq(Parent ? Parent->Prev : Prev) {}
14781ad6265SDimitry Andric
14881ad6265SDimitry Andric // Implicit conversion for uniform use of roots vs propagated sequences.
14981ad6265SDimitry Andric operator SourceLocationSequence *() { return &Seq; }
15081ad6265SDimitry Andric };
15181ad6265SDimitry Andric
encode(SourceLocation Loc,SourceLocationSequence * Seq)15281ad6265SDimitry Andric inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc,
15381ad6265SDimitry Andric SourceLocationSequence *Seq) {
15481ad6265SDimitry Andric return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding());
15581ad6265SDimitry Andric }
15681ad6265SDimitry Andric inline SourceLocation
decode(uint64_t Encoded,SourceLocationSequence * Seq)15781ad6265SDimitry Andric SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) {
15881ad6265SDimitry Andric return Seq ? Seq->decode(Encoded)
15981ad6265SDimitry Andric : SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
16081ad6265SDimitry Andric }
16181ad6265SDimitry Andric
16281ad6265SDimitry Andric } // namespace clang
16381ad6265SDimitry Andric #endif
164