181ad6265SDimitry Andric //===--- SourceLocationEncoding.h - Small serialized locations --*- C++ -*-===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric //
981ad6265SDimitry Andric // Source locations are stored pervasively in the AST, making up a third of
1081ad6265SDimitry Andric // the size of typical serialized files. Storing them efficiently is important.
1181ad6265SDimitry Andric //
1281ad6265SDimitry Andric // We use integers optimized by VBR-encoding, because:
13*bdd1243dSDimitry Andric //  - when abbreviations cannot be used, VBR6 encoding is our only choice
1481ad6265SDimitry Andric //  - in the worst case a SourceLocation can be ~any 32-bit number, but in
1581ad6265SDimitry Andric //    practice they are highly predictable
1681ad6265SDimitry Andric //
1781ad6265SDimitry Andric // We encode the integer so that likely values encode as small numbers that
1881ad6265SDimitry Andric // turn into few VBR chunks:
1981ad6265SDimitry Andric //  - the invalid sentinel location is a very common value: it encodes as 0
2081ad6265SDimitry Andric //  - the "macro or not" bit is stored at the bottom of the integer
2181ad6265SDimitry Andric //    (rather than at the top, as in memory), so macro locations can have
2281ad6265SDimitry Andric //    small representations.
2381ad6265SDimitry Andric //  - related locations (e.g. of a left and right paren pair) are usually
2481ad6265SDimitry Andric //    similar, so when encoding a sequence of locations we store only
2581ad6265SDimitry Andric //    differences between successive elements.
2681ad6265SDimitry Andric //
2781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
2881ad6265SDimitry Andric 
2981ad6265SDimitry Andric #include <climits>
3081ad6265SDimitry Andric #include "clang/Basic/SourceLocation.h"
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric #ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
3381ad6265SDimitry Andric #define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
3481ad6265SDimitry Andric 
3581ad6265SDimitry Andric namespace clang {
3681ad6265SDimitry Andric class SourceLocationSequence;
3781ad6265SDimitry Andric 
3881ad6265SDimitry Andric /// Serialized encoding of SourceLocations without context.
3981ad6265SDimitry Andric /// Optimized to have small unsigned values (=> small after VBR encoding).
4081ad6265SDimitry Andric ///
4181ad6265SDimitry Andric // Macro locations have the top bit set, we rotate by one so it is the low bit.
4281ad6265SDimitry Andric class SourceLocationEncoding {
4381ad6265SDimitry Andric   using UIntTy = SourceLocation::UIntTy;
4481ad6265SDimitry Andric   constexpr static unsigned UIntBits = CHAR_BIT * sizeof(UIntTy);
4581ad6265SDimitry Andric 
encodeRaw(UIntTy Raw)4681ad6265SDimitry Andric   static UIntTy encodeRaw(UIntTy Raw) {
4781ad6265SDimitry Andric     return (Raw << 1) | (Raw >> (UIntBits - 1));
4881ad6265SDimitry Andric   }
decodeRaw(UIntTy Raw)4981ad6265SDimitry Andric   static UIntTy decodeRaw(UIntTy Raw) {
5081ad6265SDimitry Andric     return (Raw >> 1) | (Raw << (UIntBits - 1));
5181ad6265SDimitry Andric   }
5281ad6265SDimitry Andric   friend SourceLocationSequence;
5381ad6265SDimitry Andric 
5481ad6265SDimitry Andric public:
5581ad6265SDimitry Andric   static uint64_t encode(SourceLocation Loc,
5681ad6265SDimitry Andric                          SourceLocationSequence * = nullptr);
5781ad6265SDimitry Andric   static SourceLocation decode(uint64_t, SourceLocationSequence * = nullptr);
5881ad6265SDimitry Andric };
5981ad6265SDimitry Andric 
6081ad6265SDimitry Andric /// Serialized encoding of a sequence of SourceLocations.
6181ad6265SDimitry Andric ///
6281ad6265SDimitry Andric /// Optimized to produce small values when locations with the sequence are
6381ad6265SDimitry Andric /// similar. Each element can be delta-encoded against the last nonzero element.
6481ad6265SDimitry Andric ///
6581ad6265SDimitry Andric /// Sequences should be started by creating a SourceLocationSequence::State,
6681ad6265SDimitry Andric /// and then passed around as SourceLocationSequence*. Example:
6781ad6265SDimitry Andric ///
6881ad6265SDimitry Andric ///   // establishes a sequence
6981ad6265SDimitry Andric ///   void EmitTopLevelThing() {
7081ad6265SDimitry Andric ///     SourceLocationSequence::State Seq;
7181ad6265SDimitry Andric ///     EmitContainedThing(Seq);
7281ad6265SDimitry Andric ///     EmitRecursiveThing(Seq);
7381ad6265SDimitry Andric ///   }
7481ad6265SDimitry Andric ///
7581ad6265SDimitry Andric ///   // optionally part of a sequence
7681ad6265SDimitry Andric ///   void EmitContainedThing(SourceLocationSequence *Seq = nullptr) {
7781ad6265SDimitry Andric ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
7881ad6265SDimitry Andric ///   }
7981ad6265SDimitry Andric ///
8081ad6265SDimitry Andric ///   // establishes a sequence if there isn't one already
8181ad6265SDimitry Andric ///   void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) {
8281ad6265SDimitry Andric ///     SourceLocationSequence::State Seq(ParentSeq);
8381ad6265SDimitry Andric ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
8481ad6265SDimitry Andric ///     EmitRecursiveThing(Seq);
8581ad6265SDimitry Andric ///   }
8681ad6265SDimitry Andric ///
8781ad6265SDimitry Andric class SourceLocationSequence {
8881ad6265SDimitry Andric   using UIntTy = SourceLocation::UIntTy;
8981ad6265SDimitry Andric   using EncodedTy = uint64_t;
9081ad6265SDimitry Andric   constexpr static auto UIntBits = SourceLocationEncoding::UIntBits;
9181ad6265SDimitry Andric   static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!");
9281ad6265SDimitry Andric 
9381ad6265SDimitry Andric   // Prev stores the rotated last nonzero location.
9481ad6265SDimitry Andric   UIntTy &Prev;
9581ad6265SDimitry Andric 
9681ad6265SDimitry Andric   // Zig-zag encoding turns small signed integers into small unsigned integers.
9781ad6265SDimitry Andric   // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ...
zigZag(UIntTy V)9881ad6265SDimitry Andric   static UIntTy zigZag(UIntTy V) {
9981ad6265SDimitry Andric     UIntTy Sign = (V & (1 << (UIntBits - 1))) ? UIntTy(-1) : UIntTy(0);
10081ad6265SDimitry Andric     return Sign ^ (V << 1);
10181ad6265SDimitry Andric   }
zagZig(UIntTy V)10281ad6265SDimitry Andric   static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ -(V & 1); }
10381ad6265SDimitry Andric 
SourceLocationSequence(UIntTy & Prev)10481ad6265SDimitry Andric   SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {}
10581ad6265SDimitry Andric 
encodeRaw(UIntTy Raw)10681ad6265SDimitry Andric   EncodedTy encodeRaw(UIntTy Raw) {
10781ad6265SDimitry Andric     if (Raw == 0)
10881ad6265SDimitry Andric       return 0;
10981ad6265SDimitry Andric     UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw);
11081ad6265SDimitry Andric     if (Prev == 0)
11181ad6265SDimitry Andric       return Prev = Rotated;
11281ad6265SDimitry Andric     UIntTy Delta = Rotated - Prev;
11381ad6265SDimitry Andric     Prev = Rotated;
11481ad6265SDimitry Andric     // Exactly one 33 bit value is possible! (1 << 32).
11581ad6265SDimitry Andric     // This is because we have two representations of zero: trivial & relative.
11681ad6265SDimitry Andric     return 1 + EncodedTy{zigZag(Delta)};
11781ad6265SDimitry Andric   }
decodeRaw(EncodedTy Encoded)11881ad6265SDimitry Andric   UIntTy decodeRaw(EncodedTy Encoded) {
11981ad6265SDimitry Andric     if (Encoded == 0)
12081ad6265SDimitry Andric       return 0;
12181ad6265SDimitry Andric     if (Prev == 0)
12281ad6265SDimitry Andric       return SourceLocationEncoding::decodeRaw(Prev = Encoded);
12381ad6265SDimitry Andric     return SourceLocationEncoding::decodeRaw(Prev += zagZig(Encoded - 1));
12481ad6265SDimitry Andric   }
12581ad6265SDimitry Andric 
12681ad6265SDimitry Andric public:
decode(EncodedTy Encoded)12781ad6265SDimitry Andric   SourceLocation decode(EncodedTy Encoded) {
12881ad6265SDimitry Andric     return SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
12981ad6265SDimitry Andric   }
encode(SourceLocation Loc)13081ad6265SDimitry Andric   EncodedTy encode(SourceLocation Loc) {
13181ad6265SDimitry Andric     return encodeRaw(Loc.getRawEncoding());
13281ad6265SDimitry Andric   }
13381ad6265SDimitry Andric 
13481ad6265SDimitry Andric   class State;
13581ad6265SDimitry Andric };
13681ad6265SDimitry Andric 
13781ad6265SDimitry Andric /// This object establishes a SourceLocationSequence.
13881ad6265SDimitry Andric class SourceLocationSequence::State {
13981ad6265SDimitry Andric   UIntTy Prev = 0;
14081ad6265SDimitry Andric   SourceLocationSequence Seq;
14181ad6265SDimitry Andric 
14281ad6265SDimitry Andric public:
14381ad6265SDimitry Andric   // If Parent is provided and non-null, then this root becomes part of that
14481ad6265SDimitry Andric   // enclosing sequence instead of establishing a new one.
14581ad6265SDimitry Andric   State(SourceLocationSequence *Parent = nullptr)
14681ad6265SDimitry Andric       : Seq(Parent ? Parent->Prev : Prev) {}
14781ad6265SDimitry Andric 
14881ad6265SDimitry Andric   // Implicit conversion for uniform use of roots vs propagated sequences.
14981ad6265SDimitry Andric   operator SourceLocationSequence *() { return &Seq; }
15081ad6265SDimitry Andric };
15181ad6265SDimitry Andric 
encode(SourceLocation Loc,SourceLocationSequence * Seq)15281ad6265SDimitry Andric inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc,
15381ad6265SDimitry Andric                                                SourceLocationSequence *Seq) {
15481ad6265SDimitry Andric   return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding());
15581ad6265SDimitry Andric }
15681ad6265SDimitry Andric inline SourceLocation
decode(uint64_t Encoded,SourceLocationSequence * Seq)15781ad6265SDimitry Andric SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) {
15881ad6265SDimitry Andric   return Seq ? Seq->decode(Encoded)
15981ad6265SDimitry Andric              : SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
16081ad6265SDimitry Andric }
16181ad6265SDimitry Andric 
16281ad6265SDimitry Andric } // namespace clang
16381ad6265SDimitry Andric #endif
164