1*12c85518Srobert //===--- SourceLocationEncoding.h - Small serialized locations --*- C++ -*-===//
2*12c85518Srobert //
3*12c85518Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*12c85518Srobert // See https://llvm.org/LICENSE.txt for license information.
5*12c85518Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*12c85518Srobert //
7*12c85518Srobert //===----------------------------------------------------------------------===//
8*12c85518Srobert //
9*12c85518Srobert // Source locations are stored pervasively in the AST, making up a third of
10*12c85518Srobert // the size of typical serialized files. Storing them efficiently is important.
11*12c85518Srobert //
12*12c85518Srobert // We use integers optimized by VBR-encoding, because:
13*12c85518Srobert //  - when abbreviations cannot be used, VBR6 encoding is our only choice
14*12c85518Srobert //  - in the worst case a SourceLocation can be ~any 32-bit number, but in
15*12c85518Srobert //    practice they are highly predictable
16*12c85518Srobert //
17*12c85518Srobert // We encode the integer so that likely values encode as small numbers that
18*12c85518Srobert // turn into few VBR chunks:
19*12c85518Srobert //  - the invalid sentinel location is a very common value: it encodes as 0
20*12c85518Srobert //  - the "macro or not" bit is stored at the bottom of the integer
21*12c85518Srobert //    (rather than at the top, as in memory), so macro locations can have
22*12c85518Srobert //    small representations.
23*12c85518Srobert //  - related locations (e.g. of a left and right paren pair) are usually
24*12c85518Srobert //    similar, so when encoding a sequence of locations we store only
25*12c85518Srobert //    differences between successive elements.
26*12c85518Srobert //
27*12c85518Srobert //===----------------------------------------------------------------------===//
28*12c85518Srobert 
29*12c85518Srobert #include <climits>
30*12c85518Srobert #include "clang/Basic/SourceLocation.h"
31*12c85518Srobert 
32*12c85518Srobert #ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
33*12c85518Srobert #define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
34*12c85518Srobert 
35*12c85518Srobert namespace clang {
36*12c85518Srobert class SourceLocationSequence;
37*12c85518Srobert 
38*12c85518Srobert /// Serialized encoding of SourceLocations without context.
39*12c85518Srobert /// Optimized to have small unsigned values (=> small after VBR encoding).
40*12c85518Srobert ///
41*12c85518Srobert // Macro locations have the top bit set, we rotate by one so it is the low bit.
42*12c85518Srobert class SourceLocationEncoding {
43*12c85518Srobert   using UIntTy = SourceLocation::UIntTy;
44*12c85518Srobert   constexpr static unsigned UIntBits = CHAR_BIT * sizeof(UIntTy);
45*12c85518Srobert 
encodeRaw(UIntTy Raw)46*12c85518Srobert   static UIntTy encodeRaw(UIntTy Raw) {
47*12c85518Srobert     return (Raw << 1) | (Raw >> (UIntBits - 1));
48*12c85518Srobert   }
decodeRaw(UIntTy Raw)49*12c85518Srobert   static UIntTy decodeRaw(UIntTy Raw) {
50*12c85518Srobert     return (Raw >> 1) | (Raw << (UIntBits - 1));
51*12c85518Srobert   }
52*12c85518Srobert   friend SourceLocationSequence;
53*12c85518Srobert 
54*12c85518Srobert public:
55*12c85518Srobert   static uint64_t encode(SourceLocation Loc,
56*12c85518Srobert                          SourceLocationSequence * = nullptr);
57*12c85518Srobert   static SourceLocation decode(uint64_t, SourceLocationSequence * = nullptr);
58*12c85518Srobert };
59*12c85518Srobert 
60*12c85518Srobert /// Serialized encoding of a sequence of SourceLocations.
61*12c85518Srobert ///
62*12c85518Srobert /// Optimized to produce small values when locations with the sequence are
63*12c85518Srobert /// similar. Each element can be delta-encoded against the last nonzero element.
64*12c85518Srobert ///
65*12c85518Srobert /// Sequences should be started by creating a SourceLocationSequence::State,
66*12c85518Srobert /// and then passed around as SourceLocationSequence*. Example:
67*12c85518Srobert ///
68*12c85518Srobert ///   // establishes a sequence
69*12c85518Srobert ///   void EmitTopLevelThing() {
70*12c85518Srobert ///     SourceLocationSequence::State Seq;
71*12c85518Srobert ///     EmitContainedThing(Seq);
72*12c85518Srobert ///     EmitRecursiveThing(Seq);
73*12c85518Srobert ///   }
74*12c85518Srobert ///
75*12c85518Srobert ///   // optionally part of a sequence
76*12c85518Srobert ///   void EmitContainedThing(SourceLocationSequence *Seq = nullptr) {
77*12c85518Srobert ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
78*12c85518Srobert ///   }
79*12c85518Srobert ///
80*12c85518Srobert ///   // establishes a sequence if there isn't one already
81*12c85518Srobert ///   void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) {
82*12c85518Srobert ///     SourceLocationSequence::State Seq(ParentSeq);
83*12c85518Srobert ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
84*12c85518Srobert ///     EmitRecursiveThing(Seq);
85*12c85518Srobert ///   }
86*12c85518Srobert ///
87*12c85518Srobert class SourceLocationSequence {
88*12c85518Srobert   using UIntTy = SourceLocation::UIntTy;
89*12c85518Srobert   using EncodedTy = uint64_t;
90*12c85518Srobert   constexpr static auto UIntBits = SourceLocationEncoding::UIntBits;
91*12c85518Srobert   static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!");
92*12c85518Srobert 
93*12c85518Srobert   // Prev stores the rotated last nonzero location.
94*12c85518Srobert   UIntTy &Prev;
95*12c85518Srobert 
96*12c85518Srobert   // Zig-zag encoding turns small signed integers into small unsigned integers.
97*12c85518Srobert   // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ...
zigZag(UIntTy V)98*12c85518Srobert   static UIntTy zigZag(UIntTy V) {
99*12c85518Srobert     UIntTy Sign = (V & (1 << (UIntBits - 1))) ? UIntTy(-1) : UIntTy(0);
100*12c85518Srobert     return Sign ^ (V << 1);
101*12c85518Srobert   }
zagZig(UIntTy V)102*12c85518Srobert   static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ -(V & 1); }
103*12c85518Srobert 
SourceLocationSequence(UIntTy & Prev)104*12c85518Srobert   SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {}
105*12c85518Srobert 
encodeRaw(UIntTy Raw)106*12c85518Srobert   EncodedTy encodeRaw(UIntTy Raw) {
107*12c85518Srobert     if (Raw == 0)
108*12c85518Srobert       return 0;
109*12c85518Srobert     UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw);
110*12c85518Srobert     if (Prev == 0)
111*12c85518Srobert       return Prev = Rotated;
112*12c85518Srobert     UIntTy Delta = Rotated - Prev;
113*12c85518Srobert     Prev = Rotated;
114*12c85518Srobert     // Exactly one 33 bit value is possible! (1 << 32).
115*12c85518Srobert     // This is because we have two representations of zero: trivial & relative.
116*12c85518Srobert     return 1 + EncodedTy{zigZag(Delta)};
117*12c85518Srobert   }
decodeRaw(EncodedTy Encoded)118*12c85518Srobert   UIntTy decodeRaw(EncodedTy Encoded) {
119*12c85518Srobert     if (Encoded == 0)
120*12c85518Srobert       return 0;
121*12c85518Srobert     if (Prev == 0)
122*12c85518Srobert       return SourceLocationEncoding::decodeRaw(Prev = Encoded);
123*12c85518Srobert     return SourceLocationEncoding::decodeRaw(Prev += zagZig(Encoded - 1));
124*12c85518Srobert   }
125*12c85518Srobert 
126*12c85518Srobert public:
decode(EncodedTy Encoded)127*12c85518Srobert   SourceLocation decode(EncodedTy Encoded) {
128*12c85518Srobert     return SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
129*12c85518Srobert   }
encode(SourceLocation Loc)130*12c85518Srobert   EncodedTy encode(SourceLocation Loc) {
131*12c85518Srobert     return encodeRaw(Loc.getRawEncoding());
132*12c85518Srobert   }
133*12c85518Srobert 
134*12c85518Srobert   class State;
135*12c85518Srobert };
136*12c85518Srobert 
137*12c85518Srobert /// This object establishes a SourceLocationSequence.
138*12c85518Srobert class SourceLocationSequence::State {
139*12c85518Srobert   UIntTy Prev = 0;
140*12c85518Srobert   SourceLocationSequence Seq;
141*12c85518Srobert 
142*12c85518Srobert public:
143*12c85518Srobert   // If Parent is provided and non-null, then this root becomes part of that
144*12c85518Srobert   // enclosing sequence instead of establishing a new one.
145*12c85518Srobert   State(SourceLocationSequence *Parent = nullptr)
146*12c85518Srobert       : Seq(Parent ? Parent->Prev : Prev) {}
147*12c85518Srobert 
148*12c85518Srobert   // Implicit conversion for uniform use of roots vs propagated sequences.
149*12c85518Srobert   operator SourceLocationSequence *() { return &Seq; }
150*12c85518Srobert };
151*12c85518Srobert 
encode(SourceLocation Loc,SourceLocationSequence * Seq)152*12c85518Srobert inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc,
153*12c85518Srobert                                                SourceLocationSequence *Seq) {
154*12c85518Srobert   return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding());
155*12c85518Srobert }
156*12c85518Srobert inline SourceLocation
decode(uint64_t Encoded,SourceLocationSequence * Seq)157*12c85518Srobert SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) {
158*12c85518Srobert   return Seq ? Seq->decode(Encoded)
159*12c85518Srobert              : SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
160*12c85518Srobert }
161*12c85518Srobert 
162*12c85518Srobert } // namespace clang
163*12c85518Srobert #endif
164