1 // -*- mode: C++ -*-
2 
3 // Copyright (c) 2010 Google Inc. All Rights Reserved.
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #ifndef COMMON_DWARF_BYTEREADER_H__
32 #define COMMON_DWARF_BYTEREADER_H__
33 
34 #include <stdint.h>
35 
36 #include <string>
37 
38 #include "common/dwarf/types.h"
39 #include "common/dwarf/dwarf2enums.h"
40 
41 namespace dwarf2reader {
42 
43 // We can't use the obvious names LITTLE_ENDIAN and BIG_ENDIAN
44 // because they conflict with macros of the same names.
45 enum Endianness {
46   ENDIANNESS_BIG,    // Read multi-byte values as big-endian.
47   ENDIANNESS_LITTLE  // Read multi-byte values as little-endian.
48 };
49 
50 // A ByteReader knows how to read single- and multi-byte values of
51 // various endiannesses, sizes, and encodings, as used in DWARF
52 // debugging information and Linux C++ exception handling data.
53 class ByteReader {
54  public:
55   // Construct a ByteReader capable of reading one-, two-, four-, and
56   // eight-byte values according to ENDIANNESS, absolute machine-sized
57   // addresses, DWARF-style "initial length" values, signed and
58   // unsigned LEB128 numbers, and Linux C++ exception handling data's
59   // encoded pointers.
60   explicit ByteReader(enum Endianness endianness);
61   virtual ~ByteReader();
62 
63   // Read a single byte from BUFFER and return it as an unsigned 8 bit
64   // number.
65   uint8 ReadOneByte(const uint8_t *buffer) const;
66 
67   // Read two bytes from BUFFER and return them as an unsigned 16 bit
68   // number, using this ByteReader's endianness.
69   uint16 ReadTwoBytes(const uint8_t *buffer) const;
70 
71   // Read four bytes from BUFFER and return them as an unsigned 32 bit
72   // number, using this ByteReader's endianness. This function returns
73   // a uint64 so that it is compatible with ReadAddress and
74   // ReadOffset. The number it returns will never be outside the range
75   // of an unsigned 32 bit integer.
76   uint64 ReadFourBytes(const uint8_t *buffer) const;
77 
78   // Read eight bytes from BUFFER and return them as an unsigned 64
79   // bit number, using this ByteReader's endianness.
80   uint64 ReadEightBytes(const uint8_t *buffer) const;
81 
82   // Read an unsigned LEB128 (Little Endian Base 128) number from
83   // BUFFER and return it as an unsigned 64 bit integer. Set *LEN to
84   // the number of bytes read.
85   //
86   // The unsigned LEB128 representation of an integer N is a variable
87   // number of bytes:
88   //
89   // - If N is between 0 and 0x7f, then its unsigned LEB128
90   //   representation is a single byte whose value is N.
91   //
92   // - Otherwise, its unsigned LEB128 representation is (N & 0x7f) |
93   //   0x80, followed by the unsigned LEB128 representation of N /
94   //   128, rounded towards negative infinity.
95   //
96   // In other words, we break N into groups of seven bits, put
97   // them in little-endian order, and then write them as eight-bit
98   // bytes with the high bit on all but the last.
99   uint64 ReadUnsignedLEB128(const uint8_t *buffer, size_t *len) const;
100 
101   // Read a signed LEB128 number from BUFFER and return it as a
102   // signed 64 bit integer. Set *LEN to the number of bytes read.
103   //
104   // The signed LEB128 representation of an integer N is a variable
105   // number of bytes:
106   //
107   // - If N is between -0x40 and 0x3f, then its signed LEB128
108   //   representation is a single byte whose value is N in two's
109   //   complement.
110   //
111   // - Otherwise, its signed LEB128 representation is (N & 0x7f) |
112   //   0x80, followed by the signed LEB128 representation of N / 128,
113   //   rounded towards negative infinity.
114   //
115   // In other words, we break N into groups of seven bits, put
116   // them in little-endian order, and then write them as eight-bit
117   // bytes with the high bit on all but the last.
118   int64 ReadSignedLEB128(const uint8_t *buffer, size_t *len) const;
119 
120   // Indicate that addresses on this architecture are SIZE bytes long. SIZE
121   // must be either 4 or 8. (DWARF allows addresses to be any number of
122   // bytes in length from 1 to 255, but we only support 32- and 64-bit
123   // addresses at the moment.) You must call this before using the
124   // ReadAddress member function.
125   //
126   // For data in a .debug_info section, or something that .debug_info
127   // refers to like line number or macro data, the compilation unit
128   // header's address_size field indicates the address size to use. Call
129   // frame information doesn't indicate its address size (a shortcoming of
130   // the spec); you must supply the appropriate size based on the
131   // architecture of the target machine.
132   void SetAddressSize(uint8 size);
133 
134   // Return the current address size, in bytes. This is either 4,
135   // indicating 32-bit addresses, or 8, indicating 64-bit addresses.
136   uint8 AddressSize() const { return address_size_; }
137 
138   // Read an address from BUFFER and return it as an unsigned 64 bit
139   // integer, respecting this ByteReader's endianness and address size. You
140   // must call SetAddressSize before calling this function.
141   uint64 ReadAddress(const uint8_t *buffer) const;
142 
143   // DWARF actually defines two slightly different formats: 32-bit DWARF
144   // and 64-bit DWARF. This is *not* related to the size of registers or
145   // addresses on the target machine; it refers only to the size of section
146   // offsets and data lengths appearing in the DWARF data. One only needs
147   // 64-bit DWARF when the debugging data itself is larger than 4GiB.
148   // 32-bit DWARF can handle x86_64 or PPC64 code just fine, unless the
149   // debugging data itself is very large.
150   //
151   // DWARF information identifies itself as 32-bit or 64-bit DWARF: each
152   // compilation unit and call frame information entry begins with an
153   // "initial length" field, which, in addition to giving the length of the
154   // data, also indicates the size of section offsets and lengths appearing
155   // in that data. The ReadInitialLength member function, below, reads an
156   // initial length and sets the ByteReader's offset size as a side effect.
157   // Thus, in the normal process of reading DWARF data, the appropriate
158   // offset size is set automatically. So, you should only need to call
159   // SetOffsetSize if you are using the same ByteReader to jump from the
160   // midst of one block of DWARF data into another.
161 
162   // Read a DWARF "initial length" field from START, and return it as
163   // an unsigned 64 bit integer, respecting this ByteReader's
164   // endianness. Set *LEN to the length of the initial length in
165   // bytes, either four or twelve. As a side effect, set this
166   // ByteReader's offset size to either 4 (if we see a 32-bit DWARF
167   // initial length) or 8 (if we see a 64-bit DWARF initial length).
168   //
169   // A DWARF initial length is either:
170   //
171   // - a byte count stored as an unsigned 32-bit value less than
172   //   0xfffffff0, indicating that the data whose length is being
173   //   measured uses the 32-bit DWARF format, or
174   //
175   // - the 32-bit value 0xffffffff, followed by a 64-bit byte count,
176   //   indicating that the data whose length is being measured uses
177   //   the 64-bit DWARF format.
178   uint64 ReadInitialLength(const uint8_t *start, size_t *len);
179 
180   // Read an offset from BUFFER and return it as an unsigned 64 bit
181   // integer, respecting the ByteReader's endianness. In 32-bit DWARF, the
182   // offset is 4 bytes long; in 64-bit DWARF, the offset is eight bytes
183   // long. You must call ReadInitialLength or SetOffsetSize before calling
184   // this function; see the comments above for details.
185   uint64 ReadOffset(const uint8_t *buffer) const;
186 
187   // Return the current offset size, in bytes.
188   // A return value of 4 indicates that we are reading 32-bit DWARF.
189   // A return value of 8 indicates that we are reading 64-bit DWARF.
190   uint8 OffsetSize() const { return offset_size_; }
191 
192   // Indicate that section offsets and lengths are SIZE bytes long. SIZE
193   // must be either 4 (meaning 32-bit DWARF) or 8 (meaning 64-bit DWARF).
194   // Usually, you should not call this function yourself; instead, let a
195   // call to ReadInitialLength establish the data's offset size
196   // automatically.
197   void SetOffsetSize(uint8 size);
198 
199   // The Linux C++ ABI uses a variant of DWARF call frame information
200   // for exception handling. This data is included in the program's
201   // address space as the ".eh_frame" section, and interpreted at
202   // runtime to walk the stack, find exception handlers, and run
203   // cleanup code. The format is mostly the same as DWARF CFI, with
204   // some adjustments made to provide the additional
205   // exception-handling data, and to make the data easier to work with
206   // in memory --- for example, to allow it to be placed in read-only
207   // memory even when describing position-independent code.
208   //
209   // In particular, exception handling data can select a number of
210   // different encodings for pointers that appear in the data, as
211   // described by the DwarfPointerEncoding enum. There are actually
212   // four axes(!) to the encoding:
213   //
214   // - The pointer size: pointers can be 2, 4, or 8 bytes long, or use
215   //   the DWARF LEB128 encoding.
216   //
217   // - The pointer's signedness: pointers can be signed or unsigned.
218   //
219   // - The pointer's base address: the data stored in the exception
220   //   handling data can be the actual address (that is, an absolute
221   //   pointer), or relative to one of a number of different base
222   //   addresses --- including that of the encoded pointer itself, for
223   //   a form of "pc-relative" addressing.
224   //
225   // - The pointer may be indirect: it may be the address where the
226   //   true pointer is stored. (This is used to refer to things via
227   //   global offset table entries, program linkage table entries, or
228   //   other tricks used in position-independent code.)
229   //
230   // There are also two options that fall outside that matrix
231   // altogether: the pointer may be omitted, or it may have padding to
232   // align it on an appropriate address boundary. (That last option
233   // may seem like it should be just another axis, but it is not.)
234 
235   // Indicate that the exception handling data is loaded starting at
236   // SECTION_BASE, and that the start of its buffer in our own memory
237   // is BUFFER_BASE. This allows us to find the address that a given
238   // byte in our buffer would have when loaded into the program the
239   // data describes. We need this to resolve DW_EH_PE_pcrel pointers.
240   void SetCFIDataBase(uint64 section_base, const uint8_t *buffer_base);
241 
242   // Indicate that the base address of the program's ".text" section
243   // is TEXT_BASE. We need this to resolve DW_EH_PE_textrel pointers.
244   void SetTextBase(uint64 text_base);
245 
246   // Indicate that the base address for DW_EH_PE_datarel pointers is
247   // DATA_BASE. The proper value depends on the ABI; it is usually the
248   // address of the global offset table, held in a designated register in
249   // position-independent code. You will need to look at the startup code
250   // for the target system to be sure. I tried; my eyes bled.
251   void SetDataBase(uint64 data_base);
252 
253   // Indicate that the base address for the FDE we are processing is
254   // FUNCTION_BASE. This is the start address of DW_EH_PE_funcrel
255   // pointers. (This encoding does not seem to be used by the GNU
256   // toolchain.)
257   void SetFunctionBase(uint64 function_base);
258 
259   // Indicate that we are no longer processing any FDE, so any use of
260   // a DW_EH_PE_funcrel encoding is an error.
261   void ClearFunctionBase();
262 
263   // Return true if ENCODING is a valid pointer encoding.
264   bool ValidEncoding(DwarfPointerEncoding encoding) const;
265 
266   // Return true if we have all the information we need to read a
267   // pointer that uses ENCODING. This checks that the appropriate
268   // SetFooBase function for ENCODING has been called.
269   bool UsableEncoding(DwarfPointerEncoding encoding) const;
270 
271   // Read an encoded pointer from BUFFER using ENCODING; return the
272   // absolute address it represents, and set *LEN to the pointer's
273   // length in bytes, including any padding for aligned pointers.
274   //
275   // This function calls 'abort' if ENCODING is invalid or refers to a
276   // base address this reader hasn't been given, so you should check
277   // with ValidEncoding and UsableEncoding first if you would rather
278   // die in a more helpful way.
279   uint64 ReadEncodedPointer(const uint8_t *buffer,
280                             DwarfPointerEncoding encoding,
281                             size_t *len) const;
282   // Return this reader's endianness (presumably endian_; defined elsewhere).
283   Endianness GetEndianness() const;
284  private:
285 
286   // Pointer-to-member-function type for our address and offset readers.
287   typedef uint64 (ByteReader::*AddressReader)(const uint8_t *) const;
288 
289   // Read an offset from BUFFER and return it as an unsigned 64 bit
290   // integer.  DWARF2/3 define offsets as either 4 or 8 bytes,
291   // generally depending on the amount of DWARF2/3 info present.
292   // This function pointer gets set by SetOffsetSize.
293   AddressReader offset_reader_;
294 
295   // Read an address from BUFFER and return it as an unsigned 64 bit
296   // integer.  DWARF2/3 allow addresses to be any size from 0-255
297   // bytes currently.  Internally we support 4 and 8 byte addresses,
298   // and will CHECK on anything else.
299   // This function pointer gets set by SetAddressSize.
300   AddressReader address_reader_;
301 
302   Endianness endian_;  // Presumably the endianness given to the constructor.
303   uint8 address_size_;  // Address size in bytes; returned by AddressSize().
304   uint8 offset_size_;  // Offset size in bytes; returned by OffsetSize().
305 
306   // Base addresses for Linux C++ exception handling data's encoded pointers.
307   bool have_section_base_, have_text_base_, have_data_base_;
308   bool have_function_base_;
309   uint64 section_base_, text_base_, data_base_, function_base_;
310   const uint8_t *buffer_base_;  // Presumably BUFFER_BASE from SetCFIDataBase.
311 };
312 
313 }  // namespace dwarf2reader
314 
315 #endif  // COMMON_DWARF_BYTEREADER_H__
316