1 // Copyright 2016-2021 Doug Moen
2 // Licensed under the Apache License, version 2.0
3 // See accompanying file LICENSE or https://www.apache.org/licenses/LICENSE-2.0
4
5 #ifndef LIBCURV_VALUE_H
6 #define LIBCURV_VALUE_H
7
8 #include <libcurv/fail.h>
9 #include <libcurv/shared.h>
10 #include <libcurv/ternary.h>
11 #include <cstdint>
12 #include <ostream>
13
14 namespace curv {
15
16 union Value;
17 struct Symbol_Ref;
18 struct Context;
19
20 /// Base class for the object referenced by a curv reference value.
21 ///
22 /// The memory layout for Ref_Value is:
23 /// * vtable_pointer -- 64 bits
24 /// * use_count -- 32 bits
25 /// * type -- 32 bits
26 ///
27 /// The next data member has 128 bit alignment with no hole on 64 bit platforms.
28 ///
29 /// The type_ and subtype_ fields enable us to query the type by loading the
30 /// first 128 bits of the object into a cache line, without indirecting through
31 /// the vtable, which would cost a 2nd cache line hit and either a virtual
32 /// function call or an RTTI lookup, both relatively costly.
33 ///
34 /// Why not put the type code into Value? Right now, there are an extra 3 bits
35 /// in curv::Value where I could store a type code, similar to LuaJIT and most
36 /// Javascript VMs. I chose not to do this, for simplicity, and to leave room
37 /// for updating later to support 52 bit pointers.
38 ///
39 /// All Ref_Values must be allocated on the heap: see Shared_Base.
40 struct Ref_Value : public Shared_Base
41 {
42 uint16_t type_;
43 uint16_t subtype_;
44 enum {
45 ty_symbol,
46 ty_abstract_list,
47 sty_list,
48 sty_string,
49 ty_record,
50 sty_drecord,
51 sty_module,
52 sty_dir_record,
53 ty_function,
54 ty_lambda,
55 ty_reactive,
56 sty_uniform_variable,
57 sty_reactive_expression,
58 ty_type,
59 sty_error_type,
60 sty_bool_type,
61 sty_num_type,
62 sty_list_type,
63 ty_index,
64 sty_this,
65 sty_tpath,
66 sty_tslice
67 };
Ref_ValueRef_Value68 Ref_Value(int type) : Shared_Base(), type_(type), subtype_(type) {}
Ref_ValueRef_Value69 Ref_Value(int type, int subtype)
70 :
71 Shared_Base(), type_(type), subtype_(subtype)
72 {}
73
74 /// Print a value like a Curv expression.
75 virtual void print_repr(std::ostream&) const = 0;
76
77 /// Print a value like a string.
78 virtual void print_string(std::ostream&) const;
79
80 virtual void print_help(std::ostream&) const;
81 };
82
83 /// A boxed, dynamically typed value in the Curv runtime.
84 ///
85 /// A Value is 64 bits. 64 bit IEEE floats are represented as themselves
86 /// (same bit pattern), while all other types are represented as NaNs,
87 /// with the data stored in the low order 48 bits of the NaN. This is called
88 /// "NaN boxing", a technique also used by LuaJIT and JavaScriptCore.
89 ///
90 /// "Immediate" values are stored entirely in the 64 bit pattern of a Value.
91 /// These are numbers, booleans, characters, and the "missing" pseudovalue
92 /// that denotes the absence of a value within libcurv APIs.
93 ///
94 /// String, List, Record and Function values are "reference" values:
95 /// a Ref_Value* pointer is stored in the low order 48 bits of the Value.
96 /// This works on 64 bit Intel and ARM systems because those architectures
97 /// use 48 bit virtual addresses, with the upper 16 bits of a 64 bit pointer
98 /// being wasted space.
99 ///
100 /// Since this code was written, ARM and Intel have added support for pointers
101 /// with more than 48 significant bits. The Linux kernel does not enable this
102 /// feature by default, because it breaks all of the modern web browsers,
103 /// which use NaN boxing. So my implementation continues to be viable, for now.
104 /// I could upgrade my Nan boxing implementation to support 52 bit pointers,
105 /// but it's not clear if that would actually help anybody.
106 ///
107 /// Reference values have Shared semantics. The copy constructor increments
108 /// the reference count, the destructor decrements the reference count and
109 /// deletes the object if the refcount reaches 0.
110 ///
111 /// Each Value has a unique bit pattern (not a given: I'm forcing the values
112 /// of unused bits in the NaN box to ensure this). Only positive NaNs are used.
113 /// This speeds up is_ref() and some equality tests.
114 union Value
115 {
116 private:
117 // internal representation
118 double number_;
119 uint64_t bits_;
120 int64_t signed_bits_;
121
122 // A double whose upper 16 bits is 0x7FFF is a quiet NaN on both Intel
123 // and ARM. The low order 48 bits can store arbitrary data which the FPU
124 // will ignore. (On PA-RISC and MIPS we'd use 0x7FF7 for a quiet NaN, but
125 // we don't currently support those architectures.)
126 static constexpr uint64_t k_nanbits = 0x7FFF'0000'0000'0000;
127 // The missing value has a 48 bit payload of 0, same representation as the
128 // null pointer. So Value((Ref_Value*)0) is missing.
129 static constexpr uint64_t k_missingbits = k_nanbits|0;
130 // false and true have a payload of 2 and 3. The '2' bit is only set in the
131 // payload for a boolean value (assuming all Ref_Value* pointers have
132 // at least 4 byte alignment). The '1' bit is 0 or 1 for false and true.
133 static constexpr uint64_t k_boolbits = k_nanbits|2;
134 static constexpr uint64_t k_boolmask = 0xFFFF'FFFF'FFFF'FFFE;
135
136 /// In a character, the low order two bits are 01.
137 /// The 8 bits adjacent to this are the character value.
138 static constexpr uint64_t k_charbits = k_nanbits|1;
139 static constexpr uint64_t k_charmask = 0xFFFF'FFFF'FFFF'FC03;
140 static constexpr unsigned k_charoffset = 2;
141
142 // Note: the corresponding public constructor takes a Shared argument.
Value(const Ref_Value * r)143 inline Value(const Ref_Value* r)
144 {
145 #if UINTPTR_MAX == UINT64_MAX
146 // 64 bit pointers
147 bits_ = ((uint64_t)r & 0x0000'FFFF'FFFF'FFFF) | Value::k_nanbits;
148 #elif UINTPTR_MAX == UINT32_MAX
149 // 32 bit pointers
150 bits_ = (uint64_t)(uint32_t)r | Value::k_nanbits;
151 #else
152 static_assert(false, "only 32 and 64 bit architectures supported");
153 #endif
154 }
155 template<class T, class... Args> friend Value make_ref_value(Args&&...);
156 public:
157 /// Construct the `missing` value (default constructor).
158 ///
159 /// There is no way for a Curv program to construct the missing value.
160 /// It's kind of like a null pointer: it can be used as a special value
161 /// to signify that no value is present.
Value()162 inline constexpr Value() noexcept : bits_{k_missingbits} {}
163
164 /// True if value is missing.
is_missing()165 inline bool is_missing() const noexcept
166 {
167 return bits_ == k_missingbits;
168 }
169 explicit operator bool () const noexcept
170 {
171 return !is_missing();
172 }
173
174 /// Construct a boolean value.
Value(bool b)175 inline constexpr Value(bool b) : bits_{k_boolbits|(uint64_t)b} {}
176
177 /// True if the value is boolean.
is_bool()178 inline bool is_bool() const noexcept
179 {
180 return (bits_ & k_boolmask) == k_boolbits;
181 }
182 /// Convert a boxed boolean Value to `bool`.
183 ///
184 /// Only defined if is_bool() is true.
to_bool_unsafe()185 inline bool to_bool_unsafe() const noexcept
186 {
187 return (bool)(bits_ & 1);
188 }
189 /// Convert a Value to `bool`, throw an exception if wrong type.
190 bool to_bool(const Context&) const;
191
192 /**************
193 * characters *
194 **************/
195 /// Construct a character value.
Value(char c)196 inline constexpr Value(char c)
197 : bits_{k_charbits|(uint64_t((unsigned char)c) << k_charoffset)}
198 {}
199 /// True if the value is a character.
is_char()200 inline bool is_char() const noexcept
201 {
202 return (bits_ & k_charmask) == k_charbits;
203 }
204 /// Convert a boxed character Value to type 'char'.
205 /// Only defined if is_char() is true.
to_char_unsafe()206 inline char to_char_unsafe() const noexcept
207 {
208 return char(bits_ >> k_charoffset);
209 }
210 /// Convert a Value to `char`, throw an exception if wrong type.
211 char to_char(const Context&) const;
212
213 /// Construct a number value.
214 ///
215 /// The Curv Number type includes all of the IEEE 64 bit floating point
216 /// values except for NaN.
217 /// If the argument is NaN, construct the missing value.
Value(double n)218 inline Value(double n)
219 {
220 if (n == n)
221 number_ = n;
222 else
223 bits_ = k_missingbits;
224 }
225
226 /// True if the value is a number.
is_num()227 inline bool is_num() const noexcept
228 {
229 return number_ == number_;
230 }
231
232 bool is_int() const noexcept;
233
234 /// Convert a number value to `double`.
235 ///
236 /// Only defined if `is_num()` is true.
237 /// Potentially faster than `to_num_or_nan()`,
238 /// so call this version when guarded by `if(v.is_num())`.
to_num_unsafe()239 inline double to_num_unsafe() const noexcept
240 {
241 return number_;
242 }
243
244 /// Convert a number value to `double`.
245 ///
246 /// If is_num() is false then NaN is returned.
to_num_or_nan()247 inline double to_num_or_nan() const noexcept
248 {
249 return number_;
250 }
251
252 /// Convert a Value to `double`, throw an exception if wrong type.
253 double to_num(const Context&) const;
254
255 // Convert a Value to `int`.
256 // Throw an exception if wrong type or out of range.
257 int to_int(int lo, int hi, const Context&) const;
258
259 /// Construct a reference value.
260 ///
261 /// If the argument is nullptr, construct the null value.
Value(Shared<const Ref_Value> ptr)262 inline Value(Shared<const Ref_Value> ptr) : Value(ptr.detach())
263 {
264 }
265
266 /// True if the value is a reference value.
is_ref()267 inline bool is_ref() const noexcept
268 {
269 // Negative numbers will have the sign bit set, which means
270 // signed_bits_ < 0. Positive infinity has all 1s in the exponent,
271 // and is 0x7FF0'0000'0000'0000. Positive numbers have a smaller
272 // exponent than this, so will test < +inf as an integer bit pattern.
273 // The positive NaNs have the largest signed integer magnitude,
274 // and all non-numeric values are encoded as positive NaNs.
275 // The 3 special immediate values (missing, false and true)
276 // are encoded like pointer values in the range 0...3.
277 return signed_bits_ > (int64_t)(k_nanbits|3)
278 && (bits_ & 3) == 0;
279 // TODO: the second test rules out characters; make this faster.
280 }
281
282 /// Convert a reference value to `Ref_Value&`.
283 ///
284 /// Unsafe unless `is_ref()` is true, otherwise it returns a bad reference,
285 /// causing memory corruption or a crash if dereferenced.
286 /// Another reason it's unsafe is that the returned reference becomes
287 /// invalid if the original Value is destroyed.
288 /// See `maybe` for a completely safe alternative.
to_ref_unsafe()289 inline Ref_Value& to_ref_unsafe() const noexcept
290 {
291 #if UINTPTR_MAX == UINT64_MAX
292 // 64 bit pointers.
293
294 // Intel: "bits 48 through 63 of any virtual address must be copies
295 // of bit 47 (in a manner akin to sign extension), or the processor
296 // will raise an exception." https://en.wikipedia.org/wiki/X86-64
297
298 // Arm-64: also has 48 bit virtual addresses, and also seems to
299 // require the sign extension logic. "AArch64 features two 48-bit
300 // virtual address spaces, one for the kernel and one for
301 // applications. Application addressing starts at 0 and grows
302 // upwards, while kernel space grows down from 2^64; any references
303 // to unmapped addresses in between will trigger a fault. Pointers
304 // are sign extended to 64-bits, and can optionally be configured
305 // to use the upper 8-bits for tagging pointers with additional
306 // information." http://www.realworldtech.com/arm64/4/
307
308 // The following code truncates to 48 bits, then fills the upper
309 // 16 bits using sign extension. Is this sign extension necessary?
310 // LuaJIT and SpiderMonkey both assume 47 bit pointers (not 48)
311 // in their NaN boxes. They assume all pointers are positive.
312 //
313 // The sign extension is necessary. Here's a LuaJIT bug where
314 // a negative 48 bit pointer on ARM64 leads to a crash.
315 // https://github.com/LuaJIT/LuaJIT/issues/49
316 //
317 // ARMv8.2 will support 52 bit virtual addresses.
318 // That can still work (a NaN has a 52 bit payload). We'll need
319 // changes to the Value internals but the API won't change.
320 // I don't store type codes in the NaN box (unlike LuaJIT)
321 // which will make this change easier.
322 return *(Ref_Value*)((signed_bits_ << 16) >> 16);
323 #elif UINTPTR_MAX == UINT32_MAX
324 // 32 bit pointers
325 return *(Ref_Value*)(uint32_t)bits_;
326 #else
327 static_assert(false, "only 32 and 64 bit architectures supported");
328 #endif
329 }
330
331 /// Like dynamic_cast for a Value.
332 template <class T>
maybe()333 inline Shared<T> maybe() const noexcept
334 {
335 if (is_ref()) {
336 T* p = dynamic_cast<T*>(&to_ref_unsafe());
337 if (p != nullptr)
338 return share(*p);
339 }
340 return nullptr;
341 }
342 template <class T>
to(const Context & cx)343 inline Shared<T> to(const Context& cx) const
344 {
345 if (is_ref()) {
346 T* p = dynamic_cast<T*>(&to_ref_unsafe());
347 if (p != nullptr)
348 return share(*p);
349 }
350 to_abort(cx, T::name);
351 }
352 void to_abort [[noreturn]] (const Context&, const char*) const;
353 template <class T>
to(Fail fl,const Context & cx)354 inline Shared<T> to(Fail fl, const Context& cx) const
355 {
356 if (is_ref()) {
357 T* p = dynamic_cast<T*>(&to_ref_unsafe());
358 if (p != nullptr)
359 return share(*p);
360 }
361 if (fl == Fail::soft) return nullptr; else to_abort(cx, T::name);
362 }
363
364 Value at(Symbol_Ref fieldname, const Context& cx) const;
365
366 /// The copy constructor increments the use_count of a ref value.
Value(const Value & val)367 inline Value(const Value& val) noexcept
368 {
369 bits_ = val.bits_;
370 if (is_ref())
371 intrusive_ptr_add_ref(&to_ref_unsafe());
372 }
373
374 /// The move constructor.
Value(Value && val)375 inline Value(Value&& val) noexcept
376 {
377 bits_ = val.bits_;
378 val.bits_ = k_missingbits;
379 }
380
381 /// The assignment operator.
382 ///
383 /// There's just one assignment operator, and it's by-value.
384 /// This is simpler than `const Value&` and `const Value&&`
385 /// copy and move assignments, and allegedly the most efficient.
386 /// When possible, use `val = move(rhs);` for a non-rvalue rhs
387 /// for efficiency.
388 inline Value& operator=(Value rhs) noexcept
389 {
390 rhs.swap(*this);
391 return *this;
392 }
393
394 /// The swap operation.
swap(Value & rhs)395 void swap(Value& rhs) noexcept
396 {
397 auto tmpbits = bits_;
398 bits_ = rhs.bits_;
399 rhs.bits_ = tmpbits;
400 }
401
402 /// The destructor.
~Value()403 ~Value()
404 {
405 if (is_ref())
406 intrusive_ptr_release(&to_ref_unsafe());
407 }
408
409 /// Print a value like a Curv expression.
410 void print_repr(std::ostream&) const;
411
412 /// Print a value like a string.
413 void print_string(std::ostream&) const;
414
415 // Deep equality, which traverses the value tree and forces thunks.
416 // May throw an Exception if forcing a thunk fails.
417 // May return Ternary::Unknown when comparing reactive values.
418 // Used to implement `a == b` in the Curv language.
419 // Optimized for numbers, may be expensive for lists and records.
420 Ternary equal(Value, const Context&) const;
421
422 // Shallow equality, compares the bit patterns of two Values. Fast.
eq(Value rhs)423 bool eq(Value rhs) const
424 {
425 return bits_ == rhs.bits_;
426 }
427
428 size_t hash() const noexcept;
429 bool hash_eq(Value) const noexcept;
430
431 struct Hash
432 {
operatorValue::Hash433 size_t operator()(Value val) const noexcept
434 {
435 return val.hash();
436 }
437 };
438 struct Hash_Eq
439 {
operatorValue::Hash_Eq440 bool operator()(Value v1, Value v2) const noexcept
441 {
442 return v1.hash_eq(v2);
443 }
444 };
445 };
446
447 /// Special marker that denotes the absence of a value
448 extern const Value missing;
449
450 inline std::ostream&
451 operator<<(std::ostream& out, Value val)
452 {
453 val.print_repr(out);
454 return out;
455 }
456
457 /// Allocate a reference value with given class and constructor arguments.
make_ref_value(Args &&...args)458 template<typename T, class... Args> Value make_ref_value(Args&&... args)
459 {
460 T* ptr = new T(args...);
461 intrusive_ptr_add_ref(ptr);
462 return Value(ptr);
463 }
464
465 void help(Value, std::ostream&);
466
467 } // namespace curv
468 #endif // header guard
469