1// ================================================================
2// ORDERED MAP FROM STRING TO MLRVAL
3//
4// This is an implementation of insertion-ordered key-value pairs for Miller's
5// fundamental record data structure. It's also an ordered-map data structure,
6// suitable for Miller JSON decode/encode.
7//
8// ----------------------------------------------------------------
9// DESIGN
10//
11// * It keeps a doubly-linked list of key-value pairs.
12//
13// * By default, no hash functions are computed when the map is written to or
14//   read from.
15//
16// * Gets are implemented by sequential scan through the list: given a key,
17//   the key-value pairs are scanned through until a match is (or is not) found.
18//
19// * Performance improvement of 25% percent over hash-mapping from key to entry
20//   was found in the Go implementation. Test data was million-line CSV and
21//   DKVP, with a dozen columns or so.
22//
23// Note however that an auxiliary constructor is provided which does use
24// a key-to-entry hashmap in place of linear search for get/put/has/delete.
25// This may be useful in certain contexts, even though it's not the default
26// chosen for stream-records.
27//
28// ----------------------------------------------------------------
29// MOTIVATION
30//
31// * The use case for records in Miller is that *all* fields are read from
32//   strings & written to strings (split/join), while only *some* fields are
33//   operated on.
34//
35// * Meanwhile there are few repeated accesses to a given record: the
36//   access-to-construct ratio is quite low for Miller data records.  Miller
37//   instantiates thousands, millions, billions of records (depending on the
38//   input data) but accesses each record only once per transforming operation.
39//   (This is in contrast to accumulator hashmaps which are repeatedly accessed
40//   during a stats run.)
41//
42// * The hashed impl computes hashsums for *all* fields whether operated on or not,
43//   for the benefit of the *few* fields looked up during the transforming operation.
44//
45// * The hashless impl only keeps string pointers.  Lookups are done at runtime
46//   doing prefix search on the key names. Assuming field names are distinct,
47//   this is just a few char-ptr accesses which (in experiments) turn out to
48//   offer about a 10-15% performance improvement.
49//
50// * Added benefit: the field-rename operation (preserving field order) becomes
51//   trivial.
52// ================================================================
53
54package types
55
56// ----------------------------------------------------------------
57type Mlrmap struct {
58	FieldCount int
59	Head       *mlrmapEntry
60	Tail       *mlrmapEntry
61
62	// Surprisingly, using this costs about 25% for cat/cut/etc tests
63	// on million-line data files (CSV, DKVP) with a dozen or so columns.
64	// So, the constructor allows callsites to use it, or not.
65	keysToEntries map[string]*mlrmapEntry
66}
67
68type mlrmapEntry struct {
69	Key   string
70	Value *Mlrval
71	Prev  *mlrmapEntry
72	Next  *mlrmapEntry
73}
74
75// ----------------------------------------------------------------
76func NewMlrmapAsRecord() *Mlrmap {
77	return newMlrmapUnhashed()
78}
79func NewMlrmap() *Mlrmap {
80	return newMlrmapHashed()
81}
82
83// Faster on record-stream data as noted above.
84func newMlrmapUnhashed() *Mlrmap {
85	return &Mlrmap{
86		FieldCount:    0,
87		Head:          nil,
88		Tail:          nil,
89		keysToEntries: nil,
90	}
91}
92
93// Intended for use in DSL expressions wherein the access-to-construct ratio
94// might be higher (although this needs profiling over a variety of use-cases).
95func newMlrmapHashed() *Mlrmap {
96	return &Mlrmap{
97		FieldCount:    0,
98		Head:          nil,
99		Tail:          nil,
100		keysToEntries: make(map[string]*mlrmapEntry),
101	}
102}
103
104func NewMlrmapMaybeHashed(wantHashing bool) *Mlrmap {
105	if wantHashing {
106		return newMlrmapHashed()
107	} else {
108		return newMlrmapUnhashed()
109	}
110}
111
112func (this *Mlrmap) isHashed() bool {
113	return this.keysToEntries != nil
114}
115
116// ----------------------------------------------------------------
117// Value-copy is up to the caller -- PutReference and PutCopy
118// are in the public Mlrmap API.
119func newMlrmapEntry(key string, value *Mlrval) *mlrmapEntry {
120	return &mlrmapEntry{
121		key,
122		value,
123		nil,
124		nil,
125	}
126}
127