1// ================================================================ 2// ORDERED MAP FROM STRING TO MLRVAL 3// 4// This is an implementation of insertion-ordered key-value pairs for Miller's 5// fundamental record data structure. It's also an ordered-map data structure, 6// suitable for Miller JSON decode/encode. 7// 8// ---------------------------------------------------------------- 9// DESIGN 10// 11// * It keeps a doubly-linked list of key-value pairs. 12// 13// * By default, no hash functions are computed when the map is written to or 14// read from. 15// 16// * Gets are implemented by sequential scan through the list: given a key, 17// the key-value pairs are scanned through until a match is (or is not) found. 18// 19// * Performance improvement of 25% percent over hash-mapping from key to entry 20// was found in the Go implementation. Test data was million-line CSV and 21// DKVP, with a dozen columns or so. 22// 23// Note however that an auxiliary constructor is provided which does use 24// a key-to-entry hashmap in place of linear search for get/put/has/delete. 25// This may be useful in certain contexts, even though it's not the default 26// chosen for stream-records. 27// 28// ---------------------------------------------------------------- 29// MOTIVATION 30// 31// * The use case for records in Miller is that *all* fields are read from 32// strings & written to strings (split/join), while only *some* fields are 33// operated on. 34// 35// * Meanwhile there are few repeated accesses to a given record: the 36// access-to-construct ratio is quite low for Miller data records. Miller 37// instantiates thousands, millions, billions of records (depending on the 38// input data) but accesses each record only once per transforming operation. 39// (This is in contrast to accumulator hashmaps which are repeatedly accessed 40// during a stats run.) 41// 42// * The hashed impl computes hashsums for *all* fields whether operated on or not, 43// for the benefit of the *few* fields looked up during the transforming operation. 44// 45// * The hashless impl only keeps string pointers. Lookups are done at runtime 46// doing prefix search on the key names. Assuming field names are distinct, 47// this is just a few char-ptr accesses which (in experiments) turn out to 48// offer about a 10-15% performance improvement. 49// 50// * Added benefit: the field-rename operation (preserving field order) becomes 51// trivial. 52// ================================================================ 53 54package types 55 56// ---------------------------------------------------------------- 57type Mlrmap struct { 58 FieldCount int 59 Head *mlrmapEntry 60 Tail *mlrmapEntry 61 62 // Surprisingly, using this costs about 25% for cat/cut/etc tests 63 // on million-line data files (CSV, DKVP) with a dozen or so columns. 64 // So, the constructor allows callsites to use it, or not. 65 keysToEntries map[string]*mlrmapEntry 66} 67 68type mlrmapEntry struct { 69 Key string 70 Value *Mlrval 71 Prev *mlrmapEntry 72 Next *mlrmapEntry 73} 74 75// ---------------------------------------------------------------- 76func NewMlrmapAsRecord() *Mlrmap { 77 return newMlrmapUnhashed() 78} 79func NewMlrmap() *Mlrmap { 80 return newMlrmapHashed() 81} 82 83// Faster on record-stream data as noted above. 84func newMlrmapUnhashed() *Mlrmap { 85 return &Mlrmap{ 86 FieldCount: 0, 87 Head: nil, 88 Tail: nil, 89 keysToEntries: nil, 90 } 91} 92 93// Intended for use in DSL expressions wherein the access-to-construct ratio 94// might be higher (although this needs profiling over a variety of use-cases). 95func newMlrmapHashed() *Mlrmap { 96 return &Mlrmap{ 97 FieldCount: 0, 98 Head: nil, 99 Tail: nil, 100 keysToEntries: make(map[string]*mlrmapEntry), 101 } 102} 103 104func NewMlrmapMaybeHashed(wantHashing bool) *Mlrmap { 105 if wantHashing { 106 return newMlrmapHashed() 107 } else { 108 return newMlrmapUnhashed() 109 } 110} 111 112func (this *Mlrmap) isHashed() bool { 113 return this.keysToEntries != nil 114} 115 116// ---------------------------------------------------------------- 117// Value-copy is up to the caller -- PutReference and PutCopy 118// are in the public Mlrmap API. 119func newMlrmapEntry(key string, value *Mlrval) *mlrmapEntry { 120 return &mlrmapEntry{ 121 key, 122 value, 123 nil, 124 nil, 125 } 126} 127