// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Vector } from './vector';
import { BufferType } from './enum';
import { Data, Buffers } from './data';
import { createIsValidFunction } from './builder/valid';
import { BuilderType as B, VectorType as V} from './interfaces';
import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer';
import {
    DataType, strideForType,
    Float, Int, Decimal, FixedSizeBinary,
    Date_, Time, Timestamp, Interval,
    Utf8, Binary, List, Map_
} from './type';

/**
 * A set of options required to create a `Builder` instance for a given `DataType`.
 * @see {@link Builder}
 */
export interface BuilderOptions<T extends DataType = any, TNull = any> {
    type: T;
    nullValues?: TNull[] | ReadonlyArray<TNull> | null;
    children?: { [key: string]: BuilderOptions; } | BuilderOptions[];
}

/**
 * A set of options to create an Iterable or AsyncIterable `Builder` transform function.
 * @see {@link Builder.throughIterable}
 * @see {@link Builder.throughAsyncIterable}
 */

export interface IterableBuilderOptions<T extends DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
    highWaterMark?: number;
    queueingStrategy?: 'bytes' | 'count';
    dictionaryHashFunction?: (value: any) => string | number;
    valueToChildTypeId?: (builder: Builder<T, TNull>, value: any, offset: number) => number;
}

/**
 * An abstract base class for types that construct Arrow Vectors from arbitrary JavaScript values.
 *
 * A `Builder` is responsible for writing arbitrary JavaScript values
 * to ArrayBuffers and/or child Builders according to the Arrow specification
 * for each DataType, creating or resizing the underlying ArrayBuffers as necessary.
 *
 * The `Builder` for each Arrow `DataType` handles converting and appending
 * values for a given `DataType`. The high-level {@link Builder.new `Builder.new()`} convenience
 * method creates the specific `Builder` subclass for the supplied `DataType`.
 *
 * Once created, `Builder` instances support both appending values to the end
 * of the `Builder`, and random-access writes to specific indices
 * (`Builder.prototype.append(value)` is a convenience method for
 * `builder.set(builder.length, value)`). Appending or setting values beyond the
 * Builder's current length may cause the builder to grow its underlying buffers
 * or child Builders (if applicable) to accommodate the new values.
 *
 * After enough values have been written to a `Builder`, `Builder.prototype.flush()`
 * will commit the values to the underlying ArrayBuffers (or child Builders). The
 * internal Builder state will be reset, and an instance of `Data<T>` is returned.
 * Alternatively, `Builder.prototype.toVector()` will flush the `Builder` and return
 * an instance of `Vector<T>` instead.
 *
 * When there are no more values to write, use `Builder.prototype.finish()` to
 * finalize the `Builder`. This does not reset the internal state, so it is
 * necessary to call `Builder.prototype.flush()` or `toVector()` one last time
 * if there are still values queued to be flushed.
 *
 * Note: calling `Builder.prototype.finish()` is required when using a `DictionaryBuilder`,
 * because this is when it flushes the values that have been enqueued in its internal
 * dictionary's `Builder`, and creates the `dictionaryVector` for the `Dictionary` `DataType`.
 *
 * ```ts
 * import { Builder, Utf8 } from 'apache-arrow';
 *
 * const utf8Builder = Builder.new({
 *     type: new Utf8(),
 *     nullValues: [null, 'n/a']
 * });
 *
 * utf8Builder
 *     .append('hello')
 *     .append('n/a')
 *     .append('world')
 *     .append(null);
 *
 * const utf8Vector = utf8Builder.finish().toVector();
 *
 * console.log(utf8Vector.toJSON());
 * // > ["hello", null, "world", null]
 * ```
 *
 * @typeparam T The `DataType` of this `Builder`.
 * @typeparam TNull The type(s) of values which will be considered null-value sentinels.
 */
export abstract class Builder<T extends DataType = any, TNull = any> {

    /**
     * Create a `Builder` instance based on the `type` property of the supplied `options` object.
     * @param {BuilderOptions<T, TNull>} options An object with a required `DataType` instance
     * and other optional parameters to be passed to the `Builder` subclass for the given `type`.
     *
     * NOTE(review): the body is intentionally empty in this file; presumably the
     * real implementation is installed elsewhere -- confirm.
     *
     * @typeparam T The `DataType` of the `Builder` to create.
     * @typeparam TNull The type(s) of values which will be considered null-value sentinels.
     * @nocollapse
     */
    // @ts-ignore
    public static new<T extends DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): B<T, TNull> {}

    /**
     * Placeholder that throws unless it has been replaced with an
     * environment-specific implementation.
     * @nocollapse
     */
    // @ts-ignore
    public static throughNode<T extends DataType = any, TNull = any>(options: import('./io/node/builder').BuilderDuplexOptions<T, TNull>): import('stream').Duplex {
        throw new Error(`"throughNode" not available in this environment`);
    }
    /**
     * Placeholder that throws unless it has been replaced with an
     * environment-specific implementation.
     * @nocollapse
     */
    // @ts-ignore
    public static throughDOM<T extends DataType = any, TNull = any>(options: import('./io/whatwg/builder').BuilderTransformOptions<T, TNull>): import('./io/whatwg/builder').BuilderTransform<T, TNull> {
        throw new Error(`"throughDOM" not available in this environment`);
    }

    /**
     * Transform a synchronous `Iterable` of arbitrary JavaScript values into a
     * sequence of Arrow Vector<T> following the chunking semantics defined in
     * the supplied `options` argument.
     *
     * This function returns a function that accepts an `Iterable` of values to
     * transform. When called, this function returns an Iterator of `Vector<T>`.
     *
     * The resulting `Iterator<Vector<T>>` yields Vectors based on the
     * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
     *
     * * If `queueingStrategy` is `"count"` (or omitted), the `Iterator<Vector<T>>`
     *   will flush the underlying `Builder` (and yield a new `Vector<T>`) once the
     *   Builder's `length` reaches or exceeds the supplied `highWaterMark`.
     * * If `queueingStrategy` is `"bytes"`, the `Iterator<Vector<T>>` will flush
     *   the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength`
     *   reaches or exceeds the supplied `highWaterMark`.
     *
     * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
     * @returns A function which accepts a JavaScript `Iterable` of values to
     *          write, and returns an `Iterator` that yields Vectors according
     *          to the chunking semantics defined in the `options` argument.
     * @nocollapse
     */
    public static throughIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
        return throughIterable(options);
    }

    /**
     * Transform an `AsyncIterable` of arbitrary JavaScript values into a
     * sequence of Arrow Vector<T> following the chunking semantics defined in
     * the supplied `options` argument.
     *
     * This function returns a function that accepts an `AsyncIterable` of values to
     * transform. When called, this function returns an AsyncIterator of `Vector<T>`.
     *
     * The resulting `AsyncIterator<Vector<T>>` yields Vectors based on the
     * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
     *
     * * If `queueingStrategy` is `"count"` (or omitted), the `AsyncIterator<Vector<T>>`
     *   will flush the underlying `Builder` (and yield a new `Vector<T>`) once the
     *   Builder's `length` reaches or exceeds the supplied `highWaterMark`.
     * * If `queueingStrategy` is `"bytes"`, the `AsyncIterator<Vector<T>>` will flush
     *   the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength`
     *   reaches or exceeds the supplied `highWaterMark`.
     *
     * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
     * @returns A function which accepts a JavaScript `AsyncIterable` of values
     *          to write, and returns an `AsyncIterator` that yields Vectors
     *          according to the chunking semantics defined in the `options`
     *          argument.
     * @nocollapse
     */
    public static throughAsyncIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
        return throughAsyncIterable(options);
    }

    /**
     * Construct a builder with the given Arrow DataType with optional null values,
     * which will be interpreted as "null" when set or appended to the `Builder`.
     * @param {{ type: T, nullValues?: any[] }} options A `BuilderOptions` object used to create this `Builder`.
     */
    constructor({ 'type': type, 'nullValues': nulls }: BuilderOptions<T, TNull>) {
        this.type = type;
        this.children = [];
        this.nullValues = nulls;
        this.stride = strideForType(type);
        this._nulls = new BitmapBufferBuilder();
        // Only install a custom validity check when sentinels were supplied;
        // otherwise the prototype-level `_isValid` default is used.
        if (nulls && nulls.length > 0) {
            this._isValid = createIsValidFunction(nulls);
        }
    }

    /**
     * The Builder's `DataType` instance.
     * @readonly
     */
    public type: T;
    /**
     * The number of values written to the `Builder` that haven't been flushed yet.
     * @readonly
     */
    public length = 0;
    /**
     * A boolean indicating whether `Builder.prototype.finish()` has been called on this `Builder`.
     * @readonly
     */
    public finished = false;
    /**
     * The number of elements in the underlying values TypedArray that
     * represent a single logical element, determined by this Builder's
     * `DataType`. This is 1 for most types, but is larger when the `DataType`
     * is `Int64`, `Uint64`, `Decimal`, `DateMillisecond`, certain variants of
     * `Interval`, `Time`, or `Timestamp`, `FixedSizeBinary`, and `FixedSizeList`.
     * @readonly
     */
    public readonly stride: number;
    /** The child `Builder` instances of this `Builder` (empty on construction). */
    public readonly children: Builder[];
    /**
     * The list of null-value sentinels for this `Builder`. When one of these values
     * is written to the `Builder` (either via `Builder.prototype.set()` or `Builder.prototype.append()`),
     * a 0-bit is written to this Builder's underlying null BitmapBufferBuilder,
     * marking the slot as null.
     * @readonly
     */
    public readonly nullValues?: TNull[] | ReadonlyArray<TNull> | null;

    /**
     * Flush the `Builder` and return a `Vector<T>`.
     * @returns {Vector<T>} A `Vector<T>` of the flushed values.
     */
    public toVector() { return Vector.new(this.flush()); }

    /** The `ArrayType` of this Builder's `DataType`. */
    public get ArrayType() { return this.type.ArrayType; }
    /** The `numInvalid` count reported by the null bitmap builder. */
    public get nullCount() { return this._nulls.numInvalid; }
    /** The number of child Builders. */
    public get numChildren() { return this.children.length; }

    /**
     * @returns The aggregate length (in bytes) of the values that have been written.
     */
    public get byteLength(): number {
        let size = 0;
        this._offsets && (size += this._offsets.byteLength);
        this._values && (size += this._values.byteLength);
        this._nulls && (size += this._nulls.byteLength);
        this._typeIds && (size += this._typeIds.byteLength);
        return this.children.reduce((size, child) => size + child.byteLength, size);
    }

    /**
     * @returns The aggregate number of rows that have been reserved to write new values.
     */
    public get reservedLength(): number {
        return this._nulls.reservedLength;
    }

    /**
     * @returns The aggregate length (in bytes) that has been reserved to write new values.
     */
    public get reservedByteLength(): number {
        let size = 0;
        this._offsets && (size += this._offsets.reservedByteLength);
        this._values && (size += this._values.reservedByteLength);
        this._nulls && (size += this._nulls.reservedByteLength);
        this._typeIds && (size += this._typeIds.reservedByteLength);
        return this.children.reduce((size, child) => size + child.reservedByteLength, size);
    }

    // @ts-ignore
    protected _offsets: DataBufferBuilder<Int32Array>;
    /** The offsets buffer, or null if this Builder has no offsets builder. */
    public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; }

    // @ts-ignore
    protected _values: BufferBuilder<T['TArray'], any>;
    /** The values buffer, or null if this Builder has no values builder. */
    public get values() { return this._values ? this._values.buffer : null; }

    protected _nulls: BitmapBufferBuilder;
    /** The validity bitmap buffer, or null if this Builder has no null bitmap builder. */
    public get nullBitmap() { return this._nulls ? this._nulls.buffer : null; }

    // @ts-ignore
    protected _typeIds: DataBufferBuilder<Int8Array>;
    /** The type ids buffer, or null if this Builder has no type ids builder. */
    public get typeIds() { return this._typeIds ? this._typeIds.buffer : null; }

    // @ts-ignore
    protected _isValid: (value: T['TValue'] | TNull) => boolean;
    // @ts-ignore
    protected _setValue: (inst: Builder<T>, index: number, value: T['TValue']) => void;

    /**
     * Appends a value (or null) to this `Builder`.
     * This is equivalent to `builder.set(builder.length, value)`.
     * @param {T['TValue'] | TNull } value The value to append.
     */
    public append(value: T['TValue'] | TNull) { return this.set(this.length, value); }

    /**
     * Validates whether a value is valid (true), or null (false)
     * @param {T['TValue'] | TNull } value The value to compare against the null-value representations
     */
    // @ts-ignore
    public isValid(value: T['TValue'] | TNull): boolean { return this._isValid(value); }

    /**
     * Write a value (or null-value sentinel) at the supplied index.
     * If the value matches one of the null-value representations, a 0-bit is
     * written to the null `BitmapBufferBuilder`. Otherwise, a 1-bit is written to
     * the null `BitmapBufferBuilder`, and the value is passed to
     * `Builder.prototype.setValue()`.
     * @param {number} index The index of the value to write.
     * @param {T['TValue'] | TNull } value The value to write at the supplied index.
     * @returns {this} The updated `Builder` instance.
     */
    public set(index: number, value: T['TValue'] | TNull) {
        if (this.setValid(index, this.isValid(value))) {
            this.setValue(index, value);
        }
        return this;
    }

    /**
     * Write a value to the underlying buffers at the supplied index, bypassing
     * the null-value check. This is a low-level method that delegates to the
     * type-specific `_setValue` function and does not touch the null bitmap;
     * use `Builder.prototype.set()` to write a value and its validity together.
     * @param {number} index The index of the value to write.
     * @param {T['TValue'] | TNull } value The value to write at the supplied index.
     */
    // @ts-ignore
    public setValue(index: number, value: T['TValue']) { this._setValue(this, index, value); }
    /**
     * Write the validity bit for the supplied index (1 when `valid` is true,
     * 0 otherwise) and update this Builder's `length` from the null bitmap builder.
     * @param {number} index The index of the validity bit to write.
     * @param {boolean} valid Whether the slot at `index` holds a non-null value.
     * @returns {boolean} The `valid` argument, unchanged.
     */
    public setValid(index: number, valid: boolean) {
        this.length = this._nulls.set(index, +valid).length;
        return valid;
    }

    /**
     * Add a child `Builder`. The base implementation always throws; nested
     * builder types are expected to override it.
     */
    // @ts-ignore
    public addChild(child: Builder, name = `${this.numChildren}`) {
        throw new Error(`Cannot append children to non-nested type "${this.type}"`);
    }

    /**
     * Retrieve the child `Builder` at the supplied `index`, or null if no child
     * exists at that index.
     * @param {number} index The index of the child `Builder` to retrieve.
     * @returns {Builder | null} The child Builder at the supplied index or null.
     */
    public getChildAt<R extends DataType = any>(index: number): Builder<R> | null {
        return this.children[index] || null;
    }

    /**
     * Commit all the values that have been written to their underlying
     * ArrayBuffers, including any child Builders if applicable, and reset
     * the internal `Builder` state.
     * @returns A `Data<T>` of the buffers and childData representing the values written.
     */
    public flush() {

        const buffers: any = [];
        const values = this._values;
        const offsets = this._offsets;
        const typeIds = this._typeIds;
        const { length, nullCount } = this;

        if (typeIds) { /* Unions */
            buffers[BufferType.TYPE] = typeIds.flush(length);
            // DenseUnions
            offsets && (buffers[BufferType.OFFSET] = offsets.flush(length));
        } else if (offsets) { /* Variable-width primitives (Binary, Utf8) and Lists */
            // Binary, Utf8
            // The data buffer is flushed first, sized by the last offset,
            // before the offsets themselves are flushed.
            values && (buffers[BufferType.DATA] = values.flush(offsets.last()));
            buffers[BufferType.OFFSET] = offsets.flush(length);
        } else if (values) { /* Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) */
            buffers[BufferType.DATA] = values.flush(length);
        }

        // The validity bitmap is only emitted when at least one null was written.
        nullCount > 0 && (buffers[BufferType.VALIDITY] = this._nulls.flush(length));

        const data = Data.new<T>(
            this.type, 0, length, nullCount, buffers as Buffers<T>,
            this.children.map((child) => child.flush())) as Data<T>;

        this.clear();

        return data;
    }

    /**
     * Finalize this `Builder`, and child builders if applicable.
     * @returns {this} The finalized `Builder` instance.
     */
    public finish() {
        this.finished = true;
        this.children.forEach((child) => child.finish());
        return this;
    }

    /**
     * Clear this Builder's internal state, including child Builders if applicable, and reset the length to 0.
     * @returns {this} The cleared `Builder` instance.
     */
    public clear() {
        this.length = 0;
        this._offsets && (this._offsets.clear());
        this._values && (this._values.clear());
        this._nulls && (this._nulls.clear());
        this._typeIds && (this._typeIds.clear());
        this.children.forEach((child) => child.clear());
        return this;
    }
}

// Prototype-level defaults. The constructor only assigns `_isValid` when
// null-value sentinels are supplied, so the default below treats every value
// as valid otherwise.
// NOTE(review): the prototype `length` default is 1 while the instance field
// initializer sets it to 0 -- confirm this is intentional.
(Builder.prototype as any).length = 1;
(Builder.prototype as any).stride = 1;
(Builder.prototype as any).children = null;
(Builder.prototype as any).finished = false;
(Builder.prototype as any).nullValues = null;
(Builder.prototype as any)._isValid = () => true;

/**
 * Base class for builders whose values live in a single `DataBufferBuilder`
 * created from the type's `ArrayType` and this Builder's `stride`.
 * @ignore
 */
export abstract class FixedWidthBuilder<T extends Int | Float | FixedSizeBinary | Date_ | Timestamp | Time | Decimal | Interval = any, TNull = any> extends Builder<T, TNull> {
    constructor(opts: BuilderOptions<T, TNull>) {
        super(opts);
        this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride);
    }
    /**
     * Reserve room in the values buffer up to and including `index`, then
     * write the value via the base implementation.
     */
    public setValue(index: number, value: T['TValue']) {
        const values = this._values;
        values.reserve(index - values.length + 1);
        return super.setValue(index, value);
    }
}

/**
 * Base class for builders that queue written values in a pending `Map`
 * (keyed by index) and hand them to the subclass's `_flushPending()` when
 * the Builder is flushed or finished.
 * @ignore
 */
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
    /** Running total of the `length` of the values currently pending. */
    protected _pendingLength: number = 0;
    protected _offsets: OffsetsBufferBuilder;
    /** Values written since the last flush, keyed by index (`undefined` marks a null). */
    protected _pending: Map<number, any> | undefined;
    constructor(opts: BuilderOptions<T, TNull>) {
        super(opts);
        this._offsets = new OffsetsBufferBuilder();
    }
    /**
     * Queue a value at `index`, replacing any value already pending there and
     * adjusting `_pendingLength` by the difference.
     */
    public setValue(index: number, value: T['TValue']) {
        const pending = this._pending || (this._pending = new Map());
        const current = pending.get(index);
        current && (this._pendingLength -= current.length);
        this._pendingLength += value.length;
        pending.set(index, value);
    }
    /**
     * Write the validity bit for `index`. When the slot is null, an
     * `undefined` placeholder is queued so the index is still visited on flush.
     */
    public setValid(index: number, isValid: boolean) {
        if (!super.setValid(index, isValid)) {
            const pending = this._pending || (this._pending = new Map());
            // Keep `_pendingLength` consistent with `setValue()` when a null
            // overwrites a value that is still pending at this index. Guarded
            // on a numeric `length` because subclasses may queue pending
            // values of other shapes.
            const current = pending.get(index);
            if (current && typeof current.length === 'number') {
                this._pendingLength -= current.length;
            }
            pending.set(index, undefined);
            return false;
        }
        return true;
    }
    /** Discard all pending values, then clear the base Builder state. */
    public clear() {
        this._pendingLength = 0;
        this._pending = undefined;
        return super.clear();
    }
    /** Write out pending values, then flush the base Builder. */
    public flush() {
        this._flush();
        return super.flush();
    }
    /** Write out pending values, then finish the base Builder. */
    public finish() {
        this._flush();
        return super.finish();
    }
    /**
     * Reset the pending state and, if anything was queued, pass the queued
     * values and their total length to `_flushPending()`.
     */
    protected _flush() {
        const pending = this._pending;
        const pendingLength = this._pendingLength;
        // Reset before delegating so `_flushPending()` starts from a clean slate.
        this._pendingLength = 0;
        this._pending = undefined;
        if (pending && pending.size > 0) {
            this._flushPending(pending, pendingLength);
        }
        return this;
    }
    /** Write the queued values into the underlying buffers. */
    protected abstract _flushPending(pending: Map<number, any>, pendingLength: number): void;
}

/** @ignore */
type ThroughIterable<T extends DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull>) => IterableIterator<V<T>>;

/**
 * Create a generator function that appends each value of an `Iterable` to a
 * new `Builder` and yields a Vector each time the Builder's `length` (or
 * `byteLength`, for the `'bytes'` strategy) reaches `highWaterMark`.
 * `highWaterMark` defaults to 1000 for `'count'` and 2 ** 14 for `'bytes'`.
 * @ignore
 */
function throughIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? 1000 : 2 ** 14 } = options;
    const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return function*(source: Iterable<T['TValue'] | TNull>) {
        let numChunks = 0;
        let builder = Builder.new(options);
        for (const value of source) {
            if (builder.append(value)[sizeProperty] >= highWaterMark) {
                ++numChunks && (yield builder.toVector());
            }
        }
        // Yield the remainder, or a single empty Vector if nothing was yielded at all.
        if (builder.finish().length > 0 || numChunks === 0) {
            yield builder.toVector();
        }
    } as ThroughIterable<T, TNull>;
}

/** @ignore */
type ThroughAsyncIterable<T extends DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) => AsyncIterableIterator<V<T>>;

/**
 * Create an async generator function that appends each value of an `Iterable`
 * or `AsyncIterable` to a new `Builder` and yields a Vector each time the
 * Builder's `length` (or `byteLength`, for the `'bytes'` strategy) reaches
 * `highWaterMark`. `highWaterMark` defaults to 1000 for `'count'` and
 * 2 ** 14 for `'bytes'`.
 * @ignore
 */
function throughAsyncIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? 1000 : 2 ** 14 } = options;
    const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return async function* (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) {
        let numChunks = 0;
        let builder = Builder.new(options);
        for await (const value of source) {
            if (builder.append(value)[sizeProperty] >= highWaterMark) {
                ++numChunks && (yield builder.toVector());
            }
        }
        // Yield the remainder, or a single empty Vector if nothing was yielded at all.
        if (builder.finish().length > 0 || numChunks === 0) {
            yield builder.toVector();
        }
    } as ThroughAsyncIterable<T, TNull>;
}