// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import randomatic from 'randomatic';
import { VectorType as V } from 'apache-arrow/interfaces';

import {
    Data, Vector, Visitor, DataType,
    Table, Schema, Field, RecordBatch,
    Null,
    Bool,
    Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
    Float, Float16, Float32, Float64,
    Utf8,
    Binary,
    FixedSizeBinary,
    Date_, DateDay, DateMillisecond,
    Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
    Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond,
    Decimal,
    List,
    Struct,
    Union, DenseUnion, SparseUnion,
    Dictionary,
    Interval, IntervalDayTime, IntervalYearMonth,
    FixedSizeList,
    Map_,
    DateUnit, TimeUnit, UnionMode,
    util
} from './Arrow';

/** Integer types accepted as the index type of a generated dictionary. */
type TKeys = Int8 | Int16 | Int32 | Uint8 | Uint16 | Uint32;

/**
 * Typing for the generator visitor. This interface is merged with the class of
 * the same name below: the `visit` overloads describe the extra arguments each
 * data type accepts, and every `visitXxx` member is typed after the
 * `generateXxx` function that implements it.
 */
interface TestDataVectorGenerator extends Visitor {

    visit<T extends Null>            (type: T, length?: number): GeneratedVector<V<T>>;
    visit<T extends Bool>            (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Int>             (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Float>           (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Utf8>            (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Binary>          (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends FixedSizeBinary> (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Date_>           (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Timestamp>       (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Time>            (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Decimal>         (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends Interval>        (type: T, length?: number, nullCount?: number): GeneratedVector<V<T>>;
    visit<T extends List>            (type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector<V<T>>;
    visit<T extends FixedSizeList>   (type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector<V<T>>;
    visit<T extends Dictionary>      (type: T, length?: number, nullCount?: number, dictionary?: Vector): GeneratedVector<V<T>>;
    visit<T extends Union>           (type: T, length?: number, nullCount?: number, children?: Vector[]): GeneratedVector<V<T>>;
    visit<T extends Struct>          (type: T, length?: number, nullCount?: number, children?: Vector[]): GeneratedVector<V<T>>;
    visit<T extends Map_>            (type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector<V<T>>;
    visit<T extends DataType>        (type: T, length?: number, ...args: any[]): GeneratedVector<V<T>>;

    visitNull: typeof generateNull;
    visitBool: typeof generateBool;
    visitInt: typeof generateInt;
    visitFloat: typeof generateFloat;
    visitUtf8: typeof generateUtf8;
    visitBinary: typeof generateBinary;
    visitFixedSizeBinary: typeof generateFixedSizeBinary;
    visitDate: typeof generateDate;
    visitTimestamp: typeof generateTimestamp;
    visitTime: typeof generateTime;
    visitDecimal: typeof generateDecimal;
    visitList: typeof generateList;
    visitStruct: typeof generateStruct;
    visitUnion: typeof generateUnion;
    visitDictionary: typeof generateDictionary;
    visitInterval: typeof generateInterval;
    visitFixedSizeList: typeof generateFixedSizeList;
    visitMap: typeof generateMap;
}

/** Visitor whose per-type methods are installed on the prototype just below. */
class TestDataVectorGenerator extends Visitor {}

// Install every generator function as the matching visitor method. The
// generators are hoisted function declarations, so they can be referenced here
// even though their definitions appear further down the file.
Object.assign(TestDataVectorGenerator.prototype, {
    visitNull: generateNull,
    visitBool: generateBool,
    visitInt: generateInt,
    visitFloat: generateFloat,
    visitUtf8: generateUtf8,
    visitBinary: generateBinary,
    visitFixedSizeBinary: generateFixedSizeBinary,
    visitDate: generateDate,
    visitTimestamp: generateTimestamp,
    visitTime: generateTime,
    visitDecimal: generateDecimal,
    visitList: generateList,
    visitStruct: generateStruct,
    visitUnion: generateUnion,
    visitDictionary: generateDictionary,
    visitInterval: generateInterval,
    visitFixedSizeList: generateFixedSizeList,
    visitMap: generateMap,
});

/** Shared generator instance used by every exported factory in this module. */
const vectorGenerator = new TestDataVectorGenerator();

/** Child field used by the `list` and `fixedSizeList` factories when none is supplied. */
const defaultListChild = new Field('list[Int32]', new Int32());

/** Builds a fresh set of fields (Int32, Float32, Dictionary<Utf8, Int32>) for the default schema. */
const defaultRecordBatchChildren = () => {
    const i32 = new Field('i32', new Int32());
    const f32 = new Field('f32', new Float32());
    const dict = new Field('dict', new Dictionary(new Utf8(), new Int32()));
    return [i32, f32, dict];
};

/** Builds fresh default child fields for generated Struct vectors. */
const defaultStructChildren = () => [
    new Field('struct[0]', new Int32()),
    new Field('struct[1]', new Utf8()),
    new Field('struct[2]', new List(new Field('list[DateDay]', new DateDay())))
];

/** Builds the default Map child: an unnamed Struct field with Utf8 `key` and Float32 `value`. */
const defaultMapChild = () => [
    new Field('', new Struct<{ key: Utf8; value: Float32 }>([
        new Field('key', new Utf8()),
        new Field('value', new Float32())
    ]))
][0];

/** Builds fresh default child fields for generated Union vectors. */
const defaultUnionChildren = () => [
    new Field('union[0]', new Float64()),
    new Field('union[1]', new Dictionary(new Uint32(), new Int32())),
    new Field('union[2]', new Map_(defaultMapChild()))
];

/** A generated Table plus lazy accessors for the plain-JS values it was built from. */
export interface GeneratedTable {
    table: Table;
    /** Expected values, row-major, concatenated across all batches. */
    rows: () => any[][];
    /** Expected values, column-major, concatenated across all batches. */
    cols: () => any[][];
    /** Per-column dictionary keys, concatenated across all batches. */
    keys: () => number[][];
    rowBatches: (() => any[][])[];
    colBatches: (() => any[][])[];
    keyBatches: (() => number[][])[];
}

/** A generated RecordBatch plus lazy accessors for the plain-JS values it was built from. */
export interface GeneratedRecordBatch {
    recordBatch: RecordBatch;
    rows: () => any[][];
    cols: () => any[][];
    keys: () => number[][];
}

/** A generated Vector, its expected values and (dictionary vectors only) its keys. */
export type GeneratedVector<TVec extends Vector = Vector> = {
    vector: TVec;
    keys?: number[];
    values: () => (TVec['TValue'] | null)[];
};

/**
 * Generates a Table made of one record batch per entry in `lengths`.
 *
 * @param lengths Row count of each record batch.
 * @param schema Schema shared by every batch.
 * @returns The table together with memoized accessors for the expected rows,
 * columns and dictionary keys, both combined and per batch.
 */
export const table = (lengths = [100], schema: Schema = new Schema(defaultRecordBatchChildren(), new Map([['foo', 'bar']]))): GeneratedTable => {
    const generated = lengths.map((length) => recordBatch(length, schema));
    const rowBatches = generated.map(({ rows }) => rows);
    const colBatches = generated.map(({ cols }) => cols);
    const keyBatches = generated.map(({ keys }) => keys);
    const rows = memoize(() => rowBatches.reduce((rows: any[][], batch) => [...rows, ...batch()], []));
    // The first batch seeds the accumulator; later batches are appended column by column.
    const keys = memoize(() => keyBatches.reduce((keys: any[][], batch) => (
        !keys.length ? batch() : keys.map((idxs, i) => [...(idxs || []), ...(batch()[i] || [])])
    ), []));
    const cols = memoize(() => colBatches.reduce((cols: any[][], batch) => (
        !cols.length ? batch() : cols.map((vals, i) => [...vals, ...batch()[i]])
    ), []));

    return { rows, cols, keys, rowBatches, colBatches, keyBatches, table: new Table(schema, generated.map(({ recordBatch }) => recordBatch)) };
};

/**
 * Generates a RecordBatch with one random vector per schema field.
 *
 * @param length Number of rows.
 * @param schema Schema whose field types decide which vectors are generated.
 * @returns The batch together with memoized accessors for the expected rows,
 * columns and dictionary keys.
 */
export const recordBatch = (length = 100, schema: Schema = new Schema(defaultRecordBatchChildren())): GeneratedRecordBatch => {

    const generated = schema.fields.map((f) => vectorGenerator.visit(f.type, length));
    const vecs = generated.map(({ vector }) => vector);

    const keys = memoize(() => generated.map(({ keys }) => keys));
    const cols = memoize(() => generated.map(({ values }) => values()));
    // Transposes the column-major values into rows the first time it is called.
    const rows = ((_cols: () => any[][]) => memoize((rows: any[][] = [], cols: any[][] = _cols()) => {
        for (let i = -1; ++i < length; rows[i] = cols.map((vals) => vals[i]));
        return rows;
    }))(cols);

    return { rows, cols, keys, recordBatch: new RecordBatch(schema, length, vecs) };
};

// Per-type factories. Each creates a fresh DataType and runs it through the
// shared generator. Unless overridden, roughly 20% of the slots are null
// (`length * 0.2 | 0`).
export const null_ = (length = 100) => vectorGenerator.visit(new Null(), length);
export const bool = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Bool(), length, nullCount);
export const int8 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Int8(), length, nullCount);
export const int16 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Int16(), length, nullCount);
export const int32 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Int32(), length, nullCount);
export const int64 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Int64(), length, nullCount);
export const uint8 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Uint8(), length, nullCount);
export const uint16 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Uint16(), length, nullCount);
export const uint32 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Uint32(), length, nullCount);
export const uint64 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Uint64(), length, nullCount);
export const float16 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Float16(), length, nullCount);
export const float32 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Float32(), length, nullCount);
export const float64 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Float64(), length, nullCount);
export const utf8 = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Utf8(), length, nullCount);
export const binary = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new Binary(), length, nullCount);
export const fixedSizeBinary = (length = 100, nullCount = length * 0.2 | 0, byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount);
export const dateDay = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new DateDay(), length, nullCount);
export const dateMillisecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new DateMillisecond(), length, nullCount);
export const timestampSecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimestampSecond(), length, nullCount);
export const timestampMillisecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimestampMillisecond(), length, nullCount);
export const timestampMicrosecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimestampMicrosecond(), length, nullCount);
export const timestampNanosecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimestampNanosecond(), length, nullCount);
export const timeSecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimeSecond(), length, nullCount);
export const timeMillisecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimeMillisecond(), length, nullCount);
export const timeMicrosecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimeMicrosecond(), length, nullCount);
export const timeNanosecond = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new TimeNanosecond(), length, nullCount);
export const decimal = (length = 100, nullCount = length * 0.2 | 0, scale = 2, precision = 9) => vectorGenerator.visit(new Decimal(scale, precision), length, nullCount);
export const list = (length = 100, nullCount = length * 0.2 | 0, child = defaultListChild) => vectorGenerator.visit(new List(child), length, nullCount);
export const struct = <T extends { [key: string]: DataType } = any>(length = 100, nullCount = length * 0.2 | 0, children: Field<T[keyof T]>[] = <any> defaultStructChildren()) => vectorGenerator.visit(new Struct<T>(children), length, nullCount);
export const denseUnion = (length = 100, nullCount = length * 0.2 | 0, children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new DenseUnion(children.map((f) => f.typeId), children), length, nullCount);
export const sparseUnion = (length = 100, nullCount = length * 0.2 | 0, children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new SparseUnion(children.map((f) => f.typeId), children), length, nullCount);
export const dictionary = <T extends DataType = Utf8, TKey extends TKeys = Int32> (length = 100, nullCount = length * 0.2 | 0, dict: T = <any> new Utf8(), keys: TKey = <any> new Int32()) => vectorGenerator.visit(new Dictionary(dict, keys), length, nullCount);
export const intervalDayTime = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new IntervalDayTime(), length, nullCount);
export const intervalYearMonth = (length = 100, nullCount = length * 0.2 | 0) => vectorGenerator.visit(new IntervalYearMonth(), length, nullCount);
export const fixedSizeList = (length = 100, nullCount = length * 0.2 | 0, listSize = 2, child = defaultListChild) => vectorGenerator.visit(new FixedSizeList(listSize, child), length, nullCount);
export const map = <TKey extends DataType = any, TValue extends DataType = any>(length = 100, nullCount = length * 0.2 | 0, child: Field<Struct<{key: TKey; value: TValue}>> = <any> defaultMapChild()) => vectorGenerator.visit(new Map_<TKey, TValue>(child), length, nullCount);

/** All per-type factories, addressable by name. */
export const vecs = {
    null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map
} as { [k: string]: (...args: any[]) => any };

/** Generates a Null vector; every expected value is `null`. */
function generateNull<T extends Null>(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector<V<T>> {
    return { values: () => Array.from({ length }, () => null), vector: Vector.new(Data.Null(type, 0, length)) };
}

/** Generates a Bool vector whose data bitmap has about half of its bits cleared. */
function generateBool<T extends Bool>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const data = createBitmap(length, length / 2 | 0);
    const nullBitmap = createBitmap(length, nullCount);
    const values = memoize(() => {
        const values = [] as (boolean | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => values[i] = !valid ? null : isValid(data, i));
        return values;
    });
    // Clear the data bit of every null slot.
    iterateBitmap(length, nullBitmap, (i, valid) => !valid && (data[i >> 3] &= ~(1 << (i % 8))));

    return { values, vector: Vector.new(Data.Bool(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates an Int vector. Types wider than 32 bits use two array elements per
 * value (`stride === 2`), and their expected values are sub-arrays of the data.
 */
function generateInt<T extends Int>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const ArrayType = type.ArrayType;
    const stride = 1 + Number(type.bitWidth > 32);
    const nullBitmap = createBitmap(length, nullCount);
    const data = fillRandom(ArrayType as any, length * stride);
    const values = memoize(() => {
        const values = [] as (number | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null
                : stride === 1 ? data[i]
                : data.subarray(i * stride, (i + 1) * stride);
        });
        return values;
    });
    // Zero the storage of every null slot.
    iterateBitmap(length, nullBitmap, (i, valid) => !valid && (data.set(new Uint8Array(stride), i * stride)));
    return { values, vector: Vector.new(Data.Int(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates a Float vector. When `type.precision` is 0 the stored values are
 * converted with `util.uint16ToFloat64` to produce the expected values.
 */
function generateFloat<T extends Float>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const ArrayType = type.ArrayType;
    const precision = type.precision;
    const data = fillRandom(ArrayType as any, length);
    const nullBitmap = createBitmap(length, nullCount);
    const values = memoize(() => {
        const values = [] as (number | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : precision > 0 ? data[i] : util.uint16ToFloat64(data[i]);
        });
        return values;
    });
    // Zero null slots and scale valid ones by another random factor.
    iterateBitmap(length, nullBitmap, (i, valid) => data[i] = !valid ? 0 : data[i] * Math.random());
    return { values, vector: Vector.new(Data.Float(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates a Utf8 vector of random strings. Non-empty strings are unique
 * within the vector; empty strings are only allowed when `nullCount` is 0.
 */
function generateUtf8<T extends Utf8>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const nullBitmap = createBitmap(length, nullCount);
    const offsets = createVariableWidthOffsets(length, nullBitmap, undefined, undefined, nullCount != 0);
    const values: string[] = new Array(offsets.length - 1).fill(null);
    [...offsets.slice(1)]
        .map((o, i) => isValid(nullBitmap, i) ? o - offsets[i] : null)
        .reduce((map, length, i) => {
            if (length !== null) {
                if (length > 0) {
                    // Retry until the string has not been produced before.
                    do {
                        values[i] = randomString(length);
                    } while (map.has(values[i]));
                    return map.set(values[i], i);
                }
                values[i] = '';
            }
            return map;
        }, new Map<string, number>());
    const data = createVariableWidthBytes(length, nullBitmap, offsets, (i) => encodeUtf8(values[i]));
    return { values: () => values, vector: Vector.new(Data.Utf8(type, 0, length, nullCount, nullBitmap, offsets, data)) };
}

/** Generates a Binary vector of random byte strings sized by random offsets. */
function generateBinary<T extends Binary>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const nullBitmap = createBitmap(length, nullCount);
    const offsets = createVariableWidthOffsets(length, nullBitmap, undefined, undefined, nullCount != 0);
    const values = [...offsets.slice(1)]
        .map((o, i) => isValid(nullBitmap, i) ? o - offsets[i] : null)
        .map((length) => length == null ? null : randomBytes(length));
    const data = createVariableWidthBytes(length, nullBitmap, offsets, (i) => values[i]!);
    return { values: () => values, vector: Vector.new(Data.Binary(type, 0, length, nullCount, nullBitmap, offsets, data)) };
}

/** Generates a FixedSizeBinary vector; expected values are `byteWidth`-long views into the data. */
function generateFixedSizeBinary<T extends FixedSizeBinary>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const nullBitmap = createBitmap(length, nullCount);
    const data = fillRandom(Uint8Array, length * type.byteWidth);
    const values = memoize(() => {
        const values = [] as (Uint8Array | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : data.subarray(i * type.byteWidth, (i + 1) * type.byteWidth);
        });
        return values;
    });
    // Zero the bytes of every null slot.
    iterateBitmap(length, nullBitmap, (i, valid) => !valid && data.set(new Uint8Array(type.byteWidth), i * type.byteWidth));
    return { values, vector: Vector.new(Data.FixedSizeBinary(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates a Date vector. The storage helper is chosen by `type.unit`
 * (`createDate32` for days, otherwise `createDate64`, which is defined outside
 * this section). Expected values are exposed as `Date` objects.
 */
function generateDate<T extends Date_>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const values = [] as (number | null)[];
    const nullBitmap = createBitmap(length, nullCount);
    const data = type.unit === DateUnit.DAY
        ? createDate32(length, nullBitmap, values)
        : createDate64(length, nullBitmap, values);
    return {
        values: () => values.map((x) => x == null ? null : new Date(x)),
        vector: Vector.new(Data.Date(type, 0, length, nullCount, nullBitmap, data))
    };
}

/** Generates a Timestamp vector; `multiple` is the number of `type.unit` ticks per second. */
function generateTimestamp<T extends Timestamp>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const values = [] as (number | null)[];
    const nullBitmap = createBitmap(length, nullCount);
    const multiple = type.unit === TimeUnit.NANOSECOND ? 1000000000 :
                     type.unit === TimeUnit.MICROSECOND ? 1000000 :
                     type.unit === TimeUnit.MILLISECOND ? 1000 : 1;
    const data = createTimestamp(length, nullBitmap, multiple, values);
    return { values: () => values, vector: Vector.new(Data.Timestamp(type, 0, length, nullCount, nullBitmap, data)) };
}

/** Generates a Time vector, using 32-bit or 64-bit storage according to `type.bitWidth`. */
function generateTime<T extends Time>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const values = [] as (Int32Array | number | null)[];
    const nullBitmap = createBitmap(length, nullCount);
    const multiple = type.unit === TimeUnit.NANOSECOND ? 1000000000 :
                     type.unit === TimeUnit.MICROSECOND ? 1000000 :
                     type.unit === TimeUnit.MILLISECOND ? 1000 : 1;
    const data = type.bitWidth === 32
        ? createTime32(length, nullBitmap, multiple, values as (number | null)[])
        : createTime64(length, nullBitmap, multiple, values as (Int32Array | null)[]);
    return { values: () => values, vector: Vector.new(Data.Time(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates a Decimal vector. Each value occupies four Uint32 words (16 bytes),
 * and expected values are 4-word views into the shared buffer.
 */
function generateDecimal<T extends Decimal>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const data = fillRandom(Uint32Array, length * 4);
    const nullBitmap = createBitmap(length, nullCount);
    const values = memoize(() => {
        const values = [] as (Uint32Array | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : new Uint32Array(data.buffer, 16 * i, 4);
        });
        return values;
    });
    iterateBitmap(length, nullBitmap, (i, valid) => {
        if (!valid) {
            // Zero exactly the four words of slot `i`. The previous byte offsets
            // (4 * i and 4 * i + 4) overwrote neighbouring values instead.
            data.fill(0, 4 * i, 4 * (i + 1));
        }
    });
    return { values, vector: Vector.new(Data.Decimal(type, 0, length, nullCount, nullBitmap, data))};
}

/**
 * Generates an Interval vector with `1 + type.unit` Int32 words per value.
 * Two-word values are exposed as views into the data; one-word values are
 * split into `[value / 12, value % 12]`.
 */
function generateInterval<T extends Interval>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0): GeneratedVector<V<T>> {
    const stride = (1 + type.unit);
    const nullBitmap = createBitmap(length, nullCount);
    const data = fillRandom(Int32Array, length * stride);
    const values = memoize(() => {
        const values = [] as (Int32Array | null)[];
        iterateBitmap(length, nullBitmap, (i: number, valid: boolean) => {
            values[i] = !valid ? null : stride === 2
                ? new Int32Array(data.buffer, 4 * i * stride, stride)
                : new Int32Array([data[i] / 12 | 0, data[i] % 12 | 0]);
        });
        return values;
    });
    // Zero the storage of every null slot.
    iterateBitmap(length, nullBitmap, (i: number, valid: boolean) => {
        !valid && data.set(new Int32Array(stride), i * stride);
    });
    return { values, vector: Vector.new(Data.Interval(type, 0, length, nullCount, nullBitmap, data)) };
}

/**
 * Generates a List vector over `child` (by default a generated vector three
 * times as long as the list). Expected values are slices of the child's
 * values between consecutive offsets.
 */
function generateList<T extends List>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0, child = this.visit(type.children[0].type, length * 3, nullCount * 3)): GeneratedVector<V<T>> {
    const childVec = child.vector;
    const nullBitmap = createBitmap(length, nullCount);
    const stride = childVec.length / (length - nullCount);
    const offsets = createVariableWidthOffsets(length, nullBitmap, childVec.length, stride);
    const values = memoize(() => {
        const childValues = child.values();
        const values: (T['valueType'] | null)[] = [...offsets.slice(1)]
            .map((offset, i) => isValid(nullBitmap, i) ? offset : null)
            .map((o, i) => o == null ? null : childValues.slice(offsets[i], o));
        return values;
    });
    return { values, vector: Vector.new(Data.List(type, 0, length, nullCount, nullBitmap, offsets, childVec)) };
}

/** Generates a FixedSizeList vector; slot `i` covers `type.listSize` consecutive child values. */
function generateFixedSizeList<T extends FixedSizeList>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0, child = this.visit(type.children[0].type, length * type.listSize, nullCount * type.listSize)): GeneratedVector<V<T>> {
    const nullBitmap = createBitmap(length, nullCount);
    const values = memoize(() => {
        const childValues = child.values();
        const values = [] as (T['valueType'] | null)[];
        for (let i = -1, stride = type.listSize; ++i < length;) {
            values[i] = isValid(nullBitmap, i) ? childValues.slice(i * stride, (i + 1) * stride) : null;
        }
        return values;
    });
    return { values, vector: Vector.new(Data.FixedSizeList(type, 0, length, nullCount, nullBitmap, child.vector)) };
}

/**
 * Generates a Dictionary vector. If `type` already carries a non-empty
 * `dictionaryVector` from an earlier call, the new dictionary is appended to
 * it. The combined dictionary and its values accessor are stored back on
 * `type` for later calls.
 */
function generateDictionary<T extends Dictionary>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0, dictionary = this.visit(type.dictionary, length, 0)): GeneratedVector<V<T>> {

    const t = <any> type;
    const currValues = t.dictionaryValues;
    const hasDict = t.dictionaryVector && t.dictionaryVector.length > 0;
    const dict = hasDict ? t.dictionaryVector.concat(dictionary.vector) : dictionary.vector;
    const vals = hasDict ? (() => [...currValues(), ...dictionary.values()]) : dictionary.values;

    const maxIdx = dict.length - 1;
    const keys = new t.indices.ArrayType(length);
    const nullBitmap = createBitmap(length, nullCount);

    const values = memoize(() => {
        const dict = vals();
        const values = [] as (T['TValue'] | null)[];
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : dict[keys[i]];
        });
        return values;
    });

    // NOTE(review): `rand() * maxIdx | 0` never yields `maxIdx` itself, so the
    // last dictionary entry is never referenced -- confirm this is intended.
    iterateBitmap(length, nullBitmap, (i, valid) => {
        keys[i] = !valid ? 0 : rand() * maxIdx | 0;
    });

    t.dictionaryVector = dict;
    t.dictionaryValues = vals;

    return { values, keys, vector: Vector.new(Data.Dictionary(type, 0, length, nullCount, nullBitmap, keys, dict)) };
}

/**
 * Generates a Union vector. In sparse mode every child has `length` slots and
 * slot `i` reads child value `i`. In dense mode children are shorter and slot
 * `i` reads the chosen child at `offsets[i]`.
 */
function generateUnion<T extends Union>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0, children?: GeneratedVector<any>[]): GeneratedVector<V<T>> {

    const numChildren = type.children.length;

    if (!children) {
        if (type.mode === UnionMode.Sparse) {
            children = type.children.map((f) => this.visit(f.type, length, nullCount));
        } else {
            const childLength = Math.ceil(length / numChildren);
            // NOTE(review): divides by childLength rather than numChildren -- confirm intent.
            const childNullCount = (nullCount / childLength) | 0;
            children = type.children.map((f) => this.visit(f.type, childLength, childNullCount));
        }
    }

    const typeIds = type.typeIds;
    const typeIdsBuffer = new Int8Array(length);
    const vecs = children.map(({ vector }) => vector);
    const cols = children.map(({ values }) => values);
    const nullBitmap = createBitmap(length, nullCount);
    // Reverse lookup from a type id to the index of its child.
    const typeIdToChildIndex = typeIds.reduce((typeIdToChildIndex, typeId, idx) => {
        typeIdToChildIndex[typeId] = idx;
        return typeIdToChildIndex;
    }, Object.create(null) as { [key: number]: number });

    if (type.mode === UnionMode.Sparse) {
        const values = memoize(() => {
            const values = [] as any[];
            const childValues = cols.map((x) => x());
            iterateBitmap(length, nullBitmap, (i, valid) => {
                values[i] = !valid ? null : childValues[typeIdToChildIndex[typeIdsBuffer[i]]][i];
            });
            return values;
        });
        iterateBitmap(length, nullBitmap, (i, valid) => {
            typeIdsBuffer[i] = !valid ? 0 : typeIds[rand() * numChildren | 0];
        });
        return { values, vector: Vector.new(Data.Union(type as SparseUnion, 0, length, nullCount, nullBitmap, typeIdsBuffer, vecs)) } as GeneratedVector<V<T>>;
    }

    const offsets = new Int32Array(length);
    const values = memoize(() => {
        const values = [] as any[];
        const childValues = cols.map((x) => x());
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : childValues[typeIdToChildIndex[typeIdsBuffer[i]]][offsets[i]];
        });
        return values;
    });
    iterateBitmap(length, nullBitmap, (i, valid) => {
        if (!valid) {
            offsets[i] = 0;
            typeIdsBuffer[i] = 0;
        } else {
            const colIdx = rand() * numChildren | 0;
            offsets[i] = i / numChildren | 0;
            typeIdsBuffer[i] = typeIds[colIdx];
        }
    });
    return { values, vector: Vector.new(Data.Union(type as DenseUnion, 0, length, nullCount, nullBitmap, typeIdsBuffer, offsets, vecs)) } as GeneratedVector<V<T>>;
}

/** Generates a Struct vector; each expected row is an object keyed by child field name. */
function generateStruct<T extends Struct>(this: TestDataVectorGenerator, type: T, length = 100, nullCount = length * 0.2 | 0, children = type.children.map((f) => this.visit(f.type, length, nullCount))): GeneratedVector<V<T>> {
    const vecs = children.map(({ vector }) => vector);
    const cols = children.map(({ values }) => values);
    const nullBitmap = createBitmap(length, nullCount);
    const values = memoize(() => {
        const values = [] as any[];
        const childValues = cols.map((x) => x());
        const names = type.children.map((f) => f.name);
        iterateBitmap(length, nullBitmap, (i, valid) => {
            values[i] = !valid ? null : childValues.reduce((row, col, j) => ({
                ...row, [names[j]]: col[i]
            }), {});
        });
        return values;
    });
    return { values, vector: Vector.new(Data.Struct(type, 0, length, nullCount, nullBitmap, vecs)) };
}

/**
 * Generates a Map vector over a struct child of `{ key, value }` entries (by
 * default three times as long as the map, with non-null keys). Each expected
 * value is a JS `Map` built from the entries between consecutive offsets.
 */
function generateMap<T extends Map_>(this: TestDataVectorGenerator,
                                     type: T, length = 100, nullCount = length * 0.2 | 0,
                                     child = this.visit(type.children[0].type, length * 3, 0, [
                                         this.visit(type.children[0].type.children[0].type, length * 3, 0),
                                         this.visit(type.children[0].type.children[1].type, length * 3, nullCount * 3)
                                     ])): GeneratedVector<V<T>> {

    type K = T['keyType']['TValue'];
    type V = T['valueType']['TValue'];

    const childVec = child.vector;
    const nullBitmap = createBitmap(length, nullCount);
    const stride = childVec.length / (length - nullCount);
    const offsets = createVariableWidthOffsets(length, nullBitmap, childVec.length, stride);
    const values = memoize(() => {
        const childValues = child.values() as { key: K; value: V }[];
        const values: (T['TValue'] | null)[] = [...offsets.slice(1)]
            .map((offset, i) => isValid(nullBitmap, i) ? offset : null)
            .map((o, i) => o == null ? null : (() => {
                const slice = childValues.slice(offsets[i], o);
                const pairs = slice.map(({ key, value }) => [key, value]);
                return new Map<K, V>(pairs as any as (readonly [K, V])[]);
            })());
        return values;
    });
    return { values, vector: Vector.new(Data.Map(type, 0, length, nullCount, nullBitmap, offsets, childVec)) };
}

/** Typed-array constructors accepted by `fillRandom`. */
type TypedArrayConstructor =
    (typeof Int8Array) |
    (typeof Int16Array) |
    (typeof Int32Array) |
    (typeof Uint8Array) |
    (typeof Uint16Array) |
    (typeof Uint32Array) |
    (typeof Float32Array) |
    (typeof Float64Array);


const rand = Math.random.bind(Math);
/** Returns `length` random bytes. */
const randomBytes = (length: number) => fillRandom(Uint8Array, length);
/** Returns a random string of `length` characters drawn from `[a-z0-9_]`. */
const randomString = (length: number) => randomatic('?', length, { chars: `abcdefghijklmnopqrstuvwxyz0123456789_` });

/**
 * Wraps `fn` so it runs at most once; later calls return the first result.
 * An explicit flag is used so that falsy results are cached too.
 */
const memoize = (fn: () => any) => {
    let cached: any;
    let called = false;
    return (): any => {
        if (!called) {
            cached = fn();
            called = true;
        }
        return cached;
    };
};

/** Encodes a string as UTF-8 bytes using one shared `TextEncoder`. */
const encodeUtf8 = ((encoder) =>
    encoder.encode.bind(encoder) as (input?: string, options?: { stream?: boolean }) => Uint8Array
)(new TextEncoder());

/**
 * Allocates an `ArrayType` of `length` elements and assigns each element a
 * random value with magnitude below `2 ** (8 * BYTES_PER_ELEMENT) - 1` and a
 * random sign. The typed array applies its own conversion on assignment.
 */
function fillRandom<T extends TypedArrayConstructor>(ArrayType: T, length: number) {
    const BPE = ArrayType.BYTES_PER_ELEMENT;
    const array = new ArrayType(length);
    const max = (2 ** (8 * BPE)) - 1;
    for (let i = -1; ++i < length; array[i] = rand() * max * (rand() > 0.5 ? -1 : 1));
    return array as InstanceType<T>;
}

/** Returns whether bit `i` of `bitmap` is set (least-significant bit first within each byte). */
function isValid(bitmap: Uint8Array, i: number) {
    return (bitmap[i >> 3] & 1 << (i % 8)) !== 0;
}

/** Calls `fn(index, valid)` for each of the first `length` bits of `bitmap`, in order. */
function iterateBitmap(length: number, bitmap: Uint8Array, fn: (index: number, valid: boolean) => any) {
    let byteIndex = 0, valueIndex = 0;
    for (let bit = 0; length > 0; bit = 0) {
        let byte = bitmap[byteIndex++];
        do {
            fn(valueIndex++, (byte & 1 << bit) !== 0);
        } while (--length > 0 && ++bit < 8);
    }
}

/**
 * Creates a validity bitmap for `length` slots with `nullCount` randomly
 * chosen bits cleared; all other bits, including padding, are set.
 *
 * The buffer holds `ceil(length / 8)` bytes rounded up to a multiple of 8
 * (minimum 8). The previous size used `length >> 3` and came out one byte
 * short for lengths such as 65..71, making the trailing slots read as null.
 * `nullCount` is clamped to `length` so the slot search always terminates.
 */
function createBitmap(length: number, nullCount: number) {
    const nulls = Object.create(null) as { [key: number]: boolean };
    const bytes = new Uint8Array(((((length + 7) >> 3) + 7) & ~7) || 8).fill(255);
    const numNulls = Math.min(nullCount, length);
    for (let i = 0, j = -1; ++j < numNulls;) {
        // Pick a slot that has not been nulled yet.
        while (nulls[i = (rand() * length) | 0]);
        nulls[i] = true;
        bytes[i >> 3] &= ~(1 << (i % 8)); // false
    }
    return bytes;
}

/**
 * Creates `length + 1` monotonically non-decreasing offsets. Null slots get
 * zero width; valid slots advance by a random amount below `stride`, capped
 * at `max`. With `allowEmpty === false` a valid slot is retried until its
 * width is non-zero.
 */
function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, max = Infinity, stride = 20, allowEmpty = true) {
    const offsets = new Int32Array(length + 1);
    iterateBitmap(length, nullBitmap, (i, valid) => {
        if (!valid) {
            offsets[i + 1] = offsets[i];
        } else {
            do {
                offsets[i + 1] = Math.min(max, offsets[i] + (rand() * stride | 0));
            } while (!allowEmpty && offsets[i + 1] === offsets[i]);
        }
    });
    return offsets;
}

/** Concatenates the bytes returned by `getBytes(i)` for each valid slot at `offsets[i]`. */
function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array, getBytes: (index: number) => Uint8Array) {
    const bytes = new Uint8Array(offsets[length]);
    iterateBitmap(length, nullBitmap, (i, valid) => {
        valid && bytes.set(getBytes(i), offsets[i]);
    });
    return bytes;
}

function createDate32(length: number, nullBitmap: Uint8Array, values: (number | null)[] = []) {
    const data = new Int32Array(length).fill(Date.now() / 86400000 | 0);
    iterateBitmap(length, nullBitmap, (i, valid) => {
        if (!valid) {
            data[i] = 0;
            values[i] = null;
652 } else { 653 data[i] = data[i] + (rand() * 10000 * (rand() > 0.5 ? -1 : 1)) | 0; 654 values[i] = data[i] * 86400000; 655 } 656 }); 657 return data; 658} 659 660function createDate64(length: number, nullBitmap: Uint8Array, values: (number | null)[] = []) { 661 const data = new Int32Array(length * 2).fill(0); 662 const data32 = createDate32(length, nullBitmap, values); 663 iterateBitmap(length, nullBitmap, (i, valid) => { 664 if (valid) { 665 const value = data32[i] * 86400000; 666 const hi = (value / 4294967296) | 0; 667 const lo = (value - 4294967296 * hi) | 0; 668 values[i] = value; 669 data[i * 2 + 0] = lo; 670 data[i * 2 + 1] = hi; 671 } 672 }); 673 return data; 674} 675 676function createTimestamp(length: number, nullBitmap: Uint8Array, multiple: number, values: (number | null)[] = []) { 677 const mult = 86400 * multiple; 678 const data = new Int32Array(length * 2).fill(0); 679 const data32 = createDate32(length, nullBitmap, values); 680 iterateBitmap(length, nullBitmap, (i, valid) => { 681 if (valid) { 682 const value = data32[i] * mult; 683 const hi = (value / 4294967296) | 0; 684 const lo = (value - 4294967296 * hi) | 0; 685 data[i * 2 + 0] = lo; 686 data[i * 2 + 1] = hi; 687 } 688 }); 689 return data; 690} 691 692function createTime32(length: number, nullBitmap: Uint8Array, multiple: number, values: (number | null)[] = []) { 693 const data = new Int32Array(length).fill(0); 694 iterateBitmap(length, nullBitmap, (i, valid) => { 695 if (!valid) { 696 data[i] = 0; 697 values[i] = null; 698 } else { 699 values[i] = data[i] = ((1000 * rand()) | 0 * multiple) * (rand() > 0.5 ? 
-1 : 1); 700 } 701 }); 702 return data; 703} 704 705function createTime64(length: number, nullBitmap: Uint8Array, multiple: number, values: (Int32Array | null)[] = []) { 706 const data = new Int32Array(length * 2).fill(0); 707 iterateBitmap(length, nullBitmap, (i, valid) => { 708 if (!valid) { 709 values[i] = null; 710 } else { 711 const value = (1000 * rand()) | 0 * multiple; 712 const hi = (value / 4294967296) | 0; 713 const lo = (value - 4294967296 * hi) | 0; 714 data[i * 2 + 0] = lo; 715 data[i * 2 + 1] = hi; 716 values[i] = data.subarray(i * 2, (i + 1) * 2); 717 } 718 }); 719 return data; 720} 721