1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 // Eager evaluation convenience APIs for invoking common functions, including 19 // necessary memory allocations 20 21 #pragma once 22 23 #include "arrow/compute/function.h" 24 #include "arrow/datum.h" 25 #include "arrow/result.h" 26 #include "arrow/util/macros.h" 27 #include "arrow/util/visibility.h" 28 29 namespace arrow { 30 31 class Array; 32 33 namespace compute { 34 35 class ExecContext; 36 37 // ---------------------------------------------------------------------- 38 // Aggregate functions 39 40 /// \addtogroup compute-concrete-options 41 /// @{ 42 43 /// \brief Control general scalar aggregate kernel behavior 44 /// 45 /// By default, null values are ignored (skip_nulls = true). 46 class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { 47 public: 48 explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); 49 constexpr static char const kTypeName[] = "ScalarAggregateOptions"; Defaults()50 static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } 51 52 /// If true (the default), null values are ignored. Otherwise, if any value is null, 53 /// emit null. 54 bool skip_nulls; 55 /// If less than this many non-null values are observed, emit null. 56 uint32_t min_count; 57 }; 58 59 /// \brief Control count aggregate kernel behavior. 60 /// 61 /// By default, only non-null values are counted. 62 class ARROW_EXPORT CountOptions : public FunctionOptions { 63 public: 64 enum CountMode { 65 /// Count only non-null values. 66 ONLY_VALID = 0, 67 /// Count only null values. 68 ONLY_NULL, 69 /// Count both non-null and null values. 70 ALL, 71 }; 72 explicit CountOptions(CountMode mode = CountMode::ONLY_VALID); 73 constexpr static char const kTypeName[] = "CountOptions"; Defaults()74 static CountOptions Defaults() { return CountOptions{}; } 75 76 CountMode mode; 77 }; 78 79 /// \brief Control Mode kernel behavior 80 /// 81 /// Returns top-n common values and counts. 82 /// By default, returns the most common value and count. 83 class ARROW_EXPORT ModeOptions : public FunctionOptions { 84 public: 85 explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); 86 constexpr static char const kTypeName[] = "ModeOptions"; Defaults()87 static ModeOptions Defaults() { return ModeOptions{}; } 88 89 int64_t n = 1; 90 /// If true (the default), null values are ignored. Otherwise, if any value is null, 91 /// emit null. 92 bool skip_nulls; 93 /// If less than this many non-null values are observed, emit null. 94 uint32_t min_count; 95 }; 96 97 /// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel 98 /// 99 /// The divisor used in calculations is N - ddof, where N is the number of elements. 100 /// By default, ddof is zero, and population variance or stddev is returned. 101 class ARROW_EXPORT VarianceOptions : public FunctionOptions { 102 public: 103 explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0); 104 constexpr static char const kTypeName[] = "VarianceOptions"; Defaults()105 static VarianceOptions Defaults() { return VarianceOptions{}; } 106 107 int ddof = 0; 108 /// If true (the default), null values are ignored. Otherwise, if any value is null, 109 /// emit null. 110 bool skip_nulls; 111 /// If less than this many non-null values are observed, emit null. 112 uint32_t min_count; 113 }; 114 115 /// \brief Control Quantile kernel behavior 116 /// 117 /// By default, returns the median value. 118 class ARROW_EXPORT QuantileOptions : public FunctionOptions { 119 public: 120 /// Interpolation method to use when quantile lies between two data points 121 enum Interpolation { 122 LINEAR = 0, 123 LOWER, 124 HIGHER, 125 NEAREST, 126 MIDPOINT, 127 }; 128 129 explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR, 130 bool skip_nulls = true, uint32_t min_count = 0); 131 132 explicit QuantileOptions(std::vector<double> q, 133 enum Interpolation interpolation = LINEAR, 134 bool skip_nulls = true, uint32_t min_count = 0); 135 136 constexpr static char const kTypeName[] = "QuantileOptions"; Defaults()137 static QuantileOptions Defaults() { return QuantileOptions{}; } 138 139 /// quantile must be between 0 and 1 inclusive 140 std::vector<double> q; 141 enum Interpolation interpolation; 142 /// If true (the default), null values are ignored. Otherwise, if any value is null, 143 /// emit null. 144 bool skip_nulls; 145 /// If less than this many non-null values are observed, emit null. 146 uint32_t min_count; 147 }; 148 149 /// \brief Control TDigest approximate quantile kernel behavior 150 /// 151 /// By default, returns the median value. 152 class ARROW_EXPORT TDigestOptions : public FunctionOptions { 153 public: 154 explicit TDigestOptions(double q = 0.5, uint32_t delta = 100, 155 uint32_t buffer_size = 500, bool skip_nulls = true, 156 uint32_t min_count = 0); 157 explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100, 158 uint32_t buffer_size = 500, bool skip_nulls = true, 159 uint32_t min_count = 0); 160 constexpr static char const kTypeName[] = "TDigestOptions"; Defaults()161 static TDigestOptions Defaults() { return TDigestOptions{}; } 162 163 /// quantile must be between 0 and 1 inclusive 164 std::vector<double> q; 165 /// compression parameter, default 100 166 uint32_t delta; 167 /// input buffer size, default 500 168 uint32_t buffer_size; 169 /// If true (the default), null values are ignored. Otherwise, if any value is null, 170 /// emit null. 171 bool skip_nulls; 172 /// If less than this many non-null values are observed, emit null. 173 uint32_t min_count; 174 }; 175 176 /// \brief Control Index kernel behavior 177 class ARROW_EXPORT IndexOptions : public FunctionOptions { 178 public: 179 explicit IndexOptions(std::shared_ptr<Scalar> value); 180 // Default constructor for serialization 181 IndexOptions(); 182 constexpr static char const kTypeName[] = "IndexOptions"; 183 184 std::shared_ptr<Scalar> value; 185 }; 186 187 /// @} 188 189 /// \brief Count values in an array. 190 /// 191 /// \param[in] options counting options, see CountOptions for more information 192 /// \param[in] datum to count 193 /// \param[in] ctx the function execution context, optional 194 /// \return out resulting datum 195 /// 196 /// \since 1.0.0 197 /// \note API not yet finalized 198 ARROW_EXPORT 199 Result<Datum> Count(const Datum& datum, 200 const CountOptions& options = CountOptions::Defaults(), 201 ExecContext* ctx = NULLPTR); 202 203 /// \brief Compute the mean of a numeric array. 204 /// 205 /// \param[in] value datum to compute the mean, expecting Array 206 /// \param[in] options see ScalarAggregateOptions for more information 207 /// \param[in] ctx the function execution context, optional 208 /// \return datum of the computed mean as a DoubleScalar 209 /// 210 /// \since 1.0.0 211 /// \note API not yet finalized 212 ARROW_EXPORT 213 Result<Datum> Mean( 214 const Datum& value, 215 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 216 ExecContext* ctx = NULLPTR); 217 218 /// \brief Compute the product of values of a numeric array. 219 /// 220 /// \param[in] value datum to compute product of, expecting Array or ChunkedArray 221 /// \param[in] options see ScalarAggregateOptions for more information 222 /// \param[in] ctx the function execution context, optional 223 /// \return datum of the computed sum as a Scalar 224 /// 225 /// \since 6.0.0 226 /// \note API not yet finalized 227 ARROW_EXPORT 228 Result<Datum> Product( 229 const Datum& value, 230 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 231 ExecContext* ctx = NULLPTR); 232 233 /// \brief Sum values of a numeric array. 234 /// 235 /// \param[in] value datum to sum, expecting Array or ChunkedArray 236 /// \param[in] options see ScalarAggregateOptions for more information 237 /// \param[in] ctx the function execution context, optional 238 /// \return datum of the computed sum as a Scalar 239 /// 240 /// \since 1.0.0 241 /// \note API not yet finalized 242 ARROW_EXPORT 243 Result<Datum> Sum( 244 const Datum& value, 245 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 246 ExecContext* ctx = NULLPTR); 247 248 /// \brief Calculate the min / max of a numeric array 249 /// 250 /// This function returns both the min and max as a struct scalar, with type 251 /// struct<min: T, max: T>, where T is the input type 252 /// 253 /// \param[in] value input datum, expecting Array or ChunkedArray 254 /// \param[in] options see ScalarAggregateOptions for more information 255 /// \param[in] ctx the function execution context, optional 256 /// \return resulting datum as a struct<min: T, max: T> scalar 257 /// 258 /// \since 1.0.0 259 /// \note API not yet finalized 260 ARROW_EXPORT 261 Result<Datum> MinMax( 262 const Datum& value, 263 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 264 ExecContext* ctx = NULLPTR); 265 266 /// \brief Test whether any element in a boolean array evaluates to true. 267 /// 268 /// This function returns true if any of the elements in the array evaluates 269 /// to true and false otherwise. Null values are ignored by default. 270 /// If null values are taken into account by setting ScalarAggregateOptions 271 /// parameter skip_nulls = false then Kleene logic is used. 272 /// See KleeneOr for more details on Kleene logic. 273 /// 274 /// \param[in] value input datum, expecting a boolean array 275 /// \param[in] options see ScalarAggregateOptions for more information 276 /// \param[in] ctx the function execution context, optional 277 /// \return resulting datum as a BooleanScalar 278 /// 279 /// \since 3.0.0 280 /// \note API not yet finalized 281 ARROW_EXPORT 282 Result<Datum> Any( 283 const Datum& value, 284 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 285 ExecContext* ctx = NULLPTR); 286 287 /// \brief Test whether all elements in a boolean array evaluate to true. 288 /// 289 /// This function returns true if all of the elements in the array evaluate 290 /// to true and false otherwise. Null values are ignored by default. 291 /// If null values are taken into account by setting ScalarAggregateOptions 292 /// parameter skip_nulls = false then Kleene logic is used. 293 /// See KleeneAnd for more details on Kleene logic. 294 /// 295 /// \param[in] value input datum, expecting a boolean array 296 /// \param[in] options see ScalarAggregateOptions for more information 297 /// \param[in] ctx the function execution context, optional 298 /// \return resulting datum as a BooleanScalar 299 300 /// \since 3.0.0 301 /// \note API not yet finalized 302 ARROW_EXPORT 303 Result<Datum> All( 304 const Datum& value, 305 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 306 ExecContext* ctx = NULLPTR); 307 308 /// \brief Calculate the modal (most common) value of a numeric array 309 /// 310 /// This function returns top-n most common values and number of times they occur as 311 /// an array of `struct<mode: T, count: int64>`, where T is the input type. 312 /// Values with larger counts are returned before smaller ones. 313 /// If there are more than one values with same count, smaller value is returned first. 314 /// 315 /// \param[in] value input datum, expecting Array or ChunkedArray 316 /// \param[in] options see ModeOptions for more information 317 /// \param[in] ctx the function execution context, optional 318 /// \return resulting datum as an array of struct<mode: T, count: int64> 319 /// 320 /// \since 2.0.0 321 /// \note API not yet finalized 322 ARROW_EXPORT 323 Result<Datum> Mode(const Datum& value, 324 const ModeOptions& options = ModeOptions::Defaults(), 325 ExecContext* ctx = NULLPTR); 326 327 /// \brief Calculate the standard deviation of a numeric array 328 /// 329 /// \param[in] value input datum, expecting Array or ChunkedArray 330 /// \param[in] options see VarianceOptions for more information 331 /// \param[in] ctx the function execution context, optional 332 /// \return datum of the computed standard deviation as a DoubleScalar 333 /// 334 /// \since 2.0.0 335 /// \note API not yet finalized 336 ARROW_EXPORT 337 Result<Datum> Stddev(const Datum& value, 338 const VarianceOptions& options = VarianceOptions::Defaults(), 339 ExecContext* ctx = NULLPTR); 340 341 /// \brief Calculate the variance of a numeric array 342 /// 343 /// \param[in] value input datum, expecting Array or ChunkedArray 344 /// \param[in] options see VarianceOptions for more information 345 /// \param[in] ctx the function execution context, optional 346 /// \return datum of the computed variance as a DoubleScalar 347 /// 348 /// \since 2.0.0 349 /// \note API not yet finalized 350 ARROW_EXPORT 351 Result<Datum> Variance(const Datum& value, 352 const VarianceOptions& options = VarianceOptions::Defaults(), 353 ExecContext* ctx = NULLPTR); 354 355 /// \brief Calculate the quantiles of a numeric array 356 /// 357 /// \param[in] value input datum, expecting Array or ChunkedArray 358 /// \param[in] options see QuantileOptions for more information 359 /// \param[in] ctx the function execution context, optional 360 /// \return resulting datum as an array 361 /// 362 /// \since 4.0.0 363 /// \note API not yet finalized 364 ARROW_EXPORT 365 Result<Datum> Quantile(const Datum& value, 366 const QuantileOptions& options = QuantileOptions::Defaults(), 367 ExecContext* ctx = NULLPTR); 368 369 /// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm 370 /// 371 /// \param[in] value input datum, expecting Array or ChunkedArray 372 /// \param[in] options see TDigestOptions for more information 373 /// \param[in] ctx the function execution context, optional 374 /// \return resulting datum as an array 375 /// 376 /// \since 4.0.0 377 /// \note API not yet finalized 378 ARROW_EXPORT 379 Result<Datum> TDigest(const Datum& value, 380 const TDigestOptions& options = TDigestOptions::Defaults(), 381 ExecContext* ctx = NULLPTR); 382 383 /// \brief Find the first index of a value in an array. 384 /// 385 /// \param[in] value The array to search. 386 /// \param[in] options The array to search for. See IndexOoptions. 387 /// \param[in] ctx the function execution context, optional 388 /// \return out a Scalar containing the index (or -1 if not found). 389 /// 390 /// \since 5.0.0 391 /// \note API not yet finalized 392 ARROW_EXPORT 393 Result<Datum> Index(const Datum& value, const IndexOptions& options, 394 ExecContext* ctx = NULLPTR); 395 396 namespace internal { 397 398 /// Internal use only: streaming group identifier. 399 /// Consumes batches of keys and yields batches of the group ids. 400 class ARROW_EXPORT Grouper { 401 public: 402 virtual ~Grouper() = default; 403 404 /// Construct a Grouper which receives the specified key types 405 static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs, 406 ExecContext* ctx = default_exec_context()); 407 408 /// Consume a batch of keys, producing the corresponding group ids as an integer array. 409 /// Currently only uint32 indices will be produced, eventually the bit width will only 410 /// be as wide as necessary. 411 virtual Result<Datum> Consume(const ExecBatch& batch) = 0; 412 413 /// Get current unique keys. May be called multiple times. 414 virtual Result<ExecBatch> GetUniques() = 0; 415 416 /// Get the current number of groups. 417 virtual uint32_t num_groups() const = 0; 418 419 /// \brief Assemble lists of indices of identical elements. 420 /// 421 /// \param[in] ids An unsigned, all-valid integral array which will be 422 /// used as grouping criteria. 423 /// \param[in] num_groups An upper bound for the elements of ids 424 /// \return A num_groups-long ListArray where the slot at i contains a 425 /// list of indices where i appears in ids. 426 /// 427 /// MakeGroupings([ 428 /// 2, 429 /// 2, 430 /// 5, 431 /// 5, 432 /// 2, 433 /// 3 434 /// ], 8) == [ 435 /// [], 436 /// [], 437 /// [0, 1, 4], 438 /// [5], 439 /// [], 440 /// [2, 3], 441 /// [], 442 /// [] 443 /// ] 444 static Result<std::shared_ptr<ListArray>> MakeGroupings( 445 const UInt32Array& ids, uint32_t num_groups, 446 ExecContext* ctx = default_exec_context()); 447 448 /// \brief Produce a ListArray whose slots are selections of `array` which correspond to 449 /// the provided groupings. 450 /// 451 /// For example, 452 /// ApplyGroupings([ 453 /// [], 454 /// [], 455 /// [0, 1, 4], 456 /// [5], 457 /// [], 458 /// [2, 3], 459 /// [], 460 /// [] 461 /// ], [2, 2, 5, 5, 2, 3]) == [ 462 /// [], 463 /// [], 464 /// [2, 2, 2], 465 /// [3], 466 /// [], 467 /// [5, 5], 468 /// [], 469 /// [] 470 /// ] 471 static Result<std::shared_ptr<ListArray>> ApplyGroupings( 472 const ListArray& groupings, const Array& array, 473 ExecContext* ctx = default_exec_context()); 474 }; 475 476 /// \brief Configure a grouped aggregation 477 struct ARROW_EXPORT Aggregate { 478 /// the name of the aggregation function 479 std::string function; 480 481 /// options for the aggregation function 482 const FunctionOptions* options; 483 }; 484 485 /// Internal use only: helper function for testing HashAggregateKernels. 486 /// This will be replaced by streaming execution operators. 487 ARROW_EXPORT 488 Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys, 489 const std::vector<Aggregate>& aggregates, bool use_threads = false, 490 ExecContext* ctx = default_exec_context()); 491 492 } // namespace internal 493 } // namespace compute 494 } // namespace arrow 495