1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Eager evaluation convenience APIs for invoking common functions, including
19 // necessary memory allocations
20 
21 #pragma once
22 
23 #include "arrow/compute/function.h"
24 #include "arrow/datum.h"
25 #include "arrow/result.h"
26 #include "arrow/util/macros.h"
27 #include "arrow/util/visibility.h"
28 
29 namespace arrow {
30 
31 class Array;
32 
33 namespace compute {
34 
35 class ExecContext;
36 
37 // ----------------------------------------------------------------------
38 // Aggregate functions
39 
40 /// \addtogroup compute-concrete-options
41 /// @{
42 
43 /// \brief Control general scalar aggregate kernel behavior
44 ///
45 /// By default, null values are ignored (skip_nulls = true).
46 class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
47  public:
48   explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
49   constexpr static char const kTypeName[] = "ScalarAggregateOptions";
Defaults()50   static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
51 
52   /// If true (the default), null values are ignored. Otherwise, if any value is null,
53   /// emit null.
54   bool skip_nulls;
55   /// If less than this many non-null values are observed, emit null.
56   uint32_t min_count;
57 };
58 
59 /// \brief Control count aggregate kernel behavior.
60 ///
61 /// By default, only non-null values are counted.
62 class ARROW_EXPORT CountOptions : public FunctionOptions {
63  public:
64   enum CountMode {
65     /// Count only non-null values.
66     ONLY_VALID = 0,
67     /// Count only null values.
68     ONLY_NULL,
69     /// Count both non-null and null values.
70     ALL,
71   };
72   explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
73   constexpr static char const kTypeName[] = "CountOptions";
Defaults()74   static CountOptions Defaults() { return CountOptions{}; }
75 
76   CountMode mode;
77 };
78 
79 /// \brief Control Mode kernel behavior
80 ///
81 /// Returns top-n common values and counts.
82 /// By default, returns the most common value and count.
83 class ARROW_EXPORT ModeOptions : public FunctionOptions {
84  public:
85   explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
86   constexpr static char const kTypeName[] = "ModeOptions";
Defaults()87   static ModeOptions Defaults() { return ModeOptions{}; }
88 
89   int64_t n = 1;
90   /// If true (the default), null values are ignored. Otherwise, if any value is null,
91   /// emit null.
92   bool skip_nulls;
93   /// If less than this many non-null values are observed, emit null.
94   uint32_t min_count;
95 };
96 
97 /// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
98 ///
99 /// The divisor used in calculations is N - ddof, where N is the number of elements.
100 /// By default, ddof is zero, and population variance or stddev is returned.
101 class ARROW_EXPORT VarianceOptions : public FunctionOptions {
102  public:
103   explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
104   constexpr static char const kTypeName[] = "VarianceOptions";
Defaults()105   static VarianceOptions Defaults() { return VarianceOptions{}; }
106 
107   int ddof = 0;
108   /// If true (the default), null values are ignored. Otherwise, if any value is null,
109   /// emit null.
110   bool skip_nulls;
111   /// If less than this many non-null values are observed, emit null.
112   uint32_t min_count;
113 };
114 
115 /// \brief Control Quantile kernel behavior
116 ///
117 /// By default, returns the median value.
118 class ARROW_EXPORT QuantileOptions : public FunctionOptions {
119  public:
120   /// Interpolation method to use when quantile lies between two data points
121   enum Interpolation {
122     LINEAR = 0,
123     LOWER,
124     HIGHER,
125     NEAREST,
126     MIDPOINT,
127   };
128 
129   explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
130                            bool skip_nulls = true, uint32_t min_count = 0);
131 
132   explicit QuantileOptions(std::vector<double> q,
133                            enum Interpolation interpolation = LINEAR,
134                            bool skip_nulls = true, uint32_t min_count = 0);
135 
136   constexpr static char const kTypeName[] = "QuantileOptions";
Defaults()137   static QuantileOptions Defaults() { return QuantileOptions{}; }
138 
139   /// quantile must be between 0 and 1 inclusive
140   std::vector<double> q;
141   enum Interpolation interpolation;
142   /// If true (the default), null values are ignored. Otherwise, if any value is null,
143   /// emit null.
144   bool skip_nulls;
145   /// If less than this many non-null values are observed, emit null.
146   uint32_t min_count;
147 };
148 
149 /// \brief Control TDigest approximate quantile kernel behavior
150 ///
151 /// By default, returns the median value.
152 class ARROW_EXPORT TDigestOptions : public FunctionOptions {
153  public:
154   explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
155                           uint32_t buffer_size = 500, bool skip_nulls = true,
156                           uint32_t min_count = 0);
157   explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
158                           uint32_t buffer_size = 500, bool skip_nulls = true,
159                           uint32_t min_count = 0);
160   constexpr static char const kTypeName[] = "TDigestOptions";
Defaults()161   static TDigestOptions Defaults() { return TDigestOptions{}; }
162 
163   /// quantile must be between 0 and 1 inclusive
164   std::vector<double> q;
165   /// compression parameter, default 100
166   uint32_t delta;
167   /// input buffer size, default 500
168   uint32_t buffer_size;
169   /// If true (the default), null values are ignored. Otherwise, if any value is null,
170   /// emit null.
171   bool skip_nulls;
172   /// If less than this many non-null values are observed, emit null.
173   uint32_t min_count;
174 };
175 
176 /// \brief Control Index kernel behavior
177 class ARROW_EXPORT IndexOptions : public FunctionOptions {
178  public:
179   explicit IndexOptions(std::shared_ptr<Scalar> value);
180   // Default constructor for serialization
181   IndexOptions();
182   constexpr static char const kTypeName[] = "IndexOptions";
183 
184   std::shared_ptr<Scalar> value;
185 };
186 
187 /// @}
188 
189 /// \brief Count values in an array.
190 ///
191 /// \param[in] options counting options, see CountOptions for more information
192 /// \param[in] datum to count
193 /// \param[in] ctx the function execution context, optional
194 /// \return out resulting datum
195 ///
196 /// \since 1.0.0
197 /// \note API not yet finalized
198 ARROW_EXPORT
199 Result<Datum> Count(const Datum& datum,
200                     const CountOptions& options = CountOptions::Defaults(),
201                     ExecContext* ctx = NULLPTR);
202 
203 /// \brief Compute the mean of a numeric array.
204 ///
205 /// \param[in] value datum to compute the mean, expecting Array
206 /// \param[in] options see ScalarAggregateOptions for more information
207 /// \param[in] ctx the function execution context, optional
208 /// \return datum of the computed mean as a DoubleScalar
209 ///
210 /// \since 1.0.0
211 /// \note API not yet finalized
212 ARROW_EXPORT
213 Result<Datum> Mean(
214     const Datum& value,
215     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
216     ExecContext* ctx = NULLPTR);
217 
218 /// \brief Compute the product of values of a numeric array.
219 ///
220 /// \param[in] value datum to compute product of, expecting Array or ChunkedArray
221 /// \param[in] options see ScalarAggregateOptions for more information
222 /// \param[in] ctx the function execution context, optional
223 /// \return datum of the computed sum as a Scalar
224 ///
225 /// \since 6.0.0
226 /// \note API not yet finalized
227 ARROW_EXPORT
228 Result<Datum> Product(
229     const Datum& value,
230     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
231     ExecContext* ctx = NULLPTR);
232 
233 /// \brief Sum values of a numeric array.
234 ///
235 /// \param[in] value datum to sum, expecting Array or ChunkedArray
236 /// \param[in] options see ScalarAggregateOptions for more information
237 /// \param[in] ctx the function execution context, optional
238 /// \return datum of the computed sum as a Scalar
239 ///
240 /// \since 1.0.0
241 /// \note API not yet finalized
242 ARROW_EXPORT
243 Result<Datum> Sum(
244     const Datum& value,
245     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
246     ExecContext* ctx = NULLPTR);
247 
248 /// \brief Calculate the min / max of a numeric array
249 ///
250 /// This function returns both the min and max as a struct scalar, with type
251 /// struct<min: T, max: T>, where T is the input type
252 ///
253 /// \param[in] value input datum, expecting Array or ChunkedArray
254 /// \param[in] options see ScalarAggregateOptions for more information
255 /// \param[in] ctx the function execution context, optional
256 /// \return resulting datum as a struct<min: T, max: T> scalar
257 ///
258 /// \since 1.0.0
259 /// \note API not yet finalized
260 ARROW_EXPORT
261 Result<Datum> MinMax(
262     const Datum& value,
263     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
264     ExecContext* ctx = NULLPTR);
265 
266 /// \brief Test whether any element in a boolean array evaluates to true.
267 ///
268 /// This function returns true if any of the elements in the array evaluates
269 /// to true and false otherwise. Null values are ignored by default.
270 /// If null values are taken into account by setting ScalarAggregateOptions
271 /// parameter skip_nulls = false then Kleene logic is used.
272 /// See KleeneOr for more details on Kleene logic.
273 ///
274 /// \param[in] value input datum, expecting a boolean array
275 /// \param[in] options see ScalarAggregateOptions for more information
276 /// \param[in] ctx the function execution context, optional
277 /// \return resulting datum as a BooleanScalar
278 ///
279 /// \since 3.0.0
280 /// \note API not yet finalized
281 ARROW_EXPORT
282 Result<Datum> Any(
283     const Datum& value,
284     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
285     ExecContext* ctx = NULLPTR);
286 
287 /// \brief Test whether all elements in a boolean array evaluate to true.
288 ///
289 /// This function returns true if all of the elements in the array evaluate
290 /// to true and false otherwise. Null values are ignored by default.
291 /// If null values are taken into account by setting ScalarAggregateOptions
292 /// parameter skip_nulls = false then Kleene logic is used.
293 /// See KleeneAnd for more details on Kleene logic.
294 ///
295 /// \param[in] value input datum, expecting a boolean array
296 /// \param[in] options see ScalarAggregateOptions for more information
297 /// \param[in] ctx the function execution context, optional
298 /// \return resulting datum as a BooleanScalar
299 
300 /// \since 3.0.0
301 /// \note API not yet finalized
302 ARROW_EXPORT
303 Result<Datum> All(
304     const Datum& value,
305     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
306     ExecContext* ctx = NULLPTR);
307 
308 /// \brief Calculate the modal (most common) value of a numeric array
309 ///
310 /// This function returns top-n most common values and number of times they occur as
311 /// an array of `struct<mode: T, count: int64>`, where T is the input type.
312 /// Values with larger counts are returned before smaller ones.
313 /// If there are more than one values with same count, smaller value is returned first.
314 ///
315 /// \param[in] value input datum, expecting Array or ChunkedArray
316 /// \param[in] options see ModeOptions for more information
317 /// \param[in] ctx the function execution context, optional
318 /// \return resulting datum as an array of struct<mode: T, count: int64>
319 ///
320 /// \since 2.0.0
321 /// \note API not yet finalized
322 ARROW_EXPORT
323 Result<Datum> Mode(const Datum& value,
324                    const ModeOptions& options = ModeOptions::Defaults(),
325                    ExecContext* ctx = NULLPTR);
326 
327 /// \brief Calculate the standard deviation of a numeric array
328 ///
329 /// \param[in] value input datum, expecting Array or ChunkedArray
330 /// \param[in] options see VarianceOptions for more information
331 /// \param[in] ctx the function execution context, optional
332 /// \return datum of the computed standard deviation as a DoubleScalar
333 ///
334 /// \since 2.0.0
335 /// \note API not yet finalized
336 ARROW_EXPORT
337 Result<Datum> Stddev(const Datum& value,
338                      const VarianceOptions& options = VarianceOptions::Defaults(),
339                      ExecContext* ctx = NULLPTR);
340 
341 /// \brief Calculate the variance of a numeric array
342 ///
343 /// \param[in] value input datum, expecting Array or ChunkedArray
344 /// \param[in] options see VarianceOptions for more information
345 /// \param[in] ctx the function execution context, optional
346 /// \return datum of the computed variance as a DoubleScalar
347 ///
348 /// \since 2.0.0
349 /// \note API not yet finalized
350 ARROW_EXPORT
351 Result<Datum> Variance(const Datum& value,
352                        const VarianceOptions& options = VarianceOptions::Defaults(),
353                        ExecContext* ctx = NULLPTR);
354 
355 /// \brief Calculate the quantiles of a numeric array
356 ///
357 /// \param[in] value input datum, expecting Array or ChunkedArray
358 /// \param[in] options see QuantileOptions for more information
359 /// \param[in] ctx the function execution context, optional
360 /// \return resulting datum as an array
361 ///
362 /// \since 4.0.0
363 /// \note API not yet finalized
364 ARROW_EXPORT
365 Result<Datum> Quantile(const Datum& value,
366                        const QuantileOptions& options = QuantileOptions::Defaults(),
367                        ExecContext* ctx = NULLPTR);
368 
369 /// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
370 ///
371 /// \param[in] value input datum, expecting Array or ChunkedArray
372 /// \param[in] options see TDigestOptions for more information
373 /// \param[in] ctx the function execution context, optional
374 /// \return resulting datum as an array
375 ///
376 /// \since 4.0.0
377 /// \note API not yet finalized
378 ARROW_EXPORT
379 Result<Datum> TDigest(const Datum& value,
380                       const TDigestOptions& options = TDigestOptions::Defaults(),
381                       ExecContext* ctx = NULLPTR);
382 
383 /// \brief Find the first index of a value in an array.
384 ///
385 /// \param[in] value The array to search.
386 /// \param[in] options The array to search for. See IndexOoptions.
387 /// \param[in] ctx the function execution context, optional
388 /// \return out a Scalar containing the index (or -1 if not found).
389 ///
390 /// \since 5.0.0
391 /// \note API not yet finalized
392 ARROW_EXPORT
393 Result<Datum> Index(const Datum& value, const IndexOptions& options,
394                     ExecContext* ctx = NULLPTR);
395 
396 namespace internal {
397 
398 /// Internal use only: streaming group identifier.
399 /// Consumes batches of keys and yields batches of the group ids.
400 class ARROW_EXPORT Grouper {
401  public:
402   virtual ~Grouper() = default;
403 
404   /// Construct a Grouper which receives the specified key types
405   static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
406                                                ExecContext* ctx = default_exec_context());
407 
408   /// Consume a batch of keys, producing the corresponding group ids as an integer array.
409   /// Currently only uint32 indices will be produced, eventually the bit width will only
410   /// be as wide as necessary.
411   virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
412 
413   /// Get current unique keys. May be called multiple times.
414   virtual Result<ExecBatch> GetUniques() = 0;
415 
416   /// Get the current number of groups.
417   virtual uint32_t num_groups() const = 0;
418 
419   /// \brief Assemble lists of indices of identical elements.
420   ///
421   /// \param[in] ids An unsigned, all-valid integral array which will be
422   ///                used as grouping criteria.
423   /// \param[in] num_groups An upper bound for the elements of ids
424   /// \return A num_groups-long ListArray where the slot at i contains a
425   ///         list of indices where i appears in ids.
426   ///
427   ///   MakeGroupings([
428   ///       2,
429   ///       2,
430   ///       5,
431   ///       5,
432   ///       2,
433   ///       3
434   ///   ], 8) == [
435   ///       [],
436   ///       [],
437   ///       [0, 1, 4],
438   ///       [5],
439   ///       [],
440   ///       [2, 3],
441   ///       [],
442   ///       []
443   ///   ]
444   static Result<std::shared_ptr<ListArray>> MakeGroupings(
445       const UInt32Array& ids, uint32_t num_groups,
446       ExecContext* ctx = default_exec_context());
447 
448   /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
449   /// the provided groupings.
450   ///
451   /// For example,
452   ///   ApplyGroupings([
453   ///       [],
454   ///       [],
455   ///       [0, 1, 4],
456   ///       [5],
457   ///       [],
458   ///       [2, 3],
459   ///       [],
460   ///       []
461   ///   ], [2, 2, 5, 5, 2, 3]) == [
462   ///       [],
463   ///       [],
464   ///       [2, 2, 2],
465   ///       [3],
466   ///       [],
467   ///       [5, 5],
468   ///       [],
469   ///       []
470   ///   ]
471   static Result<std::shared_ptr<ListArray>> ApplyGroupings(
472       const ListArray& groupings, const Array& array,
473       ExecContext* ctx = default_exec_context());
474 };
475 
476 /// \brief Configure a grouped aggregation
477 struct ARROW_EXPORT Aggregate {
478   /// the name of the aggregation function
479   std::string function;
480 
481   /// options for the aggregation function
482   const FunctionOptions* options;
483 };
484 
485 /// Internal use only: helper function for testing HashAggregateKernels.
486 /// This will be replaced by streaming execution operators.
487 ARROW_EXPORT
488 Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
489                       const std::vector<Aggregate>& aggregates, bool use_threads = false,
490                       ExecContext* ctx = default_exec_context());
491 
492 }  // namespace internal
493 }  // namespace compute
494 }  // namespace arrow
495