1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 #![warn(missing_docs)]
// Clippy lints; some of these should be fixed and the allowances removed incrementally
19 #![allow(
20     clippy::float_cmp,
21     clippy::module_inception,
22     clippy::new_without_default,
23     clippy::type_complexity
24 )]
25 
26 //! DataFusion is an extensible query execution framework that uses
27 //! [Apache Arrow](https://arrow.apache.org) as its in-memory format.
28 //!
//! DataFusion supports both an SQL and a DataFrame API for building logical query plans,
//! as well as a query optimizer and execution engine capable of parallel execution
//! against partitioned data sources (CSV and Parquet) using threads.
32 //!
//! Below is an example of how to execute a query against a CSV file using the [`DataFrame`](dataframe::DataFrame) API:
34 //!
35 //! ```rust
36 //! # use datafusion::prelude::*;
37 //! # use datafusion::error::Result;
38 //! # use arrow::record_batch::RecordBatch;
39 //!
40 //! # #[tokio::main]
41 //! # async fn main() -> Result<()> {
42 //! let mut ctx = ExecutionContext::new();
43 //!
44 //! // create the dataframe
45 //! let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
46 //!
47 //! // create a plan
48 //! let df = df.filter(col("a").lt_eq(col("b")))?
49 //!            .aggregate(&[col("a")], &[min(col("b"))])?
50 //!            .limit(100)?;
51 //!
52 //! // execute the plan
53 //! let results: Vec<RecordBatch> = df.collect().await?;
54 //! # Ok(())
55 //! # }
56 //! ```
57 //!
//! and how to execute a query against a CSV file using SQL:
59 //!
60 //! ```
61 //! # use datafusion::prelude::*;
62 //! # use datafusion::error::Result;
63 //! # use arrow::record_batch::RecordBatch;
64 //!
65 //! # #[tokio::main]
66 //! # async fn main() -> Result<()> {
67 //! let mut ctx = ExecutionContext::new();
68 //!
69 //! ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;
70 //!
71 //! // create a plan
72 //! let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;
73 //!
74 //! // execute the plan
75 //! let results: Vec<RecordBatch> = df.collect().await?;
76 //! # Ok(())
77 //! # }
78 //! ```
79 //!
80 //! ## Parse, Plan, Optimize, Execute
81 //!
//! DataFusion is a fully-fledged query engine capable of performing complex operations.
//! Specifically, when DataFusion receives an SQL query, it passes through a number of
//! steps until a result is obtained. Broadly, they are:
85 //!
//! 1. The string is parsed into an abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.6.1/sqlparser/).
//! 2. The planner [`SqlToRel`](sql::planner::SqlToRel) converts expressions in the AST to logical expressions ([`Expr`s](logical_plan::Expr)).
//! 3. The planner [`SqlToRel`](sql::planner::SqlToRel) converts nodes in the AST to a [`LogicalPlan`](logical_plan::LogicalPlan).
//! 4. [`OptimizerRules`](optimizer::optimizer::OptimizerRule) are applied to the [`LogicalPlan`](logical_plan::LogicalPlan) to optimize it.
//! 5. The [`LogicalPlan`](logical_plan::LogicalPlan) is converted to an [`ExecutionPlan`](physical_plan::ExecutionPlan) by a [`PhysicalPlanner`](physical_plan::PhysicalPlanner).
//! 6. The [`ExecutionPlan`](physical_plan::ExecutionPlan) is executed against data through the [`ExecutionContext`](execution::context::ExecutionContext).
92 //!
//! With the [`DataFrame`](dataframe::DataFrame) API, steps 1-3 are skipped, as the DataFrame builds the [`LogicalPlan`](logical_plan::LogicalPlan) directly.
94 //!
//! Phases 1-5 are typically cheap compared to phase 6, and thus DataFusion puts a
//! lot of effort into ensuring that phase 6 runs efficiently and without errors.
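//!
//! The sketch below drives these stages explicitly. It assumes the
//! [`ExecutionContext`](execution::context::ExecutionContext) exposes
//! `create_logical_plan`, `optimize`, `create_physical_plan` and `collect`
//! helpers with roughly the shapes shown; the snippet is marked `ignore`
//! because the exact signatures may differ between versions:
//!
//! ```rust,ignore
//! use datafusion::prelude::*;
//! use datafusion::error::Result;
//!
//! #[tokio::main]
//! async fn main() -> Result<()> {
//!     let mut ctx = ExecutionContext::new();
//!     ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;
//!
//!     // steps 1-3: parse the SQL string and plan it into a `LogicalPlan`
//!     let plan = ctx.create_logical_plan("SELECT a, MIN(b) FROM example GROUP BY a")?;
//!     // step 4: apply the optimizer rules
//!     let plan = ctx.optimize(&plan)?;
//!     // step 5: convert the `LogicalPlan` into an `ExecutionPlan`
//!     let plan = ctx.create_physical_plan(&plan)?;
//!     // step 6: execute the `ExecutionPlan` and collect the results
//!     let results = ctx.collect(plan).await?;
//!     Ok(())
//! }
//! ```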
97 //!
//! DataFusion's planning is divided into two main parts: logical planning and physical planning.
99 //!
100 //! ### Logical plan
101 //!
//! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr).
//! These are [`Schema`](arrow::datatypes::Schema)-aware representations of statements whose result is independent of how it is physically executed.
104 //!
//! A [`LogicalPlan`](logical_plan::LogicalPlan) is a directed acyclic graph (DAG) of other [`LogicalPlan`s](logical_plan::LogicalPlan), and each node contains logical expressions ([`Expr`s](logical_plan::Expr)).
106 //! All of these are located in [`logical_plan`](logical_plan).
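//!
//! As a small illustration, the filter used in the DataFrame example above is
//! a logical expression built with the fluent [`Expr`](logical_plan::Expr) API.
//! The `to_logical_plan` accessor used to inspect the resulting plan is an
//! assumption (hence `ignore`):
//!
//! ```rust,ignore
//! use datafusion::prelude::*;
//! use datafusion::error::Result;
//!
//! fn build_plan() -> Result<()> {
//!     let mut ctx = ExecutionContext::new();
//!     let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
//!
//!     // a logical expression: "column a is less than or equal to column b"
//!     let predicate = col("a").lt_eq(col("b"));
//!
//!     // embedding it in a filter node yields a new `LogicalPlan`
//!     let df = df.filter(predicate)?;
//!     println!("{:?}", df.to_logical_plan());
//!     Ok(())
//! }
//! ```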
107 //!
108 //! ### Physical plan
109 //!
//! A physical plan ([`ExecutionPlan`](physical_plan::ExecutionPlan)) is a plan that can be executed against data.
//! In contrast to a logical plan, a physical plan contains concrete information about how the
//! calculation should be performed (e.g. which Rust functions are used) and how data should be loaded into memory.
113 //!
114 //! [`ExecutionPlan`](physical_plan::ExecutionPlan) uses the Arrow format as its in-memory representation of data, through the [arrow] crate.
115 //! We recommend going through [its documentation](arrow) for details on how the data is physically represented.
116 //!
//! An [`ExecutionPlan`](physical_plan::ExecutionPlan) is composed of nodes (each implementing the trait [`ExecutionPlan`](physical_plan::ExecutionPlan)),
//! and each node is composed of physical expressions ([`PhysicalExpr`](physical_plan::PhysicalExpr))
//! or aggregate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
120 //! All of these are located in the module [`physical_plan`](physical_plan).
121 //!
122 //! Broadly speaking,
123 //!
//! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asynchronously returns
125 //!   an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch)
126 //!   (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader))
127 //! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch)
128 //!   and returns an [`Array`](arrow::array::Array)
129 //! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch)
//!   and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) with a single row (*)
131 //!
132 //! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition.
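//!
//! A hedged sketch of the first contract, assuming `output_partitioning` and
//! `execute` methods roughly as shown (the exact signatures vary between
//! versions, hence `ignore`):
//!
//! ```rust,ignore
//! // inside an async context, given some `plan: Arc<dyn ExecutionPlan>`;
//! // partitions are independent, which is what enables parallel execution
//! for partition in 0..plan.output_partitioning().partition_count() {
//!     // `execute` asynchronously returns a node-specific reader that
//!     // yields this partition's `RecordBatch`es
//!     let reader = plan.execute(partition).await?;
//!     // ... iterate `reader` to consume the results
//! }
//! ```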
133 //!
134 //! The following physical nodes are currently implemented:
135 //!
136 //! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec)
137 //! * Filter: [`FilterExec`](physical_plan::filter::FilterExec)
138 //! * Hash and Grouped aggregations: [`HashAggregateExec`](physical_plan::hash_aggregate::HashAggregateExec)
139 //! * Sort: [`SortExec`](physical_plan::sort::SortExec)
140 //! * Merge (partitions): [`MergeExec`](physical_plan::merge::MergeExec)
141 //! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec)
//! * Scan CSV files: [`CsvExec`](physical_plan::csv::CsvExec)
//! * Scan Parquet files: [`ParquetExec`](physical_plan::parquet::ParquetExec)
144 //! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec)
145 //! * Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec)
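//!
//! One way to see which of these nodes a query maps to is SQL's `EXPLAIN`,
//! itself executed by [`ExplainExec`](physical_plan::explain::ExplainExec).
//! A minimal sketch, assuming the `example` table registered earlier and that
//! this version supports the `EXPLAIN` statement:
//!
//! ```rust,ignore
//! // returns the logical and physical plans instead of running the query
//! let df = ctx.sql("EXPLAIN SELECT a, MIN(b) FROM example GROUP BY a")?;
//! let results = df.collect().await?;
//! ```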
146 //!
147 //! ## Customize
148 //!
//! DataFusion allows users to:
150 //! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`](execution::context::QueryPlanner))
151 //! * declare and use user-defined scalar functions ([`ScalarUDF`](physical_plan::udf::ScalarUDF))
152 //! * declare and use user-defined aggregate functions ([`AggregateUDF`](physical_plan::udaf::AggregateUDF))
153 //!
//! You can find examples of each of these in the examples section.
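//!
//! For instance, a user-defined scalar function is registered on the context
//! and then becomes usable from both the SQL and DataFrame APIs. Below is a
//! hedged sketch: the `create_udf` helper, its argument order, and the
//! `my_sqrt_implementation` function are assumptions, so check
//! [`ScalarUDF`](physical_plan::udf::ScalarUDF) for the authoritative
//! definition:
//!
//! ```rust,ignore
//! use std::sync::Arc;
//! use arrow::datatypes::DataType;
//! use datafusion::prelude::*;
//!
//! let mut ctx = ExecutionContext::new();
//!
//! // `create_udf` (assumed) wraps a Rust function operating on Arrow arrays
//! // into a `ScalarUDF`
//! let my_udf = create_udf(
//!     "my_sqrt",                   // name used in SQL and expressions
//!     vec![DataType::Float64],     // argument types
//!     Arc::new(DataType::Float64), // return type
//!     my_sqrt_implementation,      // hypothetical function over Arrow arrays
//! );
//!
//! // once registered, the function can be called from SQL
//! ctx.register_udf(my_udf);
//! let df = ctx.sql("SELECT my_sqrt(a) FROM example")?;
//! ```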
155 
156 extern crate arrow;
157 extern crate sqlparser;
158 
159 pub mod dataframe;
160 pub mod datasource;
161 pub mod error;
162 pub mod execution;
163 pub mod logical_plan;
164 pub mod optimizer;
165 pub mod physical_plan;
166 pub mod prelude;
167 pub mod scalar;
168 pub mod sql;
169 pub mod variable;
170 
171 #[cfg(test)]
172 pub mod test;
173