1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 #![warn(missing_docs)] 18 // Clippy lints, some should be disabled incrementally 19 #![allow( 20 clippy::float_cmp, 21 clippy::module_inception, 22 clippy::new_without_default, 23 clippy::type_complexity 24 )] 25 26 //! DataFusion is an extensible query execution framework that uses 27 //! [Apache Arrow](https://arrow.apache.org) as its in-memory format. 28 //! 29 //! DataFusion supports both an SQL and a DataFrame API for building logical query plans 30 //! as well as a query optimizer and execution engine capable of parallel execution 31 //! against partitioned data sources (CSV and Parquet) using threads. 32 //! 33 //! Below is an example of how to execute a query against a CSV using [`DataFrames`](dataframe::DataFrame): 34 //! 35 //! ```rust 36 //! # use datafusion::prelude::*; 37 //! # use datafusion::error::Result; 38 //! # use arrow::record_batch::RecordBatch; 39 //! 40 //! # #[tokio::main] 41 //! # async fn main() -> Result<()> { 42 //! let mut ctx = ExecutionContext::new(); 43 //! 44 //! // create the dataframe 45 //! let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; 46 //! 47 //! // create a plan 48 //! 
let df = df.filter(col("a").lt_eq(col("b")))? 49 //! .aggregate(&[col("a")], &[min(col("b"))])? 50 //! .limit(100)?; 51 //! 52 //! // execute the plan 53 //! let results: Vec<RecordBatch> = df.collect().await?; 54 //! # Ok(()) 55 //! # } 56 //! ``` 57 //! 58 //! and how to execute a query against a CSV using SQL: 59 //! 60 //! ``` 61 //! # use datafusion::prelude::*; 62 //! # use datafusion::error::Result; 63 //! # use arrow::record_batch::RecordBatch; 64 //! 65 //! # #[tokio::main] 66 //! # async fn main() -> Result<()> { 67 //! let mut ctx = ExecutionContext::new(); 68 //! 69 //! ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; 70 //! 71 //! // create a plan 72 //! let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; 73 //! 74 //! // execute the plan 75 //! let results: Vec<RecordBatch> = df.collect().await?; 76 //! # Ok(()) 77 //! # } 78 //! ``` 79 //! 80 //! ## Parse, Plan, Optimize, Execute 81 //! 82 //! DataFusion is a fully fledged query engine capable of performing complex operations. 83 //! Specifically, when DataFusion receives an SQL query, there are different steps 84 //! that it passes through until a result is obtained. Broadly, they are: 85 //! 86 //! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.6.1/sqlparser/). 87 //! 2. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical expressions on the AST to logical expressions [`Expr`s](logical_plan::Expr). 88 //! 3. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical nodes on the AST to a [`LogicalPlan`](logical_plan::LogicalPlan). 89 //! 4. [`OptimizerRules`](optimizer::optimizer::OptimizerRule) are applied to the [`LogicalPlan`](logical_plan::LogicalPlan) to optimize it. 90 //! 5. The [`LogicalPlan`](logical_plan::LogicalPlan) is converted to an [`ExecutionPlan`](physical_plan::ExecutionPlan) by a [`PhysicalPlanner`](physical_plan::PhysicalPlanner) 91 //! 6. 
The [`ExecutionPlan`](physical_plan::ExecutionPlan) is executed against data through the [`ExecutionContext`](execution::context::ExecutionContext)
//!
//! With a [`DataFrame`](dataframe::DataFrame) API, steps 1-3 are not used as the DataFrame builds the [`LogicalPlan`](logical_plan::LogicalPlan) directly.
//!
//! Phases 1-5 are typically cheap when compared to phase 6, and thus DataFusion puts a
//! lot of effort into ensuring that phase 6 runs efficiently and without errors.
//!
//! DataFusion's planning is divided into two main parts: logical planning and physical planning.
//!
//! ### Logical plan
//!
//! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr).
//! These are [`Schema`](arrow::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed.
//!
//! A [`LogicalPlan`](logical_plan::LogicalPlan) is a directed acyclic graph (DAG) of other [`LogicalPlan`s](logical_plan::LogicalPlan) and each node contains logical expressions ([`Expr`s](logical_plan::Expr)).
//! All of these are located in [`logical_plan`](logical_plan).
//!
//! ### Physical plan
//!
//! A Physical plan ([`ExecutionPlan`](physical_plan::ExecutionPlan)) is a plan that can be executed against data.
//! Unlike a logical plan, the physical plan has concrete information about how the calculation
//! should be performed (e.g. what Rust functions are used) and how data should be loaded into memory.
//!
//! [`ExecutionPlan`](physical_plan::ExecutionPlan) uses the Arrow format as its in-memory representation of data, through the [arrow] crate.
//! We recommend going through [its documentation](arrow) for details on how the data is physically represented.
//!
//! An [`ExecutionPlan`](physical_plan::ExecutionPlan) is composed of nodes (which implement the trait [`ExecutionPlan`](physical_plan::ExecutionPlan)),
//! and each node is composed of physical expressions ([`PhysicalExpr`](physical_plan::PhysicalExpr))
//! or aggregate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
//! All of these are located in the module [`physical_plan`](physical_plan).
//!
//! Broadly speaking,
//!
//! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asynchronously returns
//! an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch)
//! (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader))
//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch)
//! and returns an [`Array`](arrow::array::Array)
//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch)
//! and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) of a single row(*)
//!
//! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition.
//!
//! The following physical nodes are currently implemented:
//!
//! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec)
//! * Filter: [`FilterExec`](physical_plan::filter::FilterExec)
//! * Hash and Grouped aggregations: [`HashAggregateExec`](physical_plan::hash_aggregate::HashAggregateExec)
//! * Sort: [`SortExec`](physical_plan::sort::SortExec)
//! * Merge (partitions): [`MergeExec`](physical_plan::merge::MergeExec)
//! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec)
//! * Scan a CSV: [`CsvExec`](physical_plan::csv::CsvExec)
//!
* Scan a Parquet: [`ParquetExec`](physical_plan::parquet::ParquetExec)
//! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec)
//! * Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec)
//!
//! ## Customize
//!
//! DataFusion allows users to
//! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`](execution::context::QueryPlanner))
//! * declare and use user-defined scalar functions ([`ScalarUDF`](physical_plan::udf::ScalarUDF))
//! * declare and use user-defined aggregate functions ([`AggregateUDF`](physical_plan::udaf::AggregateUDF))
//!
//! You can find examples of each of them in the examples section.

extern crate arrow;
extern crate sqlparser;

pub mod dataframe;
pub mod datasource;
pub mod error;
pub mod execution;
pub mod logical_plan;
pub mod optimizer;
pub mod physical_plan;
pub mod prelude;
pub mod scalar;
pub mod sql;
pub mod variable;

// Shared test utilities for the crate's unit tests; only compiled for test builds.
#[cfg(test)]
pub mod test;