library(conflicted)
library(dplyr)
conflict_prefer("filter", "dplyr")
#> [conflicted] Will prefer dplyr::filter over any other package.
To enable duckplyr for individual data frames instead of session wide,
library()
.duckplyr::as_duckdb_tibble()
as the first step in
your pipe, without attaching the package.eager <-
duckplyr::flights_df() |>
duckplyr::as_duckdb_tibble() |>
filter(!is.na(arr_delay), !is.na(dep_delay)) |>
mutate(inflight_delay = arr_delay - dep_delay) |>
summarize(
.by = c(year, month),
mean_inflight_delay = mean(inflight_delay),
median_inflight_delay = median(inflight_delay),
) |>
filter(month <= 6)
The result is a tibble, with its own class.
class(eager)
#> [1] "duckplyr_df" "tbl_df" "tbl" "data.frame"
names(eager)
#> [1] "year" "month" "mean_inflight_delay"
#> [4] "median_inflight_delay"
DuckDB is responsible for eventually carrying out the operations. Despite the late filter, the summary is not computed for the months in the second half of the year.
eager |>
explain()
#> ┌---------------------------┐
#> │ HASH_GROUP_BY │
#> │ -------------------- │
#> │ Groups: │
#> │ #0 │
#> │ #1 │
#> │ │
#> │ Aggregates: │
#> │ mean(#2) │
#> │ median(#3) │
#> │ │
#> │ ~33677 Rows │
#> └-------------┬-------------┘
#> ┌-------------┴-------------┐
#> │ PROJECTION │
#> │ -------------------- │
#> │ year │
#> │ month │
#> │ inflight_delay │
#> │ inflight_delay │
#> │ │
#> │ ~67355 Rows │
#> └-------------┬-------------┘
#> ┌-------------┴-------------┐
#> │ PROJECTION │
#> │ -------------------- │
#> │ year │
#> │ month │
#> │ inflight_delay │
#> │ │
#> │ ~67355 Rows │
#> └-------------┬-------------┘
#> ┌-------------┴-------------┐
#> │ FILTER │
#> │ -------------------- │
#> │ ((CAST(month AS DOUBLE) <=│
#> │ 6.0) AND (NOT ((arr_delay│
#> │ IS NULL) OR isnan │
#> │ (arr_delay))) AND (NOT ( │
#> │ (dep_delay IS NULL) OR │
#> │ isnan(dep_delay)))) │
#> │ │
#> │ ~67355 Rows │
#> └-------------┬-------------┘
#> ┌-------------┴-------------┐
#> │ R_DATAFRAME_SCAN │
#> │ -------------------- │
#> │ Text: data.frame │
#> │ │
#> │ Projections: │
#> │ year │
#> │ month │
#> │ dep_delay │
#> │ arr_delay │
#> │ │
#> │ ~336776 Rows │
#> └---------------------------┘
All data frame operations are supported. Computation happens upon the first request.
After the computation has been carried out, the results are preserved and available immediately:
The default mode for as_duckdb_tibble()
is eager. This
allows applying all data frame operations on the results, including
column subsetting or retrieving the number of rows. In addition, if an
operation cannot be carried out by duckdb, the dplyr fallback is used
transparently. Use .lazy = TRUE
to ensure that all
operations are carried out by DuckDB, or fail. This is also the default
for the ingestion functions such as
read_parquet_duckdb()
.
Columns or the row count cannot be accessed directly in this mode:
Also, operations that are not (yet) supported will fail:
lazy |>
mutate(inflight_delay = arr_delay - dep_delay) |>
summarize(
.by = c(year, month),
mean_inflight_delay = mean(inflight_delay, na.rm = TRUE),
median_inflight_delay = median(inflight_delay, na.rm = TRUE),
)
#> Error in `summarize()`:
#> ! This operation cannot be carried out by DuckDB, and the input is a
#> lazy duckplyr frame.
#> ℹ Use `compute(lazy = FALSE)` to materialize to temporary storage and continue
#> with duckplyr.
#> ℹ See the "Eager and lazy" section in `?duckdb_tibble()` for other options.
#> Caused by error in `rel_translate_lang()`:
#> ! Can't translate named argument `mean(na.rm = )`.
See vignette("limits")
for current limitations, and the
contributing guide for how to add support for additional operations.
duckplyr also defines a set of generics that provide a low-level implementer’s interface for dplyr’s high-level user interface. Other packages may then implement methods for those generics.
library(conflicted)
library(dplyr)
conflict_prefer("filter", "dplyr")
#> [conflicted] Removing existing preference.
#> [conflicted] Will prefer dplyr::filter over any other package.
library(duckplyr)
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.
# Create a relational to be used by examples below
new_dfrel <- function(x) {
stopifnot(is.data.frame(x))
new_relational(list(x), class = "dfrel")
}
mtcars_rel <- new_dfrel(mtcars[1:5, 1:4])
# Example 1: return a data.frame
rel_to_df.dfrel <- function(rel, ...) {
unclass(rel)[[1]]
}
rel_to_df(mtcars_rel)
#> mpg cyl disp hp
#> Mazda RX4 21.0 6 160 110
#> Mazda RX4 Wag 21.0 6 160 110
#> Datsun 710 22.8 4 108 93
#> Hornet 4 Drive 21.4 6 258 110
#> Hornet Sportabout 18.7 8 360 175
# Example 2: A (random) filter
rel_filter.dfrel <- function(rel, exprs, ...) {
df <- unclass(rel)[[1]]
# A real implementation would evaluate the predicates defined
# by the exprs argument
new_dfrel(df[sample.int(nrow(df), 3, replace = TRUE), ])
}
rel_filter(
mtcars_rel,
list(
relexpr_function(
"gt",
list(relexpr_reference("cyl"), relexpr_constant("6"))
)
)
)
#> [[1]]
#> mpg cyl disp hp
#> Hornet Sportabout 18.7 8 360 175
#> Hornet 4 Drive 21.4 6 258 110
#> Mazda RX4 21.0 6 160 110
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 3: A custom projection
rel_project.dfrel <- function(rel, exprs, ...) {
df <- unclass(rel)[[1]]
# A real implementation would evaluate the expressions defined
# by the exprs argument
new_dfrel(df[seq_len(min(3, base::ncol(df)))])
}
rel_project(
mtcars_rel,
list(relexpr_reference("cyl"), relexpr_reference("disp"))
)
#> [[1]]
#> mpg cyl disp
#> Mazda RX4 21.0 6 160
#> Mazda RX4 Wag 21.0 6 160
#> Datsun 710 22.8 4 108
#> Hornet 4 Drive 21.4 6 258
#> Hornet Sportabout 18.7 8 360
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 4: A custom ordering (eg, ascending by mpg)
rel_order.dfrel <- function(rel, exprs, ...) {
df <- unclass(rel)[[1]]
# A real implementation would evaluate the expressions defined
# by the exprs argument
new_dfrel(df[order(df[[1]]), ])
}
rel_order(
mtcars_rel,
list(relexpr_reference("mpg"))
)
#> [[1]]
#> mpg cyl disp hp
#> Hornet Sportabout 18.7 8 360 175
#> Mazda RX4 21.0 6 160 110
#> Mazda RX4 Wag 21.0 6 160 110
#> Hornet 4 Drive 21.4 6 258 110
#> Datsun 710 22.8 4 108 93
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 5: A custom join
rel_join.dfrel <- function(left, right, conds, join, ...) {
left_df <- unclass(left)[[1]]
right_df <- unclass(right)[[1]]
# A real implementation would evaluate the expressions
# defined by the conds argument,
# use different join types based on the join argument,
# and implement the join itself instead of relaying to left_join().
new_dfrel(dplyr::left_join(left_df, right_df))
}
rel_join(new_dfrel(data.frame(mpg = 21)), mtcars_rel)
#> Joining with `by = join_by(mpg)`
#> Joining with `by = join_by(mpg)`
#> [[1]]
#> mpg cyl disp hp
#> 1 21 6 160 110
#> 2 21 6 160 110
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 6: Limit the maximum rows returned
rel_limit.dfrel <- function(rel, n, ...) {
df <- unclass(rel)[[1]]
new_dfrel(df[seq_len(n), ])
}
rel_limit(mtcars_rel, 3)
#> [[1]]
#> mpg cyl disp hp
#> Mazda RX4 21.0 6 160 110
#> Mazda RX4 Wag 21.0 6 160 110
#> Datsun 710 22.8 4 108 93
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 7: Suppress duplicate rows
# (ignoring row names)
rel_distinct.dfrel <- function(rel, ...) {
df <- unclass(rel)[[1]]
new_dfrel(df[!duplicated(df), ])
}
rel_distinct(new_dfrel(mtcars[1:3, 1:4]))
#> [[1]]
#> mpg cyl disp hp
#> Mazda RX4 21.0 6 160 110
#> Datsun 710 22.8 4 108 93
#>
#> attr(,"class")
#> [1] "dfrel" "relational"
# Example 8: Return column names
rel_names.dfrel <- function(rel, ...) {
df <- unclass(rel)[[1]]
names(df)
}
rel_names(mtcars_rel)
#> [1] "mpg" "cyl" "disp" "hp"