Title: | Lightning-Fast 'DataFrame' Library |
---|---|
Description: | Lightning-fast 'DataFrame' library written in 'Rust'. Convert R data to 'Polars' data and vice versa. Perform fast, lazy, larger-than-memory and optimized data queries. 'Polars' is interoperable with the package 'arrow', as both are based on the 'Apache Arrow' Columnar Format. |
Authors: | Ritchie Vink [aut], Soren Welling [aut, cre], Tatsuya Shima [aut], Etienne Bacher [aut] |
Maintainer: | Soren Welling <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.22.0.9000 |
Built: | 2025-01-19 02:53:56 UTC |
Source: | https://github.com/pola-rs/r-polars |
Mimics the behavior of `x[i, j, drop = TRUE]`
(see Extract) for a data.frame or an R vector.
## S3 method for class 'RPolarsDataFrame' x[i, j, drop = TRUE] ## S3 method for class 'RPolarsLazyFrame' x[i, j, drop = TRUE] ## S3 method for class 'RPolarsSeries' x[i]
## S3 method for class 'RPolarsDataFrame' x[i, j, drop = TRUE] ## S3 method for class 'RPolarsLazyFrame' x[i, j, drop = TRUE] ## S3 method for class 'RPolarsSeries' x[i]
x |
|
i |
Rows to select. Integer vector, logical vector, or an Expression. |
j |
Columns to select. Integer vector, logical vector, character vector, or an Expression. For LazyFrames, only an Expression can be used. |
drop |
Convert to a Polars Series if only one column is selected.
For LazyFrames, if the result has one column and `drop = TRUE`, an error will occur. |
<Series>[i]
is equivalent to pl$select(<Series>)[i, , drop = TRUE]
.
<DataFrame>$select()
,
<LazyFrame>$select()
,
<DataFrame>$filter()
,
<LazyFrame>$filter()
df = as_polars_df(data.frame(a = 1:3, b = letters[1:3])) lf = df$lazy() # Select a row df[1, ] # If only `i` is specified, it is treated as `j` # Select a column df[1] # Select a column by name (and convert to a Series) df[, "b"] # Can use Expression for filtering and column selection lf[pl$col("a") >= 2, pl$col("b")$alias("new"), drop = FALSE] |> as.data.frame()
df = as_polars_df(data.frame(a = 1:3, b = letters[1:3])) lf = df$lazy() # Select a row df[1, ] # If only `i` is specified, it is treated as `j` # Select a column df[1] # Select a column by name (and convert to a Series) df[, "b"] # Can use Expression for filtering and column selection lf[pl$col("a") >= 2, pl$col("b")$alias("new"), drop = FALSE] |> as.data.frame()
Create an arrow Table from a Polars object
## S3 method for class 'RPolarsDataFrame' as_arrow_table(x, ..., compat_level = FALSE)
## S3 method for class 'RPolarsDataFrame' as_arrow_table(x, ..., compat_level = FALSE)
x |
|
... |
Ignored |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
library(arrow) pl_df = as_polars_df(mtcars) as_arrow_table(pl_df)
library(arrow) pl_df = as_polars_df(mtcars) as_arrow_table(pl_df)
Create a nanoarrow_array_stream from a Polars object
## S3 method for class 'RPolarsDataFrame' as_nanoarrow_array_stream(x, ..., schema = NULL, compat_level = FALSE) ## S3 method for class 'RPolarsSeries' as_nanoarrow_array_stream(x, ..., schema = NULL, compat_level = FALSE)
## S3 method for class 'RPolarsDataFrame' as_nanoarrow_array_stream(x, ..., schema = NULL, compat_level = FALSE) ## S3 method for class 'RPolarsSeries' as_nanoarrow_array_stream(x, ..., schema = NULL, compat_level = FALSE)
x |
A polars object |
... |
Ignored |
schema |
must stay at default value NULL |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
library(nanoarrow) pl_df = as_polars_df(mtcars)$head(5) pl_s = as_polars_series(letters[1:5]) as.data.frame(as_nanoarrow_array_stream(pl_df)) as.vector(as_nanoarrow_array_stream(pl_s))
library(nanoarrow) pl_df = as_polars_df(mtcars)$head(5) pl_s = as_polars_series(letters[1:5]) as.data.frame(as_nanoarrow_array_stream(pl_df)) as.vector(as_nanoarrow_array_stream(pl_s))
as_polars_df()
is a generic function that converts an R object to a
polars DataFrame.
as_polars_df(x, ...) ## Default S3 method: as_polars_df(x, ...) ## S3 method for class 'data.frame' as_polars_df( x, ..., rownames = NULL, make_names_unique = TRUE, schema = NULL, schema_overrides = NULL ) ## S3 method for class 'RPolarsDataFrame' as_polars_df(x, ...) ## S3 method for class 'RPolarsGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsRollingGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsDynamicGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsSeries' as_polars_df(x, ...) ## S3 method for class 'RPolarsLazyFrame' as_polars_df( x, n_rows = Inf, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE ) ## S3 method for class 'RPolarsLazyGroupBy' as_polars_df(x, ...) ## S3 method for class 'ArrowTabular' as_polars_df( x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL, experimental = FALSE ) ## S3 method for class 'RecordBatchReader' as_polars_df(x, ..., experimental = FALSE) ## S3 method for class 'nanoarrow_array' as_polars_df(x, ...) ## S3 method for class 'nanoarrow_array_stream' as_polars_df(x, ..., experimental = FALSE)
as_polars_df(x, ...) ## Default S3 method: as_polars_df(x, ...) ## S3 method for class 'data.frame' as_polars_df( x, ..., rownames = NULL, make_names_unique = TRUE, schema = NULL, schema_overrides = NULL ) ## S3 method for class 'RPolarsDataFrame' as_polars_df(x, ...) ## S3 method for class 'RPolarsGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsRollingGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsDynamicGroupBy' as_polars_df(x, ...) ## S3 method for class 'RPolarsSeries' as_polars_df(x, ...) ## S3 method for class 'RPolarsLazyFrame' as_polars_df( x, n_rows = Inf, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE ) ## S3 method for class 'RPolarsLazyGroupBy' as_polars_df(x, ...) ## S3 method for class 'ArrowTabular' as_polars_df( x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL, experimental = FALSE ) ## S3 method for class 'RecordBatchReader' as_polars_df(x, ..., experimental = FALSE) ## S3 method for class 'nanoarrow_array' as_polars_df(x, ...) ## S3 method for class 'nanoarrow_array_stream' as_polars_df(x, ..., experimental = FALSE)
x |
Object to convert to a polars DataFrame. |
... |
Additional arguments passed to methods. |
rownames |
How to treat existing row names of a data frame:
|
make_names_unique |
A logical flag to replace duplicated column names
with unique names. If |
schema |
named list of DataTypes, or character vector of column names.
Should match the number of columns in |
schema_overrides |
named list of DataTypes. Cast some columns to the DataType. |
n_rows |
Number of rows to fetch. Defaults to `Inf`, meaning all rows. |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
no_optimization |
Logical. Sets the following parameters to |
collect_in_background |
Logical. Detach this query from R session. Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session. |
rechunk |
A logical flag (default |
experimental |
If |
For LazyFrame objects, this function is a shortcut for $collect() or $fetch(), depending on whether the number of rows to fetch is infinite or not.
# Convert the row names of a data frame to a column as_polars_df(mtcars, rownames = "car") # Convert a data frame, with renaming all columns as_polars_df( data.frame(x = 1, y = 2), schema = c("a", "b") ) # Convert a data frame, with renaming and casting all columns as_polars_df( data.frame(x = 1, y = 2), schema = list(b = pl$Int64, a = pl$String) ) # Convert a data frame, with casting some columns as_polars_df( data.frame(x = 1, y = 2), schema_overrides = list(y = pl$String) # cast some columns ) # Convert an arrow Table to a polars DataFrame at = arrow::arrow_table(x = 1:5, y = 6:10) as_polars_df(at) # Create a polars DataFrame from a data.frame lf = as_polars_df(mtcars)$lazy() # Collect all rows from the LazyFrame as_polars_df(lf) # Fetch 5 rows from the LazyFrame as_polars_df(lf, 5)
# Convert the row names of a data frame to a column as_polars_df(mtcars, rownames = "car") # Convert a data frame, with renaming all columns as_polars_df( data.frame(x = 1, y = 2), schema = c("a", "b") ) # Convert a data frame, with renaming and casting all columns as_polars_df( data.frame(x = 1, y = 2), schema = list(b = pl$Int64, a = pl$String) ) # Convert a data frame, with casting some columns as_polars_df( data.frame(x = 1, y = 2), schema_overrides = list(y = pl$String) # cast some columns ) # Convert an arrow Table to a polars DataFrame at = arrow::arrow_table(x = 1:5, y = 6:10) as_polars_df(at) # Create a polars DataFrame from a data.frame lf = as_polars_df(mtcars)$lazy() # Collect all rows from the LazyFrame as_polars_df(lf) # Fetch 5 rows from the LazyFrame as_polars_df(lf, 5)
as_polars_lf()
is a generic function that converts an R object to a
polars LazyFrame. It is basically a shortcut for as_polars_df(x, ...) with the
$lazy() method.
as_polars_lf(x, ...) ## Default S3 method: as_polars_lf(x, ...) ## S3 method for class 'RPolarsLazyFrame' as_polars_lf(x, ...) ## S3 method for class 'RPolarsLazyGroupBy' as_polars_lf(x, ...)
as_polars_lf(x, ...) ## Default S3 method: as_polars_lf(x, ...) ## S3 method for class 'RPolarsLazyFrame' as_polars_lf(x, ...) ## S3 method for class 'RPolarsLazyGroupBy' as_polars_lf(x, ...)
x |
Object to convert to a polars LazyFrame. |
... |
Additional arguments passed to methods. |
as_polars_lf(mtcars)
as_polars_lf(mtcars)
as_polars_series()
is a generic function that converts an R object to
a polars Series.
as_polars_series(x, name = NULL, ...) ## Default S3 method: as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsSeries' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsExpr' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsThen' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsChainedThen' as_polars_series(x, name = NULL, ...) ## S3 method for class 'POSIXlt' as_polars_series(x, name = NULL, ...) ## S3 method for class 'data.frame' as_polars_series(x, name = NULL, ...) ## S3 method for class 'vctrs_rcrd' as_polars_series(x, name = NULL, ...) ## S3 method for class 'Array' as_polars_series(x, name = NULL, ..., rechunk = TRUE) ## S3 method for class 'ChunkedArray' as_polars_series(x, name = NULL, ..., rechunk = TRUE) ## S3 method for class 'RecordBatchReader' as_polars_series(x, name = NULL, ...) ## S3 method for class 'nanoarrow_array' as_polars_series(x, name = NULL, ...) ## S3 method for class 'nanoarrow_array_stream' as_polars_series(x, name = NULL, ..., experimental = FALSE) ## S3 method for class 'clock_time_point' as_polars_series(x, name = NULL, ...) ## S3 method for class 'clock_sys_time' as_polars_series(x, name = NULL, ...) ## S3 method for class 'clock_zoned_time' as_polars_series(x, name = NULL, ...) ## S3 method for class 'list' as_polars_series(x, name = NULL, ...)
as_polars_series(x, name = NULL, ...) ## Default S3 method: as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsSeries' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsExpr' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsThen' as_polars_series(x, name = NULL, ...) ## S3 method for class 'RPolarsChainedThen' as_polars_series(x, name = NULL, ...) ## S3 method for class 'POSIXlt' as_polars_series(x, name = NULL, ...) ## S3 method for class 'data.frame' as_polars_series(x, name = NULL, ...) ## S3 method for class 'vctrs_rcrd' as_polars_series(x, name = NULL, ...) ## S3 method for class 'Array' as_polars_series(x, name = NULL, ..., rechunk = TRUE) ## S3 method for class 'ChunkedArray' as_polars_series(x, name = NULL, ..., rechunk = TRUE) ## S3 method for class 'RecordBatchReader' as_polars_series(x, name = NULL, ...) ## S3 method for class 'nanoarrow_array' as_polars_series(x, name = NULL, ...) ## S3 method for class 'nanoarrow_array_stream' as_polars_series(x, name = NULL, ..., experimental = FALSE) ## S3 method for class 'clock_time_point' as_polars_series(x, name = NULL, ...) ## S3 method for class 'clock_sys_time' as_polars_series(x, name = NULL, ...) ## S3 method for class 'clock_zoned_time' as_polars_series(x, name = NULL, ...) ## S3 method for class 'list' as_polars_series(x, name = NULL, ...)
x |
Object to convert into a polars Series. |
name |
A character to use as the name of the Series.
If |
... |
Additional arguments passed to methods. |
rechunk |
A logical flag (default |
experimental |
If |
a Series
as_polars_series(1:4) as_polars_series(list(1:4)) as_polars_series(data.frame(a = 1:4)) as_polars_series(as_polars_series(1:4, name = "foo")) as_polars_series(pl$lit(1:4)) # Nested type support as_polars_series(list(data.frame(a = I(list(1:4)))))
as_polars_series(1:4) as_polars_series(list(1:4)) as_polars_series(data.frame(a = 1:4)) as_polars_series(as_polars_series(1:4, name = "foo")) as_polars_series(pl$lit(1:4)) # Nested type support as_polars_series(list(data.frame(a = I(list(1:4)))))
Create an arrow RecordBatchReader from a Polars object
## S3 method for class 'RPolarsDataFrame' as_record_batch_reader(x, ..., compat_level = FALSE)
## S3 method for class 'RPolarsDataFrame' as_record_batch_reader(x, ..., compat_level = FALSE)
x |
|
... |
Ignored |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
library(arrow) pl_df = as_polars_df(mtcars) as_record_batch_reader(pl_df)
library(arrow) pl_df = as_polars_df(mtcars) as_record_batch_reader(pl_df)
Convert to a character vector
## S3 method for class 'RPolarsSeries' as.character(x, ..., str_length = NULL)
## S3 method for class 'RPolarsSeries' as.character(x, ..., str_length = NULL)
x |
A Polars Series |
... |
Not used. |
str_length |
An integer. If specified, utf8 or categorical type Series will be formatted to a string of this length. |
s = as_polars_series(c("foo", "barbaz")) as.character(s) as.character(s, str_length = 3)
s = as_polars_series(c("foo", "barbaz")) as.character(s) as.character(s, str_length = 3)
Equivalent to as_polars_df(x, ...)$to_data_frame(...)
.
## S3 method for class 'RPolarsDataFrame' as.data.frame(x, ..., int64_conversion = polars_options()$int64_conversion) ## S3 method for class 'RPolarsLazyFrame' as.data.frame( x, ..., n_rows = Inf, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE )
## S3 method for class 'RPolarsDataFrame' as.data.frame(x, ..., int64_conversion = polars_options()$int64_conversion) ## S3 method for class 'RPolarsLazyFrame' as.data.frame( x, ..., n_rows = Inf, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE )
x |
An object to convert to a data.frame. |
... |
Additional arguments passed to methods. |
int64_conversion |
How should Int64 values be handled when converting a polars object to R?
|
n_rows |
Number of rows to fetch. Defaults to `Inf`, meaning all rows. |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
no_optimization |
Logical. Sets the following parameters to |
collect_in_background |
Logical. Detach this query from R session. Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session. |
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous or non-existent times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
Equivalent to as.data.frame(x, ...) |> as.matrix()
.
## S3 method for class 'RPolarsDataFrame' as.matrix(x, ...) ## S3 method for class 'RPolarsLazyFrame' as.matrix(x, ...)
## S3 method for class 'RPolarsDataFrame' as.matrix(x, ...) ## S3 method for class 'RPolarsLazyFrame' as.matrix(x, ...)
x |
An object to convert to a matrix. |
... |
Additional arguments passed to methods. |
Convert to a vector
## S3 method for class 'RPolarsSeries' as.vector(x, mode)
## S3 method for class 'RPolarsSeries' as.vector(x, mode)
x |
A Polars Series |
mode |
Not used. |
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous or non-existent times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
Combine to a Series
## S3 method for class 'RPolarsSeries' c(x, ...)
## S3 method for class 'RPolarsSeries' c(x, ...)
x |
A Polars Series |
... |
One or more Series, or any objects that can be converted to a Series. |
All objects must have the same datatype. Combining does not rechunk. Read more
about R vectors, Series and chunks in docs_translations
.
a combined Series
s = c(as_polars_series(1:5), 3:1, NA_integer_) s$chunk_lengths() # the series contain three unmerged chunks
s = c(as_polars_series(1:5), 3:1, NA_integer_) s$chunk_lengths() # the series contain three unmerged chunks
This allows converting all columns to a single datatype, or converting only specific columns. Contrary to the Python implementation, it is not possible to convert all columns of a specific datatype to another datatype.
DataFrame_cast(dtypes, ..., strict = TRUE)
DataFrame_cast(dtypes, ..., strict = TRUE)
dtypes |
Either a datatype or a list where the names are column names and the values are the datatypes to convert to. |
... |
Ignored. |
strict |
If |
A DataFrame
df = pl$DataFrame( foo = 1:3, bar = c(6, 7, 8), ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) ) # Cast only some columns df$cast(list(foo = pl$Float32, bar = pl$UInt8)) # Cast all columns to the same type df$cast(pl$String)
df = pl$DataFrame( foo = 1:3, bar = c(6, 7, 8), ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) ) # Cast only some columns df$cast(list(foo = pl$Float32, bar = pl$UInt8)) # Cast all columns to the same type df$cast(pl$String)
The DataFrame
-class is simply two environments of respectively
the public and private methods/function calls to the polars Rust side. The
instantiated DataFrame
-object is an externalptr
to a low-level Rust
polars DataFrame object.
The S3 method .DollarNames.RPolarsDataFrame
exposes all public
$foobar()
-methods which are callable onto the object. Most methods return
another DataFrame
- class instance or similar which allows for method
chaining. This class system could be called "environment classes" (for lack
of a better name) and is the same class system extendr
provides, except
here there are both a public and private set of methods. For implementation
reasons, the private methods are external and must be called from
.pr$DataFrame$methodname()
. Also, every private method must take
self
as an explicit argument, thus they are pure functions. Having the private
methods as pure functions solved/simplified self-referential complications.
Check out the source code in
R/dataframe_frame.R
to see how public methods are derived from private methods. Check out
extendr-wrappers.R
to see the extendr
-auto-generated methods. These are moved to .pr
and
converted into pure external functions in
after-wrappers.R.
In zzz.R (named
zzz
so that it is the last file sourced) the extendr
-methods are removed and
replaced by any function prefixed with DataFrame_
.
$columns
returns a character vector with the column names.
$dtypes
returns an unnamed list with the data type of each column.
$flags
returns a nested list with column names at the top level and
column flags in each sublist.
Flags are used internally to avoid doing unnecessary computations, such as
sorting a variable that we know is already sorted. The number of flags
varies depending on the column type: columns of type array
and list
have the flags SORTED_ASC
, SORTED_DESC
, and FAST_EXPLODE
, while other
column types only have the former two.
SORTED_ASC
is set to TRUE
when we sort a column in increasing order, so
that we can use this information later on to avoid re-sorting it.
SORTED_DESC
is similar but applies to sort in decreasing order.
$height
returns the number of rows in the DataFrame.
$schema
returns a named list with the data type of each column.
$shape
returns a numeric vector of length two with the number of rows and
the number of columns.
$width
returns the number of columns in the DataFrame.
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous or non-existent times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
# see all public exported method names (normally accessed via a class # instance with $) ls(.pr$env$RPolarsDataFrame) # see all private methods (not intended for regular use) ls(.pr$DataFrame) # make an object df = as_polars_df(iris) # call an active binding df$shape # use a private method, which has mutability result = .pr$DataFrame$set_column_from_robj(df, 150:1, "some_ints") # Column exists in both dataframes-objects now, as they are just pointers to # the same object # There are no public methods with mutability. df2 = df df$columns df2$columns # Show flags df$sort("Sepal.Length")$flags # set_column_from_robj-method is fallible and returned a result which could # be "ok" or an error. # No public method or function will ever return a result. # The `result` is very close to the same as output from functions decorated # with purrr::safely. # To use results on the R side, these must be unwrapped first such that # potentially errors can be thrown. `unwrap(result)` is a way to communicate # errors happening on the Rust side to the R side. `Extendr` default behavior # is to use `panic!`(s) which would cause some unnecessarily confusing and # some very verbose error messages on the inner workings of rust. # `unwrap(result)` in this case no error, just a NULL because this mutable # method does not return any ok-value. # Try unwrapping an error from polars due to unmatching column lengths err_result = .pr$DataFrame$set_column_from_robj(df, 1:10000, "wrong_length") tryCatch(unwrap(err_result, call = NULL), error = \(e) cat(as.character(e)))
# see all public exported method names (normally accessed via a class # instance with $) ls(.pr$env$RPolarsDataFrame) # see all private methods (not intended for regular use) ls(.pr$DataFrame) # make an object df = as_polars_df(iris) # call an active binding df$shape # use a private method, which has mutability result = .pr$DataFrame$set_column_from_robj(df, 150:1, "some_ints") # Column exists in both dataframes-objects now, as they are just pointers to # the same object # There are no public methods with mutability. df2 = df df$columns df2$columns # Show flags df$sort("Sepal.Length")$flags # set_column_from_robj-method is fallible and returned a result which could # be "ok" or an error. # No public method or function will ever return a result. # The `result` is very close to the same as output from functions decorated # with purrr::safely. # To use results on the R side, these must be unwrapped first such that # potentially errors can be thrown. `unwrap(result)` is a way to communicate # errors happening on the Rust side to the R side. `Extendr` default behavior # is to use `panic!`(s) which would cause some unnecessarily confusing and # some very verbose error messages on the inner workings of rust. # `unwrap(result)` in this case no error, just a NULL because this mutable # method does not return any ok-value. # Try unwrapping an error from polars due to unmatching column lengths err_result = .pr$DataFrame$set_column_from_robj(df, 1:10000, "wrong_length") tryCatch(unwrap(err_result, call = NULL), error = \(e) cat(as.character(e)))
Returns an n-row null-filled DataFrame with an identical schema. n
can be
greater than the current number of rows in the DataFrame.
DataFrame_clear(n = 0)
DataFrame_clear(n = 0)
n |
Number of (null-filled) rows to return in the cleared frame. |
An n-row null-filled DataFrame with an identical schema
df = pl$DataFrame( a = c(NA, 2, 3, 4), b = c(0.5, NA, 2.5, 13), c = c(TRUE, TRUE, FALSE, NA) ) df$clear() df$clear(n = 5)
df = pl$DataFrame( a = c(NA, 2, 3, 4), b = c(0.5, NA, 2.5, 13), c = c(TRUE, TRUE, FALSE, NA) ) df$clear() df$clear(n = 5)
This makes a very cheap deep copy/clone of an existing
DataFrame
. Rarely useful as DataFrame
s are nearly 100%
immutable. Any modification of a DataFrame
should lead to a clone anyway,
but this can be useful when dealing with attributes (see examples).
DataFrame_clone()
DataFrame_clone()
A DataFrame
df1 = as_polars_df(iris) # Make a function to take a DataFrame, add an attribute, and return a DataFrame give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original DataFrame also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_df(iris) df2 = give_attr(df1) # now, the original DataFrame doesn't get this attribute attributes(df1)
df1 = as_polars_df(iris) # Make a function to take a DataFrame, add an attribute, and return a DataFrame give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original DataFrame also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_df(iris) df2 = give_attr(df1) # now, the original DataFrame doesn't get this attribute attributes(df1)
This returns the total number of rows, the number of missing
values, the mean, standard deviation, min, max, median and the percentiles
specified in the argument percentiles
.
DataFrame_describe(percentiles = c(0.25, 0.75), interpolation = "nearest")
DataFrame_describe(percentiles = c(0.25, 0.75), interpolation = "nearest")
percentiles |
One or more percentiles to include in the summary statistics.
All values must be in the range |
interpolation |
Interpolation method for computing quantiles. One of
|
DataFrame
as_polars_df(iris)$describe() # string, date, boolean columns are also supported: df = pl$DataFrame( int = 1:3, string = c(letters[1:2], NA), date = c(as.Date("2024-01-20"), as.Date("2024-01-21"), NA), cat = factor(c(letters[1:2], NA)), bool = c(TRUE, FALSE, NA) ) df df$describe()
as_polars_df(iris)$describe() # string, date, boolean columns are also supported: df = pl$DataFrame( int = 1:3, string = c(letters[1:2], NA), date = c(as.Date("2024-01-20"), as.Date("2024-01-21"), NA), cat = factor(c(letters[1:2], NA)), bool = c(TRUE, FALSE, NA) ) df df$describe()
Drop columns of a DataFrame
DataFrame_drop(..., strict = TRUE)
DataFrame_drop(..., strict = TRUE)
... |
Characters of column names to drop. Passed to |
strict |
Validate that all column names exist in the schema and throw an exception if a column name does not exist in the schema. |
DataFrame
as_polars_df(mtcars)$drop(c("mpg", "hp")) # equivalent as_polars_df(mtcars)$drop("mpg", "hp")
as_polars_df(mtcars)$drop(c("mpg", "hp")) # equivalent as_polars_df(mtcars)$drop("mpg", "hp")
Drop a single column in-place and return the dropped column.
DataFrame_drop_in_place(name)
DataFrame_drop_in_place(name)
name |
string Name of the column to drop. |
Series
dat = as_polars_df(iris) x = dat$drop_in_place("Species") x dat$columns
dat = as_polars_df(iris) x = dat$drop_in_place("Species") x dat$columns
Drop all rows that contain nulls (which correspond to NA
in R).
DataFrame_drop_nulls(subset = NULL)
DataFrame_drop_nulls(subset = NULL)
subset |
A character vector with the names of the column(s) for which
nulls are considered. If |
DataFrame
tmp = mtcars tmp[1:3, "mpg"] = NA tmp[4, "hp"] = NA tmp = as_polars_df(tmp) # number of rows in `tmp` before dropping nulls tmp$height tmp$drop_nulls()$height tmp$drop_nulls("mpg")$height tmp$drop_nulls(c("mpg", "hp"))$height
tmp = mtcars tmp[1:3, "mpg"] = NA tmp[4, "hp"] = NA tmp = as_polars_df(tmp) # number of rows in `tmp` before dropping nulls tmp$height tmp$drop_nulls()$height tmp$drop_nulls("mpg")$height tmp$drop_nulls(c("mpg", "hp"))$height
Get the data type of all columns as strings. You can see all
available types with names(pl$dtypes)
. The data type of each column is also
shown when printing the DataFrame.
DataFrame_dtype_strings()
DataFrame_dtype_strings()
A character vector with the data type of each column
as_polars_df(iris)$dtype_strings()
as_polars_df(iris)$dtype_strings()
Check if two DataFrames are equal.
DataFrame_equals(other)
DataFrame_equals(other)
other |
DataFrame to compare with. |
A logical value
dat1 = as_polars_df(iris) dat2 = as_polars_df(iris) dat3 = as_polars_df(mtcars) dat1$equals(dat2) dat1$equals(dat3)
dat1 = as_polars_df(iris) dat2 = as_polars_df(iris) dat3 = as_polars_df(mtcars) dat1$equals(dat2) dat1$equals(dat3)
Return an estimation of the total (heap) allocated size of the DataFrame.
DataFrame_estimated_size()
DataFrame_estimated_size()
function
Estimated size in bytes
as_polars_df(mtcars)$estimated_size()
as_polars_df(mtcars)$estimated_size()
Explode columns containing a list of values
DataFrame_explode(...)
DataFrame_explode(...)
... |
Column(s) to be exploded as individual |
DataFrame
df = pl$DataFrame( letters = letters[1:4], numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)), numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers ) df # explode a single column, append others df$explode("numbers") # explode two columns of same nesting structure, by names or the common dtype # "List(Float64)" df$explode("numbers", "numbers_2") df$explode(pl$col(pl$List(pl$Float64)))
df = pl$DataFrame( letters = letters[1:4], numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)), numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers ) df # explode a single column, append others df$explode("numbers") # explode two columns of same nesting structure, by names or the common dtype # "List(Float64)" df$explode("numbers", "numbers_2") df$explode(pl$col(pl$List(pl$Float64)))
Fill floating point NaN value with a fill value
DataFrame_fill_nan(value)
DataFrame_fill_nan(value)
value |
Value used to fill |
DataFrame
df = pl$DataFrame( a = c(1.5, 2, NaN, 4), b = c(1.5, NaN, NaN, 4) ) df$fill_nan(99)
df = pl$DataFrame( a = c(1.5, 2, NaN, 4), b = c(1.5, NaN, NaN, 4) ) df$fill_nan(99)
Fill null values (which correspond to NA
in R) using the
specified value or strategy.
DataFrame_fill_null(fill_value)
DataFrame_fill_null(fill_value)
fill_value |
Value to fill nulls with. |
DataFrame
df = pl$DataFrame( a = c(1.5, 2, NA, 4), b = c(1.5, NA, NA, 4) ) df$fill_null(99) df$fill_null(pl$col("a")$mean())
df = pl$DataFrame( a = c(1.5, 2, NA, 4), b = c(1.5, NA, NA, 4) ) df$fill_null(99) df$fill_null(pl$col("a")$mean())
Filter rows with an Expression defining a boolean column.
Multiple expressions are combined with &
(AND).
This is equivalent to dplyr::filter()
.
DataFrame_filter(...)
DataFrame_filter(...)
... |
Polars expressions which will evaluate to a boolean. |
Rows where the condition returns NA
are dropped.
A DataFrame with only the rows where the conditions are TRUE
.
df = as_polars_df(iris) df$filter(pl$col("Sepal.Length") > 5) # This is equivalent to # df$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) df$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1) # rows where condition is NA are dropped iris2 = iris iris2[c(1, 3, 5), "Species"] = NA df = as_polars_df(iris2) df$filter(pl$col("Species") == "setosa")
df = as_polars_df(iris) df$filter(pl$col("Sepal.Length") > 5) # This is equivalent to # df$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) df$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1) # rows where condition is NA are dropped iris2 = iris iris2[c(1, 3, 5), "Species"] = NA df = as_polars_df(iris2) df$filter(pl$col("Species") == "setosa")
Get the first row of the DataFrame.
DataFrame_first()
DataFrame_first()
A DataFrame with one row.
as_polars_df(mtcars)$first()
as_polars_df(mtcars)$first()
Take every nth row in the DataFrame
DataFrame_gather_every(n, offset = 0)
DataFrame_gather_every(n, offset = 0)
n |
Gather every |
offset |
Starting index. |
A DataFrame
df = pl$DataFrame(a = 1:4, b = 5:8) df$gather_every(2) df$gather_every(2, offset = 1)
df = pl$DataFrame(a = 1:4, b = 5:8) df$gather_every(2) df$gather_every(2, offset = 1)
Extract a DataFrame column as a Polars series.
DataFrame_get_column(name)
DataFrame_get_column(name)
name |
Name of the column to extract. |
Series
df = as_polars_df(iris[1:2, ]) df$get_column("Species")
df = as_polars_df(iris[1:2, ]) df$get_column("Species")
Get the DataFrame as a List of Series
DataFrame_get_columns()
DataFrame_get_columns()
A list of Series
<DataFrame>$to_list()
:
Similar to this method but returns a list of vectors instead of Series.
df = pl$DataFrame(foo = 1L:3L, bar = 4L:6L) df$get_columns() df = pl$DataFrame( a = 1:4, b = c(0.5, 4, 10, 13), c = c(TRUE, TRUE, FALSE, TRUE) ) df$get_columns()
df = pl$DataFrame(foo = 1L:3L, bar = 4L:6L) df$get_columns() df = pl$DataFrame( a = 1:4, b = c(0.5, 4, 10, 13), c = c(TRUE, TRUE, FALSE, TRUE) ) df$get_columns()
The formatting shows one line per column so that wide DataFrames display cleanly. Each line shows the column name, the data type, and the first few values.
DataFrame_glimpse( ..., max_items_per_column = 10, max_colname_length = 50, return_as_string = FALSE )
DataFrame_glimpse( ..., max_items_per_column = 10, max_colname_length = 50, return_as_string = FALSE )
... |
Ignored. |
max_items_per_column |
Maximum number of items to show per column. |
max_colname_length |
Maximum length of the displayed column names. Values that exceed this value are truncated with a trailing ellipsis. |
return_as_string |
Logical (default |
DataFrame
as_polars_df(iris)$glimpse()
as_polars_df(iris)$glimpse()
This doesn't modify the data but only stores information about
the group structure. This structure can then be used by several functions
($agg()
, $filter()
, etc.).
DataFrame_group_by(..., maintain_order = polars_options()$maintain_order)
DataFrame_group_by(..., maintain_order = polars_options()$maintain_order)
... |
Column(s) to group by. Accepts expression input. Characters are parsed as column names. |
maintain_order |
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
Setting this to |
Within each group, the order of the rows is always preserved,
regardless of the maintain_order
argument.
GroupBy (a DataFrame with special groupby methods like $agg()
)
df = pl$DataFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) df$group_by("a")$agg(pl$col("b")$sum()) # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input. df$group_by("a", maintain_order = TRUE)$agg(pl$col("c")) # Group by multiple columns by passing a list of column names. df$group_by(c("a", "b"))$agg(pl$max("c")) # Or pass some arguments to group by multiple columns in the same way. # Expressions are also accepted. df$group_by("a", pl$col("b") %/% 2)$agg( pl$col("c")$mean() ) # The columns will be renamed to the argument names. df$group_by(d = "a", e = pl$col("b") %/% 2)$agg( pl$col("c")$mean() )
df = pl$DataFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) df$group_by("a")$agg(pl$col("b")$sum()) # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input. df$group_by("a", maintain_order = TRUE)$agg(pl$col("c")) # Group by multiple columns by passing a list of column names. df$group_by(c("a", "b"))$agg(pl$max("c")) # Or pass some arguments to group by multiple columns in the same way. # Expressions are also accepted. df$group_by("a", pl$col("b") %/% 2)$agg( pl$col("c")$mean() ) # The columns will be renamed to the argument names. df$group_by(d = "a", e = pl$col("b") %/% 2)$agg( pl$col("c")$mean() )
If you have a time series <t_0, t_1, ..., t_n>
, then by default the windows
created will be:
(t_0 - period, t_0]
(t_1 - period, t_1]
…
(t_n - period, t_n]
whereas if you pass a non-default offset, then the windows will be:
(t_0 + offset, t_0 + offset + period]
(t_1 + offset, t_1 + offset + period]
…
(t_n + offset, t_n + offset + period]
DataFrame_group_by_dynamic( index_column, ..., every, period = NULL, offset = NULL, include_boundaries = FALSE, closed = "left", label = "left", group_by = NULL, start_by = "window" )
DataFrame_group_by_dynamic( index_column, ..., every, period = NULL, offset = NULL, include_boundaries = FALSE, closed = "left", label = "left", group_by = NULL, start_by = "window" )
index_column |
Column used to group based on the time window. Often of
type Date/Datetime. This column must be sorted in ascending order (or, if |
... |
Ignored. |
every |
Interval of the window. |
period |
A character representing the length of the window,
must be non-negative. See the |
offset |
A character representing the offset of the window,
or |
include_boundaries |
Add two columns |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
label |
Define which label to use for the window:
|
group_by |
Also group by this column/these columns. |
start_by |
The strategy to determine the start of the first window by:
|
In case of a rolling operation on an integer column, the windows are defined by:
"1i" # length 1
"10i" # length 10
A GroupBy object
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() ) # using "include_boundaries = TRUE" is helpful to see the period considered df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") ) # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally df$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") ) # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: df$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") ) # Dynamic group bys can also be combined with grouping on normal keys df = df$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) df df$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n")) # We can also create a dynamic group by based on an index column df = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) df df$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() ) # using "include_boundaries = TRUE" is helpful to see the period considered df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") ) # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally df$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") ) # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: df$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") ) # Dynamic group bys can also be combined with grouping on normal keys df = df$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) df df$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n")) # We can also create a dynamic group by based on an index column df = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) df df$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))
Get the first n rows.
DataFrame_head(n = 5L) DataFrame_limit(n = 5L)
DataFrame_head(n = 5L) DataFrame_limit(n = 5L)
n |
Number of rows to return. If a negative value is passed,
return all rows except the last |
$limit()
is an alias for $head()
.
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) df$head(3) # Pass a negative value to get all rows except the last `abs(n)`. df$head(-3)
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) df$head(3) # Pass a negative value to get all rows except the last `abs(n)`. df$head(-3)
If row and column location are not specified, the DataFrame must have dimensions (1, 1).
DataFrame_item(row = NULL, column = NULL)
DataFrame_item(row = NULL, column = NULL)
row |
Optional row index (0-indexed). |
column |
Optional column index (0-indexed) or name. |
A value of length 1
df = pl$DataFrame(a = c(1, 2, 3), b = c(4, 5, 6)) df$select((pl$col("a") * pl$col("b"))$sum())$item() df$item(1, 1) df$item(2, "b")
df = pl$DataFrame(a = c(1, 2, 3), b = c(4, 5, 6)) df$select((pl$col("a") * pl$col("b"))$sum())$item() df$item(1, 1) df$item(2, "b")
This function can do both mutating joins (adding columns based on matching
observations, for example with how = "left"
) and filtering joins (keeping
observations based on matching observations, for example with how = "semi"
).
DataFrame_join( other, on = NULL, how = "inner", ..., left_on = NULL, right_on = NULL, suffix = "_right", validate = "m:m", join_nulls = FALSE, allow_parallel = TRUE, force_parallel = FALSE, coalesce = NULL )
DataFrame_join( other, on = NULL, how = "inner", ..., left_on = NULL, right_on = NULL, suffix = "_right", validate = "m:m", join_nulls = FALSE, allow_parallel = TRUE, force_parallel = FALSE, coalesce = NULL )
other |
DataFrame to join with. |
on |
Either a vector of column names or a list of expressions and/or
strings. Use |
how |
One of the following methods: "inner", "left", "right", "full", "semi", "anti", "cross". |
... |
Ignored. |
left_on , right_on
|
Same as |
suffix |
Suffix to add to duplicated column names. |
validate |
Checks if join is of specified type:
Note that this is currently not supported by the streaming engine, and is only supported when joining by single columns. |
join_nulls |
Join on null values. By default null values will never produce matches. |
allow_parallel |
Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel. |
force_parallel |
Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. |
coalesce |
Coalescing behavior (merging of join columns).
|
DataFrame
# inner join by default df1 = pl$DataFrame(list(key = 1:3, payload = c("f", "i", NA))) df2 = pl$DataFrame(list(key = c(3L, 4L, 5L, NA_integer_))) df1$join(other = df2, on = "key") # cross join df1 = pl$DataFrame(x = letters[1:3]) df2 = pl$DataFrame(y = 1:4) df1$join(other = df2, how = "cross")
# inner join by default df1 = pl$DataFrame(list(key = 1:3, payload = c("f", "i", NA))) df2 = pl$DataFrame(list(key = c(3L, 4L, 5L, NA_integer_))) df1$join(other = df2, on = "key") # cross join df1 = pl$DataFrame(x = letters[1:3]) df2 = pl$DataFrame(y = 1:4) df1$join(other = df2, how = "cross")
This is similar to a left-join except that we match on nearest key rather than equal keys.
DataFrame_join_asof( other, ..., left_on = NULL, right_on = NULL, on = NULL, by_left = NULL, by_right = NULL, by = NULL, strategy = c("backward", "forward", "nearest"), suffix = "_right", tolerance = NULL, allow_parallel = TRUE, force_parallel = FALSE, coalesce = TRUE )
DataFrame_join_asof( other, ..., left_on = NULL, right_on = NULL, on = NULL, by_left = NULL, by_right = NULL, by = NULL, strategy = c("backward", "forward", "nearest"), suffix = "_right", tolerance = NULL, allow_parallel = TRUE, force_parallel = FALSE, coalesce = TRUE )
other |
DataFrame or LazyFrame |
... |
Not used, blocks use of further positional arguments |
left_on , right_on
|
Same as |
on |
Either a vector of column names or a list of expressions and/or
strings. Use |
by_left , by_right
|
Same as |
by |
Join on these columns before performing asof join. Either a vector
of column names or a list of expressions and/or strings. Use |
strategy |
Strategy for where to find match:
|
suffix |
Suffix to add to duplicated column names. |
tolerance |
Numeric tolerance. By setting this the join will only be done if the near
keys are within this distance. If an asof join is done on columns of dtype
"Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
About the language, see the "Polars duration string language" section below. There may be a circumstance where R types are not sufficient to express a
numeric tolerance. In that case, you can use the expression syntax like
|
allow_parallel |
Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel. |
force_parallel |
Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. |
coalesce |
Coalescing behavior (merging of
|
Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
New joined DataFrame
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
# create two DataFrames to join asof gdp = pl$DataFrame( date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")), gdp = c(4321, 4164, 4411, 4566, 4696), group = c("b", "a", "a", "b", "b") ) pop = pl$DataFrame( date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")), population = c(82.19, 82.66, 83.12, 83.52), group = c("b", "b", "a", "a") ) # optional make sure tables are already sorted with "on" join-key gdp = gdp$sort("date") pop = pop$sort("date") # Left-join_asof DataFrame pop with gdp on "date" # Look backward in gdp to find closest matching date pop$join_asof(gdp, on = "date", strategy = "backward") # .... and forward pop$join_asof(gdp, on = "date", strategy = "forward") # join by a group: "only look within within group" pop$join_asof(gdp, on = "date", by = "group", strategy = "backward") # only look 2 weeks and 2 days back pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d") # only look 11 days back (numeric tolerance depends on polars type, <date> is in days) pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)
# create two DataFrames to join asof gdp = pl$DataFrame( date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")), gdp = c(4321, 4164, 4411, 4566, 4696), group = c("b", "a", "a", "b", "b") ) pop = pl$DataFrame( date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")), population = c(82.19, 82.66, 83.12, 83.52), group = c("b", "b", "a", "a") ) # optional make sure tables are already sorted with "on" join-key gdp = gdp$sort("date") pop = pop$sort("date") # Left-join_asof DataFrame pop with gdp on "date" # Look backward in gdp to find closest matching date pop$join_asof(gdp, on = "date", strategy = "backward") # .... and forward pop$join_asof(gdp, on = "date", strategy = "forward") # join by a group: "only look within within group" pop$join_asof(gdp, on = "date", by = "group", strategy = "backward") # only look 2 weeks and 2 days back pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d") # only look 11 days back (numeric tolerance depends on polars type, <date> is in days) pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)
This performs an inner join, so only rows where all predicates are true are included in the result, and a row from either DataFrame may be included multiple times in the result.
Note that the row order of the input DataFrames is not preserved.
DataFrame_join_where(other, ..., suffix = "_right")
DataFrame_join_where(other, ..., suffix = "_right")
other |
DataFrame to join with. |
... |
(In)Equality condition to join the two tables on. When a column
name occurs in both tables, the proper suffix must be applied in the
predicate. For example, if both tables have a column |
suffix |
Suffix to append to columns with a duplicate name. |
A DataFrame
east = pl$DataFrame( id = c(100, 101, 102), dur = c(120, 140, 160), rev = c(12, 14, 16), cores = c(2, 8, 4) ) west = pl$DataFrame( t_id = c(404, 498, 676, 742), time = c(90, 130, 150, 170), cost = c(9, 13, 15, 16), cores = c(4, 2, 1, 4) ) east$join_where( west, pl$col("dur") < pl$col("time"), pl$col("rev") < pl$col("cost") )
east = pl$DataFrame( id = c(100, 101, 102), dur = c(120, 140, 160), rev = c(12, 14, 16), cores = c(2, 8, 4) ) west = pl$DataFrame( t_id = c(404, 498, 676, 742), time = c(90, 130, 150, 170), cost = c(9, 13, 15, 16), cores = c(4, 2, 1, 4) ) east$join_where( west, pl$col("dur") < pl$col("time"), pl$col("rev") < pl$col("cost") )
Get the last row of the DataFrame.
DataFrame_last()
DataFrame_last()
A DataFrame with one row.
as_polars_df(mtcars)$last()
as_polars_df(mtcars)$last()
Start a new lazy query from a DataFrame.
DataFrame_lazy()
DataFrame_lazy()
A LazyFrame
as_polars_df(iris)$lazy()
as_polars_df(iris)$lazy()
Aggregate the columns in the DataFrame to their maximum value.
DataFrame_max()
DataFrame_max()
A DataFrame with one row.
as_polars_df(mtcars)$max()
as_polars_df(mtcars)$max()
Aggregate the columns in the DataFrame to their mean value.
DataFrame_mean()
DataFrame_mean()
A DataFrame with one row.
as_polars_df(mtcars)$mean()
as_polars_df(mtcars)$mean()
Aggregate the columns in the DataFrame to their median value.
DataFrame_median()
DataFrame_median()
A DataFrame with one row.
as_polars_df(mtcars)$median()
as_polars_df(mtcars)$median()
Aggregate the columns in the DataFrame to their minimum value.
DataFrame_min()
DataFrame_min()
A DataFrame with one row.
as_polars_df(mtcars)$min()
as_polars_df(mtcars)$min()
Number of chunks (memory allocations) for all or first Series in a DataFrame.
DataFrame_n_chunks(strategy = "first")
DataFrame_n_chunks(strategy = "first")
strategy |
Either |
A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper around a ChunkedArray, which is like a virtual contiguous vector physically backed by an ordered set of chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and cross-platform memory layout.
In R, combining with c()
or rbind()
requires immediate vector re-allocation
to place vector values in contiguous memory. This is slow and memory consuming,
and it is why repeatedly appending to a vector in R is discouraged.
In polars, when we concatenate or append to Series or DataFrame, the re-allocation can be avoided or delayed by simply appending chunks to each individual Series. However, if chunks become many and small or are misaligned across Series, this can hurt the performance of subsequent operations.
In most places in the polars API where chunking could occur, the user
typically has to actively opt out of rechunking by setting the argument rechunk = FALSE
.
A real vector of chunk counts per Series.
# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks how = "horizontal" ) df df$n_chunks() # rechunk a chunked DataFrame df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a # S3-classed list with same type of vectors and where have all relevant S3 # generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") } rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation z = c(x, x) z # looks like a plain vector lapply(z, tracemem) # mem allocation in z are the same from x str(z) z = rechunk(z) str(z)
# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks how = "horizontal" ) df df$n_chunks() # rechunk a chunked DataFrame df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a # S3-classed list with same type of vectors and where have all relevant S3 # generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") } rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation z = c(x, x) z # looks like a plain vector lapply(z, tracemem) # mem allocation in z are the same from x str(z) z = rechunk(z) str(z)
Create a new DataFrame that shows the number of null values (which correspond
to NA
in R) per column.
DataFrame_null_count()
DataFrame_null_count()
function
DataFrame
x = mtcars x[1, 2:3] = NA pl$DataFrame(x)$null_count()
x = mtcars x[1, 2:3] = NA pl$DataFrame(x)$null_count()
Similar to $group_by()
.
Group by the given columns and return the groups as separate DataFrames.
It is useful to use this in combination with functions like lapply()
or purrr::map()
.
DataFrame_partition_by( ..., maintain_order = TRUE, include_key = TRUE, as_nested_list = FALSE )
DataFrame_partition_by( ..., maintain_order = TRUE, include_key = TRUE, as_nested_list = FALSE )
... |
Characters of column names to group by. Passed to |
maintain_order |
If |
include_key |
If |
as_nested_list |
This affects the format of the output.
If |
A list of DataFrames. See the examples for details.
df = pl$DataFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) df # Pass a single column name to partition by that column. df$partition_by("a") # Partition by multiple columns. df$partition_by("a", "b") # Partition by column data type df$partition_by(pl$String) # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field. # The `key` is a named list of the key values, and the `data` is the DataFrame. df$partition_by("a", "b", as_nested_list = TRUE) # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`. tryCatch( df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE), warning = function(w) w ) # Example of using with lapply(), and printing the key and the data summary df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |> lapply(\(x) { sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |> cat() x$data$drop(names(x$key))$describe() |> print() invisible(NULL) }) |> invisible()
df = pl$DataFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) df # Pass a single column name to partition by that column. df$partition_by("a") # Partition by multiple columns. df$partition_by("a", "b") # Partition by column data type df$partition_by(pl$String) # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field. # The `key` is a named list of the key values, and the `data` is the DataFrame. df$partition_by("a", "b", as_nested_list = TRUE) # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`. tryCatch( df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE), warning = function(w) w ) # Example of using with lapply(), and printing the key and the data summary df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |> lapply(\(x) { sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |> cat() x$data$drop(names(x$key))$describe() |> print() invisible(NULL) }) |> invisible()
Pivot data from long to wide
DataFrame_pivot( on, ..., index, values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, separator = "_" )
DataFrame_pivot( on, ..., index, values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, separator = "_" )
on |
Name of the column(s) whose values will be used as the header of the output DataFrame. |
... |
Not used. |
index |
One or multiple keys to group by. |
values |
Column values to aggregate. Can be multiple columns if the
|
aggregate_function |
One of:
|
maintain_order |
Sort the grouped keys so that the output order is predictable. |
sort_columns |
Sort the transposed columns by name. Default is by order of discovery. |
separator |
Used as separator/delimiter in generated column names. |
DataFrame
df = pl$DataFrame( foo = c("one", "one", "one", "two", "two", "two"), bar = c("A", "B", "C", "A", "B", "C"), baz = c(1, 2, 3, 4, 5, 6) ) df df$pivot( values = "baz", index = "foo", on = "bar" ) # Run an expression as aggregation function df = pl$DataFrame( col1 = c("a", "a", "a", "b", "b", "b"), col2 = c("x", "x", "x", "x", "y", "y"), col3 = c(6, 7, 3, 2, 5, 7) ) df df$pivot( index = "col1", on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() )
df = pl$DataFrame( foo = c("one", "one", "one", "two", "two", "two"), bar = c("A", "B", "C", "A", "B", "C"), baz = c(1, 2, 3, 4, 5, 6) ) df df$pivot( values = "baz", index = "foo", on = "bar" ) # Run an expression as aggregation function df = pl$DataFrame( col1 = c("a", "a", "a", "b", "b", "b"), col2 = c("x", "x", "x", "x", "y", "y"), col3 = c(6, 7, 3, 2, 5, 7) ) df df$pivot( index = "col1", on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() )
Aggregate the columns in the DataFrame to a unique quantile
value. Use $describe()
to specify several quantiles.
DataFrame_quantile(quantile, interpolation = "nearest")
DataFrame_quantile(quantile, interpolation = "nearest")
quantile |
Numeric of length 1 between 0 and 1. |
interpolation |
One of |
DataFrame
as_polars_df(mtcars)$quantile(.4)
as_polars_df(mtcars)$quantile(.4)
Rechunking re-allocates any "chunked" memory allocations to speed-up e.g. vectorized operations.
DataFrame_rechunk()
DataFrame_rechunk()
A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper around a ChunkedArray, which is like a virtual contiguous vector physically backed by an ordered set of chunks. Each chunk of values has a contiguous memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and cross-platform memory layout.
In R, combining with c()
or rbind()
requires immediate vector re-allocation
to place vector values in contiguous memory. This is slow and memory consuming,
and it is why repeatedly appending to a vector in R is discouraged.
In polars, when we concatenate or append to Series or DataFrame, the re-allocation can be avoided or delayed by simply appending chunks to each individual Series. However, if chunks become many and small or are misaligned across Series, this can hurt the performance of subsequent operations.
In most places in the polars API where chunking could occur, the user typically
has to actively opt out by setting the argument rechunk = FALSE
.
A DataFrame
# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks how = "horizontal" ) df df$n_chunks() # rechunk a chunked DataFrame df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a # S3-classed list with same type of vectors and where have all relevant S3 # generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") } rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation z = c(x, x) z # looks like a plain vector lapply(z, tracemem) # mem allocation in z are the same from x str(z) z = rechunk(z) str(z)
# create DataFrame with misaligned chunks df = pl$concat( 1:10, # single chunk pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks how = "horizontal" ) df df$n_chunks() # rechunk a chunked DataFrame df$rechunk()$n_chunks() # rechunk is not an in-place operation df$n_chunks() # The following toy example emulates the Series "chunkyness" in R. Here it a # S3-classed list with same type of vectors and where have all relevant S3 # generics implemented to make behave as if it was a regular vector. "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") print.chunked_vector = \(x, ...) print(unlist(x), ...) c.chunked_vector = \(...) { structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") } rechunk = \(x) structure(unlist(x), class = "chunked_vector") x = structure(list(1:4, 5L), class = "chunked_vector") x x + 5:1 lapply(x, tracemem) # trace chunks to verify no re-allocation z = c(x, x) z # looks like a plain vector lapply(z, tracemem) # mem allocation in z are the same from x str(z) z = rechunk(z) str(z)
Rename column names of a DataFrame
DataFrame_rename(...)
DataFrame_rename(...)
... |
One of the following:
|
If existing names are swapped (e.g. A
points to B
and B
points to A
),
polars will block projection and predicate pushdowns at this node.
df = pl$DataFrame( foo = 1:3, bar = 6:8, ham = letters[1:3] ) df$rename(foo = "apple") df$rename( \(column_name) paste0("c", substr(column_name, 2, 100)) )
df = pl$DataFrame( foo = 1:3, bar = 6:8, ham = letters[1:3] ) df$rename(foo = "apple") df$rename( \(column_name) paste0("c", substr(column_name, 2, 100)) )
Reverse the DataFrame (the last row becomes the first one, etc.).
DataFrame_reverse()
DataFrame_reverse()
DataFrame
as_polars_df(mtcars)$reverse()
as_polars_df(mtcars)$reverse()
If you have a time series <t_0, t_1, ..., t_n>
, then by default the windows
created will be:
(t_0 - period, t_0]
(t_1 - period, t_1]
…
(t_n - period, t_n]
whereas if you pass a non-default offset, then the windows will be:
(t_0 + offset, t_0 + offset + period]
(t_1 + offset, t_1 + offset + period]
…
(t_n + offset, t_n + offset + period]
DataFrame_rolling( index_column, ..., period, offset = NULL, closed = "right", group_by = NULL )
DataFrame_rolling( index_column, ..., period, offset = NULL, closed = "right", group_by = NULL )
index_column |
Column used to group based on the time window. Often of
type Date/Datetime. This column must be sorted in ascending order (or, if |
... |
Ignored. |
period |
A character representing the length of the window,
must be non-negative. See the |
offset |
A character representing the offset of the window,
or |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
group_by |
Also group by this column/these columns. |
In case of a rolling operation on an integer column, the windows are defined by:
"1i" # length 1
"10i" # length 10
A RollingGroupBy object
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
date = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$DataFrame(dt = date, a = c(3, 7, 5, 9, 2, 1))$with_columns( pl$col("dt")$str$strptime(pl$Datetime())$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( sum_a = pl$sum("a"), min_a = pl$min("a"), max_a = pl$max("a") )
date = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$DataFrame(dt = date, a = c(3, 7, 5, 9, 2, 1))$with_columns( pl$col("dt")$str$strptime(pl$Datetime())$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( sum_a = pl$sum("a"), min_a = pl$min("a"), max_a = pl$max("a") )
Take a sample of rows from a DataFrame
DataFrame_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
DataFrame_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
n |
Number of rows to return. Cannot be used with |
... |
Ignored. |
fraction |
Fraction of rows to return. Cannot be used with |
with_replacement |
Allow values to be sampled more than once. |
shuffle |
If |
seed |
Seed for the random number generator. If set to |
DataFrame
df = as_polars_df(iris) df$sample(n = 20) df$sample(fraction = 0.1)
df = as_polars_df(iris) df$sample(n = 20) df$sample(fraction = 0.1)
Similar to dplyr::mutate()
. However, it discards unmentioned
columns (like .()
in data.table
).
DataFrame_select(...)
DataFrame_select(...)
... |
Columns to keep. Those can be expressions (e.g |
DataFrame
as_polars_df(iris)$select( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
as_polars_df(iris)$select( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
Similar to dplyr::mutate()
. However, it discards unmentioned columns (like
.()
in data.table
).
This will run all expressions sequentially instead of in parallel. Use this
when the work per expression is cheap. Otherwise, $select()
should be
preferred.
DataFrame_select_seq(...)
DataFrame_select_seq(...)
... |
Columns to keep. Those can be expressions (e.g |
DataFrame
as_polars_df(iris)$select_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
as_polars_df(iris)$select_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
Shift the values by a given period. If the period (n
) is positive,
then n
rows will be inserted at the top of the DataFrame and the last n
rows will be discarded. Vice-versa if the period is negative. In the end,
the total number of rows of the DataFrame doesn't change.
DataFrame_shift(n = 1, fill_value = NULL)
DataFrame_shift(n = 1, fill_value = NULL)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
fill_value |
Fill the resulting null values with this value. Accepts expression input. Non-expression inputs are parsed as literals. |
DataFrame
df = pl$DataFrame(a = 1:4, b = 5:8) df$shift(2) df$shift(-2) df$shift(-2, fill_value = 100)
df = pl$DataFrame(a = 1:4, b = 5:8) df$shift(2) df$shift(-2) df$shift(-2, fill_value = 100)
Get a slice of the DataFrame.
DataFrame_slice(offset, length = NULL)
DataFrame_slice(offset, length = NULL)
offset |
Start index, can be a negative value. This is 0-indexed, so
|
length |
Length of the slice. If |
DataFrame
# skip the first 2 rows and take the 4 following rows as_polars_df(mtcars)$slice(2, 4) # this is equivalent to: mtcars[3:6, ]
# skip the first 2 rows and take the 4 following rows as_polars_df(mtcars)$slice(2, 4) # this is equivalent to: mtcars[3:6, ]
Sort a DataFrame
DataFrame_sort( by, ..., descending = FALSE, nulls_last = FALSE, maintain_order = FALSE )
DataFrame_sort( by, ..., descending = FALSE, nulls_last = FALSE, maintain_order = FALSE )
by |
Column(s) to sort by. Can be character vector of column names, a list of Expr(s) or a list with a mix of Expr(s) and column names. |
... |
More columns to sort by as above but provided one Expr per argument. |
descending |
Logical. Sort in descending order (default is |
nulls_last |
A logical or logical vector of the same length as the number of columns.
If |
maintain_order |
Whether the order should be maintained if elements are
equal. If |
DataFrame
df = mtcars df$mpg[1] = NA df = pl$DataFrame(df) df$sort("mpg") df$sort("mpg", nulls_last = TRUE) df$sort("cyl", "mpg") df$sort(c("cyl", "mpg")) df$sort(c("cyl", "mpg"), descending = TRUE) df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE)) df$sort(pl$col("cyl"), pl$col("mpg"))
df = mtcars df$mpg[1] = NA df = pl$DataFrame(df) df$sort("mpg") df$sort("mpg", nulls_last = TRUE) df$sort("cyl", "mpg") df$sort(c("cyl", "mpg")) df$sort(c("cyl", "mpg"), descending = TRUE) df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE)) df$sort(pl$col("cyl"), pl$col("mpg"))
The calling frame is automatically registered as a table in the SQL context
under the name "self"
. All DataFrames and
LazyFrames found in the envir
are also registered,
using their variable name.
More control over registration and execution behaviour is available by using
the SQLContext object.
DataFrame_sql(query, ..., table_name = NULL, envir = parent.frame())
DataFrame_sql(query, ..., table_name = NULL, envir = parent.frame())
query |
A character of the SQL query to execute. |
... |
Ignored. |
table_name |
|
envir |
The environment to search for polars DataFrames/LazyFrames. |
This functionality is considered unstable, although it is close to being considered stable. It may be changed at any point without it being considered a breaking change.
df1 = pl$DataFrame( a = 1:3, b = c("zz", "yy", "xx"), c = as.Date(c("1999-12-31", "2010-10-10", "2077-08-08")) ) # Query the DataFrame using SQL: df1$sql("SELECT c, b FROM self WHERE a > 1") # Join two DataFrames using SQL. df2 = pl$DataFrame(a = 3:1, d = c(125, -654, 888)) df1$sql( " SELECT self.*, d FROM self INNER JOIN df2 USING (a) WHERE a > 1 AND EXTRACT(year FROM c) < 2050 " ) # Apply transformations to a DataFrame using SQL, aliasing "self" to "frame". df1$sql( query = r"( SELECT a, MOD(a, 2) == 0 AS a_is_even, CONCAT_WS(':', b, b) AS b_b, EXTRACT(year FROM c) AS year, 0::float AS 'zero' FROM frame )", table_name = "frame" )
df1 = pl$DataFrame( a = 1:3, b = c("zz", "yy", "xx"), c = as.Date(c("1999-12-31", "2010-10-10", "2077-08-08")) ) # Query the DataFrame using SQL: df1$sql("SELECT c, b FROM self WHERE a > 1") # Join two DataFrames using SQL. df2 = pl$DataFrame(a = 3:1, d = c(125, -654, 888)) df1$sql( " SELECT self.*, d FROM self INNER JOIN df2 USING (a) WHERE a > 1 AND EXTRACT(year FROM c) < 2050 " ) # Apply transformations to a DataFrame using SQL, aliasing "self" to "frame". df1$sql( query = r"( SELECT a, MOD(a, 2) == 0 AS a_is_even, CONCAT_WS(':', b, b) AS b_b, EXTRACT(year FROM c) AS year, 0::float AS 'zero' FROM frame )", table_name = "frame" )
Aggregate the columns of this DataFrame to their standard deviation values.
DataFrame_std(ddof = 1)
DataFrame_std(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A DataFrame with one row.
as_polars_df(mtcars)$std()
as_polars_df(mtcars)$std()
Aggregate the columns of this DataFrame to their sum values.
DataFrame_sum()
DataFrame_sum()
A DataFrame with one row.
as_polars_df(mtcars)$sum()
as_polars_df(mtcars)$sum()
Get the last n
rows.
DataFrame_tail(n = 5L)
DataFrame_tail(n = 5L)
n |
Number of rows to return. If a negative value is passed,
return all rows except the first |
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) df$tail(3) # Pass a negative value to get all rows except the first `abs(n)`. df$tail(-3)
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) df$tail(3) # Pass a negative value to get all rows except the first `abs(n)`. df$tail(-3)
Return Polars DataFrame as R data.frame
DataFrame_to_data_frame( ..., int64_conversion = polars_options()$int64_conversion )
DataFrame_to_data_frame( ..., int64_conversion = polars_options()$int64_conversion )
... |
Any args passed to |
int64_conversion |
How should Int64 values be handled when converting a polars object to R?
|
An R data.frame
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
df = as_polars_df(iris[1:3, ]) df$to_data_frame()
df = as_polars_df(iris[1:3, ]) df$to_data_frame()
Convert variables into dummy/indicator variables
DataFrame_to_dummies(columns = NULL, ..., separator = "_", drop_first = FALSE)
DataFrame_to_dummies(columns = NULL, ..., separator = "_", drop_first = FALSE)
columns |
Column name(s) or selector(s) that should be converted to
dummy variables. If |
... |
Ignored. |
separator |
Separator/delimiter used when generating column names. |
drop_first |
Remove the first category from the variables being encoded. |
A DataFrame
df = pl$DataFrame(foo = 1:2, bar = 3:4, ham = c("a", "b")) df$to_dummies() df$to_dummies(drop_first = TRUE) df$to_dummies(c("foo", "bar"), separator = "::")
df = pl$DataFrame(foo = 1:2, bar = 3:4, ham = c("a", "b")) df$to_dummies() df$to_dummies(drop_first = TRUE) df$to_dummies(c("foo", "bar"), separator = "::")
Return Polars DataFrame as a list of vectors
DataFrame_to_list( unnest_structs = TRUE, ..., int64_conversion = polars_options()$int64_conversion )
DataFrame_to_list( unnest_structs = TRUE, ..., int64_conversion = polars_options()$int64_conversion )
unnest_structs |
Logical. If |
... |
Any args passed to |
int64_conversion |
How should Int64 values be handled when converting a polars object to R?
|
For simplicity reasons, this implementation relies on unnesting all structs
before exporting to R. If unnest_structs = FALSE
, then struct
columns
will be returned as nested lists, where each row is a list of values. Such a
structure is not very typical or efficient in R.
R list of vectors
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
<DataFrame>$get_columns()
:
Similar to this method but returns a list of Series instead of vectors.
as_polars_df(iris)$to_list()
as_polars_df(iris)$to_list()
Write Arrow IPC data to a raw vector
DataFrame_to_raw_ipc( compression = c("uncompressed", "zstd", "lz4"), ..., compat_level = FALSE )
DataFrame_to_raw_ipc( compression = c("uncompressed", "zstd", "lz4"), ..., compat_level = FALSE )
compression |
|
... |
Ignored. |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
A raw vector
df = pl$DataFrame( foo = 1:5, bar = 6:10, ham = letters[1:5] ) raw_ipc = df$to_raw_ipc() pl$read_ipc(raw_ipc) if (require("arrow", quietly = TRUE)) { arrow::read_ipc_file(raw_ipc, as_data_frame = FALSE) }
df = pl$DataFrame( foo = 1:5, bar = 6:10, ham = letters[1:5] ) raw_ipc = df$to_raw_ipc() pl$read_ipc(raw_ipc) if (require("arrow", quietly = TRUE)) { arrow::read_ipc_file(raw_ipc, as_data_frame = FALSE) }
Extract a DataFrame column (by index) as a Polars series. Unlike
get_column()
, this method will not fail but will return a NULL
if the
index doesn't exist in the DataFrame. Keep in mind that Polars is 0-indexed
so "0" is the first column.
DataFrame_to_series(idx = 0)
DataFrame_to_series(idx = 0)
idx |
Index of the column to return as Series. Defaults to 0, which is the first column. |
Series or NULL
df = as_polars_df(iris[1:10, ]) # default is to extract the first column df$to_series() # Polars is 0-indexed, so we use idx = 1 to extract the *2nd* column df$to_series(idx = 1) # doesn't error if the column isn't there df$to_series(idx = 8)
df = as_polars_df(iris[1:10, ]) # default is to extract the first column df$to_series() # Polars is 0-indexed, so we use idx = 1 to extract the *2nd* column df$to_series(idx = 1) # doesn't error if the column isn't there df$to_series(idx = 8)
Convert DataFrame to a Series of type "struct"
DataFrame_to_struct(name = "")
DataFrame_to_struct(name = "")
name |
Name given to the new Series |
A Series of type "struct"
# round-trip conversion from DataFrame with two columns df = pl$DataFrame(a = 1:5, b = c("one", "two", "three", "four", "five")) s = df$to_struct() s # convert to an R list s$to_r() # Convert back to a DataFrame df_s = s$to_frame() df_s
# round-trip conversion from DataFrame with two columns df = pl$DataFrame(a = 1:5, b = c("one", "two", "three", "four", "five")) s = df$to_struct() s # convert to an R list s$to_r() # Convert back to a DataFrame df_s = s$to_frame() df_s
Transpose a DataFrame over the diagonal.
DataFrame_transpose( include_header = FALSE, header_name = "column", column_names = NULL )
DataFrame_transpose( include_header = FALSE, header_name = "column", column_names = NULL )
include_header |
If |
header_name |
If |
column_names |
Character vector indicating the new column names. If |
This is a very expensive operation.
Transpose may be the fastest option to perform non foldable (see fold()
or reduce()
)
row operations like median.
Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
DataFrame
# simple use-case as_polars_df(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) # All rows must have one shared supertype, recast Categorical to String which is a supertype # of f64, and then dataset "Iris" can be transposed as_polars_df(iris)$with_columns(pl$col("Species")$cast(pl$String))$transpose()
# simple use-case as_polars_df(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) # All rows must have one shared supertype, recast Categorical to String which is a supertype # of f64, and then dataset "Iris" can be transposed as_polars_df(iris)$with_columns(pl$col("Species")$cast(pl$String))$transpose()
Drop duplicated rows
DataFrame_unique(subset = NULL, ..., keep = "any", maintain_order = FALSE)
DataFrame_unique(subset = NULL, ..., keep = "any", maintain_order = FALSE)
subset |
A character vector with the names of the column(s) to use to
identify duplicates. If |
... |
Not used. |
keep |
Which of the duplicate rows to keep:
|
maintain_order |
Keep the same order as the original data. Setting this
to |
DataFrame
df = pl$DataFrame( x = c(1:3, 1:3, 3:1, 1L), y = c(1:3, 1:3, 1:3, 1L) ) df$height df$unique()$height # subset to define unique, keep only last or first df$unique(subset = "x", keep = "last") df$unique(subset = "x", keep = "first") # only keep unique rows df$unique(keep = "none")
df = pl$DataFrame( x = c(1:3, 1:3, 3:1, 1L), y = c(1:3, 1:3, 1:3, 1L) ) df$height df$unique()$height # subset to define unique, keep only last or first df$unique(subset = "x", keep = "last") df$unique(subset = "x", keep = "first") # only keep unique rows df$unique(keep = "none")
Unnest the Struct columns of a DataFrame
DataFrame_unnest(...)
DataFrame_unnest(...)
... |
Names of the struct columns to unnest. This doesn't accept Expr. If nothing is provided, then all columns of datatype Struct are unnested. |
A DataFrame where some or all columns of datatype Struct are unnested.
df = pl$DataFrame( a = 1:5, b = c("one", "two", "three", "four", "five"), c = 6:10 )$ select( pl$struct("b"), pl$struct(c("a", "c"))$alias("a_and_c") ) df # by default, all struct columns are unnested df$unnest() # we can specify specific columns to unnest df$unnest("a_and_c")
df = pl$DataFrame( a = 1:5, b = c("one", "two", "three", "four", "five"), c = 6:10 )$ select( pl$struct("b"), pl$struct(c("a", "c"))$alias("a_and_c") ) df # by default, all struct columns are unnested df$unnest() # we can specify specific columns to unnest df$unnest("a_and_c")
Unpivot a Frame from wide to long format
DataFrame_unpivot( on = NULL, ..., index = NULL, variable_name = NULL, value_name = NULL )
DataFrame_unpivot( on = NULL, ..., index = NULL, variable_name = NULL, value_name = NULL )
on |
Columns to use as value variables (the columns to unpivot). If |
... |
Not used. |
index |
Columns to use as identifier variables. |
variable_name |
Name to give to the new column containing the names of the melted columns. Defaults to "variable". |
value_name |
Name to give to the new column containing the values of
the melted columns. Defaults to |
Optionally leaves identifiers set.
This function is useful to massage a Frame into a format where one or more columns are identifier variables (id_vars), while all other columns, considered measured variables (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'.
A new DataFrame
df = pl$DataFrame( a = c("x", "y", "z"), b = c(1, 3, 5), c = c(2, 4, 6), d = c(7, 8, 9) ) df$unpivot(index = "a", on = c("b", "c", "d"))
df = pl$DataFrame( a = c("x", "y", "z"), b = c(1, 3, 5), c = c(2, 4, 6), d = c(7, 8, 9) ) df$unpivot(index = "a", on = c("b", "c", "d"))
Aggregate the columns of this DataFrame to their variance values.
DataFrame_var(ddof = 1)
DataFrame_var(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A DataFrame with one row.
as_polars_df(mtcars)$var()
as_polars_df(mtcars)$var()
Add columns or modify existing ones with expressions. This is
the equivalent of dplyr::mutate()
as it keeps unmentioned columns (unlike
$select()
).
DataFrame_with_columns(...)
DataFrame_with_columns(...)
... |
Any expressions or string column name, or same wrapped in a list. If first and only element is a list, it is unwrapped as a list of args. |
A DataFrame
as_polars_df(iris)$with_columns( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_df(iris)$with_columns(l_expr) as_polars_df(iris)$with_columns( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
as_polars_df(iris)$with_columns( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_df(iris)$with_columns(l_expr) as_polars_df(iris)$with_columns( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
Add columns or modify existing ones with expressions. This is
the equivalent of dplyr::mutate()
as it keeps unmentioned columns (unlike
$select()
).
This will run all expressions sequentially instead of in parallel. Use this
when the work per expression is cheap. Otherwise, $with_columns()
should be
preferred.
DataFrame_with_columns_seq(...)
DataFrame_with_columns_seq(...)
... |
Any expressions or string column name, or same wrapped in a list. If first and only element is a list, it is unwrapped as a list of args. |
A DataFrame
as_polars_df(iris)$with_columns_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_df(iris)$with_columns_seq(l_expr) as_polars_df(iris)$with_columns_seq( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
as_polars_df(iris)$with_columns_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_df(iris)$with_columns_seq(l_expr) as_polars_df(iris)$with_columns_seq( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
Add a new column at index 0 that counts the rows
DataFrame_with_row_index(name, offset = NULL)
DataFrame_with_row_index(name, offset = NULL)
name |
string name of the created column |
offset |
positive integer offset for the start of the counter |
A new DataFrame
object with a counter column in front
df = as_polars_df(mtcars) # by default, the index starts at 0 (to mimic the behavior of Python Polars) df$with_row_index("idx") # but in R, we use a 1-index df$with_row_index("idx", offset = 1)
df = as_polars_df(mtcars) # by default, the index starts at 0 (to mimic the behavior of Python Polars) df$with_row_index("idx") # but in R, we use a 1-index df$with_row_index("idx", offset = 1)
Write to comma-separated values (CSV) file
DataFrame_write_csv( file, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_values = "", quote_style = "necessary" )
DataFrame_write_csv( file, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_values = "", quote_style = "necessary" )
file |
File path to which the result should be written. |
... |
Ignored. |
include_bom |
Whether to include UTF-8 BOM (byte order mark) in the CSV output. |
include_header |
Whether to include header in the CSV output. |
separator |
Separate CSV fields with this symbol. |
line_terminator |
String used to end each row. |
quote_char |
Byte to use as quoting character. |
batch_size |
Number of rows that will be processed per thread. |
datetime_format |
A format string, with the specifiers defined by the chrono Rust crate. If no format specified, the default fractional-second precision is inferred from the maximum timeunit found in the frame’s Datetime cols (if any). |
date_format |
A format string, with the specifiers defined by the chrono Rust crate. |
time_format |
A format string, with the specifiers defined by the chrono Rust crate. |
float_precision |
Number of decimal places to write, applied to both Float32 and Float64 datatypes. |
null_values |
A string representing null values (defaulting to the empty string). |
quote_style |
Determines the quoting strategy used.
|
Invisibly returns the input DataFrame.
dat = as_polars_df(mtcars) destination = tempfile(fileext = ".csv") dat$select(pl$col("drat", "mpg"))$write_csv(destination) pl$read_csv(destination)
dat = as_polars_df(mtcars) destination = tempfile(fileext = ".csv") dat$select(pl$col("drat", "mpg"))$write_csv(destination) pl$read_csv(destination)
Write to Arrow IPC file (a.k.a Feather file)
DataFrame_write_ipc( file, compression = c("uncompressed", "zstd", "lz4"), ..., compat_level = TRUE )
DataFrame_write_ipc( file, compression = c("uncompressed", "zstd", "lz4"), ..., compat_level = TRUE )
file |
File path to which the result should be written. |
compression |
|
... |
Ignored. |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
Invisibly returns the input DataFrame.
dat = as_polars_df(mtcars) destination = tempfile(fileext = ".arrow") dat$write_ipc(destination) if (require("arrow", quietly = TRUE)) { arrow::read_ipc_file(destination, as_data_frame = FALSE) }
dat = as_polars_df(mtcars) destination = tempfile(fileext = ".arrow") dat$write_ipc(destination) if (require("arrow", quietly = TRUE)) { arrow::read_ipc_file(destination, as_data_frame = FALSE) }
Write to JSON file
DataFrame_write_json(file, ..., pretty = FALSE, row_oriented = FALSE)
DataFrame_write_json(file, ..., pretty = FALSE, row_oriented = FALSE)
file |
File path to which the result should be written. |
... |
Ignored. |
pretty |
Pretty serialize JSON. |
row_oriented |
Write to row-oriented JSON. This is slower, but more common. |
Invisibly returns the input DataFrame.
if (require("jsonlite", quiet = TRUE)) { dat = as_polars_df(head(mtcars)) destination = tempfile() dat$select(pl$col("drat", "mpg"))$write_json(destination) jsonlite::fromJSON(destination) dat$select(pl$col("drat", "mpg"))$write_json(destination, row_oriented = TRUE) jsonlite::fromJSON(destination) }
if (require("jsonlite", quiet = TRUE)) { dat = as_polars_df(head(mtcars)) destination = tempfile() dat$select(pl$col("drat", "mpg"))$write_json(destination) jsonlite::fromJSON(destination) dat$select(pl$col("drat", "mpg"))$write_json(destination, row_oriented = TRUE) jsonlite::fromJSON(destination) }
Write to NDJSON file
DataFrame_write_ndjson(file)
DataFrame_write_ndjson(file)
file |
File path to which the result should be written. |
Invisibly returns the input DataFrame.
dat = as_polars_df(head(mtcars)) destination = tempfile() dat$select(pl$col("drat", "mpg"))$write_ndjson(destination) pl$read_ndjson(destination)
dat = as_polars_df(head(mtcars)) destination = tempfile() dat$select(pl$col("drat", "mpg"))$write_ndjson(destination) pl$read_ndjson(destination)
Write to parquet file
DataFrame_write_parquet( file, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, partition_by = NULL, partition_chunk_size_bytes = 4294967296 )
DataFrame_write_parquet( file, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, partition_by = NULL, partition_chunk_size_bytes = 4294967296 )
file |
File path to which the result should be written. This should be a path to a directory if writing a partitioned dataset. |
... |
Ignored. |
compression |
String. The compression method. One of:
|
compression_level |
|
statistics |
Whether statistics should be written to the Parquet headers. Possible values:
|
row_group_size |
|
data_page_size |
Size of the data page in bytes. If |
partition_by |
Column(s) to partition by. A partitioned dataset will be written if this is specified. |
partition_chunk_size_bytes |
Approximate size to split DataFrames within a single partition when writing. Note this is calculated using the size of the DataFrame in memory - the size of the output file may differ depending on the file format / compression. |
Invisibly returns the input DataFrame.
dat = as_polars_df(mtcars) # write data to a single parquet file destination = withr::local_tempfile(fileext = ".parquet") dat$write_parquet(destination) # write data to folder with a hive-partitioned structure dest_folder = withr::local_tempdir() dat$write_parquet(dest_folder, partition_by = c("gear", "cyl")) list.files(dest_folder, recursive = TRUE)
dat = as_polars_df(mtcars) # write data to a single parquet file destination = withr::local_tempfile(fileext = ".parquet") dat$write_parquet(destination) # write data to folder with a hive-partitioned structure dest_folder = withr::local_tempdir() dat$write_parquet(dest_folder, partition_by = c("gear", "cyl")) list.files(dest_folder, recursive = TRUE)
The Array and List datatypes are very similar. The only difference is that
sub-arrays all have the same length while sublists can have different lengths.
Array methods can be accessed via the $arr
subnamespace.
DataType_Array(datatype = "unknown", width)
DataType_Array(datatype = "unknown", width)
datatype |
An inner DataType. The default is |
width |
The length of the arrays. |
An array DataType with an inner DataType
# basic Array pl$Array(pl$Int32, 4) # some nested Array pl$Array(pl$Array(pl$Boolean, 3), 2)
# basic Array pl$Array(pl$Int32, 4) # some nested Array pl$Array(pl$Array(pl$Boolean, 3), 2)
Create Categorical DataType
DataType_Categorical(ordering = "physical")
DataType_Categorical(ordering = "physical")
ordering |
Either |
When a categorical variable is created, its string values (or "lexical" values) are stored and encoded as integers ("physical" values) by order of appearance. Therefore, sorting a categorical value can be done either on the lexical or on the physical values. See Examples.
A Categorical DataType
# default is to order by physical values df = pl$DataFrame(x = c("z", "z", "k", "a", "z"), schema = list(x = pl$Categorical())) df$sort("x") # when setting ordering = "lexical", sorting will be based on the strings df_lex = pl$DataFrame( x = c("z", "z", "k", "a", "z"), schema = list(x = pl$Categorical("lexical")) ) df_lex$sort("x")
# default is to order by physical values df = pl$DataFrame(x = c("z", "z", "k", "a", "z"), schema = list(x = pl$Categorical())) df$sort("x") # when setting ordering = "lexical", sorting will be based on the strings df_lex = pl$DataFrame( x = c("z", "z", "k", "a", "z"), schema = list(x = pl$Categorical("lexical")) ) df_lex$sort("x")
Check whether the data type contains categoricals
DataType_contains_categoricals()
DataType_contains_categoricals()
A logical value
pl$List(pl$Categorical())$contains_categoricals() pl$List(pl$Enum(c("a", "b")))$contains_categoricals() pl$List(pl$Float32)$contains_categoricals() pl$List(pl$List(pl$Categorical()))$contains_categoricals()
pl$List(pl$Categorical())$contains_categoricals() pl$List(pl$Enum(c("a", "b")))$contains_categoricals() pl$List(pl$Float32)$contains_categoricals() pl$List(pl$List(pl$Categorical()))$contains_categoricals()
Check whether the data type contains views
DataType_contains_views()
DataType_contains_views()
A logical value
pl$List(pl$String)$contains_views() pl$List(pl$Binary)$contains_views() pl$List(pl$Float32)$contains_views() pl$List(pl$List(pl$Binary))$contains_views()
pl$List(pl$String)$contains_views() pl$List(pl$Binary)$contains_views() pl$List(pl$Float32)$contains_views() pl$List(pl$List(pl$Binary))$contains_views()
The underlying representation of this type is a 64-bit signed integer. The integer indicates the number of time units since the Unix epoch (1970-01-01 00:00:00). The number can be negative to indicate datetimes before the epoch.
DataType_Datetime(time_unit = "us", time_zone = NULL)
DataType_Datetime(time_unit = "us", time_zone = NULL)
time_unit |
Unit of time. One of |
time_zone |
Time zone string, as defined in |
Datetime DataType
pl$Datetime("ns", "Pacific/Samoa") df = pl$DataFrame( naive_time = as.POSIXct("1900-01-01"), zoned_time = as.POSIXct("1900-01-01", "UTC") ) df df$select(pl$col(pl$Datetime("us", "*")))
pl$Datetime("ns", "Pacific/Samoa") df = pl$DataFrame( naive_time = as.POSIXct("1900-01-01"), zoned_time = as.POSIXct("1900-01-01", "UTC") ) df df$select(pl$col(pl$Datetime("us", "*")))
Data type representing a time duration
DataType_Duration(time_unit = "us")
DataType_Duration(time_unit = "us")
time_unit |
Unit of time. One of |
Duration DataType
test = pl$DataFrame( a = 1:2, b = c("a", "b"), c = pl$duration(weeks = c(1, 2), days = c(0, 2)) ) # select all columns of type "duration" test$select(pl$col(pl$Duration()))
test = pl$DataFrame( a = 1:2, b = c("a", "b"), c = pl$duration(weeks = c(1, 2), days = c(0, 2)) ) # select all columns of type "duration" test$select(pl$col(pl$Duration()))
An Enum
is a fixed set categorical encoding of a set of strings. It is
similar to the Categorical
data type, but the
categories are explicitly provided by the user and cannot be modified.
DataType_Enum(categories)
DataType_Enum(categories)
categories |
A character vector specifying the categories of the variable. |
This functionality is unstable. It is a work-in-progress feature and may not always work as expected. It may be changed at any point without it being considered a breaking change.
An Enum DataType
pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar"), schema = list(x = pl$Enum(c("Polar", "Panda", "Brown"))) ) # All values of the variable have to be in the categories dtype = pl$Enum(c("Polar", "Panda", "Brown")) tryCatch( pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), schema = list(x = dtype) ), error = function(e) e ) # Comparing two Enum is only valid if they have the same categories df = pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar"), y = c("Polar", "Polar", "Polar", "Brown", "Brown"), z = c("Polar", "Polar", "Polar", "Brown", "Brown"), schema = list( x = pl$Enum(c("Polar", "Panda", "Brown")), y = pl$Enum(c("Polar", "Panda", "Brown")), z = pl$Enum(c("Polar", "Black", "Brown")) ) ) # Same categories df$with_columns(x_eq_y = pl$col("x") == pl$col("y")) # Different categories tryCatch( df$with_columns(x_eq_z = pl$col("x") == pl$col("z")), error = function(e) e )
pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar"), schema = list(x = pl$Enum(c("Polar", "Panda", "Brown"))) ) # All values of the variable have to be in the categories dtype = pl$Enum(c("Polar", "Panda", "Brown")) tryCatch( pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), schema = list(x = dtype) ), error = function(e) e ) # Comparing two Enum is only valid if they have the same categories df = pl$DataFrame( x = c("Polar", "Panda", "Brown", "Brown", "Polar"), y = c("Polar", "Polar", "Polar", "Brown", "Brown"), z = c("Polar", "Polar", "Polar", "Brown", "Brown"), schema = list( x = pl$Enum(c("Polar", "Panda", "Brown")), y = pl$Enum(c("Polar", "Panda", "Brown")), z = pl$Enum(c("Polar", "Black", "Brown")) ) ) # Same categories df$with_columns(x_eq_y = pl$col("x") == pl$col("y")) # Different categories tryCatch( df$with_columns(x_eq_z = pl$col("x") == pl$col("z")), error = function(e) e )
Check whether the data type is an array type
DataType_is_array()
DataType_is_array()
A logical value
pl$Array(width = 2)$is_array() pl$Float32$is_array()
pl$Array(width = 2)$is_array() pl$Float32$is_array()
Check whether the data type is a binary type
DataType_is_binary()
DataType_is_binary()
A logical value
pl$Binary$is_binary() pl$Float32$is_binary()
pl$Binary$is_binary() pl$Float32$is_binary()
Check whether the data type is a boolean type
DataType_is_bool()
DataType_is_bool()
A logical value
pl$Boolean$is_bool() pl$Float32$is_bool()
pl$Boolean$is_bool() pl$Float32$is_bool()
Check whether the data type is a Categorical type
DataType_is_categorical()
DataType_is_categorical()
A logical value
pl$Categorical()$is_categorical() pl$Enum(c("a", "b"))$is_categorical()
pl$Categorical()$is_categorical() pl$Enum(c("a", "b"))$is_categorical()
Check whether the data type is an Enum type
DataType_is_enum()
DataType_is_enum()
A logical value
pl$Enum(c("a", "b"))$is_enum() pl$Categorical()$is_enum()
pl$Enum(c("a", "b"))$is_enum() pl$Categorical()$is_enum()
Check whether the data type is a float type
DataType_is_float()
DataType_is_float()
A logical value
pl$Float32$is_float() pl$Int32$is_float()
pl$Float32$is_float() pl$Int32$is_float()
Check whether the data type is an integer type
DataType_is_integer()
DataType_is_integer()
A logical value
pl$Int32$is_integer() pl$Float32$is_integer()
pl$Int32$is_integer() pl$Float32$is_integer()
Check whether the data type is known
DataType_is_known()
DataType_is_known()
A logical value
pl$String$is_known() pl$Unknown$is_known()
pl$String$is_known() pl$Unknown$is_known()
Check whether the data type is a list type
DataType_is_list()
DataType_is_list()
A logical value
pl$List()$is_list() pl$Float32$is_list()
pl$List()$is_list() pl$Float32$is_list()
Check whether the data type is a logical type
DataType_is_logical()
DataType_is_logical()
A logical value
Check whether the data type is a nested type
DataType_is_nested()
DataType_is_nested()
A logical value
pl$List()$is_nested() pl$Array(width = 2)$is_nested() pl$Float32$is_nested()
pl$List()$is_nested() pl$Array(width = 2)$is_nested() pl$Float32$is_nested()
Check whether the data type is a null type
DataType_is_null()
DataType_is_null()
A logical value
pl$Null$is_null() pl$Float32$is_null()
pl$Null$is_null() pl$Float32$is_null()
Check whether the data type is a numeric type
DataType_is_numeric()
DataType_is_numeric()
A logical value
pl$Float32$is_numeric() pl$Int32$is_numeric() pl$String$is_numeric()
pl$Float32$is_numeric() pl$Int32$is_numeric() pl$String$is_numeric()
Check whether the data type is an ordinal type
DataType_is_ord()
DataType_is_ord()
A logical value
pl$String$is_ord() pl$Categorical()$is_ord()
pl$String$is_ord() pl$Categorical()$is_ord()
Check whether the data type is a primitive type
DataType_is_primitive()
DataType_is_primitive()
A logical value
pl$Float32$is_primitive() pl$List()$is_primitive()
pl$Float32$is_primitive() pl$List()$is_primitive()
Check whether the data type is a signed integer type
DataType_is_signed_integer()
DataType_is_signed_integer()
A logical value
pl$Int32$is_signed_integer() pl$UInt32$is_signed_integer()
pl$Int32$is_signed_integer() pl$UInt32$is_signed_integer()
Check whether the data type is a String type
DataType_is_string()
DataType_is_string()
A logical value
pl$String$is_string() pl$Float32$is_string()
pl$String$is_string() pl$Float32$is_string()
Check whether the data type is a Struct type
DataType_is_struct()
DataType_is_struct()
A logical value
pl$Struct()$is_struct() pl$Float32$is_struct()
pl$Struct()$is_struct() pl$Float32$is_struct()
Check whether the data type is a temporal type
DataType_is_temporal()
DataType_is_temporal()
A logical value
pl$Date$is_temporal() pl$Float32$is_temporal()
pl$Date$is_temporal() pl$Float32$is_temporal()
Check whether the data type is an unsigned integer type
DataType_is_unsigned_integer()
DataType_is_unsigned_integer()
A logical value
pl$UInt32$is_unsigned_integer() pl$Int32$is_unsigned_integer()
pl$UInt32$is_unsigned_integer() pl$Int32$is_unsigned_integer()
Create List DataType
DataType_List(datatype = "unknown")
DataType_List(datatype = "unknown")
datatype |
The inner DataType. |
A list DataType with an inner DataType
# some nested List pl$List(pl$List(pl$Boolean)) # check if some maybe_list is a List DataType maybe_List = pl$List(pl$UInt64) pl$same_outer_dt(maybe_List, pl$List())
# some nested List pl$List(pl$List(pl$Boolean)) # check if some maybe_list is a List DataType maybe_List = pl$List(pl$UInt64) pl$same_outer_dt(maybe_List, pl$List())
One can create a Struct
data type with pl$Struct()
. There are also
multiple ways to create columns of data type Struct
in a DataFrame
or
a Series
, see the examples.
DataType_Struct(...)
DataType_Struct(...)
... |
Either named inputs of the form |
A Struct DataType containing a list of Fields
# create a Struct-DataType pl$Struct(foo = pl$Int32, pl$Field("bar", pl$Boolean)) # check if an element is any kind of Struct() test = pl$Struct(a = pl$UInt64) pl$same_outer_dt(test, pl$Struct()) # `test` is a type of Struct, but it doesn't mean it is equal to an empty Struct test == pl$Struct() # The way to create a `Series` of type `Struct` is a bit convoluted as it involves # `data.frame()`, `list()`, and `I()`: as_polars_series( data.frame(a = 1:2, b = I(list(c("x", "y"), "z"))) ) # A slightly simpler way would be via `tibble::tibble()` or # `data.table::data.table()`: if (requireNamespace("tibble", quietly = TRUE)) { as_polars_series( tibble::tibble(a = 1:2, b = list(c("x", "y"), "z")) ) } # Finally, one can use `pl$struct()` to convert existing columns or `Series` # to a `Struct`: x = pl$DataFrame( a = 1:2, b = list(c("x", "y"), "z") ) out = x$select(pl$struct(c("a", "b"))) out out$schema
# create a Struct-DataType pl$Struct(foo = pl$Int32, pl$Field("bar", pl$Boolean)) # check if an element is any kind of Struct() test = pl$Struct(a = pl$UInt64) pl$same_outer_dt(test, pl$Struct()) # `test` is a type of Struct, but it doesn't mean it is equal to an empty Struct test == pl$Struct() # The way to create a `Series` of type `Struct` is a bit convoluted as it involves # `data.frame()`, `list()`, and `I()`: as_polars_series( data.frame(a = 1:2, b = I(list(c("x", "y"), "z"))) ) # A slightly simpler way would be via `tibble::tibble()` or # `data.table::data.table()`: if (requireNamespace("tibble", quietly = TRUE)) { as_polars_series( tibble::tibble(a = 1:2, b = list(c("x", "y"), "z")) ) } # Finally, one can use `pl$struct()` to convert existing columns or `Series` # to a `Struct`: x = pl$DataFrame( a = 1:2, b = list(c("x", "y"), "z") ) out = x$select(pl$struct(c("a", "b"))) out out$schema
Get the dimensions
## S3 method for class 'RPolarsDataFrame' dim(x) ## S3 method for class 'RPolarsLazyFrame' dim(x)
## S3 method for class 'RPolarsDataFrame' dim(x) ## S3 method for class 'RPolarsLazyFrame' dim(x)
x |
Get the row and column names
## S3 method for class 'RPolarsDataFrame' dimnames(x) ## S3 method for class 'RPolarsLazyFrame' dimnames(x)
## S3 method for class 'RPolarsDataFrame' dimnames(x) ## S3 method for class 'RPolarsLazyFrame' dimnames(x)
x |
#Comments for how the R and python world translates into polars:
R and python are both high-level glue languages great for Data Science. Rust is a pedantic low-level language with similar use cases as C and C++. Polars is written in ~100k lines of rust and has a rust API. Py-polars, the python API for polars, is implemented as an interface with the rust API. r-polars is very parallel to py-polars except it interfaces with R. The performance and behavior are unexpectedly quite similar as the 'engine' is the exact same rust code and data structures.
info
Not applicable
R only has a native Int32 type, no UInt32, Int64, UInt64, ... types. These days Int32 is getting a bit small to refer to more than ~ 2^31-1 rows. There are packages which provide int64, but the most common 'hack' is to just use floats as 'integerish'. There is a unique float64 value for every integer up to about 2^52, which is plenty for all practical concerns. Some polars methods may accept or return a float even though an integer ideally would be more accurate. Most R functions intermix Int32 (integer) and Float64 (double) seamlessly.
R has allocated a value in every vector type to signal missingness, these are collectively
called NAs
. Polars uses a bool bitmask to signal NA
-like missing value and it is called Null
and Nulls
in plural. Not to confuse with R NULL
(see paragraph below). Polars supports
missingness for any possible type as it is kept separately in the bitmask. In python lists the
symbol None
can carry a similar meaning. R NA
~ polars Null
~ py-polars [None]
(in a py list)
From writing a lot of tests for all implementations, it appears polars does not have a
fully consistent nor well documented behavior, when it comes to comparisons and sorting of
floats. Though some general rules of thumb do apply:
Polars has chosen to define, for sorting, that Null
is a value lower than -Inf
as in
Expr.arg_min()
However except when Null
is ignored Expr.min()
, there is a Expr.nan_min()
but no Expr.nan_min()
.
NaN
is sometimes a value higher than Inf and sometimes regarded as a Null
.
Polars conventions NaN
> Inf
> 99
> -99
> -Inf
> Null
Null == Null
often yields false, sometimes true, sometimes Null
.
The documentation or examples do not reveal these variations. The best thing to do, when in doubt,
is to do a test sort on a small Series/Column of all values.
R NaN
~ polars NaN
~ python [float("NaN")]
#only floats have NaN
s
R Inf
~ polars inf
~ python [float("inf")]
#only floats have Inf
The R NULL does not exist inside polars frames and series and so on. It resembles the
Option::None in the hidden rust code. It resembles the python None
. In all three languages the
NULL
/None
/None
are used in this context as function argument to signal default behavior or
perhaps a deactivated feature. R NULL
does NOT translate into the polars bitmask Null
, that
is NA
. R NULL
~ rust-polars Option::None
~ pypolars None
#typically used for function
arguments
The following translations are relevant when loading data into polars. The R list appears
similar to python dictionary (hashmap), but is implemented more similar to the python list
(array of pointers). R lists do support string naming elements via a string vector.
In polars both lists (of vectors or series) and data.frames can be used to construct a polars
DataFrame, just as dictionaries would be used in python. In terms of loading in/out data the
following translation holds: R data.frame
/list
~ polars DataFrame
~ python dictionary
The R vector (Integer, Double, Character, ...) resembles the Series as both are external from any
frame and can be of any length. The implementation is quite different. E.g. for
-loop appending
to an R vector is considered quite bad for performance. The vector will be fully rewritten in
memory for every append. The polars Series has chunked memory allocation, which allows any
append data to be written only. However fragmented memory is not great for fast computations and
polars objects have a rechunk
()-method, to reallocate chunks into one. Rechunk might be called
implicitly by polars. In the context of constructing Series and extracting data, the following
translation holds: R vector
~ polars Series
/column
~ python list
The polars Expr do not have any base R counterpart. Expr are analogous to how ggplot split
plotting instructions from the rendering. Base R plot immediately pushes any instruction by
adding e.g. pixels to a .png canvas. ggplot
collects instructions and in the end when executed
the rendering can be performed with optimization across all instructions. Btw ggplot
command-syntax is a monoid meaning the order does not matter, that is not the case for polars
Expr. Polars Expr's can be understood as a DSL (domain specific language) that expresses syntax
trees of instructions. R expressions evaluate to syntax trees also, but it is difficult to optimize
the execution order automatically, without rewriting the code. A great selling point of Polars is
that any query will be optimized. Expr are very light-weight symbols chained together.
Aggregate a DataFrame over a time or integer window created with
$group_by_dynamic()
.
DynamicGroupBy_agg(...)
DynamicGroupBy_agg(...)
... |
Exprs to aggregate over. Those can also be passed wrapped in a
list, e.g |
An aggregated DataFrame
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() ) # using "include_boundaries = TRUE" is helpful to see the period considered df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") ) # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally df$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") ) # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: df$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") ) # Dynamic group bys can also be combined with grouping on normal keys df = df$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) df df$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n")) # We can also create a dynamic group by based on an index column df = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) df df$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() ) # using "include_boundaries = TRUE" is helpful to see the period considered df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") ) # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally df$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") ) # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: df$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") ) # Dynamic group bys can also be combined with grouping on normal keys df = df$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) df df$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n")) # We can also create a dynamic group by based on an index column df = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) df df$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))
This class comes from <DataFrame>$group_by_dynamic()
.
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) # get the sum in the following hour relative to the "time" column df$group_by_dynamic("time", every = "1h")
Revert the $group_by_dynamic()
operation. Doing
<DataFrame>$group_by_dynamic(...)$ungroup()
returns the original DataFrame
.
DynamicGroupBy_ungroup()
DynamicGroupBy_ungroup()
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) df df$group_by_dynamic("time", every = "1h")$ungroup()
df = pl$DataFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) df df$group_by_dynamic("time", every = "1h")$ungroup()
Compute the absolute values
Expr_abs()
Expr_abs()
Expr
pl$DataFrame(a = -1:1)$ with_columns(abs = pl$col("a")$abs())
pl$DataFrame(a = -1:1)$ with_columns(abs = pl$col("a")$abs())
Method equivalent of addition operator expr + other
.
Expr_add(other)
Expr_add(other)
other |
numeric or string value; accepts expression input. |
df = pl$DataFrame(x = 1:5) df$with_columns( `x+int` = pl$col("x")$add(2L), `x+expr` = pl$col("x")$add(pl$col("x")$cum_prod()) ) df = pl$DataFrame( x = c("a", "d", "g"), y = c("b", "e", "h"), z = c("c", "f", "i") ) df$with_columns( pl$col("x")$add(pl$col("y"))$add(pl$col("z"))$alias("xyz") )
df = pl$DataFrame(x = 1:5) df$with_columns( `x+int` = pl$col("x")$add(2L), `x+expr` = pl$col("x")$add(pl$col("x")$cum_prod()) ) df = pl$DataFrame( x = c("a", "d", "g"), y = c("b", "e", "h"), z = c("c", "f", "i") ) df$with_columns( pl$col("x")$add(pl$col("y"))$add(pl$col("z"))$alias("xyz") )
Get the group indexes of the group by operation. Should be used in aggregation context only.
Expr_agg_groups()
Expr_agg_groups()
Expr
df = pl$DataFrame(list( group = c("one", "one", "one", "two", "two", "two"), value = c(94, 95, 96, 97, 97, 99) )) df$group_by("group", maintain_order = TRUE)$agg(pl$col("value")$agg_groups())
df = pl$DataFrame(list( group = c("one", "one", "one", "two", "two", "two"), value = c(94, 95, 96, 97, 97, 99) )) df$group_by("group", maintain_order = TRUE)$agg(pl$col("value")$agg_groups())
Rename the output of an expression.
Expr_alias(name)
Expr_alias(name)
name |
New name of output |
Expr
pl$col("bob")$alias("alice")
pl$col("bob")$alias("alice")
Check if all values in a Boolean column are TRUE
. This method is an
expression - not to be confused with pl$all()
which is a function
to select all columns.
Expr_all(..., ignore_nulls = TRUE)
Expr_all(..., ignore_nulls = TRUE)
... |
Ignored. |
ignore_nulls |
If |
A logical value
df = pl$DataFrame( a = c(TRUE, TRUE), b = c(TRUE, FALSE), c = c(NA, TRUE), d = c(NA, NA) ) # By default, ignore null values. If there are only nulls, then all() returns # TRUE. df$select(pl$col("*")$all()) # If we set ignore_nulls = FALSE, then we don't know if all values in column # "c" are TRUE, so it returns null df$select(pl$col("*")$all(ignore_nulls = FALSE))
df = pl$DataFrame( a = c(TRUE, TRUE), b = c(TRUE, FALSE), c = c(NA, TRUE), d = c(NA, NA) ) # By default, ignore null values. If there are only nulls, then all() returns # TRUE. df$select(pl$col("*")$all()) # If we set ignore_nulls = FALSE, then we don't know if all values in column # "c" are TRUE, so it returns null df$select(pl$col("*")$all(ignore_nulls = FALSE))
Combine two boolean expressions with AND.
Expr_and(other)
Expr_and(other)
other |
numeric or string value; accepts expression input. |
pl$lit(TRUE) & TRUE pl$lit(TRUE)$and(pl$lit(TRUE))
pl$lit(TRUE) & TRUE pl$lit(TRUE)$and(pl$lit(TRUE))
Check if any boolean value in a Boolean column is TRUE
.
Expr_any(..., ignore_nulls = TRUE)
Expr_any(..., ignore_nulls = TRUE)
... |
Ignored. |
ignore_nulls |
If |
A logical value
df = pl$DataFrame( a = c(TRUE, FALSE), b = c(FALSE, FALSE), c = c(NA, FALSE) ) df$select(pl$col("*")$any()) # If we set ignore_nulls = FALSE, then we don't know if any values in column # "c" is TRUE, so it returns null df$select(pl$col("*")$any(ignore_nulls = FALSE))
df = pl$DataFrame( a = c(TRUE, FALSE), b = c(FALSE, FALSE), c = c(NA, FALSE) ) df$select(pl$col("*")$any()) # If we set ignore_nulls = FALSE, then we don't know if any values in column # "c" is TRUE, so it returns null df$select(pl$col("*")$any(ignore_nulls = FALSE))
This is done by adding the chunks of other
to this output
.
Expr_append(other, upcast = TRUE)
Expr_append(other, upcast = TRUE)
other |
Expr or something coercible to an Expr. |
upcast |
Cast both Expr to a common supertype if they have one. |
Expr
# append bottom to to row df = pl$DataFrame(list(a = 1:3, b = c(NA_real_, 4, 5))) df$select(pl$all()$head(1)$append(pl$all()$tail(1))) # implicit upcast, when default = TRUE pl$DataFrame(list())$select(pl$lit(42)$append(42L)) pl$DataFrame(list())$select(pl$lit(42)$append(FALSE)) pl$DataFrame(list())$select(pl$lit("Bob")$append(FALSE))
# append bottom to to row df = pl$DataFrame(list(a = 1:3, b = c(NA_real_, 4, 5))) df$select(pl$all()$head(1)$append(pl$all()$tail(1))) # implicit upcast, when default = TRUE pl$DataFrame(list())$select(pl$lit(42)$append(42L)) pl$DataFrame(list())$select(pl$lit(42)$append(FALSE)) pl$DataFrame(list())$select(pl$lit("Bob")$append(FALSE))
This is done using the HyperLogLog++ algorithm for cardinality estimation.
Expr_approx_n_unique()
Expr_approx_n_unique()
Expr
as_polars_df(mtcars)$select(count = pl$col("cyl")$approx_n_unique())
as_polars_df(mtcars)$select(count = pl$col("cyl")$approx_n_unique())
Compute inverse cosine
Expr_arccos()
Expr_arccos()
Expr
pl$DataFrame(a = c(-1, cos(0.5), 0, 1, NA_real_))$ with_columns(arccos = pl$col("a")$arccos())
pl$DataFrame(a = c(-1, cos(0.5), 0, 1, NA_real_))$ with_columns(arccos = pl$col("a")$arccos())
Compute inverse hyperbolic cosine
Expr_arccosh()
Expr_arccosh()
Expr
pl$DataFrame(a = c(-1, cosh(0.5), 0, 1, NA_real_))$ with_columns(arccosh = pl$col("a")$arccosh())
pl$DataFrame(a = c(-1, cosh(0.5), 0, 1, NA_real_))$ with_columns(arccosh = pl$col("a")$arccosh())
Compute inverse sine
Expr_arcsin()
Expr_arcsin()
Expr
pl$DataFrame(a = c(-1, sin(0.5), 0, 1, NA_real_))$ with_columns(arcsin = pl$col("a")$arcsin())
pl$DataFrame(a = c(-1, sin(0.5), 0, 1, NA_real_))$ with_columns(arcsin = pl$col("a")$arcsin())
Compute inverse hyperbolic sine
Expr_arcsinh()
Expr_arcsinh()
Expr
pl$DataFrame(a = c(-1, sinh(0.5), 0, 1, NA_real_))$ with_columns(arcsinh = pl$col("a")$arcsinh())
pl$DataFrame(a = c(-1, sinh(0.5), 0, 1, NA_real_))$ with_columns(arcsinh = pl$col("a")$arcsinh())
Compute inverse tangent
Expr_arctan()
Expr_arctan()
Expr
pl$DataFrame(a = c(-1, tan(0.5), 0, 1, NA_real_))$ with_columns(arctan = pl$col("a")$arctan())
pl$DataFrame(a = c(-1, tan(0.5), 0, 1, NA_real_))$ with_columns(arctan = pl$col("a")$arctan())
Compute inverse hyperbolic tangent
Expr_arctanh()
Expr_arctanh()
Expr
pl$DataFrame(a = c(-1, tanh(0.5), 0, 1, NA_real_))$ with_columns(arctanh = pl$col("a")$arctanh())
pl$DataFrame(a = c(-1, tanh(0.5), 0, 1, NA_real_))$ with_columns(arctanh = pl$col("a")$arctanh())
Get the index of the maximal value.
Expr_arg_max()
Expr_arg_max()
Expr
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_max = pl$col("a")$arg_max())
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_max = pl$col("a")$arg_max())
Get the index of the minimal value.
Expr_arg_min()
Expr_arg_min()
Expr
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_min = pl$col("a")$arg_min())
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_min = pl$col("a")$arg_min())
Get the index values that would sort this column.
Expr_arg_sort(descending = FALSE, nulls_last = FALSE)
Expr_arg_sort(descending = FALSE, nulls_last = FALSE)
descending |
A logical. If |
nulls_last |
A logical. If |
Expr
pl$arg_sort_by() to find the row indices that would sort multiple columns.
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_sorted = pl$col("a")$arg_sort())
pl$DataFrame( a = c(6, 1, 0, NA, Inf, NaN) )$with_columns(arg_sorted = pl$col("a")$arg_sort())
This finds the position of first occurrence of each unique value.
Expr_arg_unique()
Expr_arg_unique()
Expr
pl$select(pl$lit(c(1:2, 1:3))$arg_unique())
pl$select(pl$lit(c(1:2, 1:3))$arg_unique())
Fill missing values with the next to be seen values. Syntactic sugar for
$fill_null(strategy = "backward")
.
Expr_backward_fill(limit = NULL)
Expr_backward_fill(limit = NULL)
limit |
Number of consecutive null values to fill when using the
|
Expr
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( backward = pl$col("a")$backward_fill() )
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( backward = pl$col("a")$backward_fill() )
Return the k
smallest elements. This has time complexity: $O(n + k \log n - \frac{k}{2})$
Expr_bottom_k(k)
Expr_bottom_k(k)
k |
Number of smallest values to get. |
Expr
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$bottom_k(5))
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$bottom_k(5))
Cast between DataType
Expr_cast(dtype, strict = TRUE)
Expr_cast(dtype, strict = TRUE)
dtype |
DataType to cast to. |
strict |
If |
Expr
df = pl$DataFrame(a = 1:3, b = c(1, 2, 3)) df$with_columns( pl$col("a")$cast(pl$dtypes$Float64), pl$col("b")$cast(pl$dtypes$Int32) ) # strict FALSE, inserts null for any cast failure pl$lit(c(100, 200, 300))$cast(pl$dtypes$UInt8, strict = FALSE)$to_series() # strict TRUE, raise any failure as an error when query is executed. tryCatch( { pl$lit("a")$cast(pl$dtypes$Float64, strict = TRUE)$to_series() }, error = function(e) e )
df = pl$DataFrame(a = 1:3, b = c(1, 2, 3)) df$with_columns( pl$col("a")$cast(pl$dtypes$Float64), pl$col("b")$cast(pl$dtypes$Int32) ) # strict FALSE, inserts null for any cast failure pl$lit(c(100, 200, 300))$cast(pl$dtypes$UInt8, strict = FALSE)$to_series() # strict TRUE, raise any failure as an error when query is executed. tryCatch( { pl$lit("a")$cast(pl$dtypes$Float64, strict = TRUE)$to_series() }, error = function(e) e )
Rounds up to the nearest integer value. Only works on floating point Series.
Expr_ceil()
Expr_ceil()
Expr
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( ceiling = pl$col("a")$ceil() )
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( ceiling = pl$col("a")$ceil() )
Expressions are all the functions and methods that are applicable to a Polars DataFrame or LazyFrame object. Some methods are under the sub-namespaces.
$arr
stores all array related methods.
$bin
stores all binary related methods.
$cat
stores all categorical related methods.
$dt
stores all temporal related methods.
$list
stores all list related methods.
$meta
stores all methods for working with the meta data.
$name
stores all name related methods.
$str
stores all string related methods.
$struct
stores all struct related methods.
df = pl$DataFrame( a = 1:2, b = list(1:2, 3:4), schema = list(a = pl$Int64, b = pl$Array(pl$Int64, 2)) ) df$select(pl$col("a")$first()) df$select(pl$col("b")$arr$sum())
df = pl$DataFrame( a = 1:2, b = list(1:2, 3:4), schema = list(a = pl$Int64, b = pl$Array(pl$Int64, 2)) ) df$select(pl$col("a")$first()) df$select(pl$col("b")$arr$sum())
Set values outside the given boundaries to the boundary value. This only works for numeric and temporal values.
Expr_clip(lower_bound = NULL, upper_bound = NULL)
Expr_clip(lower_bound = NULL, upper_bound = NULL)
lower_bound |
Lower bound. Accepts expression input. Strings are parsed as column names and other non-expression inputs are parsed as literals. |
upper_bound |
Upper bound. Accepts expression input. Strings are parsed as column names and other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame(foo = c(-50L, 5L, NA_integer_, 50L), bound = c(1, 10, 1, 1)) # With the two bounds df$with_columns(clipped = pl$col("foo")$clip(1, 10)) # Without lower bound df$with_columns(clipped = pl$col("foo")$clip(upper_bound = 10)) # Using another column as lower bound df$with_columns(clipped = pl$col("foo")$clip(lower_bound = "bound"))
df = pl$DataFrame(foo = c(-50L, 5L, NA_integer_, 50L), bound = c(1, 10, 1, 1)) # With the two bounds df$with_columns(clipped = pl$col("foo")$clip(1, 10)) # Without lower bound df$with_columns(clipped = pl$col("foo")$clip(upper_bound = 10)) # Using another column as lower bound df$with_columns(clipped = pl$col("foo")$clip(lower_bound = "bound"))
Compute cosine
Expr_cos()
Expr_cos()
Expr
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(cosine = pl$col("a")$cos())
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(cosine = pl$col("a")$cos())
Compute hyperbolic cosine
Expr_cosh()
Expr_cosh()
Expr
pl$DataFrame(a = c(-1, acosh(2), 0, 1, NA_real_))$ with_columns(cosh = pl$col("a")$cosh())
pl$DataFrame(a = c(-1, acosh(2), 0, 1, NA_real_))$ with_columns(cosh = pl$col("a")$cosh())
Count the number of elements in this expression. Note that NULL
values are
also counted. $len()
is an alias.
Expr_count() Expr_len()
Expr_count() Expr_len()
Expr
pl$DataFrame( all = c(TRUE, TRUE), any = c(TRUE, FALSE), none = c(FALSE, FALSE) )$select( pl$all()$count() )
pl$DataFrame( all = c(TRUE, TRUE), any = c(TRUE, FALSE), none = c(FALSE, FALSE) )$select( pl$all()$count() )
Get an array with the cumulative count (zero-indexed) computed at every element.
Expr_cum_count(reverse = FALSE)
Expr_cum_count(reverse = FALSE)
reverse |
If |
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
$cum_count()
does not seem to count within lists.
Expr
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_count()$alias("cum_count"), pl$col("a")$cum_count(reverse = TRUE)$alias("cum_count_reversed") )
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_count()$alias("cum_count"), pl$col("a")$cum_count(reverse = TRUE)$alias("cum_count_reversed") )
Get an array with the cumulative max computed at every element.
Expr_cum_max(reverse = FALSE)
Expr_cum_max(reverse = FALSE)
reverse |
If |
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
Expr
pl$DataFrame(a = c(1:4, 2L))$with_columns( pl$col("a")$cum_max()$alias("cummux"), pl$col("a")$cum_max(reverse = TRUE)$alias("cum_max_reversed") )
pl$DataFrame(a = c(1:4, 2L))$with_columns( pl$col("a")$cum_max()$alias("cummux"), pl$col("a")$cum_max(reverse = TRUE)$alias("cum_max_reversed") )
Get an array with the cumulative min computed at every element.
Expr_cum_min(reverse = FALSE)
Expr_cum_min(reverse = FALSE)
reverse |
If |
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
Expr
pl$DataFrame(a = c(1:4, 2L))$with_columns( pl$col("a")$cum_min()$alias("cum_min"), pl$col("a")$cum_min(reverse = TRUE)$alias("cum_min_reversed") )
pl$DataFrame(a = c(1:4, 2L))$with_columns( pl$col("a")$cum_min()$alias("cum_min"), pl$col("a")$cum_min(reverse = TRUE)$alias("cum_min_reversed") )
Get an array with the cumulative product computed at every element.
Expr_cum_prod(reverse = FALSE)
Expr_cum_prod(reverse = FALSE)
reverse |
If |
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
Expr
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_prod()$alias("cum_prod"), pl$col("a")$cum_prod(reverse = TRUE)$alias("cum_prod_reversed") )
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_prod()$alias("cum_prod"), pl$col("a")$cum_prod(reverse = TRUE)$alias("cum_prod_reversed") )
Get an array with the cumulative sum computed at every element.
Expr_cum_sum(reverse = FALSE)
Expr_cum_sum(reverse = FALSE)
reverse |
If |
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
Expr
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_sum()$alias("cum_sum"), pl$col("a")$cum_sum(reverse = TRUE)$alias("cum_sum_reversed") )
pl$DataFrame(a = 1:4)$with_columns( pl$col("a")$cum_sum()$alias("cum_sum"), pl$col("a")$cum_sum(reverse = TRUE)$alias("cum_sum_reversed") )
Run an expression over a sliding window that increases by 1
slot every
iteration.
Expr_cumulative_eval(expr, min_periods = 1L, parallel = FALSE)
Expr_cumulative_eval(expr, min_periods = 1L, parallel = FALSE)
expr |
Expression to evaluate. |
min_periods |
Number of valid (non-null) values there should be in the window before the expression is evaluated. |
parallel |
Run in parallel. Don't do this in a groupby or another operation that already has much parallelization. |
This can be really slow as it can have O(n^2)
complexity. Don't use this
for operations that visit all elements.
Expr
pl$lit(1:5)$cumulative_eval( pl$element()$first() - pl$element()$last()^2 )$to_r()
pl$lit(1:5)$cumulative_eval( pl$element()$first() - pl$element()$last()^2 )$to_r()
Bin continuous values into discrete categories
Expr_cut( breaks, ..., labels = NULL, left_closed = FALSE, include_breaks = FALSE )
Expr_cut( breaks, ..., labels = NULL, left_closed = FALSE, include_breaks = FALSE )
breaks |
Unique cut points. |
... |
Ignored. |
labels |
Names of the categories. The number of labels must be equal to the number of cut points plus one. |
left_closed |
Set the intervals to be left-closed instead of right-closed. |
include_breaks |
Include a column with the right endpoint of the bin each
observation falls in. This will change the data type of the output from a
|
Expr of data type Categorical
if include_breaks
is FALSE
and
of data type Struct
if include_breaks
is TRUE
.
df = pl$DataFrame(foo = c(-2, -1, 0, 1, 2)) df$with_columns( cut = pl$col("foo")$cut(c(-1, 1), labels = c("a", "b", "c")) ) # Add both the category and the breakpoint df$with_columns( cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE) )$unnest("cut")
df = pl$DataFrame(foo = c(-2, -1, 0, 1, 2)) df$with_columns( cut = pl$col("foo")$cut(c(-1, 1), labels = c("a", "b", "c")) ) # Add both the category and the breakpoint df$with_columns( cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE) )$unnest("cut")
Calculate the n-th discrete difference.
Expr_diff(n = 1, null_behavior = c("ignore", "drop"))
Expr_diff(n = 1, null_behavior = c("ignore", "drop"))
n |
Number of slots to shift. |
null_behavior |
String, either |
Expr
pl$DataFrame(a = c(20L, 10L, 30L, 40L))$with_columns( diff_default = pl$col("a")$diff(), diff_2_ignore = pl$col("a")$diff(2, "ignore") )
pl$DataFrame(a = c(20L, 10L, 30L, 40L))$with_columns( diff_default = pl$col("a")$diff(), diff_2_ignore = pl$col("a")$diff(2, "ignore") )
Method equivalent of float division operator expr / other
.
Expr_div(other)
Expr_div(other)
other |
Numeric literal or expression value. |
Zero-division behaviour follows IEEE-754:
0/0
: Invalid operation - mathematically undefined, returns NaN
.
n/0
: On finite operands gives an exact infinite result, e.g.: ±infinity.
df = pl$DataFrame( x = -2:2, y = c(0.5, 0, 0, -4, -0.5) ) df$with_columns( `x/2` = pl$col("x")$div(2), `x/y` = pl$col("x")$div(pl$col("y")) )
df = pl$DataFrame( x = -2:2, y = c(0.5, 0, 0, -4, -0.5) ) df$with_columns( `x/2` = pl$col("x")$div(2), `x/y` = pl$col("x")$div(pl$col("y")) )
Compute the dot/inner product between two Expressions.
Expr_dot(other)
Expr_dot(other)
other |
numeric or string value; accepts expression input. |
pl$DataFrame( a = 1:4, b = c(1, 2, 3, 4) )$with_columns( pl$col("a")$dot(pl$col("b"))$alias("a dot b"), pl$col("a")$dot(pl$col("a"))$alias("a dot a") )
pl$DataFrame( a = 1:4, b = c(1, 2, 3, 4) )$with_columns( pl$col("a")$dot(pl$col("b"))$alias("a dot b"), pl$col("a")$dot(pl$col("a"))$alias("a dot a") )
Drop NaN
Expr_drop_nans()
Expr_drop_nans()
Note that NaN
values are not null
values. Null values correspond to NA
in R.
Expr
drop_nulls()
pl$DataFrame(list(x = c(1, 2, NaN, NA)))$select(pl$col("x")$drop_nans())
pl$DataFrame(list(x = c(1, 2, NaN, NA)))$select(pl$col("x")$drop_nans())
Drop missing values
Expr_drop_nulls()
Expr_drop_nulls()
Expr
drop_nans()
pl$DataFrame(list(x = c(1, 2, NaN, NA)))$select(pl$col("x")$drop_nulls())
pl$DataFrame(list(x = c(1, 2, NaN, NA)))$select(pl$col("x")$drop_nulls())
The entropy is measured with the formula -sum(pk * log(pk))
where pk
are
discrete probabilities.
Expr_entropy(base = base::exp(1), normalize = TRUE)
Expr_entropy(base = base::exp(1), normalize = TRUE)
base |
Given exponential base, defaults to |
normalize |
Normalize |
Expr
pl$DataFrame(x = c(1, 2, 3, 2))$ with_columns(entropy = pl$col("x")$entropy(base = 2))
pl$DataFrame(x = c(1, 2, 3, 2))$ with_columns(entropy = pl$col("x")$entropy(base = 2))
Method equivalent of equality operator expr == other
.
Expr_eq(other)
Expr_eq(other)
other |
numeric or string value; accepts expression input. |
pl$lit(2) == 2 pl$lit(2) == pl$lit(2) pl$lit(2)$eq(pl$lit(2))
pl$lit(2) == 2 pl$lit(2) == pl$lit(2) pl$lit(2)$eq(pl$lit(2))
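Check equality without null
propagation. Method equivalent of equality operator expr == other, except that null values are compared as ordinary values instead of being propagated
.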
Expr_eq_missing(other)
Expr_eq_missing(other)
other |
numeric or string value; accepts expression input. |
df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) df$with_columns( eq = pl$col("x")$eq("y"), eq_missing = pl$col("x")$eq_missing("y") )
df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) df$with_columns( eq = pl$col("x")$eq("y"), eq_missing = pl$col("x")$eq_missing("y") )
Exponentially-weighted moving average
Expr_ewm_mean( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, min_periods = 1L, ignore_nulls = TRUE )
Expr_ewm_mean( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, min_periods = 1L, ignore_nulls = TRUE )
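com |
Specify decay in terms of center of mass, $\gamma$, with $\alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0$ |
span |
Specify decay in terms of span, $\theta$, with $\alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1$ |
half_life |
Specify decay in terms of half-life, $\lambda$, with $\alpha = 1 - \exp\left\{ \frac{-\ln(2)}{\lambda} \right\} \; \forall \; \lambda > 0$ |
alpha |
Specify smoothing factor alpha directly, $0 < \alpha \leq 1$ |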
adjust |
Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings:
|
min_periods |
Minimum number of observations in window required to have a value (otherwise result is null). |
ignore_nulls |
Ignore missing values when calculating weights:
|
Expr
pl$DataFrame(a = 1:3)$ with_columns(ewm_mean = pl$col("a")$ewm_mean(com = 1))
pl$DataFrame(a = 1:3)$ with_columns(ewm_mean = pl$col("a")$ewm_mean(com = 1))
Exponentially-weighted moving standard deviation
Expr_ewm_std( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, bias = FALSE, min_periods = 1L, ignore_nulls = TRUE )
Expr_ewm_std( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, bias = FALSE, min_periods = 1L, ignore_nulls = TRUE )
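com |
Specify decay in terms of center of mass, $\gamma$, with $\alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0$ |
span |
Specify decay in terms of span, $\theta$, with $\alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1$ |
half_life |
Specify decay in terms of half-life, $\lambda$, with $\alpha = 1 - \exp\left\{ \frac{-\ln(2)}{\lambda} \right\} \; \forall \; \lambda > 0$ |
alpha |
Specify smoothing factor alpha directly, $0 < \alpha \leq 1$ |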
adjust |
Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings:
|
bias |
If |
min_periods |
Minimum number of observations in window required to have a value (otherwise result is null). |
ignore_nulls |
Ignore missing values when calculating weights:
|
Expr
pl$DataFrame(a = 1:3)$ with_columns(ewm_std = pl$col("a")$ewm_std(com = 1))
pl$DataFrame(a = 1:3)$ with_columns(ewm_std = pl$col("a")$ewm_std(com = 1))
Exponentially-weighted moving variance
Expr_ewm_var( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, bias = FALSE, min_periods = 1L, ignore_nulls = TRUE )
Expr_ewm_var( com = NULL, span = NULL, half_life = NULL, alpha = NULL, adjust = TRUE, bias = FALSE, min_periods = 1L, ignore_nulls = TRUE )
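com |
Specify decay in terms of center of mass, $\gamma$, with $\alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0$ |
span |
Specify decay in terms of span, $\theta$, with $\alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1$ |
half_life |
Specify decay in terms of half-life, $\lambda$, with $\alpha = 1 - \exp\left\{ \frac{-\ln(2)}{\lambda} \right\} \; \forall \; \lambda > 0$ |
alpha |
Specify smoothing factor alpha directly, $0 < \alpha \leq 1$ |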
adjust |
Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings:
|
bias |
If |
min_periods |
Minimum number of observations in window required to have a value (otherwise result is null). |
ignore_nulls |
Ignore missing values when calculating weights:
|
Expr
pl$DataFrame(a = 1:3)$ with_columns(ewm_var = pl$col("a")$ewm_var(com = 1))
pl$DataFrame(a = 1:3)$ with_columns(ewm_var = pl$col("a")$ewm_var(com = 1))
Exclude certain columns from selection
Expr_exclude(columns)
Expr_exclude(columns)
columns |
Given param type:
|
Expr
# make DataFrame df = as_polars_df(iris) # by name(s) df$select(pl$all()$exclude("Species")) # by type df$select(pl$all()$exclude(pl$Categorical())) df$select(pl$all()$exclude(list(pl$Categorical(), pl$Float64))) # by regex df$select(pl$all()$exclude("^Sepal.*$"))
# make DataFrame df = as_polars_df(iris) # by name(s) df$select(pl$all()$exclude("Species")) # by type df$select(pl$all()$exclude(pl$Categorical())) df$select(pl$all()$exclude(list(pl$Categorical(), pl$Float64))) # by regex df$select(pl$all()$exclude("^Sepal.*$"))
Compute the exponential of the elements
Expr_exp()
Expr_exp()
Expr
pl$DataFrame(a = -1:3)$with_columns(a_exp = pl$col("a")$exp())
pl$DataFrame(a = -1:3)$with_columns(a_exp = pl$col("a")$exp())
This means that every item is expanded to a new row.
Expr_explode()
Expr_explode()
Categorical values are not supported.
Expr
df = pl$DataFrame(x = c("abc", "ab"), y = c(list(1:3), list(3:5))) df df$select(pl$col("y")$explode())
df = pl$DataFrame(x = c("abc", "ab"), y = c(list(1:3), list(3:5))) df df$select(pl$col("y")$explode())
Extend the Series with given number of values.
Expr_extend_constant(value, n)
Expr_extend_constant(value, n)
value |
The value to extend the Series with. This value may be |
n |
The number of values to extend. |
Expr
pl$select(pl$lit(1:4)$extend_constant(10.1, 2)) pl$select(pl$lit(1:4)$extend_constant(NULL, 2))
pl$select(pl$lit(1:4)$extend_constant(10.1, 2)) pl$select(pl$lit(1:4)$extend_constant(NULL, 2))
Fill floating point NaN value with a fill value
Expr_fill_nan(value = NULL)
Expr_fill_nan(value = NULL)
value |
Value used to fill |
Expr
pl$DataFrame(a = c(NaN, 1, NaN, 2, NA))$ with_columns( literal = pl$col("a")$fill_nan(999), # implicit coercion to string string = pl$col("a")$fill_nan("invalid") )
pl$DataFrame(a = c(NaN, 1, NaN, 2, NA))$ with_columns( literal = pl$col("a")$fill_nan(999), # implicit coercion to string string = pl$col("a")$fill_nan("invalid") )
Fill null values with a value or strategy
Expr_fill_null(value = NULL, strategy = NULL, limit = NULL)
Expr_fill_null(value = NULL, strategy = NULL, limit = NULL)
value |
Expr or something coercible in an Expr |
strategy |
Possible choices are |
limit |
Number of consecutive null values to fill when using the
|
Expr
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( value = pl$col("a")$fill_null(999), backward = pl$col("a")$fill_null(strategy = "backward"), mean = pl$col("a")$fill_null(strategy = "mean") )
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( value = pl$col("a")$fill_null(999), backward = pl$col("a")$fill_null(strategy = "backward"), mean = pl$col("a")$fill_null(strategy = "mean") )
Mostly useful in an aggregation context. If you want to filter on a
DataFrame level, use DataFrame$filter()
(or LazyFrame$filter()
).
Expr_filter(predicate)
Expr_filter(predicate)
predicate |
An Expr or something coercible to an Expr. Must return a boolean. |
Expr
df = pl$DataFrame( group_col = c("g1", "g1", "g2"), b = c(1, 2, 3) ) df df$group_by("group_col")$agg( lt = pl$col("b")$filter(pl$col("b") < 2), gte = pl$col("b")$filter(pl$col("b") >= 2) )
df = pl$DataFrame( group_col = c("g1", "g1", "g2"), b = c(1, 2, 3) ) df df$group_by("group_col")$agg( lt = pl$col("b")$filter(pl$col("b") < 2), gte = pl$col("b")$filter(pl$col("b") >= 2) )
Get the first value.
Expr_first()
Expr_first()
Expr
pl$DataFrame(x = 3:1)$with_columns(first = pl$col("x")$first())
pl$DataFrame(x = 3:1)$with_columns(first = pl$col("x")$first())
This is an alias for <Expr>$explode()
.
Expr_flatten()
Expr_flatten()
Expr
df = pl$DataFrame(x = c("abc", "ab"), y = c(list(1:3), list(3:5))) df df$select(pl$col("y")$flatten())
df = pl$DataFrame(x = c("abc", "ab"), y = c(list(1:3), list(3:5))) df df$select(pl$col("y")$flatten())
Rounds down to the nearest integer value. Only works on floating point Series.
Expr_floor()
Expr_floor()
Expr
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( floor = pl$col("a")$floor() )
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( floor = pl$col("a")$floor() )
Method equivalent of floor division operator expr %/% other
.
Expr_floor_div(other)
Expr_floor_div(other)
other |
Numeric literal or expression value. |
df = pl$DataFrame(x = 1:5) df$with_columns( `x/2` = pl$col("x")$div(2), `x%/%2` = pl$col("x")$floor_div(2) )
df = pl$DataFrame(x = 1:5) df$with_columns( `x/2` = pl$col("x")$div(2), `x%/%2` = pl$col("x")$floor_div(2) )
Fill missing values with the last seen values. Syntactic sugar for
$fill_null(strategy = "forward")
.
Expr_forward_fill(limit = NULL)
Expr_forward_fill(limit = NULL)
limit |
Number of consecutive null values to fill when using the
|
Expr
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( backward = pl$col("a")$forward_fill() )
pl$DataFrame(a = c(NA, 1, NA, 2, NA))$ with_columns( backward = pl$col("a")$forward_fill() )
Gather values by index
Expr_gather(indices)
Expr_gather(indices)
indices |
R vector or Series, or Expr that leads to a Series of dtype Int64. (0-indexed) |
Expr
df = pl$DataFrame(a = 1:10) df$select(pl$col("a")$gather(c(0, 2, 4, -1)))
df = pl$DataFrame(a = 1:10) df$select(pl$col("a")$gather(c(0, 2, 4, -1)))
Gather every nth value in the Series and return as a new Series.
Expr_gather_every(n, offset = 0)
Expr_gather_every(n, offset = 0)
n |
Positive integer. |
offset |
Starting index. |
Expr
pl$DataFrame(a = 0:24)$select(pl$col("a")$gather_every(6))
pl$DataFrame(a = 0:24)$select(pl$col("a")$gather_every(6))
Method equivalent of the greater-than operator expr > other
.
Expr_gt(other)
Expr_gt(other)
other |
numeric or string value; accepts expression input. |
pl$lit(2) > 1 pl$lit(2) > pl$lit(1) pl$lit(2)$gt(pl$lit(1))
pl$lit(2) > 1 pl$lit(2) > pl$lit(1) pl$lit(2)$gt(pl$lit(1))
Method equivalent of the greater-than-or-equal operator expr >= other
.
Expr_gt_eq(other)
Expr_gt_eq(other)
other |
numeric or string value; accepts expression input. |
pl$lit(2) >= 2 pl$lit(2) >= pl$lit(2) pl$lit(2)$gt_eq(pl$lit(2))
pl$lit(2) >= 2 pl$lit(2) >= pl$lit(2) pl$lit(2)$gt_eq(pl$lit(2))
Check whether the expression contains one or more null values
Expr_has_nulls()
Expr_has_nulls()
Expr
df = pl$DataFrame( a = c(NA, 1, NA), b = c(1, NA, 2), c = c(1, 2, 3) ) df$select(pl$all()$has_nulls())
df = pl$DataFrame( a = c(NA, 1, NA), b = c(1, NA, 2), c = c(1, 2, 3) ) df$select(pl$all()$has_nulls())
The hash value is of type UInt64
.
Expr_hash(seed = 0, seed_1 = NULL, seed_2 = NULL, seed_3 = NULL)
Expr_hash(seed = 0, seed_1 = NULL, seed_2 = NULL, seed_3 = NULL)
seed |
Random seed parameter. Defaults to 0. Doesn't have any effect for now. |
seed_1 , seed_2 , seed_3
|
Random seed parameter. Defaults to arg seed. The column will be coerced to UInt32. |
Expr
df = as_polars_df(iris[1:3, c(1, 2)]) df$with_columns(pl$all()$hash(1234)$name$suffix("_hash"))
df = as_polars_df(iris[1:3, c(1, 2)]) df$with_columns(pl$all()$hash(1234)$name$suffix("_hash"))
Get the first n elements
Expr_head(n = 10)
Expr_head(n = 10)
n |
Number of elements to take. |
Expr
pl$DataFrame(x = 1:11)$select(pl$col("x")$head(3))
pl$DataFrame(x = 1:11)$select(pl$col("x")$head(3))
Aggregate values into a list.
Expr_implode()
Expr_implode()
Use $to_struct()
to wrap a DataFrame.
Expr
df = pl$DataFrame( a = 1:3, b = 4:6 ) df$select(pl$all()$implode())
df = pl$DataFrame( a = 1:3, b = 4:6 ) df$select(pl$all()$implode())
Print the value that this expression evaluates to and pass on the value. The printing will happen when the expression evaluates, not when it is formed.
Expr_inspect(fmt = "{}")
Expr_inspect(fmt = "{}")
fmt |
format string, should contain one set of |
Expr
pl$select(pl$lit(1:5)$inspect( "Here's what the Series looked like before keeping the first two values: {}" )$head(2))
pl$select(pl$lit(1:5)$inspect( "Here's what the Series looked like before keeping the first two values: {}" )$head(2))
Fill nulls with linear interpolation using non-missing values. Can also be used to regrid data to a new grid - see examples below.
Expr_interpolate(method = "linear")
Expr_interpolate(method = "linear")
method |
String, either |
Expr
pl$DataFrame(x = c(1, NA, 4, NA, 100, NaN, 150))$ with_columns( interp_lin = pl$col("x")$interpolate(), interp_near = pl$col("x")$interpolate("nearest") ) # x, y interpolation over a grid df_original_grid = pl$DataFrame( grid_points = c(1, 3, 10), values = c(2.0, 6.0, 20.0) ) df_original_grid df_new_grid = pl$DataFrame(grid_points = (1:10) * 1.0) df_new_grid # Interpolate from this to the new grid df_new_grid$join( df_original_grid, on = "grid_points", how = "left" )$with_columns(pl$col("values")$interpolate())
pl$DataFrame(x = c(1, NA, 4, NA, 100, NaN, 150))$ with_columns( interp_lin = pl$col("x")$interpolate(), interp_near = pl$col("x")$interpolate("nearest") ) # x, y interpolation over a grid df_original_grid = pl$DataFrame( grid_points = c(1, 3, 10), values = c(2.0, 6.0, 20.0) ) df_original_grid df_new_grid = pl$DataFrame(grid_points = (1:10) * 1.0) df_new_grid # Interpolate from this to the new grid df_new_grid$join( df_original_grid, on = "grid_points", how = "left" )$with_columns(pl$col("values")$interpolate())
Check if an expression is between the given lower and upper bounds
Expr_is_between(lower_bound, upper_bound, closed = "both")
Expr_is_between(lower_bound, upper_bound, closed = "both")
lower_bound |
Lower bound, can be an Expr. Strings are parsed as column names. |
upper_bound |
Upper bound, can be an Expr. Strings are parsed as column names. |
closed |
Define which sides of the interval are closed (inclusive). This
can be either |
Note that in polars, a NaN
value is considered equal to any other NaN
value, and greater than any
non-NaN
value.
Expr
df = pl$DataFrame(num = 1:5) df$with_columns( is_between = pl$col("num")$is_between(2, 4), is_between_excl_upper = pl$col("num")$is_between(2, 4, closed = "left"), is_between_excl_both = pl$col("num")$is_between(2, 4, closed = "none") ) # lower and upper bounds can also be column names or expr df = pl$DataFrame( num = 1:5, lower = c(0, 2, 3, 3, 3), upper = c(6, 4, 4, 8, 3.5) ) df$with_columns( is_between_cols = pl$col("num")$is_between("lower", "upper"), is_between_expr = pl$col("num")$is_between(pl$col("lower") / 2, "upper") )
df = pl$DataFrame(num = 1:5) df$with_columns( is_between = pl$col("num")$is_between(2, 4), is_between_excl_upper = pl$col("num")$is_between(2, 4, closed = "left"), is_between_excl_both = pl$col("num")$is_between(2, 4, closed = "none") ) # lower and upper bounds can also be column names or expr df = pl$DataFrame( num = 1:5, lower = c(0, 2, 3, 3, 3), upper = c(6, 4, 4, 8, 3.5) ) df$with_columns( is_between_cols = pl$col("num")$is_between("lower", "upper"), is_between_expr = pl$col("num")$is_between(pl$col("lower") / 2, "upper") )
This is syntactic sugar for $is_unique()$not()
.
Expr_is_duplicated()
Expr_is_duplicated()
Expr
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_duplicated = pl$col("mpg")$is_duplicated())
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_duplicated = pl$col("mpg")$is_duplicated())
Returns a boolean Series indicating which values are finite.
Expr_is_finite()
Expr_is_finite()
Expr
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(finite = pl$col("alice")$is_finite())
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(finite = pl$col("alice")$is_finite())
Check whether each value is the first occurrence
Expr_is_first_distinct()
Expr_is_first_distinct()
Expr
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_ufirst = pl$col("mpg")$is_first_distinct())
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_ufirst = pl$col("mpg")$is_first_distinct())
Notice that to check whether a factor value is in a vector of strings, you
need to use the string cache, either with pl$enable_string_cache()
or
with pl$with_string_cache()
. See examples.
Expr_is_in(other)
Expr_is_in(other)
other |
numeric or string value; accepts expression input. |
Expr
pl$DataFrame(a = c(1:4, NA_integer_))$with_columns( in_1_3 = pl$col("a")$is_in(c(1, 3)), in_NA = pl$col("a")$is_in(pl$lit(NA_real_)) ) # this fails because we can't compare factors to strings # pl$DataFrame(a = factor(letters[1:5]))$with_columns( # in_abc = pl$col("a")$is_in(c("a", "b", "c")) # ) # need to use the string cache for this pl$with_string_cache({ pl$DataFrame(a = factor(letters[1:5]))$with_columns( in_abc = pl$col("a")$is_in(c("a", "b", "c")) ) })
pl$DataFrame(a = c(1:4, NA_integer_))$with_columns( in_1_3 = pl$col("a")$is_in(c(1, 3)), in_NA = pl$col("a")$is_in(pl$lit(NA_real_)) ) # this fails because we can't compare factors to strings # pl$DataFrame(a = factor(letters[1:5]))$with_columns( # in_abc = pl$col("a")$is_in(c("a", "b", "c")) # ) # need to use the string cache for this pl$with_string_cache({ pl$DataFrame(a = factor(letters[1:5]))$with_columns( in_abc = pl$col("a")$is_in(c("a", "b", "c")) ) })
Returns a boolean Series indicating which values are infinite.
Expr_is_infinite()
Expr_is_infinite()
Expr
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(infinite = pl$col("alice")$is_infinite())
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(infinite = pl$col("alice")$is_infinite())
Check whether each value is the last occurrence
Expr_is_last_distinct()
Expr_is_last_distinct()
Expr
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_ulast = pl$col("mpg")$is_last_distinct())
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_ulast = pl$col("mpg")$is_last_distinct())
Returns a boolean Series indicating which values are NaN.
Expr_is_nan()
Expr_is_nan()
Expr
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(nan = pl$col("alice")$is_nan())
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(nan = pl$col("alice")$is_nan())
Returns a boolean Series indicating which values are not NaN. Syntactic sugar
for $is_nan()$not()
.
Expr_is_not_nan()
Expr_is_not_nan()
Expr
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(not_nan = pl$col("alice")$is_not_nan())
pl$DataFrame(list(alice = c(0, NaN, NA, Inf, -Inf)))$ with_columns(not_nan = pl$col("alice")$is_not_nan())
Returns a boolean Series indicating which values are not null. Syntactic sugar
for $is_null()$not()
.
Expr_is_not_null()
Expr_is_not_null()
Expr
pl$DataFrame(list(x = c(1, NA, 3)))$select(pl$col("x")$is_not_null())
pl$DataFrame(list(x = c(1, NA, 3)))$select(pl$col("x")$is_not_null())
Returns a boolean Series indicating which values are null.
Expr_is_null()
Expr_is_null()
Expr
pl$DataFrame(list(x = c(1, NA, 3)))$select(pl$col("x")$is_null())
pl$DataFrame(list(x = c(1, NA, 3)))$select(pl$col("x")$is_null())
Check whether each value is unique
Expr_is_unique()
Expr_is_unique()
Expr
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_unique = pl$col("mpg")$is_unique())
as_polars_df(head(mtcars[, 1:2]))$ with_columns(is_unique = pl$col("mpg")$is_unique())
Compute the kurtosis (Fisher or Pearson) of a dataset.
Expr_kurtosis(fisher = TRUE, bias = TRUE)
Expr_kurtosis(fisher = TRUE, bias = TRUE)
fisher |
If |
bias |
If |
Kurtosis is the fourth central moment divided by the square of the variance. If Fisher's definition is used, then 3 is subtracted from the result to give 0 for a normal distribution.
If bias is FALSE
, then the kurtosis is calculated using k
statistics to
eliminate bias coming from biased moment estimators.
Expr
pl$DataFrame(a = c(1:3, 2:1))$ with_columns(kurt = pl$col("a")$kurtosis())
pl$DataFrame(a = c(1:3, 2:1))$ with_columns(kurt = pl$col("a")$kurtosis())
Get the last value
Expr_last()
Expr_last()
Expr
pl$DataFrame(x = 3:1)$with_columns(last = pl$col("x")$last())
pl$DataFrame(x = 3:1)$with_columns(last = pl$col("x")$last())
This is an alias for <Expr>$head()
.
Expr_limit(n = 10)
Expr_limit(n = 10)
n |
Number of elements to take. |
Expr
pl$DataFrame(x = 1:11)$select(pl$col("x")$limit(3))
pl$DataFrame(x = 1:11)$select(pl$col("x")$limit(3))
Compute the logarithm of elements
Expr_log(base = base::exp(1))
Expr_log(base = base::exp(1))
base |
Numeric base value for logarithm, default is |
Expr
pl$DataFrame(a = c(1, 2, 3, exp(1)))$ with_columns(log = pl$col("a")$log())
pl$DataFrame(a = c(1, 2, 3, exp(1)))$ with_columns(log = pl$col("a")$log())
Compute the base-10 logarithm of elements
Expr_log10()
Expr_log10()
Expr
pl$DataFrame(a = c(1, 2, 3, exp(1)))$ with_columns(log10 = pl$col("a")$log10())
pl$DataFrame(a = c(1, 2, 3, exp(1)))$ with_columns(log10 = pl$col("a")$log10())
Find the lower bound of a DataType
Expr_lower_bound()
Expr_lower_bound()
Expr
pl$DataFrame( x = 1:3, y = 1:3, schema = list(x = pl$UInt32, y = pl$Int32) )$ select(pl$all()$lower_bound())
pl$DataFrame( x = 1:3, y = 1:3, schema = list(x = pl$UInt32, y = pl$Int32) )$ select(pl$all()$lower_bound())
Method equivalent of the less-than operator expr < other
.
Expr_lt(other)
Expr_lt(other)
other |
numeric or string value; accepts expression input. |
pl$lit(5) < 10 pl$lit(5) < pl$lit(10) pl$lit(5)$lt(pl$lit(10))
pl$lit(5) < 10 pl$lit(5) < pl$lit(10) pl$lit(5)$lt(pl$lit(10))
Method equivalent of the less-than-or-equal operator expr <= other
.
Expr_lt_eq(other)
Expr_lt_eq(other)
other |
numeric or string value; accepts expression input. |
pl$lit(2) <= 2 pl$lit(2) <= pl$lit(2) pl$lit(2)$lt_eq(pl$lit(2))
pl$lit(2) <= 2 pl$lit(2) <= pl$lit(2) pl$lit(2)$lt_eq(pl$lit(2))
Map an expression with an R function
Expr_map_batches( f, output_type = NULL, agg_list = FALSE, in_background = FALSE )
Expr_map_batches( f, output_type = NULL, agg_list = FALSE, in_background = FALSE )
f |
a function to map with |
output_type |
|
agg_list |
Aggregate list. Map from vector to group in group_by context. |
in_background |
Logical. Whether to execute the map in a background R
process. Combined with setting e.g. |
It is sometimes necessary to apply a specific R function on one or several
columns. However, note that using R code in $map_batches()
is slower than native polars.
The user function must take one polars Series
as input and the return
should be a Series
or any Robj convertible into a Series
(e.g. vectors).
Map fully supports browser()
.
If in_background = FALSE
the function can access any global variable of the
R session. However, note that several calls to $map_batches()
will sequentially share the same main R session,
so the global environment might change between the start of the query and the moment
a $map_batches()
call is evaluated. Any native
polars computations can still be executed meanwhile. If in_background = TRUE
,
the map will run in one or more other R sessions and will not have access
to global variables. Use options(polars.rpool_cap = 4)
and
polars_options()$rpool_cap
to set and view the number of parallel R sessions.
Expr
as_polars_df(iris)$ select( pl$col("Sepal.Length")$map_batches(\(x) { paste("cheese", as.character(x$to_vector())) }, pl$dtypes$String) ) # R parallel process example, use Sys.sleep() to imitate some CPU expensive # computation. # map a,b,c,d sequentially pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }) )$collect() |> system.time() # map in parallel 1: Overhead to start up extra R processes / sessions options(polars.rpool_cap = 0) # drop any previous processes, just to show start-up overhead options(polars.rpool_cap = 4) # set back to 4, the default polars_options()$rpool_cap pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }, in_background = TRUE) )$collect() |> system.time() # map in parallel 2: Reuse R processes in "polars global_rpool". polars_options()$rpool_cap pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }, in_background = TRUE) )$collect() |> system.time()
as_polars_df(iris)$ select( pl$col("Sepal.Length")$map_batches(\(x) { paste("cheese", as.character(x$to_vector())) }, pl$dtypes$String) ) # R parallel process example, use Sys.sleep() to imitate some CPU expensive # computation. # map a,b,c,d sequentially pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }) )$collect() |> system.time() # map in parallel 1: Overhead to start up extra R processes / sessions options(polars.rpool_cap = 0) # drop any previous processes, just to show start-up overhead options(polars.rpool_cap = 4) # set back to 4, the default polars_options()$rpool_cap pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }, in_background = TRUE) )$collect() |> system.time() # map in parallel 2: Reuse R processes in "polars global_rpool". polars_options()$rpool_cap pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select( pl$all()$map_batches(\(s) { Sys.sleep(.1) s * 2 }, in_background = TRUE) )$collect() |> system.time()
The UDF is applied to each element of a column. See Details for more information on specificities related to the context.
Expr_map_elements( f, return_type = NULL, strict_return_type = TRUE, allow_fail_eval = FALSE, in_background = FALSE )
Expr_map_elements( f, return_type = NULL, strict_return_type = TRUE, allow_fail_eval = FALSE, in_background = FALSE )
f |
Function to map |
return_type |
DataType of the output Series. If |
strict_return_type |
If |
allow_fail_eval |
If |
in_background |
Whether to run the function in a background R process,
default is |
Note that, in a GroupBy context, the column will have been pre-aggregated and so each element will itself be a Series. Therefore, depending on the context, the requirements for the function differ:
in $select()
or $with_columns()
(selection context), the function must
operate on R values of length 1. Polars will convert each element into an R value
and pass it to the function. The output of the user function will be converted
back into a polars type (the return type must match, see argument return_type
).
Using $map_elements()
in this context should be avoided as a lapply()
has half the overhead.
in $agg()
(GroupBy context), the function must take a Series
and return
a Series
or an R object convertible to Series
, e.g. a vector. In this
context, it is much faster if the number of groups is much lower
than the number of rows, as the iteration is only across the groups. The R
user function could e.g. convert the Series
to a vector with $to_r()
and
perform some vectorized operations.
Note that it is preferred to express your function in polars syntax, which will almost always be significantly faster and more memory efficient because:
the native expression engine runs in Rust; functions run in R.
use of R functions forces the DataFrame to be materialized in memory.
Polars-native expressions can be parallelized (R functions cannot).
Polars-native expressions can be logically optimized (R functions cannot).
Wherever possible you should strongly prefer the native expression API to
achieve the best performance and avoid using $map_elements()
.
Expr
# apply over groups: here, the input must be a Series # prepare two expressions, one to compute the sum of each variable, one to # get the first two values of each variable and store them in a list e_sum = pl$all()$map_elements(\(s) sum(s$to_r()))$name$suffix("_sum") e_head = pl$all()$map_elements(\(s) head(s$to_r(), 2))$name$suffix("_head") as_polars_df(iris)$group_by("Species")$agg(e_sum, e_head) # apply a function on each value (should be avoided): here the input is an R # value of length 1 # select only Float64 columns my_selection = pl$col(pl$dtypes$Float64) # prepare two expressions, the first one only adds 10 to each element, the # second returns the letter whose index matches the element e_add10 = my_selection$map_elements(\(x) { x + 10 })$name$suffix("_sum") e_letter = my_selection$map_elements(\(x) { letters[ceiling(x)] }, return_type = pl$dtypes$String)$name$suffix("_letter") as_polars_df(iris)$select(e_add10, e_letter) # Small benchmark -------------------------------- # Using `$map_elements()` is much slower than a more polars-native approach. # First we multiply each element of a Series of 1M elements by 2. n = 1000000L set.seed(1) df = pl$DataFrame(list( a = 1:n, b = sample(letters, n, replace = TRUE) )) system.time({ df$with_columns( bob = pl$col("a")$map_elements(\(x) { x * 2L }) ) }) # Comparing this to the standard polars syntax: system.time({ df$with_columns( bob = pl$col("a") * 2L ) }) # Running in parallel -------------------------------- # here, we use Sys.sleep() to imitate some CPU expensive computation. # use apply over each Species-group in each column equal to 12 sequential # runs ~1.2 sec. 
system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }) )$collect() }) # first run in parallel: there is some overhead to start up extra R processes # drop any previous processes, just to show start-up overhead here options(polars.rpool_cap = 0) # set back to 4, the default options(polars.rpool_cap = 4) polars_options()$rpool_cap system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }, in_background = TRUE) )$collect() }) # second run in parallel: this reuses R processes in "polars global_rpool". polars_options()$rpool_cap system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }, in_background = TRUE) )$collect() })
# apply over groups: here, the input must be a Series # prepare two expressions, one to compute the sum of each variable, one to # get the first two values of each variable and store them in a list e_sum = pl$all()$map_elements(\(s) sum(s$to_r()))$name$suffix("_sum") e_head = pl$all()$map_elements(\(s) head(s$to_r(), 2))$name$suffix("_head") as_polars_df(iris)$group_by("Species")$agg(e_sum, e_head) # apply a function on each value (should be avoided): here the input is an R # value of length 1 # select only Float64 columns my_selection = pl$col(pl$dtypes$Float64) # prepare two expressions, the first one only adds 10 to each element, the # second returns the letter whose index matches the element e_add10 = my_selection$map_elements(\(x) { x + 10 })$name$suffix("_sum") e_letter = my_selection$map_elements(\(x) { letters[ceiling(x)] }, return_type = pl$dtypes$String)$name$suffix("_letter") as_polars_df(iris)$select(e_add10, e_letter) # Small benchmark -------------------------------- # Using `$map_elements()` is much slower than a more polars-native approach. # First we multiply each element of a Series of 1M elements by 2. n = 1000000L set.seed(1) df = pl$DataFrame(list( a = 1:n, b = sample(letters, n, replace = TRUE) )) system.time({ df$with_columns( bob = pl$col("a")$map_elements(\(x) { x * 2L }) ) }) # Comparing this to the standard polars syntax: system.time({ df$with_columns( bob = pl$col("a") * 2L ) }) # Running in parallel -------------------------------- # here, we use Sys.sleep() to imitate some CPU expensive computation. # use apply over each Species-group in each column equal to 12 sequential # runs ~1.2 sec. 
system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }) )$collect() }) # first run in parallel: there is some overhead to start up extra R processes # drop any previous processes, just to show start-up overhead here options(polars.rpool_cap = 0) # set back to 4, the default options(polars.rpool_cap = 4) polars_options()$rpool_cap system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }, in_background = TRUE) )$collect() }) # second run in parallel: this reuses R processes in "polars global_rpool". polars_options()$rpool_cap system.time({ as_polars_lf(iris)$group_by("Species")$agg( pl$all()$map_elements(\(s) { Sys.sleep(.1) s$sum() }, in_background = TRUE) )$collect() })
Get maximum value
Expr_max()
Expr_max()
Expr
pl$DataFrame(x = c(1, NA, 3))$ with_columns(max = pl$col("x")$max())
pl$DataFrame(x = c(1, NA, 3))$ with_columns(max = pl$col("x")$max())
Get mean value
Expr_mean()
Expr_mean()
Expr
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(mean = pl$col("x")$mean())
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(mean = pl$col("x")$mean())
Get median value
Expr_median()
Expr_median()
Expr
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(median = pl$col("x")$median())
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(median = pl$col("x")$median())
Get minimum value
Expr_min()
Expr_min()
Expr
pl$DataFrame(x = c(1, NA, 3))$ with_columns(min = pl$col("x")$min())
pl$DataFrame(x = c(1, NA, 3))$ with_columns(min = pl$col("x")$min())
Method equivalent of modulus operator expr %% other
.
Expr_mod(other)
Expr_mod(other)
other |
Numeric literal or expression value. |
df = pl$DataFrame(x = -5L:5L) df$with_columns( `x%%2` = pl$col("x")$mod(2) )
df = pl$DataFrame(x = -5L:5L) df$with_columns( `x%%2` = pl$col("x")$mod(2) )
Compute the most occurring value(s). Can return multiple values if there are ties.
Expr_mode()
Expr_mode()
Expr
df = pl$DataFrame(a = 1:6, b = c(1L, 1L, 3L, 3L, 5L, 6L), c = c(1L, 1L, 2L, 2L, 3L, 3L)) df$select(pl$col("a")$mode()) df$select(pl$col("b")$mode()) df$select(pl$col("c")$mode())
df = pl$DataFrame(a = 1:6, b = c(1L, 1L, 3L, 3L, 5L, 6L), c = c(1L, 1L, 2L, 2L, 3L, 3L)) df$select(pl$col("a")$mode()) df$select(pl$col("b")$mode()) df$select(pl$col("c")$mode())
Method equivalent of multiplication operator expr * other
.
Expr_mul(other)
Expr_mul(other)
other |
Numeric literal or expression value. |
df = pl$DataFrame(x = c(1, 2, 4, 8, 16)) df$with_columns( `x*2` = pl$col("x")$mul(2), `x * xlog2` = pl$col("x")$mul(pl$col("x")$log(2)) )
df = pl$DataFrame(x = c(1, 2, 4, 8, 16)) df$with_columns( `x*2` = pl$col("x")$mul(2), `x * xlog2` = pl$col("x")$mul(pl$col("x")$log(2)) )
Count number of unique values
Expr_n_unique()
Expr_n_unique()
Expr
as_polars_df(iris[, 4:5])$with_columns(count = pl$col("Species")$n_unique())
as_polars_df(iris[, 4:5])$with_columns(count = pl$col("Species")$n_unique())
Get the maximum value, but return NaN
if any value is NaN.
Expr_nan_max()
Expr_nan_max()
Expr
pl$DataFrame(x = c(1, NA, 3, NaN, Inf))$ with_columns(nan_max = pl$col("x")$nan_max())
pl$DataFrame(x = c(1, NA, 3, NaN, Inf))$ with_columns(nan_max = pl$col("x")$nan_max())
Get the minimum value, but return NaN
if any value is NaN.
Expr_nan_min()
Expr_nan_min()
Expr
pl$DataFrame(x = c(1, NA, 3, NaN, Inf))$ with_columns(nan_min = pl$col("x")$nan_min())
pl$DataFrame(x = c(1, NA, 3, NaN, Inf))$ with_columns(nan_min = pl$col("x")$nan_min())
Method equivalent of the inequality operator expr != other
.
Expr_neq(other)
Expr_neq(other)
other |
numeric or string value; accepts expression input. |
pl$lit(1) != 2 pl$lit(1) != pl$lit(2) pl$lit(1)$neq(pl$lit(2))
pl$lit(1) != 2 pl$lit(1) != pl$lit(2) pl$lit(1)$neq(pl$lit(2))
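Check inequality without null
propagation. Method equivalent of the inequality operator expr != other
, except that null values are compared like ordinary values instead of propagating to the result.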
Expr_neq_missing(other)
Expr_neq_missing(other)
other |
numeric or string value; accepts expression input. |
df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) df$with_columns( neq = pl$col("x")$neq("y"), neq_missing = pl$col("x")$neq_missing("y") )
df = pl$DataFrame(x = c(NA, FALSE, TRUE), y = c(TRUE, TRUE, TRUE)) df$with_columns( neq = pl$col("x")$neq("y"), neq_missing = pl$col("x")$neq_missing("y") )
Method equivalent of negation operator !expr
.
Expr_not()
Expr_not()
# two syntaxes same result pl$lit(TRUE)$not() !pl$lit(TRUE)
# two syntaxes same result pl$lit(TRUE)$not() !pl$lit(TRUE)
Count missing values
Expr_null_count()
Expr_null_count()
Expr
pl$DataFrame(x = c(NA, "a", NA, "b"))$ with_columns(n_missing = pl$col("x")$null_count())
pl$DataFrame(x = c(NA, "a", NA, "b"))$ with_columns(n_missing = pl$col("x")$null_count())
Combine two boolean expressions with OR.
Expr_or(other)
Expr_or(other)
other |
numeric or string value; accepts expression input. |
pl$lit(TRUE) | FALSE pl$lit(TRUE)$or(pl$lit(TRUE))
pl$lit(TRUE) | FALSE pl$lit(TRUE)$or(pl$lit(TRUE))
This expression is similar to performing a group by aggregation and joining the result back into the original DataFrame. The outcome is similar to how window functions work in PostgreSQL.
Expr_over(..., order_by = NULL, mapping_strategy = "group_to_rows")
Expr_over(..., order_by = NULL, mapping_strategy = "group_to_rows")
... |
Column(s) to group by. Accepts expression input. Characters are parsed as column names. |
order_by |
Order the window functions/aggregations with the partitioned
groups by the result of the expression passed to |
mapping_strategy |
One of the following:
|
Expr
# Pass the name of a column to compute the expression over that column. df = pl$DataFrame( a = c("a", "a", "b", "b", "b"), b = c(1, 2, 3, 5, 3), c = c(5, 4, 2, 1, 3) ) df$with_columns( pl$col("c")$max()$over("a")$name$suffix("_max") ) # Expression input is supported. df$with_columns( pl$col("c")$max()$over(pl$col("b") %/% 2)$name$suffix("_max") ) # Group by multiple columns by passing a character vector of column names # or list of expressions. df$with_columns( pl$col("c")$min()$over(c("a", "b"))$name$suffix("_min") ) df$with_columns( pl$col("c")$min()$over(list(pl$col("a"), pl$col("b")))$name$suffix("_min") ) # Or use positional arguments to group by multiple columns in the same way. df$with_columns( pl$col("c")$min()$over("a", pl$col("b") %% 2)$name$suffix("_min") ) # Alternative mapping strategy: join values in a list output df$with_columns( top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") ) # order_by specifies how values are sorted within a group, which is # essential when the operation depends on the order of values df = pl$DataFrame( g = c(1, 1, 1, 1, 2, 2, 2, 2), t = c(1, 2, 3, 4, 4, 1, 2, 3), x = c(10, 20, 30, 40, 10, 20, 30, 40) ) # without order_by, the first and second values in the second group would # be inverted, which would be wrong df$with_columns( x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") )
# Pass the name of a column to compute the expression over that column. df = pl$DataFrame( a = c("a", "a", "b", "b", "b"), b = c(1, 2, 3, 5, 3), c = c(5, 4, 2, 1, 3) ) df$with_columns( pl$col("c")$max()$over("a")$name$suffix("_max") ) # Expression input is supported. df$with_columns( pl$col("c")$max()$over(pl$col("b") %/% 2)$name$suffix("_max") ) # Group by multiple columns by passing a character vector of column names # or list of expressions. df$with_columns( pl$col("c")$min()$over(c("a", "b"))$name$suffix("_min") ) df$with_columns( pl$col("c")$min()$over(list(pl$col("a"), pl$col("b")))$name$suffix("_min") ) # Or use positional arguments to group by multiple columns in the same way. df$with_columns( pl$col("c")$min()$over("a", pl$col("b") %% 2)$name$suffix("_min") ) # Alternative mapping strategy: join values in a list output df$with_columns( top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") ) # order_by specifies how values are sorted within a group, which is # essential when the operation depends on the order of values df = pl$DataFrame( g = c(1, 1, 1, 1, 2, 2, 2, 2), t = c(1, 2, 3, 4, 4, 1, 2, 3), x = c(10, 20, 30, 40, 10, 20, 30, 40) ) # without order_by, the first and second values in the second group would # be inverted, which would be wrong df$with_columns( x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") )
Computes percentage change (as fraction) between current element and most-
recent non-null element at least n
period(s) before the current element.
Computes the change from the previous row by default.
Expr_pct_change(n = 1)
Expr_pct_change(n = 1)
n |
Periods to shift for computing percent change. |
Expr
pl$DataFrame(a = c(10L, 11L, 12L, NA_integer_, 12L))$ with_columns(pct_change = pl$col("a")$pct_change())
pl$DataFrame(a = c(10L, 11L, 12L, NA_integer_, 12L))$ with_columns(pct_change = pl$col("a")$pct_change())
A local maximum is the point that marks the transition between an increase and a decrease in a Series. The first and last values of the Series can never be a peak.
Expr_peak_max()
Expr_peak_max()
Expr
$peak_min()
df = pl$DataFrame(x = c(1, 2, 3, 2, 3, 4, 5, 2)) df df$with_columns(peak_max = pl$col("x")$peak_max())
df = pl$DataFrame(x = c(1, 2, 3, 2, 3, 4, 5, 2)) df df$with_columns(peak_max = pl$col("x")$peak_max())
A local minimum is the point that marks the transition between a decrease and an increase in a Series. The first and last values of the Series can never be a peak.
Expr_peak_min()
Expr_peak_min()
Expr
$peak_max()
df = pl$DataFrame(x = c(1, 2, 3, 2, 3, 4, 5, 2)) df df$with_columns(peak_min = pl$col("x")$peak_min())
df = pl$DataFrame(x = c(1, 2, 3, 2, 3, 4, 5, 2)) df df$with_columns(peak_min = pl$col("x")$peak_min())
Method equivalent of exponentiation operator expr ^ exponent
.
Expr_pow(exponent)
Expr_pow(exponent)
exponent |
Numeric literal or expression value. |
df = pl$DataFrame(x = c(1, 2, 4, 8)) df$with_columns( cube = pl$col("x")$pow(3), `x^xlog2` = pl$col("x")$pow(pl$col("x")$log(2)) )
df = pl$DataFrame(x = c(1, 2, 4, 8)) df$with_columns( cube = pl$col("x")$pow(3), `x^xlog2` = pl$col("x")$pow(pl$col("x")$log(2)) )
Compute the product of an expression.
Expr_product()
Expr_product()
Expr
pl$DataFrame(x = c(2L, NA, 2L))$ with_columns(product = pl$col("x")$product())
pl$DataFrame(x = c(2L, NA, 2L))$ with_columns(product = pl$col("x")$product())
Bin continuous values into discrete categories based on their quantiles
Expr_qcut( quantiles, ..., labels = NULL, left_closed = FALSE, allow_duplicates = FALSE, include_breaks = FALSE )
Expr_qcut( quantiles, ..., labels = NULL, left_closed = FALSE, allow_duplicates = FALSE, include_breaks = FALSE )
quantiles |
Either a vector of quantile probabilities between 0 and 1 or a positive integer determining the number of bins with uniform probability. |
... |
Ignored. |
labels |
Names of the categories. The number of labels must be equal to the number of cut points plus one. |
left_closed |
Set the intervals to be left-closed instead of right-closed. |
allow_duplicates |
If set to |
include_breaks |
Include a column with the right endpoint of the bin each
observation falls in. This will change the data type of the output from a
|
Expr of data type Categorical
if include_breaks
is FALSE
and
of data type Struct
if include_breaks
is TRUE
.
df = pl$DataFrame(foo = c(-2, -1, 0, 1, 2)) # Divide a column into three categories according to pre-defined quantile # probabilities df$with_columns( qcut = pl$col("foo")$qcut(c(0.25, 0.75), labels = c("a", "b", "c")) ) # Divide a column into two categories using uniform quantile probabilities. df$with_columns( qcut = pl$col("foo")$qcut(2, labels = c("low", "high"), left_closed = TRUE) ) # Add both the category and the breakpoint df$with_columns( qcut = pl$col("foo")$qcut(c(0.25, 0.75), include_breaks = TRUE) )$unnest("qcut")
df = pl$DataFrame(foo = c(-2, -1, 0, 1, 2)) # Divide a column into three categories according to pre-defined quantile # probabilities df$with_columns( qcut = pl$col("foo")$qcut(c(0.25, 0.75), labels = c("a", "b", "c")) ) # Divide a column into two categories using uniform quantile probabilities. df$with_columns( qcut = pl$col("foo")$qcut(2, labels = c("low", "high"), left_closed = TRUE) ) # Add both the category and the breakpoint df$with_columns( qcut = pl$col("foo")$qcut(c(0.25, 0.75), include_breaks = TRUE) )$unnest("qcut")
Get quantile value.
Expr_quantile(quantile, interpolation = "nearest")
Expr_quantile(quantile, interpolation = "nearest")
quantile |
Either a numeric value or an Expr whose value must be between 0 and 1. |
interpolation |
One of |
Null values are ignored and NaN
s are ranked as the largest value.
For linear interpolation NaN
poisons Inf
, which in turn poisons any other value.
Expr
pl$DataFrame(x = -5:5)$ select(pl$col("x")$quantile(0.5))
pl$DataFrame(x = -5:5)$ select(pl$col("x")$quantile(0.5))
Assign ranks to data, dealing with ties appropriately.
Expr_rank( method = c("average", "min", "max", "dense", "ordinal", "random"), descending = FALSE, seed = NULL )
Expr_rank( method = c("average", "min", "max", "dense", "ordinal", "random"), descending = FALSE, seed = NULL )
method |
String, one of
|
descending |
Rank in descending order. |
seed |
A string to be parsed, or a number to be converted, into uint64. Only used if method = "random". |
Expr
# The 'average' method: pl$DataFrame(a = c(3, 6, 1, 1, 6))$ with_columns(rank = pl$col("a")$rank()) # The 'ordinal' method: pl$DataFrame(a = c(3, 6, 1, 1, 6))$ with_columns(rank = pl$col("a")$rank("ordinal"))
# The 'average' method: pl$DataFrame(a = c(3, 6, 1, 1, 6))$ with_columns(rank = pl$col("a")$rank()) # The 'ordinal' method: pl$DataFrame(a = c(3, 6, 1, 1, 6))$ with_columns(rank = pl$col("a")$rank("ordinal"))
Create a single chunk of memory for this Series.
Expr_rechunk()
Expr_rechunk()
See the explanation of rechunk() in docs_translations
.
Expr
# get chunked lengths with/without rechunk series_list = pl$DataFrame(list(a = 1:3, b = 4:6))$select( pl$col("a")$append(pl$col("b"))$alias("a_chunked"), pl$col("a")$append(pl$col("b"))$rechunk()$alias("a_rechunked") )$get_columns() lapply(series_list, \(x) x$chunk_lengths())
# get chunked lengths with/without rechunk series_list = pl$DataFrame(list(a = 1:3, b = 4:6))$select( pl$col("a")$append(pl$col("b"))$alias("a_chunked"), pl$col("a")$append(pl$col("b"))$rechunk()$alias("a_rechunked") )$get_columns() lapply(series_list, \(x) x$chunk_lengths())
Reinterpret the underlying bits as a signed/unsigned integer. This operation is only allowed for Int64. For integers with fewer bits, you can safely use the cast operation.
Expr_reinterpret(signed = TRUE)
Expr_reinterpret(signed = TRUE)
signed |
If |
Expr
df = pl$DataFrame(x = 1:5, schema = list(x = pl$Int64)) df$select(pl$all()$reinterpret())
df = pl$DataFrame(x = 1:5, schema = list(x = pl$Int64)) df$select(pl$all()$reinterpret())
This expression takes the input, repeats it n times, and appends the resulting chunks.
Expr_rep(n, rechunk = TRUE)
Expr_rep(n, rechunk = TRUE)
n |
The number of times to repeat, must be non-negative and finite. |
rechunk |
If |
If the input has length 1, this uses a special faster implementation that
doesn't require rechunking (so rechunk = TRUE
has no effect).
Expr
pl$select(pl$lit("alice")$rep(n = 3)) pl$select(pl$lit(1:3)$rep(n = 2))
pl$select(pl$lit("alice")$rep(n = 3)) pl$select(pl$lit(1:3)$rep(n = 2))
Repeat the elements in this Series as specified in the given expression.
The repeated elements are expanded into a List
.
Expr_repeat_by(by)
Expr_repeat_by(by)
by |
Expr that determines how often the values will be repeated. The column will be coerced to UInt32. |
Expr
df = pl$DataFrame(a = c("w", "x", "y", "z"), n = c(-1, 0, 1, 2)) df$with_columns(repeated = pl$col("a")$repeat_by("n"))
df = pl$DataFrame(a = c("w", "x", "y", "z"), n = c(-1, 0, 1, 2)) df$with_columns(repeated = pl$col("a")$repeat_by("n"))
This allows one to recode values in a column, leaving all other values
unchanged. See $replace_strict()
to give a default
value to all other values and to specify the output datatype.
Expr_replace(old, new)
Expr_replace(old, new)
old |
Can be several things:
|
new |
Either a vector of length 1, a vector of same length as |
Expr
df = pl$DataFrame(a = c(1, 2, 2, 3)) # "old" and "new" can take vectors of length 1 or of same length df$with_columns(replaced = pl$col("a")$replace(2, 100)) df$with_columns(replaced = pl$col("a")$replace(c(2, 3), c(100, 200))) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) df$with_columns(replaced = pl$col("a")$replace(mapping)) df = pl$DataFrame(a = c("x", "y", "z")) mapping = list(x = 1, y = 2, z = 3) df$with_columns(replaced = pl$col("a")$replace(mapping)) # "old" and "new" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace( old = pl$col("a")$max(), new = pl$col("b")$sum() ) )
df = pl$DataFrame(a = c(1, 2, 2, 3)) # "old" and "new" can take vectors of length 1 or of same length df$with_columns(replaced = pl$col("a")$replace(2, 100)) df$with_columns(replaced = pl$col("a")$replace(c(2, 3), c(100, 200))) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) df$with_columns(replaced = pl$col("a")$replace(mapping)) df = pl$DataFrame(a = c("x", "y", "z")) mapping = list(x = 1, y = 2, z = 3) df$with_columns(replaced = pl$col("a")$replace(mapping)) # "old" and "new" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace( old = pl$col("a")$max(), new = pl$col("b")$sum() ) )
This changes all the values in a column, either using a specific replacement
or a default one. See $replace()
to replace only a subset
of values.
Expr_replace_strict(old, new, default = NULL, return_dtype = NULL)
Expr_replace_strict(old, new, default = NULL, return_dtype = NULL)
old |
Can be several things:
|
new |
Either a vector of length 1, a vector of same length as |
default |
The default replacement if the value is not in |
return_dtype |
The data type of the resulting expression. If set to
|
Expr
df = pl$DataFrame(a = c(1, 2, 2, 3)) # "old" and "new" can take vectors of length 1 or of same length df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) df$with_columns( replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) ) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) # one can specify the data type to return instead of automatically # inferring it df$with_columns( replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) ) # "old", "new", and "default" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace_strict( old = pl$col("a")$max(), new = pl$col("b")$sum(), default = pl$col("b"), ) )
df = pl$DataFrame(a = c(1, 2, 2, 3)) # "old" and "new" can take vectors of length 1 or of same length df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) df$with_columns( replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) ) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) # one can specify the data type to return instead of automatically # inferring it df$with_columns( replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) ) # "old", "new", and "default" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace_strict( old = pl$col("a")$max(), new = pl$col("b")$sum(), default = pl$col("b"), ) )
Reshape this Expr to a flat Series or a Series of Lists
Expr_reshape(dimensions)
Expr_reshape(dimensions)
dimensions |
An integer vector giving the size of each dimension.
If |
Expr. If a single dimension is given, results in an expression of the original data type. If multiple dimensions are given, results in an expression of data type List with shape equal to the dimensions.
df = pl$DataFrame(foo = 1:9) df$select(pl$col("foo")$reshape(9)) df$select(pl$col("foo")$reshape(c(3, 3))) # Use `-1` to infer the other dimension df$select(pl$col("foo")$reshape(c(-1, 3))) df$select(pl$col("foo")$reshape(c(3, -1))) # One can specify more than 2 dimensions by using the Array type df = pl$DataFrame(foo = 1:12) df$select( pl$col("foo")$reshape(c(3, 2, 2)) )
df = pl$DataFrame(foo = 1:9) df$select(pl$col("foo")$reshape(9)) df$select(pl$col("foo")$reshape(c(3, 3))) # Use `-1` to infer the other dimension df$select(pl$col("foo")$reshape(c(-1, 3))) df$select(pl$col("foo")$reshape(c(3, -1))) # One can specify more than 2 dimensions by using the Array type df = pl$DataFrame(foo = 1:12) df$select( pl$col("foo")$reshape(c(3, 2, 2)) )
Reverse a variable
Expr_reverse()
Expr_reverse()
Expr
pl$DataFrame(list(a = 1:5))$select(pl$col("a")$reverse())
pl$DataFrame(list(a = 1:5))$select(pl$col("a")$reverse())
Get the lengths of runs of identical values
Expr_rle()
Expr_rle()
Expr
df = pl$DataFrame(s = c(1, 1, 2, 1, NA, 1, 3, 3)) df$select(pl$col("s")$rle())$unnest("s")
df = pl$DataFrame(s = c(1, 1, 2, 1, NA, 1, 3, 3)) df$select(pl$col("s")$rle())$unnest("s")
Similar to $rle(), but it maps each value to an ID corresponding to the run into which it falls. This is especially useful when you want to define groups by runs of identical values rather than the values themselves. Note that the ID is 0-indexed.
Expr_rle_id()
Expr_rle_id()
Expr
df = pl$DataFrame(a = c(1, 2, 1, 1, 1, 4)) df$with_columns(a_r = pl$col("a")$rle_id())
df = pl$DataFrame(a = c(1, 2, 1, 1, 1, 4)) df$with_columns(a_r = pl$col("a")$rle_id())
If you have a time series <t_0, t_1, ..., t_n>
, then by default the windows
created will be:
(t_0 - period, t_0]
(t_1 - period, t_1]
…
(t_n - period, t_n]
whereas if you pass a non-default offset, then the windows will be:
(t_0 + offset, t_0 + offset + period]
(t_1 + offset, t_1 + offset + period]
…
(t_n + offset, t_n + offset + period]
Expr_rolling(index_column, ..., period, offset = NULL, closed = "right")
Expr_rolling(index_column, ..., period, offset = NULL, closed = "right")
index_column |
Column used to group based on the time window. Often of type Date/Datetime. This column must be sorted in ascending order. If this column represents an index, it has to be either Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. |
... |
Ignored. |
period |
A character representing the length of the window,
must be non-negative. See the |
offset |
A character representing the offset of the window,
or |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
In case of a rolling operation on an integer column, the windows are defined by:
"1i" # length 1
"10i" # length 10
Expr
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
# create a DataFrame with a Datetime column and an f64 column dates = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() ) df$with_columns( sum_a = pl$sum("a")$rolling(index_column = "dt", period = "2d"), min_a = pl$min("a")$rolling(index_column = "dt", period = "2d"), max_a = pl$max("a")$rolling(index_column = "dt", period = "2d") ) # we can use "offset" to change the start of the window period. Here, with # offset = "1d", we start the window one day after the value in "dt", and # then we add a 2-day window relative to the window start. df$with_columns( sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") )
# create a DataFrame with a Datetime column and an f64 column dates = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ with_columns( pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() ) df$with_columns( sum_a = pl$sum("a")$rolling(index_column = "dt", period = "2d"), min_a = pl$min("a")$rolling(index_column = "dt", period = "2d"), max_a = pl$max("a")$rolling(index_column = "dt", period = "2d") ) # we can use "offset" to change the start of the window period. Here, with # offset = "1d", we start the window one day after the value in "dt", and # then we add a 2-day window relative to the window start. df$with_columns( sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") )
Compute the rolling (= moving) max over the values in this array. A window of
length window_size
will traverse the array. The values that fill this window
will (optionally) be multiplied with the weights given by the weights
vector.
Expr_rolling_max( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
Expr_rolling_max( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_max = pl$col("a")$rolling_max(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_max = pl$col("a")$rolling_max(window_size = 2))
Apply a rolling max based on another column.
Expr_rolling_max_by(by, window_size, ..., min_periods = 1, closed = "right")
Expr_rolling_max_by(by, window_size, ..., min_periods = 1, closed = "right")
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_max = pl$col("index")$rolling_max_by("date", window_size = "3h") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_max = pl$col("index")$rolling_max_by("date", window_size = "3h") )
Compute the rolling (= moving) mean over the values in this array. A window of
length window_size
will traverse the array. The values that fill this window
will (optionally) be multiplied with the weights given by the weights
vector.
Expr_rolling_mean( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
Expr_rolling_mean( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_mean = pl$col("a")$rolling_mean(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_mean = pl$col("a")$rolling_mean(window_size = 2))
Apply a rolling mean based on another column.
Expr_rolling_mean_by(by, window_size, ..., min_periods = 1, closed = "right")
Expr_rolling_mean_by(by, window_size, ..., min_periods = 1, closed = "right")
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_mean = pl$col("index")$rolling_mean_by("date", window_size = "3h") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_mean = pl$col("index")$rolling_mean_by("date", window_size = "3h") )
Compute the rolling (= moving) median over the values in this array. A window
of length window_size
will traverse the array. The values that fill this
window will (optionally) be multiplied with the weights given by the weights
vector.
Expr_rolling_median( window_size, weights = NULL, min_periods = NULL, center = FALSE )
Expr_rolling_median( window_size, weights = NULL, min_periods = NULL, center = FALSE )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_median = pl$col("a")$rolling_median(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_median = pl$col("a")$rolling_median(window_size = 2))
Apply a rolling median based on another column.
Expr_rolling_median_by(by, window_size, ..., min_periods = 1, closed = "right")
Expr_rolling_median_by(by, window_size, ..., min_periods = 1, closed = "right")
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_median = pl$col("index")$rolling_median_by("date", window_size = "3h") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_median = pl$col("index")$rolling_median_by("date", window_size = "3h") )
Compute the rolling (= moving) min over the values in this array. A window of
length window_size
will traverse the array. The values that fill this window
will (optionally) be multiplied with the weights given by the weights
vector.
Expr_rolling_min( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
Expr_rolling_min( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_min = pl$col("a")$rolling_min(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_min = pl$col("a")$rolling_min(window_size = 2))
Apply a rolling min based on another column.
Expr_rolling_min_by(by, window_size, ..., min_periods = 1, closed = "right")
Expr_rolling_min_by(by, window_size, ..., min_periods = 1, closed = "right")
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_min = pl$col("index")$rolling_min_by("date", window_size = "3h") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_min = pl$col("index")$rolling_min_by("date", window_size = "3h") )
Compute the rolling (= moving) quantile over the values in this array. A
window of length window_size
will traverse the array. The values that fill
this window will (optionally) be multiplied with the weights given by the
weights
vector.
Expr_rolling_quantile( quantile, interpolation = "nearest", window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
Expr_rolling_quantile( quantile, interpolation = "nearest", window_size, weights = NULL, min_periods = NULL, ..., center = FALSE )
quantile |
Quantile between 0 and 1. |
interpolation |
String, one of |
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_quant = pl$col("a")$rolling_quantile(0.3, window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_quant = pl$col("a")$rolling_quantile(0.3, window_size = 2))
Compute a rolling quantile based on another column
Expr_rolling_quantile_by( by, window_size, ..., quantile, interpolation = "nearest", min_periods = 1, closed = "right" )
Expr_rolling_quantile_by( by, window_size, ..., quantile, interpolation = "nearest", min_periods = 1, closed = "right" )
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
quantile |
Either a numeric value or an Expr whose value must be between 0 and 1. |
interpolation |
One of |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since that method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_quantile = pl$col("index")$rolling_quantile_by( "date", window_size = "2h", quantile = 0.3 ) )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_quantile = pl$col("index")$rolling_quantile_by( "date", window_size = "2h", quantile = 0.3 ) )
Compute the rolling (= moving) skewness over the values in this array. A
window of length window_size
will traverse the array.
Expr_rolling_skew(window_size, bias = TRUE)
Expr_rolling_skew(window_size, bias = TRUE)
window_size |
Integer specifying the length of the window. |
bias |
If |
For normally distributed data, the skewness should be about zero. For uni-modal continuous distributions, a skewness value greater than zero means that there is more weight in the right tail of the distribution.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_skew = pl$col("a")$rolling_skew(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_skew = pl$col("a")$rolling_skew(window_size = 2))
Compute the rolling (= moving) standard deviation over the values in this
array. A window of length window_size
will traverse the array. The values
that fill this window will (optionally) be multiplied with the weights given
by the weights
vector.
Expr_rolling_std( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE, ddof = 1 )
Expr_rolling_std( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE, ddof = 1 )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_std = pl$col("a")$rolling_std(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_std = pl$col("a")$rolling_std(window_size = 2))
Compute a rolling standard deviation based on another column
Expr_rolling_std_by( by, window_size, ..., min_periods = 1, closed = "right", ddof = 1 )
Expr_rolling_std_by( by, window_size, ..., min_periods = 1, closed = "right", ddof = 1 )
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal # Compute the rolling std with the temporal windows closed on the right (default) df_temporal$with_columns( rolling_row_std = pl$col("index")$rolling_std_by("date", window_size = "2h") ) # Compute the rolling std with the closure of windows on both sides df_temporal$with_columns( rolling_row_std = pl$col("index")$rolling_std_by("date", window_size = "2h", closed = "both") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal # Compute the rolling std with the temporal windows closed on the right (default) df_temporal$with_columns( rolling_row_std = pl$col("index")$rolling_std_by("date", window_size = "2h") ) # Compute the rolling std with the closure of windows on both sides df_temporal$with_columns( rolling_row_std = pl$col("index")$rolling_std_by("date", window_size = "2h", closed = "both") )
Compute the rolling (= moving) sum over the values in this array. A window of
length window_size
will traverse the array. The values that fill this window
will (optionally) be multiplied with the weights given by the weights
vector.
Expr_rolling_sum( window_size, weights = NULL, min_periods = NULL, center = FALSE )
Expr_rolling_sum( window_size, weights = NULL, min_periods = NULL, center = FALSE )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
center |
Set the labels at the center of the window |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_sum = pl$col("a")$rolling_sum(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_sum = pl$col("a")$rolling_sum(window_size = 2))
Apply a rolling sum based on another column.
Expr_rolling_sum_by(by, window_size, ..., min_periods = 1, closed = "right")
Expr_rolling_sum_by(by, window_size, ..., min_periods = 1, closed = "right")
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_sum = pl$col("index")$rolling_sum_by("date", window_size = "3h") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal df_temporal$with_columns( rolling_row_sum = pl$col("index")$rolling_sum_by("date", window_size = "3h") )
Compute the rolling (= moving) variance over the values in this array. A
window of length window_size
will traverse the array. The values that fill
this window will (optionally) be multiplied with the weights given by the
weights
vector.
Expr_rolling_var( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE, ddof = 1 )
Expr_rolling_var( window_size, weights = NULL, min_periods = NULL, ..., center = FALSE, ddof = 1 )
window_size |
Integer specifying the length of the window. |
weights |
An optional slice with the same length as the window that will be multiplied elementwise with the values in the window. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
... |
Ignored. |
center |
Set the labels at the center of the window |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_var = pl$col("a")$rolling_var(window_size = 2))
pl$DataFrame(a = c(1, 3, 2, 4, 5, 6))$ with_columns(roll_var = pl$col("a")$rolling_var(window_size = 2))
Compute a rolling variance based on another column
Expr_rolling_var_by( by, window_size, ..., min_periods = 1, closed = "right", ddof = 1 )
Expr_rolling_var_by( by, window_size, ..., min_periods = 1, closed = "right", ddof = 1 )
by |
|
window_size |
The length of the window. Can be a fixed integer size, or a dynamic temporal size indicated by the following string language:
|
... |
Ignored. |
min_periods |
The number of values in the window that should be non-null
before computing a result. If |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
If you want to compute multiple aggregation statistics over the same dynamic
window, consider using $rolling()
instead, since this method can cache the window size
computation.
Expr
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal # Compute the rolling var with the temporal windows closed on the right (default) df_temporal$with_columns( rolling_row_var = pl$col("index")$rolling_var_by("date", window_size = "2h") ) # Compute the rolling var with the closure of windows on both sides df_temporal$with_columns( rolling_row_var = pl$col("index")$rolling_var_by("date", window_size = "2h", closed = "both") )
df_temporal = pl$DataFrame( date = pl$datetime_range(as.Date("2001-1-1"), as.Date("2001-1-2"), "1h") )$with_row_index("index") df_temporal # Compute the rolling var with the temporal windows closed on the right (default) df_temporal$with_columns( rolling_row_var = pl$col("index")$rolling_var_by("date", window_size = "2h") ) # Compute the rolling var with the closure of windows on both sides df_temporal$with_columns( rolling_row_var = pl$col("index")$rolling_var_by("date", window_size = "2h", closed = "both") )
Round underlying floating point data by decimals
digits.
Expr_round(decimals)
Expr_round(decimals)
decimals |
Number of decimals to round by. |
Expr
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( round = pl$col("a")$round(1) )
pl$DataFrame(a = c(0.33, 0.5, 1.02, 1.5, NaN, NA, Inf, -Inf))$with_columns( round = pl$col("a")$round(1) )
Take a sample
Expr_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
Expr_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
n |
Number of items to return. Cannot be used with |
... |
Ignored. |
fraction |
Fraction of items to return. Cannot be used with |
with_replacement |
If |
shuffle |
Shuffle the order of sampled data points (implicitly |
seed |
Numeric value from 0 to 2^52. Seed for the random number generator.
If |
Expr
df = pl$DataFrame(a = 1:4) df$select(pl$col("a")$sample(fraction = 1, with_replacement = TRUE, seed = 1L)) df$select(pl$col("a")$sample(fraction = 2, with_replacement = TRUE, seed = 1L)) df$select(pl$col("a")$sample(n = 2, with_replacement = FALSE, seed = 1L))
df = pl$DataFrame(a = 1:4) df$select(pl$col("a")$sample(fraction = 1, with_replacement = TRUE, seed = 1L)) df$select(pl$col("a")$sample(fraction = 2, with_replacement = TRUE, seed = 1L)) df$select(pl$col("a")$sample(n = 2, with_replacement = FALSE, seed = 1L))
Find indices where elements should be inserted to maintain order.
Expr_search_sorted(element)
Expr_search_sorted(element)
element |
Element to insert. Can be an Expr or something coercible to an Expr. |
This function looks up where to insert element to keep self column sorted. It is assumed the column is already sorted in ascending order (otherwise this leads to wrong results).
Expr
df = pl$DataFrame(a = c(1, 3, 4, 4, 6)) df # in which row should 5 be inserted in order to not break the sort? # (value is 0-indexed) df$select( zero = pl$col("a")$search_sorted(0), three = pl$col("a")$search_sorted(3), five = pl$col("a")$search_sorted(5) )
df = pl$DataFrame(a = c(1, 3, 4, 4, 6)) df # in which row should 5 be inserted in order to not break the sort? # (value is 0-indexed) df$select( zero = pl$col("a")$search_sorted(0), three = pl$col("a")$search_sorted(3), five = pl$col("a")$search_sorted(5) )
This enables downstream code to use fast paths for sorted arrays. WARNING: this doesn't check whether the data is actually sorted; you have to ensure that yourself.
Expr_set_sorted(..., descending = FALSE)
Expr_set_sorted(..., descending = FALSE)
... |
Ignored. |
descending |
Sort the columns in descending order. |
Expr
# correct use flag something correctly as ascendingly sorted s = pl$select(pl$lit(1:4)$set_sorted()$alias("a"))$get_column("a") s$flags # incorrect use, flag something as not sorted ascendingly s2 = pl$select(pl$lit(c(1, 3, 2, 4))$set_sorted()$alias("a"))$get_column("a") s2$sort() s2$flags # returns TRUE while it's not actually sorted
# correct use flag something correctly as ascendingly sorted s = pl$select(pl$lit(1:4)$set_sorted()$alias("a"))$get_column("a") s$flags # incorrect use, flag something as not sorted ascendingly s2 = pl$select(pl$lit(c(1, 3, 2, 4))$set_sorted()$alias("a"))$get_column("a") s2$sort() s2$flags # returns TRUE while it's not actually sorted
Shift values by the given number of indices
Expr_shift(n = 1, fill_value = NULL)
Expr_shift(n = 1, fill_value = NULL)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
fill_value |
Fill the resulting null values with this value. Accepts expression input. Non-expression inputs are parsed as literals. |
Expr
pl$DataFrame(a = c(1, 2, 4, 5, 8))$ with_columns( pl$col("a")$shift(-2)$alias("shift-2"), pl$col("a")$shift(2)$alias("shift+2") )
pl$DataFrame(a = c(1, 2, 4, 5, 8))$ with_columns( pl$col("a")$shift(-2)$alias("shift-2"), pl$col("a")$shift(2)$alias("shift+2") )
Shrink to the dtype needed to fit the extrema of this Series. This can be used to reduce memory pressure.
Expr_shrink_dtype()
Expr_shrink_dtype()
Expr
df = pl$DataFrame( a = 1:3, b = c(1, 2, 3) ) df df$with_columns(pl$all()$shrink_dtype()$name$suffix("_shrunk"))
df = pl$DataFrame( a = 1:3, b = c(1, 2, 3) ) df df$with_columns(pl$all()$shrink_dtype()$name$suffix("_shrunk"))
Shuffle values
Expr_shuffle(seed = NULL)
Expr_shuffle(seed = NULL)
seed |
Numeric value from 0 to 2^52. Seed for the random number generator.
If |
Expr
pl$DataFrame(a = 1:4)$with_columns(shuff = pl$col("a")$shuffle(seed = 1))
pl$DataFrame(a = 1:4)$with_columns(shuff = pl$col("a")$shuffle(seed = 1))
Get the sign of elements
Expr_sign()
Expr_sign()
Expr
pl$DataFrame(a = c(.9, -3, -0, 0, 4, NA_real_))$ with_columns(sign = pl$col("a")$sign())
pl$DataFrame(a = c(.9, -3, -0, 0, 4, NA_real_))$ with_columns(sign = pl$col("a")$sign())
Compute sine
Expr_sin()
Expr_sin()
Expr
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(sine = pl$col("a")$sin())
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(sine = pl$col("a")$sin())
Compute hyperbolic sine
Expr_sinh()
Expr_sinh()
Expr
pl$DataFrame(a = c(-1, asinh(0.5), 0, 1, NA_real_))$ with_columns(sinh = pl$col("a")$sinh())
pl$DataFrame(a = c(-1, asinh(0.5), 0, 1, NA_real_))$ with_columns(sinh = pl$col("a")$sinh())
Compute the sample skewness of a data set.
Expr_skew(bias = TRUE)
Expr_skew(bias = TRUE)
bias |
If |
For normally distributed data, the skewness should be about zero. For uni-modal continuous distributions, a skewness value greater than zero means that there is more weight in the right tail of the distribution.
Expr
df = pl$DataFrame(list(a = c(1:3, 2:1))) df$select(pl$col("a")$skew())
df = pl$DataFrame(list(a = c(1:3, 2:1))) df$select(pl$col("a")$skew())
Performing a slice of length 1 on a subset of columns will recycle this value in those columns but will not change the number of rows in the data. See examples.
Expr_slice(offset, length = NULL)
Expr_slice(offset, length = NULL)
offset |
Numeric or expression, zero-indexed. Indicates where to start the slice. A negative value is one-indexed and starts from the end. |
length |
Maximum number of elements contained in the slice. Default is full data. |
Expr
# as head pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(0, 6) ) # as tail pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(-6, 6) ) pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(80) ) # recycling as_polars_df(mtcars)$with_columns(pl$col("mpg")$slice(0, 1)$first())
# as head pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(0, 6) ) # as tail pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(-6, 6) ) pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(80) ) # recycling as_polars_df(mtcars)$with_columns(pl$col("mpg")$slice(0, 1)$first())
Sort this column. If used in a groupby context, the groups are sorted.
Expr_sort(..., descending = FALSE, nulls_last = FALSE)
Expr_sort(..., descending = FALSE, nulls_last = FALSE)
... |
Ignored. |
descending |
A logical. If |
nulls_last |
A logical. If |
Expr
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$ with_columns(sorted = pl$col("a")$sort())
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$ with_columns(sorted = pl$col("a")$sort())
Sort this column by the ordering of another column, or multiple other columns. If used in a groupby context, the groups are sorted.
Expr_sort_by( by, ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, maintain_order = FALSE )
Expr_sort_by( by, ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, maintain_order = FALSE )
by |
One expression or a list of expressions and/or strings (interpreted as column names). |
... |
Ignored. |
descending |
A logical. If |
nulls_last |
A logical. If |
multithreaded |
A logical. If |
maintain_order |
A logical to indicate whether the order should be maintained if elements are equal. |
Expr
df = pl$DataFrame( group = c("a", "a", "a", "b", "b", "b"), value1 = c(98, 1, 3, 2, 99, 100), value2 = c("d", "f", "b", "e", "c", "a") ) # by one column/expression df$with_columns( sorted = pl$col("group")$sort_by("value1") ) # by two columns/expressions df$with_columns( sorted = pl$col("group")$sort_by( list("value2", pl$col("value1")), descending = c(TRUE, FALSE) ) ) # by some expression df$with_columns( sorted = pl$col("group")$sort_by(pl$col("value1")$sort(descending = TRUE)) )
df = pl$DataFrame( group = c("a", "a", "a", "b", "b", "b"), value1 = c(98, 1, 3, 2, 99, 100), value2 = c("d", "f", "b", "e", "c", "a") ) # by one column/expression df$with_columns( sorted = pl$col("group")$sort_by("value1") ) # by two columns/expressions df$with_columns( sorted = pl$col("group")$sort_by( list("value2", pl$col("value1")), descending = c(TRUE, FALSE) ) ) # by some expression df$with_columns( sorted = pl$col("group")$sort_by(pl$col("value1")$sort(descending = TRUE)) )
Compute the square root of the elements
Expr_sqrt()
Expr_sqrt()
Expr
pl$DataFrame(a = -1:3)$with_columns(a_sqrt = pl$col("a")$sqrt())
pl$DataFrame(a = -1:3)$with_columns(a_sqrt = pl$col("a")$sqrt())
Get standard deviation
Expr_std(ddof = 1)
Expr_std(ddof = 1)
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
Expr
pl$select(pl$lit(1:5)$std())
pl$select(pl$lit(1:5)$std())
Method equivalent of subtraction operator expr - other
.
Expr_sub(other)
Expr_sub(other)
other |
Numeric literal or expression value. |
df = pl$DataFrame(x = 0:4) df$with_columns( `x-2` = pl$col("x")$sub(2), `x-expr` = pl$col("x")$sub(pl$col("x")$cum_sum()) )
df = pl$DataFrame(x = 0:4) df$with_columns( `x-2` = pl$col("x")$sub(2), `x-expr` = pl$col("x")$sub(pl$col("x")$cum_sum()) )
Get sum value
Expr_sum()
Expr_sum()
The dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
Expr
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(sum = pl$col("x")$sum())
pl$DataFrame(x = c(1L, NA, 2L))$ with_columns(sum = pl$col("x")$sum())
Get the last n elements
Expr_tail(n = 10)
Expr_tail(n = 10)
n |
Number of elements to take. |
Expr
pl$DataFrame(x = 1:11)$select(pl$col("x")$tail(3))
pl$DataFrame(x = 1:11)$select(pl$col("x")$tail(3))
Compute tangent
Expr_tan()
Expr_tan()
Expr
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(tangent = pl$col("a")$tan())
pl$DataFrame(a = c(0, pi / 2, pi, NA_real_))$ with_columns(tangent = pl$col("a")$tan())
Compute hyperbolic tangent
Expr_tanh()
Expr_tanh()
Expr
pl$DataFrame(a = c(-1, atanh(0.5), 0, 1, NA_real_))$ with_columns(tanh = pl$col("a")$tanh())
pl$DataFrame(a = c(-1, atanh(0.5), 0, 1, NA_real_))$ with_columns(tanh = pl$col("a")$tanh())
The following DataTypes will be converted:
Date -> Int32
Datetime -> Int64
Time -> Int64
Duration -> Int64
Categorical -> UInt32
List(inner) -> List(physical of inner)
Other data types will be left unchanged.
Expr_to_physical()
Expr_to_physical()
Expr
pl$DataFrame( list(vals = c("a", "x", NA, "a", "b")) )$with_columns( pl$col("vals")$cast(pl$Categorical()), pl$col("vals") $cast(pl$Categorical()) $to_physical() $alias("vals_physical") )
pl$DataFrame( list(vals = c("a", "x", NA, "a", "b")) )$with_columns( pl$col("vals")$cast(pl$Categorical()), pl$col("vals") $cast(pl$Categorical()) $to_physical() $alias("vals_physical") )
This is mostly useful to debug an expression. It evaluates the Expr in an empty DataFrame and returns the first Series to R.
Expr_to_r( df = NULL, i = 0, ..., int64_conversion = polars_options()$int64_conversion )
Expr_to_r( df = NULL, i = 0, ..., int64_conversion = polars_options()$int64_conversion )
df |
If |
i |
Numeric column to extract. Default is zero (which gives the first column). |
... |
Any args passed to |
int64_conversion |
How should Int64 values be handled when converting a polars object to R?
|
R object
pl$lit(1:3)$to_r()
pl$lit(1:3)$to_r()
Collect an expression based on literals into a Series.
Expr_to_series()
Expr_to_series()
Series
pl$lit(1:5)$to_series()
pl$lit(1:5)$to_series()
Return the k
largest elements. This has time complexity: $O(n + k \log n - \frac{k}{2})$.
Expr_top_k(k)
Expr_top_k(k)
k |
Number of top values to get. |
Expr
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$top_k(5))
pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$top_k(5))
Get unique values
Expr_unique(maintain_order = FALSE)
Expr_unique(maintain_order = FALSE)
maintain_order |
If |
Expr
as_polars_df(iris)$select(pl$col("Species")$unique())
as_polars_df(iris)$select(pl$col("Species")$unique())
Return a count of the unique values in the order of appearance. This method
differs from $value_counts()
in that it does not return the values, only
the counts and it might be faster.
Expr_unique_counts()
Expr_unique_counts()
Expr
as_polars_df(iris)$select(pl$col("Species")$unique_counts())
as_polars_df(iris)$select(pl$col("Species")$unique_counts())
Find the upper bound of a DataType
Expr_upper_bound()
Expr_upper_bound()
Expr
pl$DataFrame( x = c(1, 2, 3), y = -2:0, schema = list(x = pl$Float64, y = pl$Int32) )$ select(pl$all()$upper_bound())
pl$DataFrame( x = c(1, 2, 3), y = -2:0, schema = list(x = pl$Float64, y = pl$Int32) )$ select(pl$all()$upper_bound())
Count all unique values and create a struct mapping value to count.
Expr_value_counts(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE)
Expr_value_counts(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE)
... |
Ignored. |
sort |
Ensure the output is sorted from most values to least. |
parallel |
Better to turn this off in the aggregation context, as it can lead to contention. |
name |
Give the resulting count column a specific name. The default is
|
normalize |
If |
Expr
df = as_polars_df(iris) df$select(pl$col("Species")$value_counts())$unnest() df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest()
df = as_polars_df(iris) df$select(pl$col("Species")$value_counts())$unnest() df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest()
Get variance
Expr_var(ddof = 1)
Expr_var(ddof = 1)
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
Expr
pl$select(pl$lit(1:5)$var())
pl$select(pl$lit(1:5)$var())
when-then-otherwise
is similar to R ifelse()
. Always initiated by a
pl$when(<condition>)$then(<value if condition>)
, and optionally followed
by chaining one or more $when(<condition>)$then(<value if condition>)
statements.
pl_when(...) When_then(statement) Then_when(...) Then_otherwise(statement) ChainedWhen_then(statement) ChainedThen_when(...) ChainedThen_otherwise(statement)
pl_when(...) When_then(statement) Then_when(...) Then_otherwise(statement) ChainedWhen_then(statement) ChainedThen_when(...) ChainedThen_otherwise(statement)
... |
Expr or something coercible to an Expr that returns a boolean for each row. |
statement |
Expr or something coercible to
an Expr value to insert in |
Chained when-then operations should be read like if, else if, else if, ...
in R,
not as if, if, if, ...
, i.e. the first condition that evaluates to true
will
be picked.
If none of the conditions are true
, an optional
$otherwise(<value if all statements are false>)
can be appended at the end.
If not appended, and none of the conditions are true
, null
will be returned.
RPolarsThen
objects and RPolarsChainedThen
objects (returned by $then()
)
store the same methods as Expr.
pl$when()
returns a When
object
<When>$then()
returns a Then
object
<Then>$when()
returns a ChainedWhen
object
<ChainedWhen>$then()
returns a ChainedThen
object
$otherwise()
returns an Expr object.
df = pl$DataFrame(foo = c(1, 3, 4), bar = c(3, 4, 0)) # Add a column with the value 1, where column "foo" > 2 and the value -1 # where it isn’t. df$with_columns( val = pl$when(pl$col("foo") > 2)$then(1)$otherwise(-1) ) # With multiple when-then chained: df$with_columns( val = pl$when(pl$col("foo") > 2) $then(1) $when(pl$col("bar") > 2) $then(4) $otherwise(-1) ) # The `$otherwise` at the end is optional. # If left out, any rows where none of the `$when()` expressions are evaluated to `true`, # are set to `null` df$with_columns( val = pl$when(pl$col("foo") > 2)$then(1) ) # Pass multiple predicates, each of which must be met: df$with_columns( val = pl$when( pl$col("bar") > 0, pl$col("foo") %% 2 != 0 ) $then(99) $otherwise(-1) ) # In `$then()`, a character vector is parsed as column names df$with_columns(baz = pl$when(pl$col("foo") %% 2 == 1)$then("bar")) # So use `pl$lit()` to insert a string df$with_columns(baz = pl$when(pl$col("foo") %% 2 == 1)$then(pl$lit("bar")))
df = pl$DataFrame(foo = c(1, 3, 4), bar = c(3, 4, 0)) # Add a column with the value 1, where column "foo" > 2 and the value -1 # where it isn’t. df$with_columns( val = pl$when(pl$col("foo") > 2)$then(1)$otherwise(-1) ) # With multiple when-then chained: df$with_columns( val = pl$when(pl$col("foo") > 2) $then(1) $when(pl$col("bar") > 2) $then(4) $otherwise(-1) ) # The `$otherwise` at the end is optional. # If left out, any rows where none of the `$when()` expressions are evaluated to `true`, # are set to `null` df$with_columns( val = pl$when(pl$col("foo") > 2)$then(1) ) # Pass multiple predicates, each of which must be met: df$with_columns( val = pl$when( pl$col("bar") > 0, pl$col("foo") %% 2 != 0 ) $then(99) $otherwise(-1) ) # In `$then()`, a character vector is parsed as column names df$with_columns(baz = pl$when(pl$col("foo") %% 2 == 1)$then("bar")) # So use `pl$lit()` to insert a string df$with_columns(baz = pl$when(pl$col("foo") %% 2 == 1)$then(pl$lit("bar")))
Combine two boolean expressions with XOR.
Expr_xor(other)
Expr_xor(other)
other |
numeric or string value; accepts expression input. |
pl$lit(TRUE)$xor(pl$lit(FALSE))
pl$lit(TRUE)$xor(pl$lit(FALSE))
Evaluate whether all boolean values in an array are true
ExprArr_all()
ExprArr_all()
Expr
df = pl$DataFrame( values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)), schema = list(values = pl$Array(pl$Boolean, 2)) ) df$with_columns(all = pl$col("values")$arr$all())
df = pl$DataFrame( values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)), schema = list(values = pl$Array(pl$Boolean, 2)) ) df$with_columns(all = pl$col("values")$arr$all())
Evaluate whether any boolean values in an array are true
ExprArr_any()
ExprArr_any()
Expr
df = pl$DataFrame( values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)), schema = list(values = pl$Array(pl$Boolean, 2)) ) df$with_columns(any = pl$col("values")$arr$any())
df = pl$DataFrame( values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)), schema = list(values = pl$Array(pl$Boolean, 2)) ) df$with_columns(any = pl$col("values")$arr$any())
Get the index of the maximal value in an array
ExprArr_arg_max()
ExprArr_arg_max()
Expr
df = pl$DataFrame( values = list(1:2, 2:1), schema = list(values = pl$Array(pl$Int32, 2)) ) df$with_columns( arg_max = pl$col("values")$arr$arg_max() )
df = pl$DataFrame( values = list(1:2, 2:1), schema = list(values = pl$Array(pl$Int32, 2)) ) df$with_columns( arg_max = pl$col("values")$arr$arg_max() )
Get the index of the minimal value in an array
ExprArr_arg_min()
ExprArr_arg_min()
Expr
df = pl$DataFrame( values = list(1:2, 2:1), schema = list(values = pl$Array(pl$Int32, 2)) ) df$with_columns( arg_min = pl$col("values")$arr$arg_min() )
df = pl$DataFrame( values = list(1:2, 2:1), schema = list(values = pl$Array(pl$Int32, 2)) ) df$with_columns( arg_min = pl$col("values")$arr$arg_min() )
Check if array contains a given value
ExprArr_contains(item)
ExprArr_contains(item)
item |
Expr or something coercible to an Expr. Strings are not parsed as columns. |
Expr
df = pl$DataFrame( values = list(0:2, 4:6, c(NA_integer_, NA_integer_, NA_integer_)), item = c(0L, 4L, 2L), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns( with_expr = pl$col("values")$arr$contains(pl$col("item")), with_lit = pl$col("values")$arr$contains(1) )
df = pl$DataFrame( values = list(0:2, 4:6, c(NA_integer_, NA_integer_, NA_integer_)), item = c(0L, 4L, 2L), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns( with_expr = pl$col("values")$arr$contains(pl$col("item")), with_lit = pl$col("values")$arr$contains(1) )
This allows extracting only one value per array.
ExprArr_get(index, ..., null_on_oob = TRUE)
ExprArr_get(index, ..., null_on_oob = TRUE)
index |
An Expr or something coercible to an Expr, that must return a
single index. Values are 0-indexed (so index 0 would return the first item
of every sub-array) and negative values start from the end (index |
... |
Ignored. |
null_on_oob |
If |
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), idx = c(1, NA, 3), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns( using_expr = pl$col("values")$arr$get("idx"), val_0 = pl$col("values")$arr$get(0), val_minus_1 = pl$col("values")$arr$get(-1), val_oob = pl$col("values")$arr$get(10) )
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), idx = c(1, NA, 3), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns( using_expr = pl$col("values")$arr$get("idx"), val_0 = pl$col("values")$arr$get(0), val_minus_1 = pl$col("values")$arr$get(-1), val_oob = pl$col("values")$arr$get(10) )
Join all string items in a sub-array and place a separator between them. This
only works on columns of type list[str]
.
ExprArr_join(separator, ignore_nulls = FALSE)
ExprArr_join(separator, ignore_nulls = FALSE)
separator |
String to separate the items with. Can be an Expr. Strings are not parsed as columns. |
ignore_nulls |
If |
Expr
df = pl$DataFrame( values = list(c("a", "b", "c"), c("x", "y", "z"), c("e", NA, NA)), separator = c("-", "+", "/"), schema = list(values = pl$Array(pl$String, 3)) ) df$with_columns( join_with_expr = pl$col("values")$arr$join(pl$col("separator")), join_with_lit = pl$col("values")$arr$join(" "), join_ignore_null = pl$col("values")$arr$join(" ", ignore_nulls = TRUE) )
df = pl$DataFrame( values = list(c("a", "b", "c"), c("x", "y", "z"), c("e", NA, NA)), separator = c("-", "+", "/"), schema = list(values = pl$Array(pl$String, 3)) ) df$with_columns( join_with_expr = pl$col("values")$arr$join(pl$col("separator")), join_with_lit = pl$col("values")$arr$join(" "), join_ignore_null = pl$col("values")$arr$join(" ", ignore_nulls = TRUE) )
Find the maximum value in an array
ExprArr_max()
ExprArr_max()
This method is only available with the "nightly" feature.
See polars_info()
for more details.
Expr
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(max = pl$col("values")$arr$max())
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(max = pl$col("values")$arr$max())
Find the median in an array
ExprArr_median()
ExprArr_median()
Expr
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(median = pl$col("values")$arr$median())
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(median = pl$col("values")$arr$median())
Find the minimum value in an array
ExprArr_min()
ExprArr_min()
This method is only available with the "nightly" feature.
See polars_info()
for more details.
Expr
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(min = pl$col("values")$arr$min())
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(min = pl$col("values")$arr$min())
Reverse values in an array
ExprArr_reverse()
ExprArr_reverse()
Expr
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(reverse = pl$col("values")$arr$reverse())
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(reverse = pl$col("values")$arr$reverse())
Shift array values by `n` indices
ExprArr_shift(n = 1)
ExprArr_shift(n = 1)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
Expr
df = pl$DataFrame( values = list(1:3, c(2L, NA_integer_, 5L)), idx = 1:2, schema = list(values = pl$Array(pl$Int32, 3)) ) df$with_columns( shift_by_expr = pl$col("values")$arr$shift(pl$col("idx")), shift_by_lit = pl$col("values")$arr$shift(2) )
df = pl$DataFrame( values = list(1:3, c(2L, NA_integer_, 5L)), idx = 1:2, schema = list(values = pl$Array(pl$Int32, 3)) ) df$with_columns( shift_by_expr = pl$col("values")$arr$shift(pl$col("idx")), shift_by_lit = pl$col("values")$arr$shift(2) )
Sort values in an array
ExprArr_sort(descending = FALSE, nulls_last = FALSE)
ExprArr_sort(descending = FALSE, nulls_last = FALSE)
descending |
A logical. If |
nulls_last |
A logical. If |
df = pl$DataFrame( values = list(c(2, 1), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(sort = pl$col("values")$arr$sort(nulls_last = TRUE))
df = pl$DataFrame( values = list(c(2, 1), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(sort = pl$col("values")$arr$sort(nulls_last = TRUE))
Find the standard deviation in an array
ExprArr_std(ddof = 1)
ExprArr_std(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
Expr
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(std = pl$col("values")$arr$std())
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(std = pl$col("values")$arr$std())
Sum all elements in an array
ExprArr_sum()
ExprArr_sum()
Expr
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(sum = pl$col("values")$arr$sum())
df = pl$DataFrame( values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(sum = pl$col("values")$arr$sum())
Convert an Array column into a List column with the same inner data type
ExprArr_to_list()
ExprArr_to_list()
df = pl$DataFrame( a = list(c(1, 2), c(3, 4)), schema = list(a = pl$Array(pl$Int8, 2)) ) df$with_columns( list = pl$col("a")$arr$to_list() )
df = pl$DataFrame( a = list(c(1, 2), c(3, 4)), schema = list(a = pl$Array(pl$Int8, 2)) ) df$with_columns( list = pl$col("a")$arr$to_list() )
Convert array to struct
ExprArr_to_struct(fields = NULL)
ExprArr_to_struct(fields = NULL)
fields |
If the name and number of the desired fields is known in
advance, a list of field names can be given, which will be assigned by
index. Otherwise, to dynamically assign field names, a custom R function
that takes an R double and outputs a string value can be used. If
|
Expr
df = pl$DataFrame( values = list(1:3, c(2L, NA_integer_, 5L)), schema = list(values = pl$Array(pl$Int32, 3)) ) df$with_columns( struct = pl$col("values")$arr$to_struct() ) # pass a custom function that will name all fields by adding a prefix df2 = df$with_columns( pl$col("values")$arr$to_struct( fields = \(idx) paste0("col_", idx) ) ) df2 df2$unnest()
df = pl$DataFrame( values = list(1:3, c(2L, NA_integer_, 5L)), schema = list(values = pl$Array(pl$Int32, 3)) ) df$with_columns( struct = pl$col("values")$arr$to_struct() ) # pass a custom function that will name all fields by adding a prefix df2 = df$with_columns( pl$col("values")$arr$to_struct( fields = \(idx) paste0("col_", idx) ) ) df2 df2$unnest()
Get unique values in an array
ExprArr_unique(maintain_order = FALSE)
ExprArr_unique(maintain_order = FALSE)
maintain_order |
If |
Expr
df = pl$DataFrame( values = list(c(1, 1, 2), c(4, 4, 4), c(NA_real_, 6, 7)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(unique = pl$col("values")$arr$unique())
df = pl$DataFrame( values = list(c(1, 1, 2), c(4, 4, 4), c(NA_real_, 6, 7)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(unique = pl$col("values")$arr$unique())
Find the variance in an array
ExprArr_var(ddof = 1)
ExprArr_var(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
Expr
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(var = pl$col("values")$arr$var())
df = pl$DataFrame( values = list(c(2, 1, 4), c(8.4, 3.2, 1)), schema = list(values = pl$Array(pl$Float64, 3)) ) df$with_columns(var = pl$col("values")$arr$var())
Check if binaries contain a binary substring
ExprBin_contains(literal)
ExprBin_contains(literal)
literal |
The binary substring to look for. |
Expr returning a Boolean
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), lit = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", contains_with_lit = pl$col("code")$bin$contains("xff"), contains_with_expr = pl$col("code")$bin$contains(pl$col("lit")) )
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), lit = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", contains_with_lit = pl$col("code")$bin$contains("xff"), contains_with_expr = pl$col("code")$bin$contains(pl$col("lit")) )
Decode values using the provided encoding
ExprBin_decode(encoding, ..., strict = TRUE)
ExprBin_decode(encoding, ..., strict = TRUE)
encoding |
A character, |
... |
Ignored. |
strict |
Raise an error if the underlying value cannot be decoded,
otherwise mask out with a |
Expr of data type String.
df = pl$DataFrame( name = c("black", "yellow", "blue"), code_hex = as_polars_series(c("000000", "ffff00", "0000ff"))$cast(pl$Binary), code_base64 = as_polars_series(c("AAAA", "//8A", "AAD/"))$cast(pl$Binary) ) df$with_columns( decoded_hex = pl$col("code_hex")$bin$decode("hex"), decoded_base64 = pl$col("code_base64")$bin$decode("base64") ) # Set `strict = FALSE` to set invalid values to `null` instead of raising an error. df = pl$DataFrame( colors = as_polars_series(c("000000", "ffff00", "invalid_value"))$cast(pl$Binary) ) df$select(pl$col("colors")$bin$decode("hex", strict = FALSE))
df = pl$DataFrame( name = c("black", "yellow", "blue"), code_hex = as_polars_series(c("000000", "ffff00", "0000ff"))$cast(pl$Binary), code_base64 = as_polars_series(c("AAAA", "//8A", "AAD/"))$cast(pl$Binary) ) df$with_columns( decoded_hex = pl$col("code_hex")$bin$decode("hex"), decoded_base64 = pl$col("code_base64")$bin$decode("base64") ) # Set `strict = FALSE` to set invalid values to `null` instead of raising an error. df = pl$DataFrame( colors = as_polars_series(c("000000", "ffff00", "invalid_value"))$cast(pl$Binary) ) df$select(pl$col("colors")$bin$decode("hex", strict = FALSE))
Encode a value using the provided encoding
ExprBin_encode(encoding)
ExprBin_encode(encoding)
encoding |
A character, |
Expr of data type String.
df = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series( c("000000", "ffff00", "0000ff") )$cast(pl$Binary)$bin$decode("hex") ) df$with_columns(encoded = pl$col("code")$bin$encode("hex"))
df = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series( c("000000", "ffff00", "0000ff") )$cast(pl$Binary)$bin$decode("hex") ) df$with_columns(encoded = pl$col("code")$bin$encode("hex"))
Check if values end with a binary substring
ExprBin_ends_with(suffix)
ExprBin_ends_with(suffix)
suffix |
Suffix substring. |
Expr returning a Boolean
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), suffix = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", ends_with_lit = pl$col("code")$bin$ends_with("xff"), ends_with_expr = pl$col("code")$bin$ends_with(pl$col("suffix")) )
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), suffix = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", ends_with_lit = pl$col("code")$bin$ends_with("xff"), ends_with_expr = pl$col("code")$bin$ends_with(pl$col("suffix")) )
Get the size of binary values in the given unit
ExprBin_size(unit = "b")
ExprBin_size(unit = "b")
unit |
Scale the returned size to the given unit. Can be |
Expr of data type UInt or Float.
df = pl$DataFrame( name = c("black", "yellow", "blue"), code_hex = as_polars_series(c("000000", "ffff00", "0000ff"))$cast(pl$Binary) ) df$with_columns( n_bytes = pl$col("code_hex")$bin$size(), n_kilobytes = pl$col("code_hex")$bin$size("kb") )
df = pl$DataFrame( name = c("black", "yellow", "blue"), code_hex = as_polars_series(c("000000", "ffff00", "0000ff"))$cast(pl$Binary) ) df$with_columns( n_bytes = pl$col("code_hex")$bin$size(), n_kilobytes = pl$col("code_hex")$bin$size("kb") )
Check if values start with a binary substring
ExprBin_starts_with(sub)
ExprBin_starts_with(sub)
sub |
Prefix substring. |
Expr returning a Boolean
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), prefix = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", starts_with_lit = pl$col("code")$bin$starts_with("xff"), starts_with_expr = pl$col("code")$bin$starts_with(pl$col("prefix")) )
colors = pl$DataFrame( name = c("black", "yellow", "blue"), code = as_polars_series(c("x00x00x00", "xffxffx00", "x00x00xff"))$cast(pl$Binary), prefix = as_polars_series(c("x00", "xffx00", "xffxff"))$cast(pl$Binary) ) colors$select( "name", starts_with_lit = pl$col("code")$bin$starts_with("xff"), starts_with_expr = pl$col("code")$bin$starts_with(pl$col("prefix")) )
Get the categories stored in this data type
ExprCat_get_categories()
ExprCat_get_categories()
A polars DataFrame with the categories for each categorical Series.
df = pl$DataFrame( cats = factor(c("z", "z", "k", "a", "b")), vals = factor(c(3, 1, 2, 2, 3)) ) df df$select( pl$col("cats")$cat$get_categories() ) df$select( pl$col("vals")$cat$get_categories() )
df = pl$DataFrame( cats = factor(c("z", "z", "k", "a", "b")), vals = factor(c(3, 1, 2, 2, 3)) ) df df$select( pl$col("cats")$cat$get_categories() ) df$select( pl$col("vals")$cat$get_categories() )
Determine how this categorical series should be sorted.
ExprCat_set_ordering(ordering)
ExprCat_set_ordering(ordering)
ordering |
string either 'physical' or 'lexical'
|
An Expr of datatype Categorical
df = pl$DataFrame( cats = factor(c("z", "z", "k", "a", "b")), vals = c(3, 1, 2, 2, 3) ) # sort by the string value of categories df$with_columns( pl$col("cats")$cat$set_ordering("lexical") )$sort("cats", "vals") # sort by the underlying value of categories df$with_columns( pl$col("cats")$cat$set_ordering("physical") )$sort("cats", "vals")
df = pl$DataFrame( cats = factor(c("z", "z", "k", "a", "b")), vals = c(3, 1, 2, 2, 3) ) # sort by the string value of categories df$with_columns( pl$col("cats")$cat$set_ordering("lexical") )$sort("cats", "vals") # sort by the underlying value of categories df$with_columns( pl$col("cats")$cat$set_ordering("physical") )$sort("cats", "vals")
Cast the underlying data to another time unit. This may lose precision. The corresponding global timepoint will stay unchanged +/- precision.
ExprDT_cast_time_unit(tu = c("ns", "us", "ms"))
ExprDT_cast_time_unit(tu = c("ns", "us", "ms"))
tu |
string option either 'ns', 'us', or 'ms' |
Expr of i64
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$cast_time_unit()$alias("cast_time_unit_ns"), pl$col("date")$dt$cast_time_unit(tu = "ms")$alias("cast_time_unit_ms") )
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$cast_time_unit()$alias("cast_time_unit_ns"), pl$col("date")$dt$cast_time_unit(tu = "ms")$alias("cast_time_unit_ms") )
If the underlying expression is a Datetime then its time component is replaced, and if it is a Date then a new Datetime is created by combining the two values.
ExprDT_combine(time, time_unit = "us")
ExprDT_combine(time, time_unit = "us")
time |
The time of day to combine with the Date/Datetime. Can be an Expr or a PTime. |
time_unit |
Unit of time. One of |
Date/Datetime expr
df = pl$DataFrame( dtm = c( ISOdatetime(2022, 12, 31, 10, 30, 45), ISOdatetime(2023, 7, 5, 23, 59, 59) ), dt = c(ISOdate(2022, 10, 10), ISOdate(2022, 7, 5)), tm = c(pl$time(1, 2, 3, 456000), pl$time(7, 8, 9, 101000)) )$explode("tm") df df$select( d1 = pl$col("dtm")$dt$combine(pl$col("tm")), s2 = pl$col("dt")$dt$combine(pl$col("tm")), d3 = pl$col("dt")$dt$combine(pl$time(4, 5, 6)) )
df = pl$DataFrame( dtm = c( ISOdatetime(2022, 12, 31, 10, 30, 45), ISOdatetime(2023, 7, 5, 23, 59, 59) ), dt = c(ISOdate(2022, 10, 10), ISOdate(2022, 7, 5)), tm = c(pl$time(1, 2, 3, 456000), pl$time(7, 8, 9, 101000)) )$explode("tm") df df$select( d1 = pl$col("dtm")$dt$combine(pl$col("tm")), s2 = pl$col("dt")$dt$combine(pl$col("tm")), d3 = pl$col("dt")$dt$combine(pl$time(4, 5, 6)) )
If converting from a time-zone-naive datetime, then conversion will happen as if converting from UTC, regardless of your system’s time zone.
ExprDT_convert_time_zone(time_zone)
ExprDT_convert_time_zone(time_zone)
time_zone |
String time zone from |
Expr of i64
df = pl$DataFrame( date = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-05-01", tz = "UTC"), "1mo1s" ) ) df$select( "date", London = pl$col("date")$dt$convert_time_zone("Europe/London") )
df = pl$DataFrame( date = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-05-01", tz = "UTC"), "1mo1s" ) ) df$select( "date", London = pl$col("date")$dt$convert_time_zone("Europe/London") )
Extract day from underlying Date representation. Applies to Date and Datetime columns. Returns the day of month starting from 1. The return value ranges from 1 to 31. (The last day of month differs by months.)
ExprDT_day()
ExprDT_day()
Expr of day as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$day()$alias("day") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$day()$alias("day") )
Get the time passed since the Unix EPOCH in the given time unit.
ExprDT_epoch(time_unit = "us")
ExprDT_epoch(time_unit = "us")
time_unit |
Time unit, one of |
Expr with datatype Int64
df = pl$DataFrame(date = pl$date_range(as.Date("2001-1-1"), as.Date("2001-1-3"))) df$with_columns( epoch_ns = pl$col("date")$dt$epoch(), epoch_s = pl$col("date")$dt$epoch(time_unit = "s") )
df = pl$DataFrame(date = pl$date_range(as.Date("2001-1-1"), as.Date("2001-1-3"))) df$with_columns( epoch_ns = pl$col("date")$dt$epoch(), epoch_s = pl$col("date")$dt$epoch(time_unit = "s") )
Extract hour from underlying Datetime representation. Applies to Datetime columns. Returns the hour number from 0 to 23.
ExprDT_hour()
ExprDT_hour()
Expr of hour as UInt32
df = pl$DataFrame( date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d2h", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$hour()$alias("hour") )
df = pl$DataFrame( date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d2h", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$hour()$alias("hour") )
Determine whether the year of the underlying date is a leap year
ExprDT_is_leap_year()
ExprDT_is_leap_year()
An Expr of datatype Bool
df = pl$DataFrame(date = as.Date(c("2000-01-01", "2001-01-01", "2002-01-01"))) df$with_columns( leap_year = pl$col("date")$dt$is_leap_year() )
df = pl$DataFrame(date = as.Date(c("2000-01-01", "2001-01-01", "2002-01-01"))) df$with_columns( leap_year = pl$col("date")$dt$is_leap_year() )
Extract ISO year from underlying Date representation. Applies to Date and Datetime columns. Returns the year number in the ISO standard. This may not correspond with the calendar year.
ExprDT_iso_year()
ExprDT_iso_year()
Expr of iso_year as Int32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$year()$alias("year"), pl$col("date")$dt$iso_year()$alias("iso_year") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$year()$alias("year"), pl$col("date")$dt$iso_year()$alias("iso_year") )
Applies to Datetime columns.
ExprDT_microsecond()
ExprDT_microsecond()
Expr of data type Int32
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( microsecond = pl$col("datetime")$dt$microsecond() )
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( microsecond = pl$col("datetime")$dt$microsecond() )
Applies to Datetime columns.
ExprDT_millisecond()
ExprDT_millisecond()
Expr of data type Int32
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( millisecond = pl$col("datetime")$dt$millisecond() )
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( millisecond = pl$col("datetime")$dt$millisecond() )
Extract minutes from underlying Datetime representation. Applies to Datetime columns. Returns the minute number from 0 to 59.
ExprDT_minute()
ExprDT_minute()
Expr of minute as UInt32
df = pl$DataFrame( date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d5s", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$minute()$alias("minute") )
df = pl$DataFrame( date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d5s", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$minute()$alias("minute") )
Extract month from underlying Date representation. Applies to Date and Datetime columns. Returns the month number starting from 1. The return value ranges from 1 to 12.
ExprDT_month()
ExprDT_month()
Expr of month as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$month()$alias("month") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$month()$alias("month") )
Applies to Datetime columns.
ExprDT_nanosecond()
ExprDT_nanosecond()
Expr of data type Int32
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( nanosecond = pl$col("datetime")$dt$nanosecond() )
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( nanosecond = pl$col("datetime")$dt$nanosecond() )
Offset this date by a relative time offset.
This differs from pl$col("foo_datetime_tu") + value_tu
in that it can
take months and leap years into account. Note that only a single minus
sign is allowed in the by
string, as the first character.
ExprDT_offset_by(by)
ExprDT_offset_by(by)
by |
Optional string encoding a duration; see Details. |
The `by` argument is created with the following string language:
1ns # 1 nanosecond
1us # 1 microsecond
1ms # 1 millisecond
1s # 1 second
1m # 1 minute
1h # 1 hour
1d # 1 day
1w # 1 calendar week
1mo # 1 calendar month
1y # 1 calendar year
1i # 1 index count
These strings can be combined:
3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
Date/Datetime expr
df = pl$DataFrame( dates = pl$date_range( as.Date("2000-1-1"), as.Date("2005-1-1"), "1y" ) ) df$select( pl$col("dates")$dt$offset_by("1y")$alias("date_plus_1y"), pl$col("dates")$dt$offset_by("-1y2mo")$alias("date_min") ) # the "by" argument also accepts expressions df = pl$DataFrame( dates = pl$datetime_range( as.POSIXct("2022-01-01", tz = "GMT"), as.POSIXct("2022-01-02", tz = "GMT"), interval = "6h", time_unit = "ms", time_zone = "GMT" )$to_r(), offset = c("1d", "-2d", "1mo", NA, "1y") ) df df$with_columns(new_dates = pl$col("dates")$dt$offset_by(pl$col("offset")))
df = pl$DataFrame( dates = pl$date_range( as.Date("2000-1-1"), as.Date("2005-1-1"), "1y" ) ) df$select( pl$col("dates")$dt$offset_by("1y")$alias("date_plus_1y"), pl$col("dates")$dt$offset_by("-1y2mo")$alias("date_min") ) # the "by" argument also accepts expressions df = pl$DataFrame( dates = pl$datetime_range( as.POSIXct("2022-01-01", tz = "GMT"), as.POSIXct("2022-01-02", tz = "GMT"), interval = "6h", time_unit = "ms", time_zone = "GMT" )$to_r(), offset = c("1d", "-2d", "1mo", NA, "1y") ) df df$with_columns(new_dates = pl$col("dates")$dt$offset_by(pl$col("offset")))
Extract ordinal day from underlying Date representation. Applies to Date and Datetime columns. Returns the day of year starting from 1. The return value ranges from 1 to 366. (The last day of year differs by years.)
ExprDT_ordinal_day()
ExprDT_ordinal_day()
Expr of ordinal_day as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$ordinal_day()$alias("ordinal_day") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$ordinal_day()$alias("ordinal_day") )
Extract quarter from underlying Date representation. Applies to Date and Datetime columns. Returns the quarter ranging from 1 to 4.
ExprDT_quarter()
ExprDT_quarter()
Expr of quarter as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$quarter()$alias("quarter") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$quarter()$alias("quarter") )
Cast time zone for a Series of type Datetime. This is different from
$convert_time_zone()
as it will also modify the
underlying timestamp. Use to correct a wrong time zone annotation. This will
change the corresponding global timepoint.
ExprDT_replace_time_zone( time_zone, ..., ambiguous = "raise", non_existent = "raise" )
ExprDT_replace_time_zone( time_zone, ..., ambiguous = "raise", non_existent = "raise" )
time_zone |
|
... |
Ignored. |
ambiguous |
Determine how to deal with ambiguous datetimes:
|
non_existent |
Determine how to deal with non-existent datetimes:
|
Expr of i64
df1 = pl$DataFrame( london_timezone = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-07-01", tz = "UTC"), "1mo1s" )$dt$convert_time_zone("Europe/London") ) df1$select( "london_timezone", London_to_Amsterdam = pl$col("london_timezone")$dt$replace_time_zone("Europe/Amsterdam") ) # You can use `ambiguous` to deal with ambiguous datetimes: dates = c( "2018-10-28 01:30", "2018-10-28 02:00", "2018-10-28 02:30", "2018-10-28 02:00" ) df2 = pl$DataFrame( ts = as_polars_series(dates)$str$strptime(pl$Datetime("us")), ambiguous = c("earliest", "earliest", "latest", "latest") ) df2$with_columns( ts_localized = pl$col("ts")$dt$replace_time_zone( "Europe/Brussels", ambiguous = pl$col("ambiguous") ) )
df1 = pl$DataFrame( london_timezone = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-07-01", tz = "UTC"), "1mo1s" )$dt$convert_time_zone("Europe/London") ) df1$select( "london_timezone", London_to_Amsterdam = pl$col("london_timezone")$dt$replace_time_zone("Europe/Amsterdam") ) # You can use `ambiguous` to deal with ambiguous datetimes: dates = c( "2018-10-28 01:30", "2018-10-28 02:00", "2018-10-28 02:30", "2018-10-28 02:00" ) df2 = pl$DataFrame( ts = as_polars_series(dates)$str$strptime(pl$Datetime("us")), ambiguous = c("earliest", "earliest", "latest", "latest") ) df2$with_columns( ts_localized = pl$col("ts")$dt$replace_time_zone( "Europe/Brussels", ambiguous = pl$col("ambiguous") ) )
Divide the date/datetime range into buckets. Each date/datetime in the first half of the interval is mapped to the start of its bucket. Each date/datetime in the second half of the interval is mapped to the end of its bucket.
ExprDT_round(every)
ExprDT_round(every)
every |
Either an Expr or a string indicating a column name or a duration (see Details). |
The `every` and `offset` arguments are created with the following string language:
1ns # 1 nanosecond
1us # 1 microsecond
1ms # 1 millisecond
1s # 1 second
1m # 1 minute
1h # 1 hour
1d # 1 day
1w # 1 calendar week
1mo # 1 calendar month
1y # 1 calendar year
These strings can be combined:
3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
Date/Datetime expr
t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") df = pl$DataFrame(datetime = s)$with_columns( pl$col("datetime")$dt$round("4s")$alias("rounded_4s") ) df
t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") df = pl$DataFrame(datetime = s)$with_columns( pl$col("datetime")$dt$round("4s")$alias("rounded_4s") ) df
Applies to Datetime columns.
Returns the integer second number from 0 to 59, or, if `fractional = TRUE`, a floating
point number from 0 up to (but not including) 60 that includes
any milli/micro/nanosecond component.
ExprDT_second(fractional = FALSE)
ExprDT_second(fractional = FALSE)
fractional |
A logical. Whether to include the fractional component of the second. |
Expr of data type Int8 or Float64
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( second = pl$col("datetime")$dt$second(), second_fractional = pl$col("datetime")$dt$second(fractional = TRUE) )
df = pl$DataFrame( datetime = as.POSIXct( c( "1978-01-01 01:01:01", "2024-10-13 05:30:14.500", "2065-01-01 10:20:30.06" ), "UTC" ) ) df$with_columns( second = pl$col("datetime")$dt$second(), second_fractional = pl$col("datetime")$dt$second(fractional = TRUE) )
Format Date/Datetime with a formatting rule.
See the chrono strftime/strptime documentation: <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>.
ExprDT_strftime(format)
ExprDT_strftime(format)
format |
Format string, very similar to the format strings used in R, passed to chrono. |
Expr of data type String
pl$lit(as.POSIXct("2021-01-02 12:13:14", tz = "GMT"))$dt$strftime("this is the year: %Y")$to_r()
pl$lit(as.POSIXct("2021-01-02 12:13:14", tz = "GMT"))$dt$strftime("this is the year: %Y")$to_r()
This only works on Datetime Series, it will error on Date Series.
ExprDT_time()
ExprDT_time()
A Time Expr
df = pl$DataFrame(dates = pl$datetime_range( as.Date("2000-1-1"), as.Date("2000-1-2"), "1h" )) df$with_columns(times = pl$col("dates")$dt$time())
df = pl$DataFrame(dates = pl$datetime_range( as.Date("2000-1-1"), as.Date("2000-1-2"), "1h" )) df$with_columns(times = pl$col("dates")$dt$time())
Return a timestamp in the given time unit.
ExprDT_timestamp(tu = c("ns", "us", "ms"))
ExprDT_timestamp(tu = c("ns", "us", "ms"))
tu |
string option either 'ns', 'us', or 'ms' |
Expr of i64
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$timestamp()$alias("timestamp_ns"), pl$col("date")$dt$timestamp(tu = "ms")$alias("timestamp_ms") )
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$timestamp()$alias("timestamp_ns"), pl$col("date")$dt$timestamp(tu = "ms")$alias("timestamp_ms") )
Extract the days from a Duration type.
ExprDT_total_days()
ExprDT_total_days()
Expr of i64
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), interval = "1mo1s" ) ) df$select( pl$col("date"), diff_days = pl$col("date")$diff()$dt$total_days() )
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), interval = "1mo1s" ) ) df$select( pl$col("date"), diff_days = pl$col("date")$diff()$dt$total_days() )
Extract the hours from a Duration type.
ExprDT_total_hours()
ExprDT_total_hours()
Expr of i64
df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ) ) df$select( pl$col("date"), diff_hours = pl$col("date")$diff()$dt$total_hours() )
df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ) ) df$select( pl$col("date"), diff_hours = pl$col("date")$diff()$dt$total_hours() )
Extract the microseconds from a Duration type.
ExprDT_total_microseconds()
ExprDT_total_microseconds()
Expr of i64
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_microsec = pl$col("date")$diff()$dt$total_microseconds() )
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_microsec = pl$col("date")$diff()$dt$total_microseconds() )
Extract the milliseconds from a Duration type.
ExprDT_total_milliseconds()
ExprDT_total_milliseconds()
Expr of i64
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_millisec = pl$col("date")$diff()$dt$total_milliseconds() )
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_millisec = pl$col("date")$diff()$dt$total_milliseconds() )
Extract the minutes from a Duration type.
ExprDT_total_minutes()
ExprDT_total_minutes()
Expr of i64
df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ) ) df$select( pl$col("date"), diff_minutes = pl$col("date")$diff()$dt$total_minutes() )
df = pl$DataFrame( date = pl$date_range( start = as.Date("2020-1-1"), end = as.Date("2020-1-4"), interval = "1d" ) ) df$select( pl$col("date"), diff_minutes = pl$col("date")$diff()$dt$total_minutes() )
Extract the nanoseconds from a Duration type.
ExprDT_total_nanoseconds()
ExprDT_total_nanoseconds()
Expr of i64
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_nanosec = pl$col("date")$diff()$dt$total_nanoseconds() )
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" )) df$select( pl$col("date"), diff_nanosec = pl$col("date")$diff()$dt$total_nanoseconds() )
Extract the seconds from a Duration type.
ExprDT_total_seconds()
ExprDT_total_seconds()
Expr of i64
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), interval = "1m" )) df$select( pl$col("date"), diff_sec = pl$col("date")$diff()$dt$total_seconds() )
df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), interval = "1m" )) df$select( pl$col("date"), diff_sec = pl$col("date")$diff()$dt$total_seconds() )
Divide the date/datetime range into buckets. Each date/datetime is mapped to the start of its bucket.
ExprDT_truncate(every)
ExprDT_truncate(every)
every |
Either an Expr or a string indicating a column name or a duration (see Details). |
The every
and offset
arguments are created with
the following string language:
1ns # 1 nanosecond
1us # 1 microsecond
1ms # 1 millisecond
1s # 1 second
1m # 1 minute
1h # 1 hour
1d # 1 day
1w # 1 calendar week
1mo # 1 calendar month
1y # 1 calendar year
These strings can be combined:
3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
Date/Datetime expr
t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") df = pl$DataFrame(datetime = s)$with_columns( pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) df
t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") df = pl$DataFrame(datetime = s)$with_columns( pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) df
Extract the week from the underlying Date representation. Applies to Date and Datetime columns. Returns the ISO week number starting from 1. The return value ranges from 1 to 53. (The number of the last week of a year varies from year to year.)
ExprDT_week()
ExprDT_week()
Expr of week as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$week()$alias("week") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$week()$alias("week") )
Extract the week day from the underlying Date representation. Applies to Date and Datetime columns. Returns the ISO weekday number, where Monday = 1 and Sunday = 7.
ExprDT_weekday()
ExprDT_weekday()
Expr of weekday as UInt32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$weekday()$alias("weekday") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$weekday()$alias("weekday") )
Set time unit of a Series of dtype Datetime or Duration. This does not modify underlying data, and should be used to fix an incorrect time unit. The corresponding global timepoint will change.
ExprDT_with_time_unit(tu = c("ns", "us", "ms"))
ExprDT_with_time_unit(tu = c("ns", "us", "ms"))
tu |
string option either 'ns', 'us', or 'ms' |
Expr of i64
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$with_time_unit()$alias("with_time_unit_ns"), pl$col("date")$dt$with_time_unit(tu = "ms")$alias("with_time_unit_ms") )
df = pl$DataFrame( date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" ) ) df$select( pl$col("date"), pl$col("date")$dt$with_time_unit()$alias("with_time_unit_ns"), pl$col("date")$dt$with_time_unit(tu = "ms")$alias("with_time_unit_ms") )
Extract year from underlying Date representation. Applies to Date and Datetime columns. Returns the year number in the calendar date.
ExprDT_year()
ExprDT_year()
Expr of Year as Int32
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$year()$alias("year"), pl$col("date")$dt$iso_year()$alias("iso_year") )
df = pl$DataFrame( date = pl$date_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d", time_zone = "GMT" ) ) df$with_columns( pl$col("date")$dt$year()$alias("year"), pl$col("date")$dt$iso_year()$alias("iso_year") )
Evaluate whether all boolean values in a list are true
ExprList_all()
ExprList_all()
Expr
df = pl$DataFrame( list(a = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), NA, c())) ) df$with_columns(all = pl$col("a")$list$all())
df = pl$DataFrame( list(a = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), NA, c())) ) df$with_columns(all = pl$col("a")$list$all())
Evaluate whether any boolean values in a list are true
ExprList_any()
ExprList_any()
Expr
df = pl$DataFrame( list(a = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), NA, c())) ) df$with_columns(any = pl$col("a")$list$any())
df = pl$DataFrame( list(a = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), NA, c())) ) df$with_columns(any = pl$col("a")$list$any())
Get the index of the maximal value in list
ExprList_arg_max()
ExprList_arg_max()
Expr
df = pl$DataFrame(list(s = list(1:2, 2:1))) df$with_columns( arg_max = pl$col("s")$list$arg_max() )
df = pl$DataFrame(list(s = list(1:2, 2:1))) df$with_columns( arg_max = pl$col("s")$list$arg_max() )
Get the index of the minimal value in list
ExprList_arg_min()
ExprList_arg_min()
Expr
df = pl$DataFrame(list(s = list(1:2, 2:1))) df$with_columns( arg_min = pl$col("s")$list$arg_min() )
df = pl$DataFrame(list(s = list(1:2, 2:1))) df$with_columns( arg_min = pl$col("s")$list$arg_min() )
Concat two list variables
ExprList_concat(other)
ExprList_concat(other)
other |
Values to concat with. Can be an Expr or something coercible to an Expr. |
Expr
df = pl$DataFrame( a = list("a", "x"), b = list(c("b", "c"), c("y", "z")) ) df$with_columns( conc_to_b = pl$col("a")$list$concat(pl$col("b")), conc_to_lit_str = pl$col("a")$list$concat(pl$lit("some string")), conc_to_lit_list = pl$col("a")$list$concat(pl$lit(list("hello", c("hello", "world")))) )
df = pl$DataFrame( a = list("a", "x"), b = list(c("b", "c"), c("y", "z")) ) df$with_columns( conc_to_b = pl$col("a")$list$concat(pl$col("b")), conc_to_lit_str = pl$col("a")$list$concat(pl$lit("some string")), conc_to_lit_list = pl$col("a")$list$concat(pl$lit(list("hello", c("hello", "world")))) )
Check if list contains a given value
ExprList_contains(item)
ExprList_contains(item)
item |
Expr or something coercible to an Expr. Strings are not parsed as columns. |
Expr
df = pl$DataFrame( a = list(3:1, NULL, 1:2), item = 0:2 ) df$with_columns( with_expr = pl$col("a")$list$contains(pl$col("item")), with_lit = pl$col("a")$list$contains(1) )
df = pl$DataFrame( a = list(3:1, NULL, 1:2), item = 0:2 ) df$with_columns( with_expr = pl$col("a")$list$contains(pl$col("item")), with_lit = pl$col("a")$list$contains(1) )
This computes the first discrete difference between shifted items of every
list. The parameter n
gives the interval between items to subtract, e.g. with n = 2,
the output will be the difference between the 1st and the 3rd value, the
2nd and 4th value, etc.
ExprList_diff(n = 1, null_behavior = c("ignore", "drop"))
ExprList_diff(n = 1, null_behavior = c("ignore", "drop"))
n |
Number of slots to shift. If negative, then it starts from the end. |
null_behavior |
How to handle null values. Either "ignore" (default) or "drop". |
Expr
df = pl$DataFrame(list(s = list(1:4, c(10L, 2L, 1L)))) df$with_columns(diff = pl$col("s")$list$diff(2)) # negative value starts shifting from the end df$with_columns(diff = pl$col("s")$list$diff(-2))
df = pl$DataFrame(list(s = list(1:4, c(10L, 2L, 1L)))) df$with_columns(diff = pl$col("s")$list$diff(2)) # negative value starts shifting from the end df$with_columns(diff = pl$col("s")$list$diff(-2))
Run any polars expression on the list values
ExprList_eval(expr, parallel = FALSE)
ExprList_eval(expr, parallel = FALSE)
expr |
Expression to run. Note that you can select an element with
pl$element() (see Examples). |
parallel |
Run all expressions in parallel. Don't activate this blindly.
Parallelism is worth it if there is enough work to do per thread. This
likely should not be used in the |
Expr
df = pl$DataFrame( a = list(c(1, 8, 3), c(3, 2), c(NA, NA, 1)), b = list(c("R", "is", "amazing"), c("foo", "bar"), "text") ) df # standardize each value inside a list, using only the values in this list df$select( a_stand = pl$col("a")$list$eval( (pl$element() - pl$element()$mean()) / pl$element()$std() ) ) # count characters for each element in list. Since column "b" is list[str], # we can apply all `$str` functions on elements in the list: df$select( b_len_chars = pl$col("b")$list$eval( pl$element()$str$len_chars() ) ) # concat strings in each list df$select( pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() )
df = pl$DataFrame( a = list(c(1, 8, 3), c(3, 2), c(NA, NA, 1)), b = list(c("R", "is", "amazing"), c("foo", "bar"), "text") ) df # standardize each value inside a list, using only the values in this list df$select( a_stand = pl$col("a")$list$eval( (pl$element() - pl$element()$mean()) / pl$element()$std() ) ) # count characters for each element in list. Since column "b" is list[str], # we can apply all `$str` functions on elements in the list: df$select( b_len_chars = pl$col("b")$list$eval( pl$element()$str$len_chars() ) ) # concat strings in each list df$select( pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() )
Returns a column with a separate row for every list element
ExprList_explode()
ExprList_explode()
Expr
df = pl$DataFrame(a = list(c(1, 2, 3), c(4, 5, 6))) df$select(pl$col("a")$list$explode())
df = pl$DataFrame(a = list(c(1, 2, 3), c(4, 5, 6))) df$select(pl$col("a")$list$explode())
Get the first value in a list
ExprList_first()
ExprList_first()
Expr
df = pl$DataFrame(list(a = list(3:1, NULL, 1:2))) df$with_columns( first = pl$col("a")$list$first() )
df = pl$DataFrame(list(a = list(3:1, NULL, 1:2))) df$with_columns( first = pl$col("a")$list$first() )
This allows extracting several values per list. To extract a single value by
index, use $list$get()
.
ExprList_gather(index, null_on_oob = FALSE)
ExprList_gather(index, null_on_oob = FALSE)
index |
An Expr or something coercible to an Expr, that can return
several single indices. Values are 0-indexed (so index 0 would return the
first item of every sublist) and negative values start from the end (index
|
null_on_oob |
If |
Expr
df = pl$DataFrame( a = list(c(3, 2, 1), 1, c(1, 2)), idx = list(0:1, integer(), c(1L, 999L)) ) df$with_columns( gathered = pl$col("a")$list$gather("idx", null_on_oob = TRUE) ) df$with_columns( gathered = pl$col("a")$list$gather(2, null_on_oob = TRUE) ) # by some column name, must cast to an Int/Uint type to work df$with_columns( gathered = pl$col("a")$list$gather(pl$col("a")$cast(pl$List(pl$UInt64)), null_on_oob = TRUE) )
df = pl$DataFrame( a = list(c(3, 2, 1), 1, c(1, 2)), idx = list(0:1, integer(), c(1L, 999L)) ) df$with_columns( gathered = pl$col("a")$list$gather("idx", null_on_oob = TRUE) ) df$with_columns( gathered = pl$col("a")$list$gather(2, null_on_oob = TRUE) ) # by some column name, must cast to an Int/Uint type to work df$with_columns( gathered = pl$col("a")$list$gather(pl$col("a")$cast(pl$List(pl$UInt64)), null_on_oob = TRUE) )
Gather every nth element in a list
ExprList_gather_every(n, offset = 0)
ExprList_gather_every(n, offset = 0)
n |
Positive integer. |
offset |
Starting index. |
Expr
df = pl$DataFrame( a = list(1:5, 6:8, 9:12), n = c(2, 1, 3), offset = c(0, 1, 0) ) df$with_columns( gather_every = pl$col("a")$list$gather_every(pl$col("n"), offset = pl$col("offset")) )
df = pl$DataFrame( a = list(1:5, 6:8, 9:12), n = c(2, 1, 3), offset = c(0, 1, 0) ) df$with_columns( gather_every = pl$col("a")$list$gather_every(pl$col("n"), offset = pl$col("offset")) )
This allows extracting only one value per list. To extract several values by
index, use $list$gather()
.
ExprList_get(index, ..., null_on_oob = TRUE)
ExprList_get(index, ..., null_on_oob = TRUE)
index |
An Expr or something coercible to an Expr, that must return a
single index. Values are 0-indexed (so index 0 would return the first item
of every sublist) and negative values start from the end (index |
... |
Ignored. |
null_on_oob |
If |
df = pl$DataFrame( values = list(c(2, 2, NA), c(1, 2, 3), NA_real_, NULL), idx = c(1, 2, NA, 3) ) df$with_columns( using_expr = pl$col("values")$list$get("idx"), val_0 = pl$col("values")$list$get(0), val_minus_1 = pl$col("values")$list$get(-1), val_oob = pl$col("values")$list$get(10) )
df = pl$DataFrame( values = list(c(2, 2, NA), c(1, 2, 3), NA_real_, NULL), idx = c(1, 2, NA, 3) ) df$with_columns( using_expr = pl$col("values")$list$get("idx"), val_0 = pl$col("values")$list$get(0), val_minus_1 = pl$col("values")$list$get(-1), val_oob = pl$col("values")$list$get(10) )
Get the first n values of a list
ExprList_head(n = 5L)
ExprList_head(n = 5L)
n |
Number of values to return for each sublist. Can be an Expr. Strings are parsed as column names. |
Expr
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), n = 1:2 ) df$with_columns( head_by_expr = pl$col("s")$list$head("n"), head_by_lit = pl$col("s")$list$head(2) )
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), n = 1:2 ) df$with_columns( head_by_expr = pl$col("s")$list$head("n"), head_by_lit = pl$col("s")$list$head(2) )
Join all string items in a sublist and place a separator between them. This
only works on columns of type list[str]
.
ExprList_join(separator, ignore_nulls = FALSE)
ExprList_join(separator, ignore_nulls = FALSE)
separator |
String to separate the items with. Can be an Expr. Strings are not parsed as columns. |
ignore_nulls |
If |
Expr
df = pl$DataFrame( s = list(c("a", "b", "c"), c("x", "y"), c("e", NA)), separator = c("-", "+", "/") ) df$with_columns( join_with_expr = pl$col("s")$list$join(pl$col("separator")), join_with_lit = pl$col("s")$list$join(" "), join_ignore_null = pl$col("s")$list$join(" ", ignore_nulls = TRUE) )
df = pl$DataFrame( s = list(c("a", "b", "c"), c("x", "y"), c("e", NA)), separator = c("-", "+", "/") ) df$with_columns( join_with_expr = pl$col("s")$list$join(pl$col("separator")), join_with_lit = pl$col("s")$list$join(" "), join_ignore_null = pl$col("s")$list$join(" ", ignore_nulls = TRUE) )
Get the last value in a list
ExprList_last()
ExprList_last()
Expr
df = pl$DataFrame(list(a = list(3:1, NULL, 1:2))) df$with_columns( last = pl$col("a")$list$last() )
df = pl$DataFrame(list(a = list(3:1, NULL, 1:2))) df$with_columns( last = pl$col("a")$list$last() )
Return the number of elements in each list. Null values are counted in the total.
ExprList_len()
ExprList_len()
Expr
df = pl$DataFrame(list(list_of_strs = list(c("a", "b", NA), "c"))) df$with_columns(len_list = pl$col("list_of_strs")$list$len())
df = pl$DataFrame(list(list_of_strs = list(c("a", "b", NA), "c"))) df$with_columns(len_list = pl$col("list_of_strs")$list$len())
Find the maximum value in a list
ExprList_max()
ExprList_max()
Expr
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(max = pl$col("values")$list$max())
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(max = pl$col("values")$list$max())
Compute the mean value of a list
ExprList_mean()
ExprList_mean()
Expr
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(mean = pl$col("values")$list$mean())
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(mean = pl$col("values")$list$mean())
Find the minimum value in a list
ExprList_min()
ExprList_min()
Expr
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(min = pl$col("values")$list$min())
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(min = pl$col("values")$list$min())
Get the number of unique values in a list
ExprList_n_unique()
ExprList_n_unique()
Expr
df = pl$DataFrame(values = list(c(2, 2, NA), c(1, 2, 3), NA_real_)) df$with_columns(unique = pl$col("values")$list$n_unique())
df = pl$DataFrame(values = list(c(2, 2, NA), c(1, 2, 3), NA_real_)) df$with_columns(unique = pl$col("values")$list$n_unique())
Reverse values in a list
ExprList_reverse()
ExprList_reverse()
Expr
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(reverse = pl$col("values")$list$reverse())
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(reverse = pl$col("values")$list$reverse())
Sample from this list
ExprList_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
ExprList_sample( n = NULL, ..., fraction = NULL, with_replacement = FALSE, shuffle = FALSE, seed = NULL )
n |
Number of items to return. Cannot be used with |
... |
Ignored. |
fraction |
Fraction of items to return. Cannot be used with |
with_replacement |
If |
shuffle |
Shuffle the order of sampled data points (implicitly |
seed |
Seed for the random number generator, a numeric value between 0 and 2^52.
If |
Expr
df = pl$DataFrame( values = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), n = c(1, 1, 1, 2) ) df$with_columns( sample = pl$col("values")$list$sample(n = pl$col("n"), seed = 1) )
df = pl$DataFrame( values = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), n = c(1, 1, 1, 2) ) df$with_columns( sample = pl$col("values")$list$sample(n = pl$col("n"), seed = 1) )
This returns the "asymmetric difference", meaning only the elements of the
first list that are not in the second list. To get all elements that are in
only one of the two lists, use
$set_symmetric_difference()
.
ExprList_set_difference(other)
ExprList_set_difference(other)
other |
Other list variable. Can be an Expr or something coercible to an Expr. |
Note that the datatypes inside the list must have a common supertype. For
example, the first column can be list[i32]
and the second one can be
list[i8]
because it can be cast to list[i32]
. However, the second column
cannot be e.g. list[f32]
.
Expr
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(difference = pl$col("a")$list$set_difference("b"))
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(difference = pl$col("a")$list$set_difference("b"))
Get the intersection of two list variables
ExprList_set_intersection(other)
ExprList_set_intersection(other)
other |
Other list variable. Can be an Expr or something coercible to an Expr. |
Note that the datatypes inside the list must have a common supertype. For
example, the first column can be list[i32]
and the second one can be
list[i8]
because it can be cast to list[i32]
. However, the second column
cannot be e.g. list[f32]
.
Expr
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(intersection = pl$col("a")$list$set_intersection("b"))
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(intersection = pl$col("a")$list$set_intersection("b"))
This returns all elements that are in only one of the two lists. To get only
elements that are in the first list but not in the second one, use
$set_difference()
.
ExprList_set_symmetric_difference(other)
ExprList_set_symmetric_difference(other)
other |
Other list variable. Can be an Expr or something coercible to an Expr. |
Note that the datatypes inside the list must have a common supertype. For
example, the first column can be list[i32]
and the second one can be
list[i8]
because it can be cast to list[i32]
. However, the second column
cannot be e.g. list[f32]
.
Expr
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns( symmetric_difference = pl$col("a")$list$set_symmetric_difference("b") )
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns( symmetric_difference = pl$col("a")$list$set_symmetric_difference("b") )
Get the union of two list variables
ExprList_set_union(other)
ExprList_set_union(other)
other |
Other list variable. Can be an Expr or something coercible to an Expr. |
Note that the datatypes inside the list must have a common supertype. For
example, the first column can be list[i32]
and the second one can be
list[i8]
because it can be cast to list[i32]
. However, the second column
cannot be e.g. list[f32]
.
Expr
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(union = pl$col("a")$list$set_union("b"))
df = pl$DataFrame( a = list(1:3, NA_integer_, c(NA_integer_, 3L), 5:7), b = list(2:4, 3L, c(3L, 4L, NA_integer_), c(6L, 8L)) ) df$with_columns(union = pl$col("a")$list$set_union("b"))
Shift list values by n indices
ExprList_shift(n = 1)
ExprList_shift(n = 1)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
Expr
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), idx = 1:2 ) df$with_columns( shift_by_expr = pl$col("s")$list$shift(pl$col("idx")), shift_by_lit = pl$col("s")$list$shift(2) )
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), idx = 1:2 ) df$with_columns( shift_by_expr = pl$col("s")$list$shift(pl$col("idx")), shift_by_lit = pl$col("s")$list$shift(2) )
This extracts length
values at most, starting at index offset
. This can
return fewer than length
values if length
is larger than the number of
values.
ExprList_slice(offset, length = NULL)
ExprList_slice(offset, length = NULL)
offset |
Start index. Negative indexing is supported. Can be an Expr. Strings are parsed as column names. |
length |
Length of the slice. If |
Expr
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), idx_off = 1:2, len = c(4, 1) ) df$with_columns( slice_by_expr = pl$col("s")$list$slice("idx_off", "len"), slice_by_lit = pl$col("s")$list$slice(2, 3) )
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), idx_off = 1:2, len = c(4, 1) ) df$with_columns( slice_by_expr = pl$col("s")$list$slice("idx_off", "len"), slice_by_lit = pl$col("s")$list$slice(2, 3) )
Sort values in a list
ExprList_sort(descending = FALSE)
ExprList_sort(descending = FALSE)
descending |
Sort values in descending order |
Expr
df = pl$DataFrame(values = list(c(NA, 2, 1, 3), c(Inf, 2, 3, NaN), NA_real_)) df$with_columns(sort = pl$col("values")$list$sort())
df = pl$DataFrame(values = list(c(NA, 2, 1, 3), c(Inf, 2, 3, NaN), NA_real_)) df$with_columns(sort = pl$col("values")$list$sort())
Sum all elements in a list
ExprList_sum()
ExprList_sum()
Expr
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(sum = pl$col("values")$list$sum())
df = pl$DataFrame(values = list(c(1, 2, 3, NA), c(2, 3), NA_real_)) df$with_columns(sum = pl$col("values")$list$sum())
Get the last n values of a list
ExprList_tail(n = 5L)
ExprList_tail(n = 5L)
n |
Number of values to return for each sublist. Can be an Expr. Strings are parsed as column names. |
Expr
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), n = 1:2 ) df$with_columns( tail_by_expr = pl$col("s")$list$tail("n"), tail_by_lit = pl$col("s")$list$tail(2) )
df = pl$DataFrame( s = list(1:4, c(10L, 2L, 1L)), n = 1:2 ) df$with_columns( tail_by_expr = pl$col("s")$list$tail("n"), tail_by_lit = pl$col("s")$list$tail(2) )
Convert a Series of type List to Struct
ExprList_to_struct( n_field_strategy = c("first_non_null", "max_width"), fields = NULL, upper_bound = 0 )
ExprList_to_struct( n_field_strategy = c("first_non_null", "max_width"), fields = NULL, upper_bound = 0 )
n_field_strategy |
Strategy to determine the number of fields of the
struct. If |
fields |
If the names and number of the desired fields are known in
advance, a list of field names can be given, which will be assigned by
index. Otherwise, to dynamically assign field names, a custom R function
that takes an R double and outputs a string value can be used. If
|
upper_bound |
A |
Expr
df = pl$DataFrame(list(a = list(1:2, 1:3))) # this discards the third value of the second list as the struct length is # determined based on the length of the first non-empty list df$with_columns( struct = pl$col("a")$list$to_struct() ) # we can use "max_width" to keep all values df$with_columns( struct = pl$col("a")$list$to_struct(n_field_strategy = "max_width") ) # pass a custom function that will name all fields by adding a prefix df2 = df$with_columns( pl$col("a")$list$to_struct( fields = \(idx) paste0("col_", idx) ) ) df2 df2$unnest() df2$to_list()
df = pl$DataFrame(list(a = list(1:2, 1:3))) # this discards the third value of the second list as the struct length is # determined based on the length of the first non-empty list df$with_columns( struct = pl$col("a")$list$to_struct() ) # we can use "max_width" to keep all values df$with_columns( struct = pl$col("a")$list$to_struct(n_field_strategy = "max_width") ) # pass a custom function that will name all fields by adding a prefix df2 = df$with_columns( pl$col("a")$list$to_struct( fields = \(idx) paste0("col_", idx) ) ) df2 df2$unnest() df2$to_list()
Get unique values in a list
ExprList_unique(maintain_order = FALSE)
ExprList_unique(maintain_order = FALSE)
maintain_order |
Maintain order of data. This requires more work. |
Expr
df = pl$DataFrame(values = list(c(2, 2, NA), c(1, 2, 3), NA_real_)) df$with_columns(unique = pl$col("values")$list$unique())
df = pl$DataFrame(values = list(c(2, 2, NA), c(1, 2, 3), NA_real_)) df$with_columns(unique = pl$col("values")$list$unique())
Indicate if this expression is the same as another expression. See also the
counterpart $meta$neq()
.
ExprMeta_eq(other)
ExprMeta_eq(other)
other |
Expr to compare with |
A logical value
# three naive expression literals e1 = pl$lit(40) + 2 e2 = pl$lit(42) e3 = pl$lit(40) + 2 # e1 and e3 are identical expressions e1$meta$eq(e3) # when evaluated, e1 and e2 are equal e1$eq(e2)$to_r() # however, on the meta-level, e1 and e2 are NOT identical expressions e1$meta$eq(e2)
# three naive expression literals e1 = pl$lit(40) + 2 e2 = pl$lit(42) e3 = pl$lit(40) + 2 # e1 and e3 are identical expressions e1$meta$eq(e3) # when evaluated, e1 and e2 are equal e1$eq(e2)$to_r() # however, on the meta-level, e1 and e2 are NOT identical expressions e1$meta$eq(e2)
Indicate if an expression has multiple outputs
ExprMeta_has_multiple_outputs()
ExprMeta_has_multiple_outputs()
Boolean
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$has_multiple_outputs() # pl$all() select multiple cols to modify them, so it has multiple outputs pl$all()$meta$has_multiple_outputs()
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$has_multiple_outputs() # pl$all() select multiple cols to modify them, so it has multiple outputs pl$all()$meta$has_multiple_outputs()
Indicate if an expression uses a regex projection
ExprMeta_is_regex_projection()
ExprMeta_is_regex_projection()
Boolean
pl$col("^Sepal.*$")$meta$is_regex_projection() pl$col("Sepal.Length")$meta$is_regex_projection()
pl$col("^Sepal.*$")$meta$is_regex_projection() pl$col("Sepal.Length")$meta$is_regex_projection()
Indicate if this expression is different from another expression. See also
the counterpart $meta$eq()
.
ExprMeta_neq(other)
ExprMeta_neq(other)
other |
Expr to compare with |
A logical value
# three naive expression literals e1 = pl$lit(40) + 2 e2 = pl$lit(42) e3 = pl$lit(40) + 2 # e1 and e3 are identical expressions e1$meta$neq(e3) # when evaluated, e1 and e2 are equal e1$neq(e2)$to_r() # however, on the meta-level, e1 and e2 are different e1$meta$neq(e2)
# three naive expression literals e1 = pl$lit(40) + 2 e2 = pl$lit(42) e3 = pl$lit(40) + 2 # e1 and e3 are identical expressions e1$meta$neq(e3) # when evaluated, e1 and e2 are equal e1$neq(e2)$to_r() # however, on the meta-level, e1 and e2 are different e1$meta$neq(e2)
It may not always be possible to determine the output name as
that can depend on the schema of the context; in that case this will
raise an error if raise_if_undetermined
is TRUE
(the default), or
return NA
otherwise.
ExprMeta_output_name(..., raise_if_undetermined = TRUE)
ExprMeta_output_name(..., raise_if_undetermined = TRUE)
... |
Ignored. |
raise_if_undetermined |
If |
A character vector
e = pl$col("foo") * pl$col("bar") e$meta$output_name() e_filter = pl$col("foo")$filter(pl$col("bar") == 13) e_filter$meta$output_name() e_sum_over = pl$sum("foo")$over("groups") e_sum_over$meta$output_name() e_sum_slice = pl$sum("foo")$slice(pl$len() - 10, pl$col("bar")) e_sum_slice$meta$output_name() pl$len()$meta$output_name() pl$col("*")$meta$output_name(raise_if_undetermined = FALSE)
e = pl$col("foo") * pl$col("bar") e$meta$output_name() e_filter = pl$col("foo")$filter(pl$col("bar") == 13) e_filter$meta$output_name() e_sum_over = pl$sum("foo")$over("groups") e_sum_over$meta$output_name() e_sum_slice = pl$sum("foo")$slice(pl$len() - 10, pl$col("bar")) e_sum_slice$meta$output_name() pl$len()$meta$output_name() pl$col("*")$meta$output_name(raise_if_undetermined = FALSE)
Pop the latest expression and return the input(s) of the popped expression.
ExprMeta_pop()
ExprMeta_pop()
A list of expressions which in most cases will have a unit length.
This is not the case when an expression has multiple inputs, for instance
in a $fold()
expression.
e1 = pl$lit(40) + 2 e2 = pl$lit(42)$sum() e1 e1$meta$pop() e2 e2$meta$pop()
e1 = pl$lit(40) + 2 e2 = pl$lit(42)$sum() e1 e1$meta$pop() e2 e2$meta$pop()
This returns the names of input columns. Use
$meta$output_name()
to get the name of the output
column.
ExprMeta_root_names()
ExprMeta_root_names()
A character vector
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$root_names()
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$root_names()
Format an expression as a tree
ExprMeta_tree_format(return_as_string = FALSE)
ExprMeta_tree_format(return_as_string = FALSE)
return_as_string |
Return the tree as a character vector? If |
If return_as_string
is TRUE
, a character vector describing the tree.
If return_as_string
is FALSE
, prints the tree in the console but doesn't
return any value.
my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2 my_expr$meta$tree_format()
my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2 my_expr$meta$tree_format()
This removes any renaming operation like $alias()
or
$name$keep()
. Polars uses the "leftmost rule" to determine
naming, meaning that the first element of the expression will be used to
name the output.
ExprMeta_undo_aliases()
ExprMeta_undo_aliases()
Expr with aliases undone
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$output_name() e$meta$undo_aliases()$meta$output_name()
e = (pl$col("alice") + pl$col("eve"))$alias("bob") e$meta$output_name() e$meta$undo_aliases()$meta$output_name()
Keep the original root name of the expression.
ExprName_keep()
ExprName_keep()
Expr
pl$DataFrame(list(alice = 1:3))$select(pl$col("alice")$alias("bob")$name$keep())
pl$DataFrame(list(alice = 1:3))$select(pl$col("alice")$alias("bob")$name$keep())
Add a prefix to a column name
ExprName_prefix(prefix)
ExprName_prefix(prefix)
prefix |
Prefix to be added to column name(s) |
Expr
$suffix()
to add a suffix
dat = as_polars_df(mtcars) dat$select( pl$col("mpg"), pl$col("mpg")$name$prefix("name_"), pl$col("cyl", "drat")$name$prefix("bar_") )
dat = as_polars_df(mtcars) dat$select( pl$col("mpg"), pl$col("mpg")$name$prefix("name_"), pl$col("cyl", "drat")$name$prefix("bar_") )
Add a prefix to all field names of a struct
ExprName_prefix_fields(prefix)
ExprName_prefix_fields(prefix)
prefix |
Prefix to add to the field name. |
Expr
df = pl$DataFrame(a = 1, b = 2)$select( pl$struct(pl$all())$alias("my_struct") ) df$with_columns(pl$col("my_struct")$name$prefix_fields("col_"))$unnest()
df = pl$DataFrame(a = 1, b = 2)$select( pl$struct(pl$all())$alias("my_struct") ) df$with_columns(pl$col("my_struct")$name$prefix_fields("col_"))$unnest()
Add a suffix to a column name
ExprName_suffix(suffix)
ExprName_suffix(suffix)
suffix |
Suffix to be added to column name(s) |
Expr
$prefix()
to add a prefix
dat = as_polars_df(mtcars) dat$select( pl$col("mpg"), pl$col("mpg")$name$suffix("_foo"), pl$col("cyl", "drat")$name$suffix("_bar") )
dat = as_polars_df(mtcars) dat$select( pl$col("mpg"), pl$col("mpg")$name$suffix("_foo"), pl$col("cyl", "drat")$name$suffix("_bar") )
Add a suffix to all field names of a struct
ExprName_suffix_fields(suffix)
ExprName_suffix_fields(suffix)
suffix |
Suffix to add to the field name. |
Expr
df = pl$DataFrame(a = 1, b = 2)$select( pl$struct(pl$all())$alias("my_struct") ) df$with_columns(pl$col("my_struct")$name$suffix_fields("_post"))$unnest()
df = pl$DataFrame(a = 1, b = 2)$select( pl$struct(pl$all())$alias("my_struct") ) df$with_columns(pl$col("my_struct")$name$suffix_fields("_post"))$unnest()
Due to implementation constraints, this method can only be called as the last expression in a chain.
ExprName_to_lowercase()
ExprName_to_lowercase()
Expr
pl$DataFrame(Alice = 1:3)$with_columns(pl$col("Alice")$name$to_lowercase())
pl$DataFrame(Alice = 1:3)$with_columns(pl$col("Alice")$name$to_lowercase())
Due to implementation constraints, this method can only be called as the last expression in a chain.
ExprName_to_uppercase()
ExprName_to_uppercase()
Expr
pl$DataFrame(Alice = 1:3)$with_columns(pl$col("Alice")$name$to_uppercase())
pl$DataFrame(Alice = 1:3)$with_columns(pl$col("Alice")$name$to_uppercase())
Check if string contains a substring that matches a pattern
ExprStr_contains(pattern, ..., literal = FALSE, strict = TRUE)
ExprStr_contains(pattern, ..., literal = FALSE, strict = TRUE)
pattern |
A character or something that can be coerced to a string Expr of a valid regex pattern, compatible with the regex crate. |
... |
Ignored. |
literal |
Logical. If |
strict |
Logical. If |
To modify regular expression behaviour (such as case-sensitivity)
with flags, use the inline (?iLmsuxU)
syntax. See the regex crate’s section
on grouping and flags
for additional information about the use of inline expression modifiers.
Expr of Boolean data type
$str$starts_with()
: Check if string values
start with a substring.
$str$ends_with()
: Check if string values end
with a substring.
$str$find()
: Return the index position of the first
substring matching a pattern.
# The inline `(?i)` syntax example pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( default_match = pl$col("s")$str$contains("AA"), insensitive_match = pl$col("s")$str$contains("(?i)AA") ) df = pl$DataFrame(txt = c("Crab", "cat and dog", "rab$bit", NA)) df$with_columns( regex = pl$col("txt")$str$contains("cat|bit"), literal = pl$col("txt")$str$contains("rab$", literal = TRUE) )
# The inline `(?i)` syntax example pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( default_match = pl$col("s")$str$contains("AA"), insensitive_match = pl$col("s")$str$contains("(?i)AA") ) df = pl$DataFrame(txt = c("Crab", "cat and dog", "rab$bit", NA)) df$with_columns( regex = pl$col("txt")$str$contains("cat|bit"), literal = pl$col("txt")$str$contains("rab$", literal = TRUE) )
This function determines if any of the patterns find a match.
ExprStr_contains_any(patterns, ..., ascii_case_insensitive = FALSE)
ExprStr_contains_any(patterns, ..., ascii_case_insensitive = FALSE)
patterns |
Character vector or something that can be coerced to strings Expr of a valid regex pattern, compatible with the regex crate. |
... |
Ignored. |
ascii_case_insensitive |
Enable ASCII-aware case insensitive matching. When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. |
Expr of Boolean data type
df = pl$DataFrame( lyrics = c( "Everybody wants to rule the world", "Tell me what you want, what you really really want", "Can you feel the love tonight" ) ) df$with_columns( contains_any = pl$col("lyrics")$str$contains_any(c("you", "me")) )
df = pl$DataFrame( lyrics = c( "Everybody wants to rule the world", "Tell me what you want, what you really really want", "Can you feel the love tonight" ) ) df$with_columns( contains_any = pl$col("lyrics")$str$contains_any(c("you", "me")) )
Count all successive non-overlapping regex matches
ExprStr_count_matches(pattern, ..., literal = FALSE)
ExprStr_count_matches(pattern, ..., literal = FALSE)
pattern |
A character or something that can be coerced to a string Expr of a valid regex pattern, compatible with the regex crate. |
... |
Ignored. |
literal |
Logical. If |
Expr of data type UInt32
.
Returns null
if the original value is null
.
df = pl$DataFrame(foo = c("12 dbc 3xy", "cat\\w", "1zy3\\d\\d", NA)) df$with_columns( count_digits = pl$col("foo")$str$count_matches(r"(\d)"), count_slash_d = pl$col("foo")$str$count_matches(r"(\d)", literal = TRUE) )
df = pl$DataFrame(foo = c("12 dbc 3xy", "cat\\w", "1zy3\\d\\d", NA)) df$with_columns( count_digits = pl$col("foo")$str$count_matches(r"(\d)"), count_slash_d = pl$col("foo")$str$count_matches(r"(\d)", literal = TRUE) )
Decode a value using the provided encoding
ExprStr_decode(encoding, ..., strict = TRUE)
ExprStr_decode(encoding, ..., strict = TRUE)
encoding |
Either 'hex' or 'base64'. |
... |
Not used currently. |
strict |
If |
String array with values decoded using provided encoding
df = pl$DataFrame(strings = c("foo", "bar", NA)) df$select(pl$col("strings")$str$encode("hex")) df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... and must restored with cast )$with_columns( pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) )
df = pl$DataFrame(strings = c("foo", "bar", NA)) df$select(pl$col("strings")$str$encode("hex")) df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... and must restored with cast )$with_columns( pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) )
Encode a value using the provided encoding
ExprStr_encode(encoding)
ExprStr_encode(encoding)
encoding |
Either 'hex' or 'base64'. |
String array with values encoded using provided encoding
df = pl$DataFrame(strings = c("foo", "bar", NA)) df$select(pl$col("strings")$str$encode("hex")) df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... and must restored with cast )$with_columns( pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) )
df = pl$DataFrame(strings = c("foo", "bar", NA)) df$select(pl$col("strings")$str$encode("hex")) df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... and must restored with cast )$with_columns( pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) )
Check if string values end with a substring.
ExprStr_ends_with(sub)
ExprStr_ends_with(sub)
sub |
Suffix substring or Expr. |
See also $str$starts_with()
and $str$contains()
.
Expr of Boolean data type
df = pl$DataFrame(fruits = c("apple", "mango", NA)) df$select( pl$col("fruits"), pl$col("fruits")$str$ends_with("go")$alias("has_suffix") )
df = pl$DataFrame(fruits = c("apple", "mango", NA)) df$select( pl$col("fruits"), pl$col("fruits")$str$ends_with("go")$alias("has_suffix") )
Extract the target capture group from provided patterns
ExprStr_extract(pattern, group_index)
ExprStr_extract(pattern, group_index)
pattern |
A valid regex pattern. Can be an Expr or something coercible to an Expr. Strings are parsed as column names. |
group_index |
Index of the targeted capture group. Group 0 means the whole pattern; the first group begins at index 1 (default). |
String array. Contains null if the original value is null or the regex captures nothing.
df = pl$DataFrame( a = c( "http://vote.com/ballon_dor?candidate=messi&ref=polars", "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars" ) ) df$with_columns( extracted = pl$col("a")$str$extract(pl$lit(r"(candidate=(\w+))"), 1) )
df = pl$DataFrame( a = c( "http://vote.com/ballon_dor?candidate=messi&ref=polars", "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars" ) ) df$with_columns( extracted = pl$col("a")$str$extract(pl$lit(r"(candidate=(\w+))"), 1) )
Extracts all matches for the given regex pattern. Extracts each successive non-overlapping regex match in an individual string as an array.
ExprStr_extract_all(pattern)
ExprStr_extract_all(pattern)
pattern |
A valid regex pattern |
List[String]
array. Contains null if the original value is null or the regex captures
nothing.
df = pl$DataFrame(foo = c("123 bla 45 asd", "xyz 678 910t")) df$select( pl$col("foo")$str$extract_all(r"((\d+))")$alias("extracted_nrs") )
df = pl$DataFrame(foo = c("123 bla 45 asd", "xyz 678 910t")) df$select( pl$col("foo")$str$extract_all(r"((\d+))")$alias("extracted_nrs") )
Extract all capture groups for the given regex pattern
ExprStr_extract_groups(pattern)
ExprStr_extract_groups(pattern)
pattern |
A character of a valid regular expression pattern containing at least one capture group, compatible with the regex crate. |
All group names are strings. If your pattern contains unnamed groups, their numerical position is converted to a string. See examples.
Expr of data type Struct with fields of data type
String
.
df = pl$DataFrame( url = c( "http://vote.com/ballon_dor?candidate=messi&ref=python", "http://vote.com/ballon_dor?candidate=weghorst&ref=polars", "http://vote.com/ballon_dor?error=404&ref=rust" ) ) pattern = r"(candidate=(?<candidate>\w+)&ref=(?<ref>\w+))" df$with_columns( captures = pl$col("url")$str$extract_groups(pattern) )$unnest("captures") # If the groups are unnamed, their numerical position (as a string) is used: pattern = r"(candidate=(\w+)&ref=(\w+))" df$with_columns( captures = pl$col("url")$str$extract_groups(pattern) )$unnest("captures")
df = pl$DataFrame( url = c( "http://vote.com/ballon_dor?candidate=messi&ref=python", "http://vote.com/ballon_dor?candidate=weghorst&ref=polars", "http://vote.com/ballon_dor?error=404&ref=rust" ) ) pattern = r"(candidate=(?<candidate>\w+)&ref=(?<ref>\w+))" df$with_columns( captures = pl$col("url")$str$extract_groups(pattern) )$unnest("captures") # If the groups are unnamed, their numerical position (as a string) is used: pattern = r"(candidate=(\w+)&ref=(\w+))" df$with_columns( captures = pl$col("url")$str$extract_groups(pattern) )$unnest("captures")
Use the aho-corasick algorithm to extract matches
ExprStr_extract_many( patterns, ..., ascii_case_insensitive = FALSE, overlapping = FALSE )
ExprStr_extract_many( patterns, ..., ascii_case_insensitive = FALSE, overlapping = FALSE )
patterns |
String patterns to search. This can be an Expr or something coercible to an Expr. Strings are parsed as column names. |
... |
Ignored. |
ascii_case_insensitive |
Enable ASCII-aware case insensitive matching. When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. |
overlapping |
Whether matches can overlap. |
Expr: Series of dtype String.
df = pl$DataFrame(values = "discontent") patterns = pl$lit(c("winter", "disco", "onte", "discontent")) df$with_columns( matches = pl$col("values")$str$extract_many(patterns), matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE) ) df = pl$DataFrame( values = c("discontent", "rhapsody"), patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce")) ) df$select(pl$col("values")$str$extract_many("patterns"))
df = pl$DataFrame(values = "discontent") patterns = pl$lit(c("winter", "disco", "onte", "discontent")) df$with_columns( matches = pl$col("values")$str$extract_many(patterns), matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE) ) df = pl$DataFrame( values = c("discontent", "rhapsody"), patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce")) ) df$select(pl$col("values")$str$extract_many("patterns"))
Return the index position of the first substring matching a pattern
ExprStr_find(pattern, ..., literal = FALSE, strict = TRUE)
ExprStr_find(pattern, ..., literal = FALSE, strict = TRUE)
pattern |
A character or something that can be coerced to a string Expr of a valid regex pattern, compatible with the regex crate. |
... |
Ignored. |
literal |
Logical. If |
strict |
Logical. If |
To modify regular expression behaviour (such as case-sensitivity)
with flags, use the inline (?iLmsuxU)
syntax. See the regex crate’s section
on grouping and flags
for additional information about the use of inline expression modifiers.
An Expr of data type UInt32
$str$starts_with()
: Check if string values
start with a substring.
$str$ends_with()
: Check if string values end
with a substring.
$str$contains()
: Check if string contains a substring
that matches a pattern.
pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( default_match = pl$col("s")$str$find("Aa"), insensitive_match = pl$col("s")$str$find("(?i)Aa") )
pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( default_match = pl$col("s")$str$find("Aa"), insensitive_match = pl$col("s")$str$find("(?i)Aa") )
Return the first n characters of each string
ExprStr_head(n)
ExprStr_head(n)
n |
Length of the slice (integer or expression). Strings are parsed as column names. Negative indexing is supported. |
The n
input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a Unicode scalar value. A single character
is represented by a single byte when working with ASCII text, and a maximum
of 4 bytes otherwise.
When the n
input is negative, head()
returns characters up to the n
th
from the end of the string. For example, if n = -3
, then all characters
except the last three are returned.
If the length of the string has fewer than n
characters, the full string is
returned.
Expr: Series of dtype String.
df = pl$DataFrame( s = c("pear", NA, "papaya", "dragonfruit"), n = c(3, 4, -2, -5) ) df$with_columns( s_head_5 = pl$col("s")$str$head(5), s_head_n = pl$col("s")$str$head("n") )
df = pl$DataFrame( s = c("pear", NA, "papaya", "dragonfruit"), n = c(3, 4, -2, -5) ) df$with_columns( s_head_5 = pl$col("s")$str$head(5), s_head_n = pl$col("s")$str$head("n") )
Vertically concatenate the string values in the column to a single string value.
ExprStr_join(delimiter = "", ..., ignore_nulls = TRUE)
ExprStr_join(delimiter = "", ..., ignore_nulls = TRUE)
delimiter |
The delimiter to insert between consecutive string values. |
... |
Ignored. |
ignore_nulls |
Ignore null values (default). If |
Expr of String concatenated
# concatenate a Series of strings to a single string df = pl$DataFrame(foo = c(1, NA, 2)) df$select(pl$col("foo")$str$join("-")) df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE))
# concatenate a Series of strings to a single string df = pl$DataFrame(foo = c(1, NA, 2)) df$select(pl$col("foo")$str$join("-")) df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE))
Parse string values as JSON.
ExprStr_json_decode(dtype, infer_schema_length = 100)
ExprStr_json_decode(dtype, infer_schema_length = 100)
dtype |
The dtype to cast the extracted value to. If |
infer_schema_length |
How many rows to parse to determine the schema.
If |
Throws an error if it encounters invalid JSON strings.
Expr returning a struct
df = pl$DataFrame( json_val = c('{"a":1, "b": true}', NA, '{"a":2, "b": false}') ) dtype = pl$Struct(pl$Field("a", pl$Int64), pl$Field("b", pl$Boolean)) df$select(pl$col("json_val")$str$json_decode(dtype))
df = pl$DataFrame( json_val = c('{"a":1, "b": true}', NA, '{"a":2, "b": false}') ) dtype = pl$Struct(pl$Field("a", pl$Int64), pl$Field("b", pl$Boolean)) df$select(pl$col("json_val")$str$json_decode(dtype))
Extract the first match of JSON string with the provided JSONPath expression
ExprStr_json_path_match(json_path)
ExprStr_json_path_match(json_path)
json_path |
A valid JSON path query string. |
Throws an error if it encounters invalid JSON strings. All return values will be cast to String regardless of the original value.
Documentation on JSONPath standard can be found here: https://goessner.net/articles/JsonPath/.
String array. Contains null if the original value is null or the json_path returns nothing.
df = pl$DataFrame( json_val = c('{"a":"1"}', NA, '{"a":2}', '{"a":2.1}', '{"a":true}') ) df$select(pl$col("json_val")$str$json_path_match("$.a"))
df = pl$DataFrame( json_val = c('{"a":"1"}', NA, '{"a":2}', '{"a":2.1}', '{"a":true}') ) df$select(pl$col("json_val")$str$json_path_match("$.a"))
Get length of the strings as UInt32 (as number of bytes). Use $str$len_chars()
to get the number of characters.
ExprStr_len_bytes()
ExprStr_len_bytes()
If you know that you are working with ASCII text, $str$len_bytes()
and $str$len_chars() will be
equivalent, and $str$len_bytes() is faster (it returns the length in terms of the number of bytes).
Expr of u32
pl$DataFrame( s = c("Café", NA, "345", "æøå") )$select( pl$col("s"), pl$col("s")$str$len_bytes()$alias("lengths"), pl$col("s")$str$len_chars()$alias("n_chars") )
pl$DataFrame( s = c("Café", NA, "345", "æøå") )$select( pl$col("s"), pl$col("s")$str$len_bytes()$alias("lengths"), pl$col("s")$str$len_chars()$alias("n_chars") )
Get length of the strings as UInt32 (as number of characters). Use
$str$len_bytes()
to get the number of bytes.
ExprStr_len_chars()
ExprStr_len_chars()
If you know that you are working with ASCII text, $str$len_bytes()
will be
equivalent, and faster (it returns the length in terms of the number of bytes).
Expr of u32
pl$DataFrame( s = c("Café", NA, "345", "æøå") )$select( pl$col("s"), pl$col("s")$str$len_bytes()$alias("lengths"), pl$col("s")$str$len_chars()$alias("n_chars") )
pl$DataFrame( s = c("Café", NA, "345", "æøå") )$select( pl$col("s"), pl$col("s")$str$len_bytes()$alias("lengths"), pl$col("s")$str$len_chars()$alias("n_chars") )
Return the string left justified in a string of length width
.
ExprStr_pad_end(width, fillchar = " ")
ExprStr_pad_end(width, fillchar = " ")
width |
Justify left to this length. |
fillchar |
Fill with this ASCII character. |
Padding is done using the specified fillchar
. The original string
is returned if width
is less than or equal to len(s)
.
Expr of String
df = pl$DataFrame(a = c("cow", "monkey", NA, "hippopotamus")) df$select(pl$col("a")$str$pad_end(8, "*"))
df = pl$DataFrame(a = c("cow", "monkey", NA, "hippopotamus")) df$select(pl$col("a")$str$pad_end(8, "*"))
Return the string right justified in a string of length width
.
ExprStr_pad_start(width, fillchar = " ")
ExprStr_pad_start(width, fillchar = " ")
width |
Justify right to this length. |
fillchar |
Fill with this ASCII character. |
Padding is done using the specified fillchar
. The original string
is returned if width
is less than or equal to len(s)
.
Expr of String
df = pl$DataFrame(a = c("cow", "monkey", NA, "hippopotamus")) df$select(pl$col("a")$str$pad_start(8, "*"))
df = pl$DataFrame(a = c("cow", "monkey", NA, "hippopotamus")) df$select(pl$col("a")$str$pad_start(8, "*"))
Replace first matching regex/literal substring with a new string value
ExprStr_replace(pattern, value, ..., literal = FALSE, n = 1L)
ExprStr_replace(pattern, value, ..., literal = FALSE, n = 1L)
pattern |
A character or something that can be coerced to a string Expr of a valid regex pattern, compatible with the regex crate. |
value |
A character or an Expr of string that will replace the matched substring. |
... |
Ignored. |
literal |
Logical. If |
n |
A number of matches to replace.
Note that regex replacement with |
To modify regular expression behaviour (such as case-sensitivity)
with flags, use the inline (?iLmsuxU)
syntax. See the regex crate’s section
on grouping and flags
for additional information about the use of inline expression modifiers.
Expr of String type
The dollar sign ($
) is a special character related to capture groups.
To refer to a literal dollar sign, use $$
instead or set literal
to TRUE
.
df = pl$DataFrame(id = 1L:2L, text = c("123abc", "abc456")) df$with_columns(pl$col("text")$str$replace(r"(abc\b)", "ABC")) # Capture groups are supported. # Use `${1}` in the value string to refer to the first capture group in the pattern, # `${2}` to refer to the second capture group, and so on. # You can also use named capture groups. df = pl$DataFrame(word = c("hat", "hut")) df$with_columns( positional = pl$col("word")$str$replace("h(.)t", "b${1}d"), named = pl$col("word")$str$replace("h(?<vowel>.)t", "b${vowel}d") ) # Apply case-insensitive string replacement using the `(?i)` flag. df = pl$DataFrame( city = "Philadelphia", season = c("Spring", "Summer", "Autumn", "Winter"), weather = c("Rainy", "Sunny", "Cloudy", "Snowy") ) df$with_columns( pl$col("weather")$str$replace("(?i)foggy|rainy|cloudy|snowy", "Sunny") )
df = pl$DataFrame(id = 1L:2L, text = c("123abc", "abc456")) df$with_columns(pl$col("text")$str$replace(r"(abc\b)", "ABC")) # Capture groups are supported. # Use `${1}` in the value string to refer to the first capture group in the pattern, # `${2}` to refer to the second capture group, and so on. # You can also use named capture groups. df = pl$DataFrame(word = c("hat", "hut")) df$with_columns( positional = pl$col("word")$str$replace("h(.)t", "b${1}d"), named = pl$col("word")$str$replace("h(?<vowel>.)t", "b${vowel}d") ) # Apply case-insensitive string replacement using the `(?i)` flag. df = pl$DataFrame( city = "Philadelphia", season = c("Spring", "Summer", "Autumn", "Winter"), weather = c("Rainy", "Sunny", "Cloudy", "Snowy") ) df$with_columns( pl$col("weather")$str$replace("(?i)foggy|rainy|cloudy|snowy", "Sunny") )
Replace all matching regex/literal substrings with a new string value
ExprStr_replace_all(pattern, value, ..., literal = FALSE)
ExprStr_replace_all(pattern, value, ..., literal = FALSE)
pattern |
A character or something that can be coerced to a string Expr of a valid regex pattern, compatible with the regex crate. |
value |
A character or an Expr of string that will replace the matched substring. |
... |
Ignored. |
literal |
Logical. If |
To modify regular expression behaviour (such as case-sensitivity)
with flags, use the inline (?iLmsuxU)
syntax. See the regex crate’s section
on grouping and flags
for additional information about the use of inline expression modifiers.
Expr of String type
The dollar sign ($
) is a special character related to capture groups.
To refer to a literal dollar sign, use $$
instead or set literal
to TRUE
.
df = pl$DataFrame(id = 1L:2L, text = c("abcabc", "123a123")) df$with_columns(pl$col("text")$str$replace_all("a", "-")) # Capture groups are supported. # Use `${1}` in the value string to refer to the first capture group in the pattern, # `${2}` to refer to the second capture group, and so on. # You can also use named capture groups. df = pl$DataFrame(word = c("hat", "hut")) df$with_columns( positional = pl$col("word")$str$replace_all("h(.)t", "b${1}d"), named = pl$col("word")$str$replace_all("h(?<vowel>.)t", "b${vowel}d") ) # Apply case-insensitive string replacement using the `(?i)` flag. df = pl$DataFrame( city = "Philadelphia", season = c("Spring", "Summer", "Autumn", "Winter"), weather = c("Rainy", "Sunny", "Cloudy", "Snowy") ) df$with_columns( pl$col("weather")$str$replace_all( "(?i)foggy|rainy|cloudy|snowy", "Sunny" ) )
df = pl$DataFrame(id = 1L:2L, text = c("abcabc", "123a123")) df$with_columns(pl$col("text")$str$replace_all("a", "-")) # Capture groups are supported. # Use `${1}` in the value string to refer to the first capture group in the pattern, # `${2}` to refer to the second capture group, and so on. # You can also use named capture groups. df = pl$DataFrame(word = c("hat", "hut")) df$with_columns( positional = pl$col("word")$str$replace_all("h(.)t", "b${1}d"), named = pl$col("word")$str$replace_all("h(?<vowel>.)t", "b${vowel}d") ) # Apply case-insensitive string replacement using the `(?i)` flag. df = pl$DataFrame( city = "Philadelphia", season = c("Spring", "Summer", "Autumn", "Winter"), weather = c("Rainy", "Sunny", "Cloudy", "Snowy") ) df$with_columns( pl$col("weather")$str$replace_all( "(?i)foggy|rainy|cloudy|snowy", "Sunny" ) )
This function replaces several matches at once.
ExprStr_replace_many(patterns, replace_with, ascii_case_insensitive = FALSE)
ExprStr_replace_many(patterns, replace_with, ascii_case_insensitive = FALSE)
patterns |
String patterns to search. Can be an Expr. |
replace_with |
A vector of strings used as replacements. If this is of
length 1, then it is applied to all matches. Otherwise, it must be of same
length as the |
ascii_case_insensitive |
Enable ASCII-aware case insensitive matching. When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. |
Expr
df = pl$DataFrame( lyrics = c( "Everybody wants to rule the world", "Tell me what you want, what you really really want", "Can you feel the love tonight" ) ) # a replacement of length 1 is applied to all matches df$with_columns( remove_pronouns = pl$col("lyrics")$str$replace_many(c("you", "me"), "") ) # if there are more than one replacement, the patterns and replacements are # matched df$with_columns( fake_pronouns = pl$col("lyrics")$str$replace_many(c("you", "me"), c("foo", "bar")) )
df = pl$DataFrame( lyrics = c( "Everybody wants to rule the world", "Tell me what you want, what you really really want", "Can you feel the love tonight" ) ) # a replacement of length 1 is applied to all matches df$with_columns( remove_pronouns = pl$col("lyrics")$str$replace_many(c("you", "me"), "") ) # if there are more than one replacement, the patterns and replacements are # matched df$with_columns( fake_pronouns = pl$col("lyrics")$str$replace_many(c("you", "me"), c("foo", "bar")) )
Returns string values in reversed order
ExprStr_reverse()
ExprStr_reverse()
Expr
df = pl$DataFrame(text = c("foo", "bar", NA)) df$with_columns(reversed = pl$col("text")$str$reverse())
df = pl$DataFrame(text = c("foo", "bar", NA)) df$with_columns(reversed = pl$col("text")$str$reverse())
Create subslices of the string values of a String Series
ExprStr_slice(offset, length = NULL)
ExprStr_slice(offset, length = NULL)
offset |
Start index. Negative indexing is supported. |
length |
Length of the slice. If |
Expr: Series of dtype String.
df = pl$DataFrame(s = c("pear", NA, "papaya", "dragonfruit")) df$with_columns( pl$col("s")$str$slice(-3)$alias("s_sliced") )
df = pl$DataFrame(s = c("pear", NA, "papaya", "dragonfruit")) df$with_columns( pl$col("s")$str$slice(-3)$alias("s_sliced") )
Split the string by a substring
ExprStr_split(by, inclusive = FALSE)
ExprStr_split(by, inclusive = FALSE)
by |
Substring to split by. Can be an Expr. |
inclusive |
If |
List of String type
df = pl$DataFrame(s = c("foo bar", "foo-bar", "foo bar baz")) df$select(pl$col("s")$str$split(by = " ")) df = pl$DataFrame( s = c("foo^bar", "foo_bar", "foo*bar*baz"), by = c("_", "_", "*") ) df df$select(pl$col("s")$str$split(by = pl$col("by"))$alias("split"))
df = pl$DataFrame(s = c("foo bar", "foo-bar", "foo bar baz")) df$select(pl$col("s")$str$split(by = " ")) df = pl$DataFrame( s = c("foo^bar", "foo_bar", "foo*bar*baz"), by = c("_", "_", "*") ) df df$select(pl$col("s")$str$split(by = pl$col("by"))$alias("split"))
Split the string by a substring using n splits.
This results in a struct of n+1
fields. If it cannot make n
splits, the remaining field elements will be null.
ExprStr_split_exact(by, n, inclusive = FALSE)
ExprStr_split_exact(by, n, inclusive = FALSE)
by |
Substring to split by. |
n |
Number of splits to make. |
inclusive |
If |
Struct where each of n+1 fields is of String type
df = pl$DataFrame(s = c("a_1", NA, "c", "d_4")) df$with_columns( split = pl$col("s")$str$split_exact(by = "_", 1), split_inclusive = pl$col("s")$str$split_exact(by = "_", 1, inclusive = TRUE) )
df = pl$DataFrame(s = c("a_1", NA, "c", "d_4")) df$with_columns( split = pl$col("s")$str$split_exact(by = "_", 1), split_inclusive = pl$col("s")$str$split_exact(by = "_", 1, inclusive = TRUE) )
Split the string by a substring, returning at most n items.
If the number of possible splits is less than n-1
, the remaining field
elements will be null. If the number of possible splits is n-1
or greater,
the last (nth) substring will contain the remainder of the string.
ExprStr_splitn(by, n)
ExprStr_splitn(by, n)
by |
Substring to split by. |
n |
Maximum number of items to return (at most n-1 splits are made). |
Struct where each of n
fields is of String type
df = pl$DataFrame(s = c("a_1", NA, "c", "d_4_e")) df$with_columns( s1 = pl$col("s")$str$splitn(by = "_", 1), s2 = pl$col("s")$str$splitn(by = "_", 2), s3 = pl$col("s")$str$splitn(by = "_", 3) )
df = pl$DataFrame(s = c("a_1", NA, "c", "d_4_e")) df$with_columns( s1 = pl$col("s")$str$splitn(by = "_", 1), s2 = pl$col("s")$str$splitn(by = "_", 2), s3 = pl$col("s")$str$splitn(by = "_", 3) )
Check if string values start with a substring.
ExprStr_starts_with(sub)
ExprStr_starts_with(sub)
sub |
Prefix substring or Expr. |
See also $str$contains()
and $str$ends_with()
.
Expr of Boolean data type
df = pl$DataFrame(fruits = c("apple", "mango", NA)) df$select( pl$col("fruits"), pl$col("fruits")$str$starts_with("app")$alias("has_suffix") )
df = pl$DataFrame(fruits = c("apple", "mango", NA)) df$select( pl$col("fruits"), pl$col("fruits")$str$starts_with("app")$alias("has_suffix") )
Remove leading and trailing characters.
ExprStr_strip_chars(matches = NULL)
ExprStr_strip_chars(matches = NULL)
matches |
The set of characters to be removed. All combinations of this
set of characters will be stripped. If |
This function will not strip any chars beyond the first char not matched.
strip_chars()
removes characters at the beginning and the end of the string.
Use strip_chars_start()
and strip_chars_end()
to remove characters only
from left and right respectively.
Expr of String type
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars()) df$select(pl$col("foo")$str$strip_chars(" hel rld"))
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars()) df$select(pl$col("foo")$str$strip_chars(" hel rld"))
Remove trailing characters.
ExprStr_strip_chars_end(matches = NULL)
ExprStr_strip_chars_end(matches = NULL)
matches |
The set of characters to be removed. All combinations of this
set of characters will be stripped. If |
This function will not strip any chars beyond the first char not matched.
strip_chars_end()
removes characters at the end of the string only.
Use strip_chars()
and strip_chars_start()
to remove characters from the left
and right or only from the left respectively.
Expr of String type
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars_end(" hel\trld")) df$select(pl$col("foo")$str$strip_chars_end("rldhel\t "))
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars_end(" hel\trld")) df$select(pl$col("foo")$str$strip_chars_end("rldhel\t "))
Remove leading characters.
ExprStr_strip_chars_start(matches = NULL)
ExprStr_strip_chars_start(matches = NULL)
matches |
The set of characters to be removed. All combinations of this
set of characters will be stripped. If |
This function will not strip any chars beyond the first char not matched.
strip_chars_start()
removes characters at the beginning of the string only.
Use strip_chars()
and strip_chars_end()
to remove characters from the left
and right or only from the right respectively.
Expr of String type
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars_start(" hel rld"))
df = pl$DataFrame(foo = c(" hello", "\tworld")) df$select(pl$col("foo")$str$strip_chars_start(" hel rld"))
Similar to the strptime()
function.
ExprStr_strptime( dtype, format = NULL, ..., strict = TRUE, exact = TRUE, cache = TRUE, ambiguous = "raise" )
ExprStr_strptime( dtype, format = NULL, ..., strict = TRUE, exact = TRUE, cache = TRUE, ambiguous = "raise" )
dtype |
The data type to convert into. Can be either |
format |
Format to use for conversion. Refer to
the chrono crate documentation
for the full specification. Example: |
... |
Not used. |
strict |
If |
exact |
If |
cache |
Use a cache of unique, converted dates to apply the datetime conversion. |
ambiguous |
Determine how to deal with ambiguous datetimes:
|
When parsing a Datetime the column precision will be inferred from the format
string, if given, e.g.: "%F %T%.3f"
=> pl$Datetime("ms")
.
If no fractional second component is found then the default is "us"
(microsecond).
Expr of Date, Datetime or Time type
# Dealing with a consistent format s = as_polars_series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) s$str$strptime(pl$Datetime(), "%Y-%m-%d %H:%M%#z") # Auto infer format s$str$strptime(pl$Datetime()) # Datetime with timezone is interpreted as UTC timezone as_polars_series("2020-01-01T01:00:00+09:00")$str$strptime(pl$Datetime()) # Dealing with different formats. s = as_polars_series( c( "2021-04-22", "2022-01-04 00:00:00", "01/31/22", "Sun Jul 8 00:34:60 2001" ), "date" ) s$to_frame()$select( pl$coalesce( pl$col("date")$str$strptime(pl$Date, "%F", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%F %T", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%D", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%c", strict = FALSE) ) ) # Ignore invalid time s = as_polars_series( c( "2023-01-01 11:22:33 -0100", "2023-01-01 11:22:33 +0300", "invalid time" ) ) s$str$strptime( pl$Datetime("ns"), format = "%Y-%m-%d %H:%M:%S %z", strict = FALSE )
# Dealing with a consistent format s = as_polars_series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) s$str$strptime(pl$Datetime(), "%Y-%m-%d %H:%M%#z") # Auto infer format s$str$strptime(pl$Datetime()) # Datetime with timezone is interpreted as UTC timezone as_polars_series("2020-01-01T01:00:00+09:00")$str$strptime(pl$Datetime()) # Dealing with different formats. s = as_polars_series( c( "2021-04-22", "2022-01-04 00:00:00", "01/31/22", "Sun Jul 8 00:34:60 2001" ), "date" ) s$to_frame()$select( pl$coalesce( pl$col("date")$str$strptime(pl$Date, "%F", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%F %T", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%D", strict = FALSE), pl$col("date")$str$strptime(pl$Date, "%c", strict = FALSE) ) ) # Ignore invalid time s = as_polars_series( c( "2023-01-01 11:22:33 -0100", "2023-01-01 11:22:33 +0300", "invalid time" ) ) s$str$strptime( pl$Datetime("ns"), format = "%Y-%m-%d %H:%M:%S %z", strict = FALSE )
Return the last n characters of each string
ExprStr_tail(n)
ExprStr_tail(n)
n |
Length of the slice (integer or expression). Strings are parsed as column names. Negative indexing is supported. |
The n
input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a Unicode scalar value. A single character
is represented by a single byte when working with ASCII text, and a maximum
of 4 bytes otherwise.
When the n
input is negative, tail()
returns characters starting from the
n
th from the beginning of the string. For example, if n = -3
, then all
characters except the first three are returned.
If the string has fewer than n
characters, the full string is
returned.
Expr: Series of dtype String.
df = pl$DataFrame( s = c("pear", NA, "papaya", "dragonfruit"), n = c(3, 4, -2, -5) ) df$with_columns( s_tail_5 = pl$col("s")$str$tail(5), s_tail_n = pl$col("s")$str$tail("n") )
df = pl$DataFrame( s = c("pear", NA, "papaya", "dragonfruit"), n = c(3, 4, -2, -5) ) df$with_columns( s_tail_5 = pl$col("s")$str$tail(5), s_tail_n = pl$col("s")$str$tail("n") )
Convert a String column into a Date column
ExprStr_to_date(format = NULL, ..., strict = TRUE, exact = TRUE, cache = TRUE)
ExprStr_to_date(format = NULL, ..., strict = TRUE, exact = TRUE, cache = TRUE)
format |
Format to use for conversion. Refer to
the chrono crate documentation
for the full specification. Example: |
... |
Not used. |
strict |
If |
exact |
If |
cache |
Use a cache of unique, converted dates to apply the datetime conversion. |
Expr of Date type
s = as_polars_series(c("2020/01/01", "2020/02/01", "2020/03/01")) s$str$to_date() # by default, this errors if some values cannot be converted s = as_polars_series(c("2020/01/01", "2020 02 01", "2020-03-01")) try(s$str$to_date()) s$str$to_date(strict = FALSE)
s = as_polars_series(c("2020/01/01", "2020/02/01", "2020/03/01")) s$str$to_date() # by default, this errors if some values cannot be converted s = as_polars_series(c("2020/01/01", "2020 02 01", "2020-03-01")) try(s$str$to_date()) s$str$to_date(strict = FALSE)
Convert a String column into a Datetime column
ExprStr_to_datetime( format = NULL, ..., time_unit = NULL, time_zone = NULL, strict = TRUE, exact = TRUE, cache = TRUE, ambiguous = "raise" )
ExprStr_to_datetime( format = NULL, ..., time_unit = NULL, time_zone = NULL, strict = TRUE, exact = TRUE, cache = TRUE, ambiguous = "raise" )
format |
Format to use for conversion. Refer to
the chrono crate documentation
for the full specification. Example: |
... |
Not used. |
time_unit |
Unit of time for the resulting Datetime column. If |
time_zone |
Time zone for the resulting Datetime column. |
strict |
If |
exact |
If |
cache |
Use a cache of unique, converted dates to apply the datetime conversion. |
ambiguous |
Determine how to deal with ambiguous datetimes:
|
s = as_polars_series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) s$str$to_datetime("%Y-%m-%d %H:%M%#z") s$str$to_datetime(time_unit = "ms")
s = as_polars_series(c("2020-01-01 01:00Z", "2020-01-01 02:00Z")) s$str$to_datetime("%Y-%m-%d %H:%M%#z") s$str$to_datetime(time_unit = "ms")
Convert a String column into an Int64 column with base radix
ExprStr_to_integer(..., base = 10L, strict = TRUE)
ExprStr_to_integer(..., base = 10L, strict = TRUE)
... |
Ignored. |
base |
A positive integer or expression which is the base of the string
we are parsing. Characters are parsed as column names. Default: |
strict |
A logical. If |
Expression of data type Int64
.
df = pl$DataFrame(bin = c("110", "101", "010", "invalid")) df$with_columns( parsed = pl$col("bin")$str$to_integer(base = 2, strict = FALSE) ) df = pl$DataFrame(hex = c("fa1e", "ff00", "cafe", NA)) df$with_columns( parsed = pl$col("hex")$str$to_integer(base = 16, strict = TRUE) )
df = pl$DataFrame(bin = c("110", "101", "010", "invalid")) df$with_columns( parsed = pl$col("bin")$str$to_integer(base = 2, strict = FALSE) ) df = pl$DataFrame(hex = c("fa1e", "ff00", "cafe", NA)) df$with_columns( parsed = pl$col("hex")$str$to_integer(base = 16, strict = TRUE) )
Transform to lowercase variant.
ExprStr_to_lowercase()
ExprStr_to_lowercase()
Expr of String lowercase chars
pl$lit(c("A", "b", "c", "1", NA))$str$to_lowercase()$to_series()
pl$lit(c("A", "b", "c", "1", NA))$str$to_lowercase()$to_series()
Convert a String column into a Time column
ExprStr_to_time(format = NULL, ..., strict = TRUE, cache = TRUE)
ExprStr_to_time(format = NULL, ..., strict = TRUE, cache = TRUE)
format |
Format to use for conversion. Refer to
the chrono crate documentation
for the full specification. Example: |
... |
Not used. |
strict |
If |
cache |
Use a cache of unique, converted dates to apply the datetime conversion. |
Expr of Time type
s = as_polars_series(c("01:00", "02:00", "03:00")) s$str$to_time("%H:%M")
s = as_polars_series(c("01:00", "02:00", "03:00")) s$str$to_time("%H:%M")
Transform to titlecase variant.
ExprStr_to_titlecase()
ExprStr_to_titlecase()
This method is only available with the "nightly" feature.
See polars_info()
for more details.
Expr of String titlecase chars
pl$lit(c("hello there", "HI, THERE", NA))$str$to_titlecase()$to_series()
pl$lit(c("hello there", "HI, THERE", NA))$str$to_titlecase()$to_series()
Transform to uppercase variant.
ExprStr_to_uppercase()
ExprStr_to_uppercase()
Expr of String uppercase chars
pl$lit(c("A", "b", "c", "1", NA))$str$to_uppercase()$to_series()
pl$lit(c("A", "b", "c", "1", NA))$str$to_uppercase()$to_series()
Add zeroes to a string until it reaches n
characters. If the
number of characters is already greater than n
, the string is not modified.
ExprStr_zfill(alignment)
ExprStr_zfill(alignment)
alignment |
Fill the value up to this length. This can be an Expr or something coercible to an Expr. Strings are parsed as column names. |
Return a copy of the string left filled with ASCII '0' digits to make a string of length width.
A leading sign prefix ('+'/'-') is handled by inserting the padding after the
sign character rather than before. The original string is returned if width is
less than or equal to len(s)
.
Expr
some_floats_expr = pl$lit(c(0, 10, -5, 5)) # cast to String and ljust alignment = 5, and view as R char vector some_floats_expr$cast(pl$String)$str$zfill(5)$to_r() # cast to int and the to utf8 and then ljust alignment = 5, and view as R # char vector some_floats_expr$cast(pl$Int64)$cast(pl$String)$str$zfill(5)$to_r()
some_floats_expr = pl$lit(c(0, 10, -5, 5)) # cast to String and ljust alignment = 5, and view as R char vector some_floats_expr$cast(pl$String)$str$zfill(5)$to_r() # cast to int and the to utf8 and then ljust alignment = 5, and view as R # char vector some_floats_expr$cast(pl$Int64)$cast(pl$String)$str$zfill(5)$to_r()
Retrieve one of the fields of this Struct as a new Series
ExprStruct_field(name)
ExprStruct_field(name)
name |
Name of the field. |
Expr of datatype Struct
df = pl$DataFrame( aaa = c(1, 2), bbb = c("ab", "cd"), ccc = c(TRUE, NA), ddd = list(c(1, 2), 3) )$select( pl$struct(pl$all())$alias("struct_col") ) # struct field into a new Series df$select( pl$col("struct_col")$struct$field("bbb"), pl$col("struct_col")$struct$field("ddd") )
df = pl$DataFrame( aaa = c(1, 2), bbb = c("ab", "cd"), ccc = c(TRUE, NA), ddd = list(c(1, 2), 3) )$select( pl$struct(pl$all())$alias("struct_col") ) # struct field into a new Series df$select( pl$col("struct_col")$struct$field("bbb"), pl$col("struct_col")$struct$field("ddd") )
Rename the fields of the struct
ExprStruct_rename_fields(names)
ExprStruct_rename_fields(names)
names |
Vector or list of strings given in the same order as the struct's fields. Providing fewer names will drop the latter fields. If too many names are given, the extra names are ignored. |
Expr of datatype Struct
df = pl$DataFrame( aaa = 1:2, bbb = c("ab", "cd"), ccc = c(TRUE, NA), ddd = list(1:2, 3L) )$select( pl$struct(pl$all())$alias("struct_col") )$select( pl$col("struct_col")$struct$rename_fields(c("www", "xxx", "yyy", "zzz")) ) df$unnest()
df = pl$DataFrame( aaa = 1:2, bbb = c("ab", "cd"), ccc = c(TRUE, NA), ddd = list(1:2, 3L) )$select( pl$struct(pl$all())$alias("struct_col") )$select( pl$col("struct_col")$struct$rename_fields(c("www", "xxx", "yyy", "zzz")) ) df$unnest()
This is similar to $with_columns()
on
DataFrame
. Use pl$field()
to quickly
select a field in a $struct$with_fields()
context.
ExprStruct_with_fields(...)
ExprStruct_with_fields(...)
... |
Field(s) to add. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
An Expr
of data type Struct.
df = pl$DataFrame(x = c(1, 4, 9), y = c(4, 9, 16), multiply = c(10, 2, 3))$ with_columns(coords = pl$struct(c("x", "y")))$ select("coords", "multiply") df df = df$with_columns( pl$col("coords")$struct$with_fields( pl$field("x")$sqrt(), y_mul = pl$field("y") * pl$col("multiply") ) ) df df$unnest("coords")
df = pl$DataFrame(x = c(1, 4, 9), y = c(4, 9, 16), multiply = c(10, 2, 3))$ with_columns(coords = pl$struct(c("x", "y")))$ select("coords", "multiply") df df = df$with_columns( pl$col("coords")$struct$with_fields( pl$field("x")$sqrt(), y_mul = pl$field("y") * pl$col("multiply") ) ) df df$unnest("coords")
Deprecated. Use polars_options() to get, and pl$set_options() to set.
pl_get_global_rpool_cap() pl_set_global_rpool_cap(n)
pl_get_global_rpool_cap() pl_set_global_rpool_cap(n)
n |
Integer, the capacity limit of R sessions to process R code. |
Background R sessions communicate via polars arrow IPC (series/vectors) or R
serialize + shared memory buffers via the rust crate ipc-channel
.
Multi-process communication has overhead because all data must be
serialized/de-serialized and sent via buffers. Using multiple R sessions
will likely only give a speed-up in a low io - high cpu
scenario. Native
polars query syntax runs in threads and has no overhead. By default, Polars has
twice as many thread workers as cores. If any workers are queuing for or using R sessions,
other workers can still continue any native polars parts as much as possible.
polars_options()$rpool_cap
returns the capacity ("limit") of co-running external R sessions /
processes. polars_options()$rpool_active
is the number of R sessions already spawned
in the pool. rpool_cap
is the limit of new R sessions to spawn. Anytime a polars
thread worker needs a background R session specifically to run R code embedded
in a query via $map_batches(..., in_background = TRUE)
or $map_elements(..., in_background = TRUE)
,
it will obtain any R session idling in
rpool, or spawn a new R session (process) if capacity
is not already reached. If capacity
is already reached, the thread worker
will sleep in an R job queue until an R session is idle.
default = polars_options()$rpool_cap |> print() options(polars.rpool_cap = 8) polars_options()$rpool_cap options(polars.rpool_cap = default) polars_options()$rpool_cap
default = polars_options()$rpool_cap |> print() options(polars.rpool_cap = 8) polars_options()$rpool_cap options(polars.rpool_cap = default) polars_options()$rpool_cap
Aggregate a DataFrame over a groupby
GroupBy_agg(...)
GroupBy_agg(...)
... |
exprs to aggregate over.
... args can also be passed wrapped in a list |
aggregated DataFrame
pl$DataFrame( foo = c("one", "two", "two", "one", "two"), bar = c(5, 3, 2, 4, 1) )$group_by("foo")$agg( pl$col("bar")$sum()$name$suffix("_sum"), pl$col("bar")$mean()$alias("bar_tail_sum") )
pl$DataFrame( foo = c("one", "two", "two", "one", "two"), bar = c(5, 3, 2, 4, 1) )$group_by("foo")$agg( pl$col("bar")$sum()$name$suffix("_sum"), pl$col("bar")$mean()$alias("bar_tail_sum") )
The GroupBy class in R is just another interface on top of the
DataFrame in rust polars.
Groupby does not use the rust api for
<DataFrame>$group_by()
+ $agg()
because the groupby-struct is a reference to a DataFrame and that reference
will share lifetime with its parent DataFrame.
There is no way to expose lifetime limited objects via extendr currently
(might be quirky anyhow with R GC).
Instead the inputs for the group_by
are just stored on the R side until agg
is also called,
which produces a self-owned DataFrame object. GroupBy aggregations are performed
via the rust polars LazyGroupBy methods, see DataFrame.groupby_agg method.
$columns
returns a character vector with the column names.
as_polars_df(mtcars)$group_by("cyl")$agg( pl$col("mpg")$sum() )
as_polars_df(mtcars)$group_by("cyl")$agg( pl$col("mpg")$sum() )
Reduce the groups to the first value.
GroupBy_first()
GroupBy_first()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$first()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$first()
Reduce the groups to the last value.
GroupBy_last()
GroupBy_last()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$last()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$last()
Reduce the groups to the maximum value.
GroupBy_max()
GroupBy_max()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$max()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$max()
Reduce the groups to the mean value.
GroupBy_mean()
GroupBy_mean()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$mean()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$mean()
Reduce the groups to the median value.
GroupBy_median()
GroupBy_median()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$median()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$median()
Reduce the groups to the minimum value.
GroupBy_min()
GroupBy_min()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$min()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$min()
Create a new DataFrame that shows the null counts per column.
GroupBy_null_count()
GroupBy_null_count()
DataFrame
x = mtcars x[1:10, 3:5] = NA pl$DataFrame(x)$group_by("cyl")$null_count()
x = mtcars x[1:10, 3:5] = NA pl$DataFrame(x)$group_by("cyl")$null_count()
Aggregate the columns in the DataFrame to their quantile value.
GroupBy_quantile(quantile, interpolation = "nearest")
GroupBy_quantile(quantile, interpolation = "nearest")
quantile |
numeric Quantile between 0.0 and 1.0. |
interpolation |
string Interpolation method: "nearest", "higher", "lower", "midpoint", or "linear". |
GroupBy
as_polars_df(mtcars)$lazy()$quantile(.4)$collect()
as_polars_df(mtcars)$lazy()$quantile(.4)$collect()
Shift the values by a given period
GroupBy_shift(n = 1, fill_value = NULL)
GroupBy_shift(n = 1, fill_value = NULL)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
fill_value |
Fill the resulting null values with this value. Accepts expression input. Non-expression inputs are parsed as literals. |
GroupBy
as_polars_df(mtcars)$group_by("cyl")$shift(2)
as_polars_df(mtcars)$group_by("cyl")$shift(2)
Reduce the groups to the standard deviation value.
GroupBy_std()
GroupBy_std()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$std()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$std()
Reduce the groups to the sum value.
GroupBy_sum()
GroupBy_sum()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$sum()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$sum()
Revert the group by operation.
GroupBy_ungroup()
GroupBy_ungroup()
gb = as_polars_df(mtcars)$group_by("cyl") gb gb$ungroup()
gb = as_polars_df(mtcars)$group_by("cyl") gb gb$ungroup()
Reduce the groups to the variance value.
GroupBy_var()
GroupBy_var()
aggregated DataFrame
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$var()
df = pl$DataFrame( a = c(1, 2, 2, 3, 4, 5), b = c(0.5, 0.5, 4, 10, 13, 14), c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE), d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana") ) df$group_by("d", maintain_order = TRUE)$var()
Return the first or the last n parts of an object
They are equivalent to $head()
and $tail()
methods.
## S3 method for class 'RPolarsDataFrame' head(x, n = 6L, ...) ## S3 method for class 'RPolarsLazyFrame' head(x, n = 6L, ...) ## S3 method for class 'RPolarsDataFrame' tail(x, n = 6L, ...) ## S3 method for class 'RPolarsLazyFrame' tail(x, n = 6L, ...)
## S3 method for class 'RPolarsDataFrame' head(x, n = 6L, ...) ## S3 method for class 'RPolarsLazyFrame' head(x, n = 6L, ...) ## S3 method for class 'RPolarsDataFrame' tail(x, n = 6L, ...) ## S3 method for class 'RPolarsLazyFrame' tail(x, n = 6L, ...)
x |
A polars object |
n |
An integer vector of length 1.
Note that negative values are not supported for if |
... |
Ignored |
A polars object of the same class as x
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) lf = df$lazy() head(df, 2) tail(df, 2) head(lf, 2) tail(lf, 2) head(df, -2) tail(df, -2)
df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) lf = df$lazy() head(df, 2) tail(df, 2) head(lf, 2) tail(lf, 2) head(df, -2) tail(df, -2)
Infer nanoarrow schema from a Polars object
## S3 method for class 'RPolarsDataFrame' infer_nanoarrow_schema(x, ..., compat_level = FALSE) ## S3 method for class 'RPolarsSeries' infer_nanoarrow_schema(x, ..., compat_level = FALSE)
## S3 method for class 'RPolarsDataFrame' infer_nanoarrow_schema(x, ..., compat_level = FALSE) ## S3 method for class 'RPolarsSeries' infer_nanoarrow_schema(x, ..., compat_level = FALSE)
x |
A polars object |
... |
Ignored |
compat_level |
Use a specific compatibility level when exporting Polars’ internal data structures. This can be:
|
library(nanoarrow) pl_df = as_polars_df(mtcars)$select("mpg", "cyl") pl_s = as_polars_series(letters) infer_nanoarrow_schema(pl_df) infer_nanoarrow_schema(pl_s)
library(nanoarrow) pl_df = as_polars_df(mtcars)$select("mpg", "cyl") pl_s = as_polars_series(letters) infer_nanoarrow_schema(pl_df) infer_nanoarrow_schema(pl_s)
This function tests if the object is a polars DataFrame.
is_polars_df(x)
is_polars_df(x)
x |
An object |
A logical value
is_polars_df(mtcars) is_polars_df(as_polars_df(mtcars))
is_polars_df(mtcars) is_polars_df(as_polars_df(mtcars))
Test if the object is a polars DataType
is_polars_dtype(x, include_unknown = FALSE)
is_polars_dtype(x, include_unknown = FALSE)
x |
An object |
include_unknown |
If |
A logical value
is_polars_dtype(pl$Int64) is_polars_dtype(mtcars) is_polars_dtype(pl$Unknown) is_polars_dtype(pl$Unknown, include_unknown = TRUE)
is_polars_dtype(pl$Int64) is_polars_dtype(mtcars) is_polars_dtype(pl$Unknown) is_polars_dtype(pl$Unknown, include_unknown = TRUE)
This function tests if the object is a polars LazyFrame.
is_polars_lf(x)
is_polars_lf(x)
x |
An object |
A logical value
is_polars_lf(mtcars) is_polars_lf(as_polars_lf(mtcars))
is_polars_lf(mtcars) is_polars_lf(as_polars_lf(mtcars))
This function tests if the object is a polars Series.
is_polars_series(x)
is_polars_series(x)
x |
An object |
A logical value
is_polars_series(1:3) is_polars_series(as_polars_series(1:3))
is_polars_series(1:3) is_polars_series(as_polars_series(1:3))
Mimics Python Polars' NotebookFormatter for HTML outputs.
## S3 method for class 'RPolarsDataFrame' knit_print(x, ...)
## S3 method for class 'RPolarsDataFrame' knit_print(x, ...)
x |
a polars DataFrame to knit_print |
... |
additional arguments, not used |
Outputs HTML tables if the output format is HTML
and the document's df_print
option is not "default"
or "tibble"
.
Or, the output format can be enforced with R's options
function as follows:
options(polars.df_knitr_print = "default")
for the default print method.
options(polars.df_knitr_print = "html")
for the HTML table.
invisible x or NULL
This allows converting all columns to a datatype or converting only specific columns. Contrary to the Python implementation, it is not possible to convert all columns of a specific datatype to another datatype.
LazyFrame_cast(dtypes, ..., strict = TRUE)
LazyFrame_cast(dtypes, ..., strict = TRUE)
dtypes |
Either a datatype or a list where the names are column names and the values are the datatypes to convert to. |
... |
Ignored. |
strict |
If |
A LazyFrame
lf = pl$LazyFrame( foo = 1:3, bar = c(6, 7, 8), ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) ) # Cast only some columns lf$cast(list(foo = pl$Float32, bar = pl$UInt8))$collect() # Cast all columns to the same type lf$cast(pl$String)$collect()
lf = pl$LazyFrame( foo = 1:3, bar = c(6, 7, 8), ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) ) # Cast only some columns lf$cast(list(foo = pl$Float32, bar = pl$UInt8))$collect() # Cast all columns to the same type lf$cast(pl$String)$collect()
The LazyFrame
-class is simply two environments of respectively
the public and private methods/function calls to the polars rust side. The
instantiated LazyFrame
-object is an externalptr
to a low-level rust polars
LazyFrame object. The pointer address is the only statefulness of the
LazyFrame object on the R side. Any other state resides on the rust side. The
S3 method .DollarNames.RPolarsLazyFrame
exposes all public $foobar()
-methods which
are callable onto the object.
Most methods return another LazyFrame
-class instance or similar which allows
for method chaining. This class system, for lack of a better name, could be called
"environment classes" and is the same class system extendr provides, except
here there is both a public and private set of methods. For implementation
reasons, the private methods are external and must be called from
.pr$LazyFrame$methodname()
. Also, all private methods must take
any self as an argument, thus they are pure functions. Having the private methods
as pure functions solved/simplified self-referential complications.
DataFrame
and LazyFrame
can both be said to be a Frame
. To convert use
<DataFrame>$lazy()
and <LazyFrame>$collect()
.
You can also create a LazyFrame
directly with pl$LazyFrame()
.
This is quite similar to the lazy-collect syntax of the dplyr
package to
interact with database connections such as SQL variants. Most SQL databases
would be able to perform the same optimizations as polars, such as predicate pushdown
and projection pushdown. However, polars can interact and optimize queries with both
SQL DBs and other data sources such as Parquet files simultaneously.
$columns
returns a character vector with the column names.
$dtypes
returns an unnamed list with the data type of each column.
$schema
returns a named list with the data type of each column.
$width
returns the number of columns in the LazyFrame.
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
# see all exported methods ls(.pr$env$RPolarsLazyFrame) # see all private methods (not intended for regular use) ls(.pr$LazyFrame) ## Practical example ## # First writing R iris dataset to disk, to illustrte a difference temp_filepath = tempfile() write.csv(iris, temp_filepath, row.names = FALSE) # Following example illustrates 2 ways to obtain a LazyFrame # The-Okay-way: convert an in-memory DataFrame to LazyFrame # eager in-mem R data.frame Rdf = read.csv(temp_filepath) # eager in-mem polars DataFrame Pdf = as_polars_df(Rdf) # lazy frame starting from in-mem DataFrame Ldf_okay = Pdf$lazy() # The-Best-Way: LazyFrame created directly from a data source is best... Ldf_best = pl$scan_csv(temp_filepath) # ... as if to e.g. filter the LazyFrame, that filtering also caleld predicate will be # pushed down in the executation stack to the csv_reader, and thereby only bringing into # memory the rows matching to filter. # apply filter: filter_expr = pl$col("Species") == "setosa" # get only rows where Species is setosa Ldf_okay = Ldf_okay$filter(filter_expr) # overwrite LazyFrame with new Ldf_best = Ldf_best$filter(filter_expr) # the non optimized plans are similar, on entire in-mem csv, apply filter Ldf_okay$explain(optimized = FALSE) Ldf_best$explain(optimized = FALSE) # NOTE For Ldf_okay, the full time to load csv alrady paid when creating Rdf and Pdf # The optimized plan are quite different, Ldf_best will read csv and perform filter simultaneously Ldf_okay$explain() Ldf_best$explain() # To acquire result in-mem use $colelct() Pdf_okay = Ldf_okay$collect() Pdf_best = Ldf_best$collect() # verify tables would be the same all.equal( Pdf_okay$to_data_frame(), Pdf_best$to_data_frame() ) # a user might write it as a one-liner like so: Pdf_best2 = pl$scan_csv(temp_filepath)$filter(pl$col("Species") == "setosa")
# see all exported methods ls(.pr$env$RPolarsLazyFrame) # see all private methods (not intended for regular use) ls(.pr$LazyFrame) ## Practical example ## # First writing R iris dataset to disk, to illustrte a difference temp_filepath = tempfile() write.csv(iris, temp_filepath, row.names = FALSE) # Following example illustrates 2 ways to obtain a LazyFrame # The-Okay-way: convert an in-memory DataFrame to LazyFrame # eager in-mem R data.frame Rdf = read.csv(temp_filepath) # eager in-mem polars DataFrame Pdf = as_polars_df(Rdf) # lazy frame starting from in-mem DataFrame Ldf_okay = Pdf$lazy() # The-Best-Way: LazyFrame created directly from a data source is best... Ldf_best = pl$scan_csv(temp_filepath) # ... as if to e.g. filter the LazyFrame, that filtering also caleld predicate will be # pushed down in the executation stack to the csv_reader, and thereby only bringing into # memory the rows matching to filter. # apply filter: filter_expr = pl$col("Species") == "setosa" # get only rows where Species is setosa Ldf_okay = Ldf_okay$filter(filter_expr) # overwrite LazyFrame with new Ldf_best = Ldf_best$filter(filter_expr) # the non optimized plans are similar, on entire in-mem csv, apply filter Ldf_okay$explain(optimized = FALSE) Ldf_best$explain(optimized = FALSE) # NOTE For Ldf_okay, the full time to load csv alrady paid when creating Rdf and Pdf # The optimized plan are quite different, Ldf_best will read csv and perform filter simultaneously Ldf_okay$explain() Ldf_best$explain() # To acquire result in-mem use $colelct() Pdf_okay = Ldf_okay$collect() Pdf_best = Ldf_best$collect() # verify tables would be the same all.equal( Pdf_okay$to_data_frame(), Pdf_best$to_data_frame() ) # a user might write it as a one-liner like so: Pdf_best2 = pl$scan_csv(temp_filepath)$filter(pl$col("Species") == "setosa")
Returns an n-row null-filled LazyFrame with an identical schema. n
can be
greater than the current number of rows in the LazyFrame.
LazyFrame_clear(n = 0)
LazyFrame_clear(n = 0)
n |
Number of (null-filled) rows to return in the cleared frame. |
An n-row null-filled LazyFrame with an identical schema
df = pl$LazyFrame( a = c(NA, 2, 3, 4), b = c(0.5, NA, 2.5, 13), c = c(TRUE, TRUE, FALSE, NA) ) df$clear() df$clear(n = 5)
df = pl$LazyFrame( a = c(NA, 2, 3, 4), b = c(0.5, NA, 2.5, 13), c = c(TRUE, TRUE, FALSE, NA) ) df$clear() df$clear(n = 5)
This makes a very cheap deep copy/clone of an existing
LazyFrame
. Rarely useful as LazyFrame
s are nearly 100%
immutable. Any modification of a LazyFrame
should lead to a clone anyway,
but this can be useful when dealing with attributes (see examples).
LazyFrame_clone()
LazyFrame_clone()
A LazyFrame
df1 = as_polars_lf(iris) # Make a function to take a LazyFrame, add an attribute, and return a LazyFrame give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original LazyFrame also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_lf(iris) df2 = give_attr(df1) # now, the original LazyFrame doesn't get this attribute attributes(df1)
df1 = as_polars_lf(iris) # Make a function to take a LazyFrame, add an attribute, and return a LazyFrame give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original LazyFrame also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_lf(iris) df2 = give_attr(df1) # now, the original LazyFrame doesn't get this attribute attributes(df1)
$collect()
performs the query on the LazyFrame. It returns a
DataFrame
LazyFrame_collect( ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE )
LazyFrame_collect( ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE )
... |
Ignored. |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
no_optimization |
Logical. Sets the following parameters to |
collect_in_background |
Logical. Detach this query from R session. Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session. |
Note: use $fetch(n)
if you want to run your query on the first n
rows only.
This can be a huge time saver in debugging queries.
A DataFrame
$fetch()
- fast limited query check
$profile()
- same as $collect()
but also returns
a table with each operation profiled.
$collect_in_background()
- non-blocking
collect returns a future handle. Can also just be used via
$collect(collect_in_background = TRUE)
.
$sink_parquet()
streams query to a parquet file.
$sink_ipc()
streams query to an Arrow file.
as_polars_lf(iris)$filter(pl$col("Species") == "setosa")$collect()
as_polars_lf(iris)$filter(pl$col("Species") == "setosa")$collect()
This doesn't block the R session as it calls $collect()
in a
detached thread. This can also be used via $collect(collect_in_background = TRUE)
.
LazyFrame_collect_in_background()
LazyFrame_collect_in_background()
This function immediately returns an RThreadHandle.
Use <RPolarsRThreadHandle>$is_finished()
to see if done.
Use <RPolarsRThreadHandle>$join()
to wait and get the final result.
It is useful to not block the R session while the query executes. If you use
<Expr>$map_batches()
or
<Expr>$map_elements()
to run R functions in the query,
then you must pass in_background = TRUE
in $map_batches()
(or
$map_elements()
). Otherwise, $collect_in_background()
will fail because
the main R session is not available for polars execution. See also examples
below.
RThreadHandle, a future-like thread handle for the task
# Some expression which does contain a map expr = pl$col("mpg")$map_batches( \(x) { Sys.sleep(.1) x * 0.43 }, in_background = TRUE # set TRUE if collecting in background queries with $map or $apply )$alias("kml") # return is immediately a handle to another thread. handle = as_polars_lf(mtcars)$with_columns(expr)$collect_in_background() # ask if query is done if (!handle$is_finished()) print("not done yet") # get result, blocking until polars query is done df = handle$join() df
# Some expression which does contain a map expr = pl$col("mpg")$map_batches( \(x) { Sys.sleep(.1) x * 0.43 }, in_background = TRUE # set TRUE if collecting in background queries with $map or $apply )$alias("kml") # return is immediately a handle to another thread. handle = as_polars_lf(mtcars)$with_columns(expr)$collect_in_background() # ask if query is done if (!handle$is_finished()) print("not done yet") # get result, blocking until polars query is done df = handle$join() df
Drop columns of a LazyFrame
LazyFrame_drop(..., strict = TRUE)
LazyFrame_drop(..., strict = TRUE)
... |
Characters of column names to drop. Passed to |
strict |
Validate that all column names exist in the schema and throw an exception if a column name does not exist in the schema. |
LazyFrame
as_polars_lf(mtcars)$drop(c("mpg", "hp"))$collect() # equivalent as_polars_lf(mtcars)$drop("mpg", "hp")$collect()
as_polars_lf(mtcars)$drop(c("mpg", "hp"))$collect() # equivalent as_polars_lf(mtcars)$drop("mpg", "hp")$collect()
Drop all rows that contain nulls (which correspond to NA
in R).
LazyFrame_drop_nulls(subset = NULL)
LazyFrame_drop_nulls(subset = NULL)
subset |
A character vector with the names of the column(s) for which
nulls are considered. If |
LazyFrame
tmp = mtcars tmp[1:3, "mpg"] = NA tmp[4, "hp"] = NA tmp = pl$LazyFrame(tmp) # number of rows in `tmp` before dropping nulls tmp$collect()$height tmp$drop_nulls()$collect()$height tmp$drop_nulls("mpg")$collect()$height tmp$drop_nulls(c("mpg", "hp"))$collect()$height
tmp = mtcars tmp[1:3, "mpg"] = NA tmp[4, "hp"] = NA tmp = pl$LazyFrame(tmp) # number of rows in `tmp` before dropping nulls tmp$collect()$height tmp$drop_nulls()$collect()$height tmp$drop_nulls("mpg")$collect()$height tmp$drop_nulls(c("mpg", "hp"))$collect()$height
The query plan is read from bottom to top. When optimized = FALSE
, the
query as it was written by the user is shown. This is not what Polars runs.
Instead, it applies optimizations that are displayed by default by $explain()
.
One classic example is the predicate pushdown, which applies the filter as
early as possible (i.e. at the bottom of the plan).
LazyFrame_explain( ..., format = "plain", optimized = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE )
LazyFrame_explain( ..., format = "plain", optimized = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE )
... |
Ignored. |
format |
The format to use for displaying the logical plan. Must be either
|
optimized |
Return an optimized query plan. If |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
A character value containing the query plan.
lazy_frame = as_polars_lf(iris) # Prepare your query lazy_query = lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") # This is the query that was written by the user, without any optimizations # (use cat() for better printing) lazy_query$explain(optimized = FALSE) |> cat() # This is the query after `polars` optimizes it: instead of sorting first and # then filtering, it is faster to filter first and then sort the rest. lazy_query$explain() |> cat() # Also possible to see this as tree format lazy_query$explain(format = "tree") |> cat()
lazy_frame = as_polars_lf(iris) # Prepare your query lazy_query = lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") # This is the query that was written by the user, without any optimizations # (use cat() for better printing) lazy_query$explain(optimized = FALSE) |> cat() # This is the query after `polars` optimizes it: instead of sorting first and # then filtering, it is faster to filter first and then sort the rest. lazy_query$explain() |> cat() # Also possible to see this as tree format lazy_query$explain(format = "tree") |> cat()
This will take every element of a list column and add it on an additional row.
LazyFrame_explode(...)
LazyFrame_explode(...)
... |
Column(s) to be exploded as individual |
Only columns of DataType List
or Array
can be exploded.
Named expressions like $explode(a = pl$col("b"))
will not implicitly trigger
$alias("a")
here, because only the variant Expr::Column
is supported in
rust-polars.
LazyFrame
df = pl$LazyFrame( letters = c("aa", "aa", "bb", "cc"), numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)), numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers ) df # explode a single column, append others df$explode("numbers")$collect() # explode two columns of same nesting structure, by names or the common dtype # "List(Float64)" df$explode("numbers", "numbers_2")$collect() df$explode(pl$col(pl$List(pl$Float64)))$collect()
df = pl$LazyFrame( letters = c("aa", "aa", "bb", "cc"), numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)), numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers ) df # explode a single column, append others df$explode("numbers")$collect() # explode two columns of same nesting structure, by names or the common dtype # "List(Float64)" df$explode("numbers", "numbers_2")$collect() df$explode(pl$col(pl$List(pl$Float64)))$collect()
n
rows of a LazyFrame
This is similar to $collect()
but limits the number of rows to collect. It
is mostly useful to check that a query works as expected.
LazyFrame_fetch( n_rows = 500, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE )
LazyFrame_fetch( n_rows = 500, ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE )
n_rows |
Integer. Maximum number of rows to fetch. |
... |
Ignored. |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
no_optimization |
Logical. Sets the following parameters to |
$fetch()
does not guarantee the final number of rows in the DataFrame output.
It only guarantees that n
rows are used at the beginning of the query.
Filters, join operations and a lower number of rows available in the scanned
file influence the final number of rows.
A DataFrame of maximum n_rows
$collect()
- regular collect.
$profile()
- same as $collect()
but also returns
a table with each operation profiled.
$collect_in_background()
- non-blocking
collect returns a future handle. Can also just be used via
$collect(collect_in_background = TRUE)
.
$sink_parquet()
streams query to a parquet file.
$sink_ipc()
streams query to an Arrow file.
# fetch 3 rows as_polars_lf(iris)$fetch(3) # this fetch-query returns 4 rows, because we started with 3 and appended one # row in the query (see section 'Details') as_polars_lf(iris)$ select(pl$col("Species")$append("flora gigantica, alien"))$ fetch(3)
# fetch 3 rows as_polars_lf(iris)$fetch(3) # this fetch-query returns 4 rows, because we started with 3 and appended one # row in the query (see section 'Details') as_polars_lf(iris)$ select(pl$col("Species")$append("flora gigantica, alien"))$ fetch(3)
Fill floating point NaN value with a fill value
LazyFrame_fill_nan(value)
LazyFrame_fill_nan(value)
value |
Value used to fill |
LazyFrame
df = pl$LazyFrame( a = c(1.5, 2, NaN, 4), b = c(1.5, NaN, NaN, 4) ) df$fill_nan(99)$collect()
df = pl$LazyFrame( a = c(1.5, 2, NaN, 4), b = c(1.5, NaN, NaN, 4) ) df$fill_nan(99)$collect()
Fill null values (which correspond to NA
in R) using the
specified value or strategy.
LazyFrame_fill_null(fill_value)
LazyFrame_fill_null(fill_value)
fill_value |
Value to fill nulls with. |
LazyFrame
df = pl$LazyFrame( a = c(1.5, 2, NA, 4), b = c(1.5, NA, NA, 4) ) df$fill_null(99)$collect()
df = pl$LazyFrame( a = c(1.5, 2, NA, 4), b = c(1.5, NA, NA, 4) ) df$fill_null(99)$collect()
Filter rows with an Expression defining a boolean column.
Multiple expressions are combined with &
(AND).
This is equivalent to dplyr::filter()
.
LazyFrame_filter(...)
LazyFrame_filter(...)
... |
Polars expressions which will evaluate to a boolean. |
Rows where the condition returns NA
are dropped.
A new LazyFrame
object containing only the rows that satisfy the filter condition(s).
lf = as_polars_lf(iris) lf$filter(pl$col("Species") == "setosa")$collect() # This is equivalent to # lf$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) lf$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1)
lf = as_polars_lf(iris) lf$filter(pl$col("Species") == "setosa")$collect() # This is equivalent to # lf$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) lf$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1)
Get the first row of a LazyFrame
LazyFrame_first()
LazyFrame_first()
A LazyFrame with one row
as_polars_lf(mtcars)$first()$collect()
as_polars_lf(mtcars)$first()$collect()
Take every nth row in the LazyFrame
LazyFrame_gather_every(n, offset = 0)
LazyFrame_gather_every(n, offset = 0)
n |
Gather every |
offset |
Starting index. |
A LazyFrame
lf = pl$LazyFrame(a = 1:4, b = 5:8) lf$gather_every(2)$collect() lf$gather_every(2, offset = 1)$collect()
lf = pl$LazyFrame(a = 1:4, b = 5:8) lf$gather_every(2)$collect() lf$gather_every(2, offset = 1)$collect()
This doesn't modify the data but only stores information about
the group structure. This structure can then be used by several functions
($agg()
, $filter()
, etc.).
LazyFrame_group_by(..., maintain_order = polars_options()$maintain_order)
LazyFrame_group_by(..., maintain_order = polars_options()$maintain_order)
... |
Column(s) to group by. Accepts expression input. Characters are parsed as column names. |
maintain_order |
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
Setting this to |
LazyGroupBy (a LazyFrame with special groupby methods like $agg()
)
lf = pl$LazyFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) lf$group_by("a")$agg(pl$col("b")$sum())$collect() # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input. lf$group_by("a", maintain_order = TRUE)$agg(pl$col("c"))$collect() # Group by multiple columns by passing a list of column names. lf$group_by(c("a", "b"))$agg(pl$max("c"))$collect() # Or pass some arguments to group by multiple columns in the same way. # Expressions are also accepted. lf$group_by("a", pl$col("b") %/% 2)$agg( pl$col("c")$mean() )$collect() # The columns will be renamed to the argument names. lf$group_by(d = "a", e = pl$col("b") %/% 2)$agg( pl$col("c")$mean() )$collect()
lf = pl$LazyFrame( a = c("a", "b", "a", "b", "c"), b = c(1, 2, 1, 3, 3), c = c(5, 4, 3, 2, 1) ) lf$group_by("a")$agg(pl$col("b")$sum())$collect() # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input. lf$group_by("a", maintain_order = TRUE)$agg(pl$col("c"))$collect() # Group by multiple columns by passing a list of column names. lf$group_by(c("a", "b"))$agg(pl$max("c"))$collect() # Or pass some arguments to group by multiple columns in the same way. # Expressions are also accepted. lf$group_by("a", pl$col("b") %/% 2)$agg( pl$col("c")$mean() )$collect() # The columns will be renamed to the argument names. lf$group_by(d = "a", e = pl$col("b") %/% 2)$agg( pl$col("c")$mean() )$collect()
If you have a time series <t_0, t_1, ..., t_n>
, then by default the windows
created will be:
(t_0 - period, t_0]
(t_1 - period, t_1]
…
(t_n - period, t_n]
whereas if you pass a non-default offset, then the windows will be:
(t_0 + offset, t_0 + offset + period]
(t_1 + offset, t_1 + offset + period]
…
(t_n + offset, t_n + offset + period]
LazyFrame_group_by_dynamic( index_column, ..., every, period = NULL, offset = NULL, include_boundaries = FALSE, closed = "left", label = "left", group_by = NULL, start_by = "window" )
LazyFrame_group_by_dynamic( index_column, ..., every, period = NULL, offset = NULL, include_boundaries = FALSE, closed = "left", label = "left", group_by = NULL, start_by = "window" )
index_column |
Column used to group based on the time window. Often of
type Date/Datetime. This column must be sorted in ascending order (or, if |
... |
Ignored. |
every |
Interval of the window. |
period |
A character representing the length of the window,
must be non-negative. See the |
offset |
A character representing the offset of the window,
or |
include_boundaries |
Add two columns |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
label |
Define which label to use for the window:
|
group_by |
Also group by this column/these columns. |
start_by |
The strategy to determine the start of the first window by:
|
In case of a rolling operation on an integer column, the windows are defined by:
"1i" # length 1
"10i" # length 10
A LazyGroupBy object
lf = pl$LazyFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) lf$collect() # get the sum in the following hour relative to the "time" column lf$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() )$collect() # using "include_boundaries = TRUE" is helpful to see the period considered lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") )$collect() # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") )$collect() # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: lf$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") )$collect() # Dynamic group bys can also be combined with grouping on normal keys lf = lf$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) lf$collect() lf$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n"))$collect() # We can also create a dynamic group by based on an index column lf = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) lf$collect() lf$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))$collect()
lf = pl$LazyFrame( time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), interval = "30m" ), n = 0:6 ) lf$collect() # get the sum in the following hour relative to the "time" column lf$group_by_dynamic("time", every = "1h")$agg( vals = pl$col("n"), sum = pl$col("n")$sum() )$collect() # using "include_boundaries = TRUE" is helpful to see the period considered lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( vals = pl$col("n") )$collect() # in the example above, the values didn't include the one *exactly* 1h after # the start because "closed = 'left'" by default. # Changing it to "right" includes values that are exactly 1h after. Note that # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], # even if this interval wasn't there originally lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( vals = pl$col("n") )$collect() # To keep both boundaries, we use "closed = 'both'". Some values now belong to # several groups: lf$group_by_dynamic("time", every = "1h", closed = "both")$agg( vals = pl$col("n") )$collect() # Dynamic group bys can also be combined with grouping on normal keys lf = lf$with_columns( groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a")) ) lf$collect() lf$group_by_dynamic( "time", every = "1h", closed = "both", group_by = "groups", include_boundaries = TRUE )$agg(pl$col("n"))$collect() # We can also create a dynamic group by based on an index column lf = pl$LazyFrame( idx = 0:5, A = c("A", "A", "B", "B", "B", "C") )$with_columns(pl$col("idx")$set_sorted()) lf$collect() lf$group_by_dynamic( "idx", every = "2i", period = "3i", include_boundaries = TRUE, closed = "right" )$agg(A_agg_list = pl$col("A"))$collect()
n
rows. A shortcut for $slice(0, n)
.
Consider using the $fetch()
method if you want to test your query.
The $fetch()
operation will load the first n
rows at
the scan level, whereas $head()
is applied at the end.
LazyFrame_head(n = 5L)
LazyFrame_head(n = 5L)
n |
Number of rows to return. |
$limit()
is an alias for $head()
.
A new LazyFrame
object containing only the first n rows.
lf = pl$LazyFrame(a = 1:6, b = 7:12) lf$head()$collect() lf$head(2)$collect()
lf = pl$LazyFrame(a = 1:6, b = 7:12) lf$head()$collect() lf$head(2)$collect()
This function can do both mutating joins (adding columns based on matching
observations, for example with how = "left"
) and filtering joins (keeping
observations based on matching observations, for example with how = "semi"
).
LazyFrame_join( other, on = NULL, how = "inner", ..., left_on = NULL, right_on = NULL, suffix = "_right", validate = "m:m", join_nulls = FALSE, allow_parallel = TRUE, force_parallel = FALSE, coalesce = NULL )
LazyFrame_join( other, on = NULL, how = "inner", ..., left_on = NULL, right_on = NULL, suffix = "_right", validate = "m:m", join_nulls = FALSE, allow_parallel = TRUE, force_parallel = FALSE, coalesce = NULL )
other |
LazyFrame to join with. |
on |
Either a vector of column names or a list of expressions and/or
strings. Use |
how |
One of the following methods: "inner", "left", "right", "full", "semi", "anti", "cross". |
... |
Ignored. |
left_on , right_on
|
Same as |
suffix |
Suffix to add to duplicated column names. |
validate |
Checks if join is of specified type:
Note that this is currently not supported by the streaming engine, and is only supported when joining by single columns. |
join_nulls |
Join on null values. By default null values will never produce matches. |
allow_parallel |
Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel. |
force_parallel |
Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. |
coalesce |
Coalescing behavior (merging of join columns).
|
LazyFrame
# inner join by default df1 = pl$LazyFrame(list(key = 1:3, payload = c("f", "i", NA))) df2 = pl$LazyFrame(list(key = c(3L, 4L, 5L, NA_integer_))) df1$join(other = df2, on = "key") # cross join df1 = pl$LazyFrame(x = letters[1:3]) df2 = pl$LazyFrame(y = 1:4) df1$join(other = df2, how = "cross") # use "validate" to ensure join keys are not duplicated df1 = pl$LazyFrame(x = letters[1:5], y = 1:5) df2 = pl$LazyFrame(x = c("a", letters[1:4]), y2 = 6:10) # this throws an error because there are two keys in df2 that match the key # in df1 tryCatch( df1$join(df2, on = "x", validate = "1:1")$collect(), error = function(e) print(e) )
# inner join by default df1 = pl$LazyFrame(list(key = 1:3, payload = c("f", "i", NA))) df2 = pl$LazyFrame(list(key = c(3L, 4L, 5L, NA_integer_))) df1$join(other = df2, on = "key") # cross join df1 = pl$LazyFrame(x = letters[1:3]) df2 = pl$LazyFrame(y = 1:4) df1$join(other = df2, how = "cross") # use "validate" to ensure join keys are not duplicated df1 = pl$LazyFrame(x = letters[1:5], y = 1:5) df2 = pl$LazyFrame(x = c("a", letters[1:4]), y2 = 6:10) # this throws an error because there are two keys in df2 that match the key # in df1 tryCatch( df1$join(df2, on = "x", validate = "1:1")$collect(), error = function(e) print(e) )
This is similar to a left-join except that we match on the nearest key rather than on equal keys.
LazyFrame_join_asof( other, ..., left_on = NULL, right_on = NULL, on = NULL, by_left = NULL, by_right = NULL, by = NULL, strategy = c("backward", "forward", "nearest"), suffix = "_right", tolerance = NULL, allow_parallel = TRUE, force_parallel = FALSE, coalesce = TRUE )
LazyFrame_join_asof( other, ..., left_on = NULL, right_on = NULL, on = NULL, by_left = NULL, by_right = NULL, by = NULL, strategy = c("backward", "forward", "nearest"), suffix = "_right", tolerance = NULL, allow_parallel = TRUE, force_parallel = FALSE, coalesce = TRUE )
other |
LazyFrame |
... |
Not used, blocks use of further positional arguments |
left_on , right_on
|
Same as |
on |
Either a vector of column names or a list of expressions and/or
strings. Use |
by_left , by_right
|
Same as |
by |
Join on these columns before performing asof join. Either a vector
of column names or a list of expressions and/or strings. Use |
strategy |
Strategy for where to find match:
|
suffix |
Suffix to add to duplicated column names. |
tolerance |
Numeric tolerance. By setting this the join will only be done if the near
keys are within this distance. If an asof join is done on columns of dtype
"Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
About the language, see the "Polars duration string language" section below. There may be a circumstance where R types are not sufficient to express a
numeric tolerance. In that case, you can use the expression syntax like
|
allow_parallel |
Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel. |
force_parallel |
Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. |
coalesce |
Coalescing behavior (merging of
|
Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
# # create two LazyFrame to join asof gdp = pl$LazyFrame( date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")), gdp = c(4321, 4164, 4411, 4566, 4696), group = c("b", "a", "a", "b", "b") ) pop = pl$LazyFrame( date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")), population = c(82.19, 82.66, 83.12, 83.52), group = c("b", "b", "a", "a") ) # optional make sure tables are already sorted with "on" join-key gdp = gdp$sort("date") pop = pop$sort("date") # Left-join_asof LazyFrame pop with gdp on "date" # Look backward in gdp to find closest matching date pop$join_asof(gdp, on = "date", strategy = "backward")$collect() # .... and forward pop$join_asof(gdp, on = "date", strategy = "forward")$collect() # join by a group: "only look within groups" pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect() # only look 2 weeks and 2 days back pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect() # only look 11 days back (numeric tolerance depends on polars type, <date> is in days) pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
# # create two LazyFrame to join asof gdp = pl$LazyFrame( date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")), gdp = c(4321, 4164, 4411, 4566, 4696), group = c("b", "a", "a", "b", "b") ) pop = pl$LazyFrame( date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")), population = c(82.19, 82.66, 83.12, 83.52), group = c("b", "b", "a", "a") ) # optional make sure tables are already sorted with "on" join-key gdp = gdp$sort("date") pop = pop$sort("date") # Left-join_asof LazyFrame pop with gdp on "date" # Look backward in gdp to find closest matching date pop$join_asof(gdp, on = "date", strategy = "backward")$collect() # .... and forward pop$join_asof(gdp, on = "date", strategy = "forward")$collect() # join by a group: "only look within groups" pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect() # only look 2 weeks and 2 days back pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect() # only look 11 days back (numeric tolerance depends on polars type, <date> is in days) pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
This performs an inner join, so only rows where all predicates are true are included in the result, and a row from either LazyFrame may be included multiple times in the result.
Note that the row order of the input LazyFrames is not preserved.
LazyFrame_join_where(other, ..., suffix = "_right")
LazyFrame_join_where(other, ..., suffix = "_right")
other |
LazyFrame to join with. |
... |
(In)Equality condition to join the two tables on. When a column
name occurs in both tables, the proper suffix must be applied in the
predicate. For example, if both tables have a column |
suffix |
Suffix to append to columns with a duplicate name. |
A LazyFrame
east = pl$LazyFrame( id = c(100, 101, 102), dur = c(120, 140, 160), rev = c(12, 14, 16), cores = c(2, 8, 4) ) west = pl$LazyFrame( t_id = c(404, 498, 676, 742), time = c(90, 130, 150, 170), cost = c(9, 13, 15, 16), cores = c(4, 2, 1, 4) ) east$join_where( west, pl$col("dur") < pl$col("time"), pl$col("rev") < pl$col("cost") )$collect()
east = pl$LazyFrame( id = c(100, 101, 102), dur = c(120, 140, 160), rev = c(12, 14, 16), cores = c(2, 8, 4) ) west = pl$LazyFrame( t_id = c(404, 498, 676, 742), time = c(90, 130, 150, 170), cost = c(9, 13, 15, 16), cores = c(4, 2, 1, 4) ) east$join_where( west, pl$col("dur") < pl$col("time"), pl$col("rev") < pl$col("cost") )$collect()
Get the last row of the LazyFrame (each column is reduced to its last value).
LazyFrame_last()
LazyFrame_last()
A LazyFrame with one row
as_polars_lf(mtcars)$last()$collect()
as_polars_lf(mtcars)$last()$collect()
Aggregate the columns in the LazyFrame to their maximum value.
LazyFrame_max()
LazyFrame_max()
A LazyFrame with one row
as_polars_lf(mtcars)$max()$collect()
as_polars_lf(mtcars)$max()$collect()
Aggregate the columns in the LazyFrame to their mean value.
LazyFrame_mean()
LazyFrame_mean()
A LazyFrame with one row
as_polars_lf(mtcars)$mean()$collect()
as_polars_lf(mtcars)$mean()$collect()
Aggregate the columns in the LazyFrame to their median value.
LazyFrame_median()
LazyFrame_median()
A LazyFrame with one row
as_polars_lf(mtcars)$median()$collect()
as_polars_lf(mtcars)$median()$collect()
Aggregate the columns in the LazyFrame to their minimum value.
LazyFrame_min()
LazyFrame_min()
A LazyFrame with one row
as_polars_lf(mtcars)$min()$collect()
as_polars_lf(mtcars)$min()$collect()
Can be used in the middle of a method chain.
LazyFrame_print(x)
LazyFrame_print(x)
x |
LazyFrame |
self
as_polars_lf(iris)$print()
as_polars_lf(iris)$print()
This will run the query and return a list containing the materialized DataFrame and a DataFrame that contains profiling information of each node that is executed.
LazyFrame_profile( type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE, show_plot = FALSE, truncate_nodes = 0 )
LazyFrame_profile( type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE, no_optimization = FALSE, collect_in_background = FALSE, show_plot = FALSE, truncate_nodes = 0 )
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
no_optimization |
Logical. Sets the following parameters to |
collect_in_background |
Logical. Detach this query from R session. Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session. |
show_plot |
Show a Gantt chart of the profiling result |
truncate_nodes |
Truncate the label lengths in the Gantt chart to this
number of characters. If |
The units of the timings are microseconds.
List of two DataFrame
s: one with the collected result, the other
with the timings of each step. If show_plot = TRUE
, then the plot is
also stored in the list.
$collect()
- regular collect.
$fetch()
- fast limited query check
$collect_in_background()
- non-blocking
collect returns a future handle. Can also just be used via
$collect(collect_in_background = TRUE)
.
$sink_parquet()
streams query to a parquet file.
$sink_ipc()
streams query to an Arrow IPC file.
## Simplest use case pl$LazyFrame()$select(pl$lit(2) + 2)$profile() ## Use $profile() to compare two queries # -1- map each Species-group with native polars, takes ~120us only as_polars_lf(iris)$ sort("Sepal.Length")$ group_by("Species", maintain_order = TRUE)$ agg(pl$col(pl$Float64)$first() + 5)$ profile() # -2- map each Species-group of each numeric column with an R function, takes ~7000us (slow!) # some R function, prints `.` for each time called by polars r_func = \(s) { cat(".") s$to_r()[1] + 5 } as_polars_lf(iris)$ sort("Sepal.Length")$ group_by("Species", maintain_order = TRUE)$ agg(pl$col(pl$Float64)$map_elements(r_func))$ profile()
## Simplest use case pl$LazyFrame()$select(pl$lit(2) + 2)$profile() ## Use $profile() to compare two queries # -1- map each Species-group with native polars, takes ~120us only as_polars_lf(iris)$ sort("Sepal.Length")$ group_by("Species", maintain_order = TRUE)$ agg(pl$col(pl$Float64)$first() + 5)$ profile() # -2- map each Species-group of each numeric column with an R function, takes ~7000us (slow!) # some R function, prints `.` for each time called by polars r_func = \(s) { cat(".") s$to_r()[1] + 5 } as_polars_lf(iris)$ sort("Sepal.Length")$ group_by("Species", maintain_order = TRUE)$ agg(pl$col(pl$Float64)$map_elements(r_func))$ profile()
Aggregate the columns in the LazyFrame to a single quantile
value. Use $describe()
to specify several quantiles.
LazyFrame_quantile(quantile, interpolation = "nearest")
LazyFrame_quantile(quantile, interpolation = "nearest")
quantile |
Numeric of length 1 between 0 and 1. |
interpolation |
One of |
LazyFrame
as_polars_lf(mtcars)$quantile(.4)$collect()
as_polars_lf(mtcars)$quantile(.4)$collect()
Rename column names of a LazyFrame
LazyFrame_rename(...)
LazyFrame_rename(...)
... |
One of the following:
|
If existing names are swapped (e.g. A
points to B
and B
points to A
),
polars will block projection and predicate pushdowns at this node.
lf = pl$LazyFrame( foo = 1:3, bar = 6:8, ham = letters[1:3] ) lf$rename(foo = "apple")$collect() lf$rename( \(column_name) paste0("c", substr(column_name, 2, 100)) )$collect()
lf = pl$LazyFrame( foo = 1:3, bar = 6:8, ham = letters[1:3] ) lf$rename(foo = "apple")$collect() lf$rename( \(column_name) paste0("c", substr(column_name, 2, 100)) )$collect()
Reverse the LazyFrame (the last row becomes the first one, etc.).
LazyFrame_reverse()
LazyFrame_reverse()
LazyFrame
as_polars_lf(mtcars)$reverse()$collect()
as_polars_lf(mtcars)$reverse()$collect()
If you have a time series <t_0, t_1, ..., t_n>
, then by default the windows
created will be:
(t_0 - period, t_0]
(t_1 - period, t_1]
…
(t_n - period, t_n]
whereas if you pass a non-default offset, then the windows will be:
(t_0 + offset, t_0 + offset + period]
(t_1 + offset, t_1 + offset + period]
…
(t_n + offset, t_n + offset + period]
LazyFrame_rolling( index_column, ..., period, offset = NULL, closed = "right", group_by = NULL )
LazyFrame_rolling( index_column, ..., period, offset = NULL, closed = "right", group_by = NULL )
index_column |
Column used to group based on the time window. Often of
type Date/Datetime. This column must be sorted in ascending order (or, if |
... |
Ignored. |
period |
A character representing the length of the window,
must be non-negative. See the |
offset |
A character representing the offset of the window,
or |
closed |
Define which sides of the temporal interval are closed
(inclusive). This can be either |
group_by |
Also group by this column/these columns. |
In case of a rolling operation on an integer column, the windows are defined by:
"1i" # length 1
"10i" # length 10
A LazyGroupBy object
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
dates = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns( pl$col("dt")$str$strptime(pl$Datetime())$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( sum_a = pl$sum("a"), min_a = pl$min("a"), max_a = pl$max("a") )$collect()
dates = c( "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" ) df = pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns( pl$col("dt")$str$strptime(pl$Datetime())$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( sum_a = pl$sum("a"), min_a = pl$min("a"), max_a = pl$max("a") )$collect()
Similar to dplyr::mutate()
. However, it discards unmentioned
columns (like .()
in data.table
).
LazyFrame_select(...)
LazyFrame_select(...)
... |
Columns to keep. Those can be expressions (e.g |
A LazyFrame
as_polars_lf(iris)$select( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
as_polars_lf(iris)$select( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
Similar to dplyr::mutate()
. However, it discards unmentioned columns (like
.()
in data.table
).
This will run all expressions sequentially instead of in parallel. Use this
when the work per expression is cheap. Otherwise, $select()
should be
preferred.
LazyFrame_select_seq(...)
LazyFrame_select_seq(...)
... |
Columns to keep. Those can be expressions (e.g |
A LazyFrame
as_polars_lf(iris)$select_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
as_polars_lf(iris)$select_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") )
Note that not all LazyFrames can be serialized. For example, LazyFrames that
contain UDFs such as $map_elements()
cannot be serialized.
LazyFrame_serialize()
LazyFrame_serialize()
A character of the JSON representation of the logical plan
lf = pl$LazyFrame(a = 1:3)$sum() json = lf$serialize() json # The logical plan can later be deserialized back into a LazyFrame. pl$deserialize_lf(json)$collect()
lf = pl$LazyFrame(a = 1:3)$sum() json = lf$serialize() json # The logical plan can later be deserialized back into a LazyFrame. pl$deserialize_lf(json)$collect()
Shift the values by a given period. If the period (n
) is positive,
then n
rows will be inserted at the top of the LazyFrame and the last n
rows will be discarded. Vice-versa if the period is negative. In the end,
the total number of rows of the LazyFrame doesn't change.
LazyFrame_shift(n = 1, fill_value = NULL)
LazyFrame_shift(n = 1, fill_value = NULL)
n |
Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. |
fill_value |
Fill the resulting null values with this value. Accepts expression input. Non-expression inputs are parsed as literals. |
LazyFrame
lf = pl$LazyFrame(a = 1:4, b = 5:8) lf$shift(2)$collect() lf$shift(-2)$collect() lf$shift(-2, fill_value = 100)$collect()
lf = pl$LazyFrame(a = 1:4, b = 5:8) lf$shift(2)$collect() lf$shift(-2)$collect() lf$shift(-2, fill_value = 100)$collect()
This writes the output of a query directly to a CSV file without collecting it in the R session first. This is useful if the output of the query is still larger than RAM as it would crash the R session if it was collected into R.
LazyFrame_sink_csv( path, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_values = "", quote_style = "necessary", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
LazyFrame_sink_csv( path, ..., include_bom = FALSE, include_header = TRUE, separator = ",", line_terminator = "\n", quote_char = "\"", batch_size = 1024, datetime_format = NULL, date_format = NULL, time_format = NULL, float_precision = NULL, null_values = "", quote_style = "necessary", maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
path |
A character. File path to which the file should be written. |
... |
Ignored. |
include_bom |
Whether to include UTF-8 BOM (byte order mark) in the CSV output. |
include_header |
Whether to include header in the CSV output. |
separator |
Separate CSV fields with this symbol. |
line_terminator |
String used to end each row. |
quote_char |
Byte to use as quoting character. |
batch_size |
Number of rows that will be processed per thread. |
datetime_format |
A format string, with the specifiers defined by the chrono Rust crate. If no format specified, the default fractional-second precision is inferred from the maximum timeunit found in the frame’s Datetime cols (if any). |
date_format |
A format string, with the specifiers defined by the chrono Rust crate. |
time_format |
A format string, with the specifiers defined by the chrono Rust crate. |
float_precision |
Number of decimal places to write, applied to both Float32 and Float64 datatypes. |
null_values |
A string representing null values (defaulting to the empty string). |
quote_style |
Determines the quoting strategy used.
|
maintain_order |
Maintain the order in which data is processed. Setting
this to |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
no_optimization |
Logical. Sets the following parameters to |
Invisibly returns the input LazyFrame
# sink table 'mtcars' from mem to CSV tmpf = tempfile() as_polars_lf(mtcars)$sink_csv(tmpf) # stream a query end-to-end tmpf2 = tempfile() pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2) # load parquet directly into a DataFrame / memory pl$scan_csv(tmpf2)$collect()
# sink table 'mtcars' from mem to CSV tmpf = tempfile() as_polars_lf(mtcars)$sink_csv(tmpf) # stream a query end-to-end tmpf2 = tempfile() pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2) # load parquet directly into a DataFrame / memory pl$scan_csv(tmpf2)$collect()
This writes the output of a query directly to an Arrow IPC file without collecting it in the R session first. This is useful if the output of the query is still larger than RAM as it would crash the R session if it was collected into R.
LazyFrame_sink_ipc( path, ..., compression = c("zstd", "lz4", "uncompressed"), maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
LazyFrame_sink_ipc( path, ..., compression = c("zstd", "lz4", "uncompressed"), maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
path |
A character. File path to which the file should be written. |
... |
Ignored. |
compression |
|
maintain_order |
Maintain the order in which data is processed. Setting
this to |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
no_optimization |
Logical. Sets the following parameters to |
Invisibly returns the input LazyFrame
# sink table 'mtcars' from mem to ipc tmpf = tempfile() as_polars_lf(mtcars)$sink_ipc(tmpf) # stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040) # tmpf2 = tempfile() # pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2) # load ipc directly into a DataFrame / memory # pl$scan_ipc(tmpf2)$collect()
# sink table 'mtcars' from mem to ipc tmpf = tempfile() as_polars_lf(mtcars)$sink_ipc(tmpf) # stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040) # tmpf2 = tempfile() # pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2) # load ipc directly into a DataFrame / memory # pl$scan_ipc(tmpf2)$collect()
This writes the output of a query directly to a newline-delimited JSON (NDJSON) file without collecting it in the R session first. This is useful if the output of the query is still larger than RAM as it would crash the R session if it was collected into R.
LazyFrame_sink_ndjson( path, ..., maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
LazyFrame_sink_ndjson( path, ..., maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
path |
A character. File path to which the file should be written. |
... |
Ignored. |
maintain_order |
Maintain the order in which data is processed. Setting
this to |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
no_optimization |
Logical. Sets the following parameters to |
Invisibly returns the input LazyFrame
# sink table 'mtcars' from mem to JSON tmpf = tempfile(fileext = ".json") as_polars_lf(mtcars)$sink_ndjson(tmpf) # load NDJSON directly into a DataFrame / memory pl$scan_ndjson(tmpf)$collect()
# sink table 'mtcars' from mem to JSON tmpf = tempfile(fileext = ".json") as_polars_lf(mtcars)$sink_ndjson(tmpf) # load NDJSON directly into a DataFrame / memory pl$scan_ndjson(tmpf)$collect()
This writes the output of a query directly to a Parquet file without collecting it in the R session first. This is useful if the output of the query is still larger than RAM as it would crash the R session if it was collected into R.
LazyFrame_sink_parquet( path, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
LazyFrame_sink_parquet( path, ..., compression = "zstd", compression_level = 3, statistics = TRUE, row_group_size = NULL, data_page_size = NULL, maintain_order = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, no_optimization = FALSE )
path |
A character. File path to which the file should be written. |
... |
Ignored. |
compression |
String. The compression method. One of:
|
compression_level |
|
statistics |
Whether statistics should be written to the Parquet headers. Possible values:
|
row_group_size |
|
data_page_size |
Size of the data page in bytes. If |
maintain_order |
Maintain the order in which data is processed. Setting
this to |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
no_optimization |
Logical. Sets the following parameters to |
Invisibly returns the input LazyFrame
# sink table 'mtcars' from mem to parquet tmpf = tempfile() as_polars_lf(mtcars)$sink_parquet(tmpf) # stream a query end-to-end tmpf2 = tempfile() pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2) # load parquet directly into a DataFrame / memory pl$scan_parquet(tmpf2)$collect()
# sink table 'mtcars' from mem to parquet tmpf = tempfile() as_polars_lf(mtcars)$sink_parquet(tmpf) # stream a query end-to-end tmpf2 = tempfile() pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2) # load parquet directly into a DataFrame / memory pl$scan_parquet(tmpf2)$collect()
Get a slice of the LazyFrame.
LazyFrame_slice(offset, length = NULL)
LazyFrame_slice(offset, length = NULL)
offset |
Start index, can be a negative value. This is 0-indexed, so
|
length |
Length of the slice. If |
as_polars_lf(mtcars)$slice(2, 4)$collect() as_polars_lf(mtcars)$slice(30)$collect() mtcars[2:6, ]
as_polars_lf(mtcars)$slice(2, 4)$collect() as_polars_lf(mtcars)$slice(30)$collect() mtcars[2:6, ]
Sort the LazyFrame by the given columns
LazyFrame_sort( by, ..., descending = FALSE, nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE )
LazyFrame_sort( by, ..., descending = FALSE, nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE )
by |
Column(s) to sort by. Can be character vector of column names, a list of Expr(s) or a list with a mix of Expr(s) and column names. |
... |
More columns to sort by as above but provided one Expr per argument. |
descending |
Logical. Sort in descending order (default is |
nulls_last |
A logical or logical vector of the same length as the number of columns.
If |
maintain_order |
Whether the order should be maintained if elements are
equal. If |
multithreaded |
A logical. If |
LazyFrame
df = mtcars df$mpg[1] = NA df = pl$LazyFrame(df) df$sort("mpg")$collect() df$sort("mpg", nulls_last = TRUE)$collect() df$sort("cyl", "mpg")$collect() df$sort(c("cyl", "mpg"))$collect() df$sort(c("cyl", "mpg"), descending = TRUE)$collect() df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))$collect() df$sort(pl$col("cyl"), pl$col("mpg"))$collect()
df = mtcars df$mpg[1] = NA df = pl$LazyFrame(df) df$sort("mpg")$collect() df$sort("mpg", nulls_last = TRUE)$collect() df$sort("cyl", "mpg")$collect() df$sort(c("cyl", "mpg"))$collect() df$sort(c("cyl", "mpg"), descending = TRUE)$collect() df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))$collect() df$sort(pl$col("cyl"), pl$col("mpg"))$collect()
The calling frame is automatically registered as a table in the SQL context
under the name "self"
. All DataFrames and
LazyFrames found in the envir
are also registered,
using their variable name.
More control over registration and execution behaviour is available by
using the SQLContext object.
LazyFrame_sql(query, ..., table_name = NULL, envir = parent.frame())
LazyFrame_sql(query, ..., table_name = NULL, envir = parent.frame())
query |
A character of the SQL query to execute. |
... |
Ignored. |
table_name |
|
envir |
The environment to search for polars DataFrames/LazyFrames. |
This functionality is considered unstable, although it is close to being considered stable. It may be changed at any point without it being considered a breaking change.
lf1 = pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x")) lf2 = pl$LazyFrame(a = 3:1, d = c(125, -654, 888)) # Query the LazyFrame using SQL: lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect() # Join two LazyFrames: lf1$sql( " SELECT self.*, d FROM self INNER JOIN lf2 USING (a) WHERE a > 1 AND b < 8 " )$collect() # Apply SQL transforms (aliasing "self" to "frame") and subsequently # filter natively (you can freely mix SQL and native operations): lf1$sql( query = r"( SELECT a, MOD(a, 2) == 0 AS a_is_even, (b::float / 2) AS 'b/2', CONCAT_WS(':', c, c, c) AS c_c_c FROM frame ORDER BY a )", table_name = "frame" )$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
lf1 = pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x")) lf2 = pl$LazyFrame(a = 3:1, d = c(125, -654, 888)) # Query the LazyFrame using SQL: lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect() # Join two LazyFrames: lf1$sql( " SELECT self.*, d FROM self INNER JOIN lf2 USING (a) WHERE a > 1 AND b < 8 " )$collect() # Apply SQL transforms (aliasing "self" to "frame") and subsequently # filter natively (you can freely mix SQL and native operations): lf1$sql( query = r"( SELECT a, MOD(a, 2) == 0 AS a_is_even, (b::float / 2) AS 'b/2', CONCAT_WS(':', c, c, c) AS c_c_c FROM frame ORDER BY a )", table_name = "frame" )$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
Aggregate the columns of this LazyFrame to their standard deviation values.
LazyFrame_std(ddof = 1)
LazyFrame_std(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A LazyFrame with one row
as_polars_lf(mtcars)$std()$collect()
as_polars_lf(mtcars)$std()$collect()
Aggregate the columns of this LazyFrame to their sum values.
LazyFrame_sum()
LazyFrame_sum()
A LazyFrame with one row
as_polars_lf(mtcars)$sum()$collect()
as_polars_lf(mtcars)$sum()$collect()
Get the last n rows.
LazyFrame_tail(n = 5L)
LazyFrame_tail(n = 5L)
n |
Number of rows to return. |
A new LazyFrame
object with applied filter.
lf = pl$LazyFrame(a = 1:6, b = 7:12) lf$tail()$collect() lf$tail(2)$collect()
lf = pl$LazyFrame(a = 1:6, b = 7:12) lf$tail()$collect() lf$tail(2)$collect()
This only returns the "dot" output that can be passed to other packages, such
as DiagrammeR::grViz()
.
LazyFrame_to_dot( ..., optimized = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE )
LazyFrame_to_dot( ..., optimized = TRUE, type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, cluster_with_columns = TRUE, streaming = FALSE )
... |
Not used. |
optimized |
Optimize the query plan. |
type_coercion |
Logical. Coerce types such that operations succeed and run on minimal required memory. |
predicate_pushdown |
Logical. Applies filters as early as possible at scan level. |
projection_pushdown |
Logical. Select only the columns that are needed at the scan level. |
simplify_expression |
Logical. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. |
slice_pushdown |
Logical. Only load the required slice from the scan
level. Don't materialize sliced outputs (e.g. |
comm_subplan_elim |
Logical. Will try to cache branching subplans that occur on self-joins or unions. |
comm_subexpr_elim |
Logical. Common subexpressions will be cached and reused. |
cluster_with_columns |
Combine sequential independent calls to
|
streaming |
Logical. Run parts of the query in a streaming fashion (this is in an alpha state). |
A character vector
lf = pl$LazyFrame( a = c("a", "b", "a", "b", "b", "c"), b = 1:6, c = 6:1 ) query = lf$group_by("a", maintain_order = TRUE)$agg( pl$all()$sum() )$sort( "a" ) query$to_dot() |> cat() # You could print the graph by using DiagrammeR for example, with # query$to_dot() |> DiagrammeR::grViz().
lf = pl$LazyFrame( a = c("a", "b", "a", "b", "b", "c"), b = 1:6, c = 6:1 ) query = lf$group_by("a", maintain_order = TRUE)$agg( pl$all()$sum() )$sort( "a" ) query$to_dot() |> cat() # You could print the graph by using DiagrammeR for example, with # query$to_dot() |> DiagrammeR::grViz().
Drop duplicated rows
LazyFrame_unique(subset = NULL, ..., keep = "any", maintain_order = FALSE)
LazyFrame_unique(subset = NULL, ..., keep = "any", maintain_order = FALSE)
subset |
A character vector with the names of the column(s) to use to
identify duplicates. If |
... |
Not used. |
keep |
Which of the duplicate rows to keep:
|
maintain_order |
Keep the same order as the original data. Setting this
to |
LazyFrame
df = pl$LazyFrame( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) df$collect()$height df$unique()$collect()$height df$unique(subset = "x")$collect()$height df$unique(keep = "last") # only keep unique rows df$unique(keep = "none")
df = pl$LazyFrame( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) df$collect()$height df$unique()$collect()$height df$unique(subset = "x")$collect()$height df$unique(keep = "last") # only keep unique rows df$unique(keep = "none")
Unnest the Struct columns of a LazyFrame
LazyFrame_unnest(...)
LazyFrame_unnest(...)
... |
Names of the struct columns to unnest. This doesn't accept Expr. If nothing is provided, then all columns of datatype Struct are unnested. |
A LazyFrame where some or all columns of datatype Struct are unnested.
lf = pl$LazyFrame( a = 1:5, b = c("one", "two", "three", "four", "five"), c = 6:10 )$ select( pl$struct("b"), pl$struct(c("a", "c"))$alias("a_and_c") ) lf$collect() # by default, all struct columns are unnested lf$unnest()$collect() # we can specify specific columns to unnest lf$unnest("a_and_c")$collect()
lf = pl$LazyFrame( a = 1:5, b = c("one", "two", "three", "four", "five"), c = 6:10 )$ select( pl$struct("b"), pl$struct(c("a", "c"))$alias("a_and_c") ) lf$collect() # by default, all struct columns are unnested lf$unnest()$collect() # we can specify specific columns to unnest lf$unnest("a_and_c")$collect()
Unpivot a Frame from wide to long format
LazyFrame_unpivot( on = NULL, ..., index = NULL, variable_name = NULL, value_name = NULL )
LazyFrame_unpivot( on = NULL, ..., index = NULL, variable_name = NULL, value_name = NULL )
on |
Columns to use as value variables (the columns to unpivot). If |
... |
Not used. |
index |
Columns to use as identifier variables. |
variable_name |
Name to give to the new column containing the names of the melted columns. Defaults to "variable". |
value_name |
Name to give to the new column containing the values of
the melted columns. Defaults to |
Optionally leaves identifiers set.
This function is useful to massage a Frame into a format where one or more columns are identifier variables (id_vars), while all other columns, considered measured variables (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'.
A LazyFrame
lf = pl$LazyFrame( a = c("x", "y", "z"), b = c(1, 3, 5), c = c(2, 4, 6) ) lf$unpivot(index = "a", on = c("b", "c"))$collect()
lf = pl$LazyFrame( a = c("x", "y", "z"), b = c(1, 3, 5), c = c(2, 4, 6) ) lf$unpivot(index = "a", on = c("b", "c"))$collect()
Aggregate the columns of this LazyFrame to their variance values.
LazyFrame_var(ddof = 1)
LazyFrame_var(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A LazyFrame with one row
as_polars_lf(mtcars)$var()$collect()
as_polars_lf(mtcars)$var()$collect()
Add columns or modify existing ones with expressions. This is
the equivalent of dplyr::mutate()
as it keeps unmentioned columns (unlike
$select()
).
LazyFrame_with_columns(...)
LazyFrame_with_columns(...)
... |
Any expressions or string column name, or same wrapped in a list. If first and only element is a list, it is unwrapped as a list of args. |
A LazyFrame
as_polars_lf(iris)$with_columns( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_lf(iris)$with_columns(l_expr) as_polars_lf(iris)$with_columns( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
as_polars_lf(iris)$with_columns( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_lf(iris)$with_columns(l_expr) as_polars_lf(iris)$with_columns( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
Add columns or modify existing ones with expressions. This is
the equivalent of dplyr::mutate()
as it keeps unmentioned columns (unlike
$select()
).
This will run all expressions sequentially instead of in parallel. Use this
when the work per expression is cheap. Otherwise, $with_columns()
should be
preferred.
LazyFrame_with_columns_seq(...)
LazyFrame_with_columns_seq(...)
... |
Any expressions or string column name, or same wrapped in a list. If first and only element is a list, it is unwrapped as a list of args. |
A LazyFrame
as_polars_lf(iris)$with_columns_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_lf(iris)$with_columns_seq(l_expr) as_polars_lf(iris)$with_columns_seq( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
as_polars_lf(iris)$with_columns_seq( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) # same query l_expr = list( pl$col("Sepal.Length")$abs()$alias("abs_SL"), (pl$col("Sepal.Length") + 2)$alias("add_2_SL") ) as_polars_lf(iris)$with_columns_seq(l_expr) as_polars_lf(iris)$with_columns_seq( pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" SW_add_2 = (pl$col("Sepal.Width") + 2) )
This allows expressions to also access columns from DataFrames or LazyFrames that are not part of this one.
LazyFrame_with_context(other)
LazyFrame_with_context(other)
other |
Data/LazyFrame to have access to. This can be a list of DataFrames and LazyFrames. |
A LazyFrame
lf = pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) lf_other = pl$LazyFrame(c = c("foo", "ham")) lf$with_context(lf_other)$select( pl$col("b") + pl$col("c")$first() )$collect() # Fill nulls with the median from another lazyframe: train_lf = pl$LazyFrame( feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) ) test_lf = pl$LazyFrame( feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) ) test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) )$collect()
lf = pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) lf_other = pl$LazyFrame(c = c("foo", "ham")) lf$with_context(lf_other)$select( pl$col("b") + pl$col("c")$first() )$collect() # Fill nulls with the median from another lazyframe: train_lf = pl$LazyFrame( feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) ) test_lf = pl$LazyFrame( feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) ) test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) )$collect()
Add a new column at index 0 that counts the rows
LazyFrame_with_row_index(name, offset = NULL)
LazyFrame_with_row_index(name, offset = NULL)
name |
string name of the created column |
offset |
positive integer offset for the start of the counter |
A new LazyFrame with a counter column in front
df = as_polars_lf(mtcars) # by default, the index starts at 0 (to mimic the behavior of Python Polars) df$with_row_index("idx") # but in R, we use a 1-index df$with_row_index("idx", offset = 1)
df = as_polars_lf(mtcars) # by default, the index starts at 0 (to mimic the behavior of Python Polars) df$with_row_index("idx") # but in R, we use a 1-index df$with_row_index("idx", offset = 1)
Aggregate a LazyGroupBy
LazyGroupBy_agg(...)
LazyGroupBy_agg(...)
... |
exprs to aggregate over.
... args can also be passed wrapped in a list |
A new LazyFrame
object.
lgb = pl$DataFrame( foo = c("one", "two", "two", "one", "two"), bar = c(5, 3, 2, 4, 1) )$ lazy()$ group_by("foo") print(lgb) lgb$ agg( pl$col("bar")$sum()$name$suffix("_sum"), pl$col("bar")$mean()$alias("bar_tail_sum") )
lgb = pl$DataFrame( foo = c("one", "two", "two", "one", "two"), bar = c(5, 3, 2, 4, 1) )$ lazy()$ group_by("foo") print(lgb) lgb$ agg( pl$col("bar")$sum()$name$suffix("_sum"), pl$col("bar")$mean()$alias("bar_tail_sum") )
This class comes from <LazyFrame>$group_by()
, etc.
$columns
returns a character vector with the column names.
as_polars_lf(mtcars)$group_by("cyl")$agg( pl$col("mpg")$sum() )
as_polars_lf(mtcars)$group_by("cyl")$agg( pl$col("mpg")$sum() )
get n rows of head of group
LazyGroupBy_head(n = 1L)
LazyGroupBy_head(n = 1L)
n |
integer number of rows to get |
A new LazyFrame
object.
prints opaque groupby, not much to show
LazyGroupBy_print()
LazyGroupBy_print()
invisible self
get n tail rows of group
LazyGroupBy_tail(n = 1L)
LazyGroupBy_tail(n = 1L)
n |
integer number of rows to get |
A new LazyFrame
object.
Revert the group by operation.
LazyGroupBy_ungroup()
LazyGroupBy_ungroup()
A new LazyFrame
object.
lf = as_polars_lf(mtcars) lf lgb = lf$group_by("cyl") lgb lgb$ungroup()
lf = as_polars_lf(mtcars) lf lgb = lf$group_by("cyl") lgb lgb$ungroup()
Get the length
## S3 method for class 'RPolarsDataFrame' length(x) ## S3 method for class 'RPolarsLazyFrame' length(x) ## S3 method for class 'RPolarsSeries' length(x)
## S3 method for class 'RPolarsDataFrame' length(x) ## S3 method for class 'RPolarsLazyFrame' length(x) ## S3 method for class 'RPolarsSeries' length(x)
x |
Compute the maximum value
## S3 method for class 'RPolarsDataFrame' max(x, ...) ## S3 method for class 'RPolarsLazyFrame' max(x, ...) ## S3 method for class 'RPolarsSeries' max(x, ...)
## S3 method for class 'RPolarsDataFrame' max(x, ...) ## S3 method for class 'RPolarsLazyFrame' max(x, ...) ## S3 method for class 'RPolarsSeries' max(x, ...)
x |
|
... |
Not used. |
Compute the mean
## S3 method for class 'RPolarsDataFrame' mean(x, ...) ## S3 method for class 'RPolarsLazyFrame' mean(x, ...) ## S3 method for class 'RPolarsSeries' mean(x, ...)
## S3 method for class 'RPolarsDataFrame' mean(x, ...) ## S3 method for class 'RPolarsLazyFrame' mean(x, ...) ## S3 method for class 'RPolarsSeries' mean(x, ...)
x |
|
... |
Not used. |
Compute the median
## S3 method for class 'RPolarsDataFrame' median(x, ...) ## S3 method for class 'RPolarsLazyFrame' median(x, ...) ## S3 method for class 'RPolarsSeries' median(x, ...)
## S3 method for class 'RPolarsDataFrame' median(x, ...) ## S3 method for class 'RPolarsLazyFrame' median(x, ...) ## S3 method for class 'RPolarsSeries' median(x, ...)
x |
|
... |
Not used. |
Compute the minimum value
## S3 method for class 'RPolarsDataFrame' min(x, ...) ## S3 method for class 'RPolarsLazyFrame' min(x, ...) ## S3 method for class 'RPolarsSeries' min(x, ...)
## S3 method for class 'RPolarsDataFrame' min(x, ...) ## S3 method for class 'RPolarsLazyFrame' min(x, ...) ## S3 method for class 'RPolarsSeries' min(x, ...)
x |
|
... |
Not used. |
Drop missing values
## S3 method for class 'RPolarsLazyFrame' na.omit(object, subset = NULL, ...) ## S3 method for class 'RPolarsDataFrame' na.omit(object, subset = NULL, ...)
## S3 method for class 'RPolarsLazyFrame' na.omit(object, subset = NULL, ...) ## S3 method for class 'RPolarsDataFrame' na.omit(object, subset = NULL, ...)
object |
|
subset |
Character vector of column names to drop missing values from. |
... |
Not used. |
df = as_polars_df(data.frame(a = c(NA, 2:10), b = c(1, NA, 3:10)))$lazy() na.omit(df) na.omit(df, subset = "a") na.omit(df, subset = c("a", "b"))
df = as_polars_df(data.frame(a = c(NA, 2:10), b = c(1, NA, 3:10)))$lazy() na.omit(df) na.omit(df, subset = "a") na.omit(df, subset = c("a", "b"))
Get the column names
## S3 method for class 'RPolarsDataFrame' names(x) ## S3 method for class 'RPolarsLazyFrame' names(x) ## S3 method for class 'RPolarsLazyGroupBy' names(x) ## S3 method for class 'RPolarsGroupBy' names(x)
## S3 method for class 'RPolarsDataFrame' names(x) ## S3 method for class 'RPolarsLazyFrame' names(x) ## S3 method for class 'RPolarsLazyGroupBy' names(x) ## S3 method for class 'RPolarsGroupBy' names(x)
x |
Not to mix up with Expr_object$all()
which is a 'reduce Boolean columns by
AND' method.
pl_all(name = NULL)
pl_all(name = NULL)
name |
Character vector indicating on which columns the AND operation should be applied. |
Boolean literal
test = pl$DataFrame(col_1 = c(TRUE, TRUE), col_2 = c(TRUE, FALSE)) test # here, the first `$all()` selects all columns, and the second `$all()` checks # whether all values are true in each column test$with_columns(pl$all()$all())
test = pl$DataFrame(col_1 = c(TRUE, TRUE), col_2 = c(TRUE, FALSE)) test # here, the first `$all()` selects all columns, and the second `$all()` checks # whether all values are true in each column test$with_columns(pl$all()$all())
Apply the AND logical rowwise
pl_all_horizontal(...)
pl_all_horizontal(...)
... |
Columns to combine with a rowwise logical AND. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = c(TRUE, FALSE, NA, NA), b = c(TRUE, FALSE, NA, NA), c = c(TRUE, FALSE, NA, TRUE) ) df df$with_columns( pl$all_horizontal("a", "b", "c")$alias("all") ) # drop rows that have at least one missing value # == keep rows that only have non-missing values df$filter( pl$all_horizontal(pl$all()$is_not_null()) )
df = pl$DataFrame( a = c(TRUE, FALSE, NA, NA), b = c(TRUE, FALSE, NA, NA), c = c(TRUE, FALSE, NA, TRUE) ) df df$with_columns( pl$all_horizontal("a", "b", "c")$alias("all") ) # drop rows that have at least one missing value # == keep rows that only have non-missing values df$filter( pl$all_horizontal(pl$all()$is_not_null()) )
Apply the OR logical rowwise
pl_any_horizontal(...)
pl_any_horizontal(...)
... |
Columns to combine with a rowwise logical OR. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = c(FALSE, FALSE, NA, NA), b = c(TRUE, FALSE, NA, NA), c = c(TRUE, FALSE, NA, TRUE) ) df df$with_columns( pl$any_horizontal("a", "b", "c")$alias("any") ) # drop rows that only have missing values == keep rows that have at least one # non-missing value df$filter( pl$any_horizontal(pl$all()$is_not_null()) )
df = pl$DataFrame( a = c(FALSE, FALSE, NA, NA), b = c(TRUE, FALSE, NA, NA), c = c(TRUE, FALSE, NA, TRUE) ) df df$with_columns( pl$any_horizontal("a", "b", "c")$alias("any") ) # drop rows that only have missing values == keep rows that have at least one # non-missing value df$filter( pl$any_horizontal(pl$all()$is_not_null()) )
This function is syntactic sugar for pl$col(...)$approx_n_unique()
,
and uses the HyperLogLog++ algorithm for cardinality estimation.
pl_approx_n_unique(...)
pl_approx_n_unique(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( a = c(1, 8, 1), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$approx_n_unique("a")) df$select(pl$approx_n_unique("b", "c"))
df = pl$DataFrame( a = c(1, 8, 1), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$approx_n_unique("a")) df$select(pl$approx_n_unique("b", "c"))
Return the row indices that would sort the columns
pl_arg_sort_by( ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, maintain_order = FALSE )
pl_arg_sort_by( ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, maintain_order = FALSE )
... |
Column(s) to arg sort by. Can be Expr(s) or something coercible to Expr(s). Strings are parsed as column names. |
descending |
Logical. Sort in descending order (default is |
nulls_last |
A logical or logical vector of the same length as the number of columns.
If |
multithreaded |
A logical. If |
maintain_order |
Whether the order should be maintained if elements are
equal. If |
Expr
$arg_sort() to find the row indices that would sort an Expr.
df = pl$DataFrame( a = c(0, 1, 1, 0), b = c(3, 2, 3, 2) ) df$with_columns( arg_sort_a = pl$arg_sort_by("a"), arg_sort_ab = pl$arg_sort_by(c("a", "b"), descending = TRUE) ) # we can also pass Expr df$with_columns( arg_sort_a = pl$arg_sort_by(pl$col("a") * -1) )
df = pl$DataFrame( a = c(0, 1, 1, 0), b = c(3, 2, 3, 2) ) df$with_columns( arg_sort_a = pl$arg_sort_by("a"), arg_sort_ab = pl$arg_sort_by(c("a", "b"), descending = TRUE) ) # we can also pass Expr df$with_columns( arg_sort_a = pl$arg_sort_by(pl$col("a") * -1) )
Return indices that match a condition
pl_arg_where(condition)
pl_arg_where(condition)
condition |
An Expr that gives a boolean. |
Expr
df = pl$DataFrame(a = c(1, 2, 3, 4, 5)) df$select( pl$arg_where(pl$col("a") %% 2 == 0) )
df = pl$DataFrame(a = c(1, 2, 3, 4, 5)) df$select( pl$arg_where(pl$col("a") %% 2 == 0) )
Folds the expressions from left to right, keeping the first non-null value.
pl_coalesce(...)
pl_coalesce(...)
... |
is a: If one arg:
If several args, then wrapped in a list and handled as above. |
Expr
df = pl$DataFrame( a = NA_real_, b = c(1L, 4L, NA_real_, NA_real_), c = c(2:4, NA_real_) ) # use coalesce to get first non Null value for each row, otherwise insert 99.9 df$with_columns( pl$coalesce("a", "b", "c", 99.9)$alias("d") )
df = pl$DataFrame( a = NA_real_, b = c(1L, 4L, NA_real_, NA_real_), c = c(2:4, NA_real_) ) # use coalesce to get first non Null value for each row, otherwise insert 99.9 df$with_columns( pl$coalesce("a", "b", "c", 99.9)$alias("d") )
Create an expression representing column(s) in a dataframe
pl_col(...)
pl_col(...)
... |
One of the following:
|
Expr of a column or columns
# a single column by a character pl$col("foo") # multiple columns by characters pl$col("foo", "bar") # multiple columns by RPolarsDataTypes pl$col(pl$Float64, pl$String) # Single `"*"` is converted to a wildcard expression pl$col("*") # multiple character vectors and a list of RPolarsDataTypes are also allowed pl$col(c("foo", "bar"), "baz") pl$col("foo", c("bar", "baz")) pl$col(list(pl$Float64, pl$String)) # there are some special notations for selecting columns df = pl$DataFrame(foo = 1:3, bar = 4:6, baz = 7:9) ## select all columns with a wildcard `"*"` df$select(pl$col("*")) ## select multiple columns by a regular expression ## starts with `^` and ends with `$` df$select(pl$col(c("^ba.*$")))
# a single column by a character pl$col("foo") # multiple columns by characters pl$col("foo", "bar") # multiple columns by RPolarsDataTypes pl$col(pl$Float64, pl$String) # Single `"*"` is converted to a wildcard expression pl$col("*") # multiple character vectors and a list of RPolarsDataTypes are also allowed pl$col(c("foo", "bar"), "baz") pl$col("foo", c("bar", "baz")) pl$col(list(pl$Float64, pl$String)) # there are some special notations for selecting columns df = pl$DataFrame(foo = 1:3, bar = 4:6, baz = 7:9) ## select all columns with a wildcard `"*"` df$select(pl$col("*")) ## select multiple columns by a regular expression ## starts with `^` and ends with `$` df$select(pl$col(c("^ba.*$")))
Concat polars objects
pl_concat( ..., how = c("vertical", "vertical_relaxed", "horizontal", "diagonal", "diagonal_relaxed"), rechunk = FALSE, parallel = TRUE )
pl_concat( ..., how = c("vertical", "vertical_relaxed", "horizontal", "diagonal", "diagonal_relaxed"), rechunk = FALSE, parallel = TRUE )
... |
Either individual unpacked args or args wrapped in list(). Args can
be eager as DataFrame, Series and R vectors, or lazy as LazyFrame and Expr.
The first element determines the output of |
how |
Bind direction. Can be "vertical" (like |
rechunk |
Perform a rechunk at last. |
parallel |
Only used for LazyFrames. If |
Categorical columns/Series must have been constructed while global string
cache enabled. See pl$enable_string_cache()
.
DataFrame, Series, LazyFrame or Expr
# vertical l_ver = lapply(1:10, function(i) { l_internal = list( a = 1:5, b = letters[1:5] ) pl$DataFrame(l_internal) }) pl$concat(l_ver, how = "vertical") # horizontal l_hor = lapply(1:10, function(i) { l_internal = list( 1:5, letters[1:5] ) names(l_internal) = paste0(c("a", "b"), i) pl$DataFrame(l_internal) }) pl$concat(l_hor, how = "horizontal") # diagonal pl$concat(l_hor, how = "diagonal") # if two columns don't share the same type, concat() will error unless we use # `how = "vertical_relaxed"`: test = pl$DataFrame(x = 1L) # i32 test2 = pl$DataFrame(x = 1.0) # f64 pl$concat(test, test2, how = "vertical_relaxed")
# vertical l_ver = lapply(1:10, function(i) { l_internal = list( a = 1:5, b = letters[1:5] ) pl$DataFrame(l_internal) }) pl$concat(l_ver, how = "vertical") # horizontal l_hor = lapply(1:10, function(i) { l_internal = list( 1:5, letters[1:5] ) names(l_internal) = paste0(c("a", "b"), i) pl$DataFrame(l_internal) }) pl$concat(l_hor, how = "horizontal") # diagonal pl$concat(l_hor, how = "diagonal") # if two columns don't share the same type, concat() will error unless we use # `how = "vertical_relaxed"`: test = pl$DataFrame(x = 1L) # i32 test2 = pl$DataFrame(x = 1.0) # f64 pl$concat(test, test2, how = "vertical_relaxed")
Horizontally concatenate columns into a single list column.
pl_concat_list(exprs)
pl_concat_list(exprs)
exprs |
list of Into |
Expr
# Create lagged columns and collect them into a list. This mimics a rolling window. df = pl$DataFrame(A = c(1, 2, 9, 2, 13)) df$with_columns(lapply( 0:2, \(i) pl$col("A")$shift(i)$alias(paste0("A_lag_", i)) ))$select( pl$concat_list(lapply(2:0, \(i) pl$col(paste0("A_lag_", i))))$alias( "A_rolling" ) ) # concat an Expr, a Series and an R object pl$concat_list(list( pl$lit(1:5), as_polars_series(5:1), rep(0L, 5) ))$alias("alice")$to_series()
# Create lagged columns and collect them into a list. This mimics a rolling window. df = pl$DataFrame(A = c(1, 2, 9, 2, 13)) df$with_columns(lapply( 0:2, \(i) pl$col("A")$shift(i)$alias(paste0("A_lag_", i)) ))$select( pl$concat_list(lapply(2:0, \(i) pl$col(paste0("A_lag_", i))))$alias( "A_rolling" ) ) # concat an Expr, a Series and an R object pl$concat_list(list( pl$lit(1:5), as_polars_series(5:1), rep(0L, 5) ))$alias("alice")$to_series()
Horizontally concatenate columns into a single string column
pl_concat_str(..., separator = "", ignore_nulls = FALSE)
pl_concat_str(..., separator = "", ignore_nulls = FALSE)
... |
Columns to concatenate into a single string column. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. Non-String columns are cast to String |
separator |
String that will be used to separate the values of each column. |
ignore_nulls |
If |
Expr
df = pl$DataFrame( a = 1:3, b = c("dogs", "cats", NA), c = c("play", "swim", "walk") ) df$with_columns( pl$concat_str( pl$col("a") * 2L, "b", "c", pl$lit("!"), separator = " " )$alias("full_sentence") ) df$with_columns( pl$concat_str( pl$col("a") * 2L, "b", "c", pl$lit("!"), separator = " ", ignore_nulls = TRUE )$alias("full_sentence") )
df = pl$DataFrame( a = 1:3, b = c("dogs", "cats", NA), c = c("play", "swim", "walk") ) df$with_columns( pl$concat_str( pl$col("a") * 2L, "b", "c", pl$lit("!"), separator = " " )$alias("full_sentence") ) df$with_columns( pl$concat_str( pl$col("a") * 2L, "b", "c", pl$lit("!"), separator = " ", ignore_nulls = TRUE )$alias("full_sentence") )
Calculates the correlation between two columns
pl_corr(a, b, method = "pearson", propagate_nans = FALSE)
pl_corr(a, b, method = "pearson", propagate_nans = FALSE)
a |
One column name or Expr or anything convertible Into |
b |
Another column name or Expr or anything convertible Into |
method |
str One of 'pearson' or 'spearman' |
propagate_nans |
bool Used only when calculating the spearman rank correlation.
If |
Expr for the computed correlation
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$corr("a", "b", method = "spearman"))$collect()
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$corr("a", "b", method = "spearman"))$collect()
This function is syntactic sugar for pl$col(...)$count()
.
pl_count(...)
pl_count(...)
... |
Characters indicating the column names, passed to |
Calling this function without any arguments returns the number of rows in the context.
This way of using the function is deprecated.
Please use pl$len()
instead.
Expression of data type UInt32
df = pl$DataFrame( a = c(1, 2, NA), b = c(3, NA, NA), c = c("foo", "bar", "foo") ) df$select(pl$count("a")) df$select(pl$count(c("b", "c")))
df = pl$DataFrame( a = c(1, 2, NA), b = c(3, NA, NA), c = c("foo", "bar", "foo") ) df$select(pl$count("a")) df$select(pl$count(c("b", "c")))
Calculates the covariance between two columns / expressions.
pl_cov(a, b, ddof = 1)
pl_cov(a, b, ddof = 1)
a |
One column name or Expr or anything convertible Into |
b |
Another column name or Expr or anything convertible Into |
ddof |
integer Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
Expr for the computed covariance
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$cov("a", "b"))$collect() pl$cov(c(1, 8, 3), c(4, 5, 2))$to_r()
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$cov("a", "b"))$collect() pl$cov(c(1, 8, 3), c(4, 5, 2))$to_r()
Create a new polars DataFrame
pl_DataFrame(..., make_names_unique = TRUE, schema = NULL)
pl_DataFrame(..., make_names_unique = TRUE, schema = NULL)
... |
One of the following:
Columns are named after the named arguments, or alternatively after the names of the Series, or are given a placeholder name. |
make_names_unique |
If |
schema |
A named list that will be used to convert a variable to a
specific DataType. Same as |
pl$DataFrame( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1:1, 1:2, 1:3, 1:4, 1:5) ) # directly from vectors
pl$DataFrame( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1:1, 1:2, 1:3, 1:4, 1:5) ) # directly from vectors
Create a Date expression
pl_date(year, month, day)
pl_date(year, month, day)
year |
An Expr or something coercible to an Expr, that must return an integer. Strings are parsed as column names. Floats are cast to integers. |
month |
An Expr or something coercible to an Expr, that must return an integer between 1 and 12. Strings are parsed as column names. Floats are cast to integers. |
day |
An Expr or something coercible to an Expr, that must return an integer between 1 and 31. Strings are parsed as column names. Floats are cast to integers. |
An Expr of type Date
df = pl$DataFrame(year = 2019:2021, month = 9:11, day = 10:12) df$with_columns( date_from_cols = pl$date("year", "month", "day"), date_from_lit = pl$date(2020, 3, 5), date_from_mix = pl$date("year", 3, 5) ) # floats are coerced to integers df$with_columns( date_floats = pl$date(2018.8, 5.3, 1) ) # if date can't be constructed, it returns null df$with_columns( date_floats = pl$date(pl$lit("abc"), -2, 1) )
df = pl$DataFrame(year = 2019:2021, month = 9:11, day = 10:12) df$with_columns( date_from_cols = pl$date("year", "month", "day"), date_from_lit = pl$date(2020, 3, 5), date_from_mix = pl$date("year", 3, 5) ) # floats are coerced to integers df$with_columns( date_floats = pl$date(2018.8, 5.3, 1) ) # if date can't be constructed, it returns null df$with_columns( date_floats = pl$date(pl$lit("abc"), -2, 1) )
If both start
and end
are passed as the Date types (not Datetime), and
the interval
granularity is no finer than "1d"
, the returned range is
also of type Date. All other permutations return a Datetime.
pl_date_range(start, end, interval = "1d", ..., closed = "both")
pl_date_range(start, end, interval = "1d", ..., closed = "both")
start |
Lower bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
end |
Upper bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
interval |
Interval of the range periods, specified as a difftime object
or using the Polars duration string language.
See the |
... |
Ignored. |
closed |
Define which sides of the range are closed (inclusive).
One of the following: |
An Expr of data type Date or Datetime
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
pl$date_ranges()
to create a simple Series of data
type list(Date) based on column values.
# Using Polars duration string to specify the interval: pl$date_range(as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo") |> as_polars_series("date") # Using `difftime` object to specify the interval: pl$date_range( as.Date("1985-01-01"), as.Date("1985-01-10"), as.difftime(2, units = "days") ) |> as_polars_series("date")
# Using Polars duration string to specify the interval: pl$date_range(as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo") |> as_polars_series("date") # Using `difftime` object to specify the interval: pl$date_range( as.Date("1985-01-01"), as.Date("1985-01-10"), as.difftime(2, units = "days") ) |> as_polars_series("date")
If both start
and end
are passed as the Date types (not Datetime), and
the interval
granularity is no finer than "1d"
, the returned range is
also of type Date. All other permutations return a Datetime.
pl_date_ranges(start, end, interval = "1d", ..., closed = "both")
pl_date_ranges(start, end, interval = "1d", ..., closed = "both")
start |
Lower bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
end |
Upper bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
interval |
Interval of the range periods, specified as a difftime object
or using the Polars duration string language.
See the |
... |
Ignored. |
closed |
Define which sides of the range are closed (inclusive).
One of the following: |
An Expr of data type List(Date) or List(Datetime)
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
pl$date_range()
to create a simple Series of data
type Date.
df = pl$DataFrame( start = as.Date(c("2022-01-01", "2022-01-02", NA)), end = as.Date("2022-01-03") ) df$with_columns( date_range = pl$date_ranges("start", "end"), date_range_cr = pl$date_ranges("start", "end", closed = "right") ) # provide a custom "end" value df$with_columns( date_range_lit = pl$date_ranges("start", pl$lit(as.Date("2022-01-02"))) )
df = pl$DataFrame( start = as.Date(c("2022-01-01", "2022-01-02", NA)), end = as.Date("2022-01-03") ) df$with_columns( date_range = pl$date_ranges("start", "end"), date_range_cr = pl$date_ranges("start", "end", closed = "right") ) # provide a custom "end" value df$with_columns( date_range_lit = pl$date_ranges("start", pl$lit(as.Date("2022-01-02"))) )
Create a Datetime expression
pl_datetime( year, month, day, hour = NULL, minute = NULL, second = NULL, microsecond = NULL, ..., time_unit = "us", time_zone = NULL, ambiguous = "raise" )
pl_datetime( year, month, day, hour = NULL, minute = NULL, second = NULL, microsecond = NULL, ..., time_unit = "us", time_zone = NULL, ambiguous = "raise" )
year |
An Expr or something coercible to an Expr, that must return an integer. Strings are parsed as column names. Floats are cast to integers. |
month |
An Expr or something coercible to an Expr, that must return an integer between 1 and 12. Strings are parsed as column names. Floats are cast to integers. |
day |
An Expr or something coercible to an Expr, that must return an integer between 1 and 31. Strings are parsed as column names. Floats are cast to integers. |
hour |
An Expr or something coercible to an Expr, that must return an integer between 0 and 23. Strings are parsed as column names. Floats are cast to integers. |
minute |
An Expr or something coercible to an Expr, that must return an integer between 0 and 59. Strings are parsed as column names. Floats are cast to integers. |
second |
An Expr or something coercible to an Expr, that must return an integer between 0 and 59. Strings are parsed as column names. Floats are cast to integers. |
microsecond |
An Expr or something coercible to an Expr, that must return an integer between 0 and 999,999. Strings are parsed as column names. Floats are cast to integers. |
... |
Not used. |
time_unit |
Unit of time. One of |
time_zone |
Time zone string, as defined in |
ambiguous |
Determine how to deal with ambiguous datetimes:
|
An Expr of type Datetime
df = pl$DataFrame( year = 2019:2021, month = 9:11, day = 10:12, min = 55:57 ) df$with_columns( dt_from_cols = pl$datetime("year", "month", "day", minute = "min"), dt_from_lit = pl$datetime(2020, 3, 5, hour = 20:22), dt_from_mix = pl$datetime("year", 3, 5, second = 1) ) # floats are coerced to integers df$with_columns( dt_floats = pl$datetime(2018.8, 5.3, 1, second = 2.1) ) # if datetime can't be constructed, it returns null df$with_columns( dt_floats = pl$datetime(pl$lit("abc"), -2, 1) ) # can control the time_unit df$with_columns( dt_from_cols = pl$datetime("year", "month", "day", minute = "min", time_unit = "ms") )
df = pl$DataFrame( year = 2019:2021, month = 9:11, day = 10:12, min = 55:57 ) df$with_columns( dt_from_cols = pl$datetime("year", "month", "day", minute = "min"), dt_from_lit = pl$datetime(2020, 3, 5, hour = 20:22), dt_from_mix = pl$datetime("year", 3, 5, second = 1) ) # floats are coerced to integers df$with_columns( dt_floats = pl$datetime(2018.8, 5.3, 1, second = 2.1) ) # if datetime can't be constructed, it returns null df$with_columns( dt_floats = pl$datetime(pl$lit("abc"), -2, 1) ) # can control the time_unit df$with_columns( dt_from_cols = pl$datetime("year", "month", "day", minute = "min", time_unit = "ms") )
Generate a datetime range
pl_datetime_range( start, end, interval = "1d", ..., closed = "both", time_unit = NULL, time_zone = NULL )
pl_datetime_range( start, end, interval = "1d", ..., closed = "both", time_unit = NULL, time_zone = NULL )
start |
Lower bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
end |
Upper bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
interval |
Interval of the range periods, specified as a difftime object
or using the Polars duration string language.
See the |
... |
Ignored. |
closed |
Define which sides of the range are closed (inclusive).
One of the following: |
time_unit |
Time unit of the resulting Datetime
data type. One of |
time_zone |
Time zone of the resulting Datetime data type. |
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
pl$datetime_ranges()
to create a simple
Series of data type list(Datetime) based on column values.
# Using Polars duration string to specify the interval: pl$datetime_range(as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo") |> as_polars_series("datetime") # Using `difftime` object to specify the interval: pl$datetime_range( as.Date("1985-01-01"), as.Date("1985-01-10"), as.difftime(1, units = "days") + as.difftime(12, units = "hours") ) |> as_polars_series("datetime") # Specifying a time zone: pl$datetime_range( as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo", time_zone = "America/New_York" ) |> as_polars_series("datetime")
# Using Polars duration string to specify the interval: pl$datetime_range(as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo") |> as_polars_series("datetime") # Using `difftime` object to specify the interval: pl$datetime_range( as.Date("1985-01-01"), as.Date("1985-01-10"), as.difftime(1, units = "days") + as.difftime(12, units = "hours") ) |> as_polars_series("datetime") # Specifying a time zone: pl$datetime_range( as.Date("2022-01-01"), as.Date("2022-03-01"), "1mo", time_zone = "America/New_York" ) |> as_polars_series("datetime")
Generate a list containing a datetime range
pl_datetime_ranges( start, end, interval = "1d", ..., closed = "both", time_unit = NULL, time_zone = NULL )
pl_datetime_ranges( start, end, interval = "1d", ..., closed = "both", time_unit = NULL, time_zone = NULL )
start |
Lower bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
end |
Upper bound of the date range. Something that can be coerced to a Date or a Datetime expression. See examples for details. |
interval |
Interval of the range periods, specified as a difftime object
or using the Polars duration string language.
See the |
... |
Ignored. |
closed |
Define which sides of the range are closed (inclusive).
One of the following: |
time_unit |
Time unit of the resulting Datetime
data type. One of |
time_zone |
Time zone of the resulting Datetime data type. |
An Expr of data type list(Datetime)
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
pl$datetime_range()
to create a simple Series
of data type Datetime.
df = pl$DataFrame( start = as.POSIXct(c("2022-01-01 10:00", "2022-01-01 11:00", NA)), end = as.POSIXct("2022-01-01 12:00") ) df$with_columns( dt_range = pl$datetime_ranges("start", "end", interval = "1h"), dt_range_cr = pl$datetime_ranges("start", "end", closed = "right", interval = "1h") ) # provide a custom "end" value df$with_columns( dt_range_lit = pl$datetime_ranges( "start", pl$lit(as.POSIXct("2022-01-01 11:00")), interval = "1h" ) )
df = pl$DataFrame( start = as.POSIXct(c("2022-01-01 10:00", "2022-01-01 11:00", NA)), end = as.POSIXct("2022-01-01 12:00") ) df$with_columns( dt_range = pl$datetime_ranges("start", "end", interval = "1h"), dt_range_cr = pl$datetime_ranges("start", "end", closed = "right", interval = "1h") ) # provide a custom "end" value df$with_columns( dt_range_lit = pl$datetime_ranges( "start", pl$lit(as.POSIXct("2022-01-01 11:00")), interval = "1h" ) )
Read a logical plan from a JSON file to construct a LazyFrame
pl_deserialize_lf(json)
pl_deserialize_lf(json)
json |
A character of the JSON representation of the logical plan. |
lf = pl$LazyFrame(a = 1:3)$sum() json = lf$serialize() pl$deserialize_lf(json)$collect()
lf = pl$LazyFrame(a = 1:3)$sum() json = lf$serialize() pl$deserialize_lf(json)$collect()
Some functions (e.g. joins) can be applied to Categorical series only
if the global string cache is enabled. This function disables
the string cache. In general, you should use pl$with_string_cache()
instead.
pl_disable_string_cache()
pl_disable_string_cache()
This doesn't return any value.
pl$using_string_cache
pl$enable_string_cache
pl$with_string_cache
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
DataType
any polars type (ported so far)
not applicable
print(ls(pl$dtypes)) pl$dtypes$Float64 pl$dtypes$String pl$List(pl$List(pl$UInt64)) pl$Struct(pl$Field("CityNames", pl$String)) # The function changes type from Int32 to String # Specifying the output DataType: String solves the problem as_polars_series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$String)
print(ls(pl$dtypes)) pl$dtypes$Float64 pl$dtypes$String pl$List(pl$List(pl$UInt64)) pl$Struct(pl$Field("CityNames", pl$String)) # The function changes type from Int32 to String # Specifying the output DataType: String solves the problem as_polars_series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$String)
Create polars Duration from distinct time components
pl_duration( ..., weeks = NULL, days = NULL, hours = NULL, minutes = NULL, seconds = NULL, milliseconds = NULL, microseconds = NULL, nanoseconds = NULL, time_unit = "us" )
pl_duration( ..., weeks = NULL, days = NULL, hours = NULL, minutes = NULL, seconds = NULL, milliseconds = NULL, microseconds = NULL, nanoseconds = NULL, time_unit = "us" )
... |
Not used. |
weeks |
Number of weeks to add. Expr or something coercible to an Expr.
Strings are parsed as column names. Same thing for argument |
days |
Number of days to add. |
hours |
Number of hours to add. |
minutes |
Number of minutes to add. |
seconds |
Number of seconds to add. |
milliseconds |
Number of milliseconds to add. |
microseconds |
Number of microseconds to add. |
nanoseconds |
Number of nanoseconds to add. |
time_unit |
Time unit of the resulting expression. |
A duration represents a fixed amount of time. For example,
pl$duration(days = 1)
means "exactly 24 hours". By contrast,
Expr$dt$offset_by('1d')
means "1 calendar day", which could sometimes be 23
hours or 25 hours depending on Daylight Savings Time. For non-fixed durations
such as "calendar month" or "calendar day", please use Expr$dt$offset_by()
instead.
Expr
test = pl$DataFrame( dt = c( "2022-01-01 00:00:00", "2022-01-02 00:00:00" ), add = 1:2 )$with_columns( pl$col("dt")$str$strptime(pl$Datetime("us"), format = NULL) ) test$with_columns( (pl$col("dt") + pl$duration(weeks = "add"))$alias("add_weeks"), (pl$col("dt") + pl$duration(days = "add"))$alias("add_days"), (pl$col("dt") + pl$duration(seconds = "add"))$alias("add_seconds"), (pl$col("dt") + pl$duration(milliseconds = "add"))$alias("add_millis"), (pl$col("dt") + pl$duration(hours = "add"))$alias("add_hours") ) # we can also pass an Expr test$with_columns( (pl$col("dt") + pl$duration(weeks = pl$col("add") + 1))$alias("add_weeks"), (pl$col("dt") + pl$duration(days = pl$col("add") + 1))$alias("add_days"), (pl$col("dt") + pl$duration(seconds = pl$col("add") + 1))$alias("add_seconds"), (pl$col("dt") + pl$duration(milliseconds = pl$col("add") + 1))$alias("add_millis"), (pl$col("dt") + pl$duration(hours = pl$col("add") + 1))$alias("add_hours") )
test = pl$DataFrame( dt = c( "2022-01-01 00:00:00", "2022-01-02 00:00:00" ), add = 1:2 )$with_columns( pl$col("dt")$str$strptime(pl$Datetime("us"), format = NULL) ) test$with_columns( (pl$col("dt") + pl$duration(weeks = "add"))$alias("add_weeks"), (pl$col("dt") + pl$duration(days = "add"))$alias("add_days"), (pl$col("dt") + pl$duration(seconds = "add"))$alias("add_seconds"), (pl$col("dt") + pl$duration(milliseconds = "add"))$alias("add_millis"), (pl$col("dt") + pl$duration(hours = "add"))$alias("add_hours") ) # we can also pass an Expr test$with_columns( (pl$col("dt") + pl$duration(weeks = pl$col("add") + 1))$alias("add_weeks"), (pl$col("dt") + pl$duration(days = pl$col("add") + 1))$alias("add_days"), (pl$col("dt") + pl$duration(seconds = pl$col("add") + 1))$alias("add_seconds"), (pl$col("dt") + pl$duration(milliseconds = pl$col("add") + 1))$alias("add_millis"), (pl$col("dt") + pl$duration(hours = pl$col("add") + 1))$alias("add_hours") )
Alias for an element being evaluated in an eval
expression.
pl_element()
pl_element()
Expr
pl$lit(1:5)$cumulative_eval(pl$element()$first() - pl$element()$last()**2)$to_r()
pl$lit(1:5)$cumulative_eval(pl$element()$first() - pl$element()$last()**2)$to_r()
Some functions (e.g. joins) can be applied to Categorical series only
if the global string cache is enabled. This function enables
the string cache. In general, you should use pl$with_string_cache()
instead.
pl_enable_string_cache()
pl_enable_string_cache()
This doesn't return any value.
pl$using_string_cache
pl$disable_string_cache
pl$with_string_cache
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
This is syntactic sugar that should mostly be used in
$struct$with_fields()
. pl$field("x")
is
equivalent to pl$col("my_struct")$struct$field("x")
.
pl_field(name)
pl_field(name)
name |
Name of the field to select. |
An Expr with the datatype from the selected field.
df = pl$DataFrame(x = c(1, 4, 9), y = c(4, 9, 16), multiply = c(10, 2, 3))$ with_columns(coords = pl$struct(c("x", "y")))$ select("coords", "multiply") df df = df$with_columns( pl$col("coords")$struct$with_fields( pl$field("x")$sqrt(), y_mul = pl$field("y") * pl$col("multiply") ) ) df df$unnest("coords")
df = pl$DataFrame(x = c(1, 4, 9), y = c(4, 9, 16), multiply = c(10, 2, 3))$ with_columns(coords = pl$struct(c("x", "y")))$ select("coords", "multiply") df df = df$with_columns( pl$col("coords")$struct$with_fields( pl$field("x")$sqrt(), y_mul = pl$field("y") * pl$col("multiply") ) ) df df$unnest("coords")
A Field is composed of a name and a data type. Fields are used in Struct data types and Schemas to represent everything about a Series/Column except the raw values.
pl_Field(name, datatype)
pl_Field(name, datatype)
name |
Field name |
datatype |
An object of class "RPolarsRField"
containing its name and its
data type.
$datatype
returns the data type of the Field.
$datatype = <RPolarsDataType>
sets the data type of the Field.
$name
returns the name of the Field.
$name = "new_name"
sets the name of the Field.
field = pl$Field("city_names", pl$String) field field$datatype field$name # Set the new data type field$datatype = pl$Categorical() field$datatype # Set the new name field$name = "CityPoPulations" field
field = pl$Field("city_names", pl$String) field field$datatype field$name # Set the new data type field$datatype = pl$Categorical() field$datatype # Set the new name field$name = "CityPoPulations" field
This function has different behavior depending on arguments:
Missing -> Takes first column of a context.
Character vectors -> Syntactic sugar for pl$col(...)$first()
.
pl_first(...)
pl_first(...)
... |
Characters indicating the column names
(passed to |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$first()) df$select(pl$first("b")) df$select(pl$first(c("a", "c")))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$first()) df$select(pl$first("b")) df$select(pl$first(c("a", "c")))
This allows one to do rowwise operations, starting with an initial value
(acc
). See pl$reduce()
to do rowwise operations without this initial
value.
pl_fold(acc, lambda, exprs)
pl_fold(acc, lambda, exprs)
acc |
an Expr or Into |
lambda |
R function which takes two polars Series as input and return one. |
exprs |
Expressions to aggregate over. May also be a wildcard expression. |
An expression that will be applied rowwise
df = as_polars_df(mtcars) # Make the row-wise sum of all columns df$with_columns( pl$fold( acc = pl$lit(0), lambda = \(acc, x) acc + x, exprs = pl$col("*") )$alias("mpg_drat_sum_folded") )
df = as_polars_df(mtcars) # Make the row-wise sum of all columns df$with_columns( pl$fold( acc = pl$lit(0), lambda = \(acc, x) acc + x, exprs = pl$col("*") )$alias("mpg_drat_sum_folded") )
Depending on the time_unit
provided, this function will return a different
dtype:
time_unit = "d"
returns pl$Date
time_unit = "s"
returns pl$Datetime("us")
(pl$Datetime
’s default)
time_unit = "ms"
returns pl$Datetime("ms")
time_unit = "us"
returns pl$Datetime("us")
time_unit = "ns"
returns pl$Datetime("ns")
pl_from_epoch(column, time_unit = "s")
pl_from_epoch(column, time_unit = "s")
column |
An Expr from which integers will be parsed. If this is a float
column, then the decimal part of the float will be ignored. Character values are
parsed as column names, but other literal values must be passed to
|
time_unit |
One of |
Expr as Date or Datetime depending on the
time_unit
.
# pass an integer column df = pl$DataFrame(timestamp = c(1666683077, 1666683099)) df$with_columns( timestamp_to_datetime = pl$from_epoch(pl$col("timestamp"), time_unit = "s") ) # pass a literal pl$from_epoch(pl$lit(c(1666683077, 1666683099)), time_unit = "s")$to_series() # use different time_unit df = pl$DataFrame(timestamp = c(12345, 12346)) df$with_columns( timestamp_to_date = pl$from_epoch(pl$col("timestamp"), time_unit = "d") )
# pass an integer column df = pl$DataFrame(timestamp = c(1666683077, 1666683099)) df$with_columns( timestamp_to_datetime = pl$from_epoch(pl$col("timestamp"), time_unit = "s") ) # pass a literal pl$from_epoch(pl$lit(c(1666683077, 1666683099)), time_unit = "s")$to_series() # use different time_unit df = pl$DataFrame(timestamp = c(12345, 12346)) df$with_columns( timestamp_to_date = pl$from_epoch(pl$col("timestamp"), time_unit = "d") )
n
rows. This function is syntactic sugar for pl$col(...)$head(n)
.
pl_head(..., n = 10)
pl_head(..., n = 10)
... |
Characters indicating the column names, passed to |
n |
Number of rows to return. |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$head("a")) df$select(pl$head("a", "b", n = 2))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$head("a")) df$select(pl$head("a", "b", n = 2))
This function is syntactic sugar for pl$col(...)$implode()
.
pl_implode(...)
pl_implode(...)
... |
Characters indicating the column names, passed to |
as_polars_df(iris)$select(pl$implode("Species"))
as_polars_df(iris)$select(pl$implode("Species"))
Generate a range of integers
pl_int_range(start = 0, end = NULL, step = 1, ..., dtype = pl$Int64)
pl_int_range(start = 0, end = NULL, step = 1, ..., dtype = pl$Int64)
start |
Start of the range (inclusive). Defaults to 0. |
end |
End of the range (exclusive). If |
step |
Step size of the range. |
... |
Not used. |
dtype |
Data type of the range. |
An Expr with the data type specified in dtype
(default is Int64
).
pl$int_ranges()
to generate a range of integers for
each row of the input columns.
pl$int_range(0, 3) |> as_polars_series() # "end" can be omitted for shorter syntax pl$int_range(3) |> as_polars_series() # custom data type pl$int_range(3, dtype = pl$Int16) |> as_polars_series() # one can use pl$int_range() and pl$len() to create an index column df = pl$DataFrame(a = c(1, 3, 5), b = c(2, 4, 6)) df$select( index = pl$int_range(pl$len(), dtype = pl$UInt32), pl$all() )
pl$int_range(0, 3) |> as_polars_series() # "end" can be omitted for shorter syntax pl$int_range(3) |> as_polars_series() # custom data type pl$int_range(3, dtype = pl$Int16) |> as_polars_series() # one can use pl$int_range() and pl$len() to create an index column df = pl$DataFrame(a = c(1, 3, 5), b = c(2, 4, 6)) df$select( index = pl$int_range(pl$len(), dtype = pl$UInt32), pl$all() )
Generate a range of integers for each row of the input columns
pl_int_ranges(start = 0, end = NULL, step = 1, ..., dtype = pl$Int64)
pl_int_ranges(start = 0, end = NULL, step = 1, ..., dtype = pl$Int64)
start |
Start of the range (inclusive). Defaults to 0. |
end |
End of the range (exclusive). If |
step |
Step size of the range. |
... |
Not used. |
dtype |
Data type of the range. |
An Expr with the data type List(dtype
) (with Int64
as default of
dtype
).
pl$int_range()
to generate a single range of
integers.
df = pl$DataFrame(start = c(1, -1), end = c(3, 2)) df$with_columns(int_range = pl$int_ranges("start", "end")) df$with_columns(int_range = pl$int_ranges("start", "end", dtype = pl$Int16))
df = pl$DataFrame(start = c(1, -1), end = c(3, 2)) df$with_columns(int_range = pl$int_ranges("start", "end")) df$with_columns(int_range = pl$int_ranges("start", "end", dtype = pl$Int16))
Check if an object is a schema
pl_is_schema(x)
pl_is_schema(x)
x |
object to test if schema |
bool
pl$is_schema(as_polars_df(iris)$schema) pl$is_schema(list("alice", "bob"))
pl$is_schema(as_polars_df(iris)$schema) pl$is_schema(list("alice", "bob"))
This function has different behavior depending on the input type:
Missing -> Takes last column of a context.
Character vectors -> Syntactic sugar for pl$col(...)$last()
.
pl_last(...)
pl_last(...)
... |
Characters indicating the column names
(passed to |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "baz") ) df$select(pl$last()) df$select(pl$last("a")) df$select(pl$last(c("b", "c")))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "baz") ) df$select(pl$last()) df$select(pl$last("a")) df$select(pl$last(c("b", "c")))
This is simply a convenience function to create LazyFrames in a quick way.
It is a wrapper around pl$DataFrame()$lazy()
. Note that this should only
be used for making examples and quick demonstrations.
pl_LazyFrame(...)
pl_LazyFrame(...)
... |
Anything that is accepted by |
pl$LazyFrame( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1:1, 1:2, 1:3, 1:4, 1:5) ) # directly from vectors # from a list of vectors or data.frame pl$LazyFrame(list( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1L, 1:2, 1:3, 1:4, 1:5) )) # custom schema pl$LazyFrame( iris, schema = list(Sepal.Length = pl$Float32, Species = pl$String) )$collect()
pl$LazyFrame( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1:1, 1:2, 1:3, 1:4, 1:5) ) # directly from vectors # from a list of vectors or data.frame pl$LazyFrame(list( a = c(1, 2, 3, 4, 5), b = 1:5, c = letters[1:5], d = list(1L, 1:2, 1:3, 1:4, 1:5) )) # custom schema pl$LazyFrame( iris, schema = list(Sepal.Length = pl$Float32, Species = pl$String) )$collect()
This is similar to COUNT(*)
in SQL.
pl_len()
pl_len()
Expression of data type UInt32
df = pl$DataFrame( a = c(1, 2, NA), b = c(3, NA, NA), c = c("foo", "bar", "foo") ) df$select(pl$len())
df = pl$DataFrame( a = c(1, 2, NA), b = c(3, NA, NA), c = c("foo", "bar", "foo") ) df$select(pl$len())
Create a literal value
pl_lit(x)
pl_lit(x)
x |
A vector of any length |
pl$lit(NULL)
translates into a polars null
.
Expr
# values to literal, explicit `pl$lit(42)` implicit `+ 2` pl$col("some_column") / pl$lit(42) + 2 # vector to literal explicitly via Series and back again # R vector to expression and back again pl$select(pl$lit(as_polars_series(1:4)))$to_list()[[1L]] # r vector to literal and back r vector pl$lit(1:4)$to_r() # r vector to literal to dataframe pl$select(pl$lit(1:4)) # r vector to literal to Series pl$lit(1:4)$to_series() # vectors to literal implicitly (pl$lit(2) + 1:4) / 4:1
# values to literal, explicit `pl$lit(42)` implicit `+ 2` pl$col("some_column") / pl$lit(42) + 2 # vector to literal explicitly via Series and back again # R vector to expression and back again pl$select(pl$lit(as_polars_series(1:4)))$to_list()[[1L]] # r vector to literal and back r vector pl$lit(1:4)$to_r() # r vector to literal to dataframe pl$select(pl$lit(1:4)) # r vector to literal to Series pl$lit(1:4)$to_series() # vectors to literal implicitly (pl$lit(2) + 1:4) / 4:1
Syntactic sugar for pl$col(...)$max()
.
pl_max(...)
pl_max(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( num_1 = c(1, 8, 3), num_2 = c(4, 5, 2), chr_1 = c("foo", "bar", "foo") ) df$select(pl$max("num_1")) # Get the maximum value of multiple columns. df$select(pl$max(r"(^num_\d+$)")) df$select(pl$max("num_1", "num_2"))
df = pl$DataFrame( num_1 = c(1, 8, 3), num_2 = c(4, 5, 2), chr_1 = c("foo", "bar", "foo") ) df$select(pl$max("num_1")) # Get the maximum value of multiple columns. df$select(pl$max(r"(^num_\d+$)")) df$select(pl$max("num_1", "num_2"))
Get the maximum value rowwise
pl_max_horizontal(...)
pl_max_horizontal(...)
... |
Columns to compute the rowwise maximum over. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = NA_real_, b = c(2:1, NA_real_, NA_real_), c = c(1:2, NA_real_, Inf) ) df$with_columns( pl$max_horizontal("a", "b", "c", 99.9)$alias("max") )
df = pl$DataFrame( a = NA_real_, b = c(2:1, NA_real_, NA_real_), c = c(1:2, NA_real_, Inf) ) df$with_columns( pl$max_horizontal("a", "b", "c", 99.9)$alias("max") )
This function is syntactic sugar for pl$col(...)$mean()
.
pl_mean(...)
pl_mean(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$mean("a")) df$select(pl$mean("a", "b"))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$mean("a")) df$select(pl$mean("a", "b"))
Compute the mean rowwise
pl_mean_horizontal(...)
pl_mean_horizontal(...)
... |
Columns to compute the rowwise mean over. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = c(1, 8, 3, 6, 7), b = c(4, 5, NA_real_, Inf, NaN) ) df$with_columns( pl$mean_horizontal("a", "b")$alias("mean"), pl$mean_horizontal("a", "b", 5)$alias("mean_with_lit") )
df = pl$DataFrame( a = c(1, 8, 3, 6, 7), b = c(4, 5, NA_real_, Inf, NaN) ) df$with_columns( pl$mean_horizontal("a", "b")$alias("mean"), pl$mean_horizontal("a", "b", 5)$alias("mean_with_lit") )
This function is syntactic sugar for pl$col(...)$median()
.
pl_median(...)
pl_median(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$median("a")) df$select(pl$median("a", "b"))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$median("a")) df$select(pl$median("a", "b"))
Get the underlying memory address of a Rust object (via ExtPtr). Expert use only.
pl_mem_address(robj)
pl_mem_address(robj)
robj |
an R object |
Does not give meaningful answers for regular R objects.
String of mem address
pl$mem_address(pl$Series(values = 1:3))
pl$mem_address(pl$Series(values = 1:3))
Syntactic sugar for pl$col(...)$min()
.
pl_min(...)
pl_min(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( num_1 = c(1, 8, 3), num_2 = c(4, 5, 2), chr_1 = c("foo", "bar", "foo") ) df$select(pl$min("num_1")) # Get the minimum value of multiple columns. df$select(pl$min(r"(^num_\d+$)")) df$select(pl$min("num_1", "num_2"))
df = pl$DataFrame( num_1 = c(1, 8, 3), num_2 = c(4, 5, 2), chr_1 = c("foo", "bar", "foo") ) df$select(pl$min("num_1")) # Get the minimum value of multiple columns. df$select(pl$min(r"(^num_\d+$)")) df$select(pl$min("num_1", "num_2"))
Get the minimum value rowwise
pl_min_horizontal(...)
pl_min_horizontal(...)
... |
Columns to compute the rowwise minimum over. Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = NA_real_, b = c(2:1, NA_real_, NA_real_), c = c(1:2, NA_real_, -Inf) ) df$with_columns( pl$min_horizontal("a", "b", "c", 99.9)$alias("min") )
df = pl$DataFrame( a = NA_real_, b = c(2:1, NA_real_, NA_real_), c = c(1:2, NA_real_, -Inf) ) df$with_columns( pl$min_horizontal("a", "b", "c", 99.9)$alias("min") )
This function is syntactic sugar for pl$col(...)$n_unique()
.
pl_n_unique(...)
pl_n_unique(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame( a = c(1, 8, 1), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$n_unique("a")) df$select(pl$n_unique("b", "c"))
df = pl$DataFrame( a = c(1, 8, 1), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$n_unique("a")) df$select(pl$n_unique("b", "c"))
pl
-object is an environment of all public functions and class constructors.
Public functions are not exported as in a normal package, as that would cause huge namespace
collisions with base:: and other functions. All object-methods are accessed with object$method()
via the new class functions.
Having all functions in a namespace is similar to the rust- and python- polars api.
pl
pl
An object of class pl_polars_env
(inherits from environment
) of length 109.
If someone does not particularly like the letter combination pl
, they are free to
bind the environment to another variable name as simon_says = pl
or even do attach(pl)
not applicable
# how to use polars via `pl` pl$col("colname")$sum() / pl$lit(42L) # expression ~ chain-method / literal-expression # show all public functions, RPolarsDataTypes, classes and methods pl$show_all_public_functions() pl$show_all_public_methods()
# how to use polars via `pl` pl$col("colname")$sum() / pl$lit(42L) # expression ~ chain-method / literal-expression # show all public functions, RPolarsDataTypes, classes and methods pl$show_all_public_functions() pl$show_all_public_methods()
Store Time in R
pl_PTime(x, tu = c("s", "ms", "us", "ns"), format = "%H:%M:%S")
pl_PTime(x, tu = c("s", "ms", "us", "ns"), format = "%H:%M:%S")
x |
an integer or double vector of n epochs since midnight OR a char vector of char times passed to as.POSIXct converted to seconds. |
tu |
timeunit either "s","ms","us","ns" |
format |
a format string passed to as.POSIXct format via ... |
PTime should probably be replaced with package nanotime or similar.
Base R is missing an encoding of time since midnight in "s", "ms", "us" and "ns". The latter, "ns", is the standard for the polars Time type.
Use PTime to convert R doubles and integers and use them as input to polars functions which need a time.
Loosely inspired by data.table::ITime, which is i32 only. PTime must support the polars native time unit, which is nanoseconds. The R double (float64) can imitate an i64 ns with full precision within the full range of 24 hours.
PTime does not have a time zone and always prints the time as is no matter local machine time zone.
An essential difference between R and polars is that R prints POSIXct/lt without a timezone in local time, whereas polars prints Datetime without a timezone label as is (GMT). For POSIXct/lt tagged with a timezone (tzone) and Datetime with a timezone (tz), the behavior is the same and conversion is intuitive.
It appears behavior of R timezones is subject to change a bit in R 4.3.0, see polars unit test test-expr_datetime.R/"pl$date_range Date lazy/eager".
a PTime vector either double or integer, with class "PTime" and attribute "tu" being either "s","ms","us" or "ns"
# make PTime in all time units pl$PTime(runif(5) * 3600 * 24 * 1E0, tu = "s") pl$PTime(runif(5) * 3600 * 24 * 1E3, tu = "ms") pl$PTime(runif(5) * 3600 * 24 * 1E6, tu = "us") pl$PTime(runif(5) * 3600 * 24 * 1E9, tu = "ns") pl$PTime("23:59:59") as_polars_series(pl$PTime(runif(5) * 3600 * 24 * 1E0, tu = "s")) pl$lit(pl$PTime("23:59:59"))$to_series() pl$lit(pl$PTime("23:59:59"))$to_r()
# make PTime in all time units pl$PTime(runif(5) * 3600 * 24 * 1E0, tu = "s") pl$PTime(runif(5) * 3600 * 24 * 1E3, tu = "ms") pl$PTime(runif(5) * 3600 * 24 * 1E6, tu = "us") pl$PTime(runif(5) * 3600 * 24 * 1E9, tu = "ns") pl$PTime("23:59:59") as_polars_series(pl$PTime(runif(5) * 3600 * 24 * 1E0, tu = "s")) pl$lit(pl$PTime("23:59:59"))$to_series() pl$lit(pl$PTime("23:59:59"))$to_r()
Create an "rpolars_raw_list", which is an R list where all elements must be
an R raw or NULL
.
pl_raw_list(...) ## S3 method for class 'rpolars_raw_list' x[index] ## S3 method for class 'rpolars_raw_list' as.list(x, ...)
pl_raw_list(...) ## S3 method for class 'rpolars_raw_list' x[index] ## S3 method for class 'rpolars_raw_list' as.list(x, ...)
... |
Elements |
x |
A |
index |
Elements to select |
In R, raw can contain a binary sequence of bytes, and the length is the number
of bytes. In polars a Series of DataType Binary is more like a
vector of vectors of bytes where missing values are allowed, similar to how
NA
s can be present in vectors.
To ensure correct round-trip conversion, r-polars uses an R list where any
elements must be raw or NULL
(encoded as missing), and the S3 class is
c("rpolars_raw_list","list")
.
An R list where any elements must be raw, and the S3 class is
c("rpolars_raw_list","list")
.
# create a rpolars_raw_list raw_list = pl$raw_list(raw(1), raw(3), charToRaw("alice"), NULL) # pass it to Series or lit pl$Series(values = raw_list) pl$lit(raw_list) # convert polars binary Series to rpolars_raw_list pl$Series(values = raw_list)$to_r() # NB: a plain list of raws yields a polars Series of DataType [list[Binary]] # which is not the same pl$Series(values = list(raw(1), raw(2))) # to regular list, use as.list or unclass as.list(raw_list) # subsetting preserves class pl$raw_list(NULL, raw(2), raw(3))[1:2] # to regular list, use as.list or unclass pl$raw_list(NULL, raw(2), raw(3)) |> as.list()
# create a rpolars_raw_list raw_list = pl$raw_list(raw(1), raw(3), charToRaw("alice"), NULL) # pass it to Series or lit pl$Series(values = raw_list) pl$lit(raw_list) # convert polars binary Series to rpolars_raw_list pl$Series(values = raw_list)$to_r() # NB: a plain list of raws yields a polars Series of DataType [list[Binary]] # which is not the same pl$Series(values = list(raw(1), raw(2))) # to regular list, use as.list or unclass as.list(raw_list) # subsetting preserves class pl$raw_list(NULL, raw(2), raw(3))[1:2] # to regular list, use as.list or unclass pl$raw_list(NULL, raw(2), raw(3)) |> as.list()
New DataFrame from CSV
pl_read_csv( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, dtypes = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, reuse_downloaded = TRUE, include_file_paths = NULL )
pl_read_csv( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, dtypes = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, reuse_downloaded = TRUE, include_file_paths = NULL )
source |
Path to a file or URL. It is possible to provide multiple paths provided that all CSV files have the same schema. It is not possible to provide several URLs. |
... |
Ignored. |
has_header |
Indicate if the first row of the dataset is a header or not. If
|
separator |
Single byte character to use as separator in the file. |
comment_prefix |
A string, which can be up to 5 symbols in length, used to indicate
the start of a comment line. For instance, it can be set to |
quote_char |
Single byte character used for quoting. Set to |
skip_rows |
Start reading after a particular number of rows. The header will be parsed at this offset. |
dtypes |
Named list of column names - dtypes or dtype - column names. This list is used while reading to overwrite dtypes. Supported types so far are:
|
null_values |
Values to interpret as
|
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
cache |
Cache the result after reading. |
infer_schema_length |
Maximum number of rows to read to infer the column
types. If set to 0, all columns will be read as UTF-8. If |
n_rows |
Maximum number of rows to read. |
encoding |
Either |
low_memory |
Reduce memory usage (will yield a lower performance). |
rechunk |
Reallocate to contiguous memory when all chunks / files are parsed. |
skip_rows_after_header |
Parse the first row as headers, and then skip this number of rows. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
try_parse_dates |
Try to automatically parse dates. Most ISO8601-like
formats can be inferred, as well as a handful of others. If this does not
succeed, the column remains of data type |
eol_char |
Single byte end of line character (default: |
raise_if_empty |
If |
truncate_ragged_lines |
Truncate lines that are longer than the schema. |
reuse_downloaded |
If |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
Read into a DataFrame from Arrow IPC (Feather v2) file
pl_read_ipc( source, ..., n_rows = NULL, memory_map = TRUE, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE )
pl_read_ipc( source, ..., n_rows = NULL, memory_map = TRUE, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE )
source |
A single character or a raw vector of Apache Arrow IPC file.
You can use globbing with |
... |
Ignored. |
n_rows |
Maximum number of rows to read. |
memory_map |
A logical. If |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
rechunk |
In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks. |
cache |
Cache the result after reading. |
temp_dir = tempfile() # Write a hive-style partitioned arrow file dataset arrow::write_dataset( mtcars, temp_dir, partitioning = c("cyl", "gear"), format = "arrow", hive_style = TRUE ) list.files(temp_dir, recursive = TRUE) # Read the dataset # Since hive-style partitioning is not supported, # the `cyl` and `gear` columns are not contained in the result pl$read_ipc( file.path(temp_dir, "**/*.arrow") ) # Read a raw vector arrow::arrow_table( foo = 1:5, bar = 6:10, ham = letters[1:5] ) |> arrow::write_to_raw(format = "file") |> pl$read_ipc()
temp_dir = tempfile() # Write a hive-style partitioned arrow file dataset arrow::write_dataset( mtcars, temp_dir, partitioning = c("cyl", "gear"), format = "arrow", hive_style = TRUE ) list.files(temp_dir, recursive = TRUE) # Read the dataset # Since hive-style partitioning is not supported, # the `cyl` and `gear` columns are not contained in the result pl$read_ipc( file.path(temp_dir, "**/*.arrow") ) # Read a raw vector arrow::arrow_table( foo = 1:5, bar = 6:10, ham = letters[1:5] ) |> arrow::write_to_raw(format = "file") |> pl$read_ipc()
Read a file from path into a polars DataFrame.
pl_read_ndjson( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE )
pl_read_ndjson( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, ignore_errors = FALSE )
source |
Path to a file or URL. It is possible to provide multiple paths provided that all NDJSON files have the same schema. It is not possible to provide several URLs. |
... |
Ignored. |
infer_schema_length |
Maximum number of rows to read to infer the column
types. If set to 0, all columns will be read as UTF-8. If |
batch_size |
Number of rows that will be processed per thread. |
n_rows |
Maximum number of rows to read. |
low_memory |
Reduce memory usage (will yield a lower performance). |
rechunk |
Reallocate to contiguous memory when all chunks / files are parsed. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
A DataFrame
if (require("jsonlite", quietly = TRUE)) { ndjson_filename = tempfile() jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) pl$read_ndjson(ndjson_filename) }
if (require("jsonlite", quietly = TRUE)) { ndjson_filename = tempfile() jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) pl$read_ndjson(ndjson_filename) }
Read a parquet file
pl_read_parquet( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = c("auto", "columns", "row_groups", "none"), hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, schema = NULL, rechunk = TRUE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL, allow_missing_columns = FALSE )
pl_read_parquet( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = c("auto", "columns", "row_groups", "none"), hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, schema = NULL, rechunk = TRUE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL, allow_missing_columns = FALSE )
source |
Path to a file. You can use globbing with |
... |
Ignored. |
n_rows |
Maximum number of rows to read. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
parallel |
This determines the direction of parallelism. |
hive_partitioning |
Infer statistics and schema from Hive partitioned URL
and use them to prune reads. If |
hive_schema |
A list containing the column names and data types of the
columns by which the data is partitioned, e.g.
|
try_parse_hive_dates |
Whether to try parsing hive values as date/datetime types. |
glob |
Expand path given via globbing rules. |
schema |
Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s).
If there are extra columns that are not in the file(s), consider also enabling |
rechunk |
In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks. |
low_memory |
Reduce memory usage (will yield a lower performance). |
storage_options |
Experimental. List of options necessary to scan parquet files from different cloud storage providers (GCP, AWS, Azure, HuggingFace). See the 'Details' section. |
use_statistics |
Use statistics in the parquet file to determine if pages can be skipped from reading. |
cache |
Cache the result after reading. |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
allow_missing_columns |
When reading a list of parquet files, if a column existing in the first
file cannot be found in subsequent files, the default behavior is to raise an error.
However, if |
The prefiltered strategy first evaluates the pushed-down predicates in parallel and determines a mask of which rows to read. Then, it parallelizes over both the columns and the row groups while filtering out rows that do not need to be read. This can provide significant speedups for large files (i.e. many row-groups) with a predicate that filters clustered rows or filters heavily. In other cases, prefiltered may slow down the scan compared to other strategies.
The prefiltered setting falls back to auto if no predicate is given.
Polars supports scanning parquet files from different cloud providers.
The cloud providers currently supported are AWS, GCP, and Azure.
The supported keys to pass to the storage_options
argument can be found
here:
Currently it is impossible to scan public parquet files from GCP without
a valid service account. Be sure to always include a service account in the
storage_options
argument.
It is possible to scan data stored on HuggingFace using a path starting with
hf://
. The hf://
path format is defined as
hf://BUCKET/REPOSITORY@REVISION/PATH
, where:
BUCKET is one of datasets or spaces
REPOSITORY is the location of the repository. This is usually in the
format of username/repo_name. A branch can also be optionally specified by
appending @branch
.
REVISION is the name of the branch (or commit) to use. This is optional and defaults to main if not given.
PATH is a file or directory path, or a glob pattern from the repository root.
A Hugging Face API key can be passed to access private locations using either of the following methods:
Passing a token in storage_options to the scan function, e.g.
scan_parquet(..., storage_options = list(token = <your HF token>))
Setting the HF_TOKEN environment variable, e.g.
Sys.setenv(HF_TOKEN = <your HF token>)
.
# Write a Parquet file that we can then import as DataFrame temp_file = withr::local_tempfile(fileext = ".parquet") as_polars_df(mtcars)$write_parquet(temp_file) pl$read_parquet(temp_file) # Write a hive-style partitioned parquet dataset temp_dir = withr::local_tempdir() as_polars_df(mtcars)$write_parquet(temp_dir, partition_by = c("cyl", "gear")) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$read_parquet(temp_dir)
# Write a Parquet file that we can then import as DataFrame temp_file = withr::local_tempfile(fileext = ".parquet") as_polars_df(mtcars)$write_parquet(temp_file) pl$read_parquet(temp_file) # Write a hive-style partitioned parquet dataset temp_dir = withr::local_tempdir() as_polars_df(mtcars)$write_parquet(temp_dir, partition_by = c("cyl", "gear")) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$read_parquet(temp_dir)
This allows one to do rowwise operations. See pl$fold()
to do rowwise
operations with an initial value.
pl_reduce(lambda, exprs)
pl_reduce(lambda, exprs)
lambda |
R function which takes two polars Series as input and returns one. |
exprs |
Expressions to aggregate over. May also be a wildcard expression. |
An expression that will be applied rowwise
df = as_polars_df(mtcars) # Make the row-wise sum of all columns df$with_columns( pl$reduce( lambda = \(acc, x) acc + x, exprs = pl$col("*") )$alias("mpg_drat_sum_reduced") )
df = as_polars_df(mtcars) # Make the row-wise sum of all columns df$with_columns( pl$reduce( lambda = \(acc, x) acc + x, exprs = pl$col("*") )$alias("mpg_drat_sum_reduced") )
Calculates the rolling correlation between two columns
pl_rolling_corr(a, b, window_size, min_periods = NULL, ddof = 1)
pl_rolling_corr(a, b, window_size, min_periods = NULL, ddof = 1)
a |
One column name or Expr or anything convertible Into |
b |
Another column name or Expr or anything convertible Into |
window_size |
int The length of the window |
min_periods |
NULL or int The number of values in the window that should be non-null before computing a result. If NULL, it will be set equal to window size. |
ddof |
integer Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
Expr for the computed rolling correlation
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_corr("a", "b", window_size = 2))$collect()
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_corr("a", "b", window_size = 2))$collect()
Calculates the rolling covariance between two columns
pl_rolling_cov(a, b, window_size, min_periods = NULL, ddof = 1)
pl_rolling_cov(a, b, window_size, min_periods = NULL, ddof = 1)
a |
One column name or Expr or anything convertible Into |
b |
Another column name or Expr or anything convertible Into |
window_size |
int The length of the window |
min_periods |
NULL or int The number of values in the window that should be non-null before computing a result. If NULL, it will be set equal to window size. |
ddof |
integer Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
Expr for the computed rolling covariance
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_cov("a", "b", window_size = 2))$collect()
lf = as_polars_lf(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_cov("a", "b", window_size = 2))$collect()
Read a file from path into a polars LazyFrame.
pl_scan_csv( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, dtypes = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, reuse_downloaded = TRUE, include_file_paths = NULL )
pl_scan_csv( source, ..., has_header = TRUE, separator = ",", comment_prefix = NULL, quote_char = "\"", skip_rows = 0, dtypes = NULL, null_values = NULL, ignore_errors = FALSE, cache = FALSE, infer_schema_length = 100, n_rows = NULL, encoding = "utf8", low_memory = FALSE, rechunk = TRUE, skip_rows_after_header = 0, row_index_name = NULL, row_index_offset = 0, try_parse_dates = FALSE, eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, reuse_downloaded = TRUE, include_file_paths = NULL )
source |
Path to a file or URL. It is possible to provide multiple paths provided that all CSV files have the same schema. It is not possible to provide several URLs. |
... |
Ignored. |
has_header |
Indicate if the first row of the dataset is a header or not. If
|
separator |
Single byte character to use as separator in the file. |
comment_prefix |
A string, which can be up to 5 symbols in length, used to indicate
the start of a comment line. For instance, it can be set to |
quote_char |
Single byte character used for quoting. Set to |
skip_rows |
Start reading after a particular number of rows. The header will be parsed at this offset. |
dtypes |
Named list of column names - dtypes or dtype - column names. This list is used while reading to overwrite dtypes. Supported types so far are:
|
null_values |
Values to interpret as
|
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
cache |
Cache the result after reading. |
infer_schema_length |
Maximum number of rows to read to infer the column
types. If set to 0, all columns will be read as UTF-8. If |
n_rows |
Maximum number of rows to read. |
encoding |
Either |
low_memory |
Reduce memory usage (will yield a lower performance). |
rechunk |
Reallocate to contiguous memory when all chunks / files are parsed. |
skip_rows_after_header |
Parse the first row as headers, and then skip this number of rows. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
try_parse_dates |
Try to automatically parse dates. Most ISO8601-like
formats can be inferred, as well as a handful of others. If this does not
succeed, the column remains of data type |
eol_char |
Single byte end of line character (default: |
raise_if_empty |
If |
truncate_ragged_lines |
Truncate lines that are longer than the schema. |
reuse_downloaded |
If |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
my_file = tempfile() write.csv(iris, my_file) lazy_frame = pl$scan_csv(my_file) lazy_frame$collect() unlink(my_file)
my_file = tempfile() write.csv(iris, my_file) lazy_frame = pl$scan_csv(my_file) lazy_frame$collect() unlink(my_file)
This allows the query optimizer to push down predicates and projections to the scan level, thereby potentially reducing memory overhead.
pl_scan_ipc( source, ..., n_rows = NULL, memory_map = TRUE, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, include_file_paths = NULL )
pl_scan_ipc( source, ..., n_rows = NULL, memory_map = TRUE, row_index_name = NULL, row_index_offset = 0L, rechunk = FALSE, cache = TRUE, hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, include_file_paths = NULL )
source |
Path to a file. You can use globbing with |
... |
Ignored. |
n_rows |
Maximum number of rows to read. |
memory_map |
A logical. If |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
rechunk |
In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks. |
cache |
Cache the result after reading. |
hive_partitioning |
Infer statistics and schema from Hive partitioned URL
and use them to prune reads. If |
hive_schema |
A list containing the column names and data types of the
columns by which the data is partitioned, e.g.
|
try_parse_hive_dates |
Whether to try parsing hive values as date/datetime types. |
include_file_paths |
Character value indicating the column name that will include the path of the source file(s). |
Hive-style partitioning is not supported yet.
temp_dir = tempfile() # Write a hive-style partitioned arrow file dataset arrow::write_dataset( mtcars, temp_dir, partitioning = c("cyl", "gear"), format = "arrow", hive_style = TRUE ) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$scan_ipc(temp_dir)$collect() # We can also impose a schema to the partition pl$scan_ipc(temp_dir, hive_schema = list(cyl = pl$String, gear = pl$Int32))$collect()
temp_dir = tempfile() # Write a hive-style partitioned arrow file dataset arrow::write_dataset( mtcars, temp_dir, partitioning = c("cyl", "gear"), format = "arrow", hive_style = TRUE ) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$scan_ipc(temp_dir)$collect() # We can also impose a schema to the partition pl$scan_ipc(temp_dir, hive_schema = list(cyl = pl$String, gear = pl$Int32))$collect()
Read a file from path into a polars LazyFrame.
pl_scan_ndjson( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, reuse_downloaded = TRUE, ignore_errors = FALSE )
pl_scan_ndjson( source, ..., infer_schema_length = 100, batch_size = NULL, n_rows = NULL, low_memory = FALSE, rechunk = FALSE, row_index_name = NULL, row_index_offset = 0, reuse_downloaded = TRUE, ignore_errors = FALSE )
source |
Path to a file or URL. It is possible to provide multiple paths provided that all NDJSON files have the same schema. It is not possible to provide several URLs. |
... |
Ignored. |
infer_schema_length |
Maximum number of rows to read to infer the column
types. If set to 0, all columns will be read as UTF-8. If |
batch_size |
Number of rows that will be processed per thread. |
n_rows |
Maximum number of rows to read. |
low_memory |
Reduce memory usage (will yield a lower performance). |
rechunk |
Reallocate to contiguous memory when all chunks / files are parsed. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
reuse_downloaded |
If |
ignore_errors |
Keep reading the file even if some lines yield errors.
You can also use |
A LazyFrame
if (require("jsonlite", quietly = TRUE)) { ndjson_filename = tempfile() jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) pl$scan_ndjson(ndjson_filename)$collect() }
if (require("jsonlite", quietly = TRUE)) { ndjson_filename = tempfile() jsonlite::stream_out(iris, file(ndjson_filename), verbose = FALSE) pl$scan_ndjson(ndjson_filename)$collect() }
Scan a parquet file
pl_scan_parquet( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = c("auto", "columns", "row_groups", "none"), hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, schema = NULL, rechunk = FALSE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL, allow_missing_columns = FALSE )
pl_scan_parquet( source, ..., n_rows = NULL, row_index_name = NULL, row_index_offset = 0L, parallel = c("auto", "columns", "row_groups", "none"), hive_partitioning = NULL, hive_schema = NULL, try_parse_hive_dates = TRUE, glob = TRUE, schema = NULL, rechunk = FALSE, low_memory = FALSE, storage_options = NULL, use_statistics = TRUE, cache = TRUE, include_file_paths = NULL, allow_missing_columns = FALSE )
source |
Path to a file. You can use globbing with |
... |
Ignored. |
n_rows |
Maximum number of rows to read. |
row_index_name |
If not |
row_index_offset |
Offset to start the row index column (only used if the name is set). |
parallel |
This determines the direction of parallelism. |
hive_partitioning |
Infer statistics and schema from Hive partitioned URL
and use them to prune reads. If |
hive_schema |
A list containing the column names and data types of the
columns by which the data is partitioned, e.g.
|
try_parse_hive_dates |
Whether to try parsing hive values as date/datetime types. |
glob |
Expand path given via globbing rules. |
schema |
Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s).
If there are extra columns that are not in the file(s), consider also enabling |
rechunk |
In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks. |
low_memory |
Reduce memory usage (will yield a lower performance). |
storage_options |
Experimental. List of options necessary to scan parquet files from different cloud storage providers (GCP, AWS, Azure, HuggingFace). See the 'Details' section. |
use_statistics |
Use statistics in the parquet file to determine if pages can be skipped from reading. |
cache |
Cache the result after reading. |
include_file_paths |
Include the path of the source file(s) as a column with this name. |
allow_missing_columns |
When reading a list of parquet files, if a column existing in the first
file cannot be found in subsequent files, the default behavior is to raise an error.
However, if |
The prefiltered strategy first evaluates the pushed-down predicates in parallel and determines a mask of which rows to read. Then, it parallelizes over both the columns and the row groups while filtering out rows that do not need to be read. This can provide significant speedups for large files (i.e. many row-groups) with a predicate that filters clustered rows or filters heavily. In other cases, prefiltered may slow down the scan compared to other strategies.
The prefiltered setting falls back to auto if no predicate is given.
Polars supports scanning parquet files from different cloud providers.
The cloud providers currently supported are AWS, GCP, and Azure.
The supported keys to pass to the storage_options
argument can be found
here:
Currently it is impossible to scan public parquet files from GCP without
a valid service account. Be sure to always include a service account in the
storage_options
argument.
It is possible to scan data stored on HuggingFace using a path starting with
hf://
. The hf://
path format is defined as
hf://BUCKET/REPOSITORY@REVISION/PATH
, where:
BUCKET is one of datasets or spaces
REPOSITORY is the location of the repository. This is usually in the
format of username/repo_name. A branch can also be optionally specified by
appending @branch
.
REVISION is the name of the branch (or commit) to use. This is optional and defaults to main if not given.
PATH is a file or directory path, or a glob pattern from the repository root.
A Hugging Face API key can be passed to access private locations using either of the following methods:
Passing a token in storage_options to the scan function, e.g.
scan_parquet(..., storage_options = list(token = <your HF token>))
Setting the HF_TOKEN environment variable, e.g.
Sys.setenv(HF_TOKEN = <your HF token>)
.
# Write a Parquet file that we can then import as a DataFrame temp_file = withr::local_tempfile(fileext = ".parquet") as_polars_df(mtcars)$write_parquet(temp_file) pl$scan_parquet(temp_file)$collect() # Write a hive-style partitioned parquet dataset temp_dir = withr::local_tempdir() as_polars_df(mtcars)$write_parquet(temp_dir, partition_by = c("cyl", "gear")) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$scan_parquet(temp_dir)$collect()
# Write a Parquet file that we can then import as a DataFrame temp_file = withr::local_tempfile(fileext = ".parquet") as_polars_df(mtcars)$write_parquet(temp_file) pl$scan_parquet(temp_file)$collect() # Write a hive-style partitioned parquet dataset temp_dir = withr::local_tempdir() as_polars_df(mtcars)$write_parquet(temp_dir, partition_by = c("cyl", "gear")) list.files(temp_dir, recursive = TRUE) # If the path is a folder, Polars automatically tries to detect partitions # and includes them in the output pl$scan_parquet(temp_dir)$collect()
pl$select(...)
is a shorthand for pl$DataFrame(list())$select(...)
pl_select(...)
pl_select(...)
... |
pl$select( pl$lit(1:4)$alias("ints"), pl$lit(letters[1:4])$alias("letters") )
pl$select( pl$lit(1:4)$alias("ints"), pl$lit(letters[1:4])$alias("letters") )
This function is a simple way to convert R vectors to
the Series class object.
Internally, this function is a simple wrapper of as_polars_series()
.
pl_Series( name = NULL, values = NULL, dtype = NULL, ..., strict = TRUE, nan_to_null = FALSE )
pl_Series( name = NULL, values = NULL, dtype = NULL, ..., strict = TRUE, nan_to_null = FALSE )
name |
A character to use as the name of the Series, or |
values |
Object to convert into a polars Series.
Passed to the |
dtype |
One of polars data type or |
... |
Ignored. |
strict |
A logical. If |
nan_to_null |
If |
Python Polars has a feature that automatically interprets something like polars.Series([1])
as polars.Series(values=[1])
if you specify array-like objects as the first argument.
This feature is not available in R Polars, so something like pl$Series(1)
will raise an error.
You should use pl$Series(values = 1)
or as_polars_series(1)
instead.
# Constructing a Series by specifying name and values positionally: s = pl$Series("a", 1:3) s # Notice that the dtype is automatically inferred as a polars Int32: s$dtype # Constructing a Series with a specific dtype: s2 = pl$Series(values = 1:3, name = "a", dtype = pl$Float32) s2
# Constructing a Series by specifying name and values positionally: s = pl$Series("a", 1:3) s # Notice that the dtype is automatically inferred as a polars Int32: s$dtype # Constructing a Series with a specific dtype: s2 = pl$Series(values = 1:3, name = "a", dtype = pl$Float32) s2
Create a new SQLContext and register the given LazyFrames.
pl_SQLContext(...)
pl_SQLContext(...)
... |
Name-value pairs of LazyFrame like objects to register. |
An SQLContext
ctx = pl$SQLContext(mtcars = mtcars) ctx
ctx = pl$SQLContext(mtcars = mtcars) ctx
This function is syntactic sugar for pl$col(...)$std(ddof)
.
pl_std(..., ddof = 1)
pl_std(..., ddof = 1)
... |
Characters indicating the column names, passed to |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$std("a")) df$select(pl$std(c("a", "b")))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$std("a")) df$select(pl$std(c("a", "b")))
Collect columns into a struct column
pl_struct(exprs, schema = NULL)
pl_struct(exprs, schema = NULL)
exprs |
Columns/Expressions to collect into a Struct. |
schema |
Optional schema named list that explicitly defines the struct
field dtypes. Each name must match a column name wrapped in the struct. Can
only be used to cast some or all dtypes, not to change the names. If |
pl$struct()
creates an Expr of DataType Struct()
.
Compared to the Python implementation, pl$struct()
doesn't have the
argument eager
and always returns an Expr. Use $to_series()
to return a
Series.
Expr with dtype Struct
# isolated expression to wrap all columns in a struct aliased 'my_struct' pl$struct(pl$all())$alias("my_struct") # wrap all columns into one column/Series df = pl$DataFrame( int = 1:2, str = c("a", "b"), bool = c(TRUE, NA), list = list(1:2, 3L) )$select( pl$struct(pl$all())$alias("my_struct") ) print(df) print(df$schema) # returns a schema, a named list containing one element a Struct named my_struct # wrap two columns in a struct and provide a schema to set all or some DataTypes by name e1 = pl$struct( pl$col(c("int", "str")), schema = list(int = pl$Int64, str = pl$String) )$alias("my_struct") # same result as e.g. wrapping the columns in a struct and casting afterwards e2 = pl$struct( list(pl$col("int"), pl$col("str")) )$cast( pl$Struct(int = pl$Int64, str = pl$String) )$alias("my_struct") df = pl$DataFrame( int = 1:2, str = c("a", "b"), bool = c(TRUE, NA), list = list(1:2, 3L) ) # verify equality in R identical(df$select(e1)$to_list(), df$select(e2)$to_list()) df$select(e2) df$select(e2)$to_data_frame()
# isolated expression to wrap all columns in a struct aliased 'my_struct' pl$struct(pl$all())$alias("my_struct") # wrap all columns into one column/Series df = pl$DataFrame( int = 1:2, str = c("a", "b"), bool = c(TRUE, NA), list = list(1:2, 3L) )$select( pl$struct(pl$all())$alias("my_struct") ) print(df) print(df$schema) # returns a schema, a named list containing one element a Struct named my_struct # wrap two columns in a struct and provide a schema to set all or some DataTypes by name e1 = pl$struct( pl$col(c("int", "str")), schema = list(int = pl$Int64, str = pl$String) )$alias("my_struct") # same result as e.g. wrapping the columns in a struct and casting afterwards e2 = pl$struct( list(pl$col("int"), pl$col("str")) )$cast( pl$Struct(int = pl$Int64, str = pl$String) )$alias("my_struct") df = pl$DataFrame( int = 1:2, str = c("a", "b"), bool = c(TRUE, NA), list = list(1:2, 3L) ) # verify equality in R identical(df$select(e1)$to_list(), df$select(e2)$to_list()) df$select(e2) df$select(e2)$to_data_frame()
Syntactic sugar for pl$col(...)$sum()
.
pl_sum(...)
pl_sum(...)
... |
Characters indicating the column names, passed to |
df = pl$DataFrame(col_a = 1:2, col_b = 3:4, c = 5:6) df$select(pl$sum("col_a")) # Sum multiple columns df$select(pl$sum("col_a", "col_b")) df$select(pl$sum("^col_.*$"))
df = pl$DataFrame(col_a = 1:2, col_b = 3:4, c = 5:6) df$select(pl$sum("col_a")) # Sum multiple columns df$select(pl$sum("col_a", "col_b")) df$select(pl$sum("^col_.*$"))
Compute the sum rowwise
pl_sum_horizontal(...)
pl_sum_horizontal(...)
... |
Columns to sum horizontally (row-wise). Accepts expressions. Strings are parsed as column names, other non-expression inputs are parsed as literals. |
Expr
df = pl$DataFrame( a = NA_real_, b = c(3:4, NA_real_, NA_real_), c = c(1:2, NA_real_, -Inf) ) df$with_columns( pl$sum_horizontal("a", "b", "c", 2)$alias("sum") )
df = pl$DataFrame( a = NA_real_, b = c(3:4, NA_real_, NA_real_), c = c(1:2, NA_real_, -Inf) ) df$with_columns( pl$sum_horizontal("a", "b", "c", 2)$alias("sum") )
Get the last n
rows. This function is syntactic sugar for pl$col(...)$tail(n)
.
pl_tail(..., n = 10)
pl_tail(..., n = 10)
... |
Characters indicating the column names, passed to |
n |
Number of rows to return. |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$tail("a")) df$select(pl$tail("a", "b", n = 2))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$tail("a")) df$select(pl$tail("a", "b", n = 2))
The threadpool size can be overridden by setting the
POLARS_MAX_THREADS
environment variable before process start.
It cannot be modified once polars
is loaded.
It is strongly recommended not to override this value as it will be
set automatically by the engine.
pl_thread_pool_size()
pl_thread_pool_size()
For compatibility with CRAN, the threadpool size is set to 2 by default. To disable this behavior and let the engine determine the threadpool size, one of the following ways can be used:
Enable the disable_limit_max_threads
feature of the library.
This can be done by setting the feature flag when installing the package.
See the installation vignette (vignette("install", "polars")
)
for details.
Set the polars.limit_max_threads
option to FALSE
with
the options()
function. Same as setting the POLARS_MAX_THREADS
environment
variable, this option must be set before loading the package.
The number of threads
pl$thread_pool_size()
pl$thread_pool_size()
Create a Time expression
pl_time(hour = NULL, minute = NULL, second = NULL, microsecond = NULL)
pl_time(hour = NULL, minute = NULL, second = NULL, microsecond = NULL)
hour |
An Expr or something coercible to an Expr, that must return an integer between 0 and 23. Strings are parsed as column names. Floats are cast to integers. |
minute |
An Expr or something coercible to an Expr, that must return an integer between 0 and 59. Strings are parsed as column names. Floats are cast to integers. |
second |
An Expr or something coercible to an Expr, that must return an integer between 0 and 59. Strings are parsed as column names. Floats are cast to integers. |
microsecond |
An Expr or something coercible to an Expr, that must return an integer between 0 and 999,999. Strings are parsed as column names. Floats are cast to integers. |
An Expr of type Time
df = pl$DataFrame(hour = 19:21, min = 9:11, sec = 10:12, micro = 1) df$with_columns( time_from_cols = pl$time("hour", "min", "sec", "micro"), time_from_lit = pl$time(12, 3, 5), time_from_mix = pl$time("hour", 3, 5) ) # floats are coerced to integers df$with_columns( time_floats = pl$time(12.5, 5.3, 1) ) # if time can't be constructed, it returns null df$with_columns( time_floats = pl$time(pl$lit("abc"), -2, 1) )
df = pl$DataFrame(hour = 19:21, min = 9:11, sec = 10:12, micro = 1) df$with_columns( time_from_cols = pl$time("hour", "min", "sec", "micro"), time_from_lit = pl$time(12, 3, 5), time_from_mix = pl$time("hour", 3, 5) ) # floats are coerced to integers df$with_columns( time_floats = pl$time(12.5, 5.3, 1) ) # if time can't be constructed, it returns null df$with_columns( time_floats = pl$time(pl$lit("abc"), -2, 1) )
This function simply checks if the global string cache is active.
pl_using_string_cache()
pl_using_string_cache()
A logical value
pl$with_string_cache
pl$enable_string_cache
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
pl$enable_string_cache() pl$using_string_cache() pl$disable_string_cache() pl$using_string_cache()
This function is syntactic sugar for pl$col(...)$var(ddof)
.
pl_var(..., ddof = 1)
pl_var(..., ddof = 1)
... |
Characters indicating the column names, passed to |
ddof |
An integer representing "Delta Degrees of Freedom":
the divisor used in the calculation is |
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$var("a")) df$select(pl$var("a", "b"))
df = pl$DataFrame( a = c(1, 8, 3), b = c(4, 5, 2), c = c("foo", "bar", "foo") ) df$select(pl$var("a")) df$select(pl$var("a", "b"))
This function only temporarily enables the global string cache.
pl_with_string_cache(expr)
pl_with_string_cache(expr)
expr |
An Expr to evaluate while the string cache is enabled. |
return value of expression
pl$using_string_cache
pl$enable_string_cache
# activate string cache temporarily when constructing two DataFrame's pl$with_string_cache({ df1 = as_polars_df(head(iris, 2)) df2 = as_polars_df(tail(iris, 2)) }) pl$concat(list(df1, df2))
# activate string cache temporarily when constructing two DataFrame's pl$with_string_cache({ df1 = as_polars_df(head(iris, 2)) df2 = as_polars_df(tail(iris, 2)) }) pl$concat(list(df1, df2))
One SEXP of Rtype: "externalptr" + a class attribute
object$method()
calls are facilitated by a $.ClassName
- s3method see 'R/after-wrappers.R'
Code completion is facilitated by .DollarNames.ClassName
-s3method see e.g. 'R/dataframe__frame.R'
Implementation of property-methods as DataFrame_columns() and syntax checking is an extension to $.ClassName
See function macro_add_syntax_check_to_class().
not applicable
# all a polars object is only made of: some_polars_object = as_polars_df(iris) str(some_polars_object) # External Pointer tagged with a class attribute. # All state is stored on rust side. # The single exception from the rule is class "GroupBy", where objects also have # two private attributes "groupby_input" and "maintain_order". str(as_polars_df(iris)$group_by("Species"))
# all a polars object is only made of: some_polars_object = as_polars_df(iris) str(some_polars_object) # External Pointer tagged with a class attribute. # All state is stored on rust side. # The single exception from the rule is class "GroupBy", where objects also have # two private attributes "groupby_input" and "maintain_order". str(as_polars_df(iris)$group_by("Species"))
Polars code completion
polars_code_completion_activate( mode = c("auto", "rstudio", "native"), verbose = TRUE ) polars_code_completion_deactivate()
polars_code_completion_activate( mode = c("auto", "rstudio", "native"), verbose = TRUE ) polars_code_completion_deactivate()
mode |
One of |
verbose |
Print message of what mode is started. |
Polars code completion has one implementation for a native terminal via
utils:::rc.getOption("custom.completer")
and one for Rstudio by intercepting
Rstudio internal functions .rs.getCompletionsFunction
&
.rs.getCompletionsDollar
in the loaded session environment tools:rstudio
.
Therefore, any error or slowness in the completion is likely to come from
r-polars implementation.
Either completer will evaluate the full line buffer to decide which methods are available. Pressing tab will literally evaluate the left-hand side, including any side effects it has. This works swiftly for the polars lazy API, but it can take some time for the eager API depending on the size of the data and of the query.
if (interactive()) { # activate completion polars_code_completion_activate() # method / property completion for chained expressions # add a $ and press tab to see methods of LazyFrame pl$LazyFrame(iris) # Arg + column-name completion # press tab inside group_by() to see args and/or column names. pl$LazyFrame(iris)$group_by() # deactivate like this or restart R session polars_code_completion_deactivate() }
if (interactive()) { # activate completion polars_code_completion_activate() # method / property completion for chained expressions # add a $ and press tab to see methods of LazyFrame pl$LazyFrame(iris) # Arg + column-name completion # press tab inside group_by() to see args and/or column names. pl$LazyFrame(iris)$group_by() # deactivate like this or restart R session polars_code_completion_deactivate() }
The Polars duration string language
Polars duration string language is a simple representation of durations. It is used in many Polars functions that accept durations.
It has the following format:
1ns (1 nanosecond)
1us (1 microsecond)
1ms (1 millisecond)
1s (1 second)
1m (1 minute)
1h (1 hour)
1d (1 calendar day)
1w (1 calendar week)
1mo (1 calendar month)
1q (1 calendar quarter)
1y (1 calendar year)
Or combine them: "3d12h4m25s"
# 3 days, 12 hours, 4 minutes, and 25 seconds
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
Get polars environment variables
polars_envvars()
polars_envvars()
The following envvars are available (in alphabetical order, with the default value in parenthesis):
POLARS_FMT_MAX_COLS
(5
): Set the number of columns that are visible when
displaying tables. If negative, all columns are displayed.
POLARS_FMT_MAX_ROWS
(8
): Set the number of rows that are visible when
displaying tables. If negative, all rows are displayed. This applies to both
DataFrame
and Series
.
POLARS_FMT_STR_LEN
(32
): Maximum number of characters to display;
POLARS_FMT_TABLE_CELL_ALIGNMENT
("LEFT"
): set the table cell alignment.
Can be "LEFT"
, "CENTER"
, "RIGHT"
;
POLARS_FMT_TABLE_CELL_LIST_LEN
(3
): Maximum number of elements of list
variables to display;
POLARS_FMT_TABLE_CELL_NUMERIC_ALIGNMENT
("LEFT"
): Set the table cell
alignment for numeric columns. Can be "LEFT"
, "CENTER"
, "RIGHT"
;
POLARS_FMT_TABLE_DATAFRAME_SHAPE_BELOW
("0"
): print the DataFrame shape
information below the data when displaying tables. Can be "0"
or "1"
.
POLARS_FMT_TABLE_FORMATTING
("UTF8_FULL_CONDENSED"
): Set table
formatting style. Possible values:
"ASCII_FULL"
: ASCII, with all borders and lines, including row dividers.
"ASCII_FULL_CONDENSED"
: Same as ASCII_FULL, but with dense row spacing.
"ASCII_NO_BORDERS"
: ASCII, no borders.
"ASCII_BORDERS_ONLY"
: ASCII, borders only.
"ASCII_BORDERS_ONLY_CONDENSED"
: ASCII, borders only, dense row spacing.
"ASCII_HORIZONTAL_ONLY"
: ASCII, horizontal lines only.
"ASCII_MARKDOWN"
: ASCII, Markdown compatible.
"UTF8_FULL"
: UTF8, with all borders and lines, including row dividers.
"UTF8_FULL_CONDENSED"
: Same as UTF8_FULL, but with dense row spacing.
"UTF8_NO_BORDERS"
: UTF8, no borders.
"UTF8_BORDERS_ONLY"
: UTF8, borders only.
"UTF8_HORIZONTAL_ONLY"
: UTF8, horizontal lines only.
"NOTHING"
: No borders or other lines.
POLARS_FMT_TABLE_HIDE_COLUMN_DATA_TYPES
("0"
): Hide table column data
types (i64, f64, str etc.). Can be "0"
or "1"
.
POLARS_FMT_TABLE_HIDE_COLUMN_NAMES
("0"
): Hide table column names. Can
be "0"
or "1"
.
POLARS_FMT_TABLE_HIDE_COLUMN_SEPARATOR
("0"
): Hide the "---"
separator
between the column names and column types. Can be "0"
or "1"
.
POLARS_FMT_TABLE_HIDE_DATAFRAME_SHAPE_INFORMATION
("0"
): Hide the
DataFrame shape information when displaying tables. Can be "0"
or "1"
.
POLARS_FMT_TABLE_INLINE_COLUMN_DATA_TYPE
("0"
): Moves the data type
inline with the column name (to the right, in parentheses). Can be "0"
or "1"
.
POLARS_FMT_TABLE_ROUNDED_CORNERS
("0"
): Apply rounded corners to
UTF8-styled tables (only applies to UTF8 formats).
POLARS_MAX_THREADS
(<variable>
): Maximum number of threads used to
initialize the thread pool. The thread pool is locked once polars is loaded,
so this envvar must be set before loading the package.
POLARS_STREAMING_CHUNK_SIZE
(<variable>
): Chunk size used in the
streaming engine. Integer larger than 1. By default, the chunk size is
determined by the schema and size of the thread pool. For some datasets
(esp. when you have large string elements) this can be too optimistic and
lead to Out of Memory errors.
POLARS_TABLE_WIDTH
(<variable>
): Set the maximum width of a table in
characters.
POLARS_VERBOSE
("0"
): Enable additional verbose/debug logging.
POLARS_WARN_UNSTABLE
("0"
): Issue a warning when unstable functionality
is used. Enabling this setting may help avoid functionality that is still
evolving, potentially reducing maintenance burden from API changes and bugs.
Can be "0"
or "1"
.
The following configuration options are present in the Python API but currently cannot be changed in R: decimal separator, thousands separator, float precision, float formatting, trimming decimal zeros.
polars_envvars()
returns a named list where the names are the names
of environment variables and values are their values.
polars_envvars() pl$DataFrame(x = "This is a very very very long sentence.") Sys.setenv(POLARS_FMT_STR_LEN = 50) pl$DataFrame(x = "This is a very very very long sentence.") # back to default Sys.setenv(POLARS_FMT_STR_LEN = 32)
polars_envvars() pl$DataFrame(x = "This is a very very very long sentence.") Sys.setenv(POLARS_FMT_STR_LEN = 50) pl$DataFrame(x = "This is a very very very long sentence.") # back to default Sys.setenv(POLARS_FMT_STR_LEN = 32)
This function reports the following information:
Package versions (the Polars R package version and the dependent Rust Polars crate version)
Rust feature flags (See vignette("install", "polars")
for details)
Code completion mode: either "deactivated"
, "rstudio"
, or "native"
.
See polars_code_completion_activate()
.
polars_info()
polars_info()
A list with information of the package
polars_info() polars_info()$versions polars_info()$features$nightly
polars_info() polars_info()$versions polars_info()$features$nightly
polars_options()
returns a list of options for polars. Options
can be set with options()
. Note that options must be prefixed with
"polars.", e.g. to modify the option strictly_immutable
you need to pass
options(polars.strictly_immutable =)
. See below for a description of all
options.
polars_options_reset()
brings all polars options back to their default
value.
polars_options() polars_options_reset()
polars_options() polars_options_reset()
The following options are available (in alphabetical order, with the default value in parenthesis):
debug_polars
(FALSE
): Print additional information to debug Polars.
do_not_repeat_call
(FALSE
): Do not print the call causing the error in
error messages. The default is to show them.
int64_conversion
("double"
): How should Int64 values be handled when
converting a polars object to R?
"double"
converts the integer values to double.
"bit64"
uses bit64::as.integer64()
to do the conversion (requires
the package bit64
to be attached).
"string"
converts Int64 values to character.
limit_max_threads
(!polars_info()$features$disable_limit_max_threads
):
See ?pl_thread_pool_size
for details.
This option should be set before the package is loaded.
maintain_order
(FALSE
): Default for the maintain_order
argument in
<LazyFrame>$group_by()
and
<DataFrame>$group_by()
.
no_messages
(FALSE
): Hide messages.
rpool_cap
: The maximum number of R sessions that can be used to process
R code in the background. See the section "About pool options" below.
strictly_immutable
(TRUE
): Keep polars strictly immutable. Polars/arrow
is in general pro "immutable objects". Immutability is also classic in R.
To mimic the Python-polars API, set this to FALSE.
polars_options()
returns a named list where the names are option names and
values are option values.
polars_options_reset()
doesn't return anything.
polars_options()$rpool_active
indicates the number of R sessions already
spawned in pool. polars_options()$rpool_cap
indicates the maximum number
of new R sessions that can be spawned. Anytime a polars thread worker needs
a background R session specifically to run R code embedded in a query via
$map_batches(..., in_background = TRUE)
or
$map_elements(..., in_background = TRUE)
, it will
obtain any R session idling in rpool, or spawn a new R session (process)
and add it to the rpool if rpool_cap
is not already reached. If
rpool_cap
is already reached, the thread worker will sleep until an R
session is idling.
Background R sessions communicate via polars arrow IPC (series/vectors) or
R serialize + shared memory buffers via the rust crate ipc-channel
.
Multi-process communication has overhead because all data must be
serialized/de-serialized and sent via buffers. Using multiple R sessions
will likely only give a speed-up in a low io - high cpu
scenario. Native
polars query syntax runs in threads and has no overhead.
options(polars.maintain_order = TRUE, polars.strictly_immutable = FALSE) polars_options() # option checks are run when calling polars_options(), not when setting # options options(polars.maintain_order = 42, polars.int64_conversion = "foobar") tryCatch( polars_options(), error = function(e) print(e) ) # reset options to their default value polars_options_reset()
options(polars.maintain_order = TRUE, polars.strictly_immutable = FALSE) polars_options() # option checks are run when calling polars_options(), not when setting # options options(polars.maintain_order = 42, polars.int64_conversion = "foobar") tryCatch( polars_options(), error = function(e) print(e) ) # reset options to their default value polars_options_reset()
Print values
## S3 method for class 'RPolarsSeries' print(x, ...)
## S3 method for class 'RPolarsSeries' print(x, ...)
x |
A Polars Series |
... |
Not used |
Aggregate a DataFrame over a rolling window created with $rolling()
.
RollingGroupBy_agg(...)
RollingGroupBy_agg(...)
... |
Exprs to aggregate over. Those can also be passed wrapped in a
list, e.g |
An aggregated DataFrame
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( pl$col("a"), pl$sum("a")$alias("sum_a"), pl$min("a")$alias("min_a"), pl$max("a")$alias("max_a") )
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$agg( pl$col("a"), pl$sum("a")$alias("sum_a"), pl$min("a")$alias("min_a"), pl$max("a")$alias("max_a") )
This class comes from <DataFrame>$rolling()
.
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")
Revert the $rolling()
operation. Doing <DataFrame>$rolling(...)$ungroup()
returns the original DataFrame
.
RollingGroupBy_ungroup()
RollingGroupBy_ungroup()
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$ungroup()
df = pl$DataFrame( dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), a = c(3, 7, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) df$rolling(index_column = "dt", period = "2d")$ungroup()
Get the row names
## S3 method for class 'RPolarsDataFrame' row.names(x)
## S3 method for class 'RPolarsDataFrame' row.names(x)
x |
A Polars DataFrame |
A handle to some polars query running in a background thread.
<LazyFrame>$collect_in_background()
will execute a polars
query detached from the R session and return an RPolarsRThreadHandle
immediately. This
RPolarsRThreadHandle
-class has the methods is_finished()
and
join()
.
The background thread cannot use the main R session, but can access the pool of extra R sessions
to process R code embedded in polars query via $map_batches(..., in_background = TRUE)
or
$map_elements(..., in_background = TRUE)
. Use options(polars.rpool_cap = XX)
to limit the number of
parallel R sessions.
Starting polars <LazyFrame>$collect_in_background()
with
e.g. some $map_batches(..., in_background = FALSE)
will raise an Error as the main R session is not
available to process the R part of the polars query. A native polars query does not need any R
session.
prexpr = pl$col("mpg")$map_batches(\(x) { Sys.sleep(.1) x * 0.43 }, in_background = TRUE)$alias("kml") handle = as_polars_lf(mtcars)$with_columns(prexpr)$collect_in_background() if (!handle$is_finished()) print("not done yet") df = handle$join() # get result df
prexpr = pl$col("mpg")$map_batches(\(x) { Sys.sleep(.1) x * 0.43 }, in_background = TRUE)$alias("kml") handle = as_polars_lf(mtcars)$with_columns(prexpr)$collect_in_background() if (!handle$is_finished()) print("not done yet") df = handle$join() # get result df
Check whether the RThreadHandle has finished
RThreadHandle_is_finished()
RThreadHandle_is_finished()
trinary value: TRUE
if finished, FALSE
if not, and NULL
if the handle was exhausted
with <RThreadHandle>$join()
.
Join an RThreadHandle
RThreadHandle_join()
RThreadHandle_join()
method <RThreadHandle>$join()
: will block until the job is done and then return some value
or raise an error from the thread.
Calling <RThreadHandle>$join()
a second time will raise an error because the handle is already
exhausted.
return value from background thread
Arithmetic operators for RPolars objects
## S3 method for class 'RPolarsExpr' x + y ## S3 method for class 'RPolarsExpr' x - y ## S3 method for class 'RPolarsExpr' x * y ## S3 method for class 'RPolarsExpr' x / y ## S3 method for class 'RPolarsExpr' x ^ y ## S3 method for class 'RPolarsExpr' x %% y ## S3 method for class 'RPolarsExpr' x %/% y ## S3 method for class 'RPolarsSeries' x + y ## S3 method for class 'RPolarsSeries' x - y ## S3 method for class 'RPolarsSeries' x * y ## S3 method for class 'RPolarsSeries' x / y ## S3 method for class 'RPolarsSeries' x ^ y ## S3 method for class 'RPolarsSeries' x %% y ## S3 method for class 'RPolarsSeries' x %/% y
## S3 method for class 'RPolarsExpr' x + y ## S3 method for class 'RPolarsExpr' x - y ## S3 method for class 'RPolarsExpr' x * y ## S3 method for class 'RPolarsExpr' x / y ## S3 method for class 'RPolarsExpr' x ^ y ## S3 method for class 'RPolarsExpr' x %% y ## S3 method for class 'RPolarsExpr' x %/% y ## S3 method for class 'RPolarsSeries' x + y ## S3 method for class 'RPolarsSeries' x - y ## S3 method for class 'RPolarsSeries' x * y ## S3 method for class 'RPolarsSeries' x / y ## S3 method for class 'RPolarsSeries' x ^ y ## S3 method for class 'RPolarsSeries' x %% y ## S3 method for class 'RPolarsSeries' x %/% y
x , y
|
Numeric type of RPolars objects or objects that can be coerced to such.
Only |
A Polars object the same type as the input.
pl$lit(5) + 10 5 + pl$lit(10) pl$lit(5) + pl$lit(10) +pl$lit(1) # This will not raise an error as it is not actually evaluated. expr = pl$lit(5) + "10" expr # Will raise an error as it is evaluated. tryCatch( expr$to_series(), error = function(e) e ) as_polars_series(5) + 10 +as_polars_series(5) -as_polars_series(5)
pl$lit(5) + 10 5 + pl$lit(10) pl$lit(5) + pl$lit(10) +pl$lit(1) # This will not raise an error as it is not actually evaluated. expr = pl$lit(5) + "10" expr # Will raise an error as it is evaluated. tryCatch( expr$to_series(), error = function(e) e ) as_polars_series(5) + 10 +as_polars_series(5) -as_polars_series(5)
Method equivalent of addition operator series + other
.
Series_add(other)
Series_add(other)
other |
Series like object of numeric or string values.
Converted to Series by |
as_polars_series(1:3)$add(as_polars_series(11:13)) as_polars_series(1:3)$add(11:13) as_polars_series(1:3)$add(1L) as_polars_series("a")$add("-z")
as_polars_series(1:3)$add(as_polars_series(11:13)) as_polars_series(1:3)$add(11:13) as_polars_series(1:3)$add(1L) as_polars_series("a")$add("-z")
Change name of Series
Series_alias(name)
Series_alias(name)
name |
New name. |
as_polars_series(1:3, name = "alice")$alias("bob")
as_polars_series(1:3, name = "alice")$alias("bob")
Reduce Boolean Series with ALL
Series_all()
Series_all()
A logical value
as_polars_series(c(TRUE, TRUE, NA))$all()
as_polars_series(c(TRUE, TRUE, NA))$all()
Reduce Boolean Series with ANY
Series_any()
Series_any()
A logical value
as_polars_series(c(TRUE, FALSE, NA))$any()
as_polars_series(c(TRUE, FALSE, NA))$any()
Append two Series
Series_append(other, immutable = TRUE)
Series_append(other, immutable = TRUE)
other |
Series to append. |
immutable |
Should the |
If immutable = FALSE
, the Series object will not behave as immutable. This
means that appending to this Series will affect any variable pointing to this
memory location. This will break normal scoping rules of R. Setting
immutable = FALSE
is discouraged as it can have undesirable side effects
and cloning Polars Series is a cheap operation.
# default immutable behavior, s_imut and s_imut_copy stay the same s_imut = as_polars_series(1:3) s_imut_copy = s_imut s_new = s_imut$append(as_polars_series(1:3)) s_new # the original Series didn't change s_imut s_imut_copy # enabling mutable behavior requires setting a global option withr::with_options( list(polars.strictly_immutable = FALSE), { s_mut = as_polars_series(1:3) s_mut_copy = s_mut s_new = s_mut$append(as_polars_series(1:3), immutable = FALSE) print(s_new) # the original Series also changed since it's mutable print(s_mut) print(s_mut_copy) } )
# default immutable behavior, s_imut and s_imut_copy stay the same s_imut = as_polars_series(1:3) s_imut_copy = s_imut s_new = s_imut$append(as_polars_series(1:3)) s_new # the original Series didn't change s_imut s_imut_copy # enabling mutable behavior requires setting a global option withr::with_options( list(polars.strictly_immutable = FALSE), { s_mut = as_polars_series(1:3) s_mut_copy = s_mut s_new = s_mut$append(as_polars_series(1:3), immutable = FALSE) print(s_new) # the original Series also changed since it's mutable print(s_mut) print(s_mut_copy) } )
Note that this is 0-indexed.
Series_arg_max()
Series_arg_max()
A numeric value
as_polars_series(c(5, 1))$arg_max()
as_polars_series(c(5, 1))$arg_max()
Note that this is 0-indexed.
Series_arg_min()
Series_arg_min()
A numeric value
as_polars_series(c(5, 1))$arg_min()
as_polars_series(c(5, 1))$arg_min()
Lengths of Series memory chunks
Series_chunk_lengths()
Series_chunk_lengths()
Numeric vector. Output length is the number of chunks, and the sum of the output is equal to the length of the full Series.
chunked_series = c(as_polars_series(1:3), as_polars_series(1:10)) chunked_series$chunk_lengths()
chunked_series = c(as_polars_series(1:3), as_polars_series(1:10)) chunked_series$chunk_lengths()
The Series
-class is simply two environments of respectively
the public and private methods/function calls to the polars rust side. The
instantiated Series
-object is an externalptr
to a low-level rust polars
Series object. The pointer address is the only statefulness of the Series
object on the R side. Any other state resides on the rust side. The S3
method .DollarNames.RPolarsSeries
exposes all public $foobar()
-methods
which are callable onto the object. Most methods return another
Series
-class instance or similar which allows for method chaining. This
class system, for lack of a better name, could be called "environment classes"
and is the same class system extendr provides, except here there is both a
public and private set of methods. For implementation reasons, the private
methods are external and must be called from .pr$Series$methodname()
,
also all private methods must take any self as an argument, thus they are
pure functions. Having the private methods as pure functions
solved/simplified self-referential complications.
Check out the source code in R/Series_frame.R to see how public methods are
derived from private methods. Check out extendr-wrappers.R to see the
extendr-auto-generated methods. These are moved to .pr and converted into
pure external functions in after-wrappers.R. In zzz.R (named zzz to be the last
file sourced) the extendr-methods are removed and replaced by any function
prefixed Series_
.
$dtype
returns the data type of the Series.
$flags
returns a named list with flag names and their values.
Flags are used internally to avoid doing unnecessary computations, such as
sorting a variable that we know is already sorted. The number of flags
varies depending on the column type: columns of type array
and list
have the flags SORTED_ASC
, SORTED_DESC
, and FAST_EXPLODE
, while other
column types only have the former two.
SORTED_ASC
is set to TRUE
when we sort a column in increasing order, so
that we can use this information later on to avoid re-sorting it.
SORTED_DESC
is similar but applies to sort in decreasing order.
$name
returns the name of the Series.
$shape
returns a numeric vector of length two with the length
of the Series and the width of the Series (always 1).
Series stores most of the Expr methods.
Some of these are stored in sub-namespaces.
$arr
stores all array related methods.
$bin
stores all binary related methods.
$cat
stores all categorical related methods.
$dt
stores all temporal related methods.
$list
stores all list related methods.
$str
stores all string related methods.
$struct
stores all struct related methods and active bindings.
Active bindings specific to Series:
$struct$fields
: Returns a character vector of the fields in the struct.
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
# make a Series s = as_polars_series(c(1:3, 1L)) # call an active binding s$shape # show flags s$sort()$flags # use Expr method s$cos() # use Expr method in subnamespaces as_polars_series(list(3:1, 1:2, NULL))$list$first() as_polars_series(c(1, NA, 2))$str$join("-") s = pl$date_range( as.Date("2024-02-18"), as.Date("2024-02-24"), interval = "1d" )$to_series() s s$dt$day() # Other active bindings in subnamespaces as_polars_series(data.frame(a = 1:2, b = 3:4))$struct$fields # show all available methods for Series pl$show_all_public_methods("RPolarsSeries")
# make a Series s = as_polars_series(c(1:3, 1L)) # call an active binding s$shape # show flags s$sort()$flags # use Expr method s$cos() # use Expr method in subnamespaces as_polars_series(list(3:1, 1:2, NULL))$list$first() as_polars_series(c(1, NA, 2))$str$join("-") s = pl$date_range( as.Date("2024-02-18"), as.Date("2024-02-24"), interval = "1d" )$to_series() s s$dt$day() # Other active bindings in subnamespaces as_polars_series(data.frame(a = 1:2, b = 3:4))$struct$fields # show all available methods for Series pl$show_all_public_methods("RPolarsSeries")
Returns an n-row null-filled Series with an identical schema. n
can be
greater than the current number of values in the Series.
Series_clear(n = 0)
Series_clear(n = 0)
n |
Number of (null-filled) rows to return in the cleared frame. |
An n-value null-filled Series with an identical schema
s = pl$Series(name = "a", values = 1:3) s$clear() s$clear(n = 5)
s = pl$Series(name = "a", values = 1:3) s$clear() s$clear(n = 5)
This makes a very cheap deep copy/clone of an existing
Series
. Rarely useful as Series
are nearly 100%
immutable. Any modification of a Series
should lead to a clone anyways, but
this can be useful when dealing with attributes (see examples).
Series_clone()
Series_clone()
df1 = as_polars_series(1:10) # Make a function to take a Series, add an attribute, and return a Series give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original Series also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_series(1:10) df2 = give_attr(df1) # now, the original Series doesn't get this attribute attributes(df1)
df1 = as_polars_series(1:10) # Make a function to take a Series, add an attribute, and return a Series give_attr = function(data) { attr(data, "created_on") = "2024-01-29" data } df2 = give_attr(df1) # Problem: the original Series also gets the attribute while it shouldn't! attributes(df1) # Use $clone() inside the function to avoid that give_attr = function(data) { data = data$clone() attr(data, "created_on") = "2024-01-29" data } df1 = as_polars_series(1:10) df2 = give_attr(df1) # now, the original Series doesn't get this attribute attributes(df1)
Method equivalent of division operator series / other
.
Series_div(other)
Series_div(other)
other |
Series like object of numeric.
Converted to Series by |
as_polars_series(1:3)$div(11:13) as_polars_series(1:3)$div(as_polars_series(11:13)) as_polars_series(1:3)$div(1L)
as_polars_series(1:3)$div(11:13) as_polars_series(1:3)$div(as_polars_series(11:13)) as_polars_series(1:3)$div(1L)
This checks whether two Series are equal in values and in their name.
Series_equals(other, null_equal = FALSE, strict = FALSE)
Series_equals(other, null_equal = FALSE, strict = FALSE)
other |
Series to compare with. |
null_equal |
If |
strict |
If |
A logical value
as_polars_series(1:4)$equals(as_polars_series(1:4)) # names are different as_polars_series(1:4, "bob")$equals(as_polars_series(1:4)) # nulls are different by default as_polars_series(c(1:4, NA))$equals(as_polars_series(c(1:4, NA))) as_polars_series(c(1:4, NA))$equals(as_polars_series(c(1:4, NA)), null_equal = TRUE) # datatypes are ignored by default as_polars_series(1:4)$cast(pl$Int16)$equals(as_polars_series(1:4)) as_polars_series(1:4)$cast(pl$Int16)$equals(as_polars_series(1:4), strict = TRUE)
as_polars_series(1:4)$equals(as_polars_series(1:4)) # names are different as_polars_series(1:4, "bob")$equals(as_polars_series(1:4)) # nulls are different by default as_polars_series(c(1:4, NA))$equals(as_polars_series(c(1:4, NA))) as_polars_series(c(1:4, NA))$equals(as_polars_series(c(1:4, NA)), null_equal = TRUE) # datatypes are ignored by default as_polars_series(1:4)$cast(pl$Int16)$equals(as_polars_series(1:4)) as_polars_series(1:4)$cast(pl$Int16)$equals(as_polars_series(1:4), strict = TRUE)
Method equivalent of floor division operator series %/% other
.
Series_floor_div(other)
Series_floor_div(other)
other |
Series like object of numeric.
Converted to Series by |
as_polars_series(1:3)$floor_div(11:13) as_polars_series(1:3)$floor_div(as_polars_series(11:13)) as_polars_series(1:3)$floor_div(1L)
as_polars_series(1:3)$floor_div(11:13) as_polars_series(1:3)$floor_div(as_polars_series(11:13)) as_polars_series(1:3)$floor_div(1L)
This checks whether the Series DataType is in pl$numeric_dtypes
.
Series_is_numeric()
Series_is_numeric()
A logical value
as_polars_series(1:4)$is_numeric() as_polars_series(c("a", "b", "c"))$is_numeric() pl$numeric_dtypes
as_polars_series(1:4)$is_numeric() as_polars_series(c("a", "b", "c"))$is_numeric() pl$numeric_dtypes
Check if the Series is sorted
Series_is_sorted(descending = FALSE)
Series_is_sorted(descending = FALSE)
descending |
Check if the Series is sorted in descending order. |
A logical value
Use $set_sorted()
to add a "sorted" flag to the Series
that could be used for faster operations later on.
as_polars_series(1:4)$sort()$is_sorted()
as_polars_series(1:4)$sort()$is_sorted()
Return the element at the given index
Series_item(index = NULL)
Series_item(index = NULL)
index |
Index of the item to return. |
A value of length 1
s1 = pl$Series(values = 1) s1$item() s2 = pl$Series(values = 9:7) s2$cum_sum()$item(-1)
s1 = pl$Series(values = 1) s1$item() s2 = pl$Series(values = 9:7) s2$cum_sum()$item(-1)
Length of a Series
Series_len()
Series_len()
A numeric value
as_polars_series(1:10)$len()
as_polars_series(1:10)$len()
About as slow as regular non-vectorized R. Similar to using R sapply on a vector.
Series_map_elements( fun, datatype = NULL, strict_return_type = TRUE, allow_fail_eval = FALSE )
Series_map_elements( fun, datatype = NULL, strict_return_type = TRUE, allow_fail_eval = FALSE )
fun |
R function; should take a single value as input and return one. |
datatype |
DataType of return value. Default NULL means same as input. |
strict_return_type |
bool, default TRUE: fail on wrong return type, FALSE: convert to polars Null |
allow_fail_eval |
bool, default FALSE: raise R fun error, TRUE: convert to polars Null |
s = as_polars_series(letters[1:5], "ltrs") f = \(x) paste(x, ":", as.integer(charToRaw(x))) s$map_elements(f, pl$String) # same as as_polars_series(sapply(s$to_r(), f), s$name)
s = as_polars_series(letters[1:5], "ltrs") f = \(x) paste(x, ":", as.integer(charToRaw(x))) s$map_elements(f, pl$String) # same as as_polars_series(sapply(s$to_r(), f), s$name)
Find the max of a Series
Series_max()
Series_max()
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
A numeric value
as_polars_series(c(1:2, NA, 3, 5))$max() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$max() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$max() # Inf-Inf is NaN
as_polars_series(c(1:2, NA, 3, 5))$max() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$max() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$max() # Inf-Inf is NaN
Compute the mean of a Series
Series_mean()
Series_mean()
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
A numeric value
as_polars_series(c(1:2, NA, 3, 5))$mean() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$mean() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$mean() # Inf-Inf is NaN
as_polars_series(c(1:2, NA, 3, 5))$mean() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$mean() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$mean() # Inf-Inf is NaN
Compute the median of a Series
Series_median()
Series_median()
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
A numeric value
as_polars_series(c(1:2, NA, 3, 5))$median() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$median() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$median() # Inf-Inf is NaN
as_polars_series(c(1:2, NA, 3, 5))$median() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$median() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$median() # Inf-Inf is NaN
Find the min of a Series
Series_min()
Series_min()
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
A numeric value
as_polars_series(c(1:2, NA, 3, 5))$min() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$min() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$min() # Inf-Inf is NaN
as_polars_series(c(1:2, NA, 3, 5))$min() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$min() # NaN carries / poisons as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$min() # Inf-Inf is NaN
Method equivalent of modulo operator series %% other
.
Series_mod(other)
Series_mod(other)
other |
Series like object of numeric.
Converted to Series by |
as_polars_series(1:4)$mod(2L) as_polars_series(1:3)$mod(as_polars_series(11:13)) as_polars_series(1:3)$mod(1L)
as_polars_series(1:4)$mod(2L) as_polars_series(1:3)$mod(as_polars_series(11:13)) as_polars_series(1:3)$mod(1L)
Method equivalent of multiplication operator series * other
.
Series_mul(other)
Series_mul(other)
other |
Series like object of numeric.
Converted to Series by |
as_polars_series(1:3)$mul(11:13) as_polars_series(1:3)$mul(as_polars_series(11:13)) as_polars_series(1:3)$mul(1L)
as_polars_series(1:3)$mul(11:13) as_polars_series(1:3)$mul(as_polars_series(11:13)) as_polars_series(1:3)$mul(1L)
Get the number of chunks that this Series contains.
Series_n_chunks()
Series_n_chunks()
A numeric value
s = as_polars_series(1:3) s$n_chunks() # Concatenate Series with rechunk = TRUE s2 = as_polars_series(4:6) pl$concat(s, s2, rechunk = TRUE)$n_chunks() # Concatenate Series with rechunk = FALSE pl$concat(s, s2, rechunk = FALSE)$n_chunks()
s = as_polars_series(1:3) s$n_chunks() # Concatenate Series with rechunk = TRUE s2 = as_polars_series(4:6) pl$concat(s, s2, rechunk = TRUE)$n_chunks() # Concatenate Series with rechunk = FALSE pl$concat(s, s2, rechunk = FALSE)$n_chunks()
Count unique values in Series
Series_n_unique()
Series_n_unique()
A numeric value
as_polars_series(c(1, 2, 1, 4, 4, 1, 5))$n_unique()
as_polars_series(c(1, 2, 1, 4, 4, 1, 5))$n_unique()
Method equivalent of power operator series ^ other
.
Series_pow(exponent)
Series_pow(exponent)
exponent |
Series like object of numeric.
Converted to Series by |
s = as_polars_series(1:4, name = "foo") s$pow(3L)
s = as_polars_series(1:4, name = "foo") s$pow(3L)
Print Series
Series_print()
Series_print()
self
as_polars_series(1:3)
as_polars_series(1:3)
Rename a series
Series_rename(name, in_place = FALSE)
Series_rename(name, in_place = FALSE)
name |
New name. |
in_place |
Rename in-place, which breaks immutability. If |
as_polars_series(1:4, "bob")$rename("alice")
as_polars_series(1:4, "bob")$rename("alice")
Note that this function doesn't exist in Python Polars.
Series_rep(n, rechunk = TRUE)
Series_rep(n, rechunk = TRUE)
n |
Number of times to repeat |
rechunk |
If |
as_polars_series(1:2, "bob")$rep(3)
as_polars_series(1:2, "bob")$rep(3)
Set a sorted flag on a Series
Series_set_sorted(..., descending = FALSE, in_place = FALSE)
Series_set_sorted(..., descending = FALSE, in_place = FALSE)
... |
Ignored. |
descending |
Sort the columns in descending order. |
in_place |
If |
Use $flags
to see the values of the sorted flags.
A Series with a flag
s = as_polars_series(1:4)$set_sorted() s$flags
s = as_polars_series(1:4)$set_sorted() s$flags
Sort a Series
Series_sort( ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, in_place = FALSE )
Series_sort( ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, in_place = FALSE )
... |
Ignored. |
descending |
A logical. If |
nulls_last |
A logical. If |
multithreaded |
A logical. If |
in_place |
If |
as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort() as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort(nulls_last = TRUE)
as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort() as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort(nulls_last = TRUE)
Compute the standard deviation of a Series
Series_std(ddof = 1)
Series_std(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A numeric value
as_polars_series(1:10)$std()
as_polars_series(1:10)$std()
Method equivalent of subtraction operator series - other
.
Series_sub(other)
Series_sub(other)
other |
Series like object of numeric.
Converted to Series by |
as_polars_series(1:3)$sub(11:13) as_polars_series(1:3)$sub(as_polars_series(11:13)) as_polars_series(1:3)$sub(1L) 1L - as_polars_series(1:3) as_polars_series(1:3) - 1L
as_polars_series(1:3)$sub(11:13) as_polars_series(1:3)$sub(as_polars_series(11:13)) as_polars_series(1:3)$sub(1L) 1L - as_polars_series(1:3) as_polars_series(1:3) - 1L
Compute the sum of a Series
Series_sum()
Series_sum()
The Dtypes Int8, UInt8, Int16 and UInt16 are cast to Int64 before summing to prevent overflow issues.
A numeric value
as_polars_series(c(1:2, NA, 3, 5))$sum() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$sum() # NaN poisons the result as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$sum() # Inf-Inf is NaN
as_polars_series(c(1:2, NA, 3, 5))$sum() # a NA is dropped always as_polars_series(c(1:2, NA, 3, NaN, 4, Inf))$sum() # NaN poisons the result as_polars_series(c(1:2, 3, Inf, 4, -Inf, 5))$sum() # Inf-Inf is NaN
Convert Series to DataFrame
Series_to_frame()
Series_to_frame()
DataFrame
# default will be a DataFrame with empty name as_polars_series(1:4)$to_frame() as_polars_series(1:4, "bob")$to_frame()
# default will be a DataFrame with empty name as_polars_series(1:4)$to_frame() as_polars_series(1:4, "bob")$to_frame()
Convert a Series to literal
Series_to_lit()
Series_to_lit()
as_polars_series(list(1:1, 1:2, 1:3, 1:4))$ print()$ to_lit()$ list$len()$ sum()$ cast(pl$dtypes$Int8)$ to_series()
as_polars_series(list(1:1, 1:2, 1:3, 1:4))$ print()$ to_lit()$ list$len()$ sum()$ cast(pl$dtypes$Int8)$ to_series()
$to_r()
automatically returns an R vector or list based on the Polars
DataType. It is possible to force the output type by using $to_vector()
or
$to_list()
.
Series_to_r(int64_conversion = polars_options()$int64_conversion) Series_to_vector(int64_conversion = polars_options()$int64_conversion) Series_to_list(int64_conversion = polars_options()$int64_conversion)
Series_to_r(int64_conversion = polars_options()$int64_conversion) Series_to_vector(int64_conversion = polars_options()$int64_conversion) Series_to_list(int64_conversion = polars_options()$int64_conversion)
int64_conversion |
How should Int64 values be handled when converting a polars object to R?
|
R list or vector
When converting Polars objects, such as DataFrames
to R objects, for example via the as.data.frame()
generic function,
each type in the Polars object is converted to an R type.
In some cases, an error may occur because the conversion is not appropriate.
In particular, there is a high possibility of an error when converting
a Datetime type without a time zone.
A Datetime type without a time zone in Polars is converted
to the POSIXct type in R, which takes into account the time zone in which
the R session is running (which can be checked with the Sys.timezone()
function). In this case, if ambiguous times are included, a conversion error
will occur. In such cases, change the session time zone using
Sys.setenv(TZ = "UTC")
and then perform the conversion, or use the
$dt$replace_time_zone()
method on the Datetime type column to
explicitly specify the time zone before conversion.
# Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am # so this particular date-time doesn't exist non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T") withr::with_timezone( "America/New_York", { tryCatch( # This causes an error due to the time zone (the `TZ` env var is affected). as.vector(non_existent_time), error = function(e) e ) } ) #> <error: in to_r: ComputeError(ErrString("datetime '2020-03-08 02:00:00' is non-existent in time zone 'America/New_York'. You may be able to use `non_existent='null'` to return `null` in this case.")) When calling: devtools::document()> withr::with_timezone( "America/New_York", { # This is safe. as.vector(non_existent_time$dt$replace_time_zone("UTC")) } ) #> [1] "2020-03-08 02:00:00 UTC"
# Series with non-list type series_vec = as_polars_series(letters[1:3]) series_vec$to_r() # as vector because Series DataType is not list (is String) series_vec$to_list() # implicit call as.list(), convert to list series_vec$to_vector() # implicit call unlist(), same as to_r() as already vector # make a Series with nested lists series_list = as_polars_series( list( list(c(1:5, NA_integer_)), list(1:2, NA_integer_) ) ) series_list series_list$to_r() # as list because Series DataType is list series_list$to_list() # implicit call as.list(), same as to_r() as already list series_list$to_vector() # implicit call unlist(), append into a vector
# Series with non-list type series_vec = as_polars_series(letters[1:3]) series_vec$to_r() # as vector because Series DataType is not list (is String) series_vec$to_list() # implicit call as.list(), convert to list series_vec$to_vector() # implicit call unlist(), same as to_r() as already vector # make a Series with nested lists series_list = as_polars_series( list( list(c(1:5, NA_integer_)), list(1:2, NA_integer_) ) ) series_list series_list$to_r() # as list because Series DataType is list series_list$to_list() # implicit call as.list(), same as to_r() as already list series_list$to_vector() # implicit call unlist(), append into a vector
Count the occurrences of unique values
Series_value_counts( ..., sort = TRUE, parallel = FALSE, name = "count", normalize = FALSE )
Series_value_counts( ..., sort = TRUE, parallel = FALSE, name = "count", normalize = FALSE )
... |
Ignored. |
sort |
Ensure the output is sorted from most values to least. |
parallel |
Better to turn this off in the aggregation context, as it can lead to contention. |
name |
Give the resulting count column a specific name. The default is
|
normalize |
If |
DataFrame
as_polars_series(iris$Species, name = "flower species")$value_counts()
as_polars_series(iris$Species, name = "flower species")$value_counts()
Compute the variance of a Series
Series_var(ddof = 1)
Series_var(ddof = 1)
ddof |
Delta Degrees of Freedom: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. |
A numeric value
as_polars_series(1:10)$var()
as_polars_series(1:10)$var()
Print any object (function, RPolarsDataType) available via pl$
.
pl_show_all_public_functions()
pl_show_all_public_functions()
pl$show_all_public_functions()
pl$show_all_public_functions()
Methods are listed by their class
pl_show_all_public_methods(class_names = NULL)
pl_show_all_public_methods(class_names = NULL)
class_names |
character vector of polars class names to show, Default NULL is all. |
pl$show_all_public_methods()
pl$show_all_public_methods()
Run SQL queries against DataFrame/LazyFrame data.
lf = pl$LazyFrame(a = 1:3, b = c("x", NA, "z")) res = pl$SQLContext(frame = lf)$execute( "SELECT b, a*2 AS two_a FROM frame WHERE b IS NOT NULL" ) res$collect()
lf = pl$LazyFrame(a = 1:3, b = c("x", NA, "z")) res = pl$SQLContext(frame = lf)$execute( "SELECT b, a*2 AS two_a FROM frame WHERE b IS NOT NULL" ) res$collect()
Parse the given SQL query and execute it against the registered frame data.
SQLContext_execute(query)
SQLContext_execute(query)
query |
A character of the SQL query to execute. |
query = "SELECT * FROM mtcars WHERE cyl = 4" pl$SQLContext(mtcars = mtcars)$execute(query)
query = "SELECT * FROM mtcars WHERE cyl = 4" pl$SQLContext(mtcars = mtcars)$execute(query)
Register a single frame as a table, using the given name.
SQLContext_register(name, frame)
SQLContext_register(name, frame)
name |
A string name to register the frame as. |
frame |
A LazyFrame like object to register. |
If a table with the same name is already registered, it will be overwritten.
Returns the SQLContext object invisibly.
ctx = pl$SQLContext() ctx$register("mtcars", mtcars) ctx$execute("SELECT * FROM mtcars LIMIT 5")$collect()
ctx = pl$SQLContext() ctx$register("mtcars", mtcars) ctx$execute("SELECT * FROM mtcars LIMIT 5")$collect()
Automatically maps variable names to table names.
SQLContext_register_globals(..., envir = parent.frame())
SQLContext_register_globals(..., envir = parent.frame())
... |
Ignored. |
envir |
The environment to search for polars DataFrames/LazyFrames. |
If a table with the same name is already registered, it will be overwritten.
Returns the SQLContext object invisibly.
df1 = pl$DataFrame(a = 1:3, b = c("x", NA, "z")) df2 = pl$LazyFrame(a = 2:4, c = c("t", "w", "v")) # Register frames directly from variables found in the current environment. ctx = pl$SQLContext()$register_globals() ctx$tables() ctx$execute( "SELECT a, b, c FROM df1 LEFT JOIN df2 USING (a) ORDER BY a DESC" )$collect()
df1 = pl$DataFrame(a = 1:3, b = c("x", NA, "z")) df2 = pl$LazyFrame(a = 2:4, c = c("t", "w", "v")) # Register frames directly from variables found in the current environment. ctx = pl$SQLContext()$register_globals() ctx$tables() ctx$execute( "SELECT a, b, c FROM df1 LEFT JOIN df2 USING (a) ORDER BY a DESC" )$collect()
Register multiple frames as tables.
SQLContext_register_many(...)
SQLContext_register_many(...)
... |
Name-value pairs of LazyFrame-like objects to register. |
If a table with the same name is already registered, it will be overwritten.
Returns the SQLContext object invisibly.
ctx = pl$SQLContext() r_df = mtcars pl_df = as_polars_df(mtcars) pl_lf = as_polars_lf(mtcars) ctx$register_many(r_df = r_df, pl_df = pl_df, pl_lf = pl_lf) ctx$execute( "SELECT * FROM r_df UNION ALL SELECT * FROM pl_df UNION ALL SELECT * FROM pl_lf" )$collect()
ctx = pl$SQLContext() r_df = mtcars pl_df = as_polars_df(mtcars) pl_lf = as_polars_lf(mtcars) ctx$register_many(r_df = r_df, pl_df = pl_df, pl_lf = pl_lf) ctx$execute( "SELECT * FROM r_df UNION ALL SELECT * FROM pl_df UNION ALL SELECT * FROM pl_lf" )$collect()
Return a character vector of the registered table names.
SQLContext_tables()
SQLContext_tables()
A character vector of the registered table names.
ctx = pl$SQLContext() ctx$tables() ctx$register("df1", mtcars) ctx$tables() ctx$register("df2", mtcars) ctx$tables()
ctx = pl$SQLContext() ctx$tables() ctx$register("df1", mtcars) ctx$tables() ctx$register("df2", mtcars) ctx$tables()
Unregister tables by name.
SQLContext_unregister(names)
SQLContext_unregister(names)
names |
A character vector of table names to unregister. |
Returns the SQLContext object invisibly.
# Initialise a new SQLContext and register the given tables. ctx = pl$SQLContext(x = mtcars, y = mtcars, z = mtcars) ctx$tables() # Unregister some tables. ctx$unregister(c("x", "y")) ctx$tables()
# Initialise a new SQLContext and register the given tables. ctx = pl$SQLContext(x = mtcars, y = mtcars, z = mtcars) ctx$tables() # Unregister some tables. ctx$unregister(c("x", "y")) ctx$tables()
Compute the sum
## S3 method for class 'RPolarsDataFrame' sum(x, ...) ## S3 method for class 'RPolarsLazyFrame' sum(x, ...) ## S3 method for class 'RPolarsSeries' sum(x, ...)
## S3 method for class 'RPolarsDataFrame' sum(x, ...) ## S3 method for class 'RPolarsLazyFrame' sum(x, ...) ## S3 method for class 'RPolarsSeries' sum(x, ...)
x |
A Polars DataFrame, LazyFrame, or Series. |
... |
Not used. |
Drop duplicated rows
## S3 method for class 'RPolarsDataFrame' unique(x, incomparables = FALSE, subset = NULL, keep = "first", ...) ## S3 method for class 'RPolarsLazyFrame' unique(x, incomparables = FALSE, subset = NULL, keep = "first", ...)
## S3 method for class 'RPolarsDataFrame' unique(x, incomparables = FALSE, subset = NULL, keep = "first", ...) ## S3 method for class 'RPolarsLazyFrame' unique(x, incomparables = FALSE, subset = NULL, keep = "first", ...)
x |
A Polars DataFrame or LazyFrame. |
incomparables |
Not used. |
subset |
Character vector of column names to drop duplicated values from. |
keep |
Either `"first"`, `"last"`, `"any"`, or `"none"`, indicating which of the duplicated rows to keep. |
... |
Not used. |
df = pl$DataFrame( x = as.numeric(c(1, 1:5)), y = as.numeric(c(1, 1:5)), z = as.numeric(c(1, 1, 1:4)) ) unique(df)
df = pl$DataFrame( x = as.numeric(c(1, 1:5)), y = as.numeric(c(1, 1:5)), z = as.numeric(c(1, 1, 1:4)) ) unique(df)