1 change: 1 addition & 0 deletions NAMESPACE
@@ -3,6 +3,7 @@
S3method(print,rule)
S3method(print,ruleset)
export(check_data)
export(describe)
export(detect_backend)
export(filter_fails)
export(plot_res)
8 changes: 4 additions & 4 deletions R/check_data.R
@@ -103,15 +103,15 @@ detect_backend <- function(x) {

} else if ("tbl_sql" %in% cc) {

if (!has_pkg("DBI"))
stop("The DBI package needs to be installed in order to test a tbl_sql.")
if (!has_pkg("DBI") || !has_pkg("dbplyr"))
stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")

backend <- "collectibles"

} else if ("ArrowObject" %in% cc) {

if (!has_pkg("arrow"))
stop("The arrow package needs to be installed in order to test an ArrowObject.")
if (!has_pkg("arrow") || !has_pkg("dbplyr"))
stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")

backend <- "collectibles"

276 changes: 276 additions & 0 deletions R/describe.R
@@ -0,0 +1,276 @@
#' Describes a dataset
#'
#' Note that the current version is in a beta stage at best; that means the
#' R-native formats (data.frame, dplyr/tibble, or data.table) are handled a lot
#' faster than arrow- or SQL-based datasets.
#'
#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
#'
#' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing
#' a summary of the given dataset
#' @export
#'
#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
#'
#' @examples
#' describe(mtcars)
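#'
#' # the same call works for the other backends; a sketch, assuming the
#' # respective packages are installed:
#' \dontrun{
#' describe(dplyr::as_tibble(mtcars))
#' describe(data.table::as.data.table(mtcars))
#' describe(arrow::arrow_table(mtcars))
#' }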
describe <- function(x) {

backend <- detect_backend(x)

# make sure the input dataset has the right class
if (class(x)[[1]] == "data.frame") {
if (backend == "data.table") {
x <- data.table::as.data.table(x)
} else if (backend == "dplyr") {
x <- dplyr::as_tibble(x)
}
}

if (backend == "base-r") {
describe_base_r(x)
} else if (backend == "dplyr") {
describe_dplyr(x)
} else if (backend == "data.table") {
describe_data.table(x)
} else if (backend == "collectibles") {
if ("tbl_sql" %in% class(x)) {
describe_sql(x)
} else if ("ArrowObject" %in% class(x)) {
describe_arrow(x)
}
} else {
stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", ")))
}
}


# internal helper to decide which columns get the numeric summary (min/max etc.)
is_numeric <- function(v) {
any(class(v) %in% c("integer", "numeric", "POSIXt"))
}
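# e.g. is_numeric(1L) and is_numeric(Sys.time()) are TRUE, is_numeric("a") is
# FALSE; non-numeric columns are summarised via their string lengths instead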

# x <- mtcars
describe_base_r <- function(x, max_n = 3) {
ll <- lapply(
seq(ncol(x)),
function(i) {
v <- x[[i]]
type <- class(v)[[1]]
is_num <- is_numeric(v)

uv <- unique(v)
tab <- tabulate(match(v, uv))
# get the indices of the (up to) max_n highest counts
od <- order(tab, decreasing = TRUE)[seq(min(max_n, length(tab)))]

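# for non-numeric columns, min/mean/median/max/sd below are computed on the
# string lengths (number of characters) rather than on the values themselves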
nz <- if (!is_num) nchar(as.character(v))

data.frame(
var = names(x)[[i]],
type = type,
n = length(v),
n_distinct = length(uv),
n_na = sum(is.na(v)),
most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]),
collapse = ", "),

min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)),
mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)),
median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)),
max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)),
sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE))
)
}
)

do.call(rbind, ll)
}

# x <- mtcars |> tibble::as_tibble()
describe_dplyr <- function(x, max_n = 3) {
ll <- lapply(
names(x),
function(v) {
mc <- x |>
dplyr::count(.data[[v]]) |>
dplyr::slice_max(n, n = max_n, with_ties = FALSE)

type <- class(mc[[1]])[[1]]
is_num <- is_numeric(mc[[1]])
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")

nz <- if (!is_num) nchar(as.character(x[[v]]))
x |>
dplyr::summarise(
var = v,
type = type,
n = dplyr::n(),
n_distinct = dplyr::n_distinct(.data[[v]]),
n_na = sum(is.na(.data[[v]])),
most_frequent = mf,
min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
)
}
)

dplyr::bind_rows(ll)
}

# x <- mtcars |> data.table::as.data.table()
describe_data.table <- function(x, max_n = 3) {
ll <- lapply(
names(x),
function(v) {
mc <- x[, .(n = .N), by = v][order(n, decreasing = TRUE)][seq_len(min(max_n, .N))]

type <- class(mc[[1]])[[1]]
is_num <- is_numeric(mc[[1]])
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")

nz <- if (!is_num) nchar(as.character(x[[v]]))

x[, .(
var = v,
type = type,
n = .N,
n_distinct = data.table::uniqueN(get(v)),
n_na = sum(is.na(get(v))),
most_frequent = mf,

min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)),
mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)),
median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)),
max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)),
sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE))
)]
}
)

data.table::rbindlist(ll)
}


# RSQLite, duckdb etc
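# a hypothetical setup, assuming the RSQLite package is installed:
# con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# DBI::dbWriteTable(con, "mtcars", mtcars)
# x <- dplyr::tbl(con, "mtcars")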
describe_sql <- function(x, max_n = 3) {
ll <- lapply(names(x), function(v) {
mc <- x |>
dplyr::count(.data[[v]]) |>
dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
dplyr::collect()

type <- class(mc[[1]])[[1]]
is_num <- is_numeric(mc[[1]])
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
nn <- x |>
dplyr::distinct(.data[[v]]) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::collect()
nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()

r <- dplyr::tibble(
var = v,
type = type,
n_distinct = nn[[1]],
n_na = nna,
most_frequent = mf
)

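# select the variable and rename it to a fixed name so that the summarise
# below can refer to it as x; non-numeric columns are measured by string length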
xx <- x |>
dplyr::select(dplyr::all_of(v)) |>
dplyr::rename(x := dplyr::all_of(v))
if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x)))

rr <- try(
xx |>
dplyr::summarise(
min = min(x, na.rm = TRUE),
mean = mean(x, na.rm = TRUE),
median = median(x, na.rm = TRUE),
max = max(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE)
) |>
dplyr::collect(),
silent = TRUE
)
if (inherits(rr, "try-error")) {
rr <- dplyr::tibble(
min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
sd = NA_real_
)
}

dplyr::bind_cols(r, rr)
})

dplyr::bind_rows(ll)
}

# arrow::write_parquet(nycflights13::flights, "flights.parquet")
# x <- arrow::open_dataset("flights.parquet")
describe_arrow <- function(x, max_n = 3) {
# x is an ArrowObject, e.g. an arrow Table or an opened Dataset
ll <- lapply(names(x), function(v) {
mc <- x |>
dplyr::count(.data[[v]]) |>
dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
dplyr::collect()

type <- class(mc[[1]])[[1]]
is_num <- is_numeric(mc[[1]])
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
nn <- x |>
dplyr::distinct(.data[[v]]) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::collect()
nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()

r <- dplyr::tibble(
var = v,
type = type,
n_distinct = nn[[1]],
n_na = nna,
most_frequent = mf
)

if (is_num) {
xx <- x |> dplyr::transmute(x = .data[[v]])
} else {
xx <- x |> dplyr::transmute(x = nchar(as.character(.data[[v]])))
}

suppressWarnings({
rr <- try(
xx |>
dplyr::summarise(
min = min(x, na.rm = TRUE),
mean = mean(x, na.rm = TRUE),
median = median(x, na.rm = TRUE),
max = max(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE)
) |>
dplyr::collect(),
silent = TRUE)
})
if (inherits(rr, "try-error")) {
rr <- dplyr::tibble(
min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
sd = NA_real_
)
}

dplyr::bind_cols(r, rr)
})

dplyr::bind_rows(ll)
}
16 changes: 11 additions & 5 deletions README.Rmd
@@ -32,9 +32,10 @@ That way, you can concentrate on writing the rules and making sure that your dat

The package is lightweight, as all the heavy dependencies are Suggests-only. That means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc.) unless you explicitly tell R to install all suggested packages as well when installing the package.

The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages.
The backend for your analysis is chosen automatically based on the type of the input dataset as well as on the available packages (see also `?detect_backend`).
By handing all code evaluation over to the backend, this package can deal with any size of data that the backend itself can deal with.
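For a plain `data.frame`, for example, the chosen backend depends on which of the suggested packages are installed (a minimal sketch; the exact value can differ on your machine):

```{r detect-backend, eval=FALSE}
# returns one of "base-r", "dplyr", "data.table", or "collectibles",
# depending on the class of the dataset and the installed packages
detect_backend(mtcars)
```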

The package also provides a helper function to describe a dataset; see `?describe`.

## Installation

@@ -63,6 +64,9 @@ At the moment rules work in a window/vectorized approach only, that means that a
```{r example, message=FALSE}
library(dataverifyr)

# create a dataset
data <- mtcars

# define a rule set within our R code; alternatively in a yaml file
rules <- ruleset(
rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -73,8 +77,11 @@ rules <- ruleset(
# print the rules
rules

# describe the dataset
describe(data)

# check if the data matches our rules
res <- check_data(mtcars, rules)
res <- check_data(data, rules)
res
```

@@ -206,9 +213,8 @@ if (!file.exists(file)) download.file(url, file, method = "curl")
file.size(file) / 1e6 # in MB

# quick look at the dataset
d <- read_parquet(file)
dim(d)
names(d)
d <- open_dataset(file)
describe(d)
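# note: describe() pushes its aggregations down to arrow, so the (potentially
# large) dataset does not have to be read into R as a whole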

# write the dataset to disk
write_dataset(d, "nyc-taxi-data")