From 04b86971f988df44e35b65bdba8b5d1c5e76e478 Mon Sep 17 00:00:00 2001
From: evalparse
Date: Sun, 15 Dec 2019 03:39:20 +1100
Subject: [PATCH 1/6] up

---
 R/map.r                      |   2 -
 R/two-stage-verbs.R          |   2 +-
 book/01-intro.Rmd            |   5 +-
 book/02-intro-disk-frame.Rmd | 119 ++++----------
 book/03-concepts.Rmd         |  17 +-
 book/04-ingesting-data.Rmd   |  11 +-
 book/06-vs-dask-juliadb.Rmd  |  18 +--
 book/08-more-epic.Rmd        |  14 +-
 book/10-group-by.Rmd         | 301 +++++++++++++++++++++++++++++++++--
 man/map.Rd                   |   3 -
 10 files changed, 360 insertions(+), 132 deletions(-)

diff --git a/R/map.r b/R/map.r
index cc6f0c5d..60a09ea5 100644
--- a/R/map.r
+++ b/R/map.r
@@ -164,7 +164,6 @@ map_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill =
#'
#' # clean up cars.df
#' delete(cars.df)
-#' @rdname map
imap <- function(.x, .f, ...) {
  UseMethod("imap")
}
@@ -269,7 +268,6 @@ delayed <- function(.x, .f, ...) {
}

#' @export
-#' @rdname map
delayed.disk.frame <- function(.x, .f, ...) {
  map.disk.frame(.x, .f, ..., lazy = TRUE)
}
diff --git a/R/two-stage-verbs.R b/R/two-stage-verbs.R
index d90719f5..3a93af53 100644
--- a/R/two-stage-verbs.R
+++ b/R/two-stage-verbs.R
@@ -163,7 +163,7 @@ IQR.collected_agg.disk.frame <- function(listx, ...) {

#' A function to parse the summarize function
#' @importFrom dplyr filter select pull
-#' @imporFrom purr map_dfr
+#' @importFrom purrr map_dfr
#' @export
summarise.grouped_disk.frame <- function(.data, ...) {
  code = substitute(list(...))[-1]
diff --git a/book/01-intro.Rmd b/book/01-intro.Rmd
index 7ea73dd4..d0df5a20 100644
--- a/book/01-intro.Rmd
+++ b/book/01-intro.Rmd
@@ -8,9 +8,10 @@ vignette: >
  %\VignetteEncoding{UTF-8}
---

-# The story of how `disk.frame` came to be
+# The story of how `{disk.frame}` came to be

I was working at one of Australia's biggest banks, and their shiny new SAS server was experiencing severe instability issues. As a result, we had to run SAS on our laptops to perform huge amounts of data manipulation. A simple SQL query could take around 20 minutes. I had enough.

-That's why I created `disk.frame` - a larger-than-RAM data manipulation framework for R. The same query now only takes 10 seconds. \ No newline at end of file
+That's why I created `{disk.frame}` - a larger-than-RAM data manipulation framework for R. The same query now takes only 10 seconds.
+
diff --git a/book/02-intro-disk-frame.Rmd b/book/02-intro-disk-frame.Rmd
index 2f101f45..26800779 100644
--- a/book/02-intro-disk-frame.Rmd
+++ b/book/02-intro-disk-frame.Rmd
@@ -152,7 +152,7 @@ The class of `flights.df1` is also a `disk.frame` after the `dplyr::select` tr
For lazily constructed `disk.frame`s (e.g. `flights.df1`), the function `collect` can be used to bring the results from disk into R, e.g.

```{r, dependson='dfselect'}
-collect(flights.df1) %>% head
+collect(flights.df1) %>% head(2)
```

Of course, for larger-than-RAM datasets, one wouldn't call `collect` on the whole `disk.frame` (why would you need `disk.frame` otherwise?). More likely, one would call `collect` on a `filter`ed dataset or one summarized with `group_by`.
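To make the `filter`-then-`collect` pattern concrete, here is a minimal sketch. It assumes the `flights.df` disk.frame created earlier in this chapter; the 120-minute cutoff is purely illustrative.

```r
# the pipeline stays lazy until collect(); only the filtered rows are
# materialized in RAM as a single data frame
late_flights <- flights.df %>%
  filter(dep_delay > 120) %>% # recorded, not yet executed
  collect()

nrow(late_flights) # the collected subset is small enough to inspect in memory
```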
@@ -160,11 +160,11 @@ Of course, for larger-than-RAM datasets, one wouldn't call `collect` on the whol
Some examples of other dplyr verbs applied:

```{r, dependson='asdiskframe'}
-filter(flights.df, dep_delay > 1000) %>% collect %>% head
+filter(flights.df, dep_delay > 1000) %>% collect %>% head(2)
```

```{r, dependson='asdiskframe'}
-mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head
+mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head(2)
```

### Examples of NOT fully supported `dplyr` verbs

@@ -173,7 +173,7 @@ The `chunk_arrange` function arranges (sorts) each chunk but not the whole datas

```{r, dependson='asdiskframe'}
# this only sorts within each chunk
-chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head
+chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2)
```


@@ -204,7 +204,9 @@ rename
filter
chunk_arrange # within each chunk
chunk_group_by # within each chunk
-chunk_summarise/chunk_summarize # within each chunk
+chunk_summarize # within each chunk
+group_by # limited functions
+summarize # limited functions
mutate
transmute
left_join
@@ -213,93 +215,25 @@ full_join # careful. Performance!
semi_join
anti_join
```
+
## Sharding and distribution of chunks

Like other distributed data manipulation frameworks, `disk.frame` utilizes the *sharding* concept to distribute the data into chunks. For example, "to shard by `cust_id`" means that all rows with the same `cust_id` will be stored in the same chunk. This enables `chunk_group_by` by `cust_id` to produce the same results as non-chunked data.

The `by` variables that were used to shard the dataset are called the `shardkey`s. The *sharding* is performed by computing a deterministic hash on the shard keys (the `by` variables) for each row. The hash function produces an integer between `1` and `n`, where `n` is the number of chunks.

-## Grouping
-
-The `disk.frame` implements the `chunk_group_by` operation with a significant caveat. In the `disk.frame` framework, group-by happens WITHIN each chunk and not ACROSS chunks. To achieve group by across chunk we need to put **all rows with the same group keys into the same file chunk**; this can be achieved with `hard_group_by`. However, the `hard_group_by` operation can be **VERY TIME CONSUMING** computationally and should be **avoided** if possible.
-
-The `hard_group_by` operation is best illustrated with an example, suppose a `disk.frame` has three chunks
-```
-# chunk1 = 1.fst
-# id n
-#1 a 1
-#2 a 2
-#3 b 3
-#4 d 4
-
-# chunk2 = 2.fst
-# id n
-#1 a 4
-#2 a 5
-#3 b 6
-#4 d 7
-
-# chunk3 = 3.fst
-# id n
-#1 a 4
-#2 b 5
-#3 c 6
-```
-and notice that the `id` column contains 3 distinct values `"a"`,`"b"`, and `"c"`. To perform `hard_group_by(df, by = id)` MAY give you the following `disk.frame` where all the `id`s with the same values end up in the same chunks.
-
-```
-# chunk1 = 1.fst
-# id n
-#1 b 3
-#2 b 6
-
-# chunk2 = 2.fst
-# id n
-#1 c 6
-#2 d 4
-#3 d 7
-
-# chunk3 = 3.fst
-# id n
-#1 a 1
-#2 a 2
-#3 a 4
-#4 a 5
-#5 a 4
-```
-
-Also, notice that there is no guaranteed order for the distribution of the `id`s to the chunks. The order is random, but each chunk is likely to have a similar number of rows, provided that `id` does not follow a skewed distribution i.e. where a few distinct values make up the majority of the rows.

+## Group-by

-Typically, `chunk_group_by` is performed WITHIN each chunk. This is not an issue if the chunks have already been sharded on the `by` variables beforehand; however, if this is not the case then one may need a second stage aggregation to obtain the correct result, see *Two-stage group by*.
-
-By forcing the user to choose `chunk_group_by` (within each chunk) and `hard_group_by` (across all chunks), this ensures that the user is conscious of the choice they are making. In `sparklyr` the equivalent of a `hard_group_by` is performed, which we should avoid, where possible, as it is time-consuming and expensive. Hence, `disk.frame` has chosen to explain the theory and allow the user to make a conscious choice when performing `group_by`.
+
+`{disk.frame}` implements the `group_by` operation with some caveats. In the `{disk.frame}` framework, only a set of functions is supported in `summarize`; however, the user can define additional custom group-by functions. For more information see the [group-by chapter](10-group-by.Rmd).

```{r, dependson='asdiskframe'}
flights.df %>%
-  hard_group_by(carrier) %>% # notice that hard_group_by needs to be set
-  chunk_summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
+  group_by(carrier) %>% # one-stage group_by; no hard_group_by needed
+  summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
  collect %>%
  arrange(carrier)
```

-### Two-stage group by
-For most group-by tasks, the user can achieve the desired result WITHOUT using `hard = TRUE` by performing the group by in two stages. For example, suppose you aim to count the number of rows group by `carrier`, you can set `hard = F` to find the count within each chunk and then use a second group-by to summaries each chunk's results into the desired result. For example,
-
-```{r, dependson='asdiskframe'}
-flights.df %>%
-  chunk_group_by(carrier) %>% # `chunk_group_by` aggregates within each chunk
-  chunk_summarize(count = n()) %>% # mean follows normal R rules
-  collect %>% # collect each individul chunks results and row-bind into a data.table
-  group_by(carrier) %>%
-  summarize(count = sum(count)) %>%
-  arrange(carrier)
-```
-
-Because this two-stage approach avoids the expensive `hard group_by` operation, it is often significantly faster. However, it can be tedious to write; and this is a con of the `disk.frame` chunking mechanism.
-
-*Note*: this two-stage approach is similar to a map-reduce operation.
-
-
## Restrict input columns for faster processing

One can restrict which input columns to load into memory for each chunk; this can significantly increase the speed of data processing. To restrict the input columns, use the `srckeep` function, which only accepts column names as a string vector.

@@ -307,8 +241,8 @@ One can restrict which input columns to load into memory for each chunk; this ca
```{r, dependson='asdiskframe'}
flights.df %>%
  srckeep(c("carrier","dep_delay")) %>%
-  hard_group_by(carrier) %>%
-  chunk_summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
+  group_by(carrier) %>%
+  summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
  collect
```

@@ -352,21 +286,35 @@ flights.df %>%
## Window functions and arbitrary functions

-`disk.frame` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `rank` are supported out of the box.
+`{disk.frame}` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `min_rank` and `rank` are supported out of the box.
+
+For the following examples, we will use `hard_group_by`, which performs a group-by and also reorganises the chunks so that all records with the same `year`, `month`, and `day` end up in the same chunk. This is typically not advised, as `hard_group_by` can be slow for large datasets.

```{r, dependson='asdiskframe'}
# Find the most and least delayed flight each day
bestworst <- flights.df %>%
  srckeep(c("year","month","day", "dep_delay")) %>%
-  chunk_group_by(year, month, day) %>%
-  select(dep_delay) %>%
+  hard_group_by(c("year", "month", "day")) %>%
  filter(dep_delay == min(dep_delay, na.rm = T) | dep_delay == max(dep_delay, na.rm = T)) %>%
  collect
-
bestworst %>% head
```

+Another example:
+
+```{r, dependson='asdiskframe'}
+ranked <- flights.df %>%
+  srckeep(c("year","month","day", "dep_delay")) %>%
+  hard_group_by(c("year", "month", "day")) %>%
+  filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>%
+  collect
+
+ranked %>% head
+```
+
+One more example:
+
```{r, dependson='asdiskframe'}
# Rank each flight within each day
ranked <- flights.df %>%
@@ -381,6 +329,7 @@ ranked %>% head

## Arbitrary by-chunk processing

+
One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function, which evaluates lazily, or the `map.disk.frame(lazy = F)` function, which evaluates eagerly. For example, to return the number of rows in each chunk:

```{r, dependson='asdiskframe'}
@@ -401,7 +350,7 @@ flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(
flights.df2 %>% head
```

-Notice `disk.frame` supports the `purrr` syntax for defining a function using `~`.
+Notice `{disk.frame}` supports the `purrr` syntax for defining a function using `~`.

## Sampling

diff --git a/book/03-concepts.Rmd b/book/03-concepts.Rmd
index d5940f59..8232f06e 100644
--- a/book/03-concepts.Rmd
+++ b/book/03-concepts.Rmd
@@ -1,5 +1,5 @@
---
-title: "Key disk.frame concepts"
+title: "Key `{disk.frame}` concepts"
author: "ZJ"
output: rmarkdown::html_vignette
vignette: >
@@ -15,15 +15,18 @@ knitr::opts_chunk$set(
)
```

+# Key `{disk.frame}` concepts
There are a number of concepts and terminologies that are useful to understand in order to use `disk.frame` effectively.

## What is a `disk.frame` and what are chunks?
+
-A `disk.frame` is nothing more a folder and in that folder there should be [`fst`](https://www.fstpackage.org/) files named "1.fst", "2.fst", "3.fst" etc. Each of the ".fst" file is called a _chunk_.
+A `disk.frame` is a folder containing [`fst`](https://www.fstpackage.org/) files named "1.fst", "2.fst", "3.fst" etc. Each of the ".fst" files is called a _chunk_.

## Workers and parallelism
+
Parallelism in `disk.frame` is achieved using the [`future` package](https://cran.r-project.org/package=future). When performing many tasks, `disk.frame` uses multiple workers, where each _worker_ is an R session, to perform the tasks in parallel.

-It is recommended that you should running these to set-up immediately after you `library(disk.frame)`. For example:
+It is recommended that you run the following immediately after `library(disk.frame)` to set up multiple workers. For example:
```r
library(disk.frame)
@@ -55,4 +58,12 @@ To see how many workers are at work, use
future::nbrOfWorkers()
```

+## How `{disk.frame}` works
+
+When `df %>% some_fn %>% collect` is called, `some_fn` is applied to each chunk of `df`. The `collect` will row-bind the results of `some_fn(chunk)` together if the returned value of `some_fn` is a data.frame, or it will return a `list` containing the results of `some_fn` otherwise.
+
+The session that receives these results is called the **main session**. In general, we should try to minimise the amount of data passed from the worker sessions back to the main session, because passing data around can be slow.
+
+Also, please note that there is no communication between the workers, except for workers passing data back to the main session.
+

diff --git a/book/04-ingesting-data.Rmd b/book/04-ingesting-data.Rmd
index d26c433a..541d870a 100644
--- a/book/04-ingesting-data.Rmd
+++ b/book/04-ingesting-data.Rmd
@@ -15,7 +15,12 @@ knitr::opts_chunk$set(
)
```

-Let's set-up `disk.frame`
+# Ingesting Data
+
+One of the most important tasks to perform before using the `{disk.frame}` package is to make some `disk.frame`s! There are a few functions to help you do that. Before we do, we set up `{disk.frame}` as usual.
+
+**Setting up**
+
```r
library(disk.frame)

@@ -31,10 +36,8 @@ if(interactive()) {
}
```

-One of the most important tasks to perform before using the `disk.frame` package is to make some `disk.frame`s! There are a few functions to help you do that.
-
## Convert a `data.frame` to `disk.frame`
-Firstly there is `as.disk.frame()` which allows you to make a `disk.frame` from a `data.frame`, e.g.
+Firstly, there is `as.disk.frame()`, which allows you to make a `disk.frame` from a `data.frame`, e.g.

```r
flights.df = as.disk.frame(nycflights13::flights)
diff --git a/book/06-vs-dask-juliadb.Rmd b/book/06-vs-dask-juliadb.Rmd
index d3376f47..15ffd10d 100644
--- a/book/06-vs-dask-juliadb.Rmd
+++ b/book/06-vs-dask-juliadb.Rmd
@@ -15,7 +15,7 @@ knitr::opts_chunk$set(
)
```

-## Intro
+# Intro - Benchmark 1

This is the first in a series to benchmark the performance of disk.frame vs other medium-data tools. For Python, we will benchmark Dask, and for Julia, we will benchmark JuliaDB.jl. In the process, I will give a warts-and-all account of the tools I have tested.

@@ -53,7 +53,7 @@ library(ggplot2)
ggplot(df) +
  geom_bar(aes(x = tool, weight = timing), stat = "count") +
  ylab("seconds") +
-  ggtitle("Count(*) gorup-by timings")
+  ggtitle("Count(*) group-by timings")
```

## Data

@@ -110,19 +110,17 @@ We can inspect the result as well.
summ
```

-Another way to perform the analysis is to use `dplyr` syntax to perform a two-stage "group-by" which is:
+Another way to perform the analysis is to use `dplyr` syntax to perform the group-by in _one stage_:

-```r
-df1 %>%
+```{r, dependson='convert'}
+system.time(df1 %>%
  srckeep("V1") %>%
-  chunk_group_by(V1) %>%
-  chunk_summarise(N = n()) %>%
-  collect %>%
  group_by(V1) %>%
-  summarise(N = sum(N)) %>%
+  summarise(N = n()) %>%
+  collect)
```

-However, the `dplyr` syntax tends to be slightly slower than using data.table syntax.
+However, the `dplyr` syntax tends to be slightly slower than using data.table syntax. This may improve in future releases, as much of the overhead is due to inefficient use of non-standard evaluation (NSE).
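For reference, the data.table-syntax counterpart would be a two-stage count along these lines (a sketch only, assuming the `df1` disk.frame from above; `V1` is the grouping column and `.N` is data.table's row counter):

```r
library(data.table)

# stage 1: disk.frame applies the data.table expression to each chunk
# and row-binds the per-chunk counts into a single data.table
per_chunk <- df1[, .(N = .N), by = V1]

# stage 2: aggregate the per-chunk counts into the final result
per_chunk[, .(N = sum(N)), by = V1]
```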
#### Dask

diff --git a/book/08-more-epic.Rmd b/book/08-more-epic.Rmd
index 2b2ee631..0c5e86e0 100644
--- a/book/08-more-epic.Rmd
+++ b/book/08-more-epic.Rmd
@@ -201,7 +201,7 @@ ggplot(data.frame(

## Can {disk.frame} be even more "epic"?

-Well yes! We can actually speed up the group-by operation that Bruno did by using `srckeep`. The use of `srckeep` can't be emphasized enough! It works by reading from disk only the columns needed for the analysis, and hence disk IO time is (drastically) reduced! However, we do have to live with the two-stage group-by annoyance for now.
+Well yes! We can actually speed up the group-by operation that Bruno did by using `srckeep`. The use of `srckeep` can't be emphasized enough! It works by reading from disk only the columns needed for the analysis, and hence disk IO time is (drastically) reduced!

```r
tic = Sys.time()

-# doing group-by in two-stages which is annoying; I am working on something better
mean_dep_delay <- flights.df %>%
  srckeep(c("YEAR", "MONTH", "DAY_OF_MONTH", "DEP_DELAY")) %>%
-  chunk_group_by(YEAR, MONTH, DAY_OF_MONTH) %>%
-  chunk_summarise(sum_delay = sum(DEP_DELAY, na.rm = TRUE), n = n()) %>%
-  collect() %>%
  group_by(YEAR, MONTH, DAY_OF_MONTH) %>%
-  summarise(mean_delay = sum(sum_delay)/sum(n))
+  summarise(mean_delay = mean(DEP_DELAY, na.rm = TRUE))
(toc = Sys.time() - tic)
#> Time difference of 2.800005 secs
```

Compare the above to the timing without `srckeep`:

```r
tic = Sys.time()
mean_dep_delay <- flights.df %>%
-  chunk_group_by(YEAR, MONTH, DAY_OF_MONTH) %>%
-  chunk_summarise(sum_delay = sum(DEP_DELAY, na.rm = TRUE), n = n()) %>%
-  collect() %>%
  group_by(YEAR, MONTH, DAY_OF_MONTH) %>%
-  summarise(mean_delay = sum(sum_delay)/sum(n))
+  summarise(mean_delay = mean(DEP_DELAY, na.rm = TRUE))
(toc = Sys.time() - tic)
#> Time difference of 15.62312 secs
```

@@ -248,7 +242,7 @@ ggplot2::ggplot(data1) +
So there you go! {disk.frame} can be even more "epic"! Here are the two main take-aways

1. Load CSV data as many individual files, if possible, to take advantage of multi-core parallelism
-2. `srckeep` is your friend! Disk IO is often the bottleneck in data manipulation, and you can reduce Disk IO by specifying only columns that you will use with `srckeep(c(columns1, columns2, ...))`.
+2. `srckeep` is your friend! Disk IO is often the bottleneck in data manipulation, and you can reduce disk IO by specifying only the columns that you will use with `srckeep(c(columns1, columns2, ...))`.

## Advertisements

diff --git a/book/10-group-by.Rmd b/book/10-group-by.Rmd
index 93eecb08..618a7745 100644
--- a/book/10-group-by.Rmd
+++ b/book/10-group-by.Rmd
@@ -15,8 +15,23 @@ knitr::opts_chunk$set(
)
```

-### Group by
-Starting from {disk.frame} v0.2.2, there is for support `group_by` for a limited set of functions. For example:
+# Group-by in `{disk.frame}`
+
+The group-by framework of [`{disk.frame}`](https://diskframe.com) has been overhauled in v0.2.2. It is now able to perform some group-by-summarize operations in one stage. In this chapter we will cover:
+
+1. How to use one-stage group-by
+2. Manual two-stage group-by and hard group-by
+3. The architecture of `{disk.frame}` and its implications for group-by
+4. How to define custom one-stage group-by functions and their limitations
+
+
+## One-stage Group-by
+
+A one-stage group-by behaves the same as a group-by on a data.frame. This is remarkable given the limitations imposed by the disk-based nature of `{disk.frame}`. Before v0.2.2 of `{disk.frame}`, one-stage group-by was not possible, and users had to rely on two-stage group-by even for simple operations like `mean`.
+
+However, now that one-stage group-by is possible, there are still limitations, and not all functions are supported out-of-the-box. Hence, at the end of the chapter we describe [how to define custom one-stage group-by functions](#custom-one-stage-group-by).
+
+An example of one-stage group-by:

```r
result_from_disk.frame = iris %>%
@@ -35,11 +50,11 @@ result_from_disk.frame = iris %>%
  collect
```

-The results should be exactly the same as if applying the same group-by operations on a data.frame. If not then please [report a bug](https://github.com/xiaodaigh/disk.frame/issues).
+It is important to note that not all functions that can run in a `data.frame` `summarize` will work automatically. This is because of how `{disk.frame}` works. Please see the section on [defining your own one-stage group-by](#how-to-define-your-own-one-stage-group-by-function) if you wish to learn how to define your own one-stage group-by functions.

-#### List of supported group-by functions
+### List of supported group-by functions

-If a function you like is missing, please make a feature request [here](https://github.com/xiaodaigh/disk.frame/issues). It is a limitation that function that depend on the order a column can only obtained using estimated methods.
+If a function you need/like is missing, please make a feature request [here](https://github.com/xiaodaigh/disk.frame/issues). It is a limitation that functions that depend on the ordering of a column can only be obtained using estimation methods.

| Function | Exact/Estimate | Notes |
| -- | -- | -- |
| `min` | Exact | |
| `max` | Exact | |
| `mean` | Exact | |
| `sum` | Exact | |
| `length` | Exact | |
| `n` | Exact | |
| `n_distinct` | Exact | |
| `sd` | Exact | |
| `var` | Exact | `var(x)` only; `var(x, y)` not supported |
| `any` | Exact | |
| `all` | Exact | |
| `median` | Estimate | |
| `quantile` | Estimate | One quantile only |
| `IQR` | Estimate | |

+### Notes on one-stage group-by
+
+The results should be exactly the same as if applying the same group-by operations on a `data.frame`. If not, then please [report a bug](https://github.com/xiaodaigh/disk.frame/issues).
+
+
+## Group-by notes
+
+`{disk.frame}` implements the `chunk_group_by` operation with a significant caveat. In the `disk.frame` framework, group-by happens WITHIN each chunk and not ACROSS chunks. To achieve a group-by across chunks, we need to put **all rows with the same group keys into the same file chunk**; this can be achieved with `hard_group_by`. However, the `hard_group_by` operation can be **VERY TIME CONSUMING** computationally and should be **avoided** if possible.
+
+The `hard_group_by` operation is best illustrated with an example. Suppose a `disk.frame` has three chunks
+```
+# chunk1 = 1.fst
+# id n
+#1 a 1
+#2 a 2
+#3 b 3
+#4 d 4
+
+# chunk2 = 2.fst
+# id n
+#1 a 4
+#2 a 5
+#3 b 6
+#4 d 7
+
+# chunk3 = 3.fst
+# id n
+#1 a 4
+#2 b 5
+#3 c 6
+```
+and notice that the `id` column contains 4 distinct values: `"a"`, `"b"`, `"c"`, and `"d"`. Performing `hard_group_by(df, by = id)` MAY give you the following `disk.frame`, where all the rows with the same `id` value end up in the same chunk.
+
+```
+# chunk1 = 1.fst
+# id n
+#1 b 3
+#2 b 6
+
+# chunk2 = 2.fst
+# id n
+#1 c 6
+#2 d 4
+#3 d 7
+
+# chunk3 = 3.fst
+# id n
+#1 a 1
+#2 a 2
+#3 a 4
+#4 a 5
+#5 a 4
+```
+
+Also, notice that there is no guaranteed order for the distribution of the `id`s to the chunks. The order is random, but each chunk is likely to have a similar number of rows, provided that `id` does not follow a skewed distribution, i.e. where a few distinct values make up the majority of the rows.
+
+Typically, `chunk_group_by` is performed WITHIN each chunk. This is not an issue if the chunks have already been sharded on the `by` variables beforehand; however, if this is not the case then one may need a second-stage aggregation to obtain the correct result, see *Two-Stage Group-by* below.
+
+By forcing the user to choose between `chunk_group_by` (within each chunk) and `hard_group_by` (across all chunks), this ensures that the user is conscious of the choice they are making. In `sparklyr` the equivalent of a `hard_group_by` is performed, which we should avoid, where possible, as it is time-consuming and expensive. Hence, `disk.frame` has chosen to explain the theory and allow the user to make a conscious choice when performing `group_by`.
+
+```{r, dependson='asdiskframe'}
+flights.df %>%
+  hard_group_by(carrier) %>% # notice that hard_group_by is used here
+  chunk_summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
+  collect %>%
+  arrange(carrier)
+```
+
+## Two-Stage Group-by
+
+Prior to `{disk.frame}` v0.2.2, there was no general support for one-stage group-by, so a two-stage style of group-by was needed. The key is to understand that `chunk_group_by` performs the group-by within each chunk.
+
+For most group-by tasks, the user can achieve the desired result WITHOUT `hard_group_by` by performing the group-by in two stages. For example, suppose you aim to count the number of rows grouped by `carrier`: first find the count within each chunk, and then use a second group-by to aggregate each chunk's results into the desired result. For example,
+
+```{r, dependson='asdiskframe'}
+flights.df %>%
+  chunk_group_by(carrier) %>% # `chunk_group_by` aggregates within each chunk
+  chunk_summarize(count = n()) %>% # count within each chunk
+  collect %>% # collect each individual chunk's results and row-bind into a data.table
+  group_by(carrier) %>%
+  summarize(count = sum(count)) %>%
+  arrange(carrier)
+```
+
+Because this two-stage approach avoids the expensive `hard_group_by` operation, it is often significantly faster. However, it can be tedious to write, and this is a con of the `disk.frame` chunking mechanism.
+
+*Note*: this two-stage approach is similar to a map-reduce operation.

```{r setup, cache=TRUE}
-knitr::opts_chunk$set(include = FALSE)
suppressPackageStartupMessages(library(disk.frame))
setup_disk.frame()
```
@@ -101,8 +200,9 @@ cat("filtering a < 0.1 took: ", data.table::timetaken(pt), "\n")
nrow(df_filtered)
```

-### Hard group by
-Another way to perform a one-stage `group_by` is to perform a `hard_group_by` on a `disk.frame. This will rechunk the `disk.frame` by the by columns. This is **not** recommended for performance reasons, as it can quite slow to rechunk the chunks on disk.
+## Hard group-by
+
+Another way to perform a one-stage `group_by` is to perform a `hard_group_by` on a `disk.frame`. This will rechunk the `disk.frame` by the by-columns. This is **not** recommended for performance reasons, as it can be quite slow to rechunk the file chunks on disk.
```{r}
pt = proc.time()
res1 <- flights.df %>%
@@ -112,14 +212,191 @@ res1 <- flights.df %>%
  hard_group_by(qtr) %>% # hard group_by is MUCH SLOWER but avoids a 2nd-stage aggregation
  chunk_summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect
-cat("group by took: ", data.table::timetaken(pt), "\n")
+cat("group-by took: ", data.table::timetaken(pt), "\n")

collect(res1)
```
+
+## Custom one-stage group-by
+
+### At a glance
+
+`{disk.frame}` allows the user to create custom one-stage group-by functions. To make a function `fn` work in one stage, one needs to define two functions:
+
+1. `fn.chunk_agg.disk.frame`, which is applied to each chunk
+2. `fn.collected_agg.disk.frame`, which accepts a `list` of the returns from `fn.chunk_agg.disk.frame` and finalizes the computation
+
+For example, to make `mean` a one-stage group-by function, `{disk.frame}` has defined `mean.chunk_agg.disk.frame` and `mean.collected_agg.disk.frame`, which we will illustrate with examples below.
+
+But first, we shall explain some theory behind `{disk.frame}` to help you better understand "why does `{disk.frame}` do it like that?".
+
+### How does `{disk.frame}` work?
+
+One may ask: why are only a few functions supported for one-stage group-by? And why do some functions, like `median`, only produce estimates instead of the exact figure? To answer these questions, we need an understanding of how `{disk.frame}` works.
+
+A `disk.frame` is organized as chunks stored on disk. Each chunk is a file stored in [fst format](https://www.fstpackage.org/). The [`{future}` package](https://cran.r-project.org/web/packages/future/index.html) is used to apply the same function to each chunk; each of these operations is carried out in a separate R session. These R sessions cannot communicate with each other during the execution of the operations.
+
+Once the operation has been performed, the results are brought back to the session from which the operation was called. This is the only point of interprocess communication. Making group-by work in one stage does require some additional work.
+
+To summarize, the two phases of a `df %>% some_fn %>% collect` operation are:
+
+1. `some_fn` is applied to each chunk, and the result is assumed to be a data.frame
+2. `collect` then row-binds (`rbind`/`bind_rows`/`rbindlist`) the results together to form a data.frame in the main session
+
+### How group-by works
+
+Except for passing the result back to the main session, communication between worker sessions is not allowed. This limits how group-by operations can be performed, which is why group-by is done in two stages for many functions. However, R's meta-programming abilities allow us to rewrite code so that it automatically performs the two-stage group-by. For example, consider:
+
+```r
+df %>%
+  group_by(grp1) %>%
+  summarize(sum(x)) %>%
+  collect
+```
+
+we can use meta-programming to transform that to
+
+```r
+df %>%
+  chunk_group_by(grp1) %>%
+  chunk_summarize(__tmp1__ = sum(x)) %>%
+  collect() %>%
+  group_by(grp1) %>%
+  summarize(x = sum(__tmp1__))
+```
+
+Basically, we are "compiling" one-stage group-by code to two-stage group-by code, and then executing it.
+
+For `mean`, it's trickier, as one needs to keep track of the numerator and the denominator separately in computing `mean(x) = sum(x)/length(x)`.
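For intuition, here is a minimal sketch (using hypothetical toy vectors standing in for two chunks) of how per-chunk partial sums and counts combine into the overall mean:

```r
# two toy "chunks" of the same column x
chunk1 <- c(1, 2, 3)
chunk2 <- c(4, 5)

# each worker returns its partial sum and count
partials <- list(
  c(sumx = sum(chunk1), lengthx = length(chunk1)),
  c(sumx = sum(chunk2), lengthx = length(chunk2))
)

# the main session combines them: overall mean = total sum / total count
sum(sapply(partials, `[[`, "sumx")) / sum(sapply(partials, `[[`, "lengthx"))
#> [1] 3  # identical to mean(c(chunk1, chunk2))
```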
+
+Therefore, `{disk.frame}` compiles
+
+```r
+df %>%
+  group_by(grp1) %>%
+  summarize(meanx = mean(x)) %>%
+  collect
+```
+
+to
+
+```r
+df %>%
+  chunk_group_by(grp1) %>%
+  chunk_summarize(__tmp1__ = list(mean.chunk_agg.disk.frame(x))) %>%
+  collect %>%
+  group_by(grp1) %>%
+  summarize(meanx = mean.collected_agg.disk.frame(__tmp1__))
+```
+
+where `mean.chunk_agg.disk.frame` defines what needs to be done to each chunk. As you can see below, its return value is a vector whose elements are named `sumx` and `lengthx`. Here is an example implementation of `mean.chunk_agg.disk.frame`:
+
+```r
+mean.chunk_agg.disk.frame <- function(x, na.rm = FALSE, ...) {
+  sumx = sum(x, na.rm = na.rm)
+  lengthx = length(x) - ifelse(na.rm, sum(is.na(x)), 0)
+  c(sumx = sumx, lengthx = lengthx)
+}
+```
+
+Because the return value is not a scalar, it needs to be wrapped in a `list` in the `chunk_summarize` call above.
+
+The `mean.collected_agg.disk.frame` receives a list of outputs from `mean.chunk_agg.disk.frame`. Recall that `mean.chunk_agg.disk.frame` returns a vector for each chunk, so the input to `mean.collected_agg.disk.frame` is a *list of vectors*:
+
+```r
+mean.collected_agg.disk.frame <- function(listx) {
+  sum(sapply(listx, function(x) x["sumx"]))/sum(sapply(listx, function(x) x["lengthx"]))
+}
+```
+
+### How to define your own one-stage group-by function
+
+Now that we have seen two examples, namely `sum` and `mean`, we are ready to summarize how group-by functions are implemented.
+
+Given the following:
+
+```r
+df %>%
+  group_by(grp1) %>%
+  summarize(namex = fn(x)) %>%
+  collect
+```
+
+`{disk.frame}` compiles it to
+
+```r
+df %>%
+  chunk_group_by(grp1) %>%
+  chunk_summarize(__tmp1__ = list(fn.chunk_agg.disk.frame(x))) %>%
+  collect %>%
+  group_by(grp1) %>%
+  summarize(namex = fn.collected_agg.disk.frame(__tmp1__))
+```
+
+Based on the above, to make `fn` a one-stage group-by function, the user has to:
+
+1. Define `fn.chunk_agg.disk.frame`, a function to be applied to each chunk
+2. Define `fn.collected_agg.disk.frame`, a function to be applied to *a `list` containing the returns of `fn.chunk_agg.disk.frame` applied to each chunk*
+
+**Example of implementing `sum`**:
+
+1. Define `sum.chunk_agg.disk.frame`
+
+```r
+sum.chunk_agg.disk.frame <- function(x, na.rm = FALSE) {
+  sum(x, na.rm = na.rm)
+}
+```
+
+2. Define `sum.collected_agg.disk.frame`, which needs to accept a list of the per-chunk `sum(x, na.rm = na.rm)` results; each of those is just a numeric, so
+
+```r
+sum.collected_agg.disk.frame <- function(list_sum) {
+  sum(unlist(list_sum))
+}
+```
+
+**Example of implementing `n_distinct`**:
+
+The `n_distinct` function counts the number of distinct values in a vector `x`.
+
+1. Define `n_distinct.chunk_agg.disk.frame` to return the unique values within each chunk. Because the same value can appear in multiple chunks, to ensure that we don't double count, we simply return all the unique values from each chunk and deduplicate them in the next phase
+
+```r
+n_distinct.chunk_agg.disk.frame <- function(x, na.rm = FALSE) {
+  if(na.rm) {
+    setdiff(unique(x), NA)
+  } else {
+    unique(x)
+  }
+}
+```
+
+2. Define `n_distinct.collected_agg.disk.frame`, which deduplicates the chunk-wise unique values and counts them
+
+```r
+n_distinct.collected_agg.disk.frame <- function(list_of_chunkwise_uniques) {
+  dplyr::n_distinct(unlist(list_of_chunkwise_uniques))
+}
+```
+
+### Limitations
+
+We have seen that `{disk.frame}` performs operations in two phases:
+
+1. apply the same function to each chunk
+2. row-bind the results
+
+and there is no communication between the sessions that apply the functions at the chunk level.
+
+Hence, it is generally difficult to compute rank-based summarizations like `median` exactly, so most rank-based calculations are estimates only. This is also true of distributed data systems like Spark, whose median function likewise produces estimates.
+

## Advertisements

-### Interested in learning {disk.frame} in a structured course?
+### Interested in learning `{disk.frame}` in a structured course?

Please register your interest at:

diff --git a/man/map.Rd b/man/map.Rd
index 242a3bdc..018ec71f 100644
--- a/man/map.Rd
+++ b/man/map.Rd
@@ -15,7 +15,6 @@
\alias{lazy}
\alias{lazy.disk.frame}
\alias{delayed}
-\alias{delayed.disk.frame}
\alias{chunk_lapply}
\title{Apply the same function to all chunks}
\usage{
@@ -69,8 +68,6 @@ lazy(.x, .f, ...)

delayed(.x, .f, ...)

-\method{delayed}{disk.frame}(.x, .f, ...)
-
chunk_lapply(...)
}
\arguments{

From 6bbf4c6d5c1b621d03888c794ff39566c80439f9 Mon Sep 17 00:00:00 2001
From: evalparse
Date: Sun, 15 Dec 2019 03:44:39 +1100
Subject: [PATCH 2/6] update site

---
 R/dplyr_verbs.r                              | 65 +++++++++++++++++++-
 docs/404.html                                |  2 +-
 docs/LICENSE-text.html                       |  2 +-
 docs/articles/concepts.html                  | 28 +++++++--
 docs/articles/convenience-features.html      |  2 +-
 docs/articles/data-table-syntax.html         |  2 +-
 docs/articles/glm.html                       |  2 +-
 docs/articles/index.html                     |  4 +-
 docs/authors.html                            |  2 +-
 docs/index.html                              |  2 +-
 docs/reference/add_chunk.html                | 10 +--
 docs/reference/as.data.frame.disk.frame.html |  2 +-
 docs/reference/as.data.table.disk.frame.html |  2 +-
 docs/reference/as.disk.frame.html            |  2 +-
 docs/reference/collect.html                  |  2 +-
 docs/reference/colnames.html                 |  2 +-
 docs/reference/compute.disk.frame.html       |  2 +-
 docs/reference/create_dplyr_mapper.html      |  2 +-
 docs/reference/csv_to_disk.frame.html        |  2 +-
 docs/reference/delete.html                   |  2 +-
 docs/reference/df_ram_size.html              |  2 +-
 docs/reference/dfglm.html                    |  2 +-
 docs/reference/disk.frame.html               |  4 +-
 docs/reference/dplyr_verbs.html              |  2 +-
 docs/reference/evalparseglue.html            |  2 +-
 docs/reference/foverlaps.disk.frame.html     |  2 +-
 docs/reference/gen_datatable_synthetic.html  |  2 +-
 docs/reference/get_chunk.html                |  2 +-
 docs/reference/get_chunk_ids.html            | 14 ++---
 docs/reference/group_by.html                 |  2 +-
 docs/reference/groups.disk.frame.html        |  2 +-
 docs/reference/hard_arrange.html             |  2 +-
 docs/reference/hard_group_by.html            |  2 +-
 docs/reference/head_tail.html                |  2 +-
 docs/reference/index.html                    |  2 +-
 docs/reference/is_disk.frame.html            |  2 +-
 docs/reference/join.html                     |  2 +-
 docs/reference/make_glm_streaming_fn.html    |  2 +-
 docs/reference/map.html                      |  5 +-
 docs/reference/map2.html                     |  2 +-
 docs/reference/merge.disk.frame.html         |  2 +-
 docs/reference/move_to.html                  |  2 +-
 docs/reference/nchunks.html                  |  2 +-
 docs/reference/ncol_nrow.html                |  2 +-
 docs/reference/overwrite_check.html          |  2 +-
 docs/reference/print.disk.frame.html         |  2 +-
 docs/reference/rbindlist.disk.frame.html     |  2 +-
 docs/reference/rechunk.html                  |  4 +-
 docs/reference/recommend_nchunks.html        |  2 +-
 docs/reference/remove_chunk.html             |  8 +--
 docs/reference/sample.html                   |  2 +-
 docs/reference/setup_disk.frame.html         |  2 +-
 docs/reference/shard.html                    |  2 +-
 docs/reference/shardkey.html                 |  2 +-
 docs/reference/shardkey_equal.html           |  2 +-
 docs/reference/show_ceremony.html            |  2 +-
 docs/reference/srckeep.html                  |  2 +-
 docs/reference/sub-.disk.frame.html          |  2 +-
 docs/reference/tbl_vars.disk.frame.html      |  2 +-
 docs/reference/write_disk.frame.html         |  2 +-
 docs/reference/zip_to_disk.frame.html        |  4 +-
 utils/build_utils.R                          |  1 -
 62 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r
index 94c0fe7c..e5d28c56 100644
--- a/R/dplyr_verbs.r
+++ b/R/dplyr_verbs.r
@@ -133,7 +133,70 @@ chunk_summarise <- create_chunk_mapper(dplyr::summarise)
#' @rdname dplyr_verbs
summarize.disk.frame <- function(.data, ...) {
  # comment summarize.grouped_disk.frame
-  stop("`summarize.disk.frame` has been removed. Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework")
+  warning("`summarize.disk.frame`'s behaviour has changed. Please use `chunk_summarize` if you wish to apply `dplyr::summarize` to each chunk")
+  
+  stop("TODO: adapt this for no group-by")
+  code = substitute(list(...))[-1]
+  expr_id = 0
+  temp_varn = 0
+  #browser()
+  
+  list_of_chunk_agg_fns <- as.character(methods(class = "chunk_agg.disk.frame"))
+  list_of_collected_agg_fns <- as.character(methods(class = "collected_agg.disk.frame"))
+  
+  # generate the chunk_summarize_code
+  summarize_code = purrr::map_dfr(code, ~{
+    expr_id <<- expr_id + 1
+    # parse the function into table form for easy interrogation
+    gpd = getParseData(parse(text = deparse(.x)), includeText = TRUE);
+    grp_funcs = gpd %>% filter(token == "SYMBOL_FUNCTION_CALL") %>% select(text) %>% pull
+    
+    # search the namespace to find functions named `fn`.chunk_agg.disk.frame
+    # only allow one such function for now; TODO: improve it
+    #stopifnot(sum(paste0(unique(grp_funcs), ".chunk_agg.disk.frame") %in% list_of_chunk_agg_fns) == 1)
+    #stopifnot(sum(paste0(unique(grp_funcs), ".collected_agg.disk.frame") %in% list_of_collected_agg_fns) == 1)
+    stopifnot(sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".chunk_agg.disk.frame")))) == 1)
+    stopifnot(sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".collected_agg.disk.frame")))) == 1)
+    
+    
+    
+    temp_varn <<- temp_varn + 1
+    tmpcode = deparse(evalparseglue("substitute({deparse(.x)}, list({grp_funcs} = quote({grp_funcs}.chunk_agg.disk.frame)))")) %>% paste0(collapse = " ")
+    
+    chunk_code = data.frame(assign_to = as.character(glue::glue("tmp{temp_varn}")), expr = tmpcode, stringsAsFactors = FALSE)
+    
+    chunk_code$orig_code = deparse(.x)
+    chunk_code$expr_id = expr_id
+    chunk_code$grp_fn = grp_funcs
+    chunk_code$name = ifelse(is.null(names(code[expr_id])), "", names(code[expr_id]))
+    
+    # create the aggregation code
+    chunk_code$agg_expr = glue::glue("{grp_funcs}.collected_agg.disk.frame({paste0(chunk_code$assign_to, collapse=', ')})")
+    
+    #print(sapply(chunk_code, typeof))
+    chunk_code
+  })
+  
+  chunk_summ_code = paste0(summarize_code$assign_to, "=list(", summarize_code$expr, ")") %>% paste0(collapse = ", ")
+  
+  agg_code_df = summarize_code %>% 
+    select(expr_id, name, agg_expr, orig_code) %>% 
+    unique %>% 
+    transmute(agg_code = paste0(ifelse(name == "", paste0("`", orig_code, "` = "), paste0(name, "=")), agg_expr))
+  
+  agg_summ_code = paste0(agg_code_df$agg_code, collapse = ",")
+  
+  # get the by variables
+  group_by_cols = purrr::map_chr(attr(.data, "group_by_cols"), ~{deparse(.x)})
+  
+  list(group_by_cols = group_by_cols, chunk_summ_code = chunk_summ_code, agg_summ_code = agg_summ_code)
+  
+  # generate full code
+  code_to_run = glue::glue("chunk_group_by({group_by_cols}) %>% chunk_summarize({chunk_summ_code}) %>% collect %>% group_by({group_by_cols}) %>% summarize({agg_summ_code})")
+  
+  class(.data) <- c("summarized_disk.frame", "disk.frame")
+  attr(.data, "summarize_code") = code_to_run
+  .data
}

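For orientation, here is a sketch of the `code_to_run` string that the glue template above would assemble for a call such as `summarize(meanx = mean(x))` with a single grouping column `grp`. The `tmp1` temporary name is illustrative; the real code generates names of the form `tmp{n}`:

```r
# illustrative value of code_to_run (a string, to be evaluated later),
# reformatted across lines for readability
"chunk_group_by(grp) %>%
   chunk_summarize(tmp1 = list(mean.chunk_agg.disk.frame(x))) %>%
   collect %>%
   group_by(grp) %>%
   summarize(meanx = mean.collected_agg.disk.frame(tmp1))"
```

Note how the generated pipeline mirrors the two-stage pattern described in the group-by chapter: `mean.chunk_agg.disk.frame` runs per chunk, and `mean.collected_agg.disk.frame` finalizes over the collected per-chunk results.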
diff --git a/docs/404.html b/docs/404.html
index e2b3eb75..88202fcc 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -90,7 +90,7 @@