Apply the same function to all chunks

`cimap.disk.frame` accepts a two-argument function, where the first argument is a data.frame and the second is the chunk ID.

`lazy` is a convenience function that applies `.f` to every chunk lazily.

`delayed` is an alias for `lazy`, named for consistency with Dask and Dagger.jl.

cmap(.x, .f, ...)

# S3 method for disk.frame
cmap(.x, .f, ...)

cmap_dfr(.x, .f, ..., .id = NULL)

# S3 method for disk.frame
cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)

cimap(.x, .f, ...)

# S3 method for disk.frame
cimap(
  .x,
  .f,
  outdir = NULL,
  keep = NULL,
  lazy = TRUE,
  overwrite = FALSE,
  compress = 50,
  ...
)

cimap_dfr(.x, .f, ..., .id = NULL)

# S3 method for disk.frame
cimap_dfr(
  .x,
  .f,
  ...,
  .id = NULL,
  use.names = fill,
  fill = FALSE,
  idcol = NULL
)

lazy(.x, .f, ...)

# S3 method for disk.frame
lazy(.x, .f, ...)

delayed(.x, .f, ...)

clapply(...)

Arguments

.x: a disk.frame
.f: a function to apply to each of the chunks
...: Passed to `collect` and `write_disk.frame`
.id: ignored
use.names: for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
fill: for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
idcol: for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
outdir: the output directory
keep: The columns to keep at source
lazy: if TRUE then do this lazily
overwrite: Whether to overwrite any files in the output directory
compress: The compression setting, from 0 to 100

Examples

cars.df = as.disk.frame(cars)

# return the first column of each chunk lazily
# 
cars2 = cmap(cars.df, function(chunk) {
 chunk[,1]
})

collect(cars2)
#>     speed
#>  1:     4
#>  2:     4
#>  3:     7
#>  4:     7
#>  5:     8
#>  6:     9
#>  7:    10
#>  8:    10
#>  9:    10
#> 10:    11
#> 11:    11
#> 12:    12
#> 13:    12
#> 14:    12
#> 15:    12
#> 16:    13
#> 17:    13
#> 18:    13
#> 19:    13
#> 20:    14
#> 21:    14
#> 22:    14
#> 23:    14
#> 24:    15
#> 25:    15
#> 26:    15
#> 27:    16
#> 28:    16
#> 29:    17
#> 30:    17
#> 31:    17
#> 32:    18
#> 33:    18
#> 34:    18
#> 35:    18
#> 36:    19
#> 37:    19
#> 38:    19
#> 39:    20
#> 40:    20
#> 41:    20
#> 42:    20
#> 43:    20
#> 44:    22
#> 45:    23
#> 46:    24
#> 47:    24
#> 48:    24
#> 49:    24
#> 50:    25
#>     speed

# return the first row of each chunk lazily, using a purrr-style formula
cars2 = cmap(cars.df, ~.x[1,])

collect(cars2)
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# return the first row of each chunk with lazy = FALSE; the result prints as a disk.frame
cmap(cars.df, ~.x[1,], lazy = FALSE)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpCo1OFr\file2b642873746.df"
#> nchunks: 6
#> nrow (at source): 50
#> ncol (at source): 2
#> nrow (post operations): ???
#> ncol (post operations): ???

# return the first row of each chunk eagerly as data.table/data.frame by row-binding
cmap_dfr(cars.df, ~.x[1,])
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# lazy and delayed are just aliases for cmap(..., lazy = TRUE)
collect(lazy(cars.df, ~.x[1,]))
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70
collect(delayed(cars.df, ~.x[1,]))
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# clean up cars.df
delete(cars.df)