Apply the same function to all chunks
`cimap.disk.frame` accepts a two-argument function where the first argument is a data.frame and the second is the chunk ID
`lazy` is a convenience function to apply `.f` to every chunk
`delayed` is an alias for lazy and is consistent with the naming in Dask and Dagger.jl
cmap(.x, .f, ...)
# S3 method for disk.frame
cmap(.x, .f, ...)
cmap_dfr(.x, .f, ..., .id = NULL)
# S3 method for disk.frame
cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)
cimap(.x, .f, ...)
# S3 method for disk.frame
cimap(
.x,
.f,
outdir = NULL,
keep = NULL,
lazy = TRUE,
overwrite = FALSE,
compress = 50,
...
)
cimap_dfr(.x, .f, ..., .id = NULL)
# S3 method for disk.frame
cimap_dfr(
.x,
.f,
...,
.id = NULL,
use.names = fill,
fill = FALSE,
idcol = NULL
)
lazy(.x, .f, ...)
# S3 method for disk.frame
lazy(.x, .f, ...)
delayed(.x, .f, ...)
clapply(...)
a disk.frame
a function to apply to each of the chunks
Passed to `collect` and `write_disk.frame`
ignored
for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
the output directory
The columns to keep at source
if TRUE then do this lazily
Whether to overwrite any files in the output directory
The compression setting. 0-100
cars.df = as.disk.frame(cars)
# return the first column of each chunk lazily
#
cars2 = cmap(cars.df, function(chunk) {
chunk[,1]
})
collect(cars2)
#> speed
#> 1: 4
#> 2: 4
#> 3: 7
#> 4: 7
#> 5: 8
#> 6: 9
#> 7: 10
#> 8: 10
#> 9: 10
#> 10: 11
#> 11: 11
#> 12: 12
#> 13: 12
#> 14: 12
#> 15: 12
#> 16: 13
#> 17: 13
#> 18: 13
#> 19: 13
#> 20: 14
#> 21: 14
#> 22: 14
#> 23: 14
#> 24: 15
#> 25: 15
#> 26: 15
#> 27: 16
#> 28: 16
#> 29: 17
#> 30: 17
#> 31: 17
#> 32: 18
#> 33: 18
#> 34: 18
#> 35: 18
#> 36: 19
#> 37: 19
#> 38: 19
#> 39: 20
#> 40: 20
#> 41: 20
#> 42: 20
#> 43: 20
#> 44: 22
#> 45: 23
#> 46: 24
#> 47: 24
#> 48: 24
#> 49: 24
#> 50: 25
#> speed
# return the first row of each chunk lazily, using a purrr-style formula
cars2 = cmap(cars.df, ~.x[1,])
collect(cars2)
#> speed dist
#> 1: 4 2
#> 2: 11 17
#> 3: 13 46
#> 4: 16 40
#> 5: 19 46
#> 6: 24 70
# return the first row of each chunk eagerly (the result prints as a disk.frame)
cmap(cars.df, ~.x[1,], lazy = FALSE)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpCo1OFr\file2b642873746.df"
#> nchunks: 6
#> nrow (at source): 50
#> ncol (at source): 2
#> nrow (post operations): ???
#> ncol (post operations): ???
# return the first row of each chunk eagerly as data.table/data.frame by row-binding
cmap_dfr(cars.df, ~.x[1,])
#> speed dist
#> 1: 4 2
#> 2: 11 17
#> 3: 13 46
#> 4: 16 40
#> 5: 19 46
#> 6: 24 70
# lazy and delayed are just aliases for cmap(..., lazy = TRUE)
collect(lazy(cars.df, ~.x[1,]))
#> speed dist
#> 1: 4 2
#> 2: 11 17
#> 3: 13 46
#> 4: 16 40
#> 5: 19 46
#> 6: 24 70
collect(delayed(cars.df, ~.x[1,]))
#> speed dist
#> 1: 4 2
#> 2: 11 17
#> 3: 13 46
#> 4: 16 40
#> 5: 19 46
#> 6: 24 70
# clean up cars.df
delete(cars.df)