Apply the same function to all chunks

`cimap.disk.frame` accepts a two-argument function, where the first argument is a data.frame and the second is the chunk ID

`lazy` is a convenience function to apply `.f` to every chunk

`delayed` is an alias for `lazy`, named for consistency with Dask and Dagger.jl

cmap(.x, .f, ...)

# S3 method for disk.frame
cmap(.x, .f, ...)

cmap_dfr(.x, .f, ..., .id = NULL)

# S3 method for disk.frame
cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)

cimap(.x, .f, ...)

# S3 method for disk.frame
cimap(
  .x,
  .f,
  outdir = NULL,
  keep = NULL,
  lazy = TRUE,
  overwrite = FALSE,
  compress = 50,
  ...
)

cimap_dfr(.x, .f, ..., .id = NULL)

# S3 method for disk.frame
cimap_dfr(
  .x,
  .f,
  ...,
  .id = NULL,
  use.names = fill,
  fill = FALSE,
  idcol = NULL
)

lazy(.x, .f, ...)

# S3 method for disk.frame
lazy(.x, .f, ...)

delayed(.x, .f, ...)

clapply(...)

Arguments

.x

a disk.frame

.f

a function to apply to each of the chunks

...

Passed to `collect` and `write_disk.frame`

.id

ignored

use.names

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

fill

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

idcol

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

outdir

the output directory

keep

The columns to keep at source

lazy

if TRUE, `.f` is applied lazily rather than immediately

overwrite

Whether to overwrite any files in the output directory

compress

The compression setting, from 0 to 100

Examples

cars.df = as.disk.frame(cars)

# return the first column of each chunk lazily
# 
cars2 = cmap(cars.df, function(chunk) {
 chunk[,1]
})

collect(cars2)
#>     speed
#>  1:     4
#>  2:     4
#>  3:     7
#>  4:     7
#>  5:     8
#>  6:     9
#>  7:    10
#>  8:    10
#>  9:    10
#> 10:    11
#> 11:    11
#> 12:    12
#> 13:    12
#> 14:    12
#> 15:    12
#> 16:    13
#> 17:    13
#> 18:    13
#> 19:    13
#> 20:    14
#> 21:    14
#> 22:    14
#> 23:    14
#> 24:    15
#> 25:    15
#> 26:    15
#> 27:    16
#> 28:    16
#> 29:    17
#> 30:    17
#> 31:    17
#> 32:    18
#> 33:    18
#> 34:    18
#> 35:    18
#> 36:    19
#> 37:    19
#> 38:    19
#> 39:    20
#> 40:    20
#> 41:    20
#> 42:    20
#> 43:    20
#> 44:    22
#> 45:    23
#> 46:    24
#> 47:    24
#> 48:    24
#> 49:    24
#> 50:    25
#>     speed

# return the first row of each chunk lazily, using a purrr-style formula
cars2 = cmap(cars.df, ~.x[1,])

collect(cars2)
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# request the first row of each chunk eagerly (lazy = FALSE);
# note that the result printed below is a disk.frame, not a list
cmap(cars.df, ~.x[1,], lazy = FALSE)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpCo1OFr\file2b642873746.df"
#> nchunks: 6
#> nrow (at source): 50
#> ncol (at source): 2
#> nrow (post operations): ???
#> ncol (post operations): ???

# return the first row of each chunk eagerly as data.table/data.frame by row-binding
cmap_dfr(cars.df, ~.x[1,])
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# lazy and delayed are just aliases for cmap(..., lazy = TRUE)
collect(lazy(cars.df, ~.x[1,]))
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70
collect(delayed(cars.df, ~.x[1,]))
#>    speed dist
#> 1:     4    2
#> 2:    11   17
#> 3:    13   46
#> 4:    16   40
#> 5:    19   46
#> 6:    24   70

# clean up cars.df
delete(cars.df)