Bring the disk.frame into RAM by loading the data and running all lazy operations as data.table/data.frame or as a list

Bring the disk.frame into RAM by loading the data and running all lazy operations. collect returns the result as a data.table/data.frame, while collect_list returns it as a list.

# S3 method for disk.frame
collect(x, ..., parallel = !is.null(attr(x, "recordings")))

collect_list(
  x,
  simplify = FALSE,
  parallel = !is.null(attr(x, "recordings")),
  ...
)

# S3 method for summarized_disk.frame
collect(x, ..., parallel = !is.null(attr(x, "recordings")))

Arguments

x

a disk.frame

...

not used

parallel

If TRUE, the collection is performed in parallel. By default, collection is performed in parallel if there are delayed/lazy steps, and sequentially otherwise. This is because parallel collection requires transferring data from the background R sessions to the current R session; if there is no computation to perform, it is better to avoid transferring data between sessions, so parallel = FALSE is the better choice.

simplify

Should the result be simplified to an array?

Value

collect returns a data.frame/data.table. collect_list returns a list. The collect method for summarized_disk.frame also returns a data.frame/data.table.

Examples

cars.df = as.disk.frame(cars)
# use collect to bring the data into RAM as a data.table/data.frame
collect(cars.df)
#>     speed dist
#>  1:     4    2
#>  2:     4   10
#>  3:     7    4
#>  4:     7   22
#>  5:     8   16
#>  6:     9   10
#>  7:    10   18
#>  8:    10   26
#>  9:    10   34
#> 10:    11   17
#> 11:    11   28
#> 12:    12   14
#> 13:    12   20
#> 14:    12   24
#> 15:    12   28
#> 16:    13   26
#> 17:    13   34
#> 18:    13   34
#> 19:    13   46
#> 20:    14   26
#> 21:    14   36
#> 22:    14   60
#> 23:    14   80
#> 24:    15   20
#> 25:    15   26
#> 26:    15   54
#> 27:    16   32
#> 28:    16   40
#> 29:    17   32
#> 30:    17   40
#> 31:    17   50
#> 32:    18   42
#> 33:    18   56
#> 34:    18   76
#> 35:    18   84
#> 36:    19   36
#> 37:    19   46
#> 38:    19   68
#> 39:    20   32
#> 40:    20   48
#> 41:    20   52
#> 42:    20   56
#> 43:    20   64
#> 44:    22   66
#> 45:    23   54
#> 46:    24   70
#> 47:    24   92
#> 48:    24   93
#> 49:    24  120
#> 50:    25   85
#>     speed dist

# clean up
delete(cars.df)
cars.df = as.disk.frame(cars)

# returns the result as a list
collect_list(cmap(cars.df, ~1))
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 1
#> 
#> [[3]]
#> [1] 1
#> 
#> [[4]]
#> [1] 1
#> 
#> [[5]]
#> [1] 1
#> 
#> [[6]]
#> [1] 1
#> 

# clean up
delete(cars.df)
cars.df = as.disk.frame(cars)
# use collect to bring the data into RAM as a data.table/data.frame
collect(cars.df)
#>     speed dist
#>  1:     4    2
#>  2:     4   10
#>  3:     7    4
#>  4:     7   22
#>  5:     8   16
#>  6:     9   10
#>  7:    10   18
#>  8:    10   26
#>  9:    10   34
#> 10:    11   17
#> 11:    11   28
#> 12:    12   14
#> 13:    12   20
#> 14:    12   24
#> 15:    12   28
#> 16:    13   26
#> 17:    13   34
#> 18:    13   34
#> 19:    13   46
#> 20:    14   26
#> 21:    14   36
#> 22:    14   60
#> 23:    14   80
#> 24:    15   20
#> 25:    15   26
#> 26:    15   54
#> 27:    16   32
#> 28:    16   40
#> 29:    17   32
#> 30:    17   40
#> 31:    17   50
#> 32:    18   42
#> 33:    18   56
#> 34:    18   76
#> 35:    18   84
#> 36:    19   36
#> 37:    19   46
#> 38:    19   68
#> 39:    20   32
#> 40:    20   48
#> 41:    20   52
#> 42:    20   56
#> 43:    20   64
#> 44:    22   66
#> 45:    23   54
#> 46:    24   70
#> 47:    24   92
#> 48:    24   93
#> 49:    24  120
#> 50:    25   85
#>     speed dist

# clean up
delete(cars.df)