library(dplyr)
library(MASS) # get a dataset for manipulation
my.data <- tbl_df(Boston) # wraps a local data frame for better printing
my.data
## Source: local data frame [506 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.090 1 296 15.3 396.9
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.967 2 242 17.8 396.9
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.967 2 242 17.8 392.8
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.062 3 222 18.7 394.6
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.062 3 222 18.7 396.9
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.062 3 222 18.7 394.1
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.561 5 311 15.2 395.6
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.950 5 311 15.2 396.9
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.082 5 311 15.2 386.6
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.592 5 311 15.2 386.7
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl)
These correspond to basic actions of data manipulation.
filter(my.data, rad==4)
## Source: local data frame [110 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.6298 0 8.14 0 0.538 5.949 61.8 4.707 4 307 21 396.9 8.26
## 2 0.6380 0 8.14 0 0.538 6.096 84.5 4.462 4 307 21 380.0 10.26
## 3 0.6274 0 8.14 0 0.538 5.834 56.5 4.499 4 307 21 395.6 8.47
## 4 1.0539 0 8.14 0 0.538 5.935 29.3 4.499 4 307 21 386.9 6.58
## 5 0.7842 0 8.14 0 0.538 5.990 81.7 4.258 4 307 21 386.8 14.67
## 6 0.8027 0 8.14 0 0.538 5.456 36.6 3.796 4 307 21 289.0 11.69
## 7 0.7258 0 8.14 0 0.538 5.727 69.5 3.796 4 307 21 390.9 11.28
## 8 1.2518 0 8.14 0 0.538 5.570 98.1 3.798 4 307 21 376.6 21.02
## 9 0.8520 0 8.14 0 0.538 5.965 89.2 4.012 4 307 21 392.5 13.83
## 10 1.2325 0 8.14 0 0.538 6.142 91.7 3.977 4 307 21 396.9 18.72
## .. ... .. ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: medv (dbl)
filter(my.data, rad==1 & tax>320)
## Source: local data frame [3 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.02498 0 1.89 0 0.518 6.540 59.7 6.267 1 422 15.9 390.0 8.65
## 2 0.02899 40 1.25 0 0.429 6.939 34.5 8.792 1 335 19.7 389.9 5.89
## 3 0.06211 40 1.25 0 0.429 6.490 44.4 8.792 1 335 19.7 396.9 5.98
## Variables not shown: medv (dbl)
filter(my.data, rad==1 & (tax<200|tax>300))
## Source: local data frame [8 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.04819 80 3.64 0 0.3920 6.108 32.0 9.220 1 315 16.4 392.9
## 2 0.03548 80 3.64 0 0.3920 5.876 19.1 9.220 1 315 16.4 395.2
## 3 0.01501 90 1.21 1 0.4010 7.923 24.8 5.885 1 198 13.6 395.5
## 4 0.05023 35 6.06 0 0.4379 5.706 28.4 6.641 1 304 16.9 394.0
## 5 0.03466 35 6.06 0 0.4379 6.031 23.3 6.641 1 304 16.9 362.2
## 6 0.02498 0 1.89 0 0.5180 6.540 59.7 6.267 1 422 15.9 390.0
## 7 0.02899 40 1.25 0 0.4290 6.939 34.5 8.792 1 335 19.7 389.9
## 8 0.06211 40 1.25 0 0.4290 6.490 44.4 8.792 1 335 19.7 396.9
## Variables not shown: lstat (dbl), medv (dbl)
arrange(my.data, rad, tax) # order first by 'rad' then by 'tax'
## Source: local data frame [506 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.01501 90 1.21 1 0.401 7.923 24.8 5.885 1 198 13.6 395.5
## 2 0.01965 80 1.76 0 0.385 6.230 31.5 9.089 1 241 18.2 341.6
## 3 0.02187 60 2.93 0 0.401 6.800 9.9 6.220 1 265 15.6 393.4
## 4 0.01439 60 2.93 0 0.401 6.604 18.8 6.220 1 265 15.6 376.7
## 5 0.06263 0 11.93 0 0.573 6.593 69.1 2.479 1 273 21.0 392.0
## 6 0.04527 0 11.93 0 0.573 6.120 76.7 2.288 1 273 21.0 396.9
## 7 0.06076 0 11.93 0 0.573 6.976 91.0 2.167 1 273 21.0 396.9
## 8 0.10959 0 11.93 0 0.573 6.794 89.3 2.389 1 273 21.0 393.4
## 9 0.04741 0 11.93 0 0.573 6.030 80.8 2.505 1 273 21.0 396.9
## 10 0.01301 35 1.52 0 0.442 7.241 49.3 7.038 1 284 15.5 394.7
## .. ... .. ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl)
arrange(my.data, rad, desc(tax), age)
## Source: local data frame [506 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.02498 0 1.89 0 0.5180 6.540 59.7 6.267 1 422 15.9 390.0
## 2 0.02899 40 1.25 0 0.4290 6.939 34.5 8.792 1 335 19.7 389.9
## 3 0.06211 40 1.25 0 0.4290 6.490 44.4 8.792 1 335 19.7 396.9
## 4 0.03548 80 3.64 0 0.3920 5.876 19.1 9.220 1 315 16.4 395.2
## 5 0.04819 80 3.64 0 0.3920 6.108 32.0 9.220 1 315 16.4 392.9
## 6 0.03466 35 6.06 0 0.4379 6.031 23.3 6.641 1 304 16.9 362.2
## 7 0.05023 35 6.06 0 0.4379 5.706 28.4 6.641 1 304 16.9 394.0
## 8 0.01096 55 2.25 0 0.3890 6.453 31.9 7.307 1 300 15.3 394.7
## 9 0.00632 18 2.31 0 0.5380 6.575 65.2 4.090 1 296 15.3 396.9
## 10 0.00906 90 2.97 0 0.4000 7.088 20.8 7.307 1 285 15.3 394.7
## .. ... .. ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl)
dplyr::select(my.data, rad, age) # explicit package since MASS masked dplyr's 'select'
## Source: local data frame [506 x 2]
##
## rad age
## 1 1 65.2
## 2 2 78.9
## 3 2 61.1
## 4 3 45.8
## 5 3 54.2
## 6 3 58.7
## 7 5 66.6
## 8 5 96.1
## 9 5 100.0
## 10 5 85.9
## .. ... ...
dplyr::select(my.data, nox:dis) # select data frame features from 'nox' to 'dis'
## Source: local data frame [506 x 4]
##
## nox rm age dis
## 1 0.538 6.575 65.2 4.090
## 2 0.469 6.421 78.9 4.967
## 3 0.469 7.185 61.1 4.967
## 4 0.458 6.998 45.8 6.062
## 5 0.458 7.147 54.2 6.062
## 6 0.458 6.430 58.7 6.062
## 7 0.524 6.012 66.6 5.561
## 8 0.524 6.172 96.1 5.950
## 9 0.524 5.631 100.0 6.082
## 10 0.524 6.004 85.9 6.592
## .. ... ... ... ...
dplyr::select(my.data, starts_with('c')) # select columns that start with 'c'
## Source: local data frame [506 x 2]
##
## crim chas
## 1 0.00632 0
## 2 0.02731 0
## 3 0.02729 0
## 4 0.03237 0
## 5 0.06905 0
## 6 0.02985 0
## 7 0.08829 0
## 8 0.14455 0
## 9 0.21124 0
## 10 0.17004 0
## .. ... ...
dplyr::select(my.data, matches('^[cd]')) # select columns that start with 'c' or 'd' (regular expression)
## Source: local data frame [506 x 3]
##
## crim chas dis
## 1 0.00632 0 4.090
## 2 0.02731 0 4.967
## 3 0.02729 0 4.967
## 4 0.03237 0 6.062
## 5 0.06905 0 6.062
## 6 0.02985 0 6.062
## 7 0.08829 0 5.561
## 8 0.14455 0 5.950
## 9 0.21124 0 6.082
## 10 0.17004 0 6.592
## .. ... ... ...
dplyr::select(my.data, -matches('^[cd]')) # drop columns that start with 'c' or 'd' (regular expression)
## Source: local data frame [506 x 11]
##
## zn indus nox rm age rad tax ptratio black lstat medv
## 1 18.0 2.31 0.538 6.575 65.2 1 296 15.3 396.9 4.98 24.0
## 2 0.0 7.07 0.469 6.421 78.9 2 242 17.8 396.9 9.14 21.6
## 3 0.0 7.07 0.469 7.185 61.1 2 242 17.8 392.8 4.03 34.7
## 4 0.0 2.18 0.458 6.998 45.8 3 222 18.7 394.6 2.94 33.4
## 5 0.0 2.18 0.458 7.147 54.2 3 222 18.7 396.9 5.33 36.2
## 6 0.0 2.18 0.458 6.430 58.7 3 222 18.7 394.1 5.21 28.7
## 7 12.5 7.87 0.524 6.012 66.6 5 311 15.2 395.6 12.43 22.9
## 8 12.5 7.87 0.524 6.172 96.1 5 311 15.2 396.9 19.15 27.1
## 9 12.5 7.87 0.524 5.631 100.0 5 311 15.2 386.6 29.93 16.5
## 10 12.5 7.87 0.524 6.004 85.9 5 311 15.2 386.7 17.10 18.9
## .. ... ... ... ... ... ... ... ... ... ... ...
rename(my.data, Z.N=zn)
## Source: local data frame [506 x 14]
##
## crim Z.N indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.090 1 296 15.3 396.9
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.967 2 242 17.8 396.9
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.967 2 242 17.8 392.8
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.062 3 222 18.7 394.6
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.062 3 222 18.7 396.9
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.062 3 222 18.7 394.1
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.561 5 311 15.2 395.6
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.950 5 311 15.2 396.9
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.082 5 311 15.2 386.6
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.592 5 311 15.2 386.7
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl)
mutate(my.data, twice.age = 2*age, new.col = sqrt(twice.age)) # can refer to new columns of the spot
## Source: local data frame [506 x 16]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.090 1 296 15.3 396.9
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.967 2 242 17.8 396.9
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.967 2 242 17.8 392.8
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.062 3 222 18.7 394.6
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.062 3 222 18.7 396.9
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.062 3 222 18.7 394.1
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.561 5 311 15.2 395.6
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.950 5 311 15.2 396.9
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.082 5 311 15.2 386.6
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.592 5 311 15.2 386.7
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl), twice.age (dbl), new.col
## (dbl)
transmute(my.data, twice.age = 2*age, new.col = sqrt(twice.age)) # transmute is like mutate bt only keeps the new columns
## Source: local data frame [506 x 2]
##
## twice.age new.col
## 1 130.4 11.419
## 2 157.8 12.562
## 3 122.2 11.054
## 4 91.6 9.571
## 5 108.4 10.412
## 6 117.4 10.835
## 7 133.2 11.541
## 8 192.2 13.864
## 9 200.0 14.142
## 10 171.8 13.107
## .. ... ...
summarise(my.data, mean.age = mean(age, na.rm = TRUE))
## Source: local data frame [1 x 1]
##
## mean.age
## 1 68.57
slice(my.data, c(2:6,8))
## Source: local data frame [6 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.967 2 242 17.8 396.9
## 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.967 2 242 17.8 392.8
## 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.062 3 222 18.7 394.6
## 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.062 3 222 18.7 396.9
## 5 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.062 3 222 18.7 394.1
## 6 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.950 5 311 15.2 396.9
## Variables not shown: lstat (dbl), medv (dbl)
slice(my.data, n()) # last record
## Source: local data frame [1 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.04741 0 11.93 0 0.573 6.03 80.8 2.505 1 273 21 396.9 7.88
## Variables not shown: medv (dbl)
d <- dplyr::select(my.data, tax)
arrange( distinct(d, tax), tax)
## Source: local data frame [66 x 1]
##
## tax
## 1 187
## 2 188
## 3 193
## 4 198
## 5 216
## 6 222
## 7 223
## 8 224
## 9 226
## 10 233
## 11 241
## 12 242
## 13 243
## 14 244
## 15 245
## 16 247
## 17 252
## 18 254
## 19 255
## 20 256
## 21 264
## 22 265
## 23 270
## 24 273
## 25 276
## 26 277
## 27 279
## 28 280
## 29 281
## 30 284
## 31 285
## 32 287
## 33 289
## 34 293
## 35 296
## 36 300
## 37 304
## 38 305
## 39 307
## 40 311
## 41 313
## 42 315
## 43 329
## 44 330
## 45 334
## 46 335
## 47 337
## 48 345
## 49 348
## 50 351
## 51 352
## 52 358
## 53 370
## 54 384
## 55 391
## 56 398
## 57 402
## 58 403
## 59 411
## 60 422
## 61 430
## 62 432
## 63 437
## 64 469
## 65 666
## 66 711
sample_n(my.data, 6) # a sample of 6 records
## Source: local data frame [6 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 13.67810 0 18.10 0 0.740 5.935 87.9 1.821 24 666 20.2 68.95
## 2 0.03113 0 4.39 0 0.442 6.014 48.5 8.014 3 352 18.8 385.64
## 3 0.03548 80 3.64 0 0.392 5.876 19.1 9.220 1 315 16.4 395.18
## 4 16.81180 0 18.10 0 0.700 5.277 98.1 1.426 24 666 20.2 396.90
## 5 0.03150 95 1.47 0 0.403 6.975 15.3 7.653 3 402 17.0 396.90
## 6 1.62864 0 21.89 0 0.624 5.019 100.0 1.439 4 437 21.2 396.90
## Variables not shown: lstat (dbl), medv (dbl)
sample_frac(my.data, .03) # a sample of 3% of the records
## Source: local data frame [15 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 1.05393 0 8.14 0 0.538 5.935 29.3 4.499 4 307 21.0 386.85
## 2 6.71772 0 18.10 0 0.713 6.749 92.6 2.324 24 666 20.2 0.32
## 3 0.57529 0 6.20 0 0.507 8.337 73.3 3.838 8 307 17.4 385.91
## 4 7.05042 0 18.10 0 0.614 6.103 85.1 2.022 24 666 20.2 2.52
## 5 0.06664 0 4.05 0 0.510 6.546 33.1 3.132 5 296 16.6 390.96
## 6 0.08664 45 3.44 0 0.437 7.178 26.3 6.480 5 398 15.2 390.49
## 7 0.27957 0 9.69 0 0.585 5.926 42.6 2.382 6 391 19.2 396.90
## 8 0.61154 20 3.97 0 0.647 8.704 86.9 1.801 5 264 13.0 389.70
## 9 0.76162 20 3.97 0 0.647 5.560 62.8 1.986 5 264 13.0 392.40
## 10 0.01778 95 1.47 0 0.403 7.135 13.9 7.653 3 402 17.0 384.30
## 11 3.69695 0 18.10 0 0.718 4.963 91.4 1.752 24 666 20.2 316.03
## 12 0.02498 0 1.89 0 0.518 6.540 59.7 6.267 1 422 15.9 389.96
## 13 0.09164 0 10.81 0 0.413 6.065 7.8 5.287 4 305 19.2 390.91
## 14 0.25915 0 21.89 0 0.624 5.693 96.0 1.788 4 437 21.2 392.11
## 15 13.07510 0 18.10 0 0.580 5.713 56.7 2.824 24 666 20.2 396.90
## Variables not shown: lstat (dbl), medv (dbl)
sample_frac(my.data, .03, replace=TRUE) # a bootstrap sample of 3% of the records
## Source: local data frame [15 x 14]
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.06664 0 4.05 0 0.510 6.546 33.1 3.132 5 296 16.6 390.96
## 2 37.66190 0 18.10 0 0.679 6.202 78.7 1.863 24 666 20.2 18.82
## 3 0.10469 40 6.41 1 0.447 7.267 49.0 4.787 4 254 17.6 389.25
## 4 0.04932 33 2.18 0 0.472 6.849 70.3 3.183 7 222 18.4 396.90
## 5 0.17331 0 9.69 0 0.585 5.707 54.0 2.382 6 391 19.2 396.90
## 6 0.04932 33 2.18 0 0.472 6.849 70.3 3.183 7 222 18.4 396.90
## 7 6.28807 0 18.10 0 0.740 6.341 96.4 2.072 24 666 20.2 318.01
## 8 0.66351 20 3.97 0 0.647 7.333 100.0 1.895 5 264 13.0 383.29
## 9 3.53501 0 19.58 1 0.871 6.152 82.6 1.746 5 403 14.7 88.01
## 10 0.22438 0 9.69 0 0.585 6.027 79.7 2.498 6 391 19.2 396.90
## 11 0.17899 0 9.69 0 0.585 5.670 28.8 2.799 6 391 19.2 393.29
## 12 25.04610 0 18.10 0 0.693 5.987 100.0 1.589 24 666 20.2 396.90
## 13 0.08187 0 2.89 0 0.445 7.820 36.9 3.495 2 276 18.0 393.53
## 14 2.33099 0 19.58 0 0.871 5.186 93.8 1.530 5 403 14.7 356.99
## 15 0.35809 0 6.20 1 0.507 6.951 88.5 2.862 8 307 17.4 391.70
## Variables not shown: lstat (dbl), medv (dbl)
There is also an option weight=vector
that allows us to sample a weightned sample of the dataset. The vector must have the same size as the number of rows of the dataset (weights are automatically standardised to sum to 1).
These verbs are useful, but they become really powerful when you combine them with the idea of “group by”, repeating the operation individually on groups of observations within the tbl. In dplyr, you use the group_by() function to describe how to break a dataset down into groups of rows. You can then use the resulting object in the exactly the same functions as above; they’ll automatically work “by group” when the input is a grouped tbl. ref.
taxes <- group_by(my.data, tax) # group dataset by tax
report <- summarise(taxes, count=n(), mean.age=mean(age, na.rm = TRUE))
arrange(report, count, tax)
## Source: local data frame [66 x 3]
##
## tax count mean.age
## 1 187 1 36.100
## 2 198 1 24.800
## 3 226 1 21.900
## 4 241 1 31.500
## 5 244 1 34.200
## 6 255 1 32.000
## 7 256 1 40.500
## 8 280 1 29.700
## 9 285 1 20.800
## 10 313 1 35.700
## 11 351 1 27.700
## 12 422 1 59.700
## 13 469 1 47.600
## 14 242 2 70.000
## 15 252 2 18.800
## 16 265 2 14.350
## 17 315 2 25.550
## 18 334 2 20.700
## 19 335 2 39.450
## 20 337 2 24.450
## 21 348 2 27.050
## 22 352 2 50.400
## 23 370 2 42.250
## 24 402 2 14.600
## 25 411 2 27.200
## 26 245 3 26.333
## 27 293 3 33.267
## 28 345 3 30.400
## 29 358 3 25.833
## 30 430 3 25.067
## 31 243 4 37.800
## 32 247 4 51.500
## 33 279 4 50.325
## 34 281 4 45.700
## 35 305 4 9.525
## 36 216 5 48.620
## 37 223 5 46.080
## 38 254 5 36.880
## 39 273 5 81.380
## 40 289 5 40.160
## 41 711 5 94.260
## 42 329 6 30.917
## 43 188 7 89.071
## 44 222 7 57.157
## 45 270 7 64.171
## 46 284 7 56.643
## 47 300 7 38.957
## 48 311 7 80.686
## 49 193 8 75.487
## 50 287 8 44.775
## 51 296 8 66.825
## 52 391 8 60.925
## 53 233 9 40.656
## 54 276 9 70.011
## 55 432 9 81.744
## 56 224 10 42.330
## 57 330 10 36.450
## 58 277 11 56.873
## 59 384 11 84.591
## 60 264 12 83.558
## 61 398 12 37.442
## 62 304 14 65.757
## 63 437 15 97.173
## 64 403 30 95.163
## 65 307 40 76.918
## 66 666 132 89.805
Some tools:
rads <- group_by(my.data, rad)
summarise(rads, n.taxes = n(), diff.taxes = n_distinct(tax), first.tax=first(tax), last.tax=last(tax))
## Source: local data frame [9 x 5]
##
## rad n.taxes diff.taxes first.tax last.tax
## 1 1 20 12 296 273
## 2 2 24 7 242 348
## 3 3 38 11 222 352
## 4 4 110 21 307 711
## 5 5 115 16 311 187
## 6 6 26 4 432 391
## 7 7 17 3 330 222
## 8 8 24 2 284 307
## 9 24 132 1 666 666
We can provide our own functions (slower performance, unless is in C++)
my.f <- function(x) sqrt(x)+1
summarise(rads, f = my.f(mean(age)))
## Source: local data frame [9 x 2]
##
## rad f
## 1 1 7.710
## 2 2 9.048
## 3 3 8.022
## 4 4 8.800
## 5 5 9.320
## 6 6 8.755
## 7 7 7.336
## 8 8 9.207
## 9 24 10.477
When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:
taxes.rad <- group_by(my.data, tax, rad)
by.rad <- summarise(taxes.rad, size=n())
arrange(by.rad, tax)
## Source: local data frame [77 x 3]
## Groups: tax
##
## tax rad size
## 1 187 5 1
## 2 188 2 7
## 3 193 3 8
## 4 198 1 1
## 5 216 3 1
## 6 216 5 4
## 7 222 3 3
## 8 222 7 4
## 9 223 3 5
## 10 224 4 2
## 11 224 5 8
## 12 226 5 1
## 13 233 3 9
## 14 241 1 1
## 15 242 2 2
## 16 243 4 4
## 17 244 3 1
## 18 245 4 3
## 19 247 3 4
## 20 252 3 2
## 21 254 4 5
## 22 255 4 1
## 23 256 5 1
## 24 264 5 12
## 25 265 1 2
## 26 270 2 4
## 27 270 4 3
## 28 273 1 5
## 29 276 2 5
## 30 276 5 4
## 31 277 4 11
## 32 279 5 4
## 33 280 4 1
## 34 281 4 4
## 35 284 1 1
## 36 284 8 6
## 37 285 1 1
## 38 287 5 8
## 39 289 4 5
## 40 293 6 3
## 41 296 1 1
## 42 296 5 7
## 43 300 1 1
## 44 300 6 6
## 45 304 1 2
## 46 304 4 12
## 47 305 4 4
## 48 307 4 22
## 49 307 8 18
## 50 311 5 7
## 51 313 2 1
## 52 315 1 2
## 53 329 2 3
## 54 329 7 3
## 55 330 7 10
## 56 334 4 2
## 57 335 1 2
## 58 337 4 2
## 59 345 4 3
## 60 348 2 2
## 61 351 4 1
## 62 352 3 2
## 63 358 5 3
## 64 370 5 2
## 65 384 5 11
## 66 391 6 8
## 67 398 5 12
## 68 402 3 2
## 69 403 5 30
## 70 411 4 2
## 71 422 1 1
## 72 430 4 3
## 73 432 6 9
## 74 437 4 15
## 75 469 3 1
## 76 666 24 132
## 77 711 4 5
by.tax <- summarise(by.rad, sum=sum(size))
arrange(by.tax,tax)
## Source: local data frame [66 x 2]
##
## tax sum
## 1 187 1
## 2 188 7
## 3 193 8
## 4 198 1
## 5 216 5
## 6 222 7
## 7 223 5
## 8 224 10
## 9 226 1
## 10 233 9
## 11 241 1
## 12 242 2
## 13 243 4
## 14 244 1
## 15 245 3
## 16 247 4
## 17 252 2
## 18 254 5
## 19 255 1
## 20 256 1
## 21 264 12
## 22 265 2
## 23 270 7
## 24 273 5
## 25 276 9
## 26 277 11
## 27 279 4
## 28 280 1
## 29 281 4
## 30 284 7
## 31 285 1
## 32 287 8
## 33 289 5
## 34 293 3
## 35 296 8
## 36 300 7
## 37 304 14
## 38 305 4
## 39 307 40
## 40 311 7
## 41 313 1
## 42 315 2
## 43 329 6
## 44 330 10
## 45 334 2
## 46 335 2
## 47 337 2
## 48 345 3
## 49 348 2
## 50 351 1
## 51 352 2
## 52 358 3
## 53 370 2
## 54 384 11
## 55 391 8
## 56 398 12
## 57 402 2
## 58 403 30
## 59 411 2
## 60 422 1
## 61 430 3
## 62 432 9
## 63 437 15
## 64 469 1
## 65 666 132
## 66 711 5
It’s the ability to chain operations together from left to right with the %>% operator. Expression x %>% f(y)
turns into f(x, y)
so this makes dplyr behave a little like a grammar of data manipulation:
my.data %>% group_by(tax)
## Source: local data frame [506 x 14]
## Groups: tax
##
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.090 1 296 15.3 396.9
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.967 2 242 17.8 396.9
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.967 2 242 17.8 392.8
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.062 3 222 18.7 394.6
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.062 3 222 18.7 396.9
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.062 3 222 18.7 394.1
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.561 5 311 15.2 395.6
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.950 5 311 15.2 396.9
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.082 5 311 15.2 386.6
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.592 5 311 15.2 386.7
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: lstat (dbl), medv (dbl)
my.data %>% group_by(tax) %>% summarise(total = sum(dis))
## Source: local data frame [66 x 2]
##
## tax total
## 1 187 12.127
## 2 188 14.243
## 3 193 23.182
## 4 198 5.885
## 5 216 28.475
## 6 222 31.861
## 7 223 21.059
## 8 224 54.421
## 9 226 8.697
## 10 233 50.731
## 11 241 9.089
## 12 242 9.934
## 13 243 27.259
## 14 244 6.336
## 15 245 15.350
## 16 247 17.392
## 17 252 10.802
## 18 254 22.135
## 19 255 5.648
## 20 256 8.325
## 21 264 25.669
## 22 265 12.439
## 23 270 23.967
## 24 273 11.828
## 25 276 30.262
## 26 277 43.442
## 27 279 14.520
## 28 280 8.344
## 29 281 21.603
## 30 284 51.035
## 31 285 7.307
## 32 287 39.492
## 33 289 29.246
## 34 293 21.952
## 35 296 24.954
## 36 300 46.430
## 37 304 54.070
## 38 305 21.149
## 39 307 154.386
## 40 311 42.209
## 41 313 9.188
## 42 315 18.441
## 43 329 38.402
## 44 330 80.281
## 45 334 21.171
## 46 335 17.584
## 47 337 13.223
## 48 345 19.494
## 49 348 12.540
## 50 351 8.535
## 51 352 16.027
## 52 358 23.483
## 53 370 12.197
## 54 384 27.680
## 55 391 20.561
## 56 398 58.775
## 57 402 15.307
## 58 403 54.932
## 59 411 21.421
## 60 422 6.267
## 61 430 16.303
## 62 432 23.013
## 63 437 29.430
## 64 469 7.320
## 65 666 272.086
## 66 711 9.377
my.data %>% group_by(tax) %>% summarise(total = sum(dis)) %>% arrange(desc(total))
## Source: local data frame [66 x 2]
##
## tax total
## 1 666 272.086
## 2 307 154.386
## 3 330 80.281
## 4 398 58.775
## 5 403 54.932
## 6 224 54.421
## 7 304 54.070
## 8 284 51.035
## 9 233 50.731
## 10 300 46.430
## 11 277 43.442
## 12 311 42.209
## 13 287 39.492
## 14 329 38.402
## 15 222 31.861
## 16 276 30.262
## 17 437 29.430
## 18 289 29.246
## 19 216 28.475
## 20 384 27.680
## 21 243 27.259
## 22 264 25.669
## 23 296 24.954
## 24 270 23.967
## 25 358 23.483
## 26 193 23.182
## 27 432 23.013
## 28 254 22.135
## 29 293 21.952
## 30 281 21.603
## 31 411 21.421
## 32 334 21.171
## 33 305 21.149
## 34 223 21.059
## 35 391 20.561
## 36 345 19.494
## 37 315 18.441
## 38 335 17.584
## 39 247 17.392
## 40 430 16.303
## 41 352 16.027
## 42 245 15.350
## 43 402 15.307
## 44 279 14.520
## 45 188 14.243
## 46 337 13.223
## 47 348 12.540
## 48 265 12.439
## 49 370 12.197
## 50 187 12.127
## 51 273 11.828
## 52 252 10.802
## 53 242 9.934
## 54 711 9.377
## 55 313 9.188
## 56 241 9.089
## 57 226 8.697
## 58 351 8.535
## 59 280 8.344
## 60 256 8.325
## 61 469 7.320
## 62 285 7.307
## 63 244 6.336
## 64 422 6.267
## 65 198 5.885
## 66 255 5.648
my.data %>% group_by(tax) %>% summarise(total = sum(dis)) %>% arrange(desc(total)) %>% filter(total < 50) %>% head(7)
## Source: local data frame [7 x 2]
##
## tax total
## 1 300 46.43
## 2 277 43.44
## 3 311 42.21
## 4 287 39.49
## 5 329 38.40
## 6 222 31.86
## 7 276 30.26
df <- data.frame(Date=c("2014-01-01" ,"2014-01-01" ,"2014-01-01" ,"2014-01-01" ,"2014-01-02" ,"2014-01-02" ,"2014-01-02" ,"2014-01-03" ,"2014-01-03" ,"2014-01-03"),
Col1=c(123,123,124,125,123,126,127,521,123,126))
df
## Date Col1
## 1 2014-01-01 123
## 2 2014-01-01 123
## 3 2014-01-01 124
## 4 2014-01-01 125
## 5 2014-01-02 123
## 6 2014-01-02 126
## 7 2014-01-02 127
## 8 2014-01-03 521
## 9 2014-01-03 123
## 10 2014-01-03 126
# now: count unique values in Col1 for the each date (that did not repeat in previous date), and add to the previous count
df %>%
arrange(Date) %>%
filter(!duplicated(Col1)) %>%
group_by(Date) %>%
summarise(Count=length(Date)) %>%
mutate(Count = cumsum(Count))
## Source: local data frame [3 x 2]
##
## Date Count
## 1 2014-01-01 3
## 2 2014-01-02 5
## 3 2014-01-03 6
To work with databases cf http://cran.rstudio.com/web/packages/dplyr/vignettes/databases.html