l5 <- c(1,2,NA,4,NA,6)
l5a <- l5[!is.na(l5)]
l5a
## [1] 1 2 4 6
l5b <- c("a","b",NA,"d",NA,"f")
good.pairs <- complete.cases(l5,l5b) # if we need to join both lists only with complete pairs...
data.frame(a=l5[good.pairs],b=l5b[good.pairs]) # another example: airquality is a data frame
## a b
## 1 1 a
## 2 2 b
## 3 4 d
## 4 6 f
airquality[4,][c(1,4,3)] # shows cols 1, 4 & 3 from the 4th row
## Ozone Temp Wind
## 4 18 62 11.5
airquality[complete.cases(airquality),][1:6,] # to show all non NA rows for the first 6 columns
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
Function sample takes a sample of the specified size from the elements of x using either with or without replacement.
coin <- c("Heads","Tails")
sample(coin,8,rep=T)
## [1] "Tails" "Tails" "Tails" "Heads" "Tails" "Tails" "Heads" "Heads"
dice.events <- 1:6 # dice possible results
ten.throws <- sample(dice.events, 10, replace=T) # balanced dice
ten.throws
## [1] 5 6 1 5 3 1 6 4 5 6
ten.more.throws <- sample(dice.events, 10, replace=T, prob=c(.1,.1,.1,.1,.1,.5)) # unbalanced dice (50% it falls 6)
ten.more.throws
## [1] 6 6 6 6 6 6 5 3 6 5
# Without replacement, we can create permutations
xs <- 1:10
sample(xs,10)
## [1] 2 6 5 10 8 7 3 1 4 9
sample(xs,10)
## [1] 5 9 3 8 6 7 2 4 10 1
# Eg: we have 50 pacients that we want to separate into two equal size groups (Control or Treatment)
pacients <- data.frame(patient = 1:50,
age = rnorm(100, mean = 60, sd = 12))
head(pacients)
## patient age
## 1 1 62.34
## 2 2 76.81
## 3 3 67.50
## 4 4 61.09
## 5 5 69.93
## 6 6 69.68
xs <- rep(c("Control", "Treat"),25)
xs
## [1] "Control" "Treat" "Control" "Treat" "Control" "Treat" "Control"
## [8] "Treat" "Control" "Treat" "Control" "Treat" "Control" "Treat"
## [15] "Control" "Treat" "Control" "Treat" "Control" "Treat" "Control"
## [22] "Treat" "Control" "Treat" "Control" "Treat" "Control" "Treat"
## [29] "Control" "Treat" "Control" "Treat" "Control" "Treat" "Control"
## [36] "Treat" "Control" "Treat" "Control" "Treat" "Control" "Treat"
## [43] "Control" "Treat" "Control" "Treat" "Control" "Treat" "Control"
## [50] "Treat"
xs <- sample(xs,50) # a random permutation to remove selection bias
xs
## [1] "Control" "Control" "Control" "Treat" "Control" "Control" "Control"
## [8] "Control" "Treat" "Treat" "Treat" "Control" "Control" "Treat"
## [15] "Treat" "Control" "Control" "Control" "Treat" "Treat" "Treat"
## [22] "Control" "Treat" "Treat" "Treat" "Treat" "Treat" "Treat"
## [29] "Treat" "Treat" "Treat" "Control" "Treat" "Control" "Control"
## [36] "Control" "Treat" "Control" "Control" "Treat" "Control" "Control"
## [43] "Control" "Treat" "Treat" "Treat" "Control" "Treat" "Control"
## [50] "Control"
pacients$group <- as.factor(xs) # add new column with required information
head(pacients)
## patient age group
## 1 1 62.34 Control
## 2 2 76.81 Control
## 3 3 67.50 Control
## 4 4 61.09 Treat
## 5 5 69.93 Control
## 6 6 69.68 Control
# Eg: simple bootstrap example (from: www.sigmafield.org/2010/05/23/r-function-of-the-day-sample)
rn <- rnorm(1000, 10) # random sample from a normal distribution
# making 1000 samples and keeping the means
sample.means <- replicate(1000, mean(sample(rn, replace = TRUE)))
sample.means[1:20]
## [1] 9.991 10.032 10.040 10.000 10.064 10.027 9.981 10.067 9.978 9.989
## [11] 10.043 10.057 10.007 10.005 9.977 10.014 10.020 9.949 10.031 10.080
quantile(sample.means, probs = c(0.025, 0.975)) # produces sample quantiles corresponding to the given probabilities
## 2.5% 97.5%
## 9.947 10.065
# Compare this to the standard parametric technique.
t.test(rn)$conf.int
## [1] 9.943 10.068
## attr(,"conf.level")
## [1] 0.95
References: + http://www.sigmafield.org/2009/09/20/r-function-of-the-day-tapply + http://stackoverflow.com/questions/11562656/averaging-column-values-for-specific-sections-of-data-corresponding-to-other-col/11562850#11562850
check also: http://www.jstatsoft.org/v40/i01/paper
We use tapply when: * A dataset that can be broken up into groups * We want to break it up into groups * Within each group, we want to apply a function
The tapply function is useful when we need to break up a vector into groups defined by some classifying factor, compute a function on the subsets, and return the results in a convenient form.
## generate data for medical example
medical.example <- data.frame(patient = 1:100,
age = rnorm(100, mean = 60, sd = 12),
treatment = gl(2, 50, labels = c("Treatment", "Control")))
# we want to break the dataset by Treatment, and then find the age's mean
tapply(medical.example$age, medical.example$treatment, mean)
## Treatment Control
## 60.12 60.17
# Another eg
baseball.example <- data.frame(team = gl(5, 5,
labels = paste("Team", LETTERS[1:5])),
player = sample(letters, 25), batting.average = runif(25, .200, .400))
# we want to break the dataset by Team, and then find the max batting avg
tapply(baseball.example$batting.average, baseball.example$team, max)
## Team A Team B Team C Team D Team E
## 0.3852 0.3832 0.3165 0.3686 0.3186
# an artificial eg from R's help:
n <- 17
fac <- factor(rep(1:3, length = n), levels = 1:5)
fac
## [1] 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2
## Levels: 1 2 3 4 5
table(fac)
## fac
## 1 2 3 4 5
## 6 6 5 0 0
# this next function, separates all 1:n value into the factors given by fac,
# and then sum each factor
tapply(1:n, fac, sum)
## 1 2 3 4 5
## 51 57 45 NA NA
# Let's explicitly sum the first factor just to confirm:
fac == 1
## [1] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [12] FALSE TRUE FALSE FALSE TRUE FALSE
(1:n)[fac==1]
## [1] 1 4 7 10 13 16
sum((1:n)[fac==1])
## [1] 51
# we can apply any unary function over each factor
tapply(1:n, fac, range)
## $`1`
## [1] 1 16
##
## $`2`
## [1] 2 17
##
## $`3`
## [1] 3 15
##
## $`4`
## NULL
##
## $`5`
## NULL
tapply(1:n, fac, quantile)
## $`1`
## 0% 25% 50% 75% 100%
## 1.00 4.75 8.50 12.25 16.00
##
## $`2`
## 0% 25% 50% 75% 100%
## 2.00 5.75 9.50 13.25 17.00
##
## $`3`
## 0% 25% 50% 75% 100%
## 3 6 9 12 15
##
## $`4`
## NULL
##
## $`5`
## NULL
Other possibilities:
df <- data.frame(dive=factor(sample(c("dive1","dive2"),10,replace=TRUE)),
speed=runif(10))
df
## dive speed
## 1 dive1 0.398424
## 2 dive1 0.725862
## 3 dive1 0.308170
## 4 dive2 0.090411
## 5 dive1 0.886866
## 6 dive2 0.205537
## 7 dive2 0.603882
## 8 dive1 0.323705
## 9 dive2 0.288841
## 10 dive2 0.004747
# 'by()' takes in vectors and applies a function to them
res.by <- by( df$speed, df$dive, mean)
library(taRifx)
## Warning: package 'taRifx' was built under R version 3.1.1
as.data.frame(res.by)
## IDX1 value
## 1 dive1 0.5286
## 2 dive2 0.2387
# takes in data.frames, outputs data.frames, and uses a formula interface.
aggregate( speed ~ dive, df, mean )
## dive speed
## 1 dive1 0.5286
## 2 dive2 0.2387
# Check also library plyr which facilitates this type of data frama manipulation
Function rle computes the lengths and values of runs of equal values in a vector.
Function inverse.rle does the reverse operation.
coin <- c("H","T")
experiment <- sample(coin,100,rep=T)
experiment
## [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T"
## [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T"
## [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T"
## [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T"
## [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H"
## [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H"
experiment.rle <- rle(experiment)
max.seq <- max(experiment.rle$lengths) # the maximum sequence of equal values
max.seq
## [1] 6
experiment.rle$values[which(experiment.rle$lengths == max.seq)] # was it heads or tails?
## [1] "T" "H"
# we can compute both max sequences of heads and tails by using tapply:
tapply(experiment.rle$lengths, experiment.rle$values, max)
## H T
## 6 6
inverse.rle(experiment.rle) # recomputes the initial sequence
## [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T"
## [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T"
## [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T"
## [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T"
## [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H"
## [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H"
Function split divides the data in the vector x into the groups defined by f. The replacement forms replace values corresponding to such a division.
Function cut divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on.
split(1:10, 1:2)
## $`1`
## [1] 1 3 5 7 9
##
## $`2`
## [1] 2 4 6 8 10
split(1:10, 1:2)[[1]] # or: split(1:10, 1:2)$"1"
## [1] 1 3 5 7 9
ma <- cbind(x = 1:10, y = (-4:5)^2)
ma
## x y
## [1,] 1 16
## [2,] 2 9
## [3,] 3 4
## [4,] 4 1
## [5,] 5 0
## [6,] 6 1
## [7,] 7 4
## [8,] 8 9
## [9,] 9 16
## [10,] 10 25
split(ma, colnames(ma))
## $x
## [1] 1 3 5 7 9 16 4 0 4 16
##
## $y
## [1] 2 4 6 8 10 9 1 1 9 25
split(ma, col(ma))
## $`1`
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $`2`
## [1] 16 9 4 1 0 1 4 9 16 25
split(ma, col(ma))[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
ma[,1] # easier alternative
## [1] 1 2 3 4 5 6 7 8 9 10
ma[,"x"] # another alternative
## [1] 1 2 3 4 5 6 7 8 9 10
xs <- 1:20
split(xs,factor(c("yes","no","yes","yes"))) # every four elements are attached to the given level sequence "yes","no","yes","yes"
## $no
## [1] 2 6 10 14 18
##
## $yes
## [1] 1 3 4 5 7 8 9 11 12 13 15 16 17 19 20
is <- split(xs,factor(c("yes","no","yes","yes")))[["yes"]]
xs[is]
## [1] 1 3 4 5 7 8 9 11 12 13 15 16 17 19 20
# Use of cut (from: www.sigmafield.org/2009/09/23/r-function-of-the-day-cut)
# generate some data
clinical.trial <-
data.frame(patient = 1:100, age = rnorm(100, mean = 60, sd = 8),
year = sample(paste("19", 85:99, sep = ""), 100, replace = T))
# Now, we will use the cut function to make age a factor, ie,
# age will be a categorical variable
c1 <- cut(clinical.trial$age, breaks = 4)
table(c1)
## c1
## (39.8,50.4] (50.4,61] (61,71.5] (71.5,82.1]
## 9 43 44 4
# year.enroll is a factor, so must convert to numeric first!
years <- as.numeric(as.character(clinical.trial$year))
years
## [1] 1999 1995 1991 1995 1996 1998 1989 1988 1987 1999 1991 1998 1994 1992
## [15] 1990 1989 1987 1996 1995 1987 1990 1990 1997 1994 1999 1999 1995 1989
## [29] 1996 1987 1992 1992 1997 1987 1988 1997 1985 1985 1989 1988 1985 1990
## [43] 1986 1999 1997 1997 1995 1986 1988 1996 1989 1987 1999 1991 1999 1994
## [57] 1990 1993 1995 1990 1988 1995 1995 1985 1989 1987 1989 1992 1988 1998
## [71] 1985 1998 1989 1986 1987 1993 1990 1999 1998 1998 1993 1998 1986 1997
## [85] 1991 1989 1989 1990 1994 1998 1990 1994 1994 1999 1987 1999 1985 1985
## [99] 1997 1986
c2 <- cut(years, breaks = 3)
table(c2)
## c2
## (1985,1990] (1990,1994] (1994,1999]
## 37 26 37
# The previous intervals where determined by R.
# We can force our own (in this case, I'll use 'seq')
c3 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10))
table(c3)
## c3
## (30,40] (40,50] (50,60] (60,70] (70,80]
## 1 7 41 45 5
# the interval size can also be variable:
age.cat <- function(x, lower = 0, upper, by = 10,
sep = "-", above.char = "+") {
labs <- c(paste(seq(lower, upper - by, by = by),
seq(lower + by - 1, upper - 1, by = by),
sep = sep),
paste(upper, above.char, sep = ""))
cut(floor(x), breaks = c(seq(lower, upper, by = by), Inf),
right = FALSE, labels = labs)
}
# This function categorizes age in a fairly flexible way. The first assignment to labs inside the function creates a vector of labels. Then, the cut function is called to do the work, with the custom labels as an argument.
# only specifying an upper bound, uses 0 as lower bound, and breaks up categories by 10
table(age.cat(clinical.trial$age, upper = 70))
##
## 0-9 10-19 20-29 30-39 40-49 50-59 60-69 70+
## 0 0 0 1 7 41 45 6
# now specifying a lower bound
table(age.cat(clinical.trial$age, lower = 30, upper = 70))
##
## 30-39 40-49 50-59 60-69 70+
## 1 7 41 45 6
# now specifying a lower bound AND the "by" argument
table(age.cat(clinical.trial$age, lower = 30, upper = 70, by = 5))
##
## 30-34 35-39 40-44 45-49 50-54 55-59 60-64 65-69 70+
## 0 1 1 6 17 24 25 20 6
subset
returns subsets of vectors, matrices or data frames which meet conditions.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
head(subset(airquality, Temp > 80, select = c(Ozone, Temp)))
## Ozone Temp
## 29 45 81
## 35 NA 84
## 36 NA 85
## 38 29 82
## 39 NA 87
## 40 71 90
head(subset(airquality, Day == 1, select = -Temp))
## Ozone Solar.R Wind Month Day
## 1 41 190 7.4 5 1
## 32 NA 286 8.6 6 1
## 62 135 269 4.1 7 1
## 93 39 83 6.9 8 1
## 124 96 167 6.9 9 1
head(subset(airquality, select = Ozone:Wind))
## Ozone Solar.R Wind
## 1 41 190 7.4
## 2 36 118 8.0
## 3 12 149 12.6
## 4 18 313 11.5
## 5 NA NA 14.3
## 6 28 NA 14.9