Let's say we want to load a data set from the web, inspect it, and save a local copy:
# Read a table directly from a URL (needs network access).
# NOTE(review): confirm this URL is still being served before relying on it.
fpe <- read.table("http://data.princeton.edu/wws509/datasets/effort.dat")
# Show the first rows; the printed output shows country names used as row
# names and three data columns.
head(fpe)
## setting effort change
## Bolivia 46 0 1
## Brazil 74 0 10
## Chile 89 16 29
## Colombia 77 16 25
## CostaRica 84 21 29
## Cuba 89 15 40
# Column names of the data frame
names(fpe)
## [1] "setting" "effort" "change"
# Number of rows (observations)
nrow(fpe)
## [1] 20
# Number of columns (variables)
ncol(fpe)
## [1] 3
# Per-column summary statistics (min, quartiles, mean, max)
summary(fpe)
## setting effort change
## Min. :35.0 Min. : 0.00 Min. : 0.0
## 1st Qu.:66.0 1st Qu.: 3.00 1st Qu.: 5.5
## Median :74.0 Median : 8.00 Median :10.5
## Mean :72.1 Mean : 9.55 Mean :14.3
## 3rd Qu.:84.0 3rd Qu.:15.25 3rd Qu.:22.8
## Max. :91.0 Max. :23.00 Max. :40.0
# Save the data frame in the working directory as a ';'-separated text file
write.table(fpe, file = "./effort.dat", sep = ";")
write.csv(fpe, file = "./effort.csv") # we can also save as a csv
Other functions are
Vectors contain only data from one class.
# --- Creating vectors ---
1:6 # integer sequence from 1 to 6
## [1] 1 2 3 4 5 6
rep(1, 6) # the value 1 repeated six times
## [1] 1 1 1 1 1 1
seq(0, 1, 0.1) # from 0 to 1 in steps of 0.1
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
# --- Accessing elements ---
v <- -5:5
v[1] # use operator [] to access vector elements, indexes start at 1
## [1] -5
v <- c(0.1, 0.2, 0.3, 0.4, 0.8, 0.9, 1, 1.5) # function c() is used to join vectors
v
## [1] 0.1 0.2 0.3 0.4 0.8 0.9 1.0 1.5
v[2:4] # subvector
## [1] 0.2 0.3 0.4
v[-1] # vector except the first element
## [1] 0.2 0.3 0.4 0.8 0.9 1.0 1.5
v[-3:-1] # vector except the first three elements
## [1] 0.4 0.8 0.9 1.0 1.5
length(v) # size of the vector
## [1] 8
v[length(v)] # last element
## [1] 1.5
v[-length(v)] # all except last element
## [1] 0.1 0.2 0.3 0.4 0.8 0.9 1.0
sum(v) # sum all vector elements
## [1] 5.2
# --- Empty / pre-allocated vectors ---
vector() # empty vector
## logical(0)
vector("numeric", 10) # ten zeros
## [1] 0 0 0 0 0 0 0 0 0 0
# --- Coercion ---
c(1.7, "a") # implicit coercion for vectors
## [1] "1.7" "a"
c(TRUE, 2) # TRUE becomes 1 (always spell out TRUE/FALSE: T and F can be reassigned)
## [1] 1 2
c("a", TRUE) # TRUE becomes the string "TRUE"
## [1] "a" "TRUE"
as.numeric(1:6) # explicit coercion
## [1] 1 2 3 4 5 6
as.logical(0:6) # 0 is FALSE, any other number is TRUE
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE
as.character(1:6)
## [1] "1" "2" "3" "4" "5" "6"
as.complex(1:6)
## [1] 1+0i 2+0i 3+0i 4+0i 5+0i 6+0i
as.numeric(c("a", "b", "c")) # but... strings that are not numbers become NA
## Warning: NAs introduced by coercion
## [1] NA NA NA
# --- Named vectors ---
v1 <- 1:3
names(v1) <- c("data1", "data2", "data3") # add names to elems
v1
## data1 data2 data3
## 1 2 3
v1["data1"] # accessing elements using names
## data1
## 1
letters # pre-defined vector
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# more complex operations
# (the variable is deliberately not called 'vector': that name would mask
# the base function vector() and make the code confusing to read)
nums <- seq(1, 100, 3)
nums
## [1] 1 4 7 10 13 16 19 22 25 28 31 34 37 40 43 46 49
## [18] 52 55 58 61 64 67 70 73 76 79 82 85 88 91 94 97 100
u <- nums %% 2 == 0 # TRUE only for the even numbers
u
## [1] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [12] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [23] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [34] TRUE
v <- nums[u] # logical subsetting: keep only the even numbers
v
## [1] 4 10 16 22 28 34 40 46 52 58 64 70 76 82 88 94 100
# str gives the structure of a data structure
str(v)
## num [1:17] 4 10 16 22 28 34 40 46 52 58 ...
# typeof gives the type
typeof(v)
## [1] "double"
# vectors are homogeneous structures, but R coerces to the most flexible
# type
c("a", 1)
## [1] "a" "1"
c(TRUE, 2)
## [1] 1 2
# subsetting
v <- 1:5
v[c(1, 2)] <- 10:11 # replace the first two elements
v
## [1] 10 11 3 4 5
v[-1] <- 20:23 # the number of selected elements should match the length of the RHS
v
## [1] 10 20 21 22 23
v[c(TRUE, FALSE)] <- 0 # a logical index is recycled if it is shorter than the vector
v
## [1] 0 20 0 22 0
# subsetting can be used for lookup tables:
x <- c("m", "f", "u", "f", "f", "m", "m")
lookup <- c(m = "Male", f = "Female", u = NA)
lookup[x] # index the named vector by the codes in x
## m f u f f m m
## "Male" "Female" NA "Female" "Female" "Male" "Male"
unname(lookup[x]) # drop the names to keep only the translated values
## [1] "Male" "Female" NA "Female" "Female" "Male" "Male"
# Matching and merging by hand
grades <- c(1, 2, 2, 3, 1)
# Lookup table with one row per possible grade
info <- data.frame(
  grade = 3:1,
  desc = c("Excellent", "Good", "Poor"),
  fail = c(FALSE, FALSE, TRUE)
)
# match() returns a vector of the positions of (first) matches of its
# first argument in its second
id <- match(grades, info$grade)
id
## [1] 3 2 2 1 3
info
## grade desc fail
## 1 3 Excellent FALSE
## 2 2 Good FALSE
## 3 1 Poor TRUE
info[id, ] # one row of 'info' per element of 'grades'
## grade desc fail
## 3 1 Poor TRUE
## 2 2 Good FALSE
## 2.1 2 Good FALSE
## 1 3 Excellent FALSE
## 3.1 1 Poor TRUE
# NA is a logical vector (of length one)!
typeof(NA)
## [1] "logical"
# The result is unknown: it would depend on the missing value
NA & TRUE
## [1] NA
# The result is FALSE whatever the missing value would be
NA & FALSE
## [1] FALSE
# There are also constants NA_integer_, NA_real_, NA_complex_ and
# NA_character_ (all are reserved words)
Matrices are vectors with a dimension attribute
m <- 1:16 # just a vector for now
m
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
class(m) # function to determine the object's class
## [1] "integer"
dim(m) <- c(4, 4) # make a matrix out of it (rows, columns); filled column by column
m
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
class(m)
## [1] "matrix"
## (R >= 4.0.0 prints "matrix" "array" here)
dim(m) <- c(2, 8) # same data, different shape
m
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1 3 5 7 9 11 13 15
## [2,] 2 4 6 8 10 12 14 16
dim(m) <- c(4, 2, 2) # make a 3D array
m
## , , 1
##
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
##
## , , 2
##
## [,1] [,2]
## [1,] 9 13
## [2,] 10 14
## [3,] 11 15
## [4,] 12 16
# matrix() builds the matrix directly; byrow = TRUE fills it row by row
m <- matrix(1:16, nrow = 2, ncol = 8, byrow = TRUE)
m
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1 2 3 4 5 6 7 8
## [2,] 9 10 11 12 13 14 15 16
# dimnames takes a list: row names first, then column names
m <- matrix(
  1:16,
  nrow = 2, ncol = 8,
  dimnames = list(c("row.1", "row.2"), letters[1:8])
)
m
## a b c d e f g h
## row.1 1 3 5 7 9 11 13 15
## row.2 2 4 6 8 10 12 14 16
# A 3 x 2 matrix, filled column by column
m <- matrix(1:6, 3, 2)
dim(m)
## [1] 3 2
m
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
# Changing dim() reshapes the same data into 2 x 3
dim(m) <- c(2, 3)
m
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# Indexing is [row, column]; an empty position selects everything
m[1, ] # first row
## [1] 1 3 5
m[, 2] # second column
## [1] 3 4
m[1, 2] # element in first row, 2nd col
## [1] 3
m[, c(1, 3)] # the first and third column
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
# Building matrices out of vectors
m1 <- 1:3
m2 <- 10:12
cbind(m1, m2) # matrix formation with binding cols or rows
## m1 m2
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
rbind(m1, m2) # each vector becomes a row
## [,1] [,2] [,3]
## m1 1 2 3
## m2 10 11 12
# Matrix arithmetic
m1 <- matrix(1:9, nrow = 3, ncol = 3)
m2 <- matrix(seq(18, 2, -2), nrow = 3, ncol = 3)
m1
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
m2
## [,1] [,2] [,3]
## [1,] 18 12 6
## [2,] 16 10 4
## [3,] 14 8 2
m1 + m2 # sum item by item
## [,1] [,2] [,3]
## [1,] 19 16 13
## [2,] 18 15 12
## [3,] 17 14 11
m1 * m2 # product item by item
## [,1] [,2] [,3]
## [1,] 18 48 42
## [2,] 32 50 32
## [3,] 42 48 18
m1 %*% m2 # real matrix multiplication
## [,1] [,2] [,3]
## [1,] 180 108 36
## [2,] 228 138 48
## [3,] 276 168 60
t(matrix(1:6, nrow = 2, ncol = 3)) # transpose
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
## [3,] 5 6
sum(1:5 * 5:1) # inner vector product
## [1] 35
outer(1:5, 5:1) # outer vector product
## [,1] [,2] [,3] [,4] [,5]
## [1,] 5 4 3 2 1
## [2,] 10 8 6 4 2
## [3,] 15 12 9 6 3
## [4,] 20 16 12 8 4
## [5,] 25 20 15 10 5
# A 5 x 3 matrix with ones on the diagonal and zeros elsewhere
diag(x = 1, nrow = 5, ncol = 3)
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
## [4,] 0 0 0
## [5,] 0 0 0
m3 <- diag(1:4) # makes a diagonal matrix using the vector to initialize diagonal
m3
## [,1] [,2] [,3] [,4]
## [1,] 1 0 0 0
## [2,] 0 2 0 0
## [3,] 0 0 3 0
## [4,] 0 0 0 4
# upper.tri() gives a logical mask of the upper triangle; diag = TRUE also
# includes the diagonal, so only the strict lower triangle is kept
m3[upper.tri(m3, diag = TRUE)] <- NA
m3
## [,1] [,2] [,3] [,4]
## [1,] NA NA NA NA
## [2,] 0 NA NA NA
## [3,] 0 0 NA NA
## [4,] 0 0 0 NA
Lists can contain values of different types (including lists)
# A list with two named elements of different types
l1 <- list(atr1 = 1:4, atr2 = 0.6)
l1
## $atr1
## [1] 1 2 3 4
##
## $atr2
## [1] 0.6
l1[1] # operator [] returns a sub-list
## $atr1
## [1] 1 2 3 4
l1["atr1"] # same thing
## $atr1
## [1] 1 2 3 4
l1[[1]] # operator [[]] extracts a single element
## [1] 1 2 3 4
class(l1[[1]]) # the element itself
## [1] "integer"
class(l1[1]) # still a list
## [1] "list"
l1$atr1 # operator $ extracts part of the object
## [1] 1 2 3 4
l1[["atr1"]] # same thing, except $ does partial matching
## [1] 1 2 3 4
l2 <- list(atr1 = 1:4, atr2 = 0.6, atr3 = "hello")
l2
## $atr1
## [1] 1 2 3 4
##
## $atr2
## [1] 0.6
##
## $atr3
## [1] "hello"
l2[c(1, 3)] # sub-list with the first and third elements
## $atr1
## [1] 1 2 3 4
##
## $atr3
## [1] "hello"
# A list that contains another list
l3 <- list(a = list(10, 12, 14), b = c(3.14, 2.81))
l3
## $a
## $a[[1]]
## [1] 10
##
## $a[[2]]
## [1] 12
##
## $a[[3]]
## [1] 14
##
##
## $b
## [1] 3.14 2.81
l3[[c(1, 3)]] # with [[ ]], a vector index is applied recursively
## [1] 14
l3[[1]][[3]] # same thing, one level at a time
## [1] 14
l3a <- list(a = list(b = list(c = list(d = 1))))
l3a
## $a
## $a$b
## $a$b$c
## $a$b$c$d
## [1] 1
l3a[[c("a", "b", "c", "d")]] # recursive indexing also works with names
## [1] 1
l4 <- list(aarvark = 1.5, ox = 3.4)
l4$a # partial matching is possible with $ (proceed with caution!)
## [1] 1.5
# Lists can be nested to any depth
l5 <- list(list(list(list())))
str(l5)
## List of 1
## $ :List of 1
## ..$ :List of 1
## .. ..$ : list()
is.recursive(l5) # returns TRUE if arg has a recursive (list-like) structure
## [1] TRUE
# c() will combine several lists into one. If given a combination of
# atomic vectors and lists, c() will coerce the vectors to list before
# combining them.
l6 <- list(list(1, 2), c(3, 4)) # list() keeps its two arguments as they are
l7 <- c(list(1, 2), c(3, 4)) # c() flattens them into a single list
str(l6)
## List of 2
## $ :List of 2
## ..$ : num 1
## ..$ : num 2
## $ : num [1:2] 3 4
str(l7)
## List of 4
## $ : num 1
## $ : num 2
## $ : num 3
## $ : num 4
# Coerce to a list with as.list(...), check with is.list(...), and
# convert a list back to a vector with unlist().
Lists are used to build up many of the more complicated data structures in R. For example, both data frames (described below) and linear model objects (as produced by lm()) are lists (ref: Hadley Wickham, *Advanced R*, "Data structures").
Data frames are used to store tabular data: they are lists of same-length vectors, vertically aligned. They are useful for keeping datasets.
# A data frame with an integer column and a logical column
df <- data.frame(col1 = 1:4, col2 = c(TRUE, TRUE, FALSE, TRUE))
df
## col1 col2
## 1 1 TRUE
## 2 2 TRUE
## 3 3 FALSE
## 4 4 TRUE
df$col1 # show a column, ie, an attribute
## [1] 1 2 3 4
df[, 2] # columns can also be selected by position
## [1] TRUE TRUE FALSE TRUE
df[1, ] # show a row, ie, an observation
## col1 col2
## 1 1 TRUE
df$newAtr <- letters[1:4] # add a new attribute
df
## col1 col2 newAtr
## 1 1 TRUE a
## 2 2 TRUE b
## 3 3 FALSE c
## 4 4 TRUE d
names(df) # the name of the columns
## [1] "col1" "col2" "newAtr"
row.names(df) # the name of the rows
## [1] "1" "2" "3" "4"
row.names(df) <- c("first", "second", "3rd", "4th")
df
## col1 col2 newAtr
## first 1 TRUE a
## second 2 TRUE b
## 3rd 3 FALSE c
## 4th 4 TRUE d
nrow(df) # number of rows
## [1] 4
ncol(df) # number of cols
## [1] 3
# Add a new observation. Supply one value per column: a shorter list would
# leave newAtr to be filled by recycling instead of by a deliberate value.
df[5, ] <- list(5, FALSE, "e")
# A logical column can be used directly as a row filter (no need for == TRUE)
df[df$col2, ] # select observations where col2 is true
## col1 col2 newAtr
## first 1 TRUE a
## second 2 TRUE b
## 4th 4 TRUE d
mean(df$col1) # find statistics over a certain column
## [1] 3
# It is possible for a data frame to have a column that is a list:
# each row of y then holds a whole vector, and the vectors may differ in length.
df <- data.frame(x = 1:3) # it is possible for a data frame to have a column that is a list:
df$y <- list(1:2, 1:3, 1:4)
df
## x y
## 1 1 1, 2
## 2 2 1, 2, 3
## 3 3 1, 2, 3, 4
All objects can have arbitrary additional attributes. These can be thought of as a named list (with unique names). Attributes can be accessed individually with attr() or all at once (as a list) with attributes().
v1 <- 1:5
attr(v1, "text") <- "this is a vector" # attach a custom attribute named "text"
v1 # attributes are printed after the values
## [1] 1 2 3 4 5
## attr(,"text")
## [1] "this is a vector"
str(v1)
## atomic [1:5] 1 2 3 4 5
## - attr(*, "text")= chr "this is a vector"
# NOTE(review): recent R versions appear to print 'int [1:5] ...' instead of
# 'atomic [1:5] ...' here -- confirm against the R version in use
# The structure() function returns a new object with modified attributes
structure(1:10, my_attribute = "This is a vector")
## [1] 1 2 3 4 5 6 7 8 9 10
## attr(,"my_attribute")
## [1] "This is a vector"
# There are 3 special attributes:
#  - names(), a character vector of element names
#  - class(), used to implement the S3 object system (described in the
#    next section)
#  - dim(), used to turn vectors into high-dimensional structures
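You can name a vector in three ways: when creating it, e.g. c(a = 1, 2, 3); by modifying an existing vector in place with names(x) <- ...; or by creating a modified copy with setNames().
Names should be unique.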
# Name elements at creation time; only the first element is given a name here
v1 <- c(a = 1, 2, 3)
v1
## a
## 1 2 3
# Replace all the names of an existing vector
names(v1) <- c("a", "b", "c")
v1
## a b c
## 1 2 3
names(v1) <- NULL # erase names
v1
## [1] 1 2 3
A factor is a vector that can contain only predefined values.
Factors have two key attributes: their class(), “factor”, which controls their behaviour; and their levels(), the set of allowed values.
Factors represent categorical data and can be ordered or unordered. A factor can be seen as an integer vector in which each integer has a label. Factors are commonly used when storing tabular data.
Check www.stat.berkeley.edu/classes/s133/factors.html for more information
# By default the levels are the sorted unique values
f1 <- factor(c("yes", "no", "yes", "yes"))
f1
## [1] yes no yes yes
## Levels: no yes
# make a contingency table, ie, displays the frequency distribution of the
# variables
table(f1)
## f1
## no yes
## 1 3
# redefine the order of the levels
f1a <- factor(c("yes", "no", "yes", "yes"), levels = c("yes", "no"))
f1a
## [1] yes no yes yes
## Levels: yes no
levels(f1)
## [1] "no" "yes"
# Egs of use
set.seed(143) # deterministic random generation
# NOTE: R 3.6.0 changed the default sampling algorithm, so newer versions
# of R draw different letters than the output recorded below
lets <- factor(sample(letters, size = 15, replace = TRUE))
lets
## [1] y a t k b y p p p k l e y h w
## Levels: a b e h k l p t w y
levels(lets)
## [1] "a" "b" "e" "h" "k" "l" "p" "t" "w" "y"
table(lets[1:10]) # levels that do not occur in the subset are counted as 0
##
## a b e h k l p t w y
## 1 1 0 0 2 0 3 1 0 2
# A strange eg: when a factor is used as an index, each value is replaced
# by i, the position of its level. Since "p" is a vector of length one, the
# result is not NA only where 'lets' has the value 'a' (the 1st level, so
# the index is 1)
"p"[lets]
## [1] NA "p" NA NA NA NA NA NA NA NA NA NA NA NA NA
# Indexing the levels by the factor gives back the original strings
levels(lets)[lets] # left as an eg :-)
## [1] "y" "a" "t" "k" "b" "y" "p" "p" "p" "k" "l" "e" "y" "h" "w"
While factors look (and often behave) like character vectors, they are actually integers under the hood and you need to be careful when treating them like strings. Some string methods (like gsub() and grepl()) will coerce factors to strings, while others (like nchar()) will throw an error, and still others (like c()) will use the underlying integer IDs. For this reason, it's usually best to explicitly convert factors to strings when modifying their levels.
Unfortunately, in versions of R before 4.0.0, most data loading functions automatically convert character vectors to factors. This is suboptimal, because there's no way for those functions to know the set of all possible levels and their optimal order. Instead, use the argument stringsAsFactors = FALSE to suppress this behaviour, and then manually convert character vectors to factors using your knowledge of the data. (Since R 4.0.0, stringsAsFactors = FALSE is the default, so this conversion no longer happens unless you ask for it.) (ref: Hadley Wickham, *Advanced R*, "Data structures")