[wikipedia] _In computer science, relational algebra is an offshoot of first-order logic and of algebra of sets concerned with operations over finitary relations, usually made more convenient to work with by identifying the components of a tuple by a name (called attribute) rather than by a numeric column index, which is what is called a relation in database terminology.
The main application of relational algebra is providing a theoretical foundation for relational databases, particularly query languages for such databases, chiefly among which is SQL._
sqldf
This package performs SQL queries on R data frames
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.1.1
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.1.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.1.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.1.1
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.1.1
## Loading required package: RSQLite.extfuns
## Warning: package 'RSQLite.extfuns' was built under R version 3.1.1
set.seed(101)
size = 50
get.name <- function(size)paste(sample(letters,size),collapse="")
mydf <- data.frame(id=1:size,
name=as.vector(Map(get.name, rep(5,size)), mode="character"),
time=rpois(size,100),
size=floor(runif(size,1,6)))
head(mydf)
## id name time size
## 1 1 jbrpf 92 5
## 2 2 hoiym 109 3
## 3 3 wryvk 103 4
## 4 4 pufja 107 5
## 5 5 sxfpu 92 1
## 6 6 ubjxo 102 3
sqldf("SELECT *
FROM mydf
LIMIT 6")
## Loading required package: tcltk
## id name time size
## 1 1 jbrpf 92 5
## 2 2 hoiym 109 3
## 3 3 wryvk 103 4
## 4 4 pufja 107 5
## 5 5 sxfpu 92 1
## 6 6 ubjxo 102 3
sqldf("SELECT *
FROM mydf
WHERE name IN ('megku','qdaym')")
## [1] id name time size
## <0 rows> (or 0-length row.names)
sqldf("SELECT *
FROM mydf
WHERE name LIKE '%a%'") # names with at least one 'a'
## id name time size
## 1 4 pufja 107 5
## 2 8 xftau 83 4
## 3 14 sahuw 119 1
## 4 17 rtswa 112 2
## 5 18 yikda 99 2
## 6 21 dajtp 102 4
## 7 22 icarv 92 2
## 8 23 danut 103 1
## 9 30 elpaj 113 4
## 10 34 zhvan 99 5
## 11 38 afcoe 109 3
## 12 39 fahgu 103 1
## 13 43 kumla 112 2
## 14 44 qimva 90 3
## 15 49 gvfea 91 3
sqldf("SELECT id, MAX(time)
FROM mydf")
## id MAX(time)
## 1 26 121
sqldf("SELECT *
FROM mydf
WHERE time BETWEEN 90 AND 93")
## id name time size
## 1 1 jbrpf 92 5
## 2 5 sxfpu 92 1
## 3 22 icarv 92 2
## 4 24 zvmhn 91 3
## 5 33 zrumg 92 5
## 6 44 qimva 90 3
## 7 45 hmlkj 92 2
## 8 49 gvfea 91 3
sqldf("SELECT *
FROM mydf
WHERE name BETWEEN 'a' AND 'g'")
## id name time size
## 1 11 ckqxj 96 4
## 2 21 dajtp 102 4
## 3 23 danut 103 1
## 4 30 elpaj 113 4
## 5 31 dlovc 105 5
## 6 32 fnslt 107 2
## 7 38 afcoe 109 3
## 8 39 fahgu 103 1
## 9 42 cvfrk 99 3
## 10 46 exdrw 101 5
## 11 47 fhgme 105 5
sqldf("SELECT time, count(*) AS n_refs
FROM mydf
GROUP BY time HAVING count(*) > 2")
## time n_refs
## 1 92 5
## 2 96 3
## 3 99 4
## 4 103 4
## 5 105 3
## 6 112 3
sqldf("SELECT *
FROM mydf
WHERE time = 103")
## id name time size
## 1 3 wryvk 103 4
## 2 10 gbvrc 103 5
## 3 23 danut 103 1
## 4 39 fahgu 103 1
sqldf("SELECT id, time
FROM mydf
WHERE time >= 70 AND time <= 86
ORDER BY time;")
## id time
## 1 35 81
## 2 8 83
## 3 37 84
sqldf("SELECT size, avg(time) as avg_time
FROM mydf
GROUP BY size;")
## size avg_time
## 1 1 103.56
## 2 2 99.12
## 3 3 99.30
## 4 4 102.88
## 5 5 101.20
size2 <- 200
mydf2 <- data.frame(id=sample(1:50,size2,replace=TRUE),
value=rexp(size2,1/50))
head(mydf2)
## id value
## 1 23 34.3517
## 2 26 19.9782
## 3 42 4.1858
## 4 46 0.2366
## 5 38 36.0750
## 6 39 41.6617
sqldf("SELECT count(*) AS n_values
FROM mydf2
GROUP BY id
LIMIT 12")
## n_values
## 1 3
## 2 2
## 3 4
## 4 5
## 5 2
## 6 4
## 7 6
## 8 5
## 9 3
## 10 2
## 11 1
## 12 5
sqldf("SELECT id, SUM(value) AS sum_vals
FROM mydf2
GROUP BY id
HAVING SUM(value) > 300")
## id sum_vals
## 1 4 321.9
## 2 12 340.2
## 3 18 440.5
## 4 23 395.6
## 5 30 641.2
## 6 31 306.4
## 7 32 311.7
## 8 36 395.9
## 9 40 338.4
## 10 45 542.7
# Let the joins begin!
# INNER JOIN, ie, returns rows when there is at least one match in both tables
sqldf("SELECT mydf.name, mydf2.value
FROM mydf JOIN mydf2
ON mydf.id = mydf2.id
ORDER BY mydf.name
LIMIT 12")
## name value
## 1 afcoe 36.075
## 2 afcoe 48.312
## 3 afcoe 48.471
## 4 afcoe 136.284
## 5 ckqxj 27.994
## 6 cvfrk 4.186
## 7 cvfrk 42.393
## 8 dajtp 14.256
## 9 dajtp 26.939
## 10 dajtp 38.055
## 11 dajtp 42.254
## 12 dajtp 48.238
# LEFT (OUTER) JOIN keyword returns all rows from the left table (table_name1),
# even if there are no matches in the right table (table_name2).
sqldf("SELECT mydf.name, mydf2.value
FROM mydf LEFT JOIN mydf2
ON mydf.id = mydf2.id
ORDER BY mydf.name
LIMIT 12") # in this case there is no difference, all id's have values in mydf2
## name value
## 1 afcoe 36.075
## 2 afcoe 48.312
## 3 afcoe 48.471
## 4 afcoe 136.284
## 5 ckqxj 27.994
## 6 cvfrk 4.186
## 7 cvfrk 42.393
## 8 dajtp 14.256
## 9 dajtp 26.939
## 10 dajtp 38.055
## 11 dajtp 42.254
## 12 dajtp 48.238
mydf3 <- data.frame(id=sample(51:60,size2,replace=TRUE),
value=rexp(size2,1/50))
# UNION combines the result-set of two or more SELECT statements
# It does not include repetitions (for that, use UNION ALL)
sqldf("SELECT mydf2.id FROM mydf2
UNION
SELECT mydf3.id FROM mydf3
ORDER BY id DESC
LIMIT 12")
## mydf2.id
## 1 60
## 2 59
## 3 58
## 4 57
## 5 56
## 6 55
## 7 54
## 8 53
## 9 52
## 10 51
## 11 50
## 12 49
df <- read.csv("effort.csv")
head(df, 10)
## X setting effort change
## 1 Bolivia 46 0 1
## 2 Brazil 74 0 10
## 3 Chile 89 16 29
## 4 Colombia 77 16 25
## 5 CostaRica 84 21 29
## 6 Cuba 89 15 40
## 7 DominicanRep 68 14 21
## 8 Ecuador 70 6 0
## 9 ElSalvador 60 13 13
## 10 Guatemala 55 9 4
df <- read.csv.sql("effort.csv", "SELECT * FROM file WHERE effort>10")
head(df, 10)
## X setting effort change
## 1 "Chile" 89 16 29
## 2 "Colombia" 77 16 25
## 3 "CostaRica" 84 21 29
## 4 "Cuba" 89 15 40
## 5 "DominicanRep" 68 14 21
## 6 "ElSalvador" 60 13 13
## 7 "Jamaica" 87 23 21
## 8 "Panama" 84 19 22
## 9 "TrinidadTobago" 84 15 29
df <- read.csv.sql("effort.csv", "SELECT X, effort FROM file")
head(df, 10)
## X effort
## 1 "Bolivia" 0
## 2 "Brazil" 0
## 3 "Chile" 16
## 4 "Colombia" 16
## 5 "CostaRica" 21
## 6 "Cuba" 15
## 7 "DominicanRep" 14
## 8 "Ecuador" 6
## 9 "ElSalvador" 13
## 10 "Guatemala" 9
df <- read.csv.sql("effort.csv", "SELECT X, effort FROM file ORDER BY X DESC")
head(df, 10)
## X effort
## 1 "Venezuela" 7
## 2 "TrinidadTobago" 15
## 3 "Peru" 0
## 4 "Paraguay" 3
## 5 "Panama" 19
## 6 "Nicaragua" 0
## 7 "Mexico" 4
## 8 "Jamaica" 23
## 9 "Honduras" 7
## 10 "Haiti" 3
df <- read.csv.sql("effort.csv", "SELECT effort, COUNT(*) AS count FROM file GROUP BY effort")
head(df, 10)
## effort count
## 1 0 4
## 2 3 2
## 3 4 1
## 4 6 1
## 5 7 2
## 6 9 1
## 7 13 1
## 8 14 1
## 9 15 2
## 10 16 2
relations
Data Structures and Algorithms for for k-ary relations with arbitrary domains, featuring relational algebra, predicate functions, and fitters for consensus relations
Check http://cran.r-project.org/web/packages/relations/index.html
library(relations)
## Warning: package 'relations' was built under R version 3.1.1
PersonDF <-
data.frame(Name = c("Harry", "Sally", "George", "Helena", "Peter"),
Age = c(34, 28, 29, 54, 34),
Weight = c(80, 64, 70, 54, 80),
stringsAsFactors = FALSE)
Person <- as.relation(PersonDF)
## see relation
relation_table(Person)
## Name Age Weight
## Helena 54 54
## Sally 28 64
## George 29 70
## Harry 34 80
## Peter 34 80
## projection
relation_table(relation_projection(Person, c("Age", "Weight")))
## Age Weight
## 54 54
## 28 64
## 29 70
## 34 80
## selection
relation_table(R1 <- relation_selection(Person, Age < 29))
## Name Age Weight
## Sally 28 64
relation_table(R2 <- relation_selection(Person, Age >= 34))
## Name Age Weight
## Helena 54 54
## Harry 34 80
## Peter 34 80
relation_table(R3 <- relation_selection(Person, Age == Weight))
## Name Age Weight
## Helena 54 54
## union
relation_table(R1 %U% R2)
## Name Age Weight
## Helena 54 54
## Sally 28 64
## Harry 34 80
## Peter 34 80
## works only for the same domains:
relation_table(R2 | R3)
## Name Age Weight
## Helena 54 54
## Harry 34 80
## Peter 34 80
## complement
relation_table(Person - R2)
## Name Age Weight
## Sally 28 64
## George 29 70
## intersection
relation_table(relation_intersection(R2, R3))
## Name Age Weight
## Helena 54 54
## works only for the same domains:
relation_table(R2 & R3)
## Name Age Weight
## Helena 54 54
## symmetric difference
relation_table(relation_symdiff(R2, R3))
## Name Age Weight
## Harry 34 80
## Peter 34 80
## cartesian product
Employee <-
data.frame(Name = c("Harry", "Sally", "George", "Harriet", "John"),
EmpId = c(3415, 2241, 3401, 2202, 3999),
DeptName = c("Finance", "Sales", "Finance", "Sales", "N.N."),
stringsAsFactors = FALSE)
Employee <- as.relation(Employee)
relation_table(Employee)
## Name EmpId DeptName
## George 3401 Finance
## Harry 3415 Finance
## John 3999 N.N.
## Harriet 2202 Sales
## Sally 2241 Sales
Dept <- data.frame(DeptName = c("Finance", "Sales", "Production"),
Manager = c("George", "Harriet", "Charles"),
stringsAsFactors = FALSE)
Dept <- as.relation(Dept)
relation_table(Dept)
## DeptName Manager
## Production Charles
## Finance George
## Sales Harriet
relation_table(Employee %><% Dept)
## Name EmpId DeptName DeptName Manager
## George 3401 Finance Production Charles
## Harry 3415 Finance Production Charles
## John 3999 N.N. Production Charles
## Harriet 2202 Sales Production Charles
## Sally 2241 Sales Production Charles
## George 3401 Finance Finance George
## Harry 3415 Finance Finance George
## John 3999 N.N. Finance George
## Harriet 2202 Sales Finance George
## Sally 2241 Sales Finance George
## George 3401 Finance Sales Harriet
## Harry 3415 Finance Sales Harriet
## John 3999 N.N. Sales Harriet
## Harriet 2202 Sales Sales Harriet
## Sally 2241 Sales Sales Harriet
## Natural join
relation_table(Employee %|><|% Dept)
## Name EmpId DeptName Manager
## George 3401 Finance George
## Harry 3415 Finance George
## Harriet 2202 Sales Harriet
## Sally 2241 Sales Harriet
## left (outer) join
relation_table(Employee %=><% Dept)
## Name EmpId DeptName Manager
## George 3401 Finance George
## Harry 3415 Finance George
## Harriet 2202 Sales Harriet
## Sally 2241 Sales Harriet
## John 3999 N.N. NA
## right (outer) join
relation_table(Employee %><=% Dept)
## Name EmpId DeptName Manager
## NA NA Production Charles
## George 3401 Finance George
## Harry 3415 Finance George
## Harriet 2202 Sales Harriet
## Sally 2241 Sales Harriet
## full outer join
relation_table(Employee %=><=% Dept)
## Name EmpId DeptName Manager
## NA NA Production Charles
## George 3401 Finance George
## Harry 3415 Finance George
## Harriet 2202 Sales Harriet
## Sally 2241 Sales Harriet
## John 3999 N.N. NA
## antijoin
relation_table(Employee %|>% Dept)
## Name EmpId DeptName
## John 3999 N.N.
relation_table(Employee %<|% Dept)
## DeptName Manager
## Production Charles
## semijoin
relation_table(Employee %|><% Dept)
## Name EmpId DeptName
## George 3401 Finance
## Harry 3415 Finance
## Harriet 2202 Sales
## Sally 2241 Sales
relation_table(Employee %><|% Dept)
## DeptName Manager
## Finance George
## Sales Harriet
## division
Completed <-
data.frame(Student = c("Fred", "Fred", "Fred", "Eugene",
"Eugene", "Sara", "Sara"),
Task = c("Database1", "Database2", "Compiler1",
"Database1", "Compiler1", "Database1",
"Database2"),
stringsAsFactors = FALSE)
Completed <- as.relation(Completed)
relation_table(Completed)
## Student Task
## Eugene Compiler1
## Fred Compiler1
## Eugene Database1
## Fred Database1
## Sara Database1
## Fred Database2
## Sara Database2
DBProject <- data.frame(Task = c("Database1", "Database2"),
stringsAsFactors = FALSE)
DBProject <- as.relation(DBProject)
relation_table(DBProject)
## Task
## Database1
## Database2
relation_table(Completed %/% DBProject)
## Student
## Fred
## Sara
## division remainder
relation_table(Completed %% DBProject)
## Student Task
## Eugene Compiler1
## Fred Compiler1
## Eugene Database1