Herein we are using the open-source database postgreSQL (more details below).
All databases and commands to include them in your computer’s postgreSQL are here.
How to access a postgreSQL database using R:
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
db <- dbConnect(drv, dbname="dvdrental", user="postgres", password="admin")
And to execute a query over the chosen database:
query <- "SELECT
*
FROM
customer;"
data <- dbGetQuery(db, query)
head(data)
## customer_id store_id first_name last_name
## 1 524 1 Jared Ely
## 2 1 1 Mary Smith
## 3 2 1 Patricia Johnson
## 4 3 1 Linda Williams
## 5 4 2 Barbara Jones
## 6 5 1 Elizabeth Brown
## email address_id activebool create_date
## 1 jared.ely@sakilacustomer.org 530 TRUE 2006-02-14
## 2 mary.smith@sakilacustomer.org 5 TRUE 2006-02-14
## 3 patricia.johnson@sakilacustomer.org 6 TRUE 2006-02-14
## 4 linda.williams@sakilacustomer.org 7 TRUE 2006-02-14
## 5 barbara.jones@sakilacustomer.org 8 TRUE 2006-02-14
## 6 elizabeth.brown@sakilacustomer.org 9 TRUE 2006-02-14
## last_update active
## 1 2013-05-26 14:49:45 1
## 2 2013-05-26 14:49:45 1
## 3 2013-05-26 14:49:45 1
## 4 2013-05-26 14:49:45 1
## 5 2013-05-26 14:49:45 1
## 6 2013-05-26 14:49:45 1
So SQL is a declarative language that specify relational algebra operations over tables (where rows represent records, and columns record attributes) in a relational database. Check wikipedia for more info.
This exercises are from http://sqlschool.modeanalytics.com/
A query is the SQL way to retrieve information from one or more tables.
This is an eg of projection, ie, selection of just some columns:
query <- "SELECT
first_name, email
FROM
customer;"
data <- dbGetQuery(db, query)
head(data)
## first_name email
## 1 Jared jared.ely@sakilacustomer.org
## 2 Mary mary.smith@sakilacustomer.org
## 3 Patricia patricia.johnson@sakilacustomer.org
## 4 Linda linda.williams@sakilacustomer.org
## 5 Barbara barbara.jones@sakilacustomer.org
## 6 Elizabeth elizabeth.brown@sakilacustomer.org
Rename the resulting columns:
query <- 'SELECT
first_name AS "FIRST NAME",
email AS "EMAIL"
FROM
customer;'
data <- dbGetQuery(db, query)
head(data)
## FIRST NAME EMAIL
## 1 Jared jared.ely@sakilacustomer.org
## 2 Mary mary.smith@sakilacustomer.org
## 3 Patricia patricia.johnson@sakilacustomer.org
## 4 Linda linda.williams@sakilacustomer.org
## 5 Barbara barbara.jones@sakilacustomer.org
## 6 Elizabeth elizabeth.brown@sakilacustomer.org
Fix a limit on the queried data:
query <- "SELECT
first_name, email
FROM
customer
LIMIT 3;"
data <- dbGetQuery(db, query)
head(data)
## first_name email
## 1 Jared jared.ely@sakilacustomer.org
## 2 Mary mary.smith@sakilacustomer.org
## 3 Patricia patricia.johnson@sakilacustomer.org
Another relational algebra operation is a filter:
query <- "SELECT
first_name, address_id, store_id
FROM
customer
WHERE
(address_id > 15 AND first_name <= 'm') OR store_id != 1;"
data <- dbGetQuery(db, query)
head(data,10)
## first_name address_id store_id
## 1 Jared 530 1
## 2 Barbara 8 2
## 3 Jennifer 10 2
## 4 Susan 12 2
## 5 Margaret 13 2
## 6 Lisa 15 2
## 7 Karen 17 2
## 8 Betty 18 2
## 9 Helen 19 1
## 10 Sandra 20 2
The logical operators are AND
, OR
and NOT
To mix two columns:
query <- "SELECT
customer_id,
store_id,
1000 * customer_id + store_id AS mix_id
FROM
customer;"
data <- dbGetQuery(db, query)
head(data)
## customer_id store_id mix_id
## 1 524 1 524001
## 2 1 1 1001
## 3 2 1 2001
## 4 3 1 3001
## 5 4 2 4002
## 6 5 1 5001
To make sorts:
query <- "SELECT
first_name,
last_name
FROM
customer
ORDER BY
first_name DESC, last_name;" # first name by decreasing order, then last name by increasing order
data <- dbGetQuery(db, query)
head(data,7)
## first_name last_name
## 1 Zachary Hite
## 2 Yvonne Watkins
## 3 Yolanda Weaver
## 4 Wilma Richards
## 5 Willie Howell
## 6 Willie Markham
## 7 William Satterfield
Using (simple) regular expressions:
query <- "SELECT
first_name,
last_name
FROM
customer
WHERE
first_name LIKE 'W%a' OR last_name LIKE 'J_n%';" # % means 0+ chars, _ means 1 char
data <- dbGetQuery(db, query)
head(data)
## first_name last_name
## 1 Barbara Jones
## 2 Louise Jenkins
## 3 Wanda Patterson
## 4 Wilma Richards
## 5 Lena Jensen
## 6 Naomi Jennings
ILIKE
is for case insensitive.
IN
is used to compare with a set of values:
query <- "SELECT
first_name, -- btw, this is a SQL comment
last_name
FROM
customer
/*
this is a multiline comment
*/
WHERE
first_name IN ('Mary', 'Barbara');"
data <- dbGetQuery(db, query)
head(data)
## first_name last_name
## 1 Mary Smith
## 2 Barbara Jones
The next egs use this google stock csv. To include it in postgreSQL do:
CREATE DATABASE stocks;
\c stocks;
CREATE TABLE stocks ( DATE_STOCK varchar(8), YEAR integer, MONTH integer, OPEN double precision, HIGH double precision, LOW double precision, CLOSE double precision, VOLUME integer, id integer PRIMARY KEY );
\COPY stocks FROM 'C:\Users\jpn\Downloads\stocks.csv' DELIMITER ',' CSV HEADER;
(adjust to your local dir)Now we can access it here:
db2 <- dbConnect(drv, dbname="stocks", user="postgres", password="admin")
query <- "SELECT
*
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data,10)
## date_stock year month open high low close volume id
## 1 1/30/14 2014 1 502.54 506.50 496.70 499.78 24182996 1
## 2 1/29/14 2014 1 503.95 507.37 498.62 500.75 17991828 2
## 3 1/28/14 2014 1 508.76 515.00 502.07 506.50 38119083 3
## 4 1/27/14 2014 1 550.07 554.80 545.75 550.50 20602736 4
## 5 1/24/14 2014 1 554.00 555.62 544.75 546.07 15483491 5
## 6 1/23/14 2014 1 549.94 556.50 544.81 556.18 14425478 6
## 7 1/22/14 2014 1 550.91 557.29 547.81 551.51 13602762 7
## 8 1/21/14 2014 1 540.99 550.07 540.42 549.07 11750792 8
## 9 1/17/14 2014 1 551.48 552.07 539.90 540.67 15489527 9
## 10 1/16/14 2014 1 554.90 556.85 551.68 554.25 8210190 10
We can aggregate information into some available functions:
query <- "SELECT
COUNT(*), -- count the number of records
COUNT(high) AS highs, -- count the number of not null values of column 'high'
SUM(high) AS sum_highs, -- null values are treated as zero
MIN(volume) AS min_volume,
MAX(volume) AS max_volume,
AVG(low) AS avg_low -- ignore nulls
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data,10)
## count highs sum_highs min_volume max_volume avg_low
## 1 100 99 52176.27 5984105 38119083 518.493
We can mix the aggregators with grouping the table into values:
query <- "SELECT
year,
COUNT(*), -- count the number of records
COUNT(high) AS highs, -- count the number of not null values of column 'high'
SUM(high) AS sum_highs, -- null values are treated as zero
MIN(volume) AS min_volume,
MAX(volume) AS max_volume,
AVG(low) AS avg_low -- ignore nulls
FROM
stocks
GROUP BY
year;"
data <- dbGetQuery(db2, query)
head(data,10)
## year count highs sum_highs min_volume max_volume avg_low
## 1 2013 80 80 41824.02 5984105 31932384 514.3559
## 2 2014 20 19 10352.25 8210190 38119083 535.0415
Groups can be nested:
query <- "SELECT
year, month,
COUNT(*), -- count the number of records
COUNT(high) AS highs, -- count the number of not null values of column 'high'
SUM(high) AS sum_highs, -- null values are treated as zero
MIN(volume) AS min_volume,
MAX(volume) AS max_volume,
AVG(low) AS avg_low -- ignore nulls
FROM
stocks
GROUP BY
year, month
ORDER BY
year, month;"
data <- dbGetQuery(db2, query)
head(data,10)
## year month count highs sum_highs min_volume max_volume avg_low
## 1 2013 9 16 16 7715.25 8144247 31932384 473.1256
## 2 2013 10 23 23 11700.72 8967859 22707445 499.5426
## 3 2013 11 20 20 10556.59 6935114 14335104 520.2230
## 4 2013 12 21 21 11851.46 5984105 20209401 556.4057
## 5 2014 1 20 19 10352.25 8210190 38119083 535.0415
Distinct values can be selected:
query <- "SELECT DISTINCT
month
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data,10)
## month
## 1 11
## 2 12
## 3 1
## 4 9
## 5 10
With more than one column, SQL returns all the distinct tuples it finds:
query <- "SELECT DISTINCT
month, year
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data)
## month year
## 1 9 2013
## 2 10 2013
## 3 12 2013
## 4 1 2014
## 5 11 2013
We can use it with count
query <- "SELECT
COUNT(DISTINCT month) -- how many distinct months?
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data)
## count
## 1 5
For the next egs let’s use this csv.
db3 <- dbConnect(drv, dbname="rosters", user="postgres", password="admin")
query <- "SELECT
*
FROM
players;"
data <- dbGetQuery(db3, query)
head(data,10)
## school_full_name school_name name position height weight
## 1 Cincinnati Bearcats Cincinnati Ralph Abernathy RB 67 161
## 2 Cincinnati Bearcats Cincinnati Mekale McKay WR 78 195
## 3 Cincinnati Bearcats Cincinnati Trenier Orr CB 71 177
## 4 Cincinnati Bearcats Cincinnati Bennie Coney QB 75 216
## 5 Cincinnati Bearcats Cincinnati Johnny Holton WR 75 190
## 6 Cincinnati Bearcats Cincinnati Howard Wilder DB 71 180
## 7 Cincinnati Bearcats Cincinnati Munchie Legaux QB 77 200
## 8 Cincinnati Bearcats Cincinnati Mark Barr WR 73 163
## 9 Cincinnati Bearcats Cincinnati Aaron Brown CB 71 172
## 10 Cincinnati Bearcats Cincinnati Anthony McClung WR 73 177
## year home_town state id
## 1 JR ATLANTA, GA GA 1
## 2 SO LOUISVILLE, KY KY 2
## 3 SO WINTER GARDEN, FL FL 3
## 4 FR PLANT CITY, FL FL 4
## 5 JR MIAMI, FL FL 5
## 6 JR SEA ISLAND, GA GA 6
## 7 SR NEW ORLEANS, LA LA 7
## 8 FR FORT LAUDERDALE, FL FL 8
## 9 FR MIAMI, FL FL 9
## 10 SR INDIANAPOLIS, IN IN 10
To deal with conditional statements, SQL uses CASE WHEN ... THEN ... ELSE ... END
query <- "SELECT
name,
year,
CASE WHEN year = 'SR' -- create a new column where only senior players have a 'yes' values
THEN 'yes'
ELSE NULL
END AS is_a_senior
FROM
players;"
data <- dbGetQuery(db3, query)
head(data,10)
## name year is_a_senior
## 1 Ralph Abernathy JR <NA>
## 2 Mekale McKay SO <NA>
## 3 Trenier Orr SO <NA>
## 4 Bennie Coney FR <NA>
## 5 Johnny Holton JR <NA>
## 6 Howard Wilder JR <NA>
## 7 Munchie Legaux SR yes
## 8 Mark Barr FR <NA>
## 9 Aaron Brown FR <NA>
## 10 Anthony McClung SR yes
It’s possible to have a sequence of conditionals:
query <- "SELECT
name,
weight,
CASE WHEN weight > 250 THEN 'over 250'
WHEN weight > 200 THEN '200-250'
WHEN weight > 175 THEN '175-200'
ELSE 'under 175'
END AS weight_group
FROM
players;"
data <- dbGetQuery(db3, query)
head(data,10)
## name weight weight_group
## 1 Ralph Abernathy 161 under 175
## 2 Mekale McKay 195 175-200
## 3 Trenier Orr 177 175-200
## 4 Bennie Coney 216 200-250
## 5 Johnny Holton 190 175-200
## 6 Howard Wilder 180 175-200
## 7 Munchie Legaux 200 175-200
## 8 Mark Barr 163 under 175
## 9 Aaron Brown 172 under 175
## 10 Anthony McClung 177 175-200
Conditional statements can be mixed with aggregatorsto achieve more expression power:
query <- "SELECT
CASE WHEN year = 'FR'
THEN 'FR'
ELSE 'Not FR'
END AS year_group,
COUNT(1) AS count -- 1 means the first column of the select, aka year_group
FROM
players
GROUP BY 1;"
data <- dbGetQuery(db3, query)
head(data)
## year_group count
## 1 Not FR 3018
## 2 FR 1982
Another eg:
query <- "SELECT
CASE WHEN year = 'FR' THEN 'FR'
WHEN year = 'SO' THEN 'SO'
WHEN year = 'JR' THEN 'JR'
WHEN year = 'SR' THEN 'SR'
ELSE 'No Year Data'
END AS year_group,
COUNT(1) AS count
FROM
players
GROUP BY 1;"
data <- dbGetQuery(db3, query)
head(data)
## year_group count
## 1 JR 1086
## 2 FR 1982
## 3 SR 846
## 4 SO 1086
The previous eg now rotated, aka, pivoting:
query <- "SELECT
COUNT(CASE WHEN year = 'JR' THEN 1 ELSE NULL END) AS jr_count,
COUNT(CASE WHEN year = 'FR' THEN 1 ELSE NULL END) AS fr_count,
COUNT(CASE WHEN year = 'SR' THEN 1 ELSE NULL END) AS sr_count,
COUNT(CASE WHEN year = 'SO' THEN 1 ELSE NULL END) AS so_count
FROM
players;"
data <- dbGetQuery(db3, query)
head(data)
## jr_count fr_count sr_count so_count
## 1 1086 1982 846 1086
Joins permit that two or more tables can be associated to recover common associated information.
In the next egs we use a 2nd table names teams
from the same database (csv).
query <- "SELECT
*
FROM
teams;"
data <- dbGetQuery(db3, query)
head(data)
## division conference school_name
## 1 FBS (Division I-A Teams) American Athletic Cincinnati
## 2 FBS (Division I-A Teams) American Athletic Connecticut
## 3 FBS (Division I-A Teams) American Athletic Houston
## 4 FBS (Division I-A Teams) American Athletic Louisville
## 5 FBS (Division I-A Teams) American Athletic Memphis
## 6 FBS (Division I-A Teams) American Athletic Rutgers
## roster_url id
## 1 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 2 http://espn.go.com/ncf/teams/roster?teamId=41 2
## 3 http://espn.go.com/ncf/teams/roster?teamId=248 3
## 4 http://espn.go.com/ncf/teams/roster?teamId=97 4
## 5 http://espn.go.com/ncf/teams/roster?teamId=235 5
## 6 http://espn.go.com/ncf/teams/roster?teamId=164 6
Say we want to know which conference (in teams table) has the highest average weight (in players table):
query <- "SELECT
* -- this is the full table returned by join, not the solution
FROM
players JOIN teams -- this is the new table, the joined table, where select is going to operate
ON players.school_name = teams.school_name;"
data <- dbGetQuery(db3, query)
head(data)
## school_full_name school_name name position height weight
## 1 Cincinnati Bearcats Cincinnati Ralph Abernathy RB 67 161
## 2 Cincinnati Bearcats Cincinnati Mekale McKay WR 78 195
## 3 Cincinnati Bearcats Cincinnati Trenier Orr CB 71 177
## 4 Cincinnati Bearcats Cincinnati Bennie Coney QB 75 216
## 5 Cincinnati Bearcats Cincinnati Johnny Holton WR 75 190
## 6 Cincinnati Bearcats Cincinnati Howard Wilder DB 71 180
## year home_town state id division
## 1 JR ATLANTA, GA GA 1 FBS (Division I-A Teams)
## 2 SO LOUISVILLE, KY KY 2 FBS (Division I-A Teams)
## 3 SO WINTER GARDEN, FL FL 3 FBS (Division I-A Teams)
## 4 FR PLANT CITY, FL FL 4 FBS (Division I-A Teams)
## 5 JR MIAMI, FL FL 5 FBS (Division I-A Teams)
## 6 JR SEA ISLAND, GA GA 6 FBS (Division I-A Teams)
## conference school_name
## 1 American Athletic Cincinnati
## 2 American Athletic Cincinnati
## 3 American Athletic Cincinnati
## 4 American Athletic Cincinnati
## 5 American Athletic Cincinnati
## 6 American Athletic Cincinnati
## roster_url id
## 1 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 2 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 3 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 4 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 5 http://espn.go.com/ncf/teams/roster?teamId=2132 1
## 6 http://espn.go.com/ncf/teams/roster?teamId=2132 1
query <- "SELECT
teams.conference AS conference,
AVG(players.weight) AS average_weight
FROM
players JOIN teams
ON teams.school_name = players.school_name
GROUP BY teams.conference
ORDER BY AVG(players.weight) DESC;"
data <- dbGetQuery(db3, query)
head(data)
## conference average_weight
## 1 ACC 228.9955
## 2 Big Ten 228.1336
## 3 American Athletic 227.9795
## 4 Big 12 226.4017
This JOIN
is an inner join, ie, it will only show rows that satisfy the ON
condition. It is an intersection of the two tables (check outer joins below).
If there are columns in different tables with the same names but different information, we should use AS
to give them distinct names.
These are are tables for the next egs:
db4 <- dbConnect(drv, dbname="crunchbase", user="postgres", password="admin")
query <- "SELECT
*
FROM
companies;"
data <- dbGetQuery(db4, query)
head(data)
## permalink name homepage
## 1 /company/8868 8868 http://www.8868.cn
## 2 /company/21e6 2.10E+07 <NA>
## 3 /company/club-domains .Club Domains http://dotclub.com
## 4 /company/fox-networks .Fox Networks http://www.dotfox.com
## 5 /company/a-list-games [a]list games http://www.alistgames.com
## 6 /company/pay-mobile-checkout @Pay http://atpay.com
## category_code funding_total_usd status country_code state_code
## 1 <NA> NA operating <NA> <NA>
## 2 <NA> 5050000 operating USA CA
## 3 software 7000000 operating USA FL
## 4 advertising 4912394 closed ARG <NA>
## 5 games_video 9300000 operating <NA> <NA>
## 6 mobile 3500000 operating USA NM
## region city funding_rounds id
## 1 unknown <NA> 1 1
## 2 SF Bay San Francisco 1 2
## 3 Fort Lauderdale Oakland Park 1 3
## 4 Buenos Aires Buenos Aires 1 4
## 5 unknown <NA> 1 5
## 6 Albuquerque Albuquerque 1 6
nrow(data)
## [1] 27325
query <- "SELECT
*
FROM
acquisitions;"
data <- dbGetQuery(db4, query)
head(data)
## company_permalink
## 1 /company/waywire
## 2 /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4 /company/1-800-contacts-2
## 5 /company/1000memories
## 6 /company/10best
## company_name company_category_code
## 1 #waywire news
## 2 1 Nation Technology <NA>
## 3 1 Stop Financial Service Centers of America <NA>
## 4 1-800 Contacts <NA>
## 5 1000memories web
## 6 10best web
## company_country_code company_state_code company_region company_city
## 1 USA NY New York New York
## 2 <NA> <NA> unknown <NA>
## 3 USA TX Austin Round Rock
## 4 <NA> <NA> unknown <NA>
## 5 USA CA SF Bay San Francisco
## 6 USA SC Greenville Greenville
## acquirer_permalink acquirer_name
## 1 /company/magnify Magnify
## 2 /company/vology Vology
## 3 /company/confie-seguros Confie Seguros
## 4 /company/thomas-h-lee-partners Thomas H. Lee Partners
## 5 /company/ancestry-com Ancestry
## 6 /company/nile-guide NileGuide
## acquirer_category_code acquirer_country_code acquirer_state_code
## 1 games_video USA NY
## 2 other <NA> <NA>
## 3 enterprise USA CA
## 4 <NA> USA MA
## 5 ecommerce USA UT
## 6 social USA CA
## acquirer_region acquirer_city price_amount id
## 1 New York New York NA 1
## 2 unknown <NA> NA 2
## 3 Los Angeles Buena Park NA 3
## 4 Boston Boston NA 4
## 5 Salt Lake City Provo NA 5
## 6 SF Bay San Francisco NA 6
nrow(data)
## [1] 5000
There are three types of Outer Join: + Left Outer Join, which includes unmatched rows from the left table + Right Outer Join, which includes unmatched rows from the right table (ie, T2 LEFT JOIN T1
produces the same results as T1 RIGHT JOIN T2
) + Full Outer Join, which includes unmatched rows from both tables
First the inner join:
query <- "SELECT
companies.permalink AS companies_permalink,
companies.name AS companies_name,
acquisitions.company_permalink AS acquisitions_permalink
FROM companies JOIN acquisitions
ON companies.permalink = acquisitions.company_permalink;"
data <- dbGetQuery(db4, query)
head(data)
## companies_permalink companies_name acquisitions_permalink
## 1 /company/waywire #waywire /company/waywire
## 2 /company/1000memories 1000memories /company/1000memories
## 3 /company/12society 12Society /company/12society
## 4 /company/280-north 280 North /company/280-north
## 5 /company/280-north 280 North /company/280-north
## 6 /company/2web-technologies 2Web Technologies /company/2web-technologies
Now a left join which includes several rows from companies table that does not have an entry at the acquisitions table:
query <- "SELECT
companies.permalink AS companies_permalink,
companies.name AS companies_name,
acquisitions.company_permalink AS acquisitions_permalink
FROM companies LEFT JOIN acquisitions
ON companies.permalink = acquisitions.company_permalink;"
data <- dbGetQuery(db4, query)
head(data)
## companies_permalink companies_name acquisitions_permalink
## 1 /company/waywire #waywire /company/waywire
## 2 /company/1000memories 1000memories /company/1000memories
## 3 /company/12society 12Society /company/12society
## 4 /company/280-north 280 North /company/280-north
## 5 /company/280-north 280 North /company/280-north
## 6 /company/2web-technologies 2Web Technologies /company/2web-technologies
The next eg count the number of unique companies (not double-counting companies) and unique acquired companies by state. It does not include results for which there is no state data, and order by the number of acquired companies from highest to lowest.
query <- "SELECT companies.state_code,
COUNT(DISTINCT companies.permalink) AS unique_companies,
COUNT(DISTINCT acquisitions.company_permalink) AS unique_companies_acquired
FROM companies LEFT JOIN acquisitions
ON companies.permalink = acquisitions.company_permalink
WHERE companies.state_code IS NOT NULL
GROUP BY 1
ORDER BY 3 DESC;"
data <- dbGetQuery(db4, query)
head(data)
## state_code unique_companies unique_companies_acquired
## 1 CA 6170 415
## 2 NY 1730 74
## 3 MA 1272 68
## 4 WA 638 41
## 5 TX 808 40
## 6 CO 453 19
If you want to provide a default value for the missing attributes, check function COALESCE
.
Functions LEFT
and RIGHT
allow to snip \(n\) chars from a string. LENGTH
returns the size of the string.
query <- "SELECT
company_permalink,
LEFT(company_permalink, 6) AS left_permalink,
RIGHT(company_permalink, 6) AS right_permalink,
LENGTH(company_permalink) AS length
FROM
acquisitions;"
data <- dbGetQuery(db4, query)
head(data)
## company_permalink left_permalink
## 1 /company/waywire /compa
## 2 /company/1-nation-technology /compa
## 3 /company/1-stop-financial-service-centers-of-america /compa
## 4 /company/1-800-contacts-2 /compa
## 5 /company/1000memories /compa
## 6 /company/10best /compa
## right_permalink length
## 1 aywire 16
## 2 nology 28
## 3 merica 52
## 4 acts-2 25
## 5 mories 21
## 6 10best 15
TRIM
allows to remove certain chars from the extremeties of the string. It has 3 args: first from where we should start, then a string of char to remove, then a FROM followed by which column to process:
query <- "SELECT
company_permalink,
TRIM(leading '/e' FROM company_permalink) AS trim_begin,
TRIM(trailing '/e' FROM company_permalink) AS trim_end,
TRIM(both '/e' FROM company_permalink) AS trim_begin_and_end
FROM
acquisitions;"
data <- dbGetQuery(db4, query)
head(data)
## company_permalink
## 1 /company/waywire
## 2 /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4 /company/1-800-contacts-2
## 5 /company/1000memories
## 6 /company/10best
## trim_begin
## 1 company/waywire
## 2 company/1-nation-technology
## 3 company/1-stop-financial-service-centers-of-america
## 4 company/1-800-contacts-2
## 5 company/1000memories
## 6 company/10best
## trim_end
## 1 /company/waywir
## 2 /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4 /company/1-800-contacts-2
## 5 /company/1000memories
## 6 /company/10best
## trim_begin_and_end
## 1 company/waywir
## 2 company/1-nation-technology
## 3 company/1-stop-financial-service-centers-of-america
## 4 company/1-800-contacts-2
## 5 company/1000memories
## 6 company/10best
Other functions: + POSITION
, STRPOS
, to find chars within a string + SUBSTR
, LEFT
, RIGHT
, to produce substrings + CONCAT
, ||
, to concatenate + UPPER
, LOWER
, to make uppercase/lowercase string
For date manipulation let’s recheck the second db:
query <- "SELECT
date_stock, id
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data,20)
## date_stock id
## 1 1/30/14 1
## 2 1/29/14 2
## 3 1/28/14 3
## 4 1/27/14 4
## 5 1/24/14 5
## 6 1/23/14 6
## 7 1/22/14 7
## 8 1/21/14 8
## 9 1/17/14 9
## 10 1/16/14 10
## 11 1/15/14 11
## 12 1/14/14 12
## 13 1/13/14 13
## 14 1/10/14 14
## 15 1/9/14 15
## 16 1/8/14 16
## 17 1/7/14 17
## 18 1/6/14 18
## 19 1/3/14 19
## 20 1/2/14 20
The date_stock
column is a string with some quirks in it (different sizes for different dates). How to convert it to date?
query <- "SELECT
id,
date_stock,
CASE WHEN LENGTH(date_stock) = 6 -- months 1 to 9, days 1 to 9
THEN ('20' || RIGHT(date_stock, 2) || '-' ||
LEFT(date_stock, 1) || '-' ||
SUBSTR(date_stock, 3, 1))::date
WHEN LENGTH(date_stock) = 8 -- months > 9, days > 9
THEN ('20' || RIGHT(date_stock, 2) || '-' ||
LEFT(date_stock, 2) || '-' ||
SUBSTR(date_stock, 4, 2))::date
WHEN LENGTH(date_stock) = 7 AND SUBSTR(date_stock,2,1) = '/' -- months < 10, days > 9
THEN ('20' || RIGHT(date_stock, 2) || '-' ||
LEFT(date_stock, 1) || '-' ||
SUBSTR(date_stock, 3, 2))::date
ELSE ('20' || RIGHT(date_stock, 2) || '-' || -- months > 9, days < 10
LEFT(date_stock, 2) || '-' ||
SUBSTR(date_stock, 4, 2))::date -- notice the data cast (à lá Haskell)
END AS clean_date
FROM
stocks;"
data <- dbGetQuery(db2, query)
## Warning in postgresqlQuickSQL(conn, statement, ...): Could not create executeSELECT
## id,
## date_stock,
## CASE WHEN LENGTH(date_stock) = 6 -- months 1 to 9, days 1 to 9
## THEN ('20' || RIGHT(date_stock, 2) || '-' ||
## LEFT(date_stock, 1) || '-' ||
## SUBSTR(date_stock, 3, 1))::date
## WHEN LENGTH(date_stock) = 8 -- months > 9, days > 9
## THEN ('20' || RIGHT(date_stock, 2) || '-' ||
## LEFT(date_stock, 2) || '-' ||
## SUBSTR(date_stock, 4, 2))::date
## WHEN LENGTH(date_stock) = 7 AND SUBSTR(date_stock,2,1) = '/' -- months < 10, days > 9
## THEN ('20' || RIGHT(date_stock, 2) || '-' ||
## LEFT(date_stock, 1) || '-' ||
## SUBSTR(date_stock, 3, 2))::date
## ELSE ('20' || RIGHT(date_stock, 2) || '-' || -- months > 9, days < 10
## LEFT(date_stock, 2) || '-' ||
## SUBSTR(date_stock, 4, 2))::date -- notice the data cast (à lá Haskell)
## END AS clean_date
## FROM
## stocks;
head(data, 20)
## NULL
The function NOW()
provides the current time (as a timestamp datatype):
query <- "SELECT
date_stock, id, NOW() AS current_time
FROM
stocks;"
data <- dbGetQuery(db2, query)
head(data)
## date_stock id current_time
## 1 1/30/14 1 2015-03-01 09:56:50
## 2 1/29/14 2 2015-03-01 09:56:50
## 3 1/28/14 3 2015-03-01 09:56:50
## 4 1/27/14 4 2015-03-01 09:56:50
## 5 1/24/14 5 2015-03-01 09:56:50
## 6 1/23/14 6 2015-03-01 09:56:50
Timestamps have specific function to retrive their composite information:
query <- "SELECT
EXTRACT('year' FROM temp_table.current_time) AS current_year
FROM
( SELECT -- make a sub_query, in order to get the timestamp for the eg
date_stock, id, NOW() AS current_time
FROM stocks
) AS temp_table;"
data <- dbGetQuery(db2, query)
head(data)
## current_year
## 1 2015
## 2 2015
## 3 2015
## 4 2015
## 5 2015
## 6 2015
The previous eg has a subquery, the construction of a temporary (named) table needed to get the desired result.
Here’s another eg that shows the rows with the smaller date in the table (the result is not ok, since, as we seen, the dates do not follow a standard description, but let’s assume that was not a problem here). In this case the temporary table was made in the WHERE
clause:
query <- "SELECT
date_stock, id
FROM
stocks
WHERE
date_stock = ( SELECT min(date_stock) FROM stocks );"
data <- dbGetQuery(db2, query)
head(data)
## date_stock id
## 1 1/10/14 14
We can also have 2+ subqueries and then join them:
query <- "SELECT
first_name, last_name
FROM
customer;"
data <- dbGetQuery(db, query)
head(data)
## first_name last_name
## 1 Jared Ely
## 2 Mary Smith
## 3 Patricia Johnson
## 4 Linda Williams
## 5 Barbara Jones
## 6 Elizabeth Brown
# Show customers names that have last names started by 'An' or 'Ar'
query <- "SELECT
temp.first_name, temp.last_name
FROM
( SELECT first_name, last_name FROM customer WHERE last_name LIKE 'An%'
UNION ALL
SELECT first_name, last_name FROM customer WHERE last_name LIKE 'Ar%'
) AS temp;"
data <- dbGetQuery(db, query)
head(data,20)
## first_name last_name
## 1 Lisa Anderson
## 2 Ida Andrews
## 3 Jose Andrew
## 4 Beatrice Arnold
## 5 Melanie Armstrong
## 6 Carl Artis
## 7 Harry Arce
## 8 Jordan Archuleta
## 9 Kent Arsenault
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
psql -Upostgres
CREATE DATABASE dvdrental;
dvdrental
database (at shell): pg_restore -U postgres -d dvdrental dvdrental.tar
More info at http://www.postgresqltutorial.com/
Helpful postgreSQL commands (ref):
sudo -u postgres psql postgres
postgres=# \l
postgres=# \du
postgres=#CREATE ROLE demorole1 WITH LOGIN ENCRYPTED PASSWORD 'password1' CREATEDB;
postgres=#ALTER ROLE demorole1 CREATEROLE CREATEDB REPLICATION SUPERUSER;
postgres=#DROP ROLE demorole1;
postgres=#CREATE DATABASE demodb1 WITH OWNER demorole1 ENCODING 'UTF8';
GRANT ALL PRIVILEGES ON DATABASE demodb1 TO demorole1;
postgres=#DROP DATABASE demodb1;
\c <databasename>
\dt
\d <tablename>
pg_dump <databasename> > <outfile>
\q
\?
\h COMMAND
\i FILE
\! COMMAND
Operations besides select using RPostgreSQL
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
# before this, need to CREATE DATABASE tests;
db <- dbConnect(drv, dbname="tests", user="postgres", password="admin")
# delete if exists
if (dbExistsTable(db, "books"))
dbSendQuery(db, "DROP TABLE books;") # or dbRemoveTable("books");
## <PostgreSQLResult:(3688,4,3)>
# create table
dbSendQuery(db, "CREATE TABLE books (intcolumn integer, floatcolumn float);")
## <PostgreSQLResult:(3688,4,4)>
# insert data, row by row
dbSendQuery(db, "INSERT INTO books VALUES(12, 19.95);")
## <PostgreSQLResult:(3688,4,5)>
dbSendQuery(db, "INSERT INTO books VALUES( 1, 9.95);")
## <PostgreSQLResult:(3688,4,6)>
dbSendQuery(db, "INSERT INTO books VALUES(10, 14.95);")
## <PostgreSQLResult:(3688,4,7)>
dbSendQuery(db, "INSERT INTO books VALUES( 2, 4.95);")
## <PostgreSQLResult:(3688,4,8)>
dbSendQuery(db, "INSERT INTO books VALUES( 3, 1.45);")
## <PostgreSQLResult:(3688,4,9)>
# get contents
dbGetQuery(db, "SELECT * FROM books;")
## intcolumn floatcolumn
## 1 12 19.95
## 2 1 9.95
## 3 10 14.95
## 4 2 4.95
## 5 3 1.45
# get contents (lazy)
data <- dbSendQuery(db, "SELECT * FROM books;")
fetch(data, 3) # get the next 2 records (use -1 to fetch all)
## intcolumn floatcolumn
## 1 12 19.95
## 2 1 9.95
## 3 10 14.95
dbGetRowCount(data) # how many rows were processed so far
## [1] 3
fetch(data, 2)
## intcolumn floatcolumn
## 4 2 4.95
## 5 3 1.45
dbGetRowCount(data)
## [1] 5
# get the DBMS statement of a given resulst
dbGetStatement(data)
## [1] "SELECT * FROM books;"
dbGetRowsAffected(data) # how many rows were affected (-1 if none was)
## [1] -1
# get avalailable tables in database
dbListTables(db)
## [1] "books"
# get list of table's column names
dbListFields(db, "books")
## [1] "intcolumn" "floatcolumn"
# import table into data frame
df <- dbReadTable(db, "books")
df
## intcolumn floatcolumn
## 1 12 19.95
## 2 1 9.95
## 3 10 14.95
## 4 2 4.95
## 5 3 1.45
# export data frame into db table
df2 <- data.frame(id=1:5, x=runif(10), y=sample(letters,10))
df2
## id x y
## 1 1 0.09060102 v
## 2 2 0.73327937 i
## 3 3 0.99108526 p
## 4 4 0.41674766 o
## 5 5 0.37172027 b
## 6 1 0.30767700 q
## 7 2 0.47351955 u
## 8 3 0.26539024 c
## 9 4 0.03744036 x
## 10 5 0.90293034 m
dbWriteTable(db, "new_table", df2)
## [1] TRUE
dbReadTable(db, "new_table")
## id x y
## 1 1 0.09060102 v
## 2 2 0.73327937 i
## 3 3 0.99108526 p
## 4 4 0.41674766 o
## 5 5 0.37172027 b
## 6 1 0.30767700 q
## 7 2 0.47351955 u
## 8 3 0.26539024 c
## 9 4 0.03744036 x
## 10 5 0.90293034 m
dbClearResult(data) # flushes any pending data and frees the resources used by result set
## [1] TRUE
# postgreSQL transactions
dbSendQuery(db, "BEGIN;") # begin transaction
## <PostgreSQLResult:(3688,4,25)>
dbRemoveTable(db,"new_table")
## [1] TRUE
dbExistsTable(db,"new_table")
## [1] FALSE
dbRollback(db) # rollback, ie, undo all changes since last begin
## [1] TRUE
dbExistsTable(db,"new_table")
## [1] TRUE
dbSendQuery(db, "BEGIN;")
## <PostgreSQLResult:(3688,4,34)>
dbRemoveTable(db,"new_table")
## [1] TRUE
dbExistsTable(db,"new_table")
## [1] FALSE
dbCommit(db) # or dbSendQuery(db, "COMMIT;")
## [1] TRUE
dbExistsTable(db,"new_table")
## [1] FALSE
dbDisconnect(db) # disconnects with the database
## [1] TRUE
dbUnloadDriver(drv) # unload driver (frees all resources)
## [1] TRUE