Introduction
Avec le package dplyr on peut operer des transformations sur des dataframes R classiques mais aussi sur des tables de bases de donnees distantes, car dans les coulisses le package dbplyr traduit le code en SQL. Les packages sqldf et tidyquery permettent eux de requeter en SQL des dataframes R, le package sqldf a plus de possibilites (jointures) que tidyquery qui est un nouveau package, mais sqldf est connu pour etre assez lent.
Traduire le code dplyr en SQL
code SQL generique
# pour les donnees
library("nycflights13")
library("completejourney")
library("dplyr")
library("dbplyr")
library("tidyquery")
library("sqldf")
<- lazy_frame(id = c(1,1,2,2,2),
df1 date_transac = Sys.Date() - c(0,1,5,2,3),
montant = c(10, 30, 20, 80, 70))
df1
%>% count(id)
df1
%>% group_by(id) %>% summarise(moy = mean(montant))
df1
%>% group_by(id) %>% mutate(moy = mean(montant)) %>% filter(montant > moyenne) df1
<SQL>
SELECT *
FROM `df`
<SQL>
SELECT `id`, COUNT(*) AS `n`
FROM `df`
GROUP BY `id`
<SQL>
SELECT `id`, AVG(`montant`) AS `moy`
FROM `df`
GROUP BY `id`
<SQL>
SELECT *
FROM (SELECT `id`, `date_transac`, `montant`, AVG(`montant`) OVER (PARTITION BY `id`) AS `moy`
FROM `df`) `q01`
WHERE (`montant` > `moyenne`)
code SQL pour une base de donnees Access
Sous Access les fonctions de fenetrage ne sont pas disponibles, le dernier mutate va donc echouer.
### une database en particulier : access
<-data.frame(id = c(1,1,2,2,2),
df date_transac = Sys.Date() - c(0,1,5,2,3),
montant = c(10, 30, 20, 80, 70)) %>% tbl_lazy(con = simulate_access())
%>% count(id) %>% show_query()
df
%>% group_by(id) %>% summarise(moy = mean(montant))
df
try(df %>% group_by(id) %>% mutate(moy = mean(montant)) %>% filter(montant > moyenne) %>% print,
outFile = stdout())
<SQL>
SELECT `id`, COUNT(*) AS `n`
FROM `df`
GROUP BY `id`
<SQL>
SELECT `id`, AVG(`montant`) AS `moy`
FROM `df`
GROUP BY `id`
Error : Window function `AVG()` is not supported by this database
Appliquer du SQL a des dataframes R avec sqldf et tidyquery
= " SELECT origin, dest,
requete COUNT(flight) AS num_flts,
round(AVG(distance)) AS dist,
round(AVG(arr_delay)) AS avg_delay
FROM flights
WHERE distance BETWEEN 200 AND 300
AND air_time IS NOT NULL
GROUP BY origin, dest
HAVING num_flts > 3000
ORDER BY num_flts DESC, avg_delay DESC;"
query(requete)
sqldf(requete)
# A tibble: 3 x 5
origin dest num_flts dist avg_delay
<chr> <chr> <int> <dbl> <dbl>
1 EWR BOS 5247 200 5
2 LGA DCA 4468 214 6
3 JFK DCA 3076 213 8
origin dest num_flts dist avg_delay
1 EWR BOS 5247 200 5
2 LGA DCA 4468 214 6
3 JFK DCA 3076 213 8
Pour s’exercer
Le package completejourney fournit 8 tables echantillonnees et le schema de la base de donnees, et la documentation https://cran.r-project.org/web/packages/completejourney/vignettes/completejourney.html indique comment recuperer l’integralite des tables (20 millions de lignes pour la plus grande).
head(demographics)
head(transactions_sample)
# A tibble: 6 x 8
household_id age income home_ownership marital_status household_size
<chr> <ord> <ord> <ord> <ord> <ord>
1 1 65+ 35-49K Homeowner Married 2
2 1001 45-54 50-74K Homeowner Unmarried 1
3 1003 35-44 25-34K <NA> Unmarried 1
4 1004 25-34 15-24K <NA> Unmarried 1
5 101 45-54 Under 15K Homeowner Married 4
6 1012 35-44 35-49K <NA> Married 5+
# ... with 2 more variables: household_comp <ord>, kids_count <ord>
# A tibble: 6 x 11
household_id store_id basket_id product_id quantity sales_value retail_disc
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 2261 309 31625220889 940996 1 3.86 0.43
2 2131 368 32053127496 873902 1 1.59 0.9
3 511 316 32445856036 847901 1 1 0.69
4 400 388 31932241118 13094913 2 11.9 2.9
5 918 340 32074655895 1085604 1 1.29 0
6 718 324 32614612029 883203 1 2.5 0.49
# ... with 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>,
# week <int>, transaction_timestamp <dttm>
Exercices
- quelle est la fonction du package tidyr qui permet d’obtenir facilement le resultat de la requete requete1bis a partir du resultat de requete1 ?
- meme question avec requete2bis et requete2
= demographics %>% mutate(marital_status = as.character(marital_status),
demog marital_status = case_when(is.na(marital_status) |
=='Unknown'~'inconnu',
marital_statusTRUE ~ marital_status))
= "SELECT age,
requete1 marital_status,
COUNT(*) as nb
FROM demog
GROUP BY age,
marital_status"
= "SELECT age,
requete1bis COUNT(CASE WHEN marital_status = 'Married' THEN 1 END) as Married,
COUNT(CASE WHEN marital_status = 'Unmarried' THEN 1 END) as Unmarried,
COUNT(CASE WHEN marital_status = 'inconnu' THEN 1 END) as inconnu
FROM demog
GROUP BY age"
sqldf(requete1)
sqldf(requete1bis)
age marital_status nb
1 19-24 Married 9
2 19-24 Unmarried 27
3 19-24 inconnu 10
4 25-34 Married 52
5 25-34 Unmarried 58
6 25-34 inconnu 32
7 35-44 Married 96
8 35-44 Unmarried 70
9 35-44 inconnu 28
10 45-54 Married 112
11 45-54 Unmarried 124
12 45-54 inconnu 52
13 55-64 Married 36
14 55-64 Unmarried 18
15 55-64 inconnu 5
16 65+ Married 35
17 65+ Unmarried 27
18 65+ inconnu 10
age Married Unmarried inconnu
1 19-24 9 27 10
2 25-34 52 58 32
3 35-44 96 70 28
4 45-54 112 124 52
5 55-64 36 18 5
6 65+ 35 27 10
= transactions_sample %>%
transac filter(coupon_disc > 0, coupon_match_disc > 0, week <= 5) %>% tail(30)
= "SELECT week,
requete2 AVG(retail_disc) as retail_disc,
AVG(coupon_disc) as coupon_disc,
AVG(coupon_match_disc) as coupon_match_disc
FROM transac
GROUP BY week"
= data.frame(KPI = c("retail_disc", "coupon_disc", "coupon_match_disc"))
noms_KPI
= "SELECT croisements.week,
requete2bis KPI,
AVG(CASE WHEN KPI = 'retail_disc' THEN retail_disc
WHEN KPI = 'coupon_disc' THEN coupon_disc
WHEN KPI = 'coupon_match_disc' THEN coupon_match_disc
END) as valeur
FROM (SELECT week,
KPI FROM transac
CROSS JOIN noms_KPI
) AS croisements
LEFT JOIN transac
ON croisements.week = transac.week
GROUP BY croisements.week,
KPI"
sqldf(requete2)
sqldf(requete2bis)
week retail_disc coupon_disc coupon_match_disc
1 1 1.7800000 0.5500000 0.4500000
2 2 0.3450000 0.5125000 0.4875000
3 3 0.5138462 0.5576923 0.4500000
4 4 0.8233333 0.6333333 0.3000000
5 5 0.5088889 0.5000000 0.3777778
week KPI valeur
1 1 coupon_disc 0.5500000
2 1 coupon_match_disc 0.4500000
3 1 retail_disc 1.7800000
4 2 coupon_disc 0.5125000
5 2 coupon_match_disc 0.4875000
6 2 retail_disc 0.3450000
7 3 coupon_disc 0.5576923
8 3 coupon_match_disc 0.4500000
9 3 retail_disc 0.5138462
10 4 coupon_disc 0.6333333
11 4 coupon_match_disc 0.3000000
12 4 retail_disc 0.8233333
13 5 coupon_disc 0.5000000
14 5 coupon_match_disc 0.3777778
15 5 retail_disc 0.5088889