Introduction

Avec le package dplyr on peut operer des transformations sur des dataframes R classiques mais aussi sur des tables de bases de donnees distantes, car dans les coulisses le package dbplyr traduit le code en SQL. Les packages sqldf et tidyquery permettent eux de requeter en SQL des dataframes R, le package sqldf a plus de possibilites (jointures) que tidyquery qui est un nouveau package, mais sqldf est connu pour etre assez lent.

Traduire le code dplyr en SQL

code SQL generique

# pour les donnees
library("nycflights13")
library("completejourney")

library("dplyr")
library("dbplyr")
library("tidyquery")
library("sqldf")

df1 <- lazy_frame(id = c(1,1,2,2,2),
                  date_transac = Sys.Date() - c(0,1,5,2,3),
                  montant = c(10, 30, 20, 80, 70))

df1

df1 %>% count(id)

df1 %>% group_by(id) %>% summarise(moy = mean(montant))

df1 %>% group_by(id) %>% mutate(moy = mean(montant)) %>% filter(montant > moyenne)
<SQL>
SELECT *
FROM `df`
<SQL>
SELECT `id`, COUNT(*) AS `n`
FROM `df`
GROUP BY `id`
<SQL>
SELECT `id`, AVG(`montant`) AS `moy`
FROM `df`
GROUP BY `id`
<SQL>
SELECT *
FROM (SELECT `id`, `date_transac`, `montant`, AVG(`montant`) OVER (PARTITION BY `id`) AS `moy`
FROM `df`) `q01`
WHERE (`montant` > `moyenne`)

code SQL pour une base de donnees Access

Sous Access les fonctions de fenetrage ne sont pas disponibles, le dernier mutate va donc echouer.

### une database en particulier : access
df <-data.frame(id = c(1,1,2,2,2),
                date_transac = Sys.Date() - c(0,1,5,2,3),
                montant = c(10, 30, 20, 80, 70)) %>% tbl_lazy(con = simulate_access())

df %>% count(id)  %>% show_query()

df %>% group_by(id) %>% summarise(moy = mean(montant))

try(df %>% group_by(id) %>% mutate(moy = mean(montant)) %>% filter(montant > moyenne) %>% print,
    outFile = stdout())
<SQL>
SELECT `id`, COUNT(*) AS `n`
FROM `df`
GROUP BY `id`
<SQL>
SELECT `id`, AVG(`montant`) AS `moy`
FROM `df`
GROUP BY `id`
Error : Window function `AVG()` is not supported by this database

Appliquer du SQL a des dataframes R avec sqldf et tidyquery

requete = " SELECT origin, dest,
    COUNT(flight) AS num_flts,
    round(AVG(distance)) AS dist,
    round(AVG(arr_delay)) AS avg_delay
  FROM flights
  WHERE distance BETWEEN 200 AND 300
    AND air_time IS NOT NULL
  GROUP BY origin, dest
  HAVING num_flts > 3000
  ORDER BY num_flts DESC, avg_delay DESC;"

query(requete)

sqldf(requete)
# A tibble: 3 x 5
  origin dest  num_flts  dist avg_delay
  <chr>  <chr>    <int> <dbl>     <dbl>
1 EWR    BOS       5247   200         5
2 LGA    DCA       4468   214         6
3 JFK    DCA       3076   213         8
  origin dest num_flts dist avg_delay
1    EWR  BOS     5247  200         5
2    LGA  DCA     4468  214         6
3    JFK  DCA     3076  213         8

Pour s’exercer

Le package completejourney fournit 8 tables echantillonnees et le schema de la base de donnees, et la documentation https://cran.r-project.org/web/packages/completejourney/vignettes/completejourney.html indique comment recuperer l’integralite des tables (20 millions de lignes pour la plus grande).

head(demographics)
head(transactions_sample)
# A tibble: 6 x 8
  household_id age   income    home_ownership marital_status household_size
  <chr>        <ord> <ord>     <ord>          <ord>          <ord>         
1 1            65+   35-49K    Homeowner      Married        2             
2 1001         45-54 50-74K    Homeowner      Unmarried      1             
3 1003         35-44 25-34K    <NA>           Unmarried      1             
4 1004         25-34 15-24K    <NA>           Unmarried      1             
5 101          45-54 Under 15K Homeowner      Married        4             
6 1012         35-44 35-49K    <NA>           Married        5+            
# ... with 2 more variables: household_comp <ord>, kids_count <ord>
# A tibble: 6 x 11
  household_id store_id basket_id   product_id quantity sales_value retail_disc
  <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
1 2261         309      31625220889 940996            1        3.86        0.43
2 2131         368      32053127496 873902            1        1.59        0.9 
3 511          316      32445856036 847901            1        1           0.69
4 400          388      31932241118 13094913          2       11.9         2.9 
5 918          340      32074655895 1085604           1        1.29        0   
6 718          324      32614612029 883203            1        2.5         0.49
# ... with 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>,
#   week <int>, transaction_timestamp <dttm>

Exercices

demog = demographics %>% mutate(marital_status = as.character(marital_status),
                                marital_status = case_when(is.na(marital_status) | 
                                                             marital_status=='Unknown'~'inconnu',
                                          TRUE ~ marital_status))

requete1 = "SELECT age,
                  marital_status,
                  COUNT(*) as nb
            FROM demog
            GROUP BY age,
                    marital_status"

requete1bis = "SELECT age,
                    COUNT(CASE WHEN marital_status = 'Married' THEN 1 END) as Married,
                    COUNT(CASE WHEN marital_status = 'Unmarried' THEN 1 END) as Unmarried,
                    COUNT(CASE WHEN marital_status = 'inconnu' THEN 1 END) as inconnu
                FROM demog
                GROUP BY age"

sqldf(requete1)
sqldf(requete1bis)
     age marital_status  nb
1  19-24        Married   9
2  19-24      Unmarried  27
3  19-24        inconnu  10
4  25-34        Married  52
5  25-34      Unmarried  58
6  25-34        inconnu  32
7  35-44        Married  96
8  35-44      Unmarried  70
9  35-44        inconnu  28
10 45-54        Married 112
11 45-54      Unmarried 124
12 45-54        inconnu  52
13 55-64        Married  36
14 55-64      Unmarried  18
15 55-64        inconnu   5
16   65+        Married  35
17   65+      Unmarried  27
18   65+        inconnu  10
    age Married Unmarried inconnu
1 19-24       9        27      10
2 25-34      52        58      32
3 35-44      96        70      28
4 45-54     112       124      52
5 55-64      36        18       5
6   65+      35        27      10

transac = transactions_sample %>% 
  filter(coupon_disc > 0, coupon_match_disc > 0, week <= 5) %>% tail(30)

requete2 = "SELECT week,
                  AVG(retail_disc) as retail_disc,
                  AVG(coupon_disc) as coupon_disc,
                  AVG(coupon_match_disc) as coupon_match_disc
            FROM transac
            GROUP BY week"

noms_KPI = data.frame(KPI = c("retail_disc", "coupon_disc", "coupon_match_disc"))

requete2bis = "SELECT croisements.week,
                      KPI,
                      AVG(CASE WHEN KPI = 'retail_disc' THEN retail_disc 
                                WHEN KPI = 'coupon_disc' THEN coupon_disc 
                                WHEN KPI = 'coupon_match_disc' THEN coupon_match_disc 
                          END) as valeur
              FROM (SELECT week, 
                            KPI FROM transac
                    CROSS JOIN noms_KPI
                    ) AS croisements
              LEFT JOIN transac
              ON croisements.week = transac.week 
              GROUP BY croisements.week,
                        KPI"

sqldf(requete2)
sqldf(requete2bis)
  week retail_disc coupon_disc coupon_match_disc
1    1   1.7800000   0.5500000         0.4500000
2    2   0.3450000   0.5125000         0.4875000
3    3   0.5138462   0.5576923         0.4500000
4    4   0.8233333   0.6333333         0.3000000
5    5   0.5088889   0.5000000         0.3777778
   week               KPI    valeur
1     1       coupon_disc 0.5500000
2     1 coupon_match_disc 0.4500000
3     1       retail_disc 1.7800000
4     2       coupon_disc 0.5125000
5     2 coupon_match_disc 0.4875000
6     2       retail_disc 0.3450000
7     3       coupon_disc 0.5576923
8     3 coupon_match_disc 0.4500000
9     3       retail_disc 0.5138462
10    4       coupon_disc 0.6333333
11    4 coupon_match_disc 0.3000000
12    4       retail_disc 0.8233333
13    5       coupon_disc 0.5000000
14    5 coupon_match_disc 0.3777778
15    5       retail_disc 0.5088889

retour au debut du document