Use PRQL executable code blocks on Quarto

Author

@eitsupi

Published

January 25, 2023

Modified

January 26, 2023

PRQL (Pipelined Relational Query Language, pronounced “Prequel”) has attracted high attention, attracting more than 6,000 GitHub stars in the year since its arrival in early 2022.

PRQL can easily be tried in the playground, and it can also be run on R by prqlr, which was recently registered on CRAN.

And with the wonderful flexibility of knitr, the development version of prqlr allows us to run PRQL code blocks on Quarto documents!¹

This document demonstrates the use of PRQL code blocks on Quarto and compares them to SQL code blocks² and dplyr (and dbplyr)³.

Caution

PRQL is under active development and will likely undergo frequent breaking changes before reaching version 1.0.

This document is based on PRQL 0.4.1, the latest version at this time, which will probably be outdated in a few mounths!

Setup

As with the SQL code blocks that exists as a default feature of knitr, a connection to the database must be set up in R in order to use PRQL code blocks. In this document, I use an in-memory database of DuckDB that can be queried directly to Data Frames in R.

library(DBI)
library(duckdb)
library(prqlr)
library(dplyr, warn.conflicts = FALSE)
library(dbplyr, warn.conflicts = FALSE)

# Generate some Data Frames from sample csv files.
base_url <- "https://raw.githubusercontent.com/PRQL/prql/0.4.1/prql-compiler/tests/integration/data/chinook/{name}.csv"
options(readr.show_col_types = FALSE)
albums <- readr::read_csv(glue::glue(base_url, name = "albums"))
artists <- readr::read_csv(glue::glue(base_url, name = "artists"))
tracks <- readr::read_csv(glue::glue(base_url, name = "tracks"))

# Connect to DuckDB in-memory database.
con <- dbConnect(duckdb(), ":memory:")

# Register Data Frames to the database.
duckdb_register(con, "mtcars", mtcars)
duckdb_register(con, "albums", albums)
duckdb_register(con, "artists", artists)
duckdb_register(con, "tracks", tracks)

Basic example

Note that only dplyr does the computation without duckdb, so the row names of the mtcars DataFrame are retained and the results are different.

PRQL

from mtcars
filter cyl > 6
select [cyl, mpg]
sort [-cyl]
derive [mpg_int = round 0 mpg]
take 3

3 records
cyl	mpg	mpg_int
8	18.7	19
8	14.3	14
8	16.4	16

SQL

SELECT
  cyl,
  mpg,
  ROUND(mpg, 0) AS mpg_int
FROM
  mtcars
WHERE
  cyl > 6
ORDER BY
  cyl DESC
LIMIT
  3

3 records
cyl	mpg	mpg_int
8	18.7	19
8	14.3	14
8	16.4	16

tbl(con, "mtcars") |>
  filter(cyl > 6) |>
  select(cyl, mpg) |>
  arrange(desc(cyl)) |>
  mutate(mpg_int = round(mpg, 0)) |>
  head(3) |>
  collect()

cyl	mpg	mpg_int
8	18.7	19
8	14.3	14
8	16.4	16

mtcars |>
  filter(cyl > 6) |>
  select(cyl, mpg) |>
  arrange(desc(cyl)) |>
  mutate(mpg_int = round(mpg, 0)) |>
  head(3)

	cyl	mpg	mpg_int
Hornet Sportabout	8	18.7	19
Duster 360	8	14.3	14
Merc 450SE	8	16.4	16

Join multiple tables

We can clearly see how similar PRQL and dplyr are for long queries involving joins.

PRQL

from tracks
select [album_id, name, unit_price]
sort [-unit_price, name]
group album_id (
  aggregate [
    track_count = count,
    album_price = sum unit_price
  ]
)
join albums [==album_id]
group artist_id (
  aggregate [
    track_count = sum track_count,
    artist_price = sum album_price
  ]
)
join artists [==artist_id]
select [artists.name, artist_price, track_count]
sort [-artist_price]
derive avg_track_price = artist_price / track_count
take 3

3 records
name	artist_price	track_count	avg_track_price
Iron Maiden	210.87	213	0.99
Lost	183.08	92	1.99
U2	133.65	135	0.99

SQL

WITH table_1 AS (
  SELECT
    SUM(unit_price) AS _expr_0,
    COUNT(*) AS _expr_1,
    album_id
  FROM
    tracks
  GROUP BY
    album_id
),
table_2 AS (
  SELECT
    SUM(table_1._expr_0) AS artist_price,
    SUM(table_1._expr_1) AS track_count,
    albums.artist_id
  FROM
    table_1
    JOIN albums ON table_1.album_id = albums.album_id
  GROUP BY
    albums.artist_id
)
SELECT
  artists.name,
  table_2.artist_price,
  table_2.track_count,
  table_2.artist_price / table_2.track_count AS avg_track_price
FROM
  table_2
  JOIN artists ON table_2.artist_id = artists.artist_id
ORDER BY
  table_2.artist_price DESC
LIMIT
  3

3 records
name	artist_price	track_count	avg_track_price
Iron Maiden	210.87	213	0.99
Lost	183.08	92	1.99
U2	133.65	135	0.99

tbl(con, "tracks") |>
  select(album_id, name, unit_price) |>
  window_order(desc(unit_price), name) |>
  group_by(album_id) |>
  summarise(
    track_count = n(),
    album_price = sum(unit_price, na.rm = TRUE),
    .groups = "drop"
  ) |>
  inner_join(tbl(con, "albums"), by = "album_id") |>
  group_by(artist_id) |>
  summarise(
    track_count = sum(track_count, na.rm = TRUE),
    artist_price = sum(album_price, na.rm = TRUE),
    .groups = "drop"
  ) |>
  inner_join(tbl(con, "artists"), by = "artist_id") |>
  select(name, artist_price, track_count) |>
  arrange(desc(artist_price)) |>
  mutate(avg_track_price = artist_price / track_count) |>
  head(3)

name	artist_price	track_count	avg_track_price
Iron Maiden	210.87	213	0.99
Lost	183.08	92	1.99
U2	133.65	135	0.99

tracks |>
  select(album_id, name, unit_price) |>
  arrange(desc(unit_price), name) |>
  group_by(album_id) |>
  summarise(
    track_count = n(),
    album_price = sum(unit_price, na.rm = TRUE),
    .groups = "drop"
  ) |>
  inner_join(albums, by = "album_id") |>
  group_by(artist_id) |>
  summarise(
    track_count = sum(track_count, na.rm = TRUE),
    artist_price = sum(album_price, na.rm = TRUE),
    .groups = "drop"
  ) |>
  inner_join(artists, by = "artist_id") |>
  select(name, artist_price, track_count) |>
  arrange(desc(artist_price)) |>
  mutate(avg_track_price = artist_price / track_count) |>
  head(3)

name	artist_price	track_count	avg_track_price
Iron Maiden	210.87	213	0.99
Lost	183.08	92	1.99
U2	133.65	135	0.99

Session info

R packages used in this document are these.

sessioninfo::package_info() |>
  print()

 package     * version    date (UTC) lib source
 assertthat    0.2.1      2019-03-21 [1] RSPM (R 4.2.0)
 bit           4.0.5      2022-11-15 [1] RSPM (R 4.2.0)
 bit64         4.0.5      2020-08-30 [1] RSPM (R 4.2.0)
 blob          1.2.3      2022-04-10 [1] RSPM (R 4.2.0)
 cli           3.5.0      2022-12-20 [1] RSPM (R 4.2.0)
 crayon        1.5.2      2022-09-29 [1] RSPM (R 4.2.0)
 curl          4.3.3      2022-10-06 [1] RSPM (R 4.2.0)
 DBI         * 1.1.3      2022-06-18 [1] RSPM (R 4.2.0)
 dbplyr      * 2.3.0      2023-01-16 [1] RSPM
 digest        0.6.31     2022-12-11 [1] RSPM (R 4.2.0)
 dplyr       * 1.0.10     2022-09-01 [1] RSPM (R 4.2.0)
 duckdb      * 0.6.2      2023-01-16 [1] RSPM
 ellipsis      0.3.2      2021-04-29 [1] RSPM (R 4.2.0)
 evaluate      0.19       2022-12-13 [1] RSPM (R 4.2.0)
 fansi         1.0.3      2022-03-24 [1] RSPM (R 4.2.0)
 fastmap       1.1.0      2021-01-25 [1] RSPM (R 4.2.0)
 generics      0.1.3      2022-07-05 [1] RSPM (R 4.2.0)
 glue          1.6.2      2022-02-24 [1] RSPM (R 4.2.0)
 highr         0.9        2021-04-16 [1] RSPM (R 4.2.0)
 hms           1.1.2      2022-08-19 [1] RSPM (R 4.2.0)
 htmltools     0.5.4      2022-12-07 [1] RSPM (R 4.2.0)
 htmlwidgets   1.6.0      2022-12-15 [1] RSPM (R 4.2.0)
 jsonlite      1.8.4      2022-12-06 [1] RSPM (R 4.2.0)
 knitr         1.41       2022-11-18 [1] RSPM (R 4.2.0)
 lifecycle     1.0.3      2022-10-07 [1] RSPM (R 4.2.0)
 magrittr      2.0.3      2022-03-30 [1] RSPM (R 4.2.0)
 pillar        1.8.1      2022-08-19 [1] RSPM (R 4.2.0)
 pkgconfig     2.0.3      2019-09-22 [1] RSPM (R 4.2.0)
 prqlr       * 0.1.0.9002 2023-01-24 [1] https://eitsupi.r-universe.dev (R 4.2.2)
 purrr         1.0.1      2023-01-10 [1] RSPM
 R6            2.5.1      2021-08-19 [1] RSPM (R 4.2.0)
 readr         2.1.3      2022-10-01 [1] RSPM (R 4.2.0)
 rlang         1.0.6      2022-09-24 [1] RSPM (R 4.2.0)
 rmarkdown     2.20       2023-01-19 [1] RSPM
 sessioninfo   1.2.2      2021-12-06 [1] RSPM (R 4.2.0)
 stringi       1.7.8      2022-07-11 [1] RSPM (R 4.2.0)
 stringr       1.5.0      2022-12-02 [1] RSPM (R 4.2.0)
 tibble        3.1.8      2022-07-22 [1] RSPM (R 4.2.0)
 tidyselect    1.2.0      2022-10-10 [1] RSPM (R 4.2.0)
 tzdb          0.3.0      2022-03-28 [1] RSPM (R 4.2.0)
 utf8          1.2.2      2021-07-24 [1] RSPM (R 4.2.0)
 vctrs         0.5.1      2022-11-16 [1] RSPM (R 4.2.0)
 vroom         1.6.0      2022-09-30 [1] RSPM (R 4.2.0)
 withr         2.5.0      2022-03-03 [1] RSPM (R 4.2.0)
 xfun          0.36       2022-12-21 [1] RSPM
 yaml          2.3.6      2022-10-18 [1] RSPM (R 4.2.0)

 [1] /usr/local/lib/R/site-library
 [2] /usr/local/lib/R/library

Footnotes

pyprql allows PRQL code to be executed interactively on Jupyter by Jupyter Magic, so PRQL can also be executed on Quarto when using Jupyter engine.↩︎
The SQL queries written in the SQL code blocks are generated from PRQL queries by prql-compiler (on the playground).↩︎
I’m a big fan of dplyr, so I have high hopes for PRQL that can be written like dplyr!↩︎