Favourite Things

By Carl Goodwin in R

July 26, 2020

Each project closes with a table summarising the R tools used. By visualising the most frequently used packages and functions I can get a sense of where I may most benefit from going deeper into the latest package versions.

I may also spot superseded functions, e.g. spread and gather, since replaced by pivot_wider and pivot_longer. Or an opportunity to switch a non-tidyverse package for a newer tidyverse / tidymodels (or extension) alternative, e.g. the new tidyclust package brings cluster modelling to tidymodels and is used in Finding Happiness in ‘The Smoke’.

This page is regularly refreshed to incorporate new or modified projects; see the collapsed Details section at the foot of this page for the last modified date.

library(tidyverse)
library(tidytext)
library(rvest)
library(paletteer)
library(janitor)
library(glue)
library(kableExtra)
library(ggwordcloud)
library(fpp3)
library(tidymodels)
library(patchwork)

# Make the black-and-white ggplot2 theme the default for subsequent plots.
theme_set(theme_bw())

# Number of colours to draw, and the paletteer palette to draw them from.
n <- 4
palette <- "harrypotter::always"

cols <- paletteer_c(palette, n = n)

# Palette swatch: one unit-height bar per colour, each labelled with its
# colour value (any trailing "FF" removed), with the palette name shown in a
# label centred horizontally across the bars.
ggplot(
  tibble(x = seq_len(n), y = 1),
  aes(x, y, fill = fct_rev(cols))
) +
  geom_col() +
  geom_label(
    aes(label = str_remove(cols, "FF$")),
    size = 4,
    vjust = 2,
    fill = "white"
  ) +
  annotate(
    "label",
    x = (n + 1) / 2,
    y = 0.5,
    label = palette,
    fill = "white",
    alpha = 0.8,
    size = 6
  ) +
  scale_fill_manual(values = as.character(cols)) +
  theme_void() +
  theme(legend.position = "none")

I’ll start by grabbing the URLs for all projects.

# Read the project index page, take the href of every element matching
# ".underline .db", and prefix each one with the site root.
urls <- str_c(
  "https://www.quantumjitter.com/",
  "https://www.quantumjitter.com/project/" |>
    read_html() |>
    html_elements(".underline .db") |>
    html_attr("href")
)

This enables me to extract the package and function usage table for each one.

# For each project page, parse every node matched by "#r-toolbox , table"
# with html_table(), stack the results from all pages into one data frame,
# then keep the complete package / function rows.
table_df <- urls |>
  map(\(url) {
    url |>
      read_html() |>
      html_elements("#r-toolbox , table") |>
      html_table()
  }) |>
  bind_rows() |>
  # replace = c("io" = "") strips "io" from the column names while cleaning
  clean_names(replace = c("io" = "")) |>
  select(package, functn) |>
  drop_na()

A little “spring cleaning” is needed, and separation of tidyverse and non-tidyverse packages.

# De-duplicated names of the packages reported by the tidyverse, fpp3 and
# tidymodels meta-packages.
tidy <- unique(c(
  tidyverse_packages(),
  fpp3_packages(),
  tidymodels_packages()
))

# One row per package / function with its summed usage count (column `n`),
# tagged with the family ("multiverse") the package belongs to.
tidy_df <- table_df |>
  # one row per ";"-separated entry
  separate_rows(functn, sep = ";") |>
  # split each entry on the literal "[" into the name and the "count]" part
  separate(functn, into = c("functn", "count"), sep = "\\Q[\\E") |>
  mutate(
    functn = str_squish(functn),
    count = as.integer(str_remove(count, "]"))
  ) |>
  count(package, functn, wt = count) |>
  mutate(
    multiverse = case_when(
      package %in% tidy ~ "tidy",
      package %in% c("base", "graphics") ~ "base",
      TRUE ~ "special"
    )
  )

Then I can summarise usage and prepare for a faceted plot.

# Usage totals per package (summing the per-function counts in `n`).
pack_df <- tidy_df |>
  count(package, multiverse, wt = n) |>
  mutate(name = "package")

# Usage totals per function name.
fun_df <- tidy_df |>
  count(functn, multiverse, wt = n) |>
  mutate(name = "function")

# Number of distinct project pages scraped; used in the plot title.
n_url <- urls |> n_distinct()

# Stack the package and function summaries, sorted by descending usage, with
# a single `packfun` label column taken from whichever of `package` /
# `functn` is present on the row.
packfun_df <- pack_df |>
  bind_rows(fun_df) |>
  group_by(name) |>
  arrange(desc(n)) |>
  mutate(
    packfun = coalesce(package, functn),
    name = fct_rev(name)
  ) |>
  # Drop the grouping so the result is a plain tibble; otherwise later
  # verbs (e.g. n() inside a mutate) would silently operate per group.
  ungroup()

Clearly dplyr reigns supreme, driven by mutate and filter.

# Fill colours keyed by `multiverse` value. Naming the vector makes
# scale_fill_manual() match colours to categories by name rather than by
# position, so a category keeps the same colour in both panels even when one
# of the filtered subsets does not contain every category.
multiverse_cols <- setNames(
  as.character(cols[c(1, 2, 4)]),
  c("base", "special", "tidy")
)

# Left panel: total usage per package (legend suppressed on the bars).
p1 <- packfun_df |>
  filter(name == "package") |>
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = multiverse_cols) +
  labs(
    title = glue("Favourite Things\nAcross {n_url} Projects"),
    subtitle = "Package Usage",
    x = NULL, y = NULL
  )

# Right panel: functions used at least four times.
p2 <- packfun_df |>
  filter(name == "function", n >= 4) |>
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col() +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = multiverse_cols) +
  labs(
    x = NULL, y = NULL,
    subtitle = "Function Usage >= 4"
  )

# Place the two panels side by side (patchwork).
p1 + p2

I’d also like a word cloud generated as the new featured image for this project.

# Seed the RNG so the randomly sampled word angles below are reproducible.
# (`set.seed = 123` would only create a variable named `set.seed`.)
set.seed(123)

# Word cloud of packages and functions: text size scales with usage, colour
# encodes the multiverse, and each word gets a random angle that is a
# multiple of 45 degrees, with 0 weighted four times as heavily as the others.
packfun_df |>
  mutate(angle = 45 * sample(-2:2, n(),
                             replace = TRUE,
                             prob = c(1, 1, 4, 1, 1))) |>
  ggplot(aes(
    label = packfun,
    size = n,
    colour = multiverse,
    angle = angle
  )) +
  geom_text_wordcloud(
    eccentricity = 1,
    seed = 789
  ) +
  scale_size_area(max_size = 20) +
  scale_colour_manual(values = cols[c(2, 3, 4)]) +
  theme_void() +
  theme(plot.background = element_rect(fill = cols[1]))

R Toolbox

A little bit circular, but I might as well include this code too in my “favourite things”.

Package	Function
base	as.character[1]; as.integer[1]; c[5]; conflicts[1]; cumsum[1]; function[2]; sample[1]; search[1]; sum[1]; unique[1]
dplyr	filter[7]; arrange[3]; bind_rows[1]; case_when[1]; coalesce[1]; count[4]; desc[3]; group_by[2]; if_else[3]; mutate[10]; n_distinct[1]; pull[1]; select[1]; summarise[1]; transmute[1]
forcats	fct_reorder[2]; fct_rev[2]
fpp3	fpp3_packages[1]
ggplot2	aes[7]; annotate[1]; coord_flip[2]; element_rect[1]; geom_col[3]; geom_label[3]; ggplot[4]; labs[2]; scale_colour_manual[1]; scale_fill_manual[3]; scale_size_area[1]; theme[2]; theme_bw[1]; theme_set[1]; theme_void[2]
ggwordcloud	geom_text_wordcloud[1]
glue	glue[1]
janitor	clean_names[1]
kableExtra	kbl[1]
paletteer	paletteer_c[1]
purrr	map[1]; map_dfr[1]; map2_dfr[1]; possibly[1]; set_names[1]
readr	read_lines[1]
rvest	html_attr[1]; html_elements[2]; html_table[1]; read_html[2]
stringr	str_c[6]; str_count[1]; str_detect[2]; str_remove[4]; str_remove_all[1]; str_squish[1]; str_starts[1]
tibble	as_tibble[2]; tibble[3]; enframe[1]
tidymodels	tidymodels_packages[1]
tidyr	drop_na[1]; separate[1]; separate_rows[1]; unnest[1]
tidyverse	tidyverse_packages[1]

Posted:: July 26, 2020

Updated:: October 8, 2022

Length:: 4 minute read, 722 words

Categories:: R

Tags:: web scraping quarto

See Also:: Plots Thicken; Set Operations