Favourite Things

By Carl Goodwin in R

July 26, 2020

Each project closes with a table summarising the R tools used. By visualising the most frequently used packages and functions I can get a sense of where I may most benefit from going deeper into the latest package versions.

I may also spot superseded functions e.g. spread and gather improved by pivot_wider and pivot_longer. Or an opportunity to switch a non-tidyverse package for a newer tidyverse / tidymodels (or extension) alternative, e.g. the new tidyclust package brings cluster modelling to tidymodels and is used in Finding Happiness in ‘The Smoke’.

This page is regularly refreshed to incorporate new or modified projects; see the collapsed Details section at the foot of this page for the last modified date.

library(tidyverse)
library(tidytext)
library(rvest)
library(paletteer)
library(janitor)
library(glue)
library(kableExtra)
library(ggwordcloud)
library(fpp3)
library(tidymodels)
library(patchwork)
theme_set(theme_bw())

n <- 4
palette <- "harrypotter::always"

cols <- paletteer_c(palette, n = n)

tibble(x = 1:n, y = 1) |>
  ggplot(aes(x, y, fill = fct_rev(cols))) +
  geom_col() +
  geom_label(aes(label = cols |> str_remove("FF$")), 
             size = 4, vjust = 2, fill = "white") +
  annotate(
    "label",
    x = (n + 1) / 2, y = 0.5,
    label = palette,
    fill = "white",
    alpha = 0.8,
    size = 6
  ) +
  scale_fill_manual(values = as.character(cols)) +
  theme_void() +
  theme(legend.position = "none")

I’ll start by grabbing the url for all projects.

urls <- "https://www.quantumjitter.com/project/" |> 
  read_html() |> 
  html_elements(".underline .db") |> 
  html_attr("href") |> 
  as_tibble() |> 
  transmute(str_c("https://www.quantumjitter.com/", value)) |> 
  pull()

This enables me to extract the package and function usage table for each one.

table_df <- map_dfr(urls, function(x) {
  x |>
    read_html() |>
    html_elements("#r-toolbox , table") |>
    html_table()
}) |>
  clean_names(replace = c("io" = "")) |>
  select(package, functn) |>
  drop_na()

A little “spring cleaning” is needed, and separation of tidyverse and non-tidyverse packages.

tidy <-
  c(
    tidyverse_packages(),
    fpp3_packages(),
    tidymodels_packages()
  ) |>
  unique()

tidy_df <- table_df |>
  separate_rows(functn, sep = ";") |>
  separate(functn, c("functn", "count"), "\\Q[\\E") |>
  mutate(
    count = str_remove(count, "]") |> as.integer(),
    functn = str_squish(functn)
  ) |>
  count(package, functn, wt = count) |>
  mutate(multiverse = case_when(
    package %in% tidy ~ "tidy",
    package %in% c("base", "graphics") ~ "base",
    TRUE ~ "special"
  ))

Then I can summarise usage and prepare for a faceted plot.

pack_df <- tidy_df |>
  count(package, multiverse, wt = n) |>
  mutate(name = "package")

fun_df <- tidy_df |>
  count(functn, multiverse, wt = n) |>
  mutate(name = "function")

n_url <- urls |> n_distinct()

packfun_df <- pack_df |>
  bind_rows(fun_df) |>
  group_by(name) |>
  arrange(desc(n)) |>
  mutate(
    packfun = coalesce(package, functn),
    name = fct_rev(name)
  )

Clearly dplyr reigns supreme driven by mutate and filter.

p1 <- packfun_df |>
  filter(name == "package") |> 
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = cols[c(1, 2, 4)]) +
  labs(
    title = glue("Favourite Things\nAcross {n_url} Projects"),
    subtitle = "Package Usage",
    x = NULL, y = NULL
  )

p2 <- packfun_df |>
  filter(name == "function", n >= 4) |> 
  ggplot(aes(fct_reorder(packfun, n), n, fill = multiverse)) +
  geom_col() +
  coord_flip() +
  geom_label(aes(label = n), hjust = "inward", size = 2, fill = "white") +
  scale_fill_manual(values = cols[c(1, 2, 4)]) +
  labs(x = NULL, y = NULL, 
       subtitle = "Function Usage >= 4")

p1 + p2

I’d also like a word cloud generated as the new featured image for this project.

set.seed = 123

packfun_df |>
  mutate(angle = 45 * sample(-2:2, n(), 
                             replace = TRUE, 
                             prob = c(1, 1, 4, 1, 1))) |>
  ggplot(aes(
    label = packfun,
    size = n,
    colour = multiverse,
    angle = angle
  )) +
  geom_text_wordcloud(
    eccentricity = 1,
    seed = 789
  ) +
  scale_size_area(max_size = 20) +
  scale_colour_manual(values = cols[c(2, 3, 4)]) +
  theme_void() +
  theme(plot.background = element_rect(fill = cols[1]))

R Toolbox

A little bit circular, but I might as well include this code too in my “favourite things”.

Package Function
base as.character[1]; as.integer[1]; c[5]; conflicts[1]; cumsum[1]; function[2]; sample[1]; search[1]; sum[1]; unique[1]
dplyr filter[7]; arrange[3]; bind_rows[1]; case_when[1]; coalesce[1]; count[4]; desc[3]; group_by[2]; if_else[3]; mutate[10]; n_distinct[1]; pull[1]; select[1]; summarise[1]; transmute[1]
forcats fct_reorder[2]; fct_rev[2]
fpp3 fpp3_packages[1]
ggplot2 aes[7]; annotate[1]; coord_flip[2]; element_rect[1]; geom_col[3]; geom_label[3]; ggplot[4]; labs[2]; scale_colour_manual[1]; scale_fill_manual[3]; scale_size_area[1]; theme[2]; theme_bw[1]; theme_set[1]; theme_void[2]
ggwordcloud geom_text_wordcloud[1]
glue glue[1]
janitor clean_names[1]
kableExtra kbl[1]
paletteer paletteer_c[1]
purrr map[1]; map_dfr[1]; map2_dfr[1]; possibly[1]; set_names[1]
readr read_lines[1]
rvest html_attr[1]; html_elements[2]; html_table[1]; read_html[2]
stringr str_c[6]; str_count[1]; str_detect[2]; str_remove[4]; str_remove_all[1]; str_squish[1]; str_starts[1]
tibble as_tibble[2]; tibble[3]; enframe[1]
tidymodels tidymodels_packages[1]
tidyr drop_na[1]; separate[1]; separate_rows[1]; unnest[1]
tidyverse tidyverse_packages[1]
Posted:
July 26, 2020
Updated:
October 8, 2022
Length:
4 minute read, 722 words
Categories:
R
Tags:
web scraping quarto
See Also:
Plots Thicken
Set Operations