Created
November 14, 2022 13:47
-
-
Save hadley/61f0bcaf79c695eca46c06be55ffad69 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# https://twitter.com/buddyherms/status/1576966150680121344 -------------- | |
# PROs: at, by, and regexp examples | |
# CONs: quite simple | |
vt_census <- tidycensus::get_decennial( | |
geography = "block", | |
state = "VT", | |
county = "Washington", | |
variables = "P1_001N", | |
year = 2020 | |
) | |
vt_census |> | |
separate_wider_position( | |
GEOID, | |
widths = c(state = 2, county = 3, tract = 6, block = 4) | |
) | |
vt_census |> | |
separate_wider_delim( | |
NAME, | |
delim = ", ", | |
names = c("block", "block_group", "tract", "county", "state") | |
) | |
vt_census |> | |
separate_wider_regex( | |
NAME, | |
patterns = c( | |
"Block ", block = "\\d+", ", ", | |
"Block Group ", block_group = "\\d+", ", ", | |
"Census Tract ", tract = "\\d+.\\d+", ", ", | |
county = "[^,]+", ", ", | |
state = ".*" | |
) | |
) | |
# https://twitter.com/iandgow/status/1577198761612189696 ------------------ | |
# https://iangow.github.io/far_book/web-data.html?q=Extract#extracting-data-from-messy-formats | |
library(pdftools) | |
library(lubridate) | |
# Could almost use width, but different set of widths on each page | |
# (because of pdf_text()'s table algorithm) | |
url <- "https://site.warrington.ufl.edu/ritter/files/money-left-on-the-table.pdf" | |
lines <- readr::read_lines(pdftools::pdf_text(url)[-1]) | |
money <- tibble(line = lines) | |
regex <- c( | |
amount_left_on_table = "\\S+", "\\s+", | |
company = ".+", "\\s+", | |
ipo_date = "[0-9]{6}", "\\s+", | |
offer_price = "\\S+", "\\s+", | |
first_close_price = "\\S+", "\\s+", | |
shares_offered = "\\S+", "\\s+", | |
ticker = "\\S+" | |
) | |
money %>% | |
filter(str_detect(line, "^\\$")) %>% | |
separate_wider_regex(line, regex) %>% | |
mutate( | |
amount_left_on_table = parse_number(amount_left_on_table), | |
company = str_trim(company), | |
ipo_date = lubridate::ymd(ipo_date), | |
offer_price = parse_number(offer_price), | |
first_close_price = parse_number(first_close_price), | |
shares_offered = parse_number(shares_offered), | |
) %>% | |
arrange(ipo_date) %>% | |
relocate(amount_left_on_table, .after = shares_offered) %>% | |
relocate(ticker, .before = 1) | |
# https://twitter.com/randyboyes/status/1576933550393171968 --------------- | |
# PROs: wider + longer, simple problem | |
# CONs: inauthentic, licensing??? | |
# https://adventofcode.com/2020/day/7 | |
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt") | |
advent7 <- tibble(line = lines) | |
advent7 %>% | |
filter(!str_detect(lines, "no other bags")) %>% | |
separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>% | |
separate_by_longer(bags, ", ") %>% | |
separate_by_wider(bags, sep = " ", names = c("number", "property", "colour", NA)) | |
# https://adventofcode.com/2020/day/14 | |
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_14.txt") | |
advent14 <- tibble(line = lines) | |
advent14 %>% | |
separate_regex_wider(line, c(var = "\\w+", address = "(?:\\[\\d+\\])?", " = ", value = ".+")) %>% | |
mutate(address = parse_number(address)) | |
# https://twitter.com/Natttersley/status/1577026577350946817 -------------- | |
# PRO: simple, built-in dataset, shows align_short | |
# CON: have to talk about rownames | |
mtcars %>% | |
as_tibble(rownames = "rn") %>% | |
separate_regex_wider(rn, | |
c(manufacturer = "\\w+", " ", model = ".*?"), | |
align_short = "start" | |
) | |
# https://twitter.com/josephjefe/status/1576931962366763008 --------------- | |
particip <- nflreadr::load_participation() | |
particip %>% | |
as_tibble() %>% | |
select(nflverse_game_id, play_id, offense_formation, offense_personnel) %>% | |
filter(!is.na(offense_formation)) %>% | |
separate_longer_delim(offense_personnel, regex(", ?")) %>% | |
separate_wider_delim(offense_personnel, " ", names = c("n", "position")) | |
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ -------- | |
all_drinks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/all_drinks.csv') | |
all_drinks %>% | |
select(idDrink, contains("Ingredient"), contains("Measure")) %>% | |
pivot_longer( | |
-idDrink, | |
names_pattern = "str(Ingredient|Measure)(\\d+)", | |
names_to = c(".value", "i"), | |
values_drop_na = TRUE, | |
) %>% | |
separate_by_wider(Measure, " ", c("Quanitity", "Unit"), align_long = "merge") | |
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ -------- | |
raw_df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv") | |
raw_df %>% | |
select(title_synonyms) %>% | |
separate_regex_wider(title_synonyms, c("[", contents = ".*", "]")) %>% | |
separate_by_longer(contents, ", ") | |
raw_df %>% | |
pull(aired) %>% | |
str_replace_all("None", "null") %>% | |
str_replace_all("'", '"') %>% | |
map_if(is.na, \(x) "null") %>% | |
map(jsonlite::fromJSON) %>% | |
head() %>% | |
str() | |
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ -------- | |
# if want to do teams, will need https://github.com/tidyverse/rvest/issues/361 | |
library(rvest) | |
html <- read_html("https://db.ipc-services.org/sdms/hira/web/competition/code/PG2016/sport/AR") | |
html %>% | |
html_node(".table-responsive-lg") %>% | |
html_table() %>% | |
rename_with(\(x) str_to_lower(str_replace(x, fixed(" Medallist(s)"), ""))) %>% | |
pivot_longer(names_to = "medal", values_to = "athlete", cols = -event) %>% | |
separate_regex_wider(event, | |
patterns = c( | |
gender = "Men's|Women's|Mixed", | |
" ", | |
team = "Team|Individual", | |
" ", | |
event = ".*" | |
) | |
) %>% | |
filter(team == "Individual") %>% | |
separate_regex_wider(athlete, | |
patterns = c( | |
last_name = "[A-Z]+", | |
" ", | |
first_name = "[A-Za-z ]+", | |
" \\(", | |
country = "[A-Z]+", | |
"\\)" | |
)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# https://github.com/tidyverse/tidyups/blob/main/002-tidyr-stringr.md | |
# For now: no paired stringr functions | |
# Big change to problems reporting | |
# Add warning when debug mode on: | |
# Warning: align_short="debug" adding debug columns x_ok, x_pieces, x_remainder | |
# separate_*_wider -------------------------------------------------------- | |
# * separate_at_wider() = separate() | |
# * separate_by_wider() = separate() | |
# * separate_regex_wider() = extract() | |
vt_census <- tidycensus::get_decennial( | |
geography = "block", | |
state = "VT", | |
county = "Washington", | |
variables = "P1_001N", | |
year = 2020 | |
) | |
vt_census | |
vt_census |> | |
separate_at_wider( | |
GEOID, | |
widths = c(state = 2, county = 3, tract = 6, block = 4) | |
) | |
# by -> delim, by -> sep | |
# at -> width, separate_loc_wider() | |
# separate_by_wider / separate_by_longer | |
# which order of components? | |
# should sep be delim? | |
# separate_wider_delim / separate_longer_delim | |
vt_census |> | |
separate_by_wider( | |
NAME, | |
sep = ", ", | |
names = c("block", "block_group", "tract", "county", "state") | |
) |> | |
mutate( | |
block = block %>% parse_number(), | |
block_group = block_group %>% parse_number(), | |
tract = tract %>% parse_number() | |
) | |
vt_census |> | |
separate_regex_wider( | |
NAME, | |
patterns = c( | |
"Block ", block = "\\d+", ", ", | |
"Block Group ", block_group = "\\d+", ", ", | |
"Census Tract ", tract = "\\d+.\\d+", ", ", | |
county = "[^,]+", ", ", | |
state = ".*" | |
) | |
) | |
# separate_*_longer ------------------------------------------------------- | |
# * separate_by_longer() = separate_rows() | |
# * separate_at_longer() | |
# https://adventofcode.com/2020/day/7 | |
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt") | |
advent7 <- tibble(line = lines) | |
advent7 %>% | |
filter(!str_detect(lines, "no other bags")) %>% | |
separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>% | |
separate_by_longer(bags, ", ") %>% | |
separate_by_wider(bags, sep = " ", names = c("number", "adjective", "colour", NA)) %>% | |
separate_by_wider(container, " ", names = c("adjective", "colour"), names_sep = "_") | |
# Debugging --------------------------------------------------------------- | |
df <- tibble( | |
x = c("a", "a-b", "a-b-c") | |
) | |
df %>% separate(x, c("x", "y")) | |
# Now errors if there's a problem | |
df %>% separate_by_wider(x, sep = "-", names = c("x", "y")) | |
df %>% separate_by_wider(x, sep = "-", names = c("x")) | |
# Debug with: | |
probs <- df %>% | |
separate_by_wider( | |
x, | |
sep = "-", | |
names = c("a", "b"), | |
align_short = "debug", | |
align_long = "debug" | |
) | |
probs | |
probs %>% filter(!x_ok) | |
df %>% | |
separate_by_wider( | |
x, | |
sep = "-", | |
names = c("x", "y"), | |
align_short = "end", | |
align_long = "merge" | |
) | |
# New feature: | |
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "start") | |
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "end") | |
# Really handy for separate_regex_wider | |
vt_census |> | |
separate_regex_wider( | |
NAME, | |
patterns = c( | |
"Block ", block = "\\d+", ", ", | |
"Block Group ", block_group = "\\d+", ", ", | |
"Census Tract ", tract = "\\d+.\\d+", ", ", | |
county = "[^,]+", ", ", | |
state = ".*" | |
), | |
align_short = "debug" | |
) | |
%>% | |
select(block:NAME_remainder) %>% | |
select(NAME_remainder) | |
df <- tibble(x = c("b-d", "e-f", "e")) | |
df %>% separate_by_wider( | |
x, "-", | |
names = c("x", "y"), | |
align_long = "debug", | |
align_short = "start" | |
) | |
github_repos <- c( | |
"metacran/crandb", | |
"jeroenooms/[email protected]", | |
"jimhester/covr#47", | |
"hadley/dplyr@*release", | |
"mangothecat/remotes@550a3c7d3f9e1493a2ba", | |
"/$&@R64&3" | |
) | |
owner_rx <- "(?:(?<owner>[^/]+)/)?" | |
repo_rx <- "(?<repo>[^/@#]+)" | |
subdir_rx <- "(?:/(?<subdir>[^@#]*[^@#/]))?" | |
ref_rx <- "(?:@(?<ref>[^*].*))" | |
pull_rx <- "(?:#(?<pull>[0-9]+))" | |
release_rx <- "(?:@(?<release>[*]release))" | |
subtype_rx <- sprintf("(?:%s|%s|%s)?", ref_rx, pull_rx, release_rx) | |
github_rx <- sprintf( | |
"^(?:%s%s%s%s|(?<catchall>.*))$", | |
owner_rx, repo_rx, subdir_rx, subtype_rx | |
) | |
re_match(text = github_repos, pattern = github_rx) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment