Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Last active July 2, 2025 16:44
Show Gist options
  • Save MichaelChirico/64e981b4359a36d2279bf55c39c7d3dd to your computer and use it in GitHub Desktop.
Save MichaelChirico/64e981b4359a36d2279bf55c39c7d3dd to your computer and use it in GitHub Desktop.
Scrape GitHub PRs in a repo to see which files they touch
# INITIALLY GENERATED BY GEMINI
# PR File Scraper
#
# Description:
# This script inspects a local Git repository, identifies its GitHub remote,
# and fetches all open pull requests. It then compiles a data.frame
# where each row represents a file modified in a specific pull request,
# along with metadata about that PR.
#
# Requirements:
# 1. R installed on your system.
# 2. The following R packages: gh, gert, data.table.
# 3. A GitHub Personal Access Token (PAT) configured for the `gh` package.
# The easiest way to set this up is to run `usethis::create_github_token()`
# and then `usethis::edit_r_environ()` to add the line:
# GITHUB_PAT="your_newly_created_token"
# Save the .Renviron file and restart R.
#
# How to run:
# 1. Make sure the required packages are installed (see the library() calls below).
# 2. Set the `repo_path` variable to the directory of your local git repo.
# 3. Run the entire script. The final data.frame will be printed to the console.
#
#-------------------------------------------------------------------------------
# 1. SETUP: Load required packages
#-------------------------------------------------------------------------------
library(gh)
library(gert)
library(data.table)
#-------------------------------------------------------------------------------
# 2. CONFIGURATION: Set the path to your local repository
#-------------------------------------------------------------------------------
# <<< CHANGE THIS to the path of your local git repository >>>
# Optional command-line usage: Rscript <script> [repo_path] [target_remote]
# Each argument falls back to its default independently, so supplying only the
# repository path still leaves `target_remote` set to a usable value.
args <- commandArgs(trailingOnly = TRUE)
# Directory of the local git repository to inspect.
repo_path <- if (length(args) >= 1L) args[1L] else "."
# Which remote has the PRs of interest?
target_remote <- if (length(args) >= 2L) args[2L] else "origin"
#-------------------------------------------------------------------------------
# 3. HELPER FUNCTION: To process a single pull request
#-------------------------------------------------------------------------------
#' Process a single PR object from the GitHub API
#'
#' Fetches the files associated with one pull request and returns one row per
#' file, with the PR's metadata repeated on every row.
#'
#' @param pr A list object for one PR, returned from the `gh` package. The
#'   fields read here are `number`, `title`, `user$login`, `head$ref`,
#'   `created_at`, `updated_at`, `draft`, `labels` and `html_url`.
#' @param repo_slug A character string of the repo in "owner/repo" format.
#' @return A data.table where each row is a file in the PR. Returns NULL if
#'   the PR has no files, or if the file list could not be fetched (in which
#'   case a warning is raised).
process_single_pr <- function(pr, repo_slug) {
  message("Processing PR #", pr$number, ": ", pr$title)

  # .limit = Inf asks gh() to return every record rather than a single page.
  pr_files <- tryCatch(
    gh(
      "GET /repos/{repo}/pulls/{pull_number}/files",
      repo = repo_slug,
      pull_number = pr$number,
      .limit = Inf
    ),
    error = function(e) {
      # Best effort: one failing PR is reported and skipped, not fatal.
      warning(
        "Could not fetch files for PR #", pr$number, ". Error: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # length(NULL) is 0, so this covers both the error case and an empty list.
  if (length(pr_files) == 0L) {
    return(NULL)
  }

  # Extract just the filenames from the list of file objects.
  filenames <- vapply(pr_files, \(x) x$filename, character(1L))

  # Collapse the label names into a single comma-separated string.
  labels <- if (length(pr$labels) > 0L) {
    paste(vapply(pr$labels, \(x) x$name, character(1L)), collapse = ",")
  } else {
    NA_character_
  }

  # A NULL argument does not produce a column, so a field missing from the
  # payload would yield a table with fewer columns than its siblings.
  # Substitute a typed NA so every result has the same set of columns.
  na_if_null <- function(x, na) if (is.null(x)) na else x

  data.table(
    pr_number = pr$number,
    pr_title = na_if_null(pr$title, NA_character_),
    filename = filenames,
    author = na_if_null(pr$user$login, NA_character_),
    branch_name = na_if_null(pr$head$ref, NA_character_),
    created_at = na_if_null(pr$created_at, NA_character_),
    updated_at = na_if_null(pr$updated_at, NA_character_),
    is_draft = na_if_null(pr$draft, NA),
    labels = labels,
    pr_url = na_if_null(pr$html_url, NA_character_)
  )
}
#-------------------------------------------------------------------------------
# 4. MAIN SCRIPT LOGIC
#-------------------------------------------------------------------------------
# Validate that repo_path exists and looks like a git working tree. `.git` is a
# directory in an ordinary clone but a plain file in linked worktrees and
# submodules, so test for existence rather than specifically for a directory.
if (!dir.exists(repo_path) || !file.exists(file.path(repo_path, ".git"))) {
  stop("The specified `repo_path` ('", repo_path, "') is not a valid git repository.")
}

# Get the list of remotes from the local git repo.
remotes <- tryCatch(
  git_remote_list(repo = repo_path),
  error = function(e) {
    stop("Could not read git remotes. Is '", repo_path, "' a valid git repository with remotes configured?")
  }
)

# Find the URL of the remote named by `target_remote`.
origin_url <- remotes$url[remotes$name == target_remote]
if (length(origin_url) == 0) {
  stop("No remote named '", target_remote, "' found in this repository.")
}

# Extract the "owner/repo" slug from the git URL. Accepted forms:
#   git@github.com:owner/repo[.git]
#   ssh://git@github.com/owner/repo[.git]
#   https://github.com/owner/repo[.git][/]
# sub()/gsub() return their input unchanged when the pattern does not match,
# so a failed parse has to be detected with grepl() rather than is.na().
slug_pattern <- R"{^(?:git@|ssh://git@|https?://(?:[^@/]+@)?)github\.com[/:]([\w.-]+/[\w.-]+?)(?:\.git)?/?$}"
if (!grepl(slug_pattern, origin_url, perl = TRUE)) {
  stop("Could not parse a 'owner/repo' slug from the origin URL: ", origin_url)
}
repo_owner_slug <- sub(slug_pattern, "\\1", origin_url, perl = TRUE)
message("Identified repository: ", repo_owner_slug)

message("Fetching open pull requests...")
# Get all open pull requests for the repository.
# .limit = Inf asks gh() to return every record rather than a single page.
open_prs <- tryCatch(
  gh(
    "GET /repos/{repo}/pulls",
    repo = repo_owner_slug,
    state = "open",
    .limit = Inf
  ),
  error = function(e) {
    message("Failed to fetch pull requests. Please check the following:")
    message("1. Your internet connection.")
    message("2. Your GITHUB_PAT is valid and has 'repo' scope.")
    message("3. The repository '", repo_owner_slug, "' exists and is accessible.")
    # Re-signal the original condition after printing the hints.
    stop(e)
  }
)

if (length(open_prs) == 0) {
  message("No open pull requests found for this repository.")
  pr_files <- data.table(NULL) # create an empty table
} else {
  message(length(open_prs), " open pull requests found. Fetching files for each...")
  # Process each pull request and bind the per-PR tables into a single table.
  # PRs for which process_single_pr() returns NULL contribute no rows.
  pr_files <- rbindlist(
    lapply(open_prs, process_single_pr, repo_slug = repo_owner_slug)
  )
}

# Print the result when the script is run at top level.
pr_files
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment