Last active
July 2, 2025 16:44
-
-
Save MichaelChirico/64e981b4359a36d2279bf55c39c7d3dd to your computer and use it in GitHub Desktop.
Scrape GitHub PRs in a repo to see which files they touch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# INITIALLY GENERATED BY GEMINI | |
# PR File Scraper | |
# | |
# Description: | |
# This script inspects a local Git repository, identifies its GitHub remote, | |
# and fetches all open pull requests. It then compiles a data.frame | |
# where each row represents a file modified in a specific pull request, | |
# along with metadata about that PR. | |
# | |
# Requirements: | |
# 1. R installed on your system. | |
# 2. The following R packages: gh, gert, data.table.
# 3. A GitHub Personal Access Token (PAT) configured for the `gh` package. | |
# The easiest way to set this up is to run `usethis::create_github_token()` | |
# and then `usethis::edit_r_environ()` to add the line: | |
# GITHUB_PAT="your_newly_created_token" | |
# Save the .Renviron file and restart R. | |
# | |
# How to run: | |
# 1. Make sure the required packages are installed, e.g. with
#    install.packages(c("gh", "gert", "data.table")).
# 2. Set the `repo_path` variable to the directory of your local git repo. | |
# 3. Run the entire script. The final data.frame will be printed to the console. | |
# | |
#------------------------------------------------------------------------------- | |
# 1. SETUP: Install and load required packages | |
#------------------------------------------------------------------------------- | |
library(gh) | |
library(gert) | |
library(data.table) | |
#------------------------------------------------------------------------------- | |
# 2. CONFIGURATION: Set the path to your local repository | |
#------------------------------------------------------------------------------- | |
# <<< CHANGE THIS to the path of your local git repository >>> | |
# Optional command-line usage:
#   Rscript <this script> [repo_path] [target_remote]
# Each argument that is omitted falls back to its default, so passing only
# the repository path still leaves `target_remote` set to "origin".
args <- commandArgs(trailingOnly = TRUE)
repo_path <- if (length(args) >= 1L) args[1L] else "."
# which remote has the PRs of interest?
target_remote <- if (length(args) >= 2L) args[2L] else "origin"
#------------------------------------------------------------------------------- | |
# 3. HELPER FUNCTION: To process a single pull request | |
#------------------------------------------------------------------------------- | |
#' Process a single PR object from the GitHub API
#'
#' Fetches the files touched by one pull request and combines them with
#' PR-level metadata into a table with one row per file.
#'
#' @param pr A list object for one PR, returned from the `gh` package.
#' @param repo_slug A character string of the repo in "owner/repo" format.
#' @return A `data.table` with one row per file in the PR and the columns
#'   `pr_number`, `pr_title`, `filename`, `author`, `branch_name`,
#'   `created_at`, `updated_at`, `is_draft`, `labels` (comma-separated, `NA`
#'   when the PR has no labels) and `pr_url`. Returns `NULL` if the PR has no
#'   files or if fetching them failed (a warning is raised in that case).
process_single_pr <- function(pr, repo_slug) {
  message("Processing PR #", pr$number, ": ", pr$title)

  # API call to get all files for the specified pull request.
  # .limit = Inf ensures we get all files, even if there are more than 100.
  pr_files <- tryCatch(
    gh(
      "GET /repos/{repo}/pulls/{pull_number}/files",
      repo = repo_slug,
      pull_number = pr$number,
      .limit = Inf
    ),
    error = function(e) {
      # Best effort: one failing PR should not abort the whole scrape.
      warning(
        "Could not fetch files for PR #", pr$number, ". Error: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # Covers both "request failed" (NULL) and "PR has no files" (empty list).
  if (length(pr_files) == 0L) {
    return(NULL)
  }

  # Extract just the filenames from the list of file objects
  filenames <- vapply(pr_files, \(x) x$filename, character(1L))

  # Collapse the label names into a single comma-separated string
  labels <- if (length(pr$labels) > 0L) {
    paste(vapply(pr$labels, \(x) x$name, character(1L)), collapse = ",")
  } else {
    NA_character_
  }

  # A NULL field would make data.table() silently drop that column, and
  # binding the per-PR tables would then fail on mismatched column counts.
  # The GitHub API schema marks some of these fields (e.g. `user`) as
  # nullable, so substitute a typed NA instead.
  or_na <- function(x, na) if (is.null(x)) na else x

  data.table(
    pr_number = or_na(pr$number, NA_integer_),
    pr_title = or_na(pr$title, NA_character_),
    filename = filenames,
    author = or_na(pr$user$login, NA_character_),
    branch_name = or_na(pr$head$ref, NA_character_),
    created_at = or_na(pr$created_at, NA_character_),
    updated_at = or_na(pr$updated_at, NA_character_),
    is_draft = or_na(pr$draft, NA),
    labels = labels,
    pr_url = or_na(pr$html_url, NA_character_)
  )
}
#------------------------------------------------------------------------------- | |
# 4. MAIN SCRIPT LOGIC | |
#------------------------------------------------------------------------------- | |
# Validate that the repo_path exists and is a git repository | |
# Validate that repo_path exists and looks like a git repository. `.git` is a
# directory in a regular clone but a plain file in linked worktrees and
# submodules, so test for existence rather than specifically for a directory.
if (!dir.exists(repo_path) || !file.exists(file.path(repo_path, ".git"))) {
  stop("The specified `repo_path` ('", repo_path, "') is not a valid git repository.")
}

# Get the list of remotes from the local git repo. The underlying error text
# is appended so the real cause is not hidden behind the generic hint.
remotes <- tryCatch(
  git_remote_list(repo = repo_path),
  error = function(e) {
    stop(
      "Could not read git remotes. Is '", repo_path,
      "' a valid git repository with remotes configured?",
      " (", conditionMessage(e), ")",
      call. = FALSE
    )
  }
)

# Find the URL of the requested remote (`target_remote`)
origin_url <- remotes$url[remotes$name == target_remote]
if (length(origin_url) == 0L) {
  stop("No remote named '", target_remote, "' found in this repository.")
}
# Extract the "owner/repo" slug from the remote URL. Accepted GitHub forms:
#   https://github.com/owner/repo(.git)   (optionally with user@ credentials)
#   git://github.com/owner/repo(.git)
#   ssh://git@github.com/owner/repo(.git)
#   git@github.com:owner/repo(.git)
slug_pattern <- paste0(
  R"{^(?:(?:https?|git)://(?:[^@/]+@)?|ssh://git@|git@)github\.com[/:]}",
  R"{([\w.-]+/[\w.-]+?)(?:\.git)?/?$}"
)
# sub() returns its input unchanged when the pattern does not match, so the
# match has to be tested explicitly; otherwise an unparseable URL would be
# passed on to the API as if it were a slug.
if (length(origin_url) != 1L || is.na(origin_url) ||
    !grepl(slug_pattern, origin_url, perl = TRUE)) {
  stop("Could not parse a 'owner/repo' slug from the origin URL: ", origin_url)
}
repo_owner_slug <- sub(slug_pattern, "\\1", origin_url, perl = TRUE)
message("Identified repository: ", repo_owner_slug)
message("Fetching open pull requests...")

# Ask GitHub for every open PR in the repository; `.limit = Inf` makes gh
# follow pagination until all pages have been retrieved.
open_prs <- tryCatch(
  gh(
    "GET /repos/{repo}/pulls",
    repo = repo_owner_slug,
    state = "open",
    .limit = Inf
  ),
  error = function(e) {
    # Print troubleshooting hints, then re-raise the original condition.
    message("Failed to fetch pull requests. Please check the following:")
    message("1. Your internet connection.")
    message("2. Your GITHUB_PAT is valid and has 'repo' scope.")
    message("3. The repository '", repo_owner_slug, "' exists and is accessible.")
    stop(e)
  }
)

if (length(open_prs) > 0) {
  message(length(open_prs), " open pull requests found. Fetching files for each...")
  # One table per PR (NULL for PRs without files), stacked into a single table
  pr_files <- rbindlist(
    lapply(open_prs, \(pr) process_single_pr(pr, repo_slug = repo_owner_slug))
  )
} else {
  message("No open pull requests found for this repository.")
  pr_files <- data.table(NULL) # nothing to report: empty table
}

# Auto-print the result when the script is run at top level
pr_files
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment