Export pull request & review data from GitHub via the HTTP API
const request = require('request-promise-native');
const parseLinkHeader = require('parse-link-header');
const fs = require('fs');
/*
 * This is an idempotent script for downloading PR & review data for a single repository.
 * Set the repo and auth variables below, then run and wait. It's not speedy, but it works.
 *
 * If you stop the script, it dies, or you trigger abuse detection, it is safe to restart:
 * it will only download the remaining data, and any requests that were pending at the time
 * of the crash will be retried.
 *
 * If you start the script with a data.json file already present, it will only download
 * new pull requests, or update data for pull requests whose updated_at property has changed.
 *
 * The resulting data is stored as one big file, 'data.json'. Pull request data is located
 * under the 'pulls' key; the other two keys record state for crash recovery.
 */
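// Usage sketch (an assumption, not spelled out in the original gist): with
// Node 8+ installed, fetch the dependencies and run the file directly, e.g.
//
//   npm install request request-promise-native parse-link-header
//   node export-pull-requests.js    (the filename is arbitrary)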
const repo = 'YieldifyLabs/khaleesi-tag'; // The target repository, as 'owner/repo'
const auth = {
    'user': 'usernameHere', // Your github username
    'pass': 'githubPersonalAuthTokenHere' // A personal auth token you created
};
const dataFile = './data.json';
// Even though we request in parallel, we make at most 1 request per WAIT_BETWEEN_REQ
// milliseconds; otherwise you trigger abuse detection.
const WAIT_BETWEEN_REQ = 250; // About 200ms is ok for full speed
const opts = {
    auth: auth,
    headers: {
        'User-Agent': 'curl/7.43.0' // GitHub's API rejects requests without a User-Agent header
    },
    resolveWithFullResponse: true // We need response headers for pagination and rate-limit info
};
// Default empty state; if a data file already exists on disk, it is loaded instead
let data = {
    pulls: {},   // downloaded pull request data, keyed by PR number
    queue: [],   // requests waiting to be made
    pending: {}  // request id -> true if in flight, false if queued or completed
};
// Add a request to the queue, unless it has already been queued or completed
function enqueue(obj) {
    const id = JSON.stringify(obj);
    if (data.pending[id] !== undefined && !data.pending[id]) {
        return; // already queued or completed
    }
    data.queue.push(obj);
    data.pending[id] = false;
}
// Chain of timer promises used to space requests out: each request awaits its
// predecessor's timer, so requests start at most once per WAIT_BETWEEN_REQ ms
let waitpromise = Promise.resolve();
function wait(delay = WAIT_BETWEEN_REQ) {
    return new Promise((resolve) => {
        setTimeout(resolve, delay);
    });
}
let ratelimitPromise = Promise.resolve();
async function doApiRequest(url) {
    const tmp = waitpromise;
    waitpromise = waitpromise.then(wait);
    await tmp; // wait our turn in the throttling chain
    await ratelimitPromise;
    const result = await request.get(url, opts);
    const rateLimitRemaining = parseInt(result.headers['x-ratelimit-remaining'], 10);
    if (rateLimitRemaining < 10) {
        console.log('About to hit rate limit, stopping for now .....');
        writeData();
        console.log('The next rate limit reset will be at:');
        let nextReset = new Date(parseInt(result.headers['x-ratelimit-reset'], 10) * 1000);
        console.log(nextReset);
        // Sleep until shortly after the reset time, then resume
        ratelimitPromise = wait(nextReset - new Date().getTime() + 5000);
        await ratelimitPromise;
    }
    if (rateLimitRemaining % 50 === 0) {
        console.log(`----- Rate limit remaining: ${rateLimitRemaining}`);
    }
    return result;
}
// Invoke callback with each page number after currentPage, based on the
// 'last' relation in the response's Link header
function forEachRemainingPage(result, currentPage, callback) {
    const links = parseLinkHeader(result.headers.link);
    if (links !== null) {
        const lastpage = (links.last || {}).page || currentPage;
        for (let i = currentPage + 1; i <= parseInt(lastpage, 10); i++) {
            callback(i);
        }
    }
}
const columnSpacing = 15;
// Right-pad a string with spaces to the given width, for aligned log output
function pad(string, chars = columnSpacing) {
    return string.padEnd(chars);
}
async function readPulls(page) {
    page = page || 1;
    const url = `https://api.github.com/repos/${repo}/pulls?state=closed&page=${page}`;
    const result = await doApiRequest(url);
    console.log(progress() + pad('[OK]', 5) + pad('[List]') + `[${repo}/pulls]` + (page > 1 ? `[Page:${page}]` : ''));
    if (page === 1) { // If we're on the starting page, queue requests for the other pages
        forEachRemainingPage(result, page, (i) => {
            enqueue({
                type: 'pulls',
                page: i,
            });
        });
    }
    const body = JSON.parse(result.body);
    for (let pull of body) {
        let number = pull.number;
        let prev = data.pulls[number];
        // Only re-fetch pull requests that are new or have changed since the last run
        let isChanged = prev === undefined || prev.updated_at !== pull.updated_at;
        if (!isChanged) {
            continue;
        }
        enqueue({
            type: 'pull',
            pull: number,
        });
    }
}
async function readPull(number) {
    const url = `https://api.github.com/repos/${repo}/pulls/${number}`;
    const result = await doApiRequest(url);
    const pull = JSON.parse(result.body);
    console.log(progress() + pad('[OK]', 5) + pad('[Pull]') + `[${repo}/pulls/${number}]`);
    // Keep only the fields we care about
    data.pulls[number] = {
        number: pull.number,
        state: pull.state,
        title: pull.title,
        user: pull.user.login,
        body: pull.body,
        commits: pull.commits,
        additions: pull.additions,
        deletions: pull.deletions,
        changed_files: pull.changed_files,
        created_at: pull.created_at,
        updated_at: pull.updated_at,
        pushed_at: pull.pushed_at,
        closed_at: pull.closed_at,
        merged_at: pull.merged_at,
        merged: pull.merged,
        merged_by: (pull.merged_by || {}).login,
        merge_commit_sha: pull.merge_commit_sha,
        head: {
            label: pull.head.label,
            ref: pull.head.ref,
            sha: pull.head.sha
        },
        base: {
            label: pull.base.label,
            ref: pull.base.ref,
            sha: pull.base.sha
        },
        comments: pull.comments,
        review_comments: pull.review_comments,
        reviews: [],
        diffComments: [],
        prComments: [],
    };
    // Fetch the reviews, diff comments, and conversation comments for this pull
    enqueue({
        type: 'reviews',
        pull: number,
        page: 1
    });
    enqueue({
        type: 'comments',
        pull: number,
        page: 1
    });
    enqueue({
        type: 'prcomments',
        pull: number,
        page: 1
    });
}
// Review comments attached to lines of the diff
async function readComments(pull, page) {
    const url = `https://api.github.com/repos/${repo}/pulls/${pull}/comments?page=${page}`;
    const result = await doApiRequest(url);
    console.log(progress() + pad('[OK]', 5) + pad('[DiffComments]') + `[${repo}/pulls/${pull}]` + (page > 1 ? `[Page:${page}]` : ''));
    if (page === 1) {
        forEachRemainingPage(result, page, (i) => {
            enqueue({
                type: 'comments',
                pull: pull,
                page: i,
            });
        });
    }
    const body = JSON.parse(result.body);
    for (let comment of body) {
        data.pulls[pull].diffComments.push({
            pull_request_review_id: comment.pull_request_review_id,
            user: comment.user.login,
            body: comment.body,
            created_at: comment.created_at,
            updated_at: comment.updated_at
        });
    }
    // Re-sort, since pages are downloaded asynchronously and may arrive out of order
    data.pulls[pull].diffComments.sort((a, b) => new Date(a.created_at) - new Date(b.created_at));
}
// Issue-style comments on the PR conversation thread
async function readIssueComments(pull, page) {
    const url = `https://api.github.com/repos/${repo}/issues/${pull}/comments?page=${page}`;
    const result = await doApiRequest(url);
    console.log(progress() + pad('[OK]', 5) + pad('[PrComments]') + `[${repo}/pulls/${pull}]` + (page > 1 ? `[Page:${page}]` : ''));
    if (page === 1) {
        forEachRemainingPage(result, page, (i) => {
            enqueue({
                type: 'prcomments',
                pull: pull,
                page: i
            });
        });
    }
    const body = JSON.parse(result.body);
    for (let comment of body) {
        data.pulls[pull].prComments.push({
            user: comment.user.login,
            body: comment.body,
            created_at: comment.created_at,
            updated_at: comment.updated_at
        });
    }
    // Re-sort, since pages are downloaded asynchronously and may arrive out of order
    data.pulls[pull].prComments.sort((a, b) => new Date(a.created_at) - new Date(b.created_at));
}
async function readReviews(pull, page) {
    const url = `https://api.github.com/repos/${repo}/pulls/${pull}/reviews?page=${page}`;
    const result = await doApiRequest(url);
    console.log(progress() + pad('[OK]', 5) + pad('[Reviews]') + `[${repo}/pulls/${pull}]` + (page > 1 ? `[Page:${page}]` : ''));
    if (page === 1) {
        forEachRemainingPage(result, page, (i) => {
            enqueue({
                type: 'reviews',
                pull: pull,
                page: i
            });
        });
    }
    const body = JSON.parse(result.body);
    for (let review of body) {
        data.pulls[pull].reviews.push({
            id: review.id,
            user: review.user.login,
            body: review.body,
            state: review.state,
            submitted_at: review.submitted_at
        });
    }
    // Re-sort, since pages are downloaded asynchronously and may arrive out of order
    data.pulls[pull].reviews.sort((a, b) => new Date(a.submitted_at) - new Date(b.submitted_at));
}
let totalProcessed = 0;
let pending = 0;
function progress() {
    return pad(`[${totalProcessed}/${totalProcessed + data.queue.length + pending}]`, 14);
}
let processedCounter = 0;
async function go() {
    let fork = null;
    let id = null; // declared outside the loop so the catch block can log the failed request
    try {
        while (data.queue.length !== 0) {
            const item = data.queue.pop();
            id = JSON.stringify(item);
            data.pending[id] = true;
            pending = pending + 1;
            if (fork === null && data.queue.length !== 0) {
                // Let's do some work in parallel
                fork = go();
            }
            // Dispatch on type; each handler re-enqueues its own type for extra pages,
            // so the mapping here must match the types used inside the handlers
            switch (item.type) {
                case 'pulls':
                    await readPulls(item.page);
                    break;
                case 'pull':
                    await readPull(item.pull);
                    break;
                case 'reviews':
                    await readReviews(item.pull, item.page);
                    break;
                case 'comments':
                    await readComments(item.pull, item.page);
                    break;
                case 'prcomments':
                    await readIssueComments(item.pull, item.page);
                    break;
                default:
                    throw new Error('unknown: ' + id);
            }
            data.pending[id] = false;
            pending = pending - 1;
            processedCounter = (processedCounter + 1) % 10;
            if (processedCounter === 0) { // Write to disk after every 10 requests
                writeData();
            }
            totalProcessed = totalProcessed + 1;
        }
    } catch (err) {
        console.log(pad('', 10) + pad('[FAIL]', 5) + id);
        throw err;
    } finally {
        if (fork !== null) {
            await fork;
        }
    }
}
function readData() {
    if (fs.existsSync(dataFile)) {
        data = JSON.parse(fs.readFileSync(dataFile, 'utf8'));
    }
}
function writeData() {
    fs.writeFileSync(dataFile, JSON.stringify(data));
}
async function main() {
    readData();
    try {
        // Re-enqueue any requests that were still pending when the last run
        // stopped; they were interrupted and need to be retried
        for (let key in data.pending) {
            if (data.pending[key]) {
                enqueue(JSON.parse(key));
            }
        }
        // If there are no pending requests from a crash, read the list of pull requests
        // again. Only the changed pull requests will be re-requested
        if (data.queue.length === 0) {
            data.pending = {}; // Wipe out pending, so we can re-request things we need to
            enqueue({
                type: 'pulls',
                page: 1
            });
        }
        await go();
    } catch (err) {
        console.log(err);
        throw err;
    } finally {
        writeData();
    }
}
main();
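// ---------------------------------------------------------------------------
// Consumption sketch (not part of the original script; kept commented out so
// it does not run during the export). Dropped into its own file, it would read
// the exported data.json back and tally reviews per user, as one example of
// what the 'pulls' structure supports:
//
//   const fs = require('fs');
//   const exported = JSON.parse(fs.readFileSync('./data.json', 'utf8'));
//   const reviewCounts = {};
//   for (const pull of Object.values(exported.pulls)) {
//       for (const review of pull.reviews) {
//           reviewCounts[review.user] = (reviewCounts[review.user] || 0) + 1;
//       }
//   }
//   console.log(reviewCounts);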