Skip to content

Instantly share code, notes, and snippets.

@cesarcneto
Last active March 31, 2022 07:41
Show Gist options
  • Save cesarcneto/f25207f22e6e49d22df011169fb47a49 to your computer and use it in GitHub Desktop.
Save cesarcneto/f25207f22e6e49d22df011169fb47a49 to your computer and use it in GitHub Desktop.
Babashka script created to help hiring managers reach out to open-source contributors. It fetches the contributors of a given repo, then fetches each contributor's user profile and most recent git commit.

fetch-gh-contributors

This is a babashka script created to help hiring managers to reach out to OpenSource contributors. It fetches contributors from a given Github repo, fetches their user profile plus most recent git commit of each user.

Requirements

To use this script you are expected to have the following software installed:

Quickstart

Once babashka is installed (see requirements above), you need to follow a few steps:

1 - Create your personal access token in Github

Note: This is a required step in order to run the script without being rate limited.

2 - Make GH_FETCH_CONTRIBUTORS_USER and GH_FETCH_CONTRIBUTORS_TOKEN environment variables available

E.g. in a terminal window run:

export GH_FETCH_CONTRIBUTORS_USER=<YOUR_GITHUB_USER_NAME_HERE>
export GH_FETCH_CONTRIBUTORS_TOKEN=<YOUR_GITHUB_ACCESS_TOKEN_HERE>

3 - Run the script

In the same terminal window that you ran step #2, run:

bb -m gh-contributors-cli.main --repo apache/spark

Once the script run is completed, a file called result.csv should be created at .data/apache/spark (assuming that you provided --repo apache/spark to the script).

IMPORTANT NOTE: This script keeps fetched data cached in the .data folder. Delete that folder (or the sub-folder of the repo in question) whenever you want to fetch fresh data for a given repo.

#!/usr/bin/env bb
(ns gh-contributors-cli.main
(:gen-class)
(:require [babashka.curl :as curl]
[cheshire.core :as json]
[clojure.string :as str]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.tools.cli :refer [parse-opts]]))
;; GitHub basic-auth credentials, read once from the environment when the
;; namespace is loaded. Either value is nil when its variable is not set.
(def user-credentials
  (let [env #(System/getenv %)]
    {:username (env "GH_FETCH_CONTRIBUTORS_USER")
     :password (env "GH_FETCH_CONTRIBUTORS_TOKEN")}))
;; Option specs handed to clojure.tools.cli/parse-opts.
;;
;; The --repo validator accepts "owner/name". The owner part may contain word
;; characters and hyphens; the name part may additionally contain dots (e.g.
;; "clojure/data.csv"). The previous pattern used commas as separators inside
;; the character classes, which made "," a legal character and left "." out.
(def cli-options
  [["-r" "--repo REPO" "Repository name. E.g.: apache/spark"
    :validate [#(re-matches #"[\w-]+/[\w.-]+" %) "Must be a valid owner/repository name"]]
   ["-h" "--help" "Show this help"]])
(defn- parse-cli-args!
  "Parses the process-wide `*command-line-args*` against `cli-options` and
  returns the map produced by `parse-opts`."
  []
  (-> *command-line-args*
      (parse-opts cli-options)))
;; Keys extracted from each contributor map when building result.csv.
;; The order of this vector is the column order of both the header row
;; (see ->output-headers) and every data row (see entry->csv-row).
(def output-attributes
[:html_url :email :name :most_recent_commit_date
:twitter_username :login :following :updated_at
:bio :contributions :location :blog
:followers :company :created_at])
;; Value sent in the "Accept" header of every GitHub API request.
(def application-gh-json "application/vnd.github.v3+json")
;; File names (relative to the per-repo data directory) of the aggregated
;; JSON outputs written at each stage.
(def contributors-filename "contributors.json")
(def contributors+user-filename "contributors+user.json")
(def contributors+user+commit-filename "contributors+user+commit.json")
;; Root directory of all cached data.
(def data-dir ".data")
;; Sub-directories (inside the per-repo data directory) holding one cache
;; file per contributor id.
(def contributors+user-dir "contributors+user")
(def user+commit-dir "user+commit")
(defn- target-repository
  "Returns the value of the --repo command-line option, or nil when it was
  not supplied or did not pass validation."
  []
  (-> (parse-cli-args!)
      :options
      :repo))
;; Per-repo cache locations. These are computed once, when the namespace is
;; loaded, from the --repo option found in *command-line-args*. When no repo
;; was given, (target-repository) is nil and base-path is just ".data/".
(def base-path (str data-dir "/" (target-repository)))
(def contributors+user-path (str base-path "/" contributors+user-dir))
(def user+commit-path (str base-path "/" user+commit-dir))
(defn- data-dir!
  "Ensures the per-repo cache directory and its two per-contributor
  sub-directories exist, then returns the root `data-dir` name."
  []
  (doseq [path [base-path contributors+user-path user+commit-path]]
    (.mkdirs (io/as-file path)))
  data-dir)
(defn- contributors!
  "Requests one page (100 entries per page) of the contributors of `repo`
  from the GitHub API and returns the response body parsed as JSON with
  keyword keys. `page` defaults to 1."
  ([repo]
   (contributors! repo 1))
  ([repo page]
   (let [endpoint (str "https://api.github.com/repos/" repo "/contributors")
         {:keys [username password]} user-credentials
         response (curl/get endpoint
                            {:basic-auth   [username password]
                             :query-params {"page"     (str page)
                                            "per_page" "100"}
                             :headers      {"Accept" application-gh-json}})]
     (json/parse-string (:body response) true))))
(defn- repo+filename->file
  "Returns a java.io.File for `filename` inside the data directory of `repo`."
  [repo filename]
  (->> [data-dir repo filename]
       (str/join "/")
       io/as-file))
(defn- file->json!
  "Reads `file` and parses its content as JSON with keyword keys.
  Returns nil when the file does not exist."
  [file]
  (when (.exists file)
    (json/parse-string (slurp file) true)))
(defn- json->file!
  "Serializes `json` to a JSON string, writes it to `file` (overwriting any
  existing content) and returns `json` unchanged."
  [json file]
  (let [encoded (json/generate-string json)]
    (spit file encoded)
    json))
(defn all-contributors!
  "Returns every contributor of `repo`.

  When the cached contributors file of the repo exists, its parsed content is
  returned and no request is made. Otherwise pages are fetched one by one
  (starting at page 1) until an empty page is returned; the accumulated
  result is written to the cache file and then read back from it, so the
  returned value has the same shape as on a cache hit."
  [repo]
  (let [cf (repo+filename->file repo contributors-filename)
        stored-contributors (file->json! cf)]
    (if (some? stored-contributors)
      stored-contributors
      (do
        (println (str "Fetching all " repo " contributors"))
        (let [all-contributors (loop [page 1
                                      acc  []]
                                 (println (str "Fetching page " page))
                                 (let [contributors (contributors! repo page)]
                                   (if (not-empty contributors)
                                     ;; `into` accumulates eagerly in a vector. Repeated
                                     ;; `concat` in a loop builds nested lazy seqs, which
                                     ;; can overflow the stack when finally realized.
                                     (recur (inc page) (into acc contributors))
                                     acc)))]
          (json->file! all-contributors cf)
          (file->json! cf))))))
(defn- user-file
  "Returns the java.io.File caching the contributor+user data for `id`."
  [id]
  (-> (str contributors+user-path "/" id)
      io/as-file))
(defn user!
  "Returns `contributor` merged with the user profile found at its :url.

  When a cache file for the contributor's :id exists, its parsed content is
  returned and no request is made. Otherwise the profile is fetched, merged
  into `contributor`, written to the cache file and read back from it.

  Throws an ex-info whose ex-data is `contributor` and whose cause is the
  original exception when the fetch, parse or write fails."
  [contributor]
  (let [id     (:id contributor)
        url    (:url contributor)
        cached (file->json! (user-file id))]
    (if (some? cached)
      cached
      (do
        (println (str "Fetching user " url))
        (try
          (let [user (-> (curl/get url {:basic-auth [(:username user-credentials)
                                                     (:password user-credentials)]
                                        :headers    {"Accept" application-gh-json}})
                         :body
                         (json/parse-string true))
                contributor+user (conj contributor user)]
            (spit (user-file id) (json/generate-string contributor+user))
            (file->json! (user-file id)))
          (catch java.lang.Exception e
            ;; Keep `e` as the cause so the original stack trace is not lost,
            ;; and fall back to a descriptive message when `e` carries none.
            (throw (ex-info (or (.getMessage e) (str "Failed to fetch user " url))
                            contributor
                            e))))))))
(defn- user+commit-file
  "Returns the java.io.File caching the user+commit data for `id`."
  [id]
  (-> (str user+commit-path "/" id)
      io/as-file))
(defn- id->user+commit!
  "Returns the cached user+commit data for `id` parsed as JSON with keyword
  keys, or nil when no cache file exists for that id."
  [id]
  (file->json! (user+commit-file id)))
(defn user+commit!
  "Returns `contributor+user` with :most_recent_commit_date added.

  When a cache file for the contributor's :id exists, its parsed content is
  returned and no request is made. Otherwise the commits of `repo` authored
  by the contributor's :login are requested, the author date of the first
  commit in the response is taken (nil when the response is empty), and the
  merged map is written to the cache file and read back from it.

  Throws an ex-info whose ex-data is `contributor+user` and whose cause is
  the original exception when the fetch, parse or write fails."
  [repo contributor+user]
  (let [id     (:id contributor+user)
        login  (:login contributor+user)
        cached (id->user+commit! id)]
    (if (some? cached)
      cached
      (do
        (println (str "Fetching latest commit of " login))
        (try
          (let [most-recent-commit-date
                (-> (curl/get (str "https://api.github.com/repos/" repo "/commits")
                              {:basic-auth   [(:username user-credentials)
                                              (:password user-credentials)]
                               :headers      {"Accept" application-gh-json}
                               :query-params {"page"     "1"
                                              "per_page" "5"
                                              "author"   login}})
                    :body
                    (json/parse-string true)
                    (first)
                    (get-in [:commit :author :date]))
                user+commit (conj contributor+user
                                  {:most_recent_commit_date most-recent-commit-date})]
            (spit (user+commit-file id) (json/generate-string user+commit))
            (id->user+commit! id))
          (catch java.lang.Exception e
            ;; Keep `e` as the cause so the original stack trace is not lost,
            ;; and fall back to a descriptive message when `e` carries none.
            (throw (ex-info (or (.getMessage e)
                                (str "Failed to fetch latest commit of " login))
                            contributor+user
                            e))))))))
(defn- entry->csv-row
  "Turns one contributor map into a CSV row: the value of every key listed
  in `output-attributes`, in that order, converted with `str` (so a missing
  or nil value becomes an empty string)."
  [map-entry]
  (for [attr output-attributes]
    (str (get map-entry attr))))
(defn ->output-headers
  "Returns a vector with the name of every attribute in `attrs`, in order."
  [attrs]
  (mapv name attrs))
(defn -main
  "Script entry point.

  Options are read from `*command-line-args*` (the same source used for the
  load-time path definitions), so `args` itself is not consulted.

  Checks are made in this order, and the first one that applies wins:
    1. option parsing errors -> print them;
    2. --help given, or no --repo -> print the usage summary;
    3. credentials env vars missing -> print a hint;
    4. otherwise fetch contributors, their profiles and latest commits, write
       the intermediate JSON files and finally result.csv in the repo's data
       directory."
  [& args]
  (let [{:keys [errors options summary]} (parse-cli-args!)
        repo (:repo options)]
    (cond
      (not-empty errors)
      (println (str/join "\n" errors))

      ;; Previously --help was never honored, and `--help` with credentials
      ;; set fell through to the misleading "env vars are not set" message.
      (or (:help options) (nil? repo))
      (println summary)

      (not (and (:username user-credentials)
                (:password user-credentials)))
      (println "GH_FETCH_CONTRIBUTORS_USER and GH_FETCH_CONTRIBUTORS_TOKEN env vars are not set.")

      :else
      (do
        (data-dir!)
        (let [contributors (all-contributors! repo)
              contributors+user-file (repo+filename->file repo contributors+user-filename)
              contributors+u+c-file (repo+filename->file repo contributors+user+commit-filename)
              contributors+user-json (json->file! (map user! contributors)
                                                  contributors+user-file)
              contributors+user+commit-json (json->file!
                                             (map (partial user+commit! repo)
                                                  contributors+user-json)
                                             contributors+u+c-file)]
          (with-open [csv-writer (io/writer (io/as-file
                                             (str data-dir "/" repo "/result.csv")))]
            (csv/write-csv csv-writer [(->output-headers output-attributes)])
            (doseq [contributor contributors+user+commit-json]
              (csv/write-csv csv-writer [(entry->csv-row contributor)]))))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment