Dumps entire subreddits out of imgur into the PWD.
// rDump -- Dumps images in an Imgur sub-reddit thing
// Dependencies:
//   go get github.com/PuerkitoBio/goquery
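//
// Usage sketch (assumes this file is saved locally as rdump.go; the subreddit
// name "funny" is only an example):
//
//   go run rdump.go -v funny
//
// Images found under imgur.com/r/funny are written into the current working directory.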
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"
)
// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8  // Albums with at least this many images are fetched via the AJAX endpoint.
	num_workers    int    = 5  // Number of concurrent download workers.
	max_pages      int    = 10 // Maximum number of listing pages to walk per subreddit.
)
// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}
func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}
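// For example, a PostDetail with the (hypothetical) Hash "abc1234" and Ext ".jpg"
// resolves to "https://i.imgur.com/abc1234.jpg" via ajax_img_fmt above.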
// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely to
	// block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for link := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(link)
			log.Printf("Entering Page #%d : %s", pageNo, link)
			for link := range linkChannel {
				out <- link
			}
		}
	})()
	return out
}

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}
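// For a hypothetical subreddit "pics", the generator above emits:
//   https://imgur.com/r/pics
//   https://imgur.com/r/pics/new/page/1/hit?scrolled
//   ...
//   https://imgur.com/r/pics/new/page/9/hit?scrolled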
// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}
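// For a hypothetical link like "https://imgur.com/gallery/abc1234", the helper
// above returns "abc1234" (the last element of the URL path).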
// Download a file... Unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
// Parses images and the data-page thing out of the entry lists:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this, which is the page num:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If the page was malformed, or had no usable content, just send back page -1:
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}
func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("Bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}
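// For a hypothetical album id "abc1234", httpAJAX above requests:
//   https://imgur.com/ajaxalbums/getimages/abc1234/hit.json?all=true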
// Will use the AJAX endpoint to pluck all images in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}
// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums could have TONS of pics, so use AJAX if too many pics:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for linkz := range fetchAJAXUrls(detailLink) {
				out <- linkz
			}
			return
		}
		// Else, emit a single entry:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}
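// How the pieces above fit together: fetchAllImageLinks walks the listing pages and
// feeds post-detail links into a single channel; num_workers imageWorker goroutines
// share that channel, resolve each detail page to downloadable image URLs, and write
// the files into the current directory; main then waits for each worker's done-channel
// to close.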
// Main func parses args, and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	for _, w := range workers {
		_ = <-w
	}
	log.Printf("Done.")
}