Dumps entire subreddits out of imgur into the PWD.
// rDump -- Dumps images in an Imgur sub-reddit thing
// Dependencies:
//   go get github.com/PuerkitoBio/goquery
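//
// Usage sketch (assumes this file is saved locally as rdump.go; the subreddit
// name "funny" is only an example):
//
//   go run rdump.go -v funny
//
// Images found under imgur.com/r/funny are written into the current working directory.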
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strconv"
)
// Magic values go here:
const (
	user_agent     string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36"
	base_url_fmt   string = "https://imgur.com/r/%s"
	next_url_fmt   string = "https://imgur.com/r/%s/new/page/%d/hit?scrolled"
	detail_url_fmt string = "https://imgur.com%s"
	ajax_url_fmt   string = "https://imgur.com/ajaxalbums/getimages/%s/hit.json?all=true"
	ajax_img_fmt   string = "https://i.imgur.com/%s%s"
	download_fmt   string = "https:%s"
	album_cutoff   int    = 8  // Albums with at least this many images are fetched via the AJAX endpoint.
	num_workers    int    = 5  // Number of concurrent download workers.
	max_pages      int    = 10 // Maximum number of listing pages to walk per subreddit.
)
// Used for parsing the AJAX endpoints:
type PostDetail struct {
	Hash      string `json:"hash"`
	Title     string `json:"title"`
	Desc      string `json:"description"`
	Width     int    `json:"width"`
	Height    int    `json:"height"`
	Size      int    `json:"size"`
	Ext       string `json:"ext"`
	Anim      bool   `json:"animated"`
	PreferVid bool   `json:"prefer_video"`
	Looping   bool   `json:"looping"`
	Timestamp string `json:"datetime"`
}

type ListData struct {
	Count  int          `json:"count"`
	Images []PostDetail `json:"images"`
}

type AJAXResponse struct {
	Data    ListData `json:"data"`
	Success bool     `json:"success"`
	Status  int      `json:"status"`
}
func (pd PostDetail) GetURL() string {
	if pd.Hash == "" || pd.Ext == "" {
		return ""
	}
	return fmt.Sprintf(ajax_img_fmt, pd.Hash, pd.Ext)
}
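// For example, a PostDetail with the (hypothetical) Hash "abc1234" and Ext ".jpg"
// resolves to "https://i.imgur.com/abc1234.jpg" via ajax_img_fmt above.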
// From a subreddit name, fetches all urls from that subreddit:
func fetchAllImageLinks(subreddit string) chan string {
	// We give this channel a buffer, just so that page changes are less likely to
	// block image workers:
	out := make(chan string, 10)
	go (func() {
		defer close(out)
		for link := range urlGenerator(subreddit) {
			pageNo, linkChannel := fetchUrlList(link)
			log.Printf("Entering Page #%d : %s", pageNo, link)
			for link := range linkChannel {
				out <- link
			}
		}
	})()
	return out
}

// Given a subreddit name, returns a channel of URLs to scrape:
func urlGenerator(seed string) chan string {
	out := make(chan string)
	base := fmt.Sprintf(base_url_fmt, seed)
	go (func() {
		out <- base
		for n := 1; n < max_pages; n++ {
			out <- fmt.Sprintf(next_url_fmt, seed, n)
		}
		close(out)
	})()
	return out
}
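// For a hypothetical subreddit "pics", the generator above emits:
//   https://imgur.com/r/pics
//   https://imgur.com/r/pics/new/page/1/hit?scrolled
//   ...
//   https://imgur.com/r/pics/new/page/9/hit?scrolled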
// Performs an HTTP GET, with the correct fake headers:
func httpGET(url string) (*http.Response, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", user_agent)
	return http.DefaultClient.Do(request)
}

// A stupid hack so we can manipulate our user-agent when fetching pages:
func buildGoQueryDocument(url string) (*goquery.Document, error) {
	resp, err := httpGET(url)
	if err != nil {
		return nil, err
	}
	return goquery.NewDocumentFromResponse(resp)
}

func extractFilename(link string) (string, error) {
	parsed, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return path.Base(parsed.Path), nil
}
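// For a hypothetical link like "https://imgur.com/gallery/abc1234", the helper
// above returns "abc1234" (the last element of the URL path).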
// Download a file... Unless we already have it:
func maybeDownload(link string) {
	fname, err := extractFilename(link)
	if err != nil {
		log.Printf("Cannot download [%s] : Bad link. %v", link, err)
		return
	}
	stat, err := os.Stat(fname)
	if err == nil && stat.Size() > 0 {
		log.Printf("Already have '%s'. Skipping.", fname)
		return
	}
	destFile, err := os.Create(fname)
	if err != nil {
		log.Printf("Failed to create '%s': %v", fname, err)
		return
	}
	defer destFile.Close()
	httpResp, err := httpGET(link)
	if err != nil {
		log.Printf("Couldn't download '%s': %v", fname, err)
		return
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode > 299 {
		log.Printf("Download failed for '%s': Status code: %d", fname, httpResp.StatusCode)
		return
	}
	n, err := io.Copy(destFile, httpResp.Body)
	if err != nil {
		log.Printf("Download failed for '%s': %v", fname, err)
		return
	}
	log.Printf("Download successful: '%s' (%d bytes)", fname, n)
}
// Parses images and the data-page thing out of the entry lists:
func fetchUrlList(link string) (pageNum int, urls chan string) {
	pageNum, urls = -1, make(chan string)
	doc, err := buildGoQueryDocument(link)
	if err != nil {
		log.Printf("Failed to read URL: %s", link)
		close(urls)
		return
	}
	// We receive a single value on this, which is the page num:
	pageNumSent, pageNumChan := false, make(chan int)
	defer close(pageNumChan)
	go (func() {
		defer close(urls)
		doc.Find("a.image-list-link").Each(func(_ int, s *goquery.Selection) {
			page, pageExists := s.Attr("data-page")
			href, hrefExists := s.Attr("href")
			if pageExists && !pageNumSent {
				pageNo, _ := strconv.ParseInt(page, 10, 32)
				pageNumSent = true
				pageNumChan <- int(pageNo)
			}
			if hrefExists {
				urls <- href
			}
		})
		// If the page was malformed, or had no usable content, just send back page -1:
		if !pageNumSent {
			log.Printf("Page [%s] contained no usable data", link)
			pageNumChan <- -1
		}
	})()
	pageNum = <-pageNumChan
	return
}
func httpAJAX(detailLink string) ([]byte, error) {
	albumId, err := extractFilename(detailLink)
	if err != nil {
		return nil, err
	}
	albumUrl := fmt.Sprintf(ajax_url_fmt, albumId)
	resp, err := httpGET(albumUrl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode > 299 {
		return nil, fmt.Errorf("Bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}
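// For a hypothetical album id "abc1234", httpAJAX above requests:
//   https://imgur.com/ajaxalbums/getimages/abc1234/hit.json?all=true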
// Will use the AJAX endpoint to pluck all images in an album out:
func fetchAJAXUrls(detailLink string) chan string {
	out := make(chan string)
	data, err := httpAJAX(detailLink)
	if err != nil {
		close(out)
		return out
	}
	go (func() {
		defer close(out)
		parsed := AJAXResponse{}
		err = json.Unmarshal(data, &parsed)
		if err != nil {
			log.Printf("AJAX Parse failed: %v", err)
			return
		}
		for _, img := range parsed.Data.Images {
			if imgUrl := img.GetURL(); imgUrl != "" {
				out <- imgUrl
			}
		}
	})()
	return out
}
// Given the URL to a post detail page, returns the URLs to download:
func fetchDownloadUrls(detailLink string) chan string {
	out := make(chan string)
	detailUrl := fmt.Sprintf(detail_url_fmt, detailLink)
	doc, err := buildGoQueryDocument(detailUrl)
	if err != nil {
		log.Printf("Failed to read detail URL: %s", detailUrl)
		close(out)
		return out
	}
	_maybeSend := func(s string, exists bool) {
		if exists && s != "" {
			fullUrl := fmt.Sprintf(download_fmt, s)
			out <- fullUrl
		}
	}
	go (func() {
		defer close(out)
		// Albums could have TONS of pics, so use AJAX if too many pics:
		if doc.Find("div.post-image").Length() >= album_cutoff {
			log.Printf("Large album: %s", detailLink)
			for linkz := range fetchAJAXUrls(detailLink) {
				out <- linkz
			}
			return
		}
		// Else, emit a single entry:
		doc.Find("div.post-image").Each(func(_ int, s *goquery.Selection) {
			_maybeSend(s.Find("img").Attr("src"))
			_maybeSend(s.Find("source").Attr("src"))
		})
	})()
	return out
}
// Will read from a channel, downloading links until the channel dies:
func imageWorker(urls chan string, workerName string) chan bool {
	out := make(chan bool)
	go (func() {
		defer close(out)
		log.Printf("Starting up worker: %s", workerName)
		for link := range urls {
			log.Printf("%s : Handling %s", workerName, link)
			for downloadMe := range fetchDownloadUrls(link) {
				log.Printf("%s : Found: %s", workerName, downloadMe)
				maybeDownload(downloadMe)
			}
		}
	})()
	return out
}
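// How the pieces above fit together: fetchAllImageLinks walks the listing pages and
// feeds post-detail links into a single channel; num_workers imageWorker goroutines
// share that channel, resolve each detail page to downloadable image URLs, and write
// the files into the current directory; main then waits for each worker's done-channel
// to close.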
// Main func parses args, and sets things up:
func main() {
	verbose := flag.Bool("v", false, "Verbosely log what's happening")
	flag.Parse()
	target := flag.Arg(0)
	if !(*verbose) {
		log.SetOutput(ioutil.Discard)
	}
	imageChan := fetchAllImageLinks(target)
	var workers [num_workers]chan bool
	for i := range workers {
		name := fmt.Sprintf("Worker[%d]", i+1)
		workers[i] = imageWorker(imageChan, name)
	}
	for _, w := range workers {
		_ = <-w
	}
	log.Printf("Done.")
}