Created
March 14, 2016 07:30
-
-
Save mikeflynn/c70068a26d93f3b82b83 to your computer and use it in GitHub Desktop.
YT Crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"log" | |
"net/http" | |
"regexp" | |
"strings" | |
"time" | |
"github.com/PuerkitoBio/gocrawl" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/boltdb/bolt" | |
) | |
var boltDB *bolt.DB | |
// rxOk matches the URLs the crawler is allowed to follow: the youtube.com
// root (with or without "www.", http or https) and any path beginning with
// "/watch" (video URLs).
var rxOk = regexp.MustCompile(
	`^(http|https)://(www\.)?youtube\.com(/|/watch.*)?$`,
)
type ExampleExtender struct { | |
gocrawl.DefaultExtender | |
} | |
func (x *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) { | |
// Use the goquery document or res.Body to manipulate the data | |
// ... | |
attribution := "NONE" | |
channelID := "" | |
doc.Find("meta").Each(func(i int, s *goquery.Selection) { | |
if name, _ := s.Attr("name"); strings.EqualFold(name, "attribution") { | |
attribution, _ = s.Attr("content") | |
} else if itemprop, _ := s.Attr("itemprop"); strings.EqualFold(itemprop, "channelId") { | |
channelID, _ = s.Attr("content") | |
} | |
}) | |
log.Println("VID: " + ctx.URL().Query().Get("v") + "; CO: " + attribution + "; CID: " + channelID) | |
// Return nil and true - let gocrawl find the links | |
return nil, true | |
} | |
// Override Filter for our need. | |
func (x *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool { | |
return !isVisited && rxOk.MatchString(ctx.NormalizedURL().String()) | |
} | |
func main() { | |
var err error | |
boltDB, err = bolt.Open("crawler.db", 0600, nil) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer boltDB.Close() | |
// Set custom options | |
opts := gocrawl.NewOptions(new(ExampleExtender)) | |
opts.RobotUserAgent = "S71Cralwer" | |
opts.UserAgent = "Mozilla/5.0 (compatible; S71Crawler/1.0; +http://studio71.com)" | |
opts.CrawlDelay = 100 * time.Millisecond | |
//opts.LogFlags = gocrawl.LogAll | |
opts.MaxVisits = 100 | |
opts.SameHostOnly = true | |
// Create crawler and start at root of duckduckgo | |
c := gocrawl.NewCrawlerWithOptions(opts) | |
c.Run("https://www.youtube.com/") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment