1. Install-Package NEST
2. Download and reference crawler binaries: https://code.google.com/p/abot/downloads/detail?name=Abotv1.1.1_Bin.zip&can=2&q=
3. Add the code attached to this gist
4. Have fun
using System;
using Abot.Poco;

namespace Basics
{
    // Strongly-typed document stored in Elasticsearch for every crawled page.
    public class Page
    {
        public Page(CrawledPage crawledPage)
        {
            var doc = crawledPage.HtmlDocument.DocumentNode;

            // Strip the site suffix from the <title> and keep the full body text as the searchable content.
            Title = doc.SelectSingleNode("//title").InnerText.Replace(" - Wikipedia, the free encyclopedia", String.Empty);
            Content = doc.SelectSingleNode("//body").InnerText;

            Uri = crawledPage.Uri.ToString();
            ParentUri = crawledPage.ParentUri.ToString();
            PageSizeInBytes = crawledPage.PageSizeInBytes;
            CrawlDepth = crawledPage.CrawlDepth;
            Timestamp = DateTimeOffset.UtcNow;
        }

        public string Uri { get; set; }
        public string ParentUri { get; set; }
        public long PageSizeInBytes { get; set; }
        public int CrawlDepth { get; set; }
        public string Title { get; set; }
        public string Content { get; set; }
        public DateTimeOffset Timestamp { get; set; }
    }
}
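The Page class above is the document that ends up in Elasticsearch. Program.cs below wires up the crawler but leaves the actual indexing call as a TODO in its page-completed handler; the helper here is only a minimal sketch of what that step could look like. PageIndexer and its method name are made up for illustration, and Index() overloads vary between NEST releases, so treat the exact call as an assumption rather than the gist's canonical code.

using Abot.Poco;
using Nest;

namespace Basics
{
    // Hypothetical helper, not part of the original gist: the indexing step that
    // crawler_ProcessPageCrawlCompleted in Program.cs leaves as a TODO.
    public static class PageIndexer
    {
        public static void Index(ElasticClient client, CrawledPage crawledPage)
        {
            // Build the strongly-typed document and send it to the client's default
            // index ("pages"); Elasticsearch creates the index with a dynamic mapping
            // on the first indexing call if it does not already exist.
            var page = new Page(crawledPage);
            client.Index(page);
        }
    }
}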
using System;
using System.Net;
using Abot.Crawler;
using Abot.Core;
using Abot.Poco;
using Nest;

namespace Basics
{
    class Program
    {
        static void Main(string[] args)
        {
            // NEST client pointed at a local Elasticsearch node; documents go to the "pages" index by default.
            var client = new ElasticClient(new ConnectionSettings(new Uri("http://localhost:9200")).SetDefaultIndex("pages"));

            ConnectionStatus connectionStatus;
            if (!client.TryConnect(out connectionStatus))
            {
                Console.WriteLine("Error");
                // Error handling
                return;
            }
            Console.WriteLine("Connected to " + client.Settings.Uri);

            var crawlConfig = new CrawlConfiguration
            {
                CrawlTimeoutSeconds = 3600,
                MaxConcurrentThreads = 10,
                UserAgentString = "abot v1.0 http://code.google.com/p/abot"
            };

            // Will use the manually created crawlConfig object created above
            var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            // Stash the Elasticsearch client in the crawl bag so the event handlers can reach it.
            crawler.CrawlBag.ElasticClient = client;

            var result = crawler.Crawl(new Uri("http://en.wikipedia.org/wiki/Main_Page/"));

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            // ...
        }

        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
        }

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.RawContent))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

            var doc = e.CrawledPage.HtmlDocument.DocumentNode;
            ElasticClient client = e.CrawlContext.CrawlBag.ElasticClient;

            // TODO
        }

        static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
        }

        static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }
    }
}
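Once the crawl has run against a local node, a quick way to confirm documents are landing in the pages index is to ask Elasticsearch how many it holds. The snippet below is only a rough sketch, not part of the gist: it assumes a NEST release from this era where the search descriptor exposes MatchAll() and Size() and the response exposes Total, so adjust the calls to whatever NEST version you installed in step 1.

using System;
using Nest;

namespace Basics
{
    // Hypothetical verification step, not part of the original gist.
    public static class CrawlVerifier
    {
        public static void PrintIndexedCount(ElasticClient client)
        {
            // match_all with size 0: no documents are pulled back (so Page needs no
            // parameterless constructor), only the total hit count is read.
            var response = client.Search<Page>(s => s.MatchAll().Size(0));
            Console.WriteLine("Pages indexed so far: " + response.Total);
        }
    }
}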