Created
January 22, 2011 00:45
-
-
Save davepeck/790721 to your computer and use it in GitHub Desktop.
A scrapy link extractor that uses BeautifulSoup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from scrapy.link import Link
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup
class SoupLinkExtractor(object):
    """A Scrapy-compatible link extractor that parses pages with BeautifulSoup.

    Supports a single optional keyword argument, ``allow``: a regular
    expression pattern string. When supplied, only absolute URLs matching
    the pattern are returned from :meth:`extract_links`.
    """

    def __init__(self, *args, **kwargs):
        super(SoupLinkExtractor, self).__init__()
        allow_pattern = kwargs.get('allow', None)
        # Compile the filter once up front; None means "allow every URL".
        self._allow = re.compile(allow_pattern) if allow_pattern else None

    def extract_links(self, response):
        """Return a list of Link objects for each followable anchor in *response*.

        Anchors without an href, and pure in-page fragment links (href
        beginning with '#'), are skipped. Relative hrefs are resolved
        against the response URL before the ``allow`` filter is applied.
        """
        soup = BeautifulSoup(response.body_as_unicode())
        # Gather candidate hrefs, dropping missing hrefs and fragments,
        # and resolve each against the page URL.
        candidates = [
            urljoin(response.url, anchor.get('href', None))
            for anchor in soup.findAll('a')
            if anchor.get('href', None) and not anchor.get('href', None).startswith('#')
        ]
        if self._allow:
            follow_urls = [url for url in candidates if self._allow.search(url) is not None]
        else:
            follow_urls = candidates
        return [Link(url = url) for url in follow_urls]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have modified the code so that it also extracts the anchor text: https://gist.github.com/3261149