Simple Django one-off view that pulls down a listing of Craigslist ads from all Craigslist domains. Ain't exactly pretty, but it works. It's not fast by web standards, but it sure beats searching manually.
import os
import urllib2
import cookielib
import Queue
import multiprocessing
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
from django.core.cache import cache
from django.http import HttpResponse

# Shared state for the worker processes: a managed list for results and
# a joinable queue of Craigslist hostnames.
m = multiprocessing.Manager()
links = m.list()
hosts = m.Queue()
COOKIEFILEBASE = '/tmp/cookie.tmp.'
# Fetch a URL. Simple enough, maybe. But we also check against a cache
# backend, if available. And we honor cookies. Cache the results for 12
# hours, so we don't smear anyone's servers unnecessarily, because we're
# not looking for any time-sensitive data.
def fetch_url(request, url):
    server_name = urlparse(url).netloc
    COOKIEFILE = COOKIEFILEBASE + server_name
    headers = {
        'Host': server_name,
        'User-Agent': request.META.get('HTTP_USER_AGENT', 'Mozilla/5.0'),
    }
    cache_key = url
    cached = cache.get(cache_key)
    if cached:
        return cached
    cj = cookielib.LWPCookieJar()
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    req = urllib2.Request(url, None, headers)
    response = opener.open(req).read()
    # Save cookies *after* the request, so anything set by the response
    # actually makes it to disk for the next fetch against this host.
    cj.save(COOKIEFILE)
    cache.set(cache_key, response, 60 * 60 * 12)
    return response
# Called in each process created by the view function.
# Pulls a queued hostname, runs a standardized Craigslist search against
# it, and appends an HTML anchor to the shared list of links.
# This particular Craigslist search is for 'django' with the
# 'telecommuting' flag set.
def worker_thread(request):
    while True:
        try:
            host = hosts.get_nowait()
        except Queue.Empty:
            print "process done"
            return
        url = 'http://%s/search/jjj?query=django&srchType=A&addOne=telecommuting' % (host,)
        try:
            html = fetch_url(request, url)
            soup = BeautifulSoup(html)
            for p in soup.findAll('blockquote')[-1].findAll('p'):
                links.append('<a href="%s">%s - %s</a>' % (p.a['href'], p.text, host))
        except Exception:
            # A failed fetch or unexpected markup; skip this host, but still
            # mark the task done below so hosts.join() in the view can return.
            print 'failed on %s' % (host,)
        hosts.task_done()
        print host
# Actual view that is mapped to a URL.
# Builds a list of sites from the Craigslist front page, pushes their
# hostnames into a queue, and starts off 10 processes working that queue.
def find_all_postings(request):
    url = 'http://www.craigslist.org/about/sites'
    soup = BeautifulSoup(fetch_url(request, url))
    for li in soup.find(attrs={'class': 'colmask'}).findAll('li'):
        hosts.put(urlparse(li.a['href']).netloc)
    for index in range(10):
        # If you'll notice, these two lines (other than some variables) are
        # the only two we need to make this a multi-process app.
        process = multiprocessing.Process(target=worker_thread, args=(request,))
        process.start()
    hosts.join()
    # Sort the anchors by their visible link text before rendering.
    sorted_links = sorted(links, key=lambda x: BeautifulSoup(x).text)
    return HttpResponse('<br/>'.join(sorted_links))
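
To actually hit the view, it needs a route. Here's a minimal sketch of the wiring, assuming the code above lives in craigslist/views.py (the app and module names are illustrative, not part of the gist), written in the Django 1.3/1.4-era URLconf style this code dates from:

# urls.py -- hypothetical wiring; the app path and URL pattern are assumptions.
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    # Requesting /craigslist/ kicks off the scrape; the view blocks on
    # hosts.join() until every hostname has been processed.
    url(r'^craigslist/$', 'craigslist.views.find_all_postings'),
)

Because the view blocks until every worker finishes, the first request for a given search is slow; the 12-hour cache in fetch_url makes repeat requests for the same term far cheaper.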