Simple Django one-off view that pulls down a listing of Craigslist ads from all Craigslist domains. Ain't exactly pretty, but it works. It's not fast by web standards, but it sure beats searching manually.
import os
import urllib2
import cookielib
import Queue
import multiprocessing
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
from django.core.cache import cache
from django.http import HttpResponse

# Shared state for the worker processes: a managed list for results and
# a joinable queue of Craigslist hostnames.
m = multiprocessing.Manager()
links = m.list()
hosts = m.Queue()
COOKIEFILEBASE = '/tmp/cookie.tmp.'
# Fetch a URL. Simple enough, maybe. But we also check against a cache
# backend, if available. And we honor cookies. Cache the results for 12
# hours, so we don't smear anyone's servers unnecessarily, because we're
# not looking for any time-sensitive data.
def fetch_url(request, url):
    server_name = urlparse(url).netloc
    COOKIEFILE = COOKIEFILEBASE + server_name
    headers = {
        'Host': server_name,
        'User-Agent': request.META.get('HTTP_USER_AGENT', 'Mozilla/5.0'),
    }
    cache_key = url
    cached = cache.get(cache_key)
    if cached:
        return cached
    cj = cookielib.LWPCookieJar()
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    req = urllib2.Request(url, None, headers)
    response = opener.open(req).read()
    # Save cookies *after* the request, so anything set by the response
    # actually makes it to disk for the next fetch against this host.
    cj.save(COOKIEFILE)
    cache.set(cache_key, response, 60 * 60 * 12)
    return response
# Called in each process created by the view function.
# Pulls a queued hostname, runs a standardized Craigslist search against
# it, and appends an HTML anchor to the shared list of links.
# This particular Craigslist search is for 'django' with the
# 'telecommuting' flag set.
def worker_thread(request):
    while True:
        try:
            host = hosts.get_nowait()
        except Queue.Empty:
            print "process done"
            return
        url = 'http://%s/search/jjj?query=django&srchType=A&addOne=telecommuting' % (host,)
        try:
            html = fetch_url(request, url)
            soup = BeautifulSoup(html)
            for p in soup.findAll('blockquote')[-1].findAll('p'):
                links.append('<a href="%s">%s - %s</a>' % (p.a['href'], p.text, host))
        except Exception:
            # A failed fetch or unexpected markup; skip this host, but still
            # mark the task done below so hosts.join() in the view can return.
            print 'failed on %s' % (host,)
        hosts.task_done()
        print host
# Actual view that is mapped to a URL.
# Builds a list of sites from the Craigslist front page, pushes their
# hostnames into a queue, and starts off 10 processes working that queue.
def find_all_postings(request):
    url = 'http://www.craigslist.org/about/sites'
    soup = BeautifulSoup(fetch_url(request, url))
    for li in soup.find(attrs={'class': 'colmask'}).findAll('li'):
        hosts.put(urlparse(li.a['href']).netloc)
    for index in range(10):
        # If you'll notice, these two lines (other than some variables) are
        # the only two we need to make this a multi-process app.
        process = multiprocessing.Process(target=worker_thread, args=(request,))
        process.start()
    hosts.join()
    # Sort the anchors by their visible link text before rendering.
    sorted_links = sorted(links, key=lambda x: BeautifulSoup(x).text)
    return HttpResponse('<br/>'.join(sorted_links))
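
To actually hit the view, it needs a route. Here's a minimal sketch of the wiring, assuming the code above lives in craigslist/views.py (the app and module names are illustrative, not part of the gist), written in the Django 1.3/1.4-era URLconf style this code dates from:

# urls.py -- hypothetical wiring; the app path and URL pattern are assumptions.
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    # Requesting /craigslist/ kicks off the scrape; the view blocks on
    # hosts.join() until every hostname has been processed.
    url(r'^craigslist/$', 'craigslist.views.find_all_postings'),
)

Because the view blocks until every worker finishes, the first request for a given search is slow; the 12-hour cache in fetch_url makes repeat requests for the same term far cheaper.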