Created
April 20, 2013 01:16
-
-
Save qz267/5424301 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin python | |
# coding=utf-8 | |
import urllib2 | |
import lxml.etree as etree | |
import httplib | |
import urlparse | |
def get_solution(url):
    """Print ``url`` and the price scraped from that page, and return the price.

    The page is fetched with ``urlreader``.  The regular price is used when
    ``is_url_instock`` reports a price element, otherwise the on-sale price
    when ``is_product_onsale`` does, otherwise the sentinel ``"$999999999"``.

    The printed output is unchanged: one ``"<url> <price>"`` line, followed
    by a line holding a single space in the regular-price case.
    """
    page = urlreader(url)
    in_stock = is_url_instock(page)
    if in_stock:
        price = get_product_price(page)
    elif is_product_onsale(page):
        price = get_onsale_price(page)
    else:
        price = "$999999999"  # out of stock
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%s %s" % (url, price))
    if in_stock:
        print(" ")
    return price
def urlreader(url):
    """Fetch ``url`` and return its HTML parsed into an lxml element tree.

    The markup is lower-cased before parsing (presumably so the XPath
    lookups in this script match regardless of the page's letter case);
    as a consequence all text in the returned tree is lower-case as well.

    Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
    """
    response = urllib2.urlopen(url)
    try:
        page_html = response.read()
    finally:
        # Release the socket even if reading the body fails part-way.
        response.close()
    return etree.HTML(page_html.lower())
def is_url_available(url):
    """Return 1 if a HEAD request for ``url`` is answered with 200, else 0.

    Any other status (redirects included) and any exception raised while
    connecting are reported on stdout and count as "not available".
    """
    url = url.strip()  # URLs read from a file carry a trailing newline
    parts = urlparse.urlsplit(url)
    # Request the same resource a browser would: keep the query string and
    # never send an empty request path.
    target = parts.path or "/"
    if parts.query:
        target = target + "?" + parts.query
    if parts.scheme == "https":
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    found = 0
    connection = None
    try:
        connection = connection_class(parts.netloc)
        connection.request("HEAD", target)
        response = connection.getresponse()
        if response.status == 200:
            found = 1
        else:
            print("Status %d %s : %s" % (response.status, response.reason, url))
    except Exception as e:
        # Best-effort probe: report the failure and treat the URL as down.
        print("%s %s %s" % (e.__class__, e, url))
    finally:
        if connection is not None:
            connection.close()
    return found
def is_url_schema(page):
    """Return 1 if ``page`` has an element with ``itemprop="name"``, else 0.

    ``page`` is any object with an lxml-style ``xpath`` method.
    """
    return 1 if page.xpath('//*[@itemprop="name"]') else 0
def is_url_instock(page):
    """Return 1 if ``page`` has an element with ``itemprop="price"``, else 0.

    Only the presence of a price element is checked; no availability
    markup is inspected.
    """
    return 1 if page.xpath('//*[@itemprop="price"]') else 0
def is_product_onsale(page):
    """Return 1 if ``page`` has an element with ``id="ourprice"``, else 0."""
    return 1 if page.xpath('//*[@id="ourprice"]') else 0
def get_onsale_price(page):
    """Return the stripped text of the first ``id="ourprice"`` element.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_product_onsale`` first.
    """
    price = page.xpath('//*[@id="ourprice"]')[0]
    # ``text`` is None when the element has no leading text of its own.
    return (price.text or "").strip()
def get_product_price(page):
    """Return the stripped price from the first ``itemprop="price"`` element.

    The element's text is used when present; otherwise its ``content``
    attribute (as in ``<meta itemprop="price" content="...">``), otherwise
    an empty string.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_url_instock`` first.
    """
    price = page.xpath('//*[@itemprop="price"]')[0]
    # ``text`` is None for empty elements such as <meta>.
    value = price.text or price.get('content') or ""
    return value.strip()
def get_product_name(page):
    """Return the stripped text of the first ``itemprop="name"`` element.

    Raises IndexError when the page has no such element.
    """
    name = page.xpath('//*[@itemprop ="name"]')[0]
    # ``text`` is None when the element has no leading text of its own.
    return (name.text or "").strip()
if __name__ == '__main__':
    # A list rather than a set literal, so the URLs are processed in the
    # order written here on every run.
    urls = [
        'http://www.modcloth.com/shop/handbags/a-coast-call-bag',
        'http://store.apple.com/us/browse/home/shop_mac/family/mac_pro?mco=MjI4NDU1',
        'http://www.modcloth.com/shop/kitchen-gadgets/talented-mr-apple-bottle',
        'http://www.insound.com/Y-Com-Earphones-with-Microphone-Grey-Headphones-AIAIAI/P/INS52376/',
        'http://www.overstock.com/Luggage-Bags/Floto-Leather-Venezia-Leather-Duffel-Bag/3821244/product.html?sec_iid=33%20969',
        'http://www.barnesandnoble.com/p/home-gift-homer-and-aristotle-cast-marble-bookends-set-of-2/12601703?ean=9780830078097&isbn=9780830078097',
        'http://www.bbq.com/item_name_Smokin-Tex-1400-Pro-Series-Electric-BBQ-Smoker_path_7119-7122_item_1530808.html',
    ]
    for url in urls:
        # Only scrape pages whose HEAD request succeeds.
        if is_url_available(url):
            get_solution(url)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding=utf-8 | |
import urllib2 | |
import lxml.etree as etree | |
import httplib | |
import urlparse | |
import json | |
def get_solution(url):
    """Fetch ``url`` and return the product price found on the page.

    Pages that pass ``is_url_schema`` are handled by ``get_price``; pages
    that pass ``is_url_opg`` by ``get_ogp_price``.  When neither check
    passes the string ``"-2"`` is returned (the original printed it and
    then failed on an unassigned local variable).
    """
    page = urlreader(url)
    if is_url_schema(page):
        return get_price(page)
    if is_url_opg(page):
        return get_ogp_price(page)
    return "-2"
def get_price(page):
    """Return the price for a schema-style ``page``.

    Sources are tried in this order and the first one present wins:
    a ``class="amount"`` element, an ``itemprop="price"`` element, then an
    ``id="ourprice"`` element.  ``"-1"`` is returned when none is present.
    """
    if is_sub_price(page):
        return get_sub_price(page)
    if is_url_instock(page):
        return get_product_price(page)
    if is_product_onsale(page):
        return get_onsale_price(page)
    return "-1"  # out of stock
def get_ogp_price(page):
    """Return the price characters from the first price-like element.

    Selectors are tried in order (``itemprop="price"``, then the classes
    ``active_price``, ``price`` and ``sale-price``).  The first matching
    element's text, or its ``content`` attribute when it has no text, is
    reduced to currency symbols, digits, ``.`` and ``,``.

    Returns ``"-1"`` when no selector matches, the same "no price" value
    ``get_price`` uses, instead of raising IndexError.
    """
    # Unicode allow-list: euro, yen, pound, yen kanji, dollar, digits, separators.
    # Escapes keep it correct whatever the source encoding; a byte string with
    # these symbols cannot be compared against unicode text on Python 2.
    allowed = u'\u20ac\xa5\xa3\u5186$0123456789.,'
    selectors = (
        '//*[@itemprop ="price"]',
        '//*[@class="active_price"]',
        '//*[@class="price"]',
        '//*[@class="sale-price"]',
    )
    for selector in selectors:
        matches = page.xpath(selector)
        if matches:
            price = matches[0]
            # ``text`` is None for empty elements such as <meta>.
            text = price.text or price.get('content') or u''
            return u''.join(ch for ch in text if ch in allowed)
    return "-1"
def urlreader(url):
    """Download ``url`` and return the page as a parsed lxml HTML tree.

    The raw markup is lower-cased before it is parsed, so every tag,
    attribute and text node in the returned tree is lower-case.

    Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
    """
    response = urllib2.urlopen(url)
    try:
        page_html = response.read()
    finally:
        # Always give the socket back, even if read() fails.
        response.close()
    return etree.HTML(page_html.lower())
def is_url_available(url):
    """Return 1 if a HEAD request for ``url`` is answered with 200, else 0.

    Non-200 responses (including redirects) and connection errors are
    printed to stdout and reported as 0.
    """
    url = url.strip()  # lines read from a file end with a newline
    parts = urlparse.urlsplit(url)
    # Include the query string and fall back to "/" for a bare host.
    target = parts.path or "/"
    if parts.query:
        target = target + "?" + parts.query
    if parts.scheme == "https":
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    found = 0
    connection = None
    try:
        connection = connection_class(parts.netloc)
        connection.request("HEAD", target)
        response = connection.getresponse()
        if response.status == 200:
            found = 1
        else:
            print("Status %d %s : %s" % (response.status, response.reason, url))
    except Exception as e:
        # Best-effort probe: log the problem and treat the URL as unavailable.
        print("%s %s %s" % (e.__class__, e, url))
    finally:
        if connection is not None:
            connection.close()
    return found
def is_url_schema(page):
    """Return 1 if ``page`` contains an ``itemprop="name"`` element, else 0."""
    return 1 if page.xpath('//*[@itemprop="name"]') else 0
def is_url_opg(page):
    """Return 1 if ``page`` has a ``<meta name="description">`` tag, else 0.

    NOTE(review): the name suggests an Open Graph check, but the query
    looks for a plain description meta tag, not ``og:`` properties;
    confirm that this is intended.
    """
    return 1 if page.xpath('//meta[@name="description"]') else 0
def is_url_instock(page):
    """Return 1 if ``page`` contains an ``itemprop="price"`` element, else 0.

    Only the presence of a price element is tested.
    """
    return 1 if page.xpath('//*[@itemprop="price"]') else 0
def is_product_onsale(page):
    """Return 1 if ``page`` contains an ``id="ourprice"`` element, else 0."""
    return 1 if page.xpath('//*[@id="ourprice"]') else 0
def is_sub_price(page):
    """Return 1 if ``page`` contains a ``class="amount"`` element, else 0."""
    return 1 if page.xpath('//*[@class="amount"]') else 0
def get_onsale_price(page):
    """Return the price characters of the first ``id="ourprice"`` element.

    Everything except currency symbols, digits, ``.`` and ``,`` is dropped
    from the element's text.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_product_onsale`` first.
    """
    # Unicode allow-list: euro, yen, pound, yen kanji, dollar, digits, separators.
    # A byte string holding these symbols cannot be compared against unicode
    # text on Python 2, so the set is spelled with escapes.
    allowed = u'\u20ac\xa5\xa3\u5186$0123456789.,'
    price = page.xpath('//*[@id="ourprice"]')[0]
    text = price.text or u''  # ``text`` is None when there is no leading text
    return u''.join(ch for ch in text if ch in allowed)
def get_sub_price(page):
    """Return the price characters of the first ``class="amount"`` element.

    Everything except currency symbols, digits, ``.`` and ``,`` is dropped
    from the element's text.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_sub_price`` first.
    """
    # Unicode allow-list: euro, yen, pound, yen kanji, dollar, digits, separators.
    # A byte string holding these symbols cannot be compared against unicode
    # text on Python 2, so the set is spelled with escapes.
    allowed = u'\u20ac\xa5\xa3\u5186$0123456789.,'
    price = page.xpath('//*[@class="amount"]')[0]
    text = price.text or u''  # ``text`` is None when there is no leading text
    return u''.join(ch for ch in text if ch in allowed)
def get_product_price(page):
    """Return the price characters of the first ``itemprop="price"`` element.

    The element's text is used when present, otherwise its ``content``
    attribute (as in ``<meta itemprop="price" content="...">``).  Everything
    except currency symbols, digits, ``.`` and ``,`` is dropped.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_url_instock`` first.
    """
    # Unicode allow-list: euro, yen, pound, yen kanji, dollar, digits, separators.
    # A byte string holding these symbols cannot be compared against unicode
    # text on Python 2, so the set is spelled with escapes.
    allowed = u'\u20ac\xa5\xa3\u5186$0123456789.,'
    price = page.xpath('//*[@itemprop="price"]')[0]
    # ``text`` is None for empty elements such as <meta>.
    text = price.text or price.get('content') or u''
    return u''.join(ch for ch in text if ch in allowed)
def get_product_name(page):
    """Return the stripped text of the first ``itemprop="name"`` element.

    Raises IndexError when the page has no such element.
    """
    name = page.xpath('//*[@itemprop ="name"]')[0]
    # ``text`` is None when the element has no leading text of its own.
    return (name.text or "").strip()
if __name__ == '__main__':
    # Read one product URL per line from schema_urls_org.txt and print, for
    # each, the URL followed by its price ("-2" when the page passes neither
    # the schema check nor the description-meta check).
    with open('schema_urls_org.txt') as myfile:
        for line in myfile:
            # Drop the trailing newline (the original had a stray backtick
            # here, which is a syntax error) and skip blank lines.
            weburl = line.strip()
            if not weburl:
                continue
            print(weburl)
            page = urlreader(weburl)
            if is_url_schema(page):
                print(get_price(page))
            elif is_url_opg(page):
                print(get_ogp_price(page))
            else:
                print("-2")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin python | |
# coding=utf-8 | |
import urllib2 | |
import lxml.etree as etree | |
import httplib | |
import urlparse | |
def urlreader(url):
    """Retrieve ``url`` and parse the lower-cased HTML into an lxml tree.

    Because the markup is lower-cased before parsing, text content in the
    returned tree is lower-case too.

    Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
    """
    response = urllib2.urlopen(url)
    try:
        page_html = response.read()
    finally:
        # Close the connection whether or not the body could be read.
        response.close()
    return etree.HTML(page_html.lower())
def is_url_available(url):
    """Return 1 if a HEAD request for ``url`` is answered with 200, else 0.

    Other statuses (redirects included) and connection failures are printed
    to stdout and yield 0.
    """
    url = url.strip()  # tolerate the newline left on lines read from a file
    parts = urlparse.urlsplit(url)
    # Send the full resource (path plus query); a bare host maps to "/".
    target = parts.path or "/"
    if parts.query:
        target = target + "?" + parts.query
    if parts.scheme == "https":
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    found = 0
    connection = None
    try:
        connection = connection_class(parts.netloc)
        connection.request("HEAD", target)
        response = connection.getresponse()
        if response.status == 200:
            found = 1
        else:
            print("Status %d %s : %s" % (response.status, response.reason, url))
    except Exception as e:
        # Best-effort probe: report and treat the URL as unavailable.
        print("%s %s %s" % (e.__class__, e, url))
    finally:
        if connection is not None:
            connection.close()
    return found
def is_url_opg(page):
    """Return 1 if ``page`` has a ``<meta name="description">`` tag, else 0.

    NOTE(review): despite the name, no Open Graph (``og:``) property is
    inspected here; confirm that the description tag is the intended test.
    """
    return 1 if page.xpath('//meta[@name="description"]') else 0
def get_ogp_price(page):
    """Return the price characters from the first price-like element.

    Selectors are tried in order (``itemprop="price"``, then the classes
    ``active_price``, ``price`` and ``sale-price``).  The first matching
    element's text, or its ``content`` attribute when it has no text, is
    reduced to currency symbols, digits, ``.`` and ``,``.

    Raises IndexError when no selector matches, as the original did.
    """
    # Unicode allow-list: euro, yen, pound, yen kanji, dollar, digits, separators.
    # A byte string holding these symbols cannot be compared against unicode
    # text on Python 2, so the set is spelled with escapes.
    allowed = u'\u20ac\xa5\xa3\u5186$0123456789.,'
    selectors = (
        '//*[@itemprop ="price"]',
        '//*[@class="active_price"]',
        '//*[@class="price"]',
        '//*[@class="sale-price"]',
    )
    for selector in selectors:
        matches = page.xpath(selector)
        if matches:
            price = matches[0]
            # ``text`` is None for empty elements such as <meta>.
            text = price.text or price.get('content') or u''
            return u''.join(ch for ch in text if ch in allowed)
    raise IndexError("no price element found on page")
# if __name__ == '__main__': | |
# file = open('good_urls.txt') | |
# while 1: | |
# urls = file.readlines(100000) | |
# if not urls: | |
# break | |
# for url in urls: | |
# # print url | |
# if is_url_available(url): | |
# page = urlreader(url) | |
# try: | |
# if is_url_opg(page): | |
# f = open('ogp_urls.txt','a') | |
# f.write(url,) | |
# print "###############################################################" | |
# print url, | |
# print "****************************************************************" | |
# print get_product_price(page) | |
# except Exception, e: | |
# print e.__class__, e, url |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin python | |
# coding=utf-8 | |
import urllib2 | |
import lxml.etree as etree | |
import httplib | |
import urlparse | |
def urlreader(url):
    """Return the page at ``url`` as an lxml HTML tree built from lower-cased markup.

    Lower-casing happens before parsing, so text in the returned tree is
    lower-case as well.

    Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
    """
    response = urllib2.urlopen(url)
    try:
        page_html = response.read()
    finally:
        # Do not leak the socket when read() raises.
        response.close()
    return etree.HTML(page_html.lower())
def is_url_available(url):
    """Return 1 if a HEAD request for ``url`` is answered with 200, else 0.

    Every other status (redirects included) and every exception raised
    while connecting is printed to stdout and yields 0.
    """
    url = url.strip()  # the caller passes raw lines that end in a newline
    parts = urlparse.urlsplit(url)
    # Probe path plus query string; use "/" when the URL has no path.
    target = parts.path or "/"
    if parts.query:
        target = target + "?" + parts.query
    if parts.scheme == "https":
        connection_class = httplib.HTTPSConnection
    else:
        connection_class = httplib.HTTPConnection

    found = 0
    connection = None
    try:
        connection = connection_class(parts.netloc)
        connection.request("HEAD", target)
        response = connection.getresponse()
        if response.status == 200:
            found = 1
        else:
            print("Status %d %s : %s" % (response.status, response.reason, url))
    except Exception as e:
        # Best-effort probe: report and treat the URL as unavailable.
        print("%s %s %s" % (e.__class__, e, url))
    finally:
        if connection is not None:
            connection.close()
    return found
# def is_url_available(url): | |
# host, path = urlparse.urlsplit(url)[1:3] | |
# found = 0 | |
# # try: | |
# connection = httplib.HTTPConnection(host) # Make HTTPConnection Object | |
# connection.request("HEAD", path) | |
# responseOb = connection.getresponse() # Grab HTTPResponse Object | |
# if responseOb.status == 200: | |
# found = 1 | |
# # else: | |
# # print "Status %d %s : %s" % (responseOb.status, responseOb.reason, url) | |
# # except Exception, e: | |
# # print e.__class__, e, url | |
# return found | |
def is_url_schema(page):
    """Return 1 when ``page`` holds an ``itemprop="name"`` element, else 0."""
    return 1 if page.xpath('//*[@itemprop="name"]') else 0
def is_url_instock(page):
    """Return 1 when ``page`` holds an ``itemprop="price"`` element, else 0.

    Only the presence of a price element is tested.
    """
    return 1 if page.xpath('//*[@itemprop="price"]') else 0
def is_product_onsale(page):
    """Return 1 when ``page`` holds an ``id="ourprice"`` element, else 0."""
    return 1 if page.xpath('//*[@id="ourprice"]') else 0
def get_onsale_price(page):
    """Return the stripped text of the first ``id="ourprice"`` element.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_product_onsale`` first.
    """
    price = page.xpath('//*[@id="ourprice"]')[0]
    # ``text`` is None when the element has no leading text of its own.
    return (price.text or "").strip()
def get_product_price(page):
    """Return the stripped price from the first ``itemprop="price"`` element.

    The element's text is used when present; otherwise its ``content``
    attribute (as in ``<meta itemprop="price" content="...">``), otherwise
    an empty string.

    Raises IndexError when the page has no such element; callers are
    expected to check ``is_url_instock`` first.
    """
    price = page.xpath('//*[@itemprop="price"]')[0]
    # ``text`` is None for empty elements such as <meta>.
    value = price.text or price.get('content') or ""
    return value.strip()
def get_product_name(page):
    """Return the stripped text of the first ``itemprop="name"`` element.

    Raises IndexError when the page has no such element.
    """
    name = page.xpath('//*[@itemprop ="name"]')[0]
    # ``text`` is None when the element has no leading text of its own.
    return (name.text or "").strip()
# check if the url is a schema style | |
# def if_schema_page(url): | |
# pass | |
# get the domain name of a url | |
# def get_domain_name(): | |
# pass | |
# connect to the database and get | |
# def load_dbconnected_profile(domain_name): | |
# content = {} | |
# root = minidom.parse(<span style="background-color: rgb(255, 255, 255); ">domain_name</span>) | |
# table = root.getElementsByTagName("table")[0]∫ | |
# read dbname and table name. | |
# table_name = table.getAttribute("name") | |
# db_name = table.getAttribute("db_name") | |
# if len(table_name) > 0 and len(db_name) > 0: | |
# db_sql = "create database if not exists `" + db_name +"`; use " + db_name + ";" | |
# table_drop_sql = "drop " + table_name + " if exists " + table_name + ";" | |
# content.update({"db_sql" : db_sql}) | |
# content.update({"table_sql" : table_drop_sql }) | |
# else: | |
# print "Error:attribute is not define well! db_name=" + db_name + " ;table_name=" + table_name | |
# sys.exit(1) | |
if __name__ == '__main__':
    # Read candidate URLs (one per line) from web_urls.txt.  Every URL that
    # answers a HEAD request with 200 is appended to good_urls.txt and
    # echoed; URLs whose page passes is_url_schema are echoed again between
    # marker lines.
    with open('web_urls.txt') as url_file:
        # Open the output once, not once per URL, and close it on exit.
        with open('good_urls.txt', 'a') as good_file:
            for line in url_file:
                url = line.strip()  # drop the trailing newline
                if not url or not is_url_available(url):
                    continue
                good_file.write(url + "\n")
                print(url)
                try:
                    # Fetching is inside the try so that one unreadable page
                    # does not abort the whole run.
                    page = urlreader(url)
                    if is_url_schema(page):
                        print("************************")
                        print(url)
                        print("########################")
                except Exception as e:
                    print("%s %s %s" % (e.__class__, e, url))
# urls = [ | |
# 'http://www.barnesandnoble.com/p/home-gift-ihome-ihm60-20-rechargable-mini-speaker-gray/25547311?ean=47532896213&isbn=47532896213&urlkeywords=ihome+ihm60+20+rechargable+mini+speaker+gray', | |
# 'http://www.barnesandnoble.com/p/toys-games-kiss-8-gb-usb-flash-drive-peter-criss-catman/25209496?ean=895221380051&isbn=895221380051', | |
# 'http://www.bbq.com/item_name_Kamado-Joe-ClassicJoe-Ceramic-Kamado-Grill-On-Cart-Red_path_2112-11447_item_2854890', | |
# 'http://www.barnesandnoble.com/p/home-gift-ihome-colortunes-noise-isolating-headphones-black/25210773?ean=47532897302&isbn=47532897302', | |
# 'http://www.manufactum.com/maplewood-foldable-wardrobe-p1465202/' | |
# 'http://www.barnesandnoble.com/p/home-gift-ihome-ib40b-over-the-ear-headphones-with-volume-control-black/22201677?ean=47532895629&isbn=47532895629', | |
# 'http://www.bbq.com/item_name_Cookshack-AmeriQue-Electric-Barbecue-Smoker_path_7122_item_2121460.html', | |
# 'http://www.bbq.com/item_name_Cookshack-Smokette-Elite-Electric-BBQ-Smoker_path_7122_item_2512310.html', | |
# 'http://www.bbq.com/item_name_Smokin-Tex-1400-Pro-Series-Electric-BBQ-Smoker_path_7122_item_1530808.html', | |
# 'http://www.barnesandnoble.com/p/home-gift-portable-stereo-speaker-system-in-black/25550011?ean=47532895520&isbn=47532895520123123123', | |
# 'http://www.manufactum.com/devold-nansen-troyer-style-pullover-p1465134/', | |
# 'http://www.barnesandnoble.com/p/elan-passport-wallet-for-iphone-4-in-platinum-with-lanyard/25218472?ean=685387307999&isbn=685387307999', | |
# 'http://www.bbq.com/item_name_Kamado-Joe-ClassicJoe-Ceramic-Kamado-Grill-On-Cart-Black_path_7122_item_2854892.html' | |
# ] | |
# outfile = open(‘schemaPrices.txt’, ‘w’) | |
# for url in urls: | |
# if is_url_available(url): | |
# page = urlreader(url) | |
# if is_url_schema(page): | |
# if is_url_instock(page): | |
# print (get_product_name(page) + " is " + get_product_price(page)) | |
# print " " | |
# # elif is_product_onsale(page): | |
# # print (get_product_name(page) + " is ON SALE!! The on sale price is " + get_onsale_price(page)) | |
# # print " " | |
# else: | |
# if is_product_onsale(page): | |
# print (get_product_name(page) + " is ON SALE!! The on sale price is " + get_onsale_price(page)) | |
# print " " | |
# else: | |
# print (get_product_name(page)+" is out of stock!") | |
# print " " | |
# else: | |
# print ("***"+url+"*** ") | |
# print "This is not a Schema type website!" | |
# print " " | |
# else: | |
# print "URL not exists" | |
# print " " |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NYUschema.py provides functions that get the price and currency of a product, given its web page URL.
Given the web page URL, it checks whether the page follows schema.org.
If it does, it gets the price and status of the product.
If it does not, it returns false.
It then checks whether the web page has Open Graph data.
If it does, it gets the price from the Open Graph data.
If it does not, it returns false.
get_solution.py checks whether the web page at a given URL is available.
If the web page is not available, it returns false;
if the web page is available, it checks whether the page uses schema.org or Open Graph, then calls the appropriate function.
opg.py checks whether the web page has Open Graph data and, if it does, gets the price.
schemaTest.py checks whether the web page follows schema.org and, if it does, gets the price.