Script to scrape financial data from NASDAQ. Given a stock ticker, it downloads the symbol's summary page, extracts the company name, open/close prices and dates, and the key stock data table using lxml XPath selectors, then writes everything to a <ticker>-summary.json file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import html
import requests
from time import sleep
import json
import argparse
from random import randint


def parse_finance_page(ticker):
    """
    Grab financial data from the NASDAQ summary page.

    Args:
        ticker (str): Stock symbol

    Returns:
        dict: Scraped data, or None if every retry fails
    """
    key_stock_dict = {}
    # Browser-like headers so the request is not rejected as a bot
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        "Host": "www.nasdaq.com",
        "Referer": "http://www.nasdaq.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"
    }
    # Retry failed requests up to five times
    for retries in range(5):
        try:
            url = "http://www.nasdaq.com/symbol/%s" % (ticker)
            # verify=False disables TLS certificate checks; requests will
            # emit an InsecureRequestWarning for this
            response = requests.get(url, headers=headers, verify=False)
            if response.status_code != 200:
                raise ValueError("Invalid Response Received From Webserver")
            print("Parsing %s" % (url))
            # Random delay to avoid hammering the server
            sleep(randint(1, 3))
            parser = html.fromstring(response.text)
            # XPath selectors for the page header, the key stock data table
            # and the open/close price details
            xpath_head = "//div[contains(@id,'pageheader')]//h1//text()"
            xpath_key_stock_table = '//div[contains(@class,"overview-results")]//div[contains(@class,"table-table")]/div'
            xpath_open_price = '//b[contains(text(),"Open Price:")]/following-sibling::span/text()'
            xpath_open_date = '//b[contains(text(),"Open Date:")]/following-sibling::span/text()'
            xpath_close_price = '//b[contains(text(),"Close Price:")]/following-sibling::span/text()'
            xpath_close_date = '//b[contains(text(),"Close Date:")]/following-sibling::span/text()'
            xpath_key = './/div[@class="table-cell"]/b/text()'
            xpath_value = './/div[@class="table-cell"]/text()'
            raw_name = parser.xpath(xpath_head)
            key_stock_table = parser.xpath(xpath_key_stock_table)
            raw_open_price = parser.xpath(xpath_open_price)
            raw_open_date = parser.xpath(xpath_open_date)
            raw_close_price = parser.xpath(xpath_close_price)
            raw_close_date = parser.xpath(xpath_close_date)
            company_name = raw_name[0].replace("Common Stock Quote & Summary Data", "").strip() if raw_name else ''
            open_price = raw_open_price[0].strip() if raw_open_price else None
            open_date = raw_open_date[0].strip() if raw_open_date else None
            close_price = raw_close_price[0].strip() if raw_close_price else None
            close_date = raw_close_date[0].strip() if raw_close_date else None
            # Grab and clean the key stock data rows (label/value cell pairs)
            for i in key_stock_table:
                key = i.xpath(xpath_key)
                value = i.xpath(xpath_value)
                key = ''.join(key).strip()
                value = ' '.join(''.join(value).split())
                key_stock_dict[key] = value
            nasdaq_data = {
                "company_name": company_name,
                "ticker": ticker,
                "url": url,
                "open_price": open_price,
                "open_date": open_date,
                "close_price": close_price,
                "close_date": close_date,
                "key_stock_data": key_stock_dict
            }
            return nasdaq_data
        except Exception as e:
            print("Failed to process the request, Exception: %s" % (e))
    # All retries exhausted
    return None


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('ticker', help='Company stock symbol')
    args = argparser.parse_args()
    ticker = args.ticker
    print("Fetching data for %s" % (ticker))
    scraped_data = parse_finance_page(ticker)
    if scraped_data:
        print("Writing scraped data to output file")
        with open('%s-summary.json' % (ticker), 'w') as fp:
            json.dump(scraped_data, fp, indent=4, ensure_ascii=False)
    else:
        print("Failed to fetch data for %s" % (ticker))
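
To run the script, pass the stock symbol as the only positional argument. A minimal invocation, assuming the file is saved as nasdaq_scraper.py (the gist itself does not fix a filename):

python nasdaq_scraper.py AAPL

This fetches http://www.nasdaq.com/symbol/AAPL and writes AAPL-summary.json with the shape built in parse_finance_page: company_name, ticker, url, open_price, open_date, close_price, close_date, and a key_stock_data dictionary of label/value pairs from the summary table. Note that the XPath selectors target the nasdaq.com page layout at the time the script was written; if the site has since been redesigned, they will return empty results and the scraped fields will come back as None or ''.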