-
-
Save CMingTseng/79447ccb2bb41e4bd8ec36d020fccab9 to your computer and use it in GitHub Desktop.
分點進出取資料研究
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import re
import subprocess
import sys

import requests
from bs4 import BeautifulSoup
# Endpoints of the TWSE page this script talks to.
BASE_URL = 'https://bsr.twse.com.tw/bshtm/'
MENU_URL = BASE_URL + 'bsMenu.aspx'
CONTENT_URL = BASE_URL + 'bsContent.aspx'

# Stock number queried when none is given on the command line.
DEFAULT_STOCK_NO = '2330'

# Seconds to wait on each HTTP request, so a stalled server cannot hang the
# script forever (requests applies no timeout by default).
REQUEST_TIMEOUT = 30

# Form inputs that are not echoed back in the POST: the block-trade radio
# button and the reset button.
SKIPPED_INPUTS = frozenset(('RadioButton_Excd', 'Button_Reset'))


def abort(message):
    """Print *message* and terminate the script with exit status 1."""
    print(message)
    sys.exit(1)


def collect_form_params(input_attrs):
    """Build the POST payload from the attributes of the form's inputs.

    Args:
        input_attrs: Iterable of attribute mappings, one per ``<input>``
            element of the form.

    Returns:
        A dict mapping each input's ``name`` to its ``value`` (``''`` when
        the input has no ``value`` attribute). Inputs without a ``name`` and
        inputs listed in ``SKIPPED_INPUTS`` are left out.
    """
    params = {}
    for attrs in input_attrs:
        name = attrs.get('name')
        # An unnamed input cannot be submitted, so skip it instead of
        # raising KeyError.
        if name is None or name in SKIPPED_INPUTS:
            continue
        params[name] = attrs.get('value', '')
    return params


def captcha_image_path(captcha_src):
    """Return the local file name used to store the captcha image.

    Args:
        captcha_src: The ``src`` attribute of the captcha ``<img>``.

    Returns:
        ``'<guid>.jpg'`` built from the ``guid=`` part of *captcha_src*, or
        ``None`` when *captcha_src* carries no guid. The guid comes from a
        remote page, so every character other than word characters, ``.``
        and ``-`` is replaced by ``_``; the result can never contain a path
        separator or whitespace.
    """
    match = re.search(r'guid=(.+)', captcha_src)
    if match is None:
        return None
    return '%s.jpg' % re.sub(r'[^\w.-]', '_', match.group(1))


def open_image(path):
    """Show the image at *path* in the platform's default viewer.

    The viewer is started with an argument list (no shell), and *path* is
    made absolute so that it can be mistaken neither for shell syntax nor
    for a command-line option.
    """
    target = os.path.abspath(path)
    try:
        if sys.platform == 'darwin':
            subprocess.run(['open', target], check=False)
        elif sys.platform.startswith('win'):
            os.startfile(target)
        else:
            subprocess.run(['xdg-open', target], check=False)
    except OSError:
        # No viewer command available; the saved file can still be opened
        # by hand, so report where it is instead of crashing.
        print('無法自動開啟圖片,請手動開啟: %s' % target)


def main(stock_no=DEFAULT_STOCK_NO):
    """Download and print the per-broker trading CSV for one stock.

    Loads the query form, saves and displays its captcha image, reads the
    captcha text from standard input, submits the form and finally prints
    the CSV returned by the site.

    Args:
        stock_no: Stock number placed in the ``TextBox_Stkno`` form field.

    Raises:
        SystemExit: With status 1 when any step fails; a message is printed
            first.
    """
    session = requests.Session()

    resp = session.get(MENU_URL, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        abort('任務失敗: %d' % resp.status_code)

    soup = BeautifulSoup(resp.text, 'lxml')
    params = collect_form_params(
        node.attrs for node in soup.select('form input')
    )

    # Locate the captcha image.
    images = soup.select('#Panel_bshtm img')
    if not images:
        abort('任務失敗,找不到驗證碼圖片')
    captcha_src = images[0].get('src', '')
    imgpath = captcha_image_path(captcha_src)
    if imgpath is None:
        abort('任務失敗,找不到驗證碼圖片')

    # Download the captcha through the same session that loaded the form, so
    # the request carries the same cookies a browser would send.
    resp = session.get(BASE_URL + captcha_src, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        # Without the image the user has nothing to read the code from.
        abort('任務失敗,無法下載驗證碼圖片: %d' % resp.status_code)
    with open(imgpath, 'wb') as f:
        f.write(resp.content)
    open_image(imgpath)

    # Ask the user for the captcha text. References kept from the original
    # script (they look like notes on captcha OCR / tesseract parameters):
    # https://blog.steven5538.tw/2014/06/22/captcha-ocr-preprocess-python/
    # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
    print('輸入圖型驗證碼: ', end='', flush=True)
    params['CaptchaControl1'] = sys.stdin.readline().strip()
    params['TextBox_Stkno'] = stock_no

    # Submit the form.
    print(json.dumps(params, indent=2))
    resp = session.post(MENU_URL, data=params, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        abort('任務失敗: %d' % resp.status_code)

    # The download link only appears when the query was accepted.
    soup = BeautifulSoup(resp.text, 'lxml')
    if not soup.select('#HyperLink_DownloadCSV'):
        abort('任務失敗,沒有下載連結')

    # Download the per-broker trading CSV.
    resp = session.get(CONTENT_URL, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        abort('任務失敗,無法下載分點進出 CSV')
    print(resp.text)


if __name__ == '__main__':
    # Optional first command-line argument: the stock number to query.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_STOCK_NO)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment