Created
April 10, 2023 13:40
-
-
Save opabravo/b866a0819a704fc082307892dd495e98 to your computer and use it in GitHub Desktop.
Yahoo movie rating scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Intro\n", | |
"\n", | |
"This is a script to scrape user ratings from Yahoo! Movies." | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Imports\n", | |
"\n", | |
"Commands to install required packages:\n", | |
"\n", | |
"```bash\n", | |
"pip install selenium webdriver-manager\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from selenium import webdriver\n", | |
"from selenium.webdriver.chrome.service import Service as ChromeService\n", | |
"from webdriver_manager.chrome import ChromeDriverManager\n", | |
"from selenium.webdriver.common.by import By\n", | |
"from selenium.webdriver.support.wait import WebDriverWait\n", | |
"from selenium.webdriver.support import expected_conditions as EC\n" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define Variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"URL = \"https://movies.yahoo.com.tw/movieinfo_review.html/id=14653?sort=update_ts&order=desc&page=1\"" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Init Driver" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"options = webdriver.ChromeOptions()\n", | |
"# options.add_argument('--headless')\n", | |
"driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Driver: Visit the URL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"driver.get(URL)\n", | |
"element = WebDriverWait(driver, 10).until(\n", | |
" EC.presence_of_element_located((By.ID, \"maincontainer\"))\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"print(driver.page_source)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get user comments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"source = driver.page_source\n", | |
"comments_ul = driver.find_element(By.CSS_SELECTOR, \"ul.usercom_list\")\n", | |
"user_comments = comments_ul.find_elements(By.CSS_SELECTOR, \"li\")" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parse user comments for ratings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for c in user_comments:\n", | |
" print(f\"{'='*20}\")\n", | |
" starbox = c.find_element(By.CSS_SELECTOR, \"div.starbox\")\n", | |
" stars = starbox.find_elements(By.CLASS_NAME, \"starovr\")\n", | |
" print([s.value_of_css_property(\"width\") for s in stars])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.0" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment