Created
January 17, 2024 03:47
-
-
Save Granitosaurus/05d39b0167641b6daf57028d6d199620 to your computer and use it in GitHub Desktop.
Scraper for ufcstats.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import httpx | |
from bs4 import BeautifulSoup | |
async def scrape():
    """Scrape all completed-event fight pages from ufcstats.com.

    Fetches the full event listing, requests every event page concurrently,
    follows each fight row's onclick URL to the fight-detail page, and
    collects details into parallel lists.

    Returns:
        dict with keys "fighters", "weight_class", "referee", "winner",
        "loser", "method", each mapping to a list of strings.

    NOTE(review): "winner" and "loser" are never populated, and the
    "referee" extraction appends every <span> on the fight page, not just
    the referee — confirm the intended selector before relying on them.
    """
    async with httpx.AsyncClient() as client:
        # ?page=all collapses pagination so one request lists every event.
        resp_first_page = await client.get(
            "http://ufcstats.com/statistics/events/completed?page=all"
        )
        soup_first_page = BeautifulSoup(resp_first_page.content, "html.parser")
        # Parallel lists keyed by field name.
        results = {
            "fighters": [],
            "weight_class": [],
            "referee": [],
            "winner": [],
            "loser": [],
            "method": [],
        }
        # Every event link on the listing page.
        fight_links = soup_first_page.find_all(
            "a", {"class": "b-link b-link_style_black"}
        )
        # One GET coroutine per event page so the requests can overlap.
        tasks = [client.get(link.get("href")) for link in fight_links]
        print(f"scraping {len(tasks)} fight pages")
        # as_completed runs all requests concurrently and yields each
        # response as soon as it finishes.
        for pending in asyncio.as_completed(tasks):
            resp_fight = await pending
            # BUG FIX: original was print("scraping: {}", resp_fight.url),
            # which prints the literal braces and the URL as a second arg.
            print(f"scraping: {resp_fight.url}")
            event_soup = BeautifulSoup(resp_fight.content, "lxml")
            # Each fight row embeds its detail-page URL in an onclick
            # handler of the form doNav('http://...').
            for row in event_soup.find_all("tr"):
                onclick = row.get("onclick")
                if not onclick:
                    continue  # header/filler rows carry no onclick
                start_index = onclick.find("('")
                end_index = onclick.find("')")
                # Guard against handlers without the ('...') pattern;
                # find() returns -1 and the original slice produced junk.
                if start_index == -1 or end_index == -1:
                    continue
                extracted_url = onclick[start_index + 2:end_index]
                fight_resp = await client.get(extracted_url)
                fight_soup = BeautifulSoup(fight_resp.content, "lxml")
                # First two black-styled links are the two fighters.
                for fighter in fight_soup.find_all(
                    "a", {"class": "b-link b-link_style_black"}, limit=2
                ):
                    results["fighters"].append(fighter.get_text().strip())
                for weight in fight_soup.find_all(
                    "i", {"class": "b-fight-details__fight-title"}
                ):
                    results["weight_class"].append(weight.get_text().strip())
                for result in fight_soup.find_all(
                    "i", {"style": "font-style: normal"}
                ):
                    results["method"].append(result.get_text().strip())
                for ref in fight_soup.find_all("span"):
                    results["referee"].append(ref.get_text().strip())
        return results
print(asyncio.run(scrape())) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment