Created
January 17, 2024 03:47
-
-
Save Granitosaurus/05d39b0167641b6daf57028d6d199620 to your computer and use it in GitHub Desktop.
Scraper for ufcstats.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import httpx | |
from bs4 import BeautifulSoup | |
async def scrape():
    """Scrape all completed-event fight pages from ufcstats.com.

    Fetches the full event listing, requests every event page concurrently,
    follows each fight row's onclick URL to the fight-detail page, and
    collects details into parallel lists.

    Returns:
        dict with keys "fighters", "weight_class", "referee", "winner",
        "loser", "method", each mapping to a list of strings.

    NOTE(review): "winner" and "loser" are never populated, and the
    "referee" extraction appends every <span> on the fight page, not just
    the referee — confirm the intended selector before relying on them.
    """
    async with httpx.AsyncClient() as client:
        # ?page=all collapses pagination so one request lists every event.
        resp_first_page = await client.get(
            "http://ufcstats.com/statistics/events/completed?page=all"
        )
        soup_first_page = BeautifulSoup(resp_first_page.content, "html.parser")
        # Parallel lists keyed by field name.
        results = {
            "fighters": [],
            "weight_class": [],
            "referee": [],
            "winner": [],
            "loser": [],
            "method": [],
        }
        # Every event link on the listing page.
        fight_links = soup_first_page.find_all(
            "a", {"class": "b-link b-link_style_black"}
        )
        # One GET coroutine per event page so the requests can overlap.
        tasks = [client.get(link.get("href")) for link in fight_links]
        print(f"scraping {len(tasks)} fight pages")
        # as_completed runs all requests concurrently and yields each
        # response as soon as it finishes.
        for pending in asyncio.as_completed(tasks):
            resp_fight = await pending
            # BUG FIX: original was print("scraping: {}", resp_fight.url),
            # which prints the literal braces and the URL as a second arg.
            print(f"scraping: {resp_fight.url}")
            event_soup = BeautifulSoup(resp_fight.content, "lxml")
            # Each fight row embeds its detail-page URL in an onclick
            # handler of the form doNav('http://...').
            for row in event_soup.find_all("tr"):
                onclick = row.get("onclick")
                if not onclick:
                    continue  # header/filler rows carry no onclick
                start_index = onclick.find("('")
                end_index = onclick.find("')")
                # Guard against handlers without the ('...') pattern;
                # find() returns -1 and the original slice produced junk.
                if start_index == -1 or end_index == -1:
                    continue
                extracted_url = onclick[start_index + 2:end_index]
                fight_resp = await client.get(extracted_url)
                fight_soup = BeautifulSoup(fight_resp.content, "lxml")
                # First two black-styled links are the two fighters.
                for fighter in fight_soup.find_all(
                    "a", {"class": "b-link b-link_style_black"}, limit=2
                ):
                    results["fighters"].append(fighter.get_text().strip())
                for weight in fight_soup.find_all(
                    "i", {"class": "b-fight-details__fight-title"}
                ):
                    results["weight_class"].append(weight.get_text().strip())
                for result in fight_soup.find_all(
                    "i", {"style": "font-style: normal"}
                ):
                    results["method"].append(result.get_text().strip())
                for ref in fight_soup.find_all("span"):
                    results["referee"].append(ref.get_text().strip())
        return results
print(asyncio.run(scrape())) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment