Aircraft scraper for the avbuyer.com website.
import asyncio

from aiohttp.client import ClientSession
from parsel import Selector


async def get_page(page: int, session: ClientSession):
    """Shortcut function that requests a specific page of the listing pagination."""
    url = f"https://www.avbuyer.com/aircraft/private-jets/page-{page}"
    print(f"requesting {url}")
    return await session.get(url)


async def parse_page(resp):
    """Parse function that extracts airplane details from a pagination page."""
    # First, find all airplane boxes on the page; there should be 20 of them.
    airplanes = Selector(await resp.text()).css(".listing-item")
    # Then parse the boxes one by one for airplane data.
    parsed = []
    for plane in airplanes:
        # plane.css() selects RELATIVE to the plane box,
        # so plane.css(".item-title") only selects nodes under the current box.
        parsed.append({
            # Note: the ::text selector selects the text of a node rather than the whole node.
            # Note: parsel selectors have a convenient .get() method that takes the first
            # text value of the selected nodes; .get("") falls back to an empty string
            # when nothing matches, so the .strip() below cannot fail on None.
            "title": plane.css(".item-title ::text").get(),
            "price": plane.css(".price::text").get(),
            "updated": plane.css(".list-update::text").get(),
            "description": plane.css(".list-item-para::text").get("").strip(),
            # Location can span multiple nodes, so join them explicitly.
            "location": " ".join(
                plane.css(".list-item-location::text").extract()
            ).strip(),
            # Same for other details.
            "other_details": "\n".join(
                plane.css(".list-other-dtl ::text").extract()
            ).strip(),
            # The remaining fields just use different classes, so they are left up to you.
        })
    return parsed


async def scrape():
    async with ClientSession() as session:
        # Start with the first page of the listing pagination, which shows the total
        # page count, so that all pages can be scraped concurrently rather than
        # following "next page" links one by one, which is very slow!
        first_page = await get_page(1, session)
        first_page_sel = Selector(await first_page.text())
        total_pages = int(
            first_page_sel.xpath(
                '//li[@class="pagination-next"]/preceding-sibling::li[1]/a/text()'
            ).get()
        )
        # Now that we have the total page count, extract airplane data from the first page...
        parsed = await parse_page(first_page)
        # ...and schedule the remaining pages asynchronously.
        # This might appear complex, but asyncio.as_completed() takes the list of
        # request coroutines and yields awaitables in the order the responses
        # arrive, i.e. first-come-first-served.
        # Note: range() excludes its end value, so total_pages + 1 is needed to
        # include the last page (range(2, total_pages) would silently skip it).
        for page_future in asyncio.as_completed(
            [get_page(i, session) for i in range(2, total_pages + 1)]
        ):
            # as_completed() yields an awaitable rather than a full response,
            # so it still needs to be awaited.
            response = await page_future
            # Once we have the response object, all that's left is to parse the
            # airplane data from it with the function written above!
            parsed.extend(await parse_page(response))
        # This generates a list of dictionaries, which can be dumped as JSON
        # with json.dumps(parsed) (see the usage sketch below the script).
        return parsed


if __name__ == "__main__":
    asyncio.run(scrape())
Written as requested on https://www.reddit.com/r/webscraping/comments/qux1w2/web_scraping_beautifulsoup_multiple_pages/