Forked from lorey/selenium_xhr_requests_via_performance_logging.py
Created
July 12, 2021 17:04
-
-
Save dormanh/f44163055ed0e9cc794d389f8acc24de to your computer and use it in GitHub Desktop.
Access Chrome's network tab (e.g. XHR requests) with Selenium
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# This small example shows how to access JS-based (XHR) requests via Selenium.
# This way, one can access raw data for scraping,
# for example on many JS-intensive/React-based websites.
#
import json
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import DesiredCapabilities
# Make Chrome record network activity in its "performance" log.
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict, so
# mutating it in place would leak these settings into every other user of it.
capabilities = DesiredCapabilities.CHROME.copy()
# Legacy key, kept so older chromedriver builds keep working.
capabilities["loggingPrefs"] = {"performance": "ALL"}
# Vendor-prefixed key that newer chromedriver releases look for.
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(
    desired_capabilities=capabilities, executable_path="./chromedriver"
)

# fetch a site that does xhr requests
driver.get("https://sitewithajaxorsomething.com")
sleep(5)  # wait for the requests to take place

# extract requests from logs: each raw entry stores a JSON document in its
# "message" field, and the DevTools event itself sits under the inner "message"
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]
def log_filter(log_):
    """Return True if *log_* describes a received response with a JSON body.

    Args:
        log_: A decoded performance-log event, i.e. a dict with a "method"
            name and a "params" dict.

    Returns:
        True when the event's method is "Network.responseReceived" and the
        response's "mimeType" contains "json"; False otherwise, including
        when any of the expected keys is missing.
    """
    # is an actual response
    if log_.get("method") != "Network.responseReceived":
        return False
    # and json
    response = log_.get("params", {}).get("response", {})
    return "json" in (response.get("mimeType") or "")
# Print the URL and body of every JSON response found in the performance log.
try:
    for log in filter(log_filter, logs):
        request_id = log["params"]["requestId"]
        resp_url = log["params"]["response"]["url"]
        print(f"Caught {resp_url}")
        try:
            body = driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
        except WebDriverException as exc:
            # The command can fail for an individual request (e.g. its body is
            # no longer available); skip it instead of aborting the whole run.
            print(f"Could not fetch body for {resp_url}: {exc}")
            continue
        print(body)
finally:
    # Always shut the browser down so no Chrome/chromedriver process is leaked.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment