Forked from lorey/selenium_xhr_requests_via_performance_logging.py
Created
April 19, 2021 12:53
-
-
Save wondermu/fa8a6a14ad1de4f7e7f07a63cb1fcd3b to your computer and use it in GitHub Desktop.
Access Chrome's network tab (e.g. XHR requests) with Selenium
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# This small example shows how to access JS-based requests via Selenium.
# This way, one can access raw data for scraping,
# for example on many JS-intensive/React-based websites.
#
import json
from time import sleep

from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
# Make Chrome record network activity in its "performance" log.
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict, so
# mutating it in place would leak the logging prefs into every other user
# of that template in the same process.
capabilities = DesiredCapabilities.CHROME.copy()
performance_logging = {"performance": "ALL"}
# Legacy (pre-W3C) ChromeDriver reads the unprefixed key ...
capabilities["loggingPrefs"] = performance_logging
# ... while W3C-mode ChromeDriver only honours the vendor-prefixed key.
capabilities["goog:loggingPrefs"] = performance_logging
driver = webdriver.Chrome(
    desired_capabilities=capabilities, executable_path="./chromedriver"
)

# Fetch a site that does XHR requests.
driver.get("https://sitewithajaxorsomething.com")
sleep(5)  # wait for the requests to take place

# Extract requests from the logs. Each raw entry carries a JSON string under
# "message"; the decoded object nests the actual DevTools event under a
# second "message" key.
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]
def log_filter(log_: dict) -> bool:
    """Return True when a performance-log event is a JSON network response.

    Args:
        log_: One decoded DevTools event: a dict with a "method" key and,
            for "Network.responseReceived" events, a
            ``params -> response -> mimeType`` entry.

    Returns:
        True only if the event's method is "Network.responseReceived" and
        the substring "json" occurs in the response's MIME type.

    Raises:
        KeyError: If "method" is missing, or if a responseReceived event
            lacks the ``params``/``response``/``mimeType`` keys.
    """
    # The method test comes first: ``and`` short-circuits, so ``params`` is
    # only inspected on responseReceived events.
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )
# Print the URL and body of every JSON response that was captured.
try:
    for log in filter(log_filter, logs):
        request_id = log["params"]["requestId"]
        resp_url = log["params"]["response"]["url"]
        print(f"Caught {resp_url}")
        # Ask DevTools for the body that belongs to this request id.
        print(driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id}))
finally:
    # Always shut the browser down, even if a CDP call fails part-way
    # through, so no Chrome/chromedriver process is left running.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment