@jalotra
Created August 24, 2025 19:05
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import json
import random
import os
COOKIE_FILE = 'linkedin_cookies.json'
DATA_FILE = 'data/linkedin_schools.csv'
URLS_DIR = 'data/urls'
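# Expected layout of DATA_FILE, inferred from the usecols selection in main()
# below; the example row is illustrative, not real data:
#
#   InstiName,SchoolLinkedinUrl
#   Example University,https://www.linkedin.com/school/example-university/people/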
def pause():
    """Sleep for a random interval to mimic human behavior."""
    time.sleep(random.randint(3, 8))
def load_cookies(driver, path=COOKIE_FILE):
    """Load cookies from a file into the browser session."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Cookie file not found: {path}")
    with open(path, 'r') as f:
        cookies = json.load(f)
    for cookie in cookies:
        # Selenium expects an integer 'expiry'; exported cookies sometimes
        # store it as a float, so cast before adding.
        if 'expiry' in cookie:
            cookie['expiry'] = int(cookie['expiry'])
        driver.add_cookie(cookie)
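# The script assumes linkedin_cookies.json already exists. A minimal sketch of
# the export step that load_cookies() relies on (the helper name and flow are
# ours, not part of the original gist): log in manually once, then persist the
# session cookies to disk.
def save_cookies(driver, path=COOKIE_FILE):
    """Write the current session's cookies to disk as JSON."""
    with open(path, 'w') as f:
        json.dump(driver.get_cookies(), f)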
def init_driver(headless: bool = False) -> webdriver.Chrome:
    """Initialize the Chrome WebDriver with optional headless mode."""
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    return driver
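# Note: on recent Chrome builds, '--headless' selects the legacy headless mode,
# while '--headless=new' enables the newer mode that behaves closer to a regular
# browser session. A server-side invocation would look like:
#   driver = init_driver(headless=True)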
def get_page(driver, start: int, end: int, urls_df: pd.DataFrame):
    """Scrape pages, click "Show more" repeatedly, and append profile URLs incrementally."""
    segment = urls_df.iloc[start:end]
    os.makedirs(URLS_DIR, exist_ok=True)
    for idx, row in segment.iterrows():
        school_name = row['name'].replace('/', '_')  # sanitize for use as a filename
        url = row['url']
        try:
            driver.get(url)
            pause()
            urls_txt = os.path.join(URLS_DIR, f'{school_name}_urls.txt')
            # Truncate any output left over from a previous run.
            open(urls_txt, 'w', encoding='utf-8').close()
            seen = set()

            def extract_and_append():
                # Collect every profile link currently in the DOM, strip query
                # strings, and append only the URLs not yet written for this
                # school (the page grows in place, so each pass re-sees old cards).
                profiles = driver.find_elements(
                    By.CSS_SELECTOR,
                    'div.org-people-profile-card__profile-info a[data-test-app-aware-link]')
                new_urls = set()
                for a in profiles:
                    href = a.get_attribute('href')
                    if href:
                        new_urls.add(href.split('?')[0])
                new_urls -= seen
                seen.update(new_urls)
                with open(urls_txt, 'a', encoding='utf-8') as f:
                    for href in sorted(new_urls):
                        f.write(href + '\n')
                print(f'Appended {len(new_urls)} new profiles to {urls_txt}')

            extract_and_append()
            while True:
                try:
                    load_button = driver.find_element(
                        By.CSS_SELECTOR,
                        'button.scaffold-finite-scroll__load-button')
                    driver.execute_script('arguments[0].scrollIntoView(true)', load_button)
                    load_button.click()
                    pause()
                    extract_and_append()
                except Exception:
                    print('No more "Show more" button found, moving on.')
                    break
        except Exception as e:
            print(f'{idx} - {school_name}: failed ({e})')
        finally:
            time.sleep(2)
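# Optional refinement (not in the original gist): rather than the fixed pause()
# between clicks, Selenium's WebDriverWait / expected_conditions API can block
# until the "Show more" button is actually clickable. A minimal sketch:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_load_button(driver, timeout: int = 10):
    """Return the "Show more" button once clickable; raises TimeoutException otherwise."""
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'button.scaffold-finite-scroll__load-button')))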
def main():
    urls_df = pd.read_csv(
        DATA_FILE,
        usecols=['InstiName', 'SchoolLinkedinUrl']
    ).rename(columns={'InstiName': 'name', 'SchoolLinkedinUrl': 'url'})
    driver = init_driver(headless=False)
    try:
        driver.get('https://www.linkedin.com/')
        load_cookies(driver, COOKIE_FILE)
        driver.refresh()
        pause()
        get_page(driver, start=0, end=len(urls_df), urls_df=urls_df)
    finally:
        driver.quit()

if __name__ == '__main__':
    main()