jalotra · August 24, 2025 19:05
diff --git a/find_people.py b/find_people.py
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from bs4 import BeautifulSoup as bs
 import pandas as pd
 import time
 import json
 import random
 import os

 # Default JSON cookie file read by load_cookies().
 COOKIE_FILE = 'linkedin_cookies.json'
 # CSV read by main(), which uses its InstiName and SchoolLinkedinUrl columns.
 DATA_FILE = 'data/linkedin_schools.csv'
 # Directory where get_page() writes one '<school>_urls.txt' file per school.
 URLS_DIR = 'data/urls'

 def pause(min_seconds: float = 3, max_seconds: float = 8) -> None:
     """Sleep for a random interval to mimic human behavior.

     Args:
         min_seconds: Shortest possible delay, in seconds (default 3).
         max_seconds: Longest possible delay, in seconds (default 8).

     Raises:
         ValueError: If a bound is negative or ``max_seconds`` is smaller
             than ``min_seconds``.
     """
     if min_seconds < 0 or max_seconds < min_seconds:
         raise ValueError(
             f"Invalid pause bounds: min_seconds={min_seconds}, max_seconds={max_seconds}")
     # uniform() accepts fractional bounds and yields non-integer delays,
     # which look less mechanical than whole-second sleeps.
     time.sleep(random.uniform(min_seconds, max_seconds))


 def load_cookies(driver, path=COOKIE_FILE):
     """Load cookies from a JSON file into the browser session.

     The file is expected to contain a JSON list of cookie dicts, each of
     which is passed to ``driver.add_cookie``.

     Args:
         driver: WebDriver instance that receives the cookies.
         path: Location of the JSON cookie file.

     Raises:
         FileNotFoundError: If ``path`` does not exist.
     """
     try:
         # Open directly rather than checking os.path.exists() first: one
         # attempt avoids the window where the file disappears in between.
         # JSON is read as UTF-8 instead of the platform default encoding.
         with open(path, 'r', encoding='utf-8') as f:
             cookies = json.load(f)
     except FileNotFoundError as exc:
         raise FileNotFoundError(f"Cookie file not found: {path}") from exc

     for cookie in cookies:
         if 'expiry' in cookie:
             if cookie['expiry'] is None:
                 # A null expiry cannot be converted; drop the key so the
                 # cookie is added without an expiration time.
                 del cookie['expiry']
             else:
                 # Exported cookies may store the expiry as a float.
                 cookie['expiry'] = int(cookie['expiry'])
         driver.add_cookie(cookie)

 def init_driver(headless: bool = False) -> webdriver.Chrome:
     """Create a Chrome WebDriver, maximize its window, and return it.

     Args:
         headless: When true, Chrome is started with the ``--headless`` flag.

     Returns:
         The newly created Chrome driver.
     """
     chrome_options = webdriver.ChromeOptions()
     if headless:
         chrome_options.add_argument('--headless')

     browser = webdriver.Chrome(options=chrome_options)
     browser.maximize_window()
     return browser

 def _profile_urls_on_page(driver):
     """Return the query-stripped profile links currently rendered, in page order."""
     anchors = driver.find_elements(
         By.CSS_SELECTOR,
         'div.org-people-profile-card__profile-info a[data-test-app-aware-link]')
     urls = []
     for anchor in anchors:
         href = anchor.get_attribute('href')
         if href:  # an anchor without an href would crash on .split
             urls.append(href.split('?')[0])
     return urls


 def _append_unseen(urls_txt, urls, seen):
     """Append URLs not yet in ``seen`` to ``urls_txt``, record them, and return the count."""
     # dict.fromkeys() de-duplicates within this batch while keeping page order.
     fresh = [u for u in dict.fromkeys(urls) if u not in seen]
     if fresh:
         with open(urls_txt, 'a', encoding='utf-8') as f:
             f.writelines(u + '\n' for u in fresh)
         seen.update(fresh)
     print(f'Appended {len(fresh)} profiles to {urls_txt}')
     return len(fresh)


 def get_page(driver, start: int, end: int, urls_df: pd.DataFrame):
     """Scrape pages, click "Show more" repeatedly, and append profile URLs incrementally.

     For each row of ``urls_df.iloc[start:end]`` the page at ``row['url']`` is
     opened and the profile links found on it are written to
     ``URLS_DIR/<name>_urls.txt``, one URL per line. Each URL is written at
     most once per school: the page keeps earlier cards after "Show more" is
     clicked, so only links not seen before are appended.

     Args:
         driver: WebDriver used to load and interact with the pages.
         start: Positional index of the first row to process.
         end: Positional index one past the last row to process.
         urls_df: DataFrame with ``name`` and ``url`` columns.

     A failure on one school is printed and the loop continues with the next.
     """
     segment = urls_df.iloc[start:end]
     os.makedirs(URLS_DIR, exist_ok=True)

     for idx, row in segment.iterrows():
         # str() keeps a missing or non-string name (e.g. NaN) from raising
         # outside the try block and aborting the whole run.
         school_name = str(row['name']).replace('/', '_')
         url = row['url']
         try:
             driver.get(url)
             pause()

             urls_txt = os.path.join(URLS_DIR, f'{school_name}_urls.txt')
             # Truncate so a re-run starts from an empty file.
             with open(urls_txt, 'w', encoding='utf-8'):
                 pass

             seen = set()
             _append_unseen(urls_txt, _profile_urls_on_page(driver), seen)
             while True:
                 # find_elements returns an empty list instead of raising, so
                 # "button is gone" is told apart from real failures.
                 buttons = driver.find_elements(
                     By.CSS_SELECTOR,
                     'button.scaffold-finite-scroll__load-button')
                 if not buttons:
                     print('No more "Show more" button found, moving on.')
                     break
                 try:
                     driver.execute_script(
                         'arguments[0].scrollIntoView(true)', buttons[0])
                     buttons[0].click()
                 except Exception as click_err:
                     # Best effort: keep what was collected and move on.
                     print(f'Could not click "Show more" ({click_err}), moving on.')
                     break
                 pause()
                 _append_unseen(urls_txt, _profile_urls_on_page(driver), seen)

         except Exception as e:
             print(f'{idx} - {school_name}: failed ({e})')
         finally:
             time.sleep(2)

 def main():
     """Read the school list, start Chrome with saved cookies, and scrape every school."""
     column_map = {'InstiName': 'name', 'SchoolLinkedinUrl': 'url'}
     raw_schools = pd.read_csv(DATA_FILE, usecols=['InstiName', 'SchoolLinkedinUrl'])
     schools = raw_schools.rename(columns=column_map)

     browser = init_driver(headless=False)
     try:
         # Open the site, add the saved cookies, then refresh the page.
         browser.get('https://www.linkedin.com/')
         load_cookies(browser, COOKIE_FILE)
         browser.refresh()
         pause()

         get_page(browser, start=0, end=len(schools), urls_df=schools)
     finally:
         # Always close the browser, even when scraping fails.
         browser.quit()

 # Run the scraper only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from bs4 import BeautifulSoup as bs
	import pandas as pd
	import time
	import json
	import random
	import os

	# Default JSON cookie file read by load_cookies().
	COOKIE_FILE = 'linkedin_cookies.json'
	# CSV read by main(), which uses its InstiName and SchoolLinkedinUrl columns.
	DATA_FILE = 'data/linkedin_schools.csv'
	# Directory where get_page() writes one '<school>_urls.txt' file per school.
	URLS_DIR = 'data/urls'

	def pause(min_seconds: float = 3, max_seconds: float = 8) -> None:
	    """Sleep for a random interval to mimic human behavior.

	    Args:
	        min_seconds: Shortest possible delay, in seconds (default 3).
	        max_seconds: Longest possible delay, in seconds (default 8).

	    Raises:
	        ValueError: If a bound is negative or ``max_seconds`` is smaller
	            than ``min_seconds``.
	    """
	    if min_seconds < 0 or max_seconds < min_seconds:
	        raise ValueError(
	            f"Invalid pause bounds: min_seconds={min_seconds}, max_seconds={max_seconds}")
	    # uniform() accepts fractional bounds and yields non-integer delays,
	    # which look less mechanical than whole-second sleeps.
	    time.sleep(random.uniform(min_seconds, max_seconds))


	def load_cookies(driver, path=COOKIE_FILE):
	    """Load cookies from a JSON file into the browser session.

	    The file is expected to contain a JSON list of cookie dicts, each of
	    which is passed to ``driver.add_cookie``.

	    Args:
	        driver: WebDriver instance that receives the cookies.
	        path: Location of the JSON cookie file.

	    Raises:
	        FileNotFoundError: If ``path`` does not exist.
	    """
	    try:
	        # Open directly rather than checking os.path.exists() first: one
	        # attempt avoids the window where the file disappears in between.
	        # JSON is read as UTF-8 instead of the platform default encoding.
	        with open(path, 'r', encoding='utf-8') as f:
	            cookies = json.load(f)
	    except FileNotFoundError as exc:
	        raise FileNotFoundError(f"Cookie file not found: {path}") from exc

	    for cookie in cookies:
	        if 'expiry' in cookie:
	            if cookie['expiry'] is None:
	                # A null expiry cannot be converted; drop the key so the
	                # cookie is added without an expiration time.
	                del cookie['expiry']
	            else:
	                # Exported cookies may store the expiry as a float.
	                cookie['expiry'] = int(cookie['expiry'])
	        driver.add_cookie(cookie)

	def init_driver(headless: bool = False) -> webdriver.Chrome:
	    """Initialize the Chrome WebDriver with optional headless mode.

	    Args:
	        headless: When true, Chrome is started with the ``--headless`` flag.

	    Returns:
	        The newly created Chrome driver, with its window maximized.
	    """
	    options = webdriver.ChromeOptions()
	    if headless:
	        options.add_argument('--headless')
	    driver = webdriver.Chrome(options=options)
	    driver.maximize_window()
	    return driver

	def _profile_urls_on_page(driver):
	    """Return the query-stripped profile links currently rendered, in page order."""
	    anchors = driver.find_elements(
	        By.CSS_SELECTOR,
	        'div.org-people-profile-card__profile-info a[data-test-app-aware-link]')
	    urls = []
	    for anchor in anchors:
	        href = anchor.get_attribute('href')
	        if href:  # an anchor without an href would crash on .split
	            urls.append(href.split('?')[0])
	    return urls


	def _append_unseen(urls_txt, urls, seen):
	    """Append URLs not yet in ``seen`` to ``urls_txt``, record them, and return the count."""
	    # dict.fromkeys() de-duplicates within this batch while keeping page order.
	    fresh = [u for u in dict.fromkeys(urls) if u not in seen]
	    if fresh:
	        with open(urls_txt, 'a', encoding='utf-8') as f:
	            f.writelines(u + '\n' for u in fresh)
	        seen.update(fresh)
	    print(f'Appended {len(fresh)} profiles to {urls_txt}')
	    return len(fresh)


	def get_page(driver, start: int, end: int, urls_df: pd.DataFrame):
	    """Scrape pages, click "Show more" repeatedly, and append profile URLs incrementally.

	    For each row of ``urls_df.iloc[start:end]`` the page at ``row['url']`` is
	    opened and the profile links found on it are written to
	    ``URLS_DIR/<name>_urls.txt``, one URL per line. Each URL is written at
	    most once per school: the page keeps earlier cards after "Show more" is
	    clicked, so only links not seen before are appended.

	    Args:
	        driver: WebDriver used to load and interact with the pages.
	        start: Positional index of the first row to process.
	        end: Positional index one past the last row to process.
	        urls_df: DataFrame with ``name`` and ``url`` columns.

	    A failure on one school is printed and the loop continues with the next.
	    """
	    segment = urls_df.iloc[start:end]
	    os.makedirs(URLS_DIR, exist_ok=True)

	    for idx, row in segment.iterrows():
	        # str() keeps a missing or non-string name (e.g. NaN) from raising
	        # outside the try block and aborting the whole run.
	        school_name = str(row['name']).replace('/', '_')
	        url = row['url']
	        try:
	            driver.get(url)
	            pause()

	            urls_txt = os.path.join(URLS_DIR, f'{school_name}_urls.txt')
	            # Truncate so a re-run starts from an empty file.
	            with open(urls_txt, 'w', encoding='utf-8'):
	                pass

	            seen = set()
	            _append_unseen(urls_txt, _profile_urls_on_page(driver), seen)
	            while True:
	                # find_elements returns an empty list instead of raising, so
	                # "button is gone" is told apart from real failures.
	                buttons = driver.find_elements(
	                    By.CSS_SELECTOR,
	                    'button.scaffold-finite-scroll__load-button')
	                if not buttons:
	                    print('No more "Show more" button found, moving on.')
	                    break
	                try:
	                    driver.execute_script(
	                        'arguments[0].scrollIntoView(true)', buttons[0])
	                    buttons[0].click()
	                except Exception as click_err:
	                    # Best effort: keep what was collected and move on.
	                    print(f'Could not click "Show more" ({click_err}), moving on.')
	                    break
	                pause()
	                _append_unseen(urls_txt, _profile_urls_on_page(driver), seen)

	        except Exception as e:
	            print(f'{idx} - {school_name}: failed ({e})')
	        finally:
	            time.sleep(2)

	def main():
	    """Read the school list, start Chrome with saved cookies, and scrape every school."""
	    urls_df = pd.read_csv(
	        DATA_FILE,
	        usecols=['InstiName', 'SchoolLinkedinUrl']
	    ).rename(columns={'InstiName': 'name', 'SchoolLinkedinUrl': 'url'})

	    driver = init_driver(headless=False)
	    try:
	        # Open the site, add the saved cookies, then refresh the page.
	        driver.get('https://www.linkedin.com/')
	        load_cookies(driver, COOKIE_FILE)
	        driver.refresh()
	        pause()

	        get_page(driver, start=0, end=len(urls_df), urls_df=urls_df)
	    finally:
	        # Always close the browser, even when scraping fails.
	        driver.quit()

	# Run the scraper only when this file is executed as a script.
	if __name__ == '__main__':
	    main()