BexTuychiev · June 13, 2026 07:21
diff --git a/job_scraper.py b/job_scraper.py
 """Scrape a job board into structured data, then rank roles against a resume.

 Pipeline: scrape the listing, batch-extract each ATS detail page, match to a
 resume with an LLM. One schema works across Ashby, Greenhouse, and any other ATS,
 so there are no per-site selectors to maintain.
 """

 import json
 import os
 import sys
 import time
 from typing import List, Optional

 from firecrawl import Firecrawl
 from openai import OpenAI
 from pydantic import BaseModel

 # API clients are created once, at import time. os.environ[...] raises
 # KeyError when FIRECRAWL_API_KEY or OPENAI_API_KEY is not set, so both must
 # be exported before this module is imported or run.
 fc = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
 oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


 # One role as it appears on the board listing; the element type of
 # JobBoard.jobs. match_resume serializes these with model_dump() for the prompt.
 class Job(BaseModel):
    # Only required field (no default).
    title: str
    # Defaults to None when not supplied.
    location: Optional[str] = None
    # Defaults to None; __main__ only samples detail pages from jobs with a url.
    url: Optional[str] = None


 # Wrapper schema for the listing page. scrape_listing passes this class to
 # fc.scrape as the JSON schema and reads the "jobs" key from the result.
 class JobBoard(BaseModel):
    jobs: List[Job]


 # Schema for a single job detail page. extract_details hands this class to
 # fc.batch_scrape as the JSON schema and builds one instance per returned
 # document, so the same fields are requested from every page.
 class JobDetail(BaseModel):
    # Only required field; everything below has a default.
    job_title: str
    team: Optional[str] = None
    location: Optional[str] = None
    employment_type: Optional[str] = None
    # Kept as free text (str), not parsed into numbers.
    compensation: Optional[str] = None
    # NOTE(review): list-literal defaults are fine on a pydantic model, which is
    # documented to copy field defaults per instance (unlike a function default).
    key_responsibilities: List[str] = []
    required_skills: List[str] = []
    apply_url: Optional[str] = None


 # One ranked result returned by the model in match_resume.
 class Match(BaseModel):
    title: str
    url: Optional[str] = None
    # The one-line reason requested by the match_resume prompt.
    why: str
    # The prompt asks for a 0-100 score; the range is not enforced by this model.
    score: int


 # Wrapper passed as response_format in match_resume, which returns its
 # `top` list to the caller.
 class Matches(BaseModel):
    top: List[Match]


 def scrape_listing(board_url: str) -> List[Job]:
    """Pull the role list off the board. One scrape, one schema.

    Args:
        board_url: URL of the job board / careers page to scrape.

    Returns:
        The roles extracted from the page. Empty when the scrape raises, when
        the response carries no JSON payload, or when the payload has no
        (or a null) "jobs" entry.
    """
    try:
        result = fc.scrape(
            board_url,
            formats=[{"type": "json", "schema": JobBoard}],
        )
    except Exception as err:
        # Best-effort: report on stderr and let the caller carry on with no jobs.
        print(f"  scrape failed for {board_url}: {err}", file=sys.stderr)
        return []

    # A scrape can come back without a JSON payload (extract_details guards the
    # same attribute), and "jobs" may be present but null. Treat both as
    # "no roles" instead of crashing on None.get(...) or iterating None.
    payload = result.json or {}
    jobs = [Job(**j) for j in payload.get("jobs") or []]

    if len(jobs) >= 25:
        # Heads-up only; the list is still returned unchanged.
        print(f"  note: got {len(jobs)} roles; large boards lazy-load, so this may be a sample")

    return jobs


 def extract_details(urls: List[str], max_concurrency: int = 5) -> List[JobDetail]:
    """Scrape every detail page in one batch call, all against the same schema.

    Args:
        urls: Detail-page URLs to scrape.
        max_concurrency: Forwarded unchanged to fc.batch_scrape.

    Returns:
        One JobDetail per returned document that has a JSON payload. Documents
        without one are dropped, so the result can be shorter than ``urls``.
        Empty when ``urls`` is empty.
    """
    if not urls:
        # Nothing to scrape: skip the API round trip instead of submitting an
        # empty batch.
        return []

    batch = fc.batch_scrape(
        urls,
        formats=[{"type": "json", "schema": JobDetail}],
        max_concurrency=max_concurrency,
    )
    # `or []` keeps a batch that reports no data from raising on iteration.
    return [JobDetail(**doc.json) for doc in (batch.data or []) if doc.json]


 def match_resume(resume: str, jobs: List[Job]) -> List[Match]:
    """Rank jobs against a resume with structured outputs (typed object, no manual parsing).

    Args:
        resume: Resume text, embedded verbatim in the prompt.
        jobs: Candidate roles; serialized to JSON for the prompt.

    Returns:
        The ``top`` list of the parsed Matches object (the prompt asks for the
        top 3), or an empty list when ``jobs`` is empty.

    Raises:
        RuntimeError: If the completion carries no parsed object.
    """
    if not jobs:
        # With nothing to rank, skip the model call: it would cost a request
        # and could only describe roles that were never scraped.
        return []

    jobs_json = json.dumps([j.model_dump() for j in jobs])
    prompt = (
        f"Resume:\n{resume}\n\n"
        f"Jobs:\n{jobs_json}\n\n"
        "Return the top 3 fits, each with a one-line reason and a 0-100 score."
    )

    res = oai.chat.completions.parse(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        response_format=Matches,
    )

    # `parsed` is None when the reply could not be turned into a Matches object.
    parsed = res.choices[0].message.parsed
    if parsed is None:
        raise RuntimeError("model returned no parsed match (refusal or schema mismatch)")

    return parsed.top


 if __name__ == "__main__":
    # Usage: job_scraper.py [BOARD_URL]; expects sample_resume.md in the
    # working directory and writes scraped_jobs.json next to it.
    board_url = sys.argv[1] if len(sys.argv) > 1 else "https://www.firecrawl.dev/careers"

    # Context managers close the files deterministically instead of leaving
    # the handles to the garbage collector.
    with open("sample_resume.md", encoding="utf-8") as resume_file:
        resume = resume_file.read()

    t0 = time.time()
    jobs = scrape_listing(board_url)
    print(f"listing: {len(jobs)} roles in {time.time() - t0:.0f}s")

    # Only the first five roles that carry a URL are sampled for detail
    # extraction. Report the real sample size, which can be below five.
    sample_urls = [j.url for j in jobs if j.url][:5]
    details = extract_details(sample_urls) if sample_urls else []
    print(f"details: extracted {len(details)} of {len(sample_urls)} sampled pages")

    # No roles means nothing to rank; do not spend a model call on it.
    matches = match_resume(resume, jobs) if jobs else []
    print("\ntop matches:")
    for m in matches:
        print(f"  [{m.score}] {m.title} -> {m.why}")

    with open("scraped_jobs.json", "w", encoding="utf-8") as out_file:
        json.dump([j.model_dump() for j in jobs], out_file, indent=2)
    print(f"\nsaved {len(jobs)} jobs to scraped_jobs.json in {time.time() - t0:.0f}s total")
	"""Scrape a job board into structured data, then rank roles against a resume.

	Pipeline: scrape the listing, batch-extract each ATS detail page, match to a
	resume with an LLM. One schema works across Ashby, Greenhouse, and any other ATS,
	so there are no per-site selectors to maintain.
	"""

	import json
	import os
	import sys
	import time
	from typing import List, Optional

	from firecrawl import Firecrawl
	from openai import OpenAI
	from pydantic import BaseModel

	# API clients are created once, at import time. os.environ[...] raises
	# KeyError when FIRECRAWL_API_KEY or OPENAI_API_KEY is not set, so both must
	# be exported before this module is imported or run.
	fc = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
	oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


	# One role as it appears on the board listing; the element type of
	# JobBoard.jobs. match_resume serializes these with model_dump() for the prompt.
	class Job(BaseModel):
	    # Only required field (no default).
	    title: str
	    # Defaults to None when not supplied.
	    location: Optional[str] = None
	    # Defaults to None; __main__ only samples detail pages from jobs with a url.
	    url: Optional[str] = None


	# Wrapper schema for the listing page. scrape_listing passes this class to
	# fc.scrape as the JSON schema and reads the "jobs" key from the result.
	class JobBoard(BaseModel):
	    jobs: List[Job]


	# Schema for a single job detail page. extract_details hands this class to
	# fc.batch_scrape as the JSON schema and builds one instance per returned
	# document, so the same fields are requested from every page.
	class JobDetail(BaseModel):
	    # Only required field; everything below has a default.
	    job_title: str
	    team: Optional[str] = None
	    location: Optional[str] = None
	    employment_type: Optional[str] = None
	    # Kept as free text (str), not parsed into numbers.
	    compensation: Optional[str] = None
	    # NOTE(review): list-literal defaults are fine on a pydantic model, which is
	    # documented to copy field defaults per instance (unlike a function default).
	    key_responsibilities: List[str] = []
	    required_skills: List[str] = []
	    apply_url: Optional[str] = None


	# One ranked result returned by the model in match_resume.
	class Match(BaseModel):
	    title: str
	    url: Optional[str] = None
	    # The one-line reason requested by the match_resume prompt.
	    why: str
	    # The prompt asks for a 0-100 score; the range is not enforced by this model.
	    score: int


	# Wrapper passed as response_format in match_resume, which returns its
	# `top` list to the caller.
	class Matches(BaseModel):
	    top: List[Match]


	def scrape_listing(board_url: str) -> List[Job]:
	    """Pull the role list off the board. One scrape, one schema.

	    Args:
	        board_url: URL of the job board / careers page to scrape.

	    Returns:
	        The roles extracted from the page. Empty when the scrape raises, when
	        the response carries no JSON payload, or when the payload has no
	        (or a null) "jobs" entry.
	    """
	    try:
	        result = fc.scrape(
	            board_url,
	            formats=[{"type": "json", "schema": JobBoard}],
	        )
	    except Exception as err:
	        # Best-effort: report on stderr and let the caller carry on with no jobs.
	        print(f" scrape failed for {board_url}: {err}", file=sys.stderr)
	        return []

	    # A scrape can come back without a JSON payload (extract_details guards the
	    # same attribute), and "jobs" may be present but null. Treat both as
	    # "no roles" instead of crashing on None.get(...) or iterating None.
	    payload = result.json or {}
	    jobs = [Job(**j) for j in payload.get("jobs") or []]

	    if len(jobs) >= 25:
	        # Heads-up only; the list is still returned unchanged.
	        print(f" note: got {len(jobs)} roles; large boards lazy-load, so this may be a sample")

	    return jobs


	def extract_details(urls: List[str], max_concurrency: int = 5) -> List[JobDetail]:
	    """Scrape every detail page in one batch call, all against the same schema.

	    Args:
	        urls: Detail-page URLs to scrape.
	        max_concurrency: Forwarded unchanged to fc.batch_scrape.

	    Returns:
	        One JobDetail per returned document that has a JSON payload. Documents
	        without one are dropped, so the result can be shorter than ``urls``.
	        Empty when ``urls`` is empty.
	    """
	    if not urls:
	        # Nothing to scrape: skip the API round trip instead of submitting an
	        # empty batch.
	        return []

	    batch = fc.batch_scrape(
	        urls,
	        formats=[{"type": "json", "schema": JobDetail}],
	        max_concurrency=max_concurrency,
	    )
	    # `or []` keeps a batch that reports no data from raising on iteration.
	    return [JobDetail(**doc.json) for doc in (batch.data or []) if doc.json]


	def match_resume(resume: str, jobs: List[Job]) -> List[Match]:
	    """Rank jobs against a resume with structured outputs (typed object, no manual parsing).

	    Args:
	        resume: Resume text, embedded verbatim in the prompt.
	        jobs: Candidate roles; serialized to JSON for the prompt.

	    Returns:
	        The ``top`` list of the parsed Matches object (the prompt asks for the
	        top 3), or an empty list when ``jobs`` is empty.

	    Raises:
	        RuntimeError: If the completion carries no parsed object.
	    """
	    if not jobs:
	        # With nothing to rank, skip the model call: it would cost a request
	        # and could only describe roles that were never scraped.
	        return []

	    jobs_json = json.dumps([j.model_dump() for j in jobs])
	    prompt = (
	        f"Resume:\n{resume}\n\n"
	        f"Jobs:\n{jobs_json}\n\n"
	        "Return the top 3 fits, each with a one-line reason and a 0-100 score."
	    )

	    res = oai.chat.completions.parse(
	        model="gpt-5-nano",
	        messages=[{"role": "user", "content": prompt}],
	        response_format=Matches,
	    )

	    # `parsed` is None when the reply could not be turned into a Matches object.
	    parsed = res.choices[0].message.parsed
	    if parsed is None:
	        raise RuntimeError("model returned no parsed match (refusal or schema mismatch)")

	    return parsed.top


	if __name__ == "__main__":
	    # Usage: job_scraper.py [BOARD_URL]; expects sample_resume.md in the
	    # working directory and writes scraped_jobs.json next to it.
	    board_url = sys.argv[1] if len(sys.argv) > 1 else "https://www.firecrawl.dev/careers"

	    # Context managers close the files deterministically instead of leaving
	    # the handles to the garbage collector.
	    with open("sample_resume.md", encoding="utf-8") as resume_file:
	        resume = resume_file.read()

	    t0 = time.time()
	    jobs = scrape_listing(board_url)
	    print(f"listing: {len(jobs)} roles in {time.time() - t0:.0f}s")

	    # Only the first five roles that carry a URL are sampled for detail
	    # extraction. Report the real sample size, which can be below five.
	    sample_urls = [j.url for j in jobs if j.url][:5]
	    details = extract_details(sample_urls) if sample_urls else []
	    print(f"details: extracted {len(details)} of {len(sample_urls)} sampled pages")

	    # No roles means nothing to rank; do not spend a model call on it.
	    matches = match_resume(resume, jobs) if jobs else []
	    print("\ntop matches:")
	    for m in matches:
	        print(f" [{m.score}] {m.title} -> {m.why}")

	    with open("scraped_jobs.json", "w", encoding="utf-8") as out_file:
	        json.dump([j.model_dump() for j in jobs], out_file, indent=2)
	    print(f"\nsaved {len(jobs)} jobs to scraped_jobs.json in {time.time() - t0:.0f}s total")
No results found