Skip to content

Instantly share code, notes, and snippets.

@BexTuychiev
Last active June 13, 2026 07:21
Show Gist options
  • Select an option

  • Save BexTuychiev/a9225263d95b83f8a7fb8585233cb9a4 to your computer and use it in GitHub Desktop.

Select an option

Save BexTuychiev/a9225263d95b83f8a7fb8585233cb9a4 to your computer and use it in GitHub Desktop.
Scrape any company job board into structured JSON with one Firecrawl schema, then rank roles against a resume (firecrawl-py + OpenAI)
"""Scrape a job board into structured data, then rank roles against a resume.
Pipeline: scrape the listing, batch-extract each ATS detail page, match to a
resume with an LLM. One schema works across Ashby, Greenhouse, and any other ATS,
so there are no per-site selectors to maintain.
"""
import json
import os
import sys
import time
from typing import List, Optional
from firecrawl import Firecrawl
from openai import OpenAI
from pydantic import BaseModel
# Shared API clients, created once when the module is loaded. Indexing
# os.environ raises KeyError right away if either variable is unset.
fc = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
class Job(BaseModel):
    # One role as it appears on the board's listing page. scrape_listing builds
    # an instance from each entry of the scraped "jobs" array.
    title: str  # required; the only field without a default
    location: Optional[str] = None  # None when not provided
    url: Optional[str] = None  # detail-page link; roles without one are skipped when sampling details
class JobBoard(BaseModel):
    # Extraction schema passed to fc.scrape for the listing page; the "jobs"
    # key is what scrape_listing reads back out of the response.
    jobs: List[Job]
class JobDetail(BaseModel):
    # Extraction schema passed to fc.batch_scrape for individual detail pages.
    # Only job_title is required; every other field falls back to None or an
    # empty list when absent.
    job_title: str
    team: Optional[str] = None
    location: Optional[str] = None
    employment_type: Optional[str] = None
    compensation: Optional[str] = None  # free text, not parsed into numbers
    key_responsibilities: List[str] = []
    required_skills: List[str] = []
    apply_url: Optional[str] = None
class Match(BaseModel):
    # One ranked role returned by the LLM in match_resume.
    title: str
    url: Optional[str] = None
    why: str  # the prompt asks for a one-line reason
    score: int  # the prompt asks for 0-100; the range is not enforced by this type
class Matches(BaseModel):
    # Structured-output envelope used as response_format in match_resume, which
    # returns the `top` list to its caller.
    top: List[Match]
def scrape_listing(board_url: str) -> List[Job]:
    """Pull the role list off the board. One scrape, one schema.

    Args:
        board_url: URL of the careers / job-board page to scrape.

    Returns:
        The roles extracted from the page. An empty list is returned (with a
        message on stderr) when the scrape call raises or when the response
        carries no structured JSON payload.
    """
    try:
        result = fc.scrape(
            board_url,
            formats=[{"type": "json", "schema": JobBoard}],
        )
    except Exception as err:
        # Best-effort boundary: report the failure and let the caller carry on
        # with an empty listing instead of aborting the whole run.
        print(f" scrape failed for {board_url}: {err}", file=sys.stderr)
        return []

    # The JSON payload can be missing even when the scrape itself succeeded;
    # calling .get() on None would raise AttributeError.
    payload = result.json
    if not payload:
        print(f" no structured data returned for {board_url}", file=sys.stderr)
        return []

    # `or []` also covers a payload whose "jobs" value is an explicit null.
    jobs = [Job(**entry) for entry in payload.get("jobs") or []]
    if len(jobs) >= 25:
        print(f" note: got {len(jobs)} roles; large boards lazy-load, so this may be a sample")
    return jobs
def extract_details(urls: List[str], max_concurrency: int = 5) -> List[JobDetail]:
    """Scrape every detail page in one batch call, all against the same schema.

    Args:
        urls: Detail-page URLs to scrape.
        max_concurrency: Forwarded unchanged to the batch scrape call.

    Returns:
        One JobDetail per page that came back with a JSON payload; pages
        without one are skipped. An empty ``urls`` yields an empty list
        without contacting the API.
    """
    # Nothing to scrape: skip the API round-trip instead of submitting an
    # empty batch.
    if not urls:
        return []

    batch = fc.batch_scrape(
        urls,
        formats=[{"type": "json", "schema": JobDetail}],
        max_concurrency=max_concurrency,
    )
    return [JobDetail(**doc.json) for doc in batch.data if doc.json]
def match_resume(resume: str, jobs: List[Job]) -> List[Match]:
    """Rank jobs against a resume with structured outputs (typed object, no manual parsing).

    Args:
        resume: Resume text, embedded verbatim in the prompt.
        jobs: Candidate roles; each is serialized with ``model_dump`` into the prompt.

    Returns:
        The matches the model returned in ``Matches.top``. An empty ``jobs``
        list yields an empty result without calling the model.

    Raises:
        RuntimeError: If the response carries no parsed object.
    """
    # With no candidates there is nothing to rank; asking the model for
    # "top 3 fits" anyway would only invite made-up roles.
    if not jobs:
        return []

    jobs_json = json.dumps([job.model_dump() for job in jobs])
    prompt = (
        f"Resume:\n{resume}\n\n"
        f"Jobs:\n{jobs_json}\n\n"
        "Return the top 3 fits, each with a one-line reason and a 0-100 score."
    )
    res = oai.chat.completions.parse(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        response_format=Matches,
    )
    parsed = res.choices[0].message.parsed
    if parsed is None:
        raise RuntimeError("model returned no parsed match (refusal or schema mismatch)")
    return parsed.top
def main() -> None:
    """Run the pipeline end to end: listing, sampled details, resume match, JSON dump.

    The board URL comes from the first command-line argument and falls back to
    the Firecrawl careers page. The resume is read from ``sample_resume.md``
    and the scraped roles are written to ``scraped_jobs.json``, both relative
    to the current working directory.
    """
    sample_size = 5  # number of detail pages to extract as a sample

    board_url = sys.argv[1] if len(sys.argv) > 1 else "https://www.firecrawl.dev/careers"
    # Context managers close the handles deterministically; the explicit
    # encoding avoids depending on the platform default.
    with open("sample_resume.md", encoding="utf-8") as fh:
        resume = fh.read()

    t0 = time.time()
    jobs = scrape_listing(board_url)
    print(f"listing: {len(jobs)} roles in {time.time() - t0:.0f}s")

    sample_urls = [j.url for j in jobs if j.url][:sample_size]
    # Skip the batch call entirely when no role exposes a URL.
    details = extract_details(sample_urls) if sample_urls else []
    # Report against the number of pages actually requested, which can be
    # fewer than sample_size.
    print(f"details: extracted {len(details)} of {len(sample_urls)} sampled pages")

    # An empty listing has nothing to rank, so do not query the model.
    matches = match_resume(resume, jobs) if jobs else []
    print("\ntop matches:")
    for m in matches:
        print(f" [{m.score}] {m.title} -> {m.why}")

    with open("scraped_jobs.json", "w", encoding="utf-8") as fh:
        json.dump([j.model_dump() for j in jobs], fh, indent=2)
    print(f"\nsaved {len(jobs)} jobs to scraped_jobs.json in {time.time() - t0:.0f}s total")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment