Last active
June 13, 2026 07:21
-
-
Save BexTuychiev/a9225263d95b83f8a7fb8585233cb9a4 to your computer and use it in GitHub Desktop.
Scrape any company job board into structured JSON with one Firecrawl schema, then rank roles against a resume (firecrawl-py + OpenAI)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape a job board into structured data, then rank roles against a resume.

Pipeline: scrape the listing, batch-extract each ATS detail page, match to a
resume with an LLM. One schema works across Ashby, Greenhouse, and any other ATS,
so there are no per-site selectors to maintain.
"""
| import json | |
| import os | |
| import sys | |
| import time | |
| from typing import List, Optional | |
| from firecrawl import Firecrawl | |
| from openai import OpenAI | |
| from pydantic import BaseModel | |
# Shared API clients, built once when the module is loaded. Both keys are read
# with os.environ[...], so a missing variable raises KeyError right here.
fc = Firecrawl(
    api_key=os.environ["FIRECRAWL_API_KEY"],
)
oai = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)
class Job(BaseModel):
    # One role as extracted from the board's listing page.

    # Only the title is required; the other fields default to None.
    title: str
    location: Optional[str] = None
    # Detail-page link; the script only fetches details for roles that have one.
    url: Optional[str] = None
class JobBoard(BaseModel):
    # Extraction schema for a whole listing page: a wrapper around the list of
    # roles, passed as the "schema" of the listing scrape.

    jobs: List[Job]
class JobDetail(BaseModel):
    # Extraction schema for a single job detail page, passed as the "schema"
    # of the batch scrape. Only job_title is required.

    job_title: str

    # Optional scalar facts; None when not supplied.
    team: Optional[str] = None
    location: Optional[str] = None
    employment_type: Optional[str] = None
    compensation: Optional[str] = None

    # List-valued facts; default to an empty list when not supplied.
    key_responsibilities: List[str] = []
    required_skills: List[str] = []

    apply_url: Optional[str] = None
class Match(BaseModel):
    # One ranked role returned by the LLM matcher.

    title: str
    url: Optional[str] = None
    # Reason for the fit (the prompt asks for a one-line reason).
    why: str
    # Fit score; the prompt asks for 0-100, but the range is not enforced here.
    score: int
class Matches(BaseModel):
    # Response schema for the matching call: a wrapper around the ranked list,
    # used as the structured-output response format.

    top: List[Match]
def scrape_listing(board_url: str) -> List[Job]:
    """Pull the role list off the board. One scrape, one schema.

    Args:
        board_url: URL of the job board / careers listing page.

    Returns:
        The roles extracted from the page. An empty list is returned when the
        scrape call raises, or when it succeeds without a JSON payload or
        without any jobs in it.
    """
    try:
        result = fc.scrape(
            board_url,
            formats=[{"type": "json", "schema": JobBoard}],
        )
    except Exception as err:
        # Deliberate best-effort: report the failure and let the caller carry on.
        print(f" scrape failed for {board_url}: {err}", file=sys.stderr)
        return []

    # A successful scrape can still come back without structured data; treat
    # a missing payload (or a null "jobs" value) as "no roles" instead of
    # crashing on None.
    payload = result.json
    if not payload:
        print(f" no structured data returned for {board_url}", file=sys.stderr)
        return []
    jobs = [Job(**j) for j in payload.get("jobs") or []]

    if len(jobs) >= 25:
        print(f" note: got {len(jobs)} roles; large boards lazy-load, so this may be a sample")
    return jobs
def extract_details(urls: List[str], max_concurrency: int = 5) -> List[JobDetail]:
    """Scrape every detail page in one batch call, all against the same schema.

    Args:
        urls: Detail-page URLs to scrape.
        max_concurrency: Forwarded to the batch scrape call as its
            ``max_concurrency`` argument.

    Returns:
        One ``JobDetail`` per returned document that carries a JSON payload;
        documents without one are skipped. An empty ``urls`` list yields an
        empty result without contacting the API.
    """
    if not urls:
        # Nothing to fetch (e.g. the listing scrape found no links); skip the
        # round-trip rather than submit an empty batch.
        return []

    job = fc.batch_scrape(
        urls,
        formats=[{"type": "json", "schema": JobDetail}],
        max_concurrency=max_concurrency,
    )
    return [JobDetail(**doc.json) for doc in job.data if doc.json]
def match_resume(resume: str, jobs: List[Job]) -> List[Match]:
    """Rank jobs against a resume with structured outputs (typed object, no manual parsing).

    Args:
        resume: Resume text, inserted verbatim into the prompt.
        jobs: Candidate roles; serialised to JSON for the prompt.

    Returns:
        The matches parsed from the model response. An empty ``jobs`` list
        yields an empty result without calling the model.

    Raises:
        RuntimeError: If the response carries no parsed object.
    """
    if not jobs:
        # With nothing to rank, a model call would only cost a request and
        # invite made-up roles.
        return []

    jobs_json = json.dumps([j.model_dump() for j in jobs])
    prompt = (
        f"Resume:\n{resume}\n\n"
        f"Jobs:\n{jobs_json}\n\n"
        "Return the top 3 fits, each with a one-line reason and a 0-100 score."
    )
    res = oai.chat.completions.parse(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
        response_format=Matches,
    )
    parsed = res.choices[0].message.parsed
    if parsed is None:
        raise RuntimeError("model returned no parsed match (refusal or schema mismatch)")
    return parsed.top
if __name__ == "__main__":
    # Usage: python <script> [board_url]; defaults to Firecrawl's careers page.
    board_url = sys.argv[1] if len(sys.argv) > 1 else "https://www.firecrawl.dev/careers"

    # Read the resume up front so a missing file fails before any API call.
    with open("sample_resume.md", encoding="utf-8") as resume_file:
        resume = resume_file.read()

    t0 = time.time()
    jobs = scrape_listing(board_url)
    print(f"listing: {len(jobs)} roles in {time.time() - t0:.0f}s")
    if not jobs:
        # Nothing to extract, rank, or save; exit non-zero with a message.
        sys.exit("no roles found; nothing to extract or rank")

    # Only sample up to five detail pages, and only roles that have a URL.
    sample_urls = [j.url for j in jobs if j.url][:5]
    details = extract_details(sample_urls) if sample_urls else []
    # Report against the real sample size, which can be smaller than five.
    print(f"details: extracted {len(details)} of {len(sample_urls)} sampled pages")

    matches = match_resume(resume, jobs)
    print("\ntop matches:")
    for m in matches:
        print(f" [{m.score}] {m.title} -> {m.why}")

    # Context manager so the output file is flushed and closed deterministically.
    with open("scraped_jobs.json", "w", encoding="utf-8") as out_file:
        json.dump([j.model_dump() for j in jobs], out_file, indent=2)
    print(f"\nsaved {len(jobs)} jobs to scraped_jobs.json in {time.time() - t0:.0f}s total")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment