Created
July 4, 2025 11:54
-
-
Save jkbjh/9a3b944c0165de76405464c27899c085 to your computer and use it in GitHub Desktop.
ai-deadlines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import re | |
from datetime import datetime | |
def parse_date_flexible(date_str):
    """Parse a "<month> <day> <year>" string into a ``datetime.date``.

    Accepted spellings are an abbreviated or full English month name,
    followed by the day and a four-digit year, with or without a comma
    after the day (e.g. ``"Oct 2 2024"``, ``"October 2, 2024"``).

    Before parsing, the input is lightly normalised so that harmless
    variations do not cause a failure:

    * runs of whitespace are collapsed and the ends are stripped,
    * a period directly after a letter is dropped (``"Oct."`` -> ``"Oct"``),
    * the four-letter abbreviation ``"Sept"`` is mapped to ``"Sep"``,
      because ``%b`` only knows the three-letter form.

    Args:
        date_str: The text to parse.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        ValueError: If none of the supported formats match.
    """
    # re.sub (rather than str methods) keeps the TypeError that strptime
    # used to raise when a non-string is passed in.
    normalised = re.sub(r"\s+", " ", date_str).strip()
    normalised = re.sub(r"(?<=[A-Za-z])\.", "", normalised)
    normalised = re.sub(r"\bSept\b", "Sep", normalised)

    for fmt in ("%b %d %Y", "%B %d %Y", "%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(normalised, fmt).date()
        except ValueError:
            continue
    # Report the caller's original text, not the normalised form.
    raise ValueError(f"Could not parse date: {date_str}")
# Date spellings recognised inside free text. They are tried in this order.
_DEADLINE_DATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # Oct 2, 2024
        r"\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{4}",  # 2 October 2024
        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # October 2, 2024
        r"\b\d{4}-\d{2}-\d{2}",  # 2023-12-20
    )
)

# strptime layouts applied to a matched, cleaned-up date string.
_DEADLINE_DATE_FORMATS = (
    "%b %d %Y",  # Oct 2 2024
    "%B %d %Y",  # October 2 2024
    "%d %B %Y",  # 2 October 2024
    "%Y-%m-%d",  # 2023-12-20
)


def extract_deadline_date(text):
    """Extract a date from free-form text.

    Each pattern is searched for in turn. The first match that can also be
    parsed with one of the known formats is returned; a match that fails
    to parse is skipped and the next pattern is tried.

    Args:
        text: The text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        A ``datetime.date``, or ``None`` when no parseable date is found.
    """
    if not text:
        return None

    for pattern in _DEADLINE_DATE_PATTERNS:
        match = pattern.search(text)
        if not match:
            continue

        cleaned = match.group(0)
        # Drop ordinal suffixes only when they directly follow a digit.
        # Removing every "st"/"nd"/"rd"/"th" would also mangle month names
        # ("August" -> "Augu") and make those dates unparseable.
        cleaned = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", cleaned)
        # The patterns tolerate "." and "," after the month/day; strptime
        # does not, so strip them.
        cleaned = re.sub(r"[.,]", "", cleaned)
        # %b only understands the three-letter abbreviation.
        cleaned = re.sub(r"\bSept\b", "Sep", cleaned)
        cleaned = " ".join(cleaned.split())

        for fmt in _DEADLINE_DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt).date()
            except ValueError:
                continue

    return None  # No valid date found
# Step 1: Load the HTML page
url = "https://aideadlin.es/?sub=ML,CV,RO,KR,AP"
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it
# were the conference listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Extract conference items
conf_divs = soup.find_all("div", class_="ConfItem")

# Step 3: Parse information
data = []
for div in conf_divs:
    try:
        # Conference name and link to its detail page on the site.
        title_tag = div.find("span", class_="conf-title")
        name = title_tag.get_text(strip=True) if title_tag is not None else None
        link_tag = title_tag.find("a") if title_tag is not None else None
        page_link = (
            "https://aideadlin.es" + link_tag["href"] if link_tag is not None else None
        )

        # External website link. Guard the span lookup like every other
        # lookup here: calling .find() on a missing span would raise
        # AttributeError.
        icon_span = div.find("span", class_="conf-title-icon")
        website_tag = icon_span.find("a") if icon_span is not None else None
        website_link = website_tag["href"] if website_tag is not None else None

        # Conference dates; non-breaking spaces are replaced so the
        # regexes below can match on plain spaces.
        date_span = div.find("span", class_="conf-date")
        date_text = (
            date_span.get_text(strip=True).replace("\xa0", " ")
            if date_span is not None
            else None
        )
        start_date = end_date = None
        if date_text:
            # Treat an en dash like a plain hyphen.
            date_text = date_text.replace("–", "-")
            # Same-month range, e.g. "June 2-6, 2025".
            match = re.match(r"([A-Za-z]+) (\d+)[ -]+(\d+), (\d{4})", date_text)
            if match:
                month, start_day, end_day, year = match.groups()
                start_date = parse_date_flexible(f"{month} {start_day} {year}")
                end_date = parse_date_flexible(f"{month} {end_day} {year}")
            else:
                # Range spanning two months, e.g. "June 30 - July 4, 2025".
                match = re.match(
                    r"([A-Za-z]+ \d+)\s*-\s*([A-Za-z]+ \d+), (\d{4})", date_text
                )
                if match:
                    month_day_start, month_day_end, year = match.groups()
                    start_date = parse_date_flexible(f"{month_day_start} {year}")
                    end_date = parse_date_flexible(f"{month_day_end} {year}")

        place_tag = div.find("span", class_="conf-place")
        location = place_tag.get_text(strip=True) if place_tag is not None else None

        # Free-text note; strip again because removing the "Note:" label
        # can leave leading whitespace behind.
        note_div = div.find("div", class_="note")
        note = (
            note_div.get_text(strip=True).replace("Note:", "").strip()
            if note_div is not None
            else None
        )

        # Extract the abstract deadline from the note, if there is one.
        abstract_deadline = None
        if note:
            abstract_deadline = extract_deadline_date(note)

        tags = [span["data-sub"] for span in div.find_all("span", class_="conf-sub")]

        data.append(
            {
                "Name": name,
                "Detail Page": page_link,
                "Website": website_link,
                "Start Date": start_date,
                "End Date": end_date,
                "Location": location,
                "Note": note,
                "Deadline": abstract_deadline,  # Date extracted from the note
                "Tags": tags,
            }
        )
    except Exception:
        # Show the offending markup, then re-raise so the run still stops.
        print("\n--- Error parsing the following conference div ---\n")
        print(div.prettify())
        raise

# Step 4: Convert to DataFrame
df = pd.DataFrame(data)

# Show result
print(df.to_markdown())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment