jkbjh · July 4, 2025 11:54
diff --git a/ai-deadlines-to-md.py b/ai-deadlines-to-md.py
 import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 import re
 from datetime import datetime


 # Function to parse dates with different formats
 def parse_date_flexible(date_str):
    """Convert a month-name date string into a ``datetime.date``.

    Accepted layouts are an abbreviated or full month name, a day and a
    four-digit year, with or without a comma after the day
    (e.g. ``"Oct 2 2024"`` or ``"October 2, 2024"``).

    Raises:
        ValueError: if ``date_str`` fits none of the accepted layouts.
    """
    accepted_layouts = ("%b %d %Y", "%B %d %Y", "%b %d, %Y", "%B %d, %Y")
    for layout in accepted_layouts:
        try:
            parsed = datetime.strptime(date_str, layout)
        except ValueError:
            # Not this layout; move on to the next candidate.
            pass
        else:
            return parsed.date()
    raise ValueError(f"Could not parse date: {date_str}")


 # Function to extract abstract deadline date from text
 def extract_deadline_date(text):
    """Return the first recognisable calendar date found in ``text``.

    Recognised spellings are "Oct 2, 2024", "2 October 2024",
    "October 2, 2024" and ISO "2023-12-20"; ordinal suffixes ("2nd"),
    a trailing dot or comma after the month/day, and the "Sept"
    abbreviation are tolerated.

    Args:
        text: free-form text that may mention a date.

    Returns:
        A ``datetime.date`` for the first pattern whose match parses,
        or ``None`` when nothing in ``text`` yields a valid date.
    """
    # Define regex patterns for dates
    patterns = [
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # Oct 2, 2024
        r"\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{4}",  # 2 October 2024
        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # October 2, 2024
        r"\b\d{4}-\d{2}-\d{2}",  # 2023-12-20
    ]
    formats = (
        "%b %d %Y",  # Oct 2 2024
        "%B %d %Y",  # October 2 2024
        "%d %B %Y",  # 2 October 2024
        "%Y-%m-%d",  # 2023-12-20
    )

    # Try all patterns until one yields a parseable date
    for pattern in patterns:
        match = re.search(pattern, text)
        if not match:
            continue

        # Strip ordinal suffixes only when they directly follow the day
        # number: removing every "st" would turn "August" into "Augu".
        cleaned = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", match.group(0))
        # The patterns allow "." or "," after the month and day; strptime does not.
        cleaned = re.sub(r"[.,]", "", cleaned)
        # %b only understands the three-letter abbreviation "Sep".
        cleaned = re.sub(r"\bSept\b", "Sep", cleaned).strip()

        for fmt in formats:
            try:
                return datetime.strptime(cleaned, fmt).date()
            except ValueError:
                continue
    return None  # No valid date found


 # Step 1: Load the HTML page
 # The timeout keeps an unresponsive server from hanging the script, and
 # raise_for_status() stops an HTTP error page from being scraped as data.
 url = "https://aideadlin.es/?sub=ML,CV,RO,KR,AP"
 response = requests.get(url, timeout=30)
 response.raise_for_status()
 soup = BeautifulSoup(response.text, "html.parser")

 # Step 2: Extract conference items
 conf_divs = soup.find_all("div", class_="ConfItem")

 # Step 3: Parse information into one dict per conference
 data = []
 for div in conf_divs:
    try:
        title_tag = div.find("span", class_="conf-title")
        name = title_tag.get_text(strip=True) if title_tag else None

        link_tag = title_tag.find("a") if title_tag else None
        page_link = "https://aideadlin.es" + link_tag["href"] if link_tag else None

        # The icon span may be missing; guard it like every other lookup
        # instead of calling .find() on None.
        icon_span = div.find("span", class_="conf-title-icon")
        website_tag = icon_span.find("a") if icon_span else None
        website_link = website_tag["href"] if website_tag else None

        # Non-breaking spaces are normalised so the regexes below can use " ".
        date_span = div.find("span", class_="conf-date")
        date_text = date_span.get_text(strip=True).replace("\xa0", " ") if date_span else None

        # Two layouts are recognised: a single-month range such as
        # "Dec 10-15, 2024" and a cross-month range such as
        # "Nov 30 - Dec 2, 2024". Anything else leaves both dates as None.
        start_date = end_date = None
        if date_text:
            date_text = date_text.replace("–", "-")
            match = re.match(r"([A-Za-z]+) (\d+)[ -]+(\d+), (\d{4})", date_text)
            if match:
                month, start_day, end_day, year = match.groups()
                start_date = parse_date_flexible(f"{month} {start_day} {year}")
                end_date = parse_date_flexible(f"{month} {end_day} {year}")
            else:
                match = re.match(r"([A-Za-z]+ \d+)\s*-\s*([A-Za-z]+ \d+), (\d{4})", date_text)
                if match:
                    month_day_start, month_day_end, year = match.groups()
                    start_date = parse_date_flexible(f"{month_day_start} {year}")
                    end_date = parse_date_flexible(f"{month_day_end} {year}")

        place_tag = div.find("span", class_="conf-place")
        location = place_tag.get_text(strip=True) if place_tag else None

        note_div = div.find("div", class_="note")
        note = note_div.get_text(strip=True).replace("Note:", "") if note_div else None

        # Extract the abstract deadline from the note text, when there is one
        abstract_deadline = extract_deadline_date(note) if note else None

        tags = [span["data-sub"] for span in div.find_all("span", class_="conf-sub")]

        data.append(
            {
                "Name": name,
                "Detail Page": page_link,
                "Website": website_link,
                "Start Date": start_date,
                "End Date": end_date,
                "Location": location,
                "Note": note,
                "Deadline": abstract_deadline,
                "Tags": tags,
            }
        )

    except Exception:
        # Show the offending markup, then re-raise with the original
        # traceback so execution still stops on unexpected input.
        print("\n--- Error parsing the following conference div ---\n")
        print(div.prettify())
        raise

 # Step 4: Convert to DataFrame
 df = pd.DataFrame(data)

 # Show result
 print(df.to_markdown())
	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	import re
	from datetime import datetime


	# Function to parse dates with different formats
	def parse_date_flexible(date_str):
	    """Parse a month-name date string into a ``datetime.date``.

	    Tries, in order, abbreviated and full month names, each with and
	    without a comma after the day (e.g. ``"Oct 2 2024"``,
	    ``"October 2, 2024"``).

	    Raises:
	        ValueError: if ``date_str`` matches none of the formats.
	    """
	    for fmt in ("%b %d %Y", "%B %d %Y", "%b %d, %Y", "%B %d, %Y"):
	        try:
	            return datetime.strptime(date_str, fmt).date()
	        except ValueError:
	            # Wrong format for this string; try the next one.
	            continue
	    raise ValueError(f"Could not parse date: {date_str}")


	# Function to extract abstract deadline date from text
	def extract_deadline_date(text):
	    """Return the first recognisable calendar date found in ``text``.

	    Handles "Oct 2, 2024", "2 October 2024", "October 2, 2024" and ISO
	    "2023-12-20" spellings, including ordinal suffixes ("2nd"), a dot
	    or comma after the month/day, and the "Sept" abbreviation.

	    Args:
	        text: free-form text that may mention a date.

	    Returns:
	        A ``datetime.date`` for the first pattern whose match parses,
	        or ``None`` when no valid date is found.
	    """
	    # Define regex patterns for dates. The alternations use a plain "|";
	    # an escaped "\|" would only match a literal pipe character.
	    patterns = [
	        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # Oct 2, 2024
	        r"\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{4}",  # 2 October 2024
	        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)[.,]?\s+\d{1,2}(?:st|nd|rd|th)?[.,]?\s+\d{4}",  # October 2, 2024
	        r"\b\d{4}-\d{2}-\d{2}",  # 2023-12-20
	    ]

	    # Try all patterns until one yields a parseable date
	    for pattern in patterns:
	        match = re.search(pattern, text)
	        if match:
	            date_str = match.group(0)
	            # Remove ordinal suffixes only after a digit, so the "st"
	            # inside "August" survives.
	            cleaned = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", date_str)
	            # Drop the optional "." / "," the patterns accept.
	            cleaned = re.sub(r"[.,]", "", cleaned)
	            # strptime's %b expects "Sep", not "Sept".
	            cleaned = re.sub(r"\bSept\b", "Sep", cleaned).strip()

	            # Try parsing the cleaned string with known formats
	            for fmt in [
	                "%b %d %Y",  # Oct 2 2024
	                "%B %d %Y",  # October 2 2024
	                "%d %B %Y",  # 2 October 2024
	                "%Y-%m-%d",  # 2023-12-20
	            ]:
	                try:
	                    return datetime.strptime(cleaned, fmt).date()
	                except ValueError:
	                    continue
	    return None  # No valid date found


	# Step 1: Load the HTML page
	# The timeout keeps an unresponsive server from hanging the script, and
	# raise_for_status() stops an HTTP error page from being scraped as data.
	url = "https://aideadlin.es/?sub=ML,CV,RO,KR,AP"
	response = requests.get(url, timeout=30)
	response.raise_for_status()
	soup = BeautifulSoup(response.text, "html.parser")

	# Step 2: Extract conference items
	conf_divs = soup.find_all("div", class_="ConfItem")

	# Step 3: Parse information into one dict per conference
	data = []
	for div in conf_divs:
	    try:
	        title_tag = div.find("span", class_="conf-title")
	        name = title_tag.get_text(strip=True) if title_tag else None

	        link_tag = title_tag.find("a") if title_tag else None
	        page_link = "https://aideadlin.es" + link_tag["href"] if link_tag else None

	        # The icon span may be missing; guard it like every other lookup
	        # instead of calling .find() on None.
	        icon_span = div.find("span", class_="conf-title-icon")
	        website_tag = icon_span.find("a") if icon_span else None
	        website_link = website_tag["href"] if website_tag else None

	        # Non-breaking spaces are normalised so the regexes below can use " ".
	        date_span = div.find("span", class_="conf-date")
	        date_text = date_span.get_text(strip=True).replace("\xa0", " ") if date_span else None

	        # Two layouts are recognised: a single-month range such as
	        # "Dec 10-15, 2024" and a cross-month range such as
	        # "Nov 30 - Dec 2, 2024". Anything else leaves both dates as None.
	        start_date = end_date = None
	        if date_text:
	            date_text = date_text.replace("–", "-")
	            match = re.match(r"([A-Za-z]+) (\d+)[ -]+(\d+), (\d{4})", date_text)
	            if match:
	                month, start_day, end_day, year = match.groups()
	                start_date = parse_date_flexible(f"{month} {start_day} {year}")
	                end_date = parse_date_flexible(f"{month} {end_day} {year}")
	            else:
	                # \s* (not a bare \s) so the dash may have any amount of
	                # surrounding whitespace, including none.
	                match = re.match(r"([A-Za-z]+ \d+)\s*-\s*([A-Za-z]+ \d+), (\d{4})", date_text)
	                if match:
	                    month_day_start, month_day_end, year = match.groups()
	                    start_date = parse_date_flexible(f"{month_day_start} {year}")
	                    end_date = parse_date_flexible(f"{month_day_end} {year}")

	        place_tag = div.find("span", class_="conf-place")
	        location = place_tag.get_text(strip=True) if place_tag else None

	        note_div = div.find("div", class_="note")
	        note = note_div.get_text(strip=True).replace("Note:", "") if note_div else None

	        # Extract the abstract deadline from the note text, when there is one
	        abstract_deadline = None
	        if note:
	            abstract_deadline = extract_deadline_date(note)

	        tags = [span["data-sub"] for span in div.find_all("span", class_="conf-sub")]

	        data.append(
	            {
	                "Name": name,
	                "Detail Page": page_link,
	                "Website": website_link,
	                "Start Date": start_date,
	                "End Date": end_date,
	                "Location": location,
	                "Note": note,
	                "Deadline": abstract_deadline,
	                "Tags": tags,
	            }
	        )

	    except Exception:
	        # Show the offending markup, then re-raise with the original
	        # traceback so execution still stops on unexpected input.
	        print("\n--- Error parsing the following conference div ---\n")
	        print(div.prettify())
	        raise

	# Step 4: Convert to DataFrame
	df = pd.DataFrame(data)

	# Show result
	print(df.to_markdown())