"""
Define a function that extracts the article from a web page and returns it as markdown.


Example usage:

url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3'
markdown = extract_article(url)
print(markdown)
"""

import requests
import re
from bs4 import BeautifulSoup


# Matches literal markdown link syntax such as "[text](target)".
_MARKDOWN_LINK_RE = re.compile(r"\[.*?\]\(.*?\)")


def extract_article(url, strip_links=True, timeout=30):
    """
    Fetch a web page and return the content of its <article> element as markdown.

    The output is built, in order, from the first <h1> (as a level-1 heading),
    the first <img> that has a ``src`` (as a markdown image) and the text of
    every <p>, each followed by a blank line. A word count of the result is
    printed to stdout.

    :param url: str, URL of the website to extract the article from
    :param strip_links: bool, when true (the default) hyperlinks are reduced to
        their anchor text and any literal ``[text](target)`` sequences are
        removed from the paragraph text; when false every ``<a href=...>``
        inside a paragraph is rendered as a markdown link ``[text](href)``
    :param timeout: seconds to wait for the server, passed to ``requests.get``;
        ``None`` disables the timeout
    :return: str, article content in markdown format, or ``""`` when the page
        has no <article> element or the content cannot be parsed
    :raises Exception: if the response status code is not 200; errors raised by
        ``requests.get`` itself (connection failure, timeout) are not caught
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Request to {url} failed with status code {response.status_code}"
        )

    try:
        soup = BeautifulSoup(response.content, "html.parser")

        article = soup.find("article")
        if article is None:
            # find() returns None when nothing matches; report that explicitly
            # rather than letting an AttributeError surface further down.
            print(
                "Error extracting article content: "
                f"no <article> element found at {url}"
            )
            return ""

        # Script and style bodies are not article text.
        for element in article.find_all(["script", "style"]):
            element.extract()

        parts = []

        title_elem = article.find("h1")
        if title_elem is not None:
            # Collapse internal whitespace so the heading stays on one line.
            title = " ".join(title_elem.get_text().split())
            if title:
                parts.append(f"\n# {title}\n\n")

        image = article.find("img")
        if image is not None:
            src = image.get("src")
            if src:
                # A missing alt attribute must not be rendered as "None".
                alt = image.get("alt") or ""
                parts.append(f"![{alt}]({src})\n\n")

        for paragraph in article.find_all("p"):
            if strip_links:
                # get_text() already reduces anchors to their text; the regex
                # additionally drops literal markdown link syntax.
                text = _MARKDOWN_LINK_RE.sub("", paragraph.get_text())
            else:
                # Rewrite each anchor in place as a markdown link so that
                # get_text() keeps the target URL.
                for anchor in paragraph.find_all("a"):
                    href = anchor.get("href")
                    if href:
                        anchor.replace_with(f"[{anchor.get_text()}]({href})")
                text = paragraph.get_text()
            parts.append(f"{text}\n\n")

        markdown = "".join(parts)

        word_count = len(re.findall(r"\b\w+\b", markdown))
        if word_count:
            print(f"The article contains {word_count} words.")
        else:
            print("The article content is empty.")

        return markdown

    except Exception as e:
        # Deliberate best-effort boundary: any parsing failure is reported and
        # yields an empty result instead of propagating to the caller.
        print(f"Error extracting article content: {e}")
        return ""