""" define fn for extracting articles Example usage: url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3' markdown = extract_article(url) print(markdown) """ import requests import re from bs4 import BeautifulSoup def extract_article(url, strip_links=True): """ Given a URL to a website, extracts all the text article from the URL formatted as markdown. :param url: str, URL of the website to extract the article from :param strip_links: bool, decides whether or not to strip links (by default true) :return: str, article content in markdown format """ # Make a request to the URL response = requests.get(url) if response.status_code != 200: raise Exception( f"Request to {url} failed with status code {response.status_code}" ) try: # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(response.content, "html.parser") # Find the article content article = soup.find("article") # Remove unwanted elements from the article for element in article.find_all(["script", "style"]): element.extract() # Convert the article to markdown format markdown = "" # Add the article title title_elem = article.find("h1") if title_elem: title = title_elem.get_text() markdown += f"\n# {title}\n\n" # Add the article image image = article.find("img") if image: alt = image.get("alt") src = image.get("src") markdown += f"\n\n" # Add the article content for paragraph in article.find_all("p"): text = paragraph.get_text() if strip_links: try: text = re.sub(r"\[.*?\]\(.*?\)", "", text) # Strip links except re.error as e: print(f"Error stripping links from article text: {e}") markdown += f"{text}\n\n" # Log the number of words try: word_count = len(re.findall(r"\b\w+\b", markdown)) print(f"The article contains {word_count} words.") except TypeError: print("The article content is empty.") return markdown except Exception as e: print(f"Error extracting article content: {e}") return ""