Created
April 14, 2025 01:13
-
-
Save vignesh865/511b19ebc0672b22426c6998514ff3f3 to your computer and use it in GitHub Desktop.
Web Scraper Client Using Playwright MCP.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import json | |
import logging | |
import os | |
import re | |
from contextlib import AsyncExitStack | |
import yaml | |
from dotenv import load_dotenv | |
from mcp import ClientSession, StdioServerParameters, Tool | |
from mcp.client.stdio import stdio_client | |
from openai import AsyncOpenAI | |
load_dotenv()  # load environment variables from .env (e.g. LLM_API_KEY)

# Configure a handler on the root logger. The original only called
# setLevel(INFO) on a handler-less root logger, so records fell through to
# logging's "lastResort" handler, which filters at WARNING — every
# logger.info(...) below was silently dropped.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_yaml(text):
    """Parse and return the first ```yaml fenced block found in *text*.

    Raises ValueError when no fenced yaml block is present.
    """
    fenced = re.search(r'```yaml\s*(.*?)\s*```', text, re.DOTALL)
    if fenced is None:
        raise ValueError("Yaml cannot be extracted")
    return yaml.safe_load(fenced.group(1))
def ignore_tags(tag):
    """Return True when *tag*'s first word is a node type we want to prune.

    img/a/link snapshot nodes carry no article text, so they are dropped to
    save tokens before the snapshot is sent back to the LLM.

    Fixes two defects in the original: it compared ``tag.split(" ")[0]`` for
    "img" but ``tag.split()[0]`` for the others (inconsistent on leading or
    non-space whitespace), and ``split()[0]`` raised IndexError on empty or
    whitespace-only strings. One whitespace split with an empty-guard handles
    both.
    """
    parts = tag.split()
    return bool(parts) and parts[0] in ("img", "a", "link")
def clean_elements(data):
    """Recursively prune ignorable tags and falsy values from a snapshot tree.

    Strings whose first word is an ignored tag (img/a/link) map to None,
    lists and dicts are rebuilt keeping only truthy cleaned entries, and any
    other type passes through unchanged.
    """
    if isinstance(data, str):
        # Link/image labels carry no article text — drop them outright.
        return None if ignore_tags(data) else data

    if isinstance(data, list):
        # Keep only truthy cleaned items (empty strings/containers vanish).
        return [kept for kept in (clean_elements(item) for item in data) if kept]

    if isinstance(data, dict):
        # Skip ignored keys, then keep entries whose cleaned value is truthy.
        result = {}
        for key, value in data.items():
            if ignore_tags(key):
                continue
            cleaned = clean_elements(value)
            if cleaned:
                result[key] = cleaned
        return result

    # Non-string scalars (ints, bools, None, ...) pass through as-is.
    return data
def convert_to_openai_tool(tool: Tool):
    """Translate an MCP tool description into OpenAI's function-tool schema.

    NOTE(review): "isRevision" is not part of OpenAI's documented tool
    schema — presumably consumed elsewhere; confirm the API accepts the
    extra key.
    """
    function_spec = {
        "name": tool.name,
        "description": tool.description,
        "parameters": tool.inputSchema,
    }
    return {"type": "function", "function": function_spec, "isRevision": True}
class WebScrapClient:
    """Scrapes article pages by driving a Playwright MCP server with an LLM.

    Lifecycle: connect_to_server() -> scrap_article(link) -> cleanup().
    """

    def __init__(self, llm_api_key):
        """Prepare the OpenAI client and MCP bookkeeping.

        llm_api_key: API key forwarded to AsyncOpenAI.
        """
        # Owns the lifetimes of the stdio transport and the MCP session.
        self.exit_stack = AsyncExitStack()
        self.openai = AsyncOpenAI(api_key=llm_api_key)
        # Populated by connect_to_server(); None until then (the original
        # annotated this plain ClientSession, which misstated the type).
        self.session: ClientSession | None = None
        # OpenAI-format tool specs, filled in by connect_to_server().
        self.tools = []

    async def connect_to_server(self):
        """Connect to an MCP server"""
        # Launch the Playwright MCP server as a child process over stdio.
        server_params = StdioServerParameters(
            command="npx",
            args=["@playwright/mcp@latest"],
            env=None
        )
        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
        stdio, write = stdio_transport
        self.session = await self.exit_stack.enter_async_context(ClientSession(stdio, write))
        await self.session.initialize()
        # Advertise the server's tools to the LLM in OpenAI's tool format.
        tools = await self.session.list_tools()
        self.tools = [convert_to_openai_tool(tool) for tool in tools.tools]

    async def scrap_article(self, link: str) -> str:
        """Drive gpt-4o plus the browser tools until the article text is returned.

        Loop: ask the model; while it requests a tool call, execute the first
        requested call via the MCP session, feed back a token-pruned snapshot,
        and re-ask. Returns the model's final text content.

        Raises RuntimeError when the model replies with neither a tool call
        nor content — the original code would spin forever, re-billing the
        API on every iteration.
        """
        logger.info("Scraping the link - %s", link)
        scrap_request = [
            {"role": "system", "content": """You are an expert web scraper. You will be provided with a URL.
Your task is to:
1. Navigate to the given webpage.
2. If the page contains a button such as "View More", "Read More", "Load More", or anything similar that reveals additional article content, click it.
3. Ensure that the full content of the article is visible.
4. Extract the complete content of the article.
Use available tools as needed to accomplish the task."""},
            {"role": "user",
             "content": f"Link:{link}"}
        ]
        is_tool_called_before = False
        while True:
            completion = await self.openai.chat.completions.create(
                model="gpt-4o",
                messages=scrap_request,
                tools=self.tools
            )
            message = completion.choices[0].message
            if message.tool_calls:
                # Only the first requested call is executed; the model is
                # re-queried afterwards and can re-request anything remaining.
                tool = message.tool_calls[0].function
                playwright_result = await self.session.call_tool(tool.name, json.loads(tool.arguments))
                # Token optimization: parse the (large) snapshot yaml and
                # prune img/a/link nodes before echoing it to the model.
                snapshot_yaml = extract_yaml(playwright_result.content[0].text)
                cleaned_yaml = yaml.dump(clean_elements(snapshot_yaml))
                status_msg = {"role": "user", "content": f"Status {tool.name} with {tool.arguments}: Success"}
                result_msg = {"role": "user", "content": f"Result of {tool.name}: {cleaned_yaml}"}
                if is_tool_called_before:
                    # Drop only the previous (bulky) result message; the short
                    # status lines remain as a history of actions taken.
                    scrap_request.pop()
                else:
                    is_tool_called_before = True
                scrap_request.extend([status_msg, result_msg])
                continue
            if message.content:
                return message.content
            # No tool call and no content: fail fast instead of looping.
            raise RuntimeError("Model returned neither a tool call nor content")

    async def cleanup(self):
        """Clean up resources"""
        await self.exit_stack.aclose()
async def main(link):
    """Scrape one article link end-to-end, always releasing MCP resources."""
    client = WebScrapClient(llm_api_key=os.getenv("LLM_API_KEY"))
    try:
        await client.connect_to_server()
        article = await client.scrap_article(link)
    finally:
        await client.cleanup()
    return article
if __name__ == "__main__": | |
# print(asyncio.run(main("https://finance.yahoo.com/news/nvidia-corporation-nvda-top-tech-122957910.html"))) | |
print(asyncio.run(main("https://finance.yahoo.com/news/furiosaai-turns-down-metas-800m-230222314.html"))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment