@vignesh865
Created April 14, 2025 01:13
Web Scraper Client Using Playwright MCP.
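# Setup sketch (hedged): the pip package names below are inferred from the imports
# and may differ in your environment; the MCP server itself is fetched by npx.
#   pip install mcp openai pyyaml python-dotenv
#   Node.js with npx on PATH (for `npx @playwright/mcp@latest`)
#   a .env file in the working directory with: LLM_API_KEY=<OpenAI API key>
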
import asyncio
import json
import logging
import os
import re
from contextlib import AsyncExitStack
import yaml
from dotenv import load_dotenv
from mcp import ClientSession, StdioServerParameters, Tool
from mcp.client.stdio import stdio_client
from openai import AsyncOpenAI
load_dotenv() # load environment variables from .env
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def extract_yaml(text):
    """Parse the fenced yaml block out of a Playwright MCP tool result."""
    match = re.search(r'```yaml\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        extracted = match.group(1)
        return yaml.safe_load(extracted)
    raise ValueError("YAML block could not be extracted from the tool output")

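# A quick usage sketch of the extraction above:
#   extract_yaml("page state:\n```yaml\ntitle: Example\n```")  ->  {'title': 'Example'}
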
def ignore_tags(tag):
    """True for accessibility-tree entries dropped to save tokens (img/a/link nodes)."""
    parts = tag.split()
    return bool(parts) and parts[0] in ("img", "a", "link")

def clean_elements(data):
    # Strings: drop img/a/link nodes, keep everything else
    if isinstance(data, str):
        if ignore_tags(data):
            return None
        return data
    # Lists: clean each item recursively, dropping empties
    elif isinstance(data, list):
        cleaned_list = []
        for item in data:
            cleaned_item = clean_elements(item)
            if cleaned_item:
                cleaned_list.append(cleaned_item)
        return cleaned_list
    # Dicts: drop img/a/link keys, clean each value recursively
    elif isinstance(data, dict):
        cleaned_dict = {}
        for key, value in data.items():
            if ignore_tags(key):
                continue
            cleaned_value = clean_elements(value)
            if cleaned_value:
                cleaned_dict[key] = cleaned_value
        return cleaned_dict
    # Any other type passes through unchanged
    return data

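# Illustration on a hypothetical snapshot fragment (real Playwright MCP snapshots
# are larger, but node labels start with the tag name, which ignore_tags keys on):
#   clean_elements({"article": ['heading "Title"', 'img "logo"', 'link "Home"']})
#   ->  {"article": ['heading "Title"']}
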
def convert_to_openai_tool(tool: Tool):
    """Map an MCP tool definition onto the OpenAI function-calling tool schema."""
    return {
        "type": "function",
        "function": {
            "name": tool.name,
            "description": tool.description,
            "parameters": tool.inputSchema,
        },
    }

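# For instance, the Playwright MCP server exposes tools such as browser_navigate
# (the exact tool list and schemas depend on the @playwright/mcp release), which
# converts to roughly:
#   {"type": "function",
#    "function": {"name": "browser_navigate", "description": "...", "parameters": {...}}}
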
class WebScrapClient:
    def __init__(self, llm_api_key):
        # Initialize session and client objects
        self.exit_stack = AsyncExitStack()
        self.openai = AsyncOpenAI(api_key=llm_api_key)
        self.session: ClientSession | None = None
        self.tools = []

    async def connect_to_server(self):
        """Connect to an MCP server"""
        server_params = StdioServerParameters(
            command="npx",
            args=["@playwright/mcp@latest"],
            env=None
        )
        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
        stdio, write = stdio_transport
        self.session = await self.exit_stack.enter_async_context(ClientSession(stdio, write))
        await self.session.initialize()
        tools = await self.session.list_tools()
        self.tools = [convert_to_openai_tool(tool) for tool in tools.tools]

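    # The method below is a minimal tool-use agent loop: GPT-4o picks one Playwright
    # MCP tool per turn, the client executes it, and only the latest pruned snapshot
    # is kept in the conversation to bound token usage. The loop ends once the model
    # answers with plain text instead of a tool call.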
    async def scrap_article(self, link: str) -> str:
        """Scrape the article at the given link using GPT-4o and the MCP tools"""
        logger.info(f"Scraping the link - {link}")
        scrap_request = [
            {"role": "system", "content": """You are an expert web scraper. You will be provided with a URL.
            Your task is to:
            1. Navigate to the given webpage.
            2. If the page contains a button such as "View More", "Read More", "Load More", or anything similar that reveals additional article content, click it.
            3. Ensure that the full content of the article is visible.
            4. Extract the complete content of the article.
            Use available tools as needed to accomplish the task."""},
            {"role": "user",
             "content": f"Link:{link}"}
        ]
        is_tool_called_before = False
        while True:
            completion = await self.openai.chat.completions.create(
                model="gpt-4o",
                messages=scrap_request,
                tools=self.tools
            )
            message = completion.choices[0].message
            if message.tool_calls:
                tool = message.tool_calls[0].function
                playwright_result = await self.session.call_tool(tool.name, json.loads(tool.arguments))
                # Token optimization: keep only the pruned YAML snapshot, not the raw tool output
                snapshot_yaml = extract_yaml(playwright_result.content[0].text)
                cleaned_yaml = yaml.dump(clean_elements(snapshot_yaml))
                # Feed the tool outcome back to the model as user messages
                status_msg = {"role": "user", "content": f"Status {tool.name} with {tool.arguments}: Success"}
                result_msg = {"role": "user", "content": f"Result of {tool.name}: {cleaned_yaml}"}
                if not is_tool_called_before:
                    scrap_request.extend([status_msg, result_msg])
                    is_tool_called_before = True
                else:
                    # Drop the previous (bulky) result message; the status history is kept
                    scrap_request.pop()
                    scrap_request.extend([status_msg, result_msg])
                continue
            if message.content:
                return message.content

    async def cleanup(self):
        """Clean up resources"""
        await self.exit_stack.aclose()

async def main(link):
    client = WebScrapClient(llm_api_key=os.getenv("LLM_API_KEY"))
    try:
        await client.connect_to_server()
        return await client.scrap_article(link)
    finally:
        await client.cleanup()

if __name__ == "__main__":
    # print(asyncio.run(main("https://finance.yahoo.com/news/nvidia-corporation-nvda-top-tech-122957910.html")))
    print(asyncio.run(main("https://finance.yahoo.com/news/furiosaai-turns-down-metas-800m-230222314.html")))