Created
April 14, 2025 01:13
-
-
Save vignesh865/511b19ebc0672b22426c6998514ff3f3 to your computer and use it in GitHub Desktop.
Web Scraper Client Using Playwright MCP.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import json | |
import logging | |
import os | |
import re | |
from contextlib import AsyncExitStack | |
import yaml | |
from dotenv import load_dotenv | |
from mcp import ClientSession, StdioServerParameters, Tool | |
from mcp.client.stdio import stdio_client | |
from openai import AsyncOpenAI | |
load_dotenv()  # load environment variables from .env (e.g. LLM_API_KEY)

# Configure a handler on the root logger. The original only called
# setLevel(INFO) on a handler-less root logger, so records fell through to
# logging's "lastResort" handler, which filters at WARNING — every
# logger.info(...) below was silently dropped.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_yaml(text):
    """Parse and return the first ```yaml fenced block found in *text*.

    Raises ValueError when no fenced yaml block is present.
    """
    fenced = re.search(r'```yaml\s*(.*?)\s*```', text, re.DOTALL)
    if fenced is None:
        raise ValueError("Yaml cannot be extracted")
    return yaml.safe_load(fenced.group(1))
def ignore_tags(tag):
    """Return True when *tag*'s first word is a node type we want to prune.

    img/a/link snapshot nodes carry no article text, so they are dropped to
    save tokens before the snapshot is sent back to the LLM.

    Fixes two defects in the original: it compared ``tag.split(" ")[0]`` for
    "img" but ``tag.split()[0]`` for the others (inconsistent on leading or
    non-space whitespace), and ``split()[0]`` raised IndexError on empty or
    whitespace-only strings. One whitespace split with an empty-guard handles
    both.
    """
    parts = tag.split()
    return bool(parts) and parts[0] in ("img", "a", "link")
def clean_elements(data):
    """Recursively prune ignorable tags and falsy values from a snapshot tree.

    Strings whose first word is an ignored tag (img/a/link) map to None,
    lists and dicts are rebuilt keeping only truthy cleaned entries, and any
    other type passes through unchanged.
    """
    if isinstance(data, str):
        # Link/image labels carry no article text — drop them outright.
        return None if ignore_tags(data) else data

    if isinstance(data, list):
        # Keep only truthy cleaned items (empty strings/containers vanish).
        return [kept for kept in (clean_elements(item) for item in data) if kept]

    if isinstance(data, dict):
        # Skip ignored keys, then keep entries whose cleaned value is truthy.
        result = {}
        for key, value in data.items():
            if ignore_tags(key):
                continue
            cleaned = clean_elements(value)
            if cleaned:
                result[key] = cleaned
        return result

    # Non-string scalars (ints, bools, None, ...) pass through as-is.
    return data
def convert_to_openai_tool(tool: Tool):
    """Translate an MCP tool description into OpenAI's function-tool schema.

    NOTE(review): "isRevision" is not part of OpenAI's documented tool
    schema — presumably consumed elsewhere; confirm the API accepts the
    extra key.
    """
    function_spec = {
        "name": tool.name,
        "description": tool.description,
        "parameters": tool.inputSchema,
    }
    return {"type": "function", "function": function_spec, "isRevision": True}
class WebScrapClient:
    """Scrapes article pages by driving a Playwright MCP server with an LLM.

    Lifecycle: connect_to_server() -> scrap_article(link) -> cleanup().
    """

    def __init__(self, llm_api_key):
        """Prepare the OpenAI client and MCP bookkeeping.

        llm_api_key: API key forwarded to AsyncOpenAI.
        """
        # Owns the lifetimes of the stdio transport and the MCP session.
        self.exit_stack = AsyncExitStack()
        self.openai = AsyncOpenAI(api_key=llm_api_key)
        # Populated by connect_to_server(); None until then (the original
        # annotated this plain ClientSession, which misstated the type).
        self.session: ClientSession | None = None
        # OpenAI-format tool specs, filled in by connect_to_server().
        self.tools = []

    async def connect_to_server(self):
        """Connect to an MCP server"""
        # Launch the Playwright MCP server as a child process over stdio.
        server_params = StdioServerParameters(
            command="npx",
            args=["@playwright/mcp@latest"],
            env=None
        )
        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
        stdio, write = stdio_transport
        self.session = await self.exit_stack.enter_async_context(ClientSession(stdio, write))
        await self.session.initialize()
        # Advertise the server's tools to the LLM in OpenAI's tool format.
        tools = await self.session.list_tools()
        self.tools = [convert_to_openai_tool(tool) for tool in tools.tools]

    async def scrap_article(self, link: str) -> str:
        """Drive gpt-4o plus the browser tools until the article text is returned.

        Loop: ask the model; while it requests a tool call, execute the first
        requested call via the MCP session, feed back a token-pruned snapshot,
        and re-ask. Returns the model's final text content.

        Raises RuntimeError when the model replies with neither a tool call
        nor content — the original code would spin forever, re-billing the
        API on every iteration.
        """
        logger.info("Scraping the link - %s", link)
        scrap_request = [
            {"role": "system", "content": """You are an expert web scraper. You will be provided with a URL.
Your task is to:
1. Navigate to the given webpage.
2. If the page contains a button such as "View More", "Read More", "Load More", or anything similar that reveals additional article content, click it.
3. Ensure that the full content of the article is visible.
4. Extract the complete content of the article.
Use available tools as needed to accomplish the task."""},
            {"role": "user",
             "content": f"Link:{link}"}
        ]
        is_tool_called_before = False
        while True:
            completion = await self.openai.chat.completions.create(
                model="gpt-4o",
                messages=scrap_request,
                tools=self.tools
            )
            message = completion.choices[0].message
            if message.tool_calls:
                # Only the first requested call is executed; the model is
                # re-queried afterwards and can re-request anything remaining.
                tool = message.tool_calls[0].function
                playwright_result = await self.session.call_tool(tool.name, json.loads(tool.arguments))
                # Token optimization: parse the (large) snapshot yaml and
                # prune img/a/link nodes before echoing it to the model.
                snapshot_yaml = extract_yaml(playwright_result.content[0].text)
                cleaned_yaml = yaml.dump(clean_elements(snapshot_yaml))
                status_msg = {"role": "user", "content": f"Status {tool.name} with {tool.arguments}: Success"}
                result_msg = {"role": "user", "content": f"Result of {tool.name}: {cleaned_yaml}"}
                if is_tool_called_before:
                    # Drop only the previous (bulky) result message; the short
                    # status lines remain as a history of actions taken.
                    scrap_request.pop()
                else:
                    is_tool_called_before = True
                scrap_request.extend([status_msg, result_msg])
                continue
            if message.content:
                return message.content
            # No tool call and no content: fail fast instead of looping.
            raise RuntimeError("Model returned neither a tool call nor content")

    async def cleanup(self):
        """Clean up resources"""
        await self.exit_stack.aclose()
async def main(link):
    """Scrape one article link end-to-end, always releasing MCP resources."""
    client = WebScrapClient(llm_api_key=os.getenv("LLM_API_KEY"))
    try:
        await client.connect_to_server()
        article = await client.scrap_article(link)
    finally:
        await client.cleanup()
    return article
if __name__ == "__main__": | |
# print(asyncio.run(main("https://finance.yahoo.com/news/nvidia-corporation-nvda-top-tech-122957910.html"))) | |
print(asyncio.run(main("https://finance.yahoo.com/news/furiosaai-turns-down-metas-800m-230222314.html"))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment