Deprecate langgraph and switch to deepagent
@@ -1,118 +1,189 @@
import time
import asyncio
import uuid
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────

# Directory that contains this file (crawl4ai_batch.py)
TOOL_DIR = Path(__file__).resolve().parent
# Project root (assumes the tools folder sits alongside the main program)
PROJECT_ROOT = TOOL_DIR.parent

# Directory for crawled results. Keeping it under agent_workspace lets the
# deep agent read the saved files directly (DeepAgents' recommended layout).
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
# Alternative: store results under the project-level workspace instead
# SAVE_DIR = PROJECT_ROOT / "workspace" / "raw_data"

# Make sure the directory exists
SAVE_DIR.mkdir(parents=True, exist_ok=True)
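# For example (hypothetical layout), with the project checked out at /srv/app
# this resolves to /srv/app/agent_workspace/raw_data.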
# ─────────────────────────────────────
# Browser / crawler configuration
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
    # Optional: pass proxy="http://user:pass@ip:port" here if a proxy is needed
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,  # always fetch fresh content, never the cache
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────

def build_filename(url: str) -> str:
    """Build a collision-resistant markdown filename from a URL."""
    parsed = urlparse(url)

    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"

    ts = int(time.time())        # timestamp keeps files roughly sortable
    rand = uuid.uuid4().hex[:6]  # short random suffix avoids same-second collisions

    return f"{ts}_{rand}_{domain}_{path_part}.md"
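# For example (hypothetical values), crawling https://example.com/docs/intro at
# Unix time 1700000000 could yield a name like:
#   1700000000_1a2b3c_example_com_docs_intro.md
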
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────

async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }

            markdown = result.markdown or ""

            # Skip pages whose extracted markdown is too short to be useful
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }

        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }
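# A successful per-URL result looks like (hypothetical path):
# {"url": "https://example.com", "success": True,
#  "file": "<SAVE_DIR>/1700000000_1a2b3c_example_com_index.md"}
# Failures carry "success": False and an "error" string instead of "file".
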
# ─────────────────────────────────────
# Async batch logic
# ─────────────────────────────────────

async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # de-duplicate

    if not urls:
        return {"error": "no urls"}

    # print(f"🕷️ Crawling {len(urls)} URLs in parallel...")
    # print(f"Save directory: {SAVE_DIR}")
    sem = asyncio.Semaphore(5)  # concurrency limit

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]
        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }
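# Shape of the returned payload (hypothetical values):
# {
#     "saved_files": ["<SAVE_DIR>/1700000000_1a2b3c_example_com_index.md"],
#     "count": 1,
#     "summary": ["✅ https://example.com"],
# }
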
# ─────────────────────────────────────
# Tool (synchronous entry point)
# ─────────────────────────────────────

@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    High-performance crawler that processes multiple URLs in parallel; the
    extracted Markdown is written to the local raw_data directory.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl finished ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Details:",
        ]
        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nReadable files (available for follow-up reads):")
            output.extend(result["saved_files"])

        return "\n".join(output)

    except Exception as e:
        return f"🚨 Crawler error: {str(e)}"

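# ── Usage sketch ─────────────────────────────────────────────────────
# A minimal smoke test for running this module directly; the URL below is a
# placeholder. LangChain tools are invoked with a dict of their arguments.
if __name__ == "__main__":
    print(crawl4ai_batch.invoke({"urls": ["https://example.com"]}))

# Registering the tool with a deep agent; this assumes the `deepagents`
# package and its `create_deep_agent` helper, so adjust to the API the
# project actually uses.
#
# from deepagents import create_deep_agent
# agent = create_deep_agent(
#     tools=[crawl4ai_batch],
#     instructions="Crawl the given URLs and summarize the saved markdown files.",
# )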