Deprecate LangGraph, switch to DeepAgents

This commit is contained in:
zcr
2026-03-11 21:45:46 +08:00
parent c862121b48
commit 7042d428fa
44 changed files with 2847 additions and 619 deletions

@@ -1,118 +1,189 @@
import time
import asyncio
import uuid
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────
# Directory containing this file (crawl4ai_batch.py)
TOOL_DIR = Path(__file__).resolve().parent
# Project root (assumes the tools folder sits at the same level as the main program)
PROJECT_ROOT = TOOL_DIR.parent
# Recommended directory for DeepAgents, so the agent can read the saved files directly
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
# Ensure the directory exists
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────
browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.5993.118 Safari/537.36"
    ),
    proxy=None,  # optional: set to "http://user:pass@ip:port" if a proxy is needed
)

# Defined at module level so crawl_one() can reference it directly
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)
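
# Aside on cache_mode: CacheMode.BYPASS forces a fresh fetch on every run.
# crawl4ai's CacheMode enum also offers ENABLED, which reuses cached pages and
# can speed up repeated development runs. A sketch, to be verified against the
# installed crawl4ai version:
#
#   dev_run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)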

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
    parsed = urlparse(url)
    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]
    return f"{ts}_{rand}_{domain}_{path_part}.md"

# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)
            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }
            markdown = result.markdown or ""
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }
            filename = build_filename(url)
            filepath = SAVE_DIR / filename
            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)
            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }
        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }
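
# Each crawl_one call resolves to one of these dict shapes (illustrative):
#   {"url": ..., "success": True,  "file": "/abs/path/to/saved.md"}
#   {"url": ..., "success": False, "error": "content too short"}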

# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # deduplicate
    if not urls:
        return {"error": "no urls"}
    sem = asyncio.Semaphore(5)  # cap concurrent page loads at 5
    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [crawl_one(crawler, url, sem) for url in urls]
        results = await asyncio.gather(*tasks)
    success_files = []
    summary = []
    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")
    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }
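
# Return shape consumed by the sync wrapper below (illustrative values):
#   {"saved_files": [".../1731234567_a1b2c3_example_com_index.md"],
#    "count": 1,
#    "summary": ["✅ https://example.com"]}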

# ─────────────────────────────────────
# Tool (synchronous wrapper)
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        result = asyncio.run(_crawl4ai_batch(urls))
        if "error" in result:
            return f"❌ Error: {result['error']}"
        output = [
            "### Batch crawl complete ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Details:",
        ]
        output.extend(result["summary"])
        if result["saved_files"]:
            output.append("\nReadable files:")
            output.extend(result["saved_files"])
        return "\n".join(output)
    except Exception as e:
        return f"🚨 Crawler error: {str(e)}"