Deprecate langgraph and switch to deepagent
@@ -1,118 +1,189 @@
import time
import asyncio
import uuid
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────

# Directory that contains this file (crawl4ai_batch.py)
TOOL_DIR = Path(__file__).resolve().parent
# Project root (assumes the tools folder sits alongside the main program)
PROJECT_ROOT = TOOL_DIR.parent

# Directory for crawled results. Keeping it under agent_workspace lets the
# deep agent read the saved files directly (DeepAgents' recommended layout).
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
# Alternative: store results under the project-level workspace instead
# SAVE_DIR = PROJECT_ROOT / "workspace" / "raw_data"

# Make sure the directory exists
SAVE_DIR.mkdir(parents=True, exist_ok=True)
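# For example (hypothetical layout), with the project checked out at /srv/app
# this resolves to /srv/app/agent_workspace/raw_data.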
# ─────────────────────────────────────
# Browser / crawler configuration
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
    # Optional: pass proxy="http://user:pass@ip:port" here if a proxy is needed
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,  # always fetch fresh content, never the cache
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────

def build_filename(url: str) -> str:
    """Build a collision-resistant markdown filename from a URL."""
    parsed = urlparse(url)

    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"

    ts = int(time.time())        # timestamp keeps files roughly sortable
    rand = uuid.uuid4().hex[:6]  # short random suffix avoids same-second collisions

    return f"{ts}_{rand}_{domain}_{path_part}.md"
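# For example (hypothetical values), crawling https://example.com/docs/intro at
# Unix time 1700000000 could yield a name like:
#   1700000000_1a2b3c_example_com_docs_intro.md
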
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────

async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }

            markdown = result.markdown or ""

            # Skip pages whose extracted markdown is too short to be useful
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }

        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }
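# A successful per-URL result looks like (hypothetical path):
# {"url": "https://example.com", "success": True,
#  "file": "<SAVE_DIR>/1700000000_1a2b3c_example_com_index.md"}
# Failures carry "success": False and an "error" string instead of "file".
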
# ─────────────────────────────────────
# Async batch logic
# ─────────────────────────────────────

async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # de-duplicate

    if not urls:
        return {"error": "no urls"}

    # print(f"🕷️ Crawling {len(urls)} URLs in parallel...")
    # print(f"Save directory: {SAVE_DIR}")
    sem = asyncio.Semaphore(5)  # concurrency limit

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]
        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }
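# Shape of the returned payload (hypothetical values):
# {
#     "saved_files": ["<SAVE_DIR>/1700000000_1a2b3c_example_com_index.md"],
#     "count": 1,
#     "summary": ["✅ https://example.com"],
# }
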
# ─────────────────────────────────────
# Tool (synchronous entry point)
# ─────────────────────────────────────

@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    High-performance crawler that processes multiple URLs in parallel; the
    extracted Markdown is written to the local raw_data directory.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl finished ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Details:",
        ]
        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nReadable files (available for follow-up reads):")
            output.extend(result["saved_files"])

        return "\n".join(output)

    except Exception as e:
        return f"🚨 Crawler error: {str(e)}"

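# ── Usage sketch ─────────────────────────────────────────────────────
# A minimal smoke test for running this module directly; the URL below is a
# placeholder. LangChain tools are invoked with a dict of their arguments.
if __name__ == "__main__":
    print(crawl4ai_batch.invoke({"urls": ["https://example.com"]}))

# Registering the tool with a deep agent; this assumes the `deepagents`
# package and its `create_deep_agent` helper, so adjust to the API the
# project actually uses.
#
# from deepagents import create_deep_agent
# agent = create_deep_agent(
#     tools=[crawl4ai_batch],
#     instructions="Crawl the given URLs and summarize the saved markdown files.",
# )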