# FiDA_Python/src/server/agent/tools/crawl_tool.py

import time
import asyncio
import uuid
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────

TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# Output directory recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

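# Note: PROJECT_ROOT here is the parent of this tools/ directory (i.e. .../agent),
# so with the repository layout above the markdown files land under
# src/server/agent/agent_workspace/raw_data.
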
# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,        # always fetch fresh content, never serve from cache
    word_count_threshold=5,             # drop text blocks with fewer than 5 words
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,       # strip modal/popup overlays
    process_iframes=True,               # include iframe content in extraction
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
    """Build a unique, filesystem-safe markdown filename for a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]
    return f"{ts}_{rand}_{domain}_{path_part}.md"


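# Illustrative example (not executed): build_filename("https://www.example.com/news/ai-update")
# would produce something like "1730000000_3fa2b1_example_com_news_ai-update.md";
# the epoch timestamp and random hex suffix vary per call.

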
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)
            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }
            markdown = result.markdown or ""
            # Pages with very little extracted text are treated as failures.
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }
            filename = build_filename(url)
            filepath = SAVE_DIR / filename
            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)
            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }
        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }


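# Each crawl_one call resolves to a small status dict, e.g.
#   {"url": "...", "success": True, "file": ".../agent_workspace/raw_data/<name>.md"}
# or, on failure,
#   {"url": "...", "success": False, "error": "status=404"}

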
# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # de-duplicate

    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # concurrency limit

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]
        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(r["url"])
        else:
            summary.append(f"{r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }


# ─────────────────────────────────────
# Synchronous tool wrapper
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.
    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl finished ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Crawl details:",
        ]
        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nFiles available to read:")
            output.extend(result["saved_files"])

        return "\n".join(output)
    except Exception as e:
        return f"🚨 Crawler system error: {e}"
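

# ─────────────────────────────────────
# Usage sketch (illustrative, not part of the original module): how the tool
# might be exercised directly. LangChain tools created with @tool are invoked
# with a dict of arguments; the example URL below is an assumption.
# ─────────────────────────────────────
if __name__ == "__main__":
    print(crawl4ai_batch.invoke({"urls": ["https://example.com"]}))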