FiDA_Python/src/server/agent/tools/crawl_tool.py
import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool
# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────
TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent
# Directory recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────
browser_config = BrowserConfig(
headless=True,
verbose=False,
java_script_enabled=True,
user_agent=(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/118.0 Safari/537.36"
),
)
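# Per-request crawl settings: bypass the cache, ignore very short text blocks,
# drop script/style/nav/footer tags, remove overlay elements, and process
# iframe content into the extracted markdown.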
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=5,
excluded_tags=["script", "style", "nav", "footer"],
remove_overlay_elements=True,
process_iframes=True,
)
# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
parsed = urlparse(url)
domain = parsed.netloc.replace("www.", "").replace(".", "_")
path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
ts = int(time.time())
rand = uuid.uuid4().hex[:6]
return f"{ts}_{rand}_{domain}_{path_part}.md"
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
async with sem:
try:
result = await crawler.arun(url=url, config=run_config)
if not result.success:
return {
"url": url,
"success": False,
"error": f"status={getattr(result, 'status_code', 'unknown')}"
}
markdown = result.markdown or ""
if len(markdown) < 500:
return {
"url": url,
"success": False,
"error": "content too short"
}
filename = build_filename(url)
filepath = SAVE_DIR / filename
header = (
f"<!-- Source: {url} -->\n"
f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
)
with open(filepath, "w", encoding="utf-8") as f:
f.write(header + markdown)
return {
"url": url,
"success": True,
"file": str(filepath)
}
except Exception as e:
return {
"url": url,
"success": False,
"error": str(e)
}
# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # deduplicate
if not urls:
return {"error": "no urls"}
    sem = asyncio.Semaphore(5)  # limit concurrent page fetches
async with AsyncWebCrawler(config=browser_config) as crawler:
tasks = [
crawl_one(crawler, url, sem)
for url in urls
]
results = await asyncio.gather(*tasks)
success_files = []
summary = []
for r in results:
if r["success"]:
success_files.append(r["file"])
summary.append(f"{r['url']}")
else:
summary.append(f"{r['url']} ({r['error']})")
return {
"saved_files": success_files,
"count": len(success_files),
"summary": summary,
}
# ─────────────────────────────────────
# Synchronous tool wrapper
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
"""
Batch crawl webpages and save their content as markdown files.
Args:
urls: List of webpage URLs to crawl.
Returns:
A summary of crawling results and saved file paths.
"""
try:
result = asyncio.run(_crawl4ai_batch(urls))
if "error" in result:
return f"❌ Error: {result['error']}"
output = [
"### 批量抓取完成 ###",
f"成功保存文件: {result['count']}",
f"保存目录: {SAVE_DIR}",
"",
"抓取详情:"
]
output.extend(result["summary"])
if result["saved_files"]:
output.append("\n可读取文件:")
output.extend(result["saved_files"])
return "\n".join(output)
except Exception as e:
return f"🚨 爬虫系统异常: {str(e)}"