import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path
import uuid

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────
TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# Directory layout recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────
browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
    """Derive a unique, filesystem-safe markdown filename from a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]
    return f"{ts}_{rand}_{domain}_{path_part}.md"

# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler: AsyncWebCrawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)
            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }

            # Defensive cast: some crawl4ai versions return a
            # MarkdownGenerationResult object rather than a plain string.
            markdown = str(result.markdown or "")
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            # Provenance header: record the source URL and crawl time as
            # HTML comments, which stay invisible when the markdown renders.
            header = (
                f"<!-- source_url: {url} -->\n"
                f"<!-- crawled_at: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }
        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }

# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # deduplicate
    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # concurrency limit

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [crawl_one(crawler, url, sem) for url in urls]
        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []
    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }

# ─────────────────────────────────────
# Tool (synchronous wrapper)
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        # Note: asyncio.run() raises RuntimeError if called from inside an
        # already-running event loop; the except below surfaces that as a
        # tool error message.
        result = asyncio.run(_crawl4ai_batch(urls))
        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl complete ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Crawl details:",
        ]
        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nFiles available to read:")
            output.extend(result["saved_files"])

        return "\n".join(output)
    except Exception as e:
        return f"🚨 Crawler system error: {str(e)}"