FiDA_Python/src/server/agent/tools/crawl_tool.py
import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool
# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────
TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent
# Directory recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────
browser_config = BrowserConfig(
headless=True,
verbose=False,
java_script_enabled=True,
user_agent=(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/118.0 Safari/537.36"
),
)
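# Per-request crawl settings: bypass the cache, ignore very short text blocks,
# drop script/style/nav/footer tags, remove overlay elements, and process
# iframe content into the extracted markdown.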
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=5,
excluded_tags=["script", "style", "nav", "footer"],
remove_overlay_elements=True,
process_iframes=True,
)
# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
parsed = urlparse(url)
domain = parsed.netloc.replace("www.", "").replace(".", "_")
path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
ts = int(time.time())
rand = uuid.uuid4().hex[:6]
return f"{ts}_{rand}_{domain}_{path_part}.md"
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
async with sem:
try:
result = await crawler.arun(url=url, config=run_config)
if not result.success:
return {
"url": url,
"success": False,
"error": f"status={getattr(result, 'status_code', 'unknown')}"
}
markdown = result.markdown or ""
if len(markdown) < 500:
return {
"url": url,
"success": False,
"error": "content too short"
}
filename = build_filename(url)
filepath = SAVE_DIR / filename
header = (
f"<!-- Source: {url} -->\n"
f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
)
with open(filepath, "w", encoding="utf-8") as f:
f.write(header + markdown)
return {
"url": url,
"success": True,
"file": str(filepath)
}
except Exception as e:
return {
"url": url,
"success": False,
"error": str(e)
}
# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # deduplicate
if not urls:
return {"error": "no urls"}
    sem = asyncio.Semaphore(5)  # limit concurrent page fetches
async with AsyncWebCrawler(config=browser_config) as crawler:
tasks = [
crawl_one(crawler, url, sem)
for url in urls
]
results = await asyncio.gather(*tasks)
success_files = []
summary = []
for r in results:
if r["success"]:
success_files.append(r["file"])
summary.append(f"{r['url']}")
else:
summary.append(f"{r['url']} ({r['error']})")
return {
"saved_files": success_files,
"count": len(success_files),
"summary": summary,
}
# ─────────────────────────────────────
# Synchronous tool wrapper
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
"""
Batch crawl webpages and save their content as markdown files.
Args:
urls: List of webpage URLs to crawl.
Returns:
A summary of crawling results and saved file paths.
"""
try:
result = asyncio.run(_crawl4ai_batch(urls))
if "error" in result:
return f"❌ Error: {result['error']}"
output = [
"### 批量抓取完成 ###",
f"成功保存文件: {result['count']}",
f"保存目录: {SAVE_DIR}",
"",
"抓取详情:"
]
output.extend(result["summary"])
if result["saved_files"]:
output.append("\n可读取文件:")
output.extend(result["saved_files"])
return "\n".join(output)
except Exception as e:
return f"🚨 爬虫系统异常: {str(e)}"