# FiDA_Python/src/server/agent/tools/crawl_tool.py

import time
import asyncio
import uuid
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────

TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# Output directory recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

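# Note: PROJECT_ROOT here is the parent of this tools/ directory (i.e. .../agent),
# so with the repository layout above the markdown files land under
# src/server/agent/agent_workspace/raw_data.
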
# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,        # always fetch fresh content, never serve from cache
    word_count_threshold=5,             # drop text blocks with fewer than 5 words
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,       # strip modal/popup overlays
    process_iframes=True,               # include iframe content in extraction
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────
def build_filename(url: str) -> str:
    """Build a unique, filesystem-safe markdown filename for a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]
    return f"{ts}_{rand}_{domain}_{path_part}.md"


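# Illustrative example (not executed): build_filename("https://www.example.com/news/ai-update")
# would produce something like "1730000000_3fa2b1_example_com_news_ai-update.md";
# the epoch timestamp and random hex suffix vary per call.

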
# ─────────────────────────────────────
# Crawl a single URL
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)
            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}",
                }
            markdown = result.markdown or ""
            # Pages with very little extracted text are treated as failures.
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short",
                }
            filename = build_filename(url)
            filepath = SAVE_DIR / filename
            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)
            return {
                "url": url,
                "success": True,
                "file": str(filepath),
            }
        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e),
            }


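# Each crawl_one call resolves to a small status dict, e.g.
#   {"url": "...", "success": True, "file": ".../agent_workspace/raw_data/<name>.md"}
# or, on failure,
#   {"url": "...", "success": False, "error": "status=404"}

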
# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # de-duplicate

    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # concurrency limit

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]
        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(r["url"])
        else:
            summary.append(f"{r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }


# ─────────────────────────────────────
# Synchronous tool wrapper
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.
    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl finished ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Crawl details:",
        ]
        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nFiles available to read:")
            output.extend(result["saved_files"])

        return "\n".join(output)
    except Exception as e:
        return f"🚨 Crawler system error: {e}"
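

# ─────────────────────────────────────
# Usage sketch (illustrative, not part of the original module): how the tool
# might be exercised directly. LangChain tools created with @tool are invoked
# with a dict of arguments; the example URL below is an assumption.
# ─────────────────────────────────────
if __name__ == "__main__":
    print(crawl4ai_batch.invoke({"urls": ["https://example.com"]}))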