"""Batch web-crawling tool built on crawl4ai.

Fetches a list of URLs concurrently, converts each page to markdown, and saves
the results under agent_workspace/raw_data so an agent can read them later.
"""

import time
import asyncio
import uuid

from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# Path configuration
# ─────────────────────────────────────

TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# Directory recommended by DeepAgents
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)
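
# Crawl-run defaults. CacheMode.BYPASS should make every call fetch the page
# fresh rather than reuse crawl4ai's local cache (worth verifying against the
# installed crawl4ai version).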
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)

# ─────────────────────────────────────
# URL → filename
# ─────────────────────────────────────

def build_filename(url: str) -> str:
    """Derive a unique, filesystem-safe markdown filename from a URL."""
    parsed = urlparse(url)

    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"

    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]

    return f"{ts}_{rand}_{domain}_{path_part}.md"

# ─────────────────────────────────────
# Single-URL crawl
# ─────────────────────────────────────

async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}"
                }

            markdown = result.markdown or ""

            # Skip pages that yield very little markdown; they are usually error,
            # consent, or placeholder pages rather than real content.
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short"
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath)
            }

        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e)
            }

# ─────────────────────────────────────
# Async main logic
# ─────────────────────────────────────

async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # deduplicate

    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # limit concurrent page loads

    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]

        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }
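
# Usage note: from code that is already async, the coroutine can be awaited
# directly, e.g. `results = await _crawl4ai_batch(["https://example.com"])`
# (illustrative URL), instead of going through the synchronous tool below.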

# ─────────────────────────────────────
# Tool (synchronous)
# ─────────────────────────────────────

@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """
    try:
        # asyncio.run() assumes no event loop is already running in this thread.
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### Batch crawl complete ###",
            f"Files saved: {result['count']}",
            f"Save directory: {SAVE_DIR}",
            "",
            "Crawl details:"
        ]

        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\nFiles available to read:")
            output.extend(result["saved_files"])

        return "\n".join(output)

    except Exception as e:
        return f"🚨 Crawler system error: {str(e)}"
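
# ─────────────────────────────────────
# Manual smoke test (optional)
# ─────────────────────────────────────
# A minimal sketch for exercising the tool outside an agent run; the URL below is
# a placeholder, not part of the original module. Tools created with @tool take
# their arguments as a dict via .invoke().
if __name__ == "__main__":
    demo_urls = ["https://example.com"]  # hypothetical test input
    print(crawl4ai_batch.invoke({"urls": demo_urls}))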