import time
import asyncio
from typing import List
from urllib.parse import urlparse
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────── Important: path computation ───────────────
# Directory where this file (crawl4ai_batch.py) lives
TOOL_DIR = Path(__file__).resolve().parent

# Project root (assumes the tools folder sits alongside the main program)
PROJECT_ROOT = TOOL_DIR.parent

# Directory for storing crawl results (put it wherever you prefer)
# Suggested option A: workspace/raw_data under the project root
SAVE_DIR = PROJECT_ROOT / "workspace" / "raw_data"

# Suggested option B: if the deep agent will read the files directly, put them under agent_workspace
# SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"

# Make sure the directory exists
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────


@tool
async def crawl4ai_batch(urls: List[str]) -> str:
    """
    High-performance web crawler that fetches multiple URLs in parallel.
    The crawled Markdown content is saved to the local workspace/raw_data directory.
    Returns a summary of the run plus the list of saved file paths.
    """
    if not urls:
        return "❌ Error: no URLs provided."

    # print(f"🕷️ Crawling {len(urls)} URLs in parallel...")
    # print(f"Save directory: {SAVE_DIR}")

    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        java_script_enabled=True,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.5993.118 Safari/537.36",
        proxy=None,  # optional; to route through a proxy, set "http://user:pass@ip:port"
    )

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=5,
        excluded_tags=["script", "style", "nav", "footer"],
        remove_overlay_elements=True,
        process_iframes=True,
    )

    results_summary = []
    saved_files = []

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            tasks = [crawler.arun(url=url, config=run_config) for url in urls]
            crawl_results = await asyncio.gather(*tasks, return_exceptions=True)

            for i, result in enumerate(crawl_results):
                url = urls[i]

                if isinstance(result, Exception):
                    results_summary.append(f"❌ Fetch failed {url}: {result}")
                    continue

                if result.success:
                    markdown_content = result.markdown or ""

                    if len(markdown_content) < 500:
                        results_summary.append(f"⏩ Skipped {url} (content too short)")
                        continue

                    # Build the filename; the batch index i keeps two URLs that map to
                    # the same domain/path (e.g. differing only by query string) from
                    # overwriting each other within the same second
                    parsed = urlparse(url)
                    domain = parsed.netloc.replace("www.", "").replace(".", "_")
                    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
                    filename = f"{int(time.time())}_{i}_{domain}_{path_part}.md"

                    # Full file path
                    filepath = SAVE_DIR / filename

                    # Write the file with a provenance header
                    with open(filepath, "w", encoding="utf-8") as f:
                        header = f"<!-- Source: {url} -->\n<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
                        f.write(header + markdown_content)

                    saved_files.append(str(filepath))  # store as str rather than Path
                    results_summary.append(f"✅ Success: {url} → {filepath}")

                else:
                    status = getattr(result, "status_code", "unknown error")
                    results_summary.append(f"❌ Failed: {url} (status code: {status})")

    except Exception as e:
        return f"🚨 Crawler crashed: {e}"

    # Result returned to the agent
    final_output = (
        f"### Batch crawl complete ###\n"
        f"Successfully saved {len(saved_files)} files.\n"
        f"Save directory: {SAVE_DIR}\n"
        f"Details:\n" + "\n".join(results_summary)
    )

    if saved_files:
        final_output += "\n\nSaved files (available for follow-up reading):\n" + "\n".join(saved_files)

    return final_output
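

# ─────────────── Optional local smoke test ───────────────
# A minimal sketch for manual testing, not part of the tool itself; the URL
# below is a placeholder assumption. LangChain tools created with @tool are
# invoked via .ainvoke() with a dict of arguments matching the signature.
if __name__ == "__main__":
    demo_urls = ["https://example.com"]  # placeholder, replace with real targets
    summary = asyncio.run(crawl4ai_batch.ainvoke({"urls": demo_urls}))
    print(summary)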