# FiDA_Python/src/server/agent/tools/crawl_tool.py
# (Restored from a web-UI copy: the pasted "Files / Raw / History" listing and
#  timestamp header were not Python and have been replaced by this comment.)
import time
import asyncio
from typing import List
from urllib.parse import urlparse
from pathlib import Path
from langchain_core.tools import tool
# ─────────────── Important: path computation ───────────────
# Directory containing this file (crawl_tool.py).
TOOL_DIR = Path(__file__).resolve().parent
# Parent of the tools directory. NOTE(review): the original comment calls this
# the "project root", but given this file lives at src/server/agent/tools/ it
# is actually the `agent` directory — confirm the intended layout.
PROJECT_ROOT = TOOL_DIR.parent
# Where crawled Markdown output is written (choose one of the options below).
# Option A (active): workspace/raw_data under PROJECT_ROOT.
SAVE_DIR = PROJECT_ROOT / "workspace" / "raw_data"
# Option B: put it under agent_workspace if the deep agent should read the
# files directly.
# SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
# Create the output directory at import time so writes never fail on a
# missing directory.
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────
@tool
async def crawl4ai_batch(urls: List[str]) -> str:
    """High-performance web crawler: fetches multiple URLs in parallel.

    Each successfully crawled page is converted to Markdown and written to
    ``SAVE_DIR`` (workspace/raw_data). Pages whose extracted Markdown is
    shorter than 500 characters are skipped as low-value.

    Args:
        urls: Absolute URLs to crawl.

    Returns:
        A human-readable summary: one status line per URL plus the list of
        saved file paths for downstream tools/agents to read. All failures
        are reported in the summary; this function does not raise.
    """
    if not urls:
        return "❌ 错误: 未提供任何 URL。"

    # Imported lazily so this module can be imported without crawl4ai installed.
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        java_script_enabled=True,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/118.0.5993.118 Safari/537.36",
        proxy=None,  # Optional: "http://user:pass@ip:port" if a proxy is needed.
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,       # Always refetch; never serve from cache.
        word_count_threshold=5,
        excluded_tags=["script", "style", "nav", "footer"],
        remove_overlay_elements=True,
        process_iframes=True,
    )

    results_summary: List[str] = []
    saved_files: List[str] = []
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            tasks = [crawler.arun(url=url, config=run_config) for url in urls]
            # return_exceptions=True: one failed URL must not abort the batch.
            crawl_results = await asyncio.gather(*tasks, return_exceptions=True)

            for i, result in enumerate(crawl_results):
                url = urls[i]
                if isinstance(result, Exception):
                    results_summary.append(f"❌ 抓取失败 {url}: {str(result)}")
                    continue
                if result.success:
                    markdown_content = result.markdown or ""
                    # Heuristic: very short extractions are usually error pages
                    # or cookie walls — skip rather than save noise.
                    if len(markdown_content) < 500:
                        results_summary.append(f"⏩ 跳过 {url} (内容过短)")
                        continue
                    # Build a filesystem-safe filename from the URL.
                    parsed = urlparse(url)
                    domain = parsed.netloc.replace("www.", "").replace(".", "_")
                    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
                    # FIX: include the batch index `i` so two URLs with the same
                    # domain/truncated path crawled in the same second cannot
                    # silently overwrite each other.
                    filename = f"{int(time.time())}_{i}_{domain}_{path_part}.md"
                    filepath = SAVE_DIR / filename
                    with open(filepath, "w", encoding="utf-8") as f:
                        header = (
                            f"<!-- Source: {url} -->\n"
                            f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
                        )
                        f.write(header + markdown_content)
                    saved_files.append(str(filepath))  # str() for the text summary
                    # FIX: URL and path were previously concatenated with no
                    # separator, producing an unreadable status line.
                    results_summary.append(f"✅ 成功: {url} -> {filepath}")
                else:
                    status = getattr(result, 'status_code', '未知错误')
                    results_summary.append(f"❌ 失败: {url} (状态码: {status})")
    except Exception as e:
        # Top-level boundary: report the crash to the agent instead of raising,
        # since tool exceptions would otherwise surface as opaque errors.
        return f"🚨 爬虫系统崩溃: {str(e)}"

    # Result text returned to the agent.
    final_output = (
        f"### 批量抓取完成 ###\n"
        f"已成功保存 {len(saved_files)} 个文件。\n"
        f"儲存目錄: {SAVE_DIR}\n"
        f"详情:\n" + "\n".join(results_summary)
    )
    if saved_files:
        final_output += "\n\n已保存的文件列表(可供後續讀取):\n" + "\n".join(saved_files)
    return final_output