import time
import asyncio
from typing import List
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────── Important: path computation ───────────────
# Directory containing this file (crawl4ai_batch.py)
TOOL_DIR = Path(__file__).resolve().parent

# Project root (assumes the tools folder sits alongside the main program)
PROJECT_ROOT = TOOL_DIR.parent

# Directory where crawl results are saved (place it wherever you prefer)
# Suggested option A: workspace/raw_data under the project root
SAVE_DIR = PROJECT_ROOT / "workspace" / "raw_data"

# Suggested option B: if the deep agent should read the files directly,
# put them under agent_workspace instead
# SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"

# Make sure the directory exists
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────


@tool
async def crawl4ai_batch(urls: List[str]) -> str:
    """
    High-performance web crawler that processes multiple URLs in parallel.
    Crawled Markdown content is saved to the local workspace/raw_data directory.
    Returns an execution summary and the list of saved file paths.
    """
    if not urls:
        return "❌ Error: no URLs provided."

    # print(f"🕷️ Crawling {len(urls)} URLs in parallel...")
    # print(f"Save directory: {SAVE_DIR}")

    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        java_script_enabled=True,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/118.0.5993.118 Safari/537.36",
        proxy=None,  # optional; set to "http://user:pass@ip:port" if a proxy is needed
    )

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=5,
        excluded_tags=["script", "style", "nav", "footer"],
        remove_overlay_elements=True,
        process_iframes=True,
    )

    results_summary = []
    saved_files = []

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            tasks = [crawler.arun(url=url, config=run_config) for url in urls]
            crawl_results = await asyncio.gather(*tasks, return_exceptions=True)

            for i, result in enumerate(crawl_results):
                url = urls[i]

                if isinstance(result, Exception):
                    results_summary.append(f"❌ Fetch failed {url}: {result}")
                    continue

                if result.success:
                    # str() in case the markdown field is returned as an object rather than a plain string
                    markdown_content = str(result.markdown or "")

                    if len(markdown_content) < 500:
                        results_summary.append(f"⏩ Skipped {url} (content too short)")
                        continue

                    # Build the file name
                    parsed = urlparse(url)
                    domain = parsed.netloc.replace("www.", "").replace(".", "_")
                    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"
                    filename = f"{int(time.time())}_{domain}_{path_part}.md"

                    # Full file path
                    filepath = SAVE_DIR / filename

                    # Write the file, prefixed with a small provenance header
                    # (source URL and crawl time chosen here as reasonable default fields)
                    with open(filepath, "w", encoding="utf-8") as f:
                        header = (
                            f"<!-- Source: {url} -->\n"
                            f"<!-- Crawled: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
                        )
                        f.write(header + markdown_content)

                    saved_files.append(str(filepath))  # store as a string for the summary
                    results_summary.append(f"✅ Success: {url} → {filepath}")
                else:
                    status = getattr(result, "status_code", "unknown error")
                    results_summary.append(f"❌ Failed: {url} (status code: {status})")

    except Exception as e:
        return f"🚨 Crawler system crashed: {e}"

    # Result returned to the agent
    final_output = (
        f"### Batch crawl finished ###\n"
        f"Successfully saved {len(saved_files)} files.\n"
        f"Save directory: {SAVE_DIR}\n"
        f"Details:\n" + "\n".join(results_summary)
    )

    if saved_files:
        final_output += "\n\nSaved file list (available for later reading):\n" + "\n".join(saved_files)

    return final_output
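

# Minimal local usage sketch (not part of the tool itself): shows how the
# LangChain tool can be invoked directly through its async interface.
# The example URLs below are placeholders, not endpoints used by this project.
if __name__ == "__main__":
    async def _demo() -> None:
        # `ainvoke` takes a dict keyed by the tool's argument names.
        summary = await crawl4ai_batch.ainvoke(
            {"urls": ["https://example.com", "https://example.org"]}
        )
        print(summary)

    asyncio.run(_demo())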