Deprecate langgraph and switch to deepagent
src/server/deep_agent/tools/conversation_title_tool.py  (new file, 27 lines)
@@ -0,0 +1,27 @@
from langchain_core.prompts import PromptTemplate

from src.server.deep_agent.agents.init_llm import title_llm


def conversation_title(full_conversation):
    title_prompt = PromptTemplate(
        input_variables=["full_conversation"],
        template="""
请严格按照以下要求生成对话标题:
1. 标题长度:8-15个字,纯中文,无标点、无特殊符号、无换行
2. 标题内容:基于完整对话,精准概括核心主题(兼顾用户需求和助手回复)
3. 标题风格:自然口语化,符合中文表达习惯,不冗余

完整对话内容:
{full_conversation}

仅输出标题,不要输出任何额外解释、说明或标点符号。
"""
    )
    title_chain = title_prompt | title_llm
    response = title_chain.invoke({"full_conversation": full_conversation})
    return response


if __name__ == '__main__':
    print(conversation_title("你好"))
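Since title_llm comes from init_llm and is presumably a LangChain chat model, title_chain.invoke() returns a message object rather than a plain string. A minimal caller-side sketch, assuming exactly that; the trim_title helper is hypothetical and not part of this commit:

def trim_title(full_conversation: str) -> str:
    # invoke the prompt | llm chain and unwrap the text content
    response = conversation_title(full_conversation)
    # chat models return an AIMessage; fall back to str() for plain string outputs
    return getattr(response, "content", str(response)).strip()

print(trim_title("用户: 我想买一张北欧风的沙发\n助手: 可以考虑浅色布艺……"))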
src/server/deep_agent/tools/crawl_tool.py  (new file, 191 lines)
@@ -0,0 +1,191 @@
import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# 路径配置
# ─────────────────────────────────────

TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# DeepAgents 推荐目录
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

print(f"tool save : {str(PROJECT_ROOT / 'agent_workspace')}")

# ─────────────────────────────────────
# Browser 配置
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)


# ─────────────────────────────────────
# URL → 文件名
# ─────────────────────────────────────

def build_filename(url: str) -> str:
    parsed = urlparse(url)

    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"

    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]

    return f"{ts}_{rand}_{domain}_{path_part}.md"


# ─────────────────────────────────────
# 单个 URL 抓取
# ─────────────────────────────────────

async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}"
                }

            markdown = result.markdown or ""

            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short"
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath)
            }

        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e)
            }


# ─────────────────────────────────────
# Async 主逻辑
# ─────────────────────────────────────

async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # 去重

    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # 并发限制

    async with AsyncWebCrawler(config=browser_config) as crawler:

        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]

        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:

        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }


# ─────────────────────────────────────
# Tool(同步)
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """

    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### 批量抓取完成 ###",
            f"成功保存文件: {result['count']}",
            f"保存目录: {SAVE_DIR}",
            "",
            "抓取详情:"
        ]

        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\n可读取文件:")
            output.extend(result["saved_files"])

        return "\n".join(output)

    except Exception as e:
        return f"🚨 爬虫系统异常: {str(e)}"
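A minimal sketch of calling the tool outside the agent, assuming crawl4ai and its Playwright browser are installed; the URLs are placeholders. Since the decorated function wraps the async crawl in asyncio.run, it must be called from synchronous code (not from inside an event loop):

if __name__ == "__main__":
    # @tool wraps the function in a StructuredTool, so call it via .invoke()
    report = crawl4ai_batch.invoke({"urls": [
        "https://example.com/sofa-trends-2026",
        "https://example.com/interior-design-report",
    ]})
    print(report)  # human-readable summary; markdown files land in SAVE_DIR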
src/server/deep_agent/tools/generate_furniture_sketch.py  (new file, 94 lines)
@@ -0,0 +1,94 @@
import json
import logging
import uuid
from google.oauth2 import service_account
from langchain_core.tools import tool
from google import genai
from google.genai.types import GenerateContentConfig, Modality

from minio import Minio

from src.core.config import settings
from src.server.utils.new_oss_client import oss_upload_image

logger = logging.getLogger(__name__)
# 初始化全局凭证和客户端
creds = service_account.Credentials.from_service_account_file(
    settings.GOOGLE_GENAI_USE_VERTEXAI,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

minio_client = Minio(settings.MINIO_URL, access_key=settings.MINIO_ACCESS, secret_key=settings.MINIO_SECRET, secure=settings.MINIO_SECURE)
client = genai.Client(
    credentials=creds,
    project=settings.GOOGLE_CLOUD_PROJECT,
    location=settings.GOOGLE_CLOUD_LOCATION,
    vertexai=True
)


@tool
async def generate_furniture(prompt: str) -> str:
    """
    使用 Gemini 图像生成模型根据详细的英文提示词生成家具设计草图。
    """
    print("\n[系统日志] 正在调用 Nano Banana (Gemini Image Gen) ...")

    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=(f"Generate a professional furniture design sketch: {prompt}"),
            config=GenerateContentConfig(
                response_modalities=[Modality.TEXT, Modality.IMAGE],
            ),
        )

        image_bytes = None
        for part in response.candidates[0].content.parts:
            if part.inline_data:
                image_bytes = part.inline_data.data
                break

        if not image_bytes:
            return "未能生成图像数据。"
        object_name = f"furniture/sketches/{uuid.uuid4()}.png"
        bucket = "fida-test"  # 替换为你的 bucket 名称
        # 3. 调用你的上传函数
        upload_res = oss_upload_image(
            oss_client=minio_client,
            bucket=bucket,
            object_name=object_name,
            image_bytes=image_bytes
        )

        if upload_res:
            # 4. 构造访问链接 (如果是私有 bucket,需使用 presigned_get_object)
            # 这里简单示例为直接访问地址
            image_url = f"{bucket}/{object_name}"
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": image_url,
                    "tool_status": "success"
                },
                ensure_ascii=False
            )
        else:
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": "图片生成成功,但上传至存储服务器失败。",
                    "tool_status": "error"
                },
                ensure_ascii=False
            )
    except Exception as e:
        logger.warning(e)
        return json.dumps(
            {
                "tool_name": "generate_furniture",
                "data": "绘图流程异常",
                "tool_status": "error"
            },
            ensure_ascii=False
        )
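If the bucket is private, the direct "{bucket}/{object_name}" path returned above will not be fetchable by a browser; the in-code comment already points at presigned_get_object for that case. A sketch using the MinIO client initialised in this module (the 24-hour expiry is an arbitrary choice, and build_image_url is a hypothetical helper):

from datetime import timedelta

def build_image_url(bucket: str, object_name: str) -> str:
    # generate a time-limited download link for an object in a private bucket
    return minio_client.presigned_get_object(
        bucket, object_name, expires=timedelta(hours=24)
    )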
src/server/deep_agent/tools/report_generator_tool.py  (new file, 151 lines)
@@ -0,0 +1,151 @@
import os
import json
import re
from typing import Optional, List, Dict
from langchain_qwq import ChatQwen
from langgraph.config import get_stream_writer
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage

from src.core.config import settings

# =========================
# LLM 初始化
# =========================

llm = ChatQwen(
    enable_thinking=False,
    model="qwen3.5-flash",
    temperature=0.2,
    max_tokens=3_000,
    timeout=None,
    max_retries=2,
    api_key=settings.QWEN_API_KEY)


# =========================
# Tool 输入 Schema
# =========================

class ReportInput(BaseModel):
    report_topic: str = Field(
        ...,
        description="Main topic of the report, e.g. '2026 Sofa Design Trends'"
    )
    structured_data: List[Dict] = Field(
        ...,
        description="Structured retrieval result items"
    )
    language: Optional[str] = Field(
        default="English",
        description="Output language"
    )


# =========================
# LangGraph Tool
# =========================

@tool("report_generator", args_schema=ReportInput)
async def report_generator(
    report_topic: str,
    structured_data: List[Dict],
    language: str = "English"
) -> dict:
    """
    Generate a professional design/market report
    directly from structured retrieval results.
    """

    writer = get_stream_writer()
    if not structured_data:
        error_msg = "Error: No structured data provided."
        writer({"type": "report_error", "message": error_msg})
        return error_msg

    collected_data_str = json.dumps(
        structured_data,
        ensure_ascii=False,
        indent=2
    )

    # =========================
    # Prompt
    # =========================

    system_prompt = f"""
You are a professional design trend analyst.

Generate a long, structured Markdown report.

REQUIREMENTS:

1. Follow MECE principle.
2. Embed images ONLY if they start with https://
   using: ![](url)
3. Insert images inline.
4. Every key insight must cite source:
   [Website Name](url)
5. Use Markdown headings.
6. Start directly with title.
7. Be detailed and analytical.

Output Language: {language}
"""

    user_prompt = f"""
Topic: {report_topic}

Input Data:
{collected_data_str}
"""

    # =========================
    # 调用 LLM
    # =========================
    writer({"type": "report_start", "topic": report_topic, "language": language})

    full_report = ""
    try:
        report_llm = llm.with_config(
            callbacks=[]
        )
        async for chunk in report_llm.astream(
            [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
        ):
            if chunk.content:  # 流式返回的增量文本
                delta = chunk.content
                full_report += delta
                # return {"type": "report_delta", "delta": delta}
                writer({"type": "report_delta", "delta": delta})  # ← 实时推送给前端
        writer({"type": "report_stop", "topic": report_topic, "language": language})
    except Exception as e:
        error_msg = f"LLM generation failed: {str(e)}"
        writer({"type": "report_error", "message": error_msg})
        return error_msg

    report_content = full_report.strip()

    # =========================
    # 保存报告
    # =========================
    output_dir = "workspace/reports"
    os.makedirs(output_dir, exist_ok=True)

    safe_topic = re.sub(r'[\\/*?:"<>|]', "", report_topic.replace(" ", "_"))
    filename = f"{output_dir}/{safe_topic}.md"

    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(report_content)
        writer({"type": "report_complete", "file_path": filename})
    except Exception as e:
        writer({"type": "report_save_warning", "message": str(e)})

    # 返回完整内容(作为 tool result),同时正文已通过 delta 流式输出
    return report_content + f"\n\n✅ Report saved to: {filename}"
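The tool expects structured_data in the shape produced by structured_retrieval (text / images / source_url / file_name). A minimal invocation sketch follows; the sample item is illustrative only, and note that get_stream_writer() only yields a working writer when the tool runs inside a LangGraph execution with custom streaming enabled, which is why the run call is left commented out:

import asyncio

sample_items = [
    {
        "text": "Curved modular sofas keep gaining share in 2026 showrooms...",
        "images": ["https://example.com/sofa.png"],
        "source_url": "https://example.com/sofa-trends-2026",
        "file_name": "1700000000_ab12cd_example_com_sofa-trends-2026.md",
    }
]

async def demo():
    # async tools are called with .ainvoke(); args must match ReportInput
    report = await report_generator.ainvoke({
        "report_topic": "2026 Sofa Design Trends",
        "structured_data": sample_items,
        "language": "English",
    })
    print(report[:500])

# asyncio.run(demo())  # in practice, run inside a LangGraph streaming context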
src/server/deep_agent/tools/research_tool.py  (new file, 67 lines)
@@ -0,0 +1,67 @@
import asyncio
import json
from datetime import datetime
from typing import List, Set, Optional
from langchain_core.tools import tool
from tavily import TavilyClient

from src.core.config import settings

# 模拟配置加载
TAVILY_API_KEY = settings.TAVILY_API_KEY


@tool
async def topic_research(topic: list[str], max_urls: int = 5) -> str:
    """
    深度调研工具。该工具会利用 Tavily 搜索引擎针对特定主题进行多维度搜索。
    它会自动生成针对性的搜索词(包含年份和趋势),并返回去重后的高质量 URL 列表。
    """
    if not TAVILY_API_KEY:
        return "❌ 错误: 未配置 TAVILY_API_KEY。"

    client = TavilyClient(api_key=TAVILY_API_KEY)

    # 1. 自动生成多维度搜索词 (在工具内部快速生成)

    # 2. 并行执行搜索
    async def perform_search(q: str):
        # 使用 asyncio.to_thread 运行同步的 Tavily SDK
        def sync_search():
            try:
                response = client.search(
                    query=q,
                    search_depth="advanced",
                    max_results=5,
                    include_answer=False
                )
                return response.get('results', [])
            except Exception as e:
                print(f"Search error: {e}")
                return []

        return await asyncio.to_thread(sync_search)

    search_tasks = [perform_search(q) for q in topic]
    search_results_list = await asyncio.gather(*search_tasks)

    # 3. 结果去重与过滤
    seen_urls: Set[str] = set()
    final_urls = []

    # 常见的非内容页面过滤
    skip_extensions = ('.pdf', '.jpg', '.png', '.zip', '.exe')

    for results in search_results_list:
        for item in results:
            url = item.get('url')
            if url and url not in seen_urls:
                if not url.lower().endswith(skip_extensions):
                    seen_urls.add(url)
                    final_urls.append(url)

    # 4. 结果截断
    selected_urls = final_urls[:max_urls]

    # 返回 JSON 字符串,便于 Agent 下一步调用批量爬虫 (Crawl4ai)
    return json.dumps(selected_urls, ensure_ascii=False)
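A sketch of how this tool is meant to feed the batch crawler above; the search queries are placeholders, and in the agent both tools would normally be chained by the model rather than by hand:

import asyncio
import json

async def research_and_crawl():
    # 1. multi-angle search → deduplicated URL list (returned as a JSON string)
    urls_json = await topic_research.ainvoke({
        "topic": ["2026 sofa design trends", "modular sofa market 2026"],
        "max_urls": 5,
    })
    urls = json.loads(urls_json)
    # 2. hand the URLs to crawl4ai_batch from crawl_tool.py
    # from src.server.deep_agent.tools.crawl_tool import crawl4ai_batch
    # print(crawl4ai_batch.invoke({"urls": urls}))
    return urls

# asyncio.run(research_and_crawl())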
src/server/deep_agent/tools/structured_retrieval_tool.py  (new file, 225 lines)
@@ -0,0 +1,225 @@
import os
import re
import json
from datetime import datetime
from typing import List, Dict, Optional

from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.documents import Document

# RAG
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder

# =========================
# 全局模型(单例)
# =========================

_EMBEDDING_MODEL = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

_RERANK_MODEL = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)


class StructuredRetrievalInput(BaseModel):
    file_paths: List[str] = Field(..., description="List of local markdown file paths.")
    query: str = Field(..., description="Extraction query")
    source_url: Optional[str] = Field(None, description="Optional global source URL")


@tool("structured_retrieval", args_schema=StructuredRetrievalInput)
def structured_retrieval(
    file_paths: List[str],
    query: str,
    source_url: Optional[str] = None
) -> Dict:
    """
    Batch structured extraction from markdown files.
    - Performs vector search + re-ranking
    - Saves extracted structured data as JSON file to disk
    - Returns ONLY summary (status, count, file path)
    """

    # ── 1. 收集所有文件內容 ──────────────────────────────────────
    all_docs_pool: List[Document] = []

    for path in file_paths:
        if not os.path.exists(path) or not path.endswith((".md", ".markdown")):
            continue

        file_name = os.path.basename(path)

        with open(path, "r", encoding="utf-8") as f:
            content = f.read()

        current_source = source_url or _extract_source_from_md(content) or "unknown"

        sections = _split_markdown_by_headers(content)

        for sec in sections:
            all_docs_pool.append(
                Document(
                    page_content=sec,
                    metadata={"source_url": current_source, "file_name": file_name}
                )
            )

    if not all_docs_pool:
        return {"status": "no_documents_found", "items_count": 0, "json_path": None}

    # ── 2. Vector search ────────────────────────────────────────────
    vector_store = FAISS.from_documents(all_docs_pool, _EMBEDDING_MODEL)
    retrieved = vector_store.similarity_search(query, k=200)

    # ── 3. 提取結構化片段 ──────────────────────────────────────────
    structured_items = []

    for doc in retrieved:
        text = doc.page_content.strip()
        if len(text) < 30:
            continue

        images = list(set(re.findall(r"!\[.*?\]\((.*?)\)", text)))

        structured_items.append(
            {
                "text": text,
                "images": images,
                "source_url": doc.metadata.get("source_url"),
                "file_name": doc.metadata.get("file_name")
            }
        )

    # ── 4. Re-rank ──────────────────────────────────────────────────
    if structured_items:
        unique_items = {item["text"]: item for item in structured_items}.values()
        pairs = [[query, item["text"]] for item in unique_items]
        scores = _RERANK_MODEL.predict(pairs)

        sorted_items = sorted(
            zip(scores, unique_items),
            key=lambda x: x[0],
            reverse=True
        )
        top_items = [item for _, item in sorted_items[:50]]
    else:
        top_items = []

    # ── 5. 寫入 JSON 文件 ──────────────────────────────────────────
    if not top_items:
        return {"status": "no_relevant_content", "items_count": 0, "json_path": None}

    # 產生有意義的檔名
    safe_query = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fa5]', '_', query)[:40]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f"extracted_{safe_query}_{timestamp}.json"

    # 建議的儲存目錄(與 crawl4ai_batch 對齊)
    output_dir = os.path.join(os.path.dirname(file_paths[0]), "..", "extracted")
    os.makedirs(output_dir, exist_ok=True)

    json_path = os.path.join(output_dir, json_filename)

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "query": query,
                "extracted_at": timestamp,
                "item_count": len(top_items),
                "items": top_items
            },
            f,
            ensure_ascii=False,
            indent=2
        )

    # ── 6. 只回傳摘要 ──────────────────────────────────────────────
    return {
        "status": "success",
        "items_count": len(top_items),
        "json_path": json_path,
        "summary": f"已提取 {len(top_items)} 個高相關片段,儲存於 {json_path}"
    }


def _extract_source_from_md(content: str) -> Optional[str]:
    match = re.search(r"<!--\s*Source:\s*(.*?)\s*-->", content)
    return match.group(1).strip() if match else None


# =========================
# Markdown Header Split
# =========================

def _split_markdown_by_headers(
    content: str,
    max_chars: int = 2000,
    overlap: int = 150,
):
    header_re = re.compile(
        r'^(#{1,6})\s+(.+?)\s*$',
        re.MULTILINE
    )

    matches = list(header_re.finditer(content))

    if not matches:
        return _chunk_text(content, max_chars, overlap)

    sections = []

    for i, m in enumerate(matches):
        start = m.start()
        end = (
            matches[i + 1].start()
            if i + 1 < len(matches)
            else len(content)
        )

        block = content[start:end].strip()
        if block:
            sections.append(block)

    final_sections = []

    for s in sections:
        if len(s) > max_chars:
            final_sections.extend(
                _chunk_text(s, max_chars, overlap)
            )
        else:
            final_sections.append(s)

    return final_sections


def _chunk_text(
    text: str,
    max_chars: int = 2000,
    overlap: int = 150
):
    text = text.strip()
    if len(text) <= max_chars:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = min(len(text), start + max_chars)
        chunk = text[start:end].strip()

        if chunk:
            chunks.append(chunk)

        if end == len(text):
            break

        start = max(0, end - overlap)

    return chunks
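A usage sketch, assuming the markdown files were produced by crawl4ai_batch (the path below is a placeholder); the returned json_path can then be loaded and its items passed to report_generator as structured_data:

import json

summary = structured_retrieval.invoke({
    "file_paths": [
        "agent_workspace/raw_data/1700000000_ab12cd_example_com_sofa-trends-2026.md",
    ],
    "query": "2026 sofa design trends",
})

if summary["status"] == "success":
    # the tool only returns a summary; the full items live in the JSON file
    with open(summary["json_path"], "r", encoding="utf-8") as f:
        structured_data = json.load(f)["items"]
    print(f"{summary['items_count']} items ready for report_generator")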
src/server/deep_agent/tools/user_persona_tool.py  (new file, 57 lines)
@@ -0,0 +1,57 @@
from datetime import datetime

from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from pymongo import MongoClient
from src.core.config import MONGO_URI

client = MongoClient(MONGO_URI)
db = client["report_agent"]
collection = db["user_profiles"]


@tool
def query_report_profile(config: RunnableConfig) -> dict:
    """
    查询用户报告画像
    """
    thread_id = config['configurable']['thread_id']
    doc = collection.find_one({"thread_id": thread_id})

    if not doc:
        return {"profile": {}}

    doc.pop("_id", None)
    return doc


@tool
def update_report_profile(config: RunnableConfig, profile: dict) -> dict:
    """
    更新用户画像信息
    """
    thread_id = config['configurable']['thread_id']
    collection.update_one(
        {"thread_id": thread_id},
        {
            "$set": {
                "profile": profile
            }
        },
        upsert=True
    )

    return {"status": "success", "profile": profile}


@tool
def check_profile_complete(profile: dict) -> dict:
    """
    判断画像是否完整
    """
    required = ["style", "room_type", "budget"]
    missing = [f for f in required if f not in profile]
    return {
        "complete": len(missing) == 0,
        "missing_fields": missing
    }
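These tools pull the thread_id from the RunnableConfig that LangChain injects at call time. A minimal sketch of exercising them directly, assuming the MongoDB behind MONGO_URI is reachable; the thread_id and profile values are placeholders:

config = {"configurable": {"thread_id": "demo-thread-001"}}

# upsert a partial profile for this thread
update_report_profile.invoke(
    {"profile": {"style": "北欧风", "room_type": "客厅"}},
    config=config,
)
doc = query_report_profile.invoke({}, config=config)
status = check_profile_complete.invoke({"profile": doc.get("profile", {})})
print(status)  # {'complete': False, 'missing_fields': ['budget']}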