Deprecate langgraph and switch to deepagent
src/server/deep_agent/tools/conversation_title_tool.py  (new file, 27 lines)
@@ -0,0 +1,27 @@
from langchain_core.prompts import PromptTemplate

from src.server.deep_agent.agents.init_llm import title_llm


def conversation_title(full_conversation):
    title_prompt = PromptTemplate(
        input_variables=["full_conversation"],
        template="""
请严格按照以下要求生成对话标题:
1. 标题长度:8-15个字,纯中文,无标点、无特殊符号、无换行
2. 标题内容:基于完整对话,精准概括核心主题(兼顾用户需求和助手回复)
3. 标题风格:自然口语化,符合中文表达习惯,不冗余

完整对话内容:
{full_conversation}

仅输出标题,不要输出任何额外解释、说明或标点符号。
"""
    )
    title_chain = title_prompt | title_llm
    response = title_chain.invoke({"full_conversation": full_conversation})
    return response


if __name__ == '__main__':
    print(conversation_title("你好"))
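Since title_llm comes from init_llm and is presumably a LangChain chat model, title_chain.invoke() returns a message object rather than a plain string. A minimal caller-side sketch, assuming exactly that; the trim_title helper is hypothetical and not part of this commit:

def trim_title(full_conversation: str) -> str:
    # invoke the prompt | llm chain and unwrap the text content
    response = conversation_title(full_conversation)
    # chat models return an AIMessage; fall back to str() for plain string outputs
    return getattr(response, "content", str(response)).strip()

print(trim_title("用户: 我想买一张北欧风的沙发\n助手: 可以考虑浅色布艺……"))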
src/server/deep_agent/tools/crawl_tool.py  (new file, 191 lines)
@@ -0,0 +1,191 @@
import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path

import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool

# ─────────────────────────────────────
# 路径配置
# ─────────────────────────────────────

TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent

# DeepAgents 推荐目录
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

print(f"tool save : {str(PROJECT_ROOT / 'agent_workspace')}")

# ─────────────────────────────────────
# Browser 配置
# ─────────────────────────────────────

browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)

run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)


# ─────────────────────────────────────
# URL → 文件名
# ─────────────────────────────────────

def build_filename(url: str) -> str:
    parsed = urlparse(url)

    domain = parsed.netloc.replace("www.", "").replace(".", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50] or "index"

    ts = int(time.time())
    rand = uuid.uuid4().hex[:6]

    return f"{ts}_{rand}_{domain}_{path_part}.md"


# ─────────────────────────────────────
# 单个 URL 抓取
# ─────────────────────────────────────

async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    async with sem:
        try:
            result = await crawler.arun(url=url, config=run_config)

            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}"
                }

            markdown = result.markdown or ""

            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short"
                }

            filename = build_filename(url)
            filepath = SAVE_DIR / filename

            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)

            return {
                "url": url,
                "success": True,
                "file": str(filepath)
            }

        except Exception as e:
            return {
                "url": url,
                "success": False,
                "error": str(e)
            }


# ─────────────────────────────────────
# Async 主逻辑
# ─────────────────────────────────────

async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    urls = list(set(urls))  # 去重

    if not urls:
        return {"error": "no urls"}

    sem = asyncio.Semaphore(5)  # 并发限制

    async with AsyncWebCrawler(config=browser_config) as crawler:

        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]

        results = await asyncio.gather(*tasks)

    success_files = []
    summary = []

    for r in results:

        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"✅ {r['url']}")
        else:
            summary.append(f"❌ {r['url']} ({r['error']})")

    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }


# ─────────────────────────────────────
# Tool(同步)
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.

    Args:
        urls: List of webpage URLs to crawl.

    Returns:
        A summary of crawling results and saved file paths.
    """

    try:
        result = asyncio.run(_crawl4ai_batch(urls))

        if "error" in result:
            return f"❌ Error: {result['error']}"

        output = [
            "### 批量抓取完成 ###",
            f"成功保存文件: {result['count']}",
            f"保存目录: {SAVE_DIR}",
            "",
            "抓取详情:"
        ]

        output.extend(result["summary"])

        if result["saved_files"]:
            output.append("\n可读取文件:")
            output.extend(result["saved_files"])

        return "\n".join(output)

    except Exception as e:
        return f"🚨 爬虫系统异常: {str(e)}"
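A minimal sketch of calling the tool outside the agent, assuming crawl4ai and its Playwright browser are installed; the URLs are placeholders. Since the decorated function wraps the async crawl in asyncio.run, it must be called from synchronous code (not from inside an event loop):

if __name__ == "__main__":
    # @tool wraps the function in a StructuredTool, so call it via .invoke()
    report = crawl4ai_batch.invoke({"urls": [
        "https://example.com/sofa-trends-2026",
        "https://example.com/interior-design-report",
    ]})
    print(report)  # human-readable summary; markdown files land in SAVE_DIR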
src/server/deep_agent/tools/generate_furniture_sketch.py  (new file, 94 lines)
@@ -0,0 +1,94 @@
import json
import logging
import uuid
from google.oauth2 import service_account
from langchain_core.tools import tool
from google import genai
from google.genai.types import GenerateContentConfig, Modality

from minio import Minio

from src.core.config import settings
from src.server.utils.new_oss_client import oss_upload_image

logger = logging.getLogger(__name__)
# 初始化全局凭证和客户端
creds = service_account.Credentials.from_service_account_file(
    settings.GOOGLE_GENAI_USE_VERTEXAI,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

minio_client = Minio(settings.MINIO_URL, access_key=settings.MINIO_ACCESS, secret_key=settings.MINIO_SECRET, secure=settings.MINIO_SECURE)
client = genai.Client(
    credentials=creds,
    project=settings.GOOGLE_CLOUD_PROJECT,
    location=settings.GOOGLE_CLOUD_LOCATION,
    vertexai=True
)


@tool
async def generate_furniture(prompt: str) -> str:
    """
    使用 Gemini 图像生成模型根据详细的英文提示词生成家具设计草图。
    """
    print("\n[系统日志] 正在调用 Nano Banana (Gemini Image Gen) ...")

    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=(f"Generate a professional furniture design sketch: {prompt}"),
            config=GenerateContentConfig(
                response_modalities=[Modality.TEXT, Modality.IMAGE],
            ),
        )

        image_bytes = None
        for part in response.candidates[0].content.parts:
            if part.inline_data:
                image_bytes = part.inline_data.data
                break

        if not image_bytes:
            return "未能生成图像数据。"
        object_name = f"furniture/sketches/{uuid.uuid4()}.png"
        bucket = "fida-test"  # 替换为你的 bucket 名称
        # 3. 调用你的上传函数
        upload_res = oss_upload_image(
            oss_client=minio_client,
            bucket=bucket,
            object_name=object_name,
            image_bytes=image_bytes
        )

        if upload_res:
            # 4. 构造访问链接 (如果是私有 bucket,需使用 presigned_get_object)
            # 这里简单示例为直接访问地址
            image_url = f"{bucket}/{object_name}"
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": image_url,
                    "tool_status": "success"
                },
                ensure_ascii=False
            )
        else:
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": "图片生成成功,但上传至存储服务器失败。",
                    "tool_status": "error"
                },
                ensure_ascii=False
            )
    except Exception as e:
        logger.warning(e)
        return json.dumps(
            {
                "tool_name": "generate_furniture",
                "data": "绘图流程异常",
                "tool_status": "error"
            },
            ensure_ascii=False
        )
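If the bucket is private, the direct "{bucket}/{object_name}" path returned above will not be fetchable by a browser; the in-code comment already points at presigned_get_object for that case. A sketch using the MinIO client initialised in this module (the 24-hour expiry is an arbitrary choice, and build_image_url is a hypothetical helper):

from datetime import timedelta

def build_image_url(bucket: str, object_name: str) -> str:
    # generate a time-limited download link for an object in a private bucket
    return minio_client.presigned_get_object(
        bucket, object_name, expires=timedelta(hours=24)
    )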
src/server/deep_agent/tools/report_generator_tool.py  (new file, 151 lines)
@@ -0,0 +1,151 @@
import os
import json
import re
from typing import Optional, List, Dict
from langchain_qwq import ChatQwen
from langgraph.config import get_stream_writer
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage

from src.core.config import settings

# =========================
# LLM 初始化
# =========================

llm = ChatQwen(
    enable_thinking=False,
    model="qwen3.5-flash",
    temperature=0.2,
    max_tokens=3_000,
    timeout=None,
    max_retries=2,
    api_key=settings.QWEN_API_KEY)


# =========================
# Tool 输入 Schema
# =========================

class ReportInput(BaseModel):
    report_topic: str = Field(
        ...,
        description="Main topic of the report, e.g. '2026 Sofa Design Trends'"
    )
    structured_data: List[Dict] = Field(
        ...,
        description="Structured retrieval result items"
    )
    language: Optional[str] = Field(
        default="English",
        description="Output language"
    )


# =========================
# LangGraph Tool
# =========================

@tool("report_generator", args_schema=ReportInput)
async def report_generator(
    report_topic: str,
    structured_data: List[Dict],
    language: str = "English"
) -> dict:
    """
    Generate a professional design/market report
    directly from structured retrieval results.
    """

    writer = get_stream_writer()
    if not structured_data:
        error_msg = "Error: No structured data provided."
        writer({"type": "report_error", "message": error_msg})
        return error_msg

    collected_data_str = json.dumps(
        structured_data,
        ensure_ascii=False,
        indent=2
    )

    # =========================
    # Prompt
    # =========================

    system_prompt = f"""
You are a professional design trend analyst.

Generate a long, structured Markdown report.

REQUIREMENTS:

1. Follow MECE principle.
2. Embed images ONLY if they start with https://
   using: ![](url)
3. Insert images inline.
4. Every key insight must cite source:
   [Website Name](url)
5. Use Markdown headings.
6. Start directly with title.
7. Be detailed and analytical.

Output Language: {language}
"""

    user_prompt = f"""
Topic: {report_topic}

Input Data:
{collected_data_str}
"""

    # =========================
    # 调用 LLM
    # =========================
    writer({"type": "report_start", "topic": report_topic, "language": language})

    full_report = ""
    try:
        report_llm = llm.with_config(
            callbacks=[]
        )
        async for chunk in report_llm.astream(
            [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
        ):
            if chunk.content:  # 流式返回的增量文本
                delta = chunk.content
                full_report += delta
                # return {"type": "report_delta", "delta": delta}
                writer({"type": "report_delta", "delta": delta})  # ← 实时推送给前端
        writer({"type": "report_stop", "topic": report_topic, "language": language})
    except Exception as e:
        error_msg = f"LLM generation failed: {str(e)}"
        writer({"type": "report_error", "message": error_msg})
        return error_msg

    report_content = full_report.strip()

    # =========================
    # 保存报告
    # =========================
    output_dir = "workspace/reports"
    os.makedirs(output_dir, exist_ok=True)

    safe_topic = re.sub(r'[\\/*?:"<>|]', "", report_topic.replace(" ", "_"))
    filename = f"{output_dir}/{safe_topic}.md"

    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(report_content)
        writer({"type": "report_complete", "file_path": filename})
    except Exception as e:
        writer({"type": "report_save_warning", "message": str(e)})

    # 返回完整内容(作为 tool result),同时正文已通过 delta 流式输出
    return report_content + f"\n\n✅ Report saved to: {filename}"
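The tool expects structured_data in the shape produced by structured_retrieval (text / images / source_url / file_name). A minimal invocation sketch follows; the sample item is illustrative only, and note that get_stream_writer() only yields a working writer when the tool runs inside a LangGraph execution with custom streaming enabled, which is why the run call is left commented out:

import asyncio

sample_items = [
    {
        "text": "Curved modular sofas keep gaining share in 2026 showrooms...",
        "images": ["https://example.com/sofa.png"],
        "source_url": "https://example.com/sofa-trends-2026",
        "file_name": "1700000000_ab12cd_example_com_sofa-trends-2026.md",
    }
]

async def demo():
    # async tools are called with .ainvoke(); args must match ReportInput
    report = await report_generator.ainvoke({
        "report_topic": "2026 Sofa Design Trends",
        "structured_data": sample_items,
        "language": "English",
    })
    print(report[:500])

# asyncio.run(demo())  # in practice, run inside a LangGraph streaming context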
src/server/deep_agent/tools/research_tool.py  (new file, 67 lines)
@@ -0,0 +1,67 @@
import asyncio
import json
from datetime import datetime
from typing import List, Set, Optional
from langchain_core.tools import tool
from tavily import TavilyClient

from src.core.config import settings

# 模拟配置加载
TAVILY_API_KEY = settings.TAVILY_API_KEY


@tool
async def topic_research(topic: list[str], max_urls: int = 5) -> str:
    """
    深度调研工具。该工具会利用 Tavily 搜索引擎针对特定主题进行多维度搜索。
    它会自动生成针对性的搜索词(包含年份和趋势),并返回去重后的高质量 URL 列表。
    """
    if not TAVILY_API_KEY:
        return "❌ 错误: 未配置 TAVILY_API_KEY。"

    client = TavilyClient(api_key=TAVILY_API_KEY)

    # 1. 自动生成多维度搜索词 (在工具内部快速生成)

    # 2. 并行执行搜索
    async def perform_search(q: str):
        # 使用 asyncio.to_thread 运行同步的 Tavily SDK
        def sync_search():
            try:
                response = client.search(
                    query=q,
                    search_depth="advanced",
                    max_results=5,
                    include_answer=False
                )
                return response.get('results', [])
            except Exception as e:
                print(f"Search error: {e}")
                return []

        return await asyncio.to_thread(sync_search)

    search_tasks = [perform_search(q) for q in topic]
    search_results_list = await asyncio.gather(*search_tasks)

    # 3. 结果去重与过滤
    seen_urls: Set[str] = set()
    final_urls = []

    # 常见的非内容页面过滤
    skip_extensions = ('.pdf', '.jpg', '.png', '.zip', '.exe')

    for results in search_results_list:
        for item in results:
            url = item.get('url')
            if url and url not in seen_urls:
                if not url.lower().endswith(skip_extensions):
                    seen_urls.add(url)
                    final_urls.append(url)

    # 4. 结果截断
    selected_urls = final_urls[:max_urls]

    # 返回 JSON 字符串,便于 Agent 下一步调用批量爬虫 (Crawl4ai)
    return json.dumps(selected_urls, ensure_ascii=False)
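A sketch of how this tool is meant to feed the batch crawler above; the search queries are placeholders, and in the agent both tools would normally be chained by the model rather than by hand:

import asyncio
import json

async def research_and_crawl():
    # 1. multi-angle search → deduplicated URL list (returned as a JSON string)
    urls_json = await topic_research.ainvoke({
        "topic": ["2026 sofa design trends", "modular sofa market 2026"],
        "max_urls": 5,
    })
    urls = json.loads(urls_json)
    # 2. hand the URLs to crawl4ai_batch from crawl_tool.py
    # from src.server.deep_agent.tools.crawl_tool import crawl4ai_batch
    # print(crawl4ai_batch.invoke({"urls": urls}))
    return urls

# asyncio.run(research_and_crawl())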
src/server/deep_agent/tools/structured_retrieval_tool.py  (new file, 225 lines)
@@ -0,0 +1,225 @@
import os
import re
import json
from datetime import datetime
from typing import List, Dict, Optional

from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.documents import Document

# RAG
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder

# =========================
# 全局模型(单例)
# =========================

_EMBEDDING_MODEL = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

_RERANK_MODEL = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)


class StructuredRetrievalInput(BaseModel):
    file_paths: List[str] = Field(..., description="List of local markdown file paths.")
    query: str = Field(..., description="Extraction query")
    source_url: Optional[str] = Field(None, description="Optional global source URL")


@tool("structured_retrieval", args_schema=StructuredRetrievalInput)
def structured_retrieval(
    file_paths: List[str],
    query: str,
    source_url: Optional[str] = None
) -> Dict:
    """
    Batch structured extraction from markdown files.
    - Performs vector search + re-ranking
    - Saves extracted structured data as JSON file to disk
    - Returns ONLY summary (status, count, file path)
    """

    # ── 1. 收集所有文件內容 ──────────────────────────────────────
    all_docs_pool: List[Document] = []

    for path in file_paths:
        if not os.path.exists(path) or not path.endswith((".md", ".markdown")):
            continue

        file_name = os.path.basename(path)

        with open(path, "r", encoding="utf-8") as f:
            content = f.read()

        current_source = source_url or _extract_source_from_md(content) or "unknown"

        sections = _split_markdown_by_headers(content)

        for sec in sections:
            all_docs_pool.append(
                Document(
                    page_content=sec,
                    metadata={"source_url": current_source, "file_name": file_name}
                )
            )

    if not all_docs_pool:
        return {"status": "no_documents_found", "items_count": 0, "json_path": None}

    # ── 2. Vector search ────────────────────────────────────────────
    vector_store = FAISS.from_documents(all_docs_pool, _EMBEDDING_MODEL)
    retrieved = vector_store.similarity_search(query, k=200)

    # ── 3. 提取結構化片段 ──────────────────────────────────────────
    structured_items = []

    for doc in retrieved:
        text = doc.page_content.strip()
        if len(text) < 30:
            continue

        images = list(set(re.findall(r"!\[.*?\]\((.*?)\)", text)))

        structured_items.append(
            {
                "text": text,
                "images": images,
                "source_url": doc.metadata.get("source_url"),
                "file_name": doc.metadata.get("file_name")
            }
        )

    # ── 4. Re-rank ──────────────────────────────────────────────────
    if structured_items:
        unique_items = {item["text"]: item for item in structured_items}.values()
        pairs = [[query, item["text"]] for item in unique_items]
        scores = _RERANK_MODEL.predict(pairs)

        sorted_items = sorted(
            zip(scores, unique_items),
            key=lambda x: x[0],
            reverse=True
        )
        top_items = [item for _, item in sorted_items[:50]]
    else:
        top_items = []

    # ── 5. 寫入 JSON 文件 ──────────────────────────────────────────
    if not top_items:
        return {"status": "no_relevant_content", "items_count": 0, "json_path": None}

    # 產生有意義的檔名
    safe_query = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fa5]', '_', query)[:40]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f"extracted_{safe_query}_{timestamp}.json"

    # 建議的儲存目錄(與 crawl4ai_batch 對齊)
    output_dir = os.path.join(os.path.dirname(file_paths[0]), "..", "extracted")
    os.makedirs(output_dir, exist_ok=True)

    json_path = os.path.join(output_dir, json_filename)

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "query": query,
                "extracted_at": timestamp,
                "item_count": len(top_items),
                "items": top_items
            },
            f,
            ensure_ascii=False,
            indent=2
        )

    # ── 6. 只回傳摘要 ──────────────────────────────────────────────
    return {
        "status": "success",
        "items_count": len(top_items),
        "json_path": json_path,
        "summary": f"已提取 {len(top_items)} 個高相關片段,儲存於 {json_path}"
    }


def _extract_source_from_md(content: str) -> Optional[str]:
    match = re.search(r"<!--\s*Source:\s*(.*?)\s*-->", content)
    return match.group(1).strip() if match else None


# =========================
# Markdown Header Split
# =========================

def _split_markdown_by_headers(
    content: str,
    max_chars: int = 2000,
    overlap: int = 150,
):
    header_re = re.compile(
        r'^(#{1,6})\s+(.+?)\s*$',
        re.MULTILINE
    )

    matches = list(header_re.finditer(content))

    if not matches:
        return _chunk_text(content, max_chars, overlap)

    sections = []

    for i, m in enumerate(matches):
        start = m.start()
        end = (
            matches[i + 1].start()
            if i + 1 < len(matches)
            else len(content)
        )

        block = content[start:end].strip()
        if block:
            sections.append(block)

    final_sections = []

    for s in sections:
        if len(s) > max_chars:
            final_sections.extend(
                _chunk_text(s, max_chars, overlap)
            )
        else:
            final_sections.append(s)

    return final_sections


def _chunk_text(
    text: str,
    max_chars: int = 2000,
    overlap: int = 150
):
    text = text.strip()
    if len(text) <= max_chars:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = min(len(text), start + max_chars)
        chunk = text[start:end].strip()

        if chunk:
            chunks.append(chunk)

        if end == len(text):
            break

        start = max(0, end - overlap)

    return chunks
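A usage sketch, assuming the markdown files were produced by crawl4ai_batch (the path below is a placeholder); the returned json_path can then be loaded and its items passed to report_generator as structured_data:

import json

summary = structured_retrieval.invoke({
    "file_paths": [
        "agent_workspace/raw_data/1700000000_ab12cd_example_com_sofa-trends-2026.md",
    ],
    "query": "2026 sofa design trends",
})

if summary["status"] == "success":
    # the tool only returns a summary; the full items live in the JSON file
    with open(summary["json_path"], "r", encoding="utf-8") as f:
        structured_data = json.load(f)["items"]
    print(f"{summary['items_count']} items ready for report_generator")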
src/server/deep_agent/tools/user_persona_tool.py  (new file, 57 lines)
@@ -0,0 +1,57 @@
from datetime import datetime

from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from pymongo import MongoClient
from src.core.config import MONGO_URI

client = MongoClient(MONGO_URI)
db = client["report_agent"]
collection = db["user_profiles"]


@tool
def query_report_profile(config: RunnableConfig) -> dict:
    """
    查询用户报告画像
    """
    thread_id = config['configurable']['thread_id']
    doc = collection.find_one({"thread_id": thread_id})

    if not doc:
        return {"profile": {}}

    doc.pop("_id", None)
    return doc


@tool
def update_report_profile(config: RunnableConfig, profile: dict) -> dict:
    """
    更新用户画像信息
    """
    thread_id = config['configurable']['thread_id']
    collection.update_one(
        {"thread_id": thread_id},
        {
            "$set": {
                "profile": profile
            }
        },
        upsert=True
    )

    return {"status": "success", "profile": profile}


@tool
def check_profile_complete(profile: dict) -> dict:
    """
    判断画像是否完整
    """
    required = ["style", "room_type", "budget"]
    missing = [f for f in required if f not in profile]
    return {
        "complete": len(missing) == 0,
        "missing_fields": missing
    }
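These tools pull the thread_id from the RunnableConfig that LangChain injects at call time. A minimal sketch of exercising them directly, assuming the MongoDB behind MONGO_URI is reachable; the thread_id and profile values are placeholders:

config = {"configurable": {"thread_id": "demo-thread-001"}}

# upsert a partial profile for this thread
update_report_profile.invoke(
    {"profile": {"style": "北欧风", "room_type": "客厅"}},
    config=config,
)
doc = query_report_profile.invoke({}, config=config)
status = check_profile_complete.invoke({"profile": doc.get("profile", {})})
print(status)  # {'complete': False, 'missing_fields': ['budget']}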