弃用langgraph更换deepagent

This commit is contained in:
zcr
2026-03-11 21:45:46 +08:00
parent c862121b48
commit 7042d428fa
44 changed files with 2847 additions and 619 deletions

View File

View File

@@ -0,0 +1,23 @@
from langchain_qwq import ChatQwen
from src.core.config import settings
# Primary chat model shared by the supervisor and all subagents.
llm = ChatQwen(
    model="qwen3.5-flash",
    max_tokens=3_000,
    timeout=None,
    max_retries=2,
    enable_thinking=False,  # disable chain-of-thought output for the main model
    api_key=settings.QWEN_API_KEY
)
# Smaller model used only for conversation-title generation.
title_llm = ChatQwen(
    model="qwen-plus",
    max_tokens=3_000,
    timeout=None,
    max_retries=2,
    streaming=False,  # titles are short; no need to stream
    temperature=0.1,  # low temperature for stable, reproducible titles
    top_p=0.8,
    api_key=settings.QWEN_API_KEY
)

View File

@@ -0,0 +1,51 @@
from pathlib import Path
from deepagents import create_deep_agent
from deepagents.backends import FilesystemBackend
from langchain.agents.middleware import SummarizationMiddleware
from langgraph.checkpoint.mongodb import MongoDBSaver
from langgraph.checkpoint.serde.jsonplus import JsonPlusSerializer
from pymongo import MongoClient
from src.core.config import MONGO_URI
from src.server.deep_agent.agents.painter import painter_subagent
from src.server.deep_agent.agents.researcher import research_subagent
from src.server.deep_agent.agents.user_profile import user_profile_subagent
from src.server.deep_agent.init_prompt import build_system_prompt
from src.server.deep_agent.tools.report_generator_tool import llm
# Resolve directories relative to this module so the agent workspace is
# stable regardless of the process working directory.
TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent
client = MongoClient(MONGO_URI)
# Conversation checkpointer backed by MongoDB.
# NOTE(review): `client["furniture_agent_db"]` passes a Database object while
# `db_name="fida_agent_db"` is also supplied — MongoDBSaver normally expects
# the MongoClient itself for `client`; confirm which database actually
# receives the checkpoints.
checkpointer = MongoDBSaver(
    client=client["furniture_agent_db"],
    db_name="fida_agent_db",
    collection_name="fida_agent_collection",
    serde=JsonPlusSerializer(pickle_fallback=True),  # key line: fall back to pickle for non-JSON-serializable state
)
# Subagents exposed to the supervisor for task routing.
subagents = [
    painter_subagent,
    research_subagent,
    user_profile_subagent
]
def build_main_agent(use_report):
    """Construct the supervisor deep agent.

    Args:
        use_report: Forwarded into the system prompt; controls whether the
            report/research workflow is allowed.

    Returns:
        The compiled deep agent wired with MongoDB checkpointing, a real
        on-disk filesystem workspace, and history summarization middleware.
    """
    main_agent = create_deep_agent(
        model=llm,
        system_prompt=build_system_prompt(use_report=use_report),
        subagents=subagents,
        checkpointer=checkpointer,
        backend=FilesystemBackend(
            root_dir=str(PROJECT_ROOT / "agent_workspace"),
            virtual_mode=False,  # important: disable virtual mode -> actually write to disk
        ),
        middleware=[
            # Summarize history once it exceeds ~3000 tokens, keeping the
            # most recent 100 messages verbatim.
            SummarizationMiddleware(
                model=llm,
                trigger=("tokens", 3000),
                keep=("messages", 100),
            ),
        ],
    )
    return main_agent

View File

@@ -0,0 +1,22 @@
from langchain.agents.middleware import wrap_tool_call
from src.server.deep_agent.agents.init_llm import llm
from src.server.deep_agent.init_prompt import build_painter_prompt
from src.server.deep_agent.tools.generate_furniture_sketch import generate_furniture
@wrap_tool_call
async def log_tool_calls(request, handler):
    """Intercept and log every tool call - demonstrates cross-cutting concern.

    Bug fix: the previous version returned ``handler(request)`` without
    awaiting it, so callers of this async middleware received a coroutine
    object instead of the tool result.
    """
    print(request)
    return await handler(request)
# Subagent spec consumed by deepagents.create_deep_agent: interprets the
# user's intent, builds an optimized image prompt, and calls the
# generate_furniture tool to produce a furniture sketch.
painter_subagent = {
    "name": "painter_subagent",
    "description": "理解用户意图使用prompt,调用generate_furniture工具生成家具sketch草图.",
    "system_prompt": build_painter_prompt(),
    "tools": [generate_furniture],
    "model": llm,
    # "middleware": [log_tool_calls],
}

View File

@@ -0,0 +1,21 @@
from src.server.deep_agent.agents.init_llm import llm
from src.server.deep_agent.init_prompt import build_researcher_prompt
from src.server.deep_agent.tools.crawl_tool import crawl4ai_batch
from src.server.deep_agent.tools.report_generator_tool import report_generator
from src.server.deep_agent.tools.research_tool import topic_research
from src.server.deep_agent.tools.structured_retrieval_tool import structured_retrieval
from src.server.deep_agent.tools.user_persona_tool import query_report_profile
# Subagent spec for the research workflow: profile lookup -> web search ->
# batch crawl -> structured extraction -> markdown report generation.
# The registered routing name is "research-agent".
research_subagent = {
    "name": "research-agent",
    "description": "通过网络搜索对家具设计开展深度研究并整合结论",
    "system_prompt": build_researcher_prompt(),
    "tools": [
        query_report_profile,
        topic_research,
        crawl4ai_batch,
        structured_retrieval,
        report_generator
    ],
    "model": llm
}

View File

@@ -0,0 +1,15 @@
from src.server.deep_agent.agents.init_llm import llm
from src.server.deep_agent.init_prompt import build_user_persona_prompt
from src.server.deep_agent.tools.user_persona_tool import query_report_profile, update_report_profile, check_profile_complete
# Subagent spec for profile collection: reads/updates the report profile
# (style / room_type / budget) stored in MongoDB and checks completeness.
user_profile_subagent = {
    "name": "user_profile_subagent",
    "description": "收集用户报告画像并存储到MongoDB",
    "system_prompt": build_user_persona_prompt(),
    "model": llm,
    "tools": [
        query_report_profile,
        update_report_profile,
        check_profile_complete,
    ],
}

View File

@@ -0,0 +1,141 @@
def build_system_prompt(use_report):
    """Build the supervisor system prompt.

    Args:
        use_report: Whether report generation (the research subagent) is
            enabled; when False the prompt forbids routing to it.

    Fixes vs. previous revision: the prompt said "两个" subagents but lists
    three, and referred to the researcher as "research-subagent" although
    the registered subagent name is "research-agent" — mismatched names can
    break supervisor routing.
    """
    system_prompt = f"""
你是主调度 AgentSupervisor负责理解用户意图并选择合适的子Agent。
当前参数:
use_report = {use_report}
系统中存在三个相关子Agent
1. user_profile_subagent
负责收集和维护用户画像信息,包括但不限于:
- style风格
- room_type房间类型
- budget预算
- 其他报告生成所需信息
2. research-agent
负责生成完整报告、调研、总结、分析。
3. painter_subagent
负责根据用户描述,构造适用于生成家具sketch的prompt,使用prompt用工具生成图片.
========================
执行规则
========================
【1】当用户请求报告 / 调研 / 分析 / 总结时:
先判断是否已经具备足够的用户画像信息。
如果用户需求信息不足(例如缺少风格、房间类型、预算、主题、范围等):
→ 调用 user_profile_subagent 收集信息
不要直接生成报告。
如果用户画像信息已经完整:
→ 调用 research-agent 生成报告。
------------------------
【2】当 use_report = False 时:
- 严禁调用 research-agent
- 如果用户明确请求报告、调研、总结、分析:
请礼貌回复:
"报告功能当前未开启,你可以打开 use_report=True 后我来帮你生成报告。"
- 其他普通问题可以正常回答或调用其他子Agent。
------------------------
【3】用户画像优先级规则
只要用户输入包含以下情况:
- 表达设计需求
- 提供偏好信息(例如风格、预算、房间类型)
- 修改之前的偏好
- 补充报告信息
都应该优先调用:
user_profile_subagent
用于更新或收集用户画像。
------------------------
【4】调度原则
- user_profile_subagent 只负责 **信息收集**
- research-agent 只负责 **报告生成**
不要混用职责。
========================
严格输出规则
========================
- 当生成图片时绝对不要输出图片路径、file:// 地址、URL、本地链接
- 只输出文字描述,不输出任何图片链接或路径
"""
    return system_prompt
def build_painter_prompt():
    """Return the static system prompt for the painter subagent."""
    return """
你是一名专业的prompt优化专家专注于家具设计草图生成。你的任务是
1. 分析用户查询,理解核心意图,包括家具类型、风格、尺寸、颜色、材料等关键元素
2. 基于意图优化并生成一个详细、精确的prompt适合用于AI图片生成工具创建家具sketch草图例如线条简洁、手绘风格、焦点在设计细节上
3. 使用优化的prompt调用图片生成工具生成并返回草图图片
4. 如果需要,建议额外变体或改进
输出格式:
- 用户意图总结12段
- 优化后的prompt完整文本
- 生成的图片描述(如果工具返回)
- 建议改进(项目符号,可选)
【严格输出规则】
- 当生成图片时,**绝对不要输出图片路径、file:// 地址、URL、本地链接**。
- 只输出文字描述,不输出任何图片链接或路径。
"""
def build_researcher_prompt():
    """Return the researcher subagent system prompt.

    Fix vs. previous revision: the prompt told the agent to call a
    non-existent `get_user_profile` tool; the tool actually registered on
    the research subagent is `query_report_profile`.
    """
    prompt = """
你是一名专业的家具设计研究员。你的任务是:
【0】获取用户画像
- 首先调用 query_report_profile 工具,获取当前用户画像信息(如风格、房间类型、预算等)。
- 根据用户画像,生成五个与用户需求和偏好高度相关的研究词条。
【1】关键词拆解
1. 将研究主题结合用户画像拆解为可搜索的查询关键词
2. 将关键词组合成五个待搜索的词条
【2】搜索与爬取
3. 使用 topic_research 工具搜索这五个词条获取相关、权威的网址
4. 使用 crawl4ai_batch 批量爬取网址(仅可调用一次,禁止重复调用)
【3】结构化处理与报告
5. 使用 structured_retrieval 对爬取内容进行结构化提取(重点:设计趋势、材质创新、颜色应用、代表案例、品牌参考)
6. 使用 report_generator 基于提取内容生成完整 Markdown 报告
【严格工具调用规则】:
- 调用顺序必须严格query_report_profile → topic_research → crawl4ai_batch仅一次 → structured_retrieval → report_generator。
- 不得跳回前面步骤或重复任何工具。
- 如果爬取结果为空或极少,直接说明:
“由于部分来源暂时不可访问,本报告基于有限可用信息生成,可能不够全面。如需更完整资料,请提供具体网址或调整需求。”
- 一旦生成 report_generator 的输出,就视为任务完成,直接结束,不要再思考或调用其他工具。
- crawl4ai_batch 最多只能调用一次,即使部分网址失败,也禁止再次调用 crawl4ai_batch 或 topic_research。
现在开始严格执行以上规则。
"""
    return prompt
def build_user_persona_prompt():
    """Return the static system prompt for the user-profile subagent."""
    return """
你是用户画像收集助手。
你的任务是从用户对话中理解并提取报告画像信息,包括但不限于:
- style装修风格
- room_type房间类型
- budget预算
工作流程:
1. 先调用 query_report_profile 查询当前画像
2. 从用户输入中理解是否包含新的画像信息
3. 如果有新的信息,合并旧画像并调用 update_report_profile 更新
4. 调用 check_profile_complete 判断是否完整
5. 如果缺少字段,引导用户补充
6. 如果完整,回复:
"画像收集完成,即将为你生成报告!"
注意:
- 不要编造信息
- 不要覆盖已有字段,除非用户明确修改
- 只负责画像收集,不生成报告
"""

View File

@@ -0,0 +1,131 @@
import asyncio
import uuid
from langchain_core.messages import AIMessageChunk, ToolMessageChunk, ToolMessage
from src.server.deep_agent.agents.main_agent import build_main_agent
# Build the supervisor agent once at import time, with reports enabled.
agent = build_main_agent(use_report=True)
async def continuous_chat():
    """Interactive CLI chat loop with per-session memory.

    Creates one thread_id for the whole session so the checkpointer can
    restore conversation state between turns, then streams agent output
    (graph updates / model tokens / custom report events) to stdout.

    Fixes vs. previous revision: removed the raw debug ``print(stream)``
    that duplicated every chunk, merged the identical ToolMessageChunk /
    ToolMessage branches, and deleted ~60 lines of commented-out dead code.
    """
    thread_id = str(uuid.uuid4())
    print("===== 家具设计助手(支持持续对话+记忆)=====")
    print("输入 'exit''退出' 结束对话\n")
    while True:
        user_input = input("你:")  # input() blocks the event loop, acceptable for a CLI
        if user_input.lower() in ["exit", "退出", "q", "quit"]:
            print("助手:再见!如需继续设计,随时回来~")
            break
        if not user_input.strip():
            print("助手:请输入有效的设计需求,我会尽力解答~")
            continue
        print("\n助手:正在处理你的需求...\n")
        # NOTE(review): `version="v2"` belongs to astream_events, not astream —
        # confirm the installed langgraph accepts (or ignores) it here.
        async for stream in agent.astream(
            {"messages": user_input},
            stream_mode=["updates", "messages", "custom"],
            subgraphs=True,
            version="v2",
            config={"configurable": {"thread_id": thread_id}}
        ):
            # With subgraphs=True each item is (namespace, stream_mode, payload).
            _, mode, chunks = stream
            if mode == "updates":
                print(f"[updates] {chunks}")
            elif mode == "messages":
                token, _metadata = chunks
                if isinstance(token, AIMessageChunk):  # model tokens (reasoning + text)
                    reasoning = [b for b in token.content_blocks if b["type"] == "reasoning"]
                    text = [b for b in token.content_blocks if b["type"] == "text"]
                    if reasoning:
                        print(f"[thinking] {reasoning[0]['reasoning']}", end="")
                    if text:
                        print(text[0]["text"], end="")
                elif isinstance(token, (ToolMessageChunk, ToolMessage)):  # tool results
                    print(f"[tool|{token.name}] {token.content}", end="")
                else:
                    continue
            elif mode == "custom":
                # Custom events emitted via get_stream_writer (e.g. report deltas).
                print(f"[report] {chunks.get('delta', '')}", end="")
# Script entry point: run the interactive CLI chat loop.
if __name__ == "__main__":
    asyncio.run(continuous_chat())

View File

@@ -0,0 +1,27 @@
from langchain_core.prompts import PromptTemplate
from src.server.deep_agent.agents.init_llm import title_llm
def conversation_title(full_conversation):
    """Generate a short Chinese title summarizing a full conversation.

    Args:
        full_conversation: The conversation text to summarize.

    Returns:
        The raw model response from the chain.
        NOTE(review): this is the message object (e.g. AIMessage), not a
        plain string — callers probably want ``response.content``; confirm.
    """
    title_prompt = PromptTemplate(
        input_variables=["full_conversation"],
        template="""
请严格按照以下要求生成对话标题:
1. 标题长度8-15个字纯中文无标点、无特殊符号、无换行
2. 标题内容:基于完整对话,精准概括核心主题(兼顾用户需求和助手回复)
3. 标题风格:自然口语化,符合中文表达习惯,不冗余
完整对话内容:
{full_conversation}
仅输出标题,不要输出任何额外解释、说明或标点符号。
"""
    )
    # LCEL pipe: prompt -> title model (non-streaming, low temperature).
    title_chain = title_prompt | title_llm
    response = title_chain.invoke({"full_conversation": full_conversation})
    return response
# Manual smoke test: generate a title for a trivial conversation.
if __name__ == '__main__':
    print(conversation_title("你好"))

View File

@@ -0,0 +1,191 @@
import time
import asyncio
from typing import List, Dict, Any
from urllib.parse import urlparse
from pathlib import Path
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from langchain_core.tools import tool
# ─────────────────────────────────────
# 路径配置
# ─────────────────────────────────────
# Resolve the save directory relative to this module so crawled files land
# in the DeepAgents workspace regardless of the working directory.
TOOL_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = TOOL_DIR.parent
# DeepAgents-recommended layout: <project>/agent_workspace/raw_data
SAVE_DIR = PROJECT_ROOT / "agent_workspace" / "raw_data"
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# Bug fix: the original f-string nested double quotes inside double quotes
# (f"...{str(X / "agent_workspace")}") — a SyntaxError on Python < 3.12.
print(f"tool save : {str(PROJECT_ROOT / 'agent_workspace')}")
# ─────────────────────────────────────
# Browser configuration
# ─────────────────────────────────────
# Headless Chromium with a desktop UA string to reduce bot blocking.
browser_config = BrowserConfig(
    headless=True,
    verbose=False,
    java_script_enabled=True,
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0 Safari/537.36"
    ),
)
# Per-run crawl settings: always fetch fresh (bypass cache) and strip
# navigation/boilerplate tags before markdown conversion.
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=5,
    excluded_tags=["script", "style", "nav", "footer"],
    remove_overlay_elements=True,
    process_iframes=True,
)
# ─────────────────────────────────────
# URL → 文件名
# ─────────────────────────────────────
def build_filename(url: str) -> str:
    """Derive a unique, filesystem-safe markdown filename from a URL."""
    parts = urlparse(url)
    host = parts.netloc.replace("www.", "").replace(".", "_")
    slug = parts.path.strip("/").replace("/", "_")[:50] or "index"
    stamp = int(time.time())
    nonce = uuid.uuid4().hex[:6]
    return f"{stamp}_{nonce}_{host}_{slug}.md"
# ─────────────────────────────────────
# 单个 URL 抓取
# ─────────────────────────────────────
async def crawl_one(crawler, url: str, sem: asyncio.Semaphore) -> Dict[str, Any]:
    """Fetch one URL and persist its markdown to SAVE_DIR.

    Returns {"url", "success", "file"} on success or
    {"url", "success", "error"} on failure. Pages yielding fewer than 500
    markdown characters are rejected as likely error/boilerplate pages.
    """
    async with sem:  # bound concurrency across the whole batch
        try:
            result = await crawler.arun(url=url, config=run_config)
            if not result.success:
                return {
                    "url": url,
                    "success": False,
                    "error": f"status={getattr(result, 'status_code', 'unknown')}"
                }
            markdown = result.markdown or ""
            if len(markdown) < 500:
                return {
                    "url": url,
                    "success": False,
                    "error": "content too short"
                }
            filename = build_filename(url)
            filepath = SAVE_DIR / filename
            # Prepend provenance comments so downstream extraction can
            # recover the original source URL from the saved file.
            header = (
                f"<!-- Source: {url} -->\n"
                f"<!-- Saved: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n\n"
            )
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(header + markdown)
            return {
                "url": url,
                "success": True,
                "file": str(filepath)
            }
        except Exception as e:
            # Best-effort batch crawl: report the failure, never raise.
            return {
                "url": url,
                "success": False,
                "error": str(e)
            }
# ─────────────────────────────────────
# Async 主逻辑
# ─────────────────────────────────────
async def _crawl4ai_batch(urls: List[str]) -> Dict[str, Any]:
    """Crawl all URLs concurrently (max 5 in flight) and save results.

    Returns {"saved_files", "count", "summary"} or {"error": ...} when the
    input list is empty.
    """
    urls = list(set(urls))  # dedupe; NOTE: set() does not preserve caller order
    if not urls:
        return {"error": "no urls"}
    sem = asyncio.Semaphore(5)  # concurrency limit
    async with AsyncWebCrawler(config=browser_config) as crawler:
        tasks = [
            crawl_one(crawler, url, sem)
            for url in urls
        ]
        results = await asyncio.gather(*tasks)
    success_files = []
    summary = []
    for r in results:
        if r["success"]:
            success_files.append(r["file"])
            summary.append(f"{r['url']}")
        else:
            summary.append(f"{r['url']} ({r['error']})")
    return {
        "saved_files": success_files,
        "count": len(success_files),
        "summary": summary,
    }
# ─────────────────────────────────────
# Tool同步
# ─────────────────────────────────────
@tool
def crawl4ai_batch(urls: List[str]) -> str:
    """
    Batch crawl webpages and save their content as markdown files.
    Args:
        urls: List of webpage URLs to crawl.
    Returns:
        A summary of crawling results and saved file paths.
    """
    # NOTE(review): asyncio.run() raises if an event loop is already running
    # in this thread. This is safe only because LangChain executes sync tools
    # on a worker thread — confirm for this deployment.
    try:
        result = asyncio.run(_crawl4ai_batch(urls))
        if "error" in result:
            return f"❌ Error: {result['error']}"
        output = [
            "### 批量抓取完成 ###",
            f"成功保存文件: {result['count']}",
            f"保存目录: {SAVE_DIR}",
            "",
            "抓取详情:"
        ]
        output.extend(result["summary"])
        if result["saved_files"]:
            output.append("\n可读取文件:")
            output.extend(result["saved_files"])
        return "\n".join(output)
    except Exception as e:
        # Best-effort tool: surface the failure as text for the agent.
        return f"🚨 爬虫系统异常: {str(e)}"

View File

@@ -0,0 +1,94 @@
import json
import logging
import uuid
from google.oauth2 import service_account
from langchain_core.tools import tool
from google import genai
from google.genai.types import GenerateContentConfig, Modality
from minio import Minio
from src.core.config import settings
from src.server.utils.new_oss_client import oss_upload_image
logger = logging.getLogger(__name__)
# Initialize global credentials and clients at import time.
# NOTE(review): `settings.GOOGLE_GENAI_USE_VERTEXAI` is used here as a
# service-account *file path* — the name suggests a boolean flag; confirm
# this setting actually holds the credentials JSON path.
creds = service_account.Credentials.from_service_account_file(
    settings.GOOGLE_GENAI_USE_VERTEXAI,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
# MinIO client for uploading generated sketches to object storage.
minio_client = Minio(settings.MINIO_URL, access_key=settings.MINIO_ACCESS, secret_key=settings.MINIO_SECRET, secure=settings.MINIO_SECURE)
# Gemini client routed through Vertex AI.
client = genai.Client(
    credentials=creds,
    project=settings.GOOGLE_CLOUD_PROJECT,
    location=settings.GOOGLE_CLOUD_LOCATION,
    vertexai=True
)
@tool
async def generate_furniture(prompt: str) -> str:
    """
    使用 Gemini 图像生成模型根据详细的英文提示词生成家具设计草图。
    """
    # Returns a JSON string {"tool_name", "data", "tool_status"} where `data`
    # is "<bucket>/<object_name>" on success or an error description.
    # NOTE(review): client.models.generate_content is a blocking call inside
    # an async tool — it can stall the event loop; consider asyncio.to_thread.
    print(f"\n[系统日志] 正在调用 Nano Banana (Gemini Image Gen) ...")
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=(f"Generate a professional furniture design sketch: {prompt}"),
            config=GenerateContentConfig(
                response_modalities=[Modality.TEXT, Modality.IMAGE],
            ),
        )
        # Take the first inline image payload from the response parts.
        image_bytes = None
        for part in response.candidates[0].content.parts:
            if part.inline_data:
                image_bytes = part.inline_data.data
                break
        if not image_bytes:
            return "未能生成图像数据。"
        object_name = f"furniture/sketches/{uuid.uuid4()}.png"
        bucket = "fida-test"  # replace with your bucket name
        # Upload the raw PNG bytes to object storage.
        upload_res = oss_upload_image(
            oss_client=minio_client,
            bucket=bucket,
            object_name=object_name,
            image_bytes=image_bytes
        )
        if upload_res:
            # Build the access path. For a private bucket a presigned URL
            # (presigned_get_object) would be needed; this is just
            # "<bucket>/<object>".
            image_url = f"{bucket}/{object_name}"
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": image_url,
                    "tool_status": "success"
                },
                ensure_ascii=False
            )
        else:
            return json.dumps(
                {
                    "tool_name": "generate_furniture",
                    "data": "图片生成成功,但上传至存储服务器失败。",
                    "tool_status": "error"
                },
                ensure_ascii=False
            )
    except Exception as e:
        logger.warning(e)
        return json.dumps(
            {
                "tool_name": "generate_furniture",
                "data": f"绘图流程异常",  # NOTE: f-prefix has no placeholders
                "tool_status": "error"
            },
            ensure_ascii=False
        )

View File

@@ -0,0 +1,151 @@
import os
import json
import re
from typing import Optional, List, Dict
from langchain_qwq import ChatQwen
from langgraph.config import get_stream_writer
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage
from src.core.config import settings
# =========================
# LLM initialization
# =========================
# Dedicated Qwen instance for report writing: low temperature for more
# consistent long-form output, thinking disabled for plain token streaming.
llm = ChatQwen(
    enable_thinking=False,
    model="qwen3.5-flash",
    temperature=0.2,
    max_tokens=3_000,
    timeout=None,
    max_retries=2,
    api_key=settings.QWEN_API_KEY)
# =========================
# Tool input schema
# =========================
# Pydantic argument schema for the report_generator tool; the Field
# descriptions are surfaced to the calling LLM.
class ReportInput(BaseModel):
    report_topic: str = Field(
        ...,
        description="Main topic of the report, e.g. '2026 Sofa Design Trends'"
    )
    structured_data: List[Dict] = Field(
        ...,
        description="Structured retrieval result items"
    )
    language: Optional[str] = Field(
        default="English",
        description="Output language"
    )
# =========================
# LangGraph Tool
# =========================
@tool("report_generator", args_schema=ReportInput)
async def report_generator(
    report_topic: str,
    structured_data: List[Dict],
    language: str = "English"
) -> str:
    """
    Generate a professional design/market report
    directly from structured retrieval results.
    """
    # Streams progress via the LangGraph stream writer (report_start /
    # report_delta / report_stop / report_complete events) and returns the
    # full markdown text as the tool result.
    # Fixes vs. previous revision:
    # - return annotation corrected from `dict` to `str` (a str is always returned);
    # - the final saved-file notice now interpolates the actual file path
    #   instead of a literal placeholder.
    writer = get_stream_writer()
    if not structured_data:
        error_msg = "Error: No structured data provided."
        writer({"type": "report_error", "message": error_msg})
        return error_msg
    collected_data_str = json.dumps(
        structured_data,
        ensure_ascii=False,
        indent=2
    )
    # Prompt assembly -------------------------------------------------------
    system_prompt = f"""
You are a professional design trend analyst.
Generate a long, structured Markdown report.
REQUIREMENTS:
1. Follow MECE principle.
2. Embed images ONLY if they start with https://
using: ![alt](url)
3. Insert images inline.
4. Every key insight must cite source:
[Website Name](url)
5. Use Markdown headings.
6. Start directly with title.
7. Be detailed and analytical.
Output Language: {language}
"""
    user_prompt = f"""
Topic: {report_topic}
Input Data:
{collected_data_str}
"""
    # Stream the LLM output token by token -----------------------------------
    writer({"type": "report_start", "topic": report_topic, "language": language})
    full_report = ""
    try:
        # Detach inherited callbacks so deltas are only emitted via `writer`.
        report_llm = llm.with_config(
            callbacks=[]
        )
        async for chunk in report_llm.astream(
            [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]
        ):
            if chunk.content:
                delta = chunk.content
                full_report += delta
                writer({"type": "report_delta", "delta": delta})  # push to frontend in real time
        writer({"type": "report_stop", "topic": report_topic, "language": language})
    except Exception as e:
        error_msg = f"LLM generation failed: {str(e)}"
        writer({"type": "report_error", "message": error_msg})
        return error_msg
    report_content = full_report.strip()
    # Persist the report ------------------------------------------------------
    output_dir = "workspace/reports"
    os.makedirs(output_dir, exist_ok=True)
    # Strip characters that are illegal in filenames on common platforms.
    safe_topic = re.sub(r'[\\/*?:"<>|]', "", report_topic.replace(" ", "_"))
    filename = f"{output_dir}/{safe_topic}.md"
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(report_content)
        writer({"type": "report_complete", "file_path": filename})
    except Exception as e:
        # Saving is best-effort; the report text is still returned below.
        writer({"type": "report_save_warning", "message": str(e)})
    # Return the full content as the tool result (body already streamed as deltas).
    return report_content + f"\n\n✅ Report saved to: {filename}"

View File

@@ -0,0 +1,67 @@
import asyncio
import json
from datetime import datetime
from typing import List, Set, Optional
from langchain_core.tools import tool
from tavily import TavilyClient
from src.core.config import settings
# Config load: Tavily API key comes from application settings.
TAVILY_API_KEY = settings.TAVILY_API_KEY
@tool
async def topic_research(topic: list[str], max_urls: int = 5) -> str:
    """
    深度调研工具。该工具会利用 Tavily 搜索引擎针对特定主题进行多维度搜索。
    它会自动生成针对性的搜索词(包含年份和趋势),并返回去重后的高质量 URL 列表。
    """
    # Returns a JSON-encoded list of at most `max_urls` deduplicated URLs.
    if not TAVILY_API_KEY:
        return "❌ 错误: 未配置 TAVILY_API_KEY。"
    client = TavilyClient(api_key=TAVILY_API_KEY)
    # Run one Tavily search per query string, in parallel.
    async def perform_search(q: str):
        # The Tavily SDK is synchronous; run it on a worker thread.
        def sync_search():
            try:
                response = client.search(
                    query=q,
                    search_depth="advanced",
                    max_results=5,
                    include_answer=False
                )
                return response.get('results', [])
            except Exception as e:
                # Best-effort: a failed query contributes no results.
                print(f"Search error: {e}")
                return []
        return await asyncio.to_thread(sync_search)
    search_tasks = [perform_search(q) for q in topic]
    search_results_list = await asyncio.gather(*search_tasks)
    # Deduplicate and filter out obvious non-HTML resources.
    seen_urls: Set[str] = set()
    final_urls = []
    # File extensions that are not crawlable content pages.
    skip_extensions = ('.pdf', '.jpg', '.png', '.zip', '.exe')
    for results in search_results_list:
        for item in results:
            url = item.get('url')
            if url and url not in seen_urls:
                if not url.lower().endswith(skip_extensions):
                    seen_urls.add(url)
                    final_urls.append(url)
    # Truncate to the requested number of URLs.
    selected_urls = final_urls[:max_urls]
    # Return JSON so the agent can feed the list straight into crawl4ai_batch.
    return json.dumps(selected_urls, ensure_ascii=False)

View File

@@ -0,0 +1,225 @@
import os
import re
import json
from datetime import datetime
from typing import List, Dict, Optional
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.documents import Document
# RAG
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
# =========================
# Global models (singletons) — loaded once at import time
# =========================
# Sentence-embedding model used to build the per-call FAISS index.
_EMBEDDING_MODEL = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Cross-encoder used to re-rank vector-search hits by query relevance.
_RERANK_MODEL = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)
# Pydantic argument schema for the structured_retrieval tool.
class StructuredRetrievalInput(BaseModel):
    file_paths: List[str] = Field(..., description="List of local markdown file paths.")
    query: str = Field(..., description="Extraction query")
    source_url: Optional[str] = Field(None, description="Optional global source URL")
@tool("structured_retrieval", args_schema=StructuredRetrievalInput)
def structured_retrieval(
    file_paths: List[str],
    query: str,
    source_url: Optional[str] = None
) -> Dict:
    """
    Batch structured extraction from markdown files.
    - Performs vector search + re-ranking
    - Saves extracted structured data as JSON file to disk
    - Returns ONLY summary (status, count, file path)
    """
    # -- 1. Load files and split them into heading-based sections ----------
    all_docs_pool: List[Document] = []
    for path in file_paths:
        if not os.path.exists(path) or not path.endswith((".md", ".markdown")):
            continue  # silently skip missing / non-markdown paths
        file_name = os.path.basename(path)
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        # Prefer the explicit argument, then the "<!-- Source: -->" header
        # written by crawl_one, then "unknown".
        current_source = source_url or _extract_source_from_md(content) or "unknown"
        sections = _split_markdown_by_headers(content)
        for sec in sections:
            all_docs_pool.append(
                Document(
                    page_content=sec,
                    metadata={"source_url": current_source, "file_name": file_name}
                )
            )
    if not all_docs_pool:
        return {"status": "no_documents_found", "items_count": 0, "json_path": None}
    # -- 2. Vector search ---------------------------------------------------
    # A throwaway FAISS index per call; k=200 casts a wide net before re-ranking.
    vector_store = FAISS.from_documents(all_docs_pool, _EMBEDDING_MODEL)
    retrieved = vector_store.similarity_search(query, k=200)
    # -- 3. Extract structured snippets -------------------------------------
    structured_items = []
    for doc in retrieved:
        text = doc.page_content.strip()
        if len(text) < 30:
            continue  # drop fragments too short to be meaningful
        # Unique markdown image URLs embedded in the snippet.
        images = list(set(re.findall(r"!\[.*?\]\((.*?)\)", text)))
        structured_items.append(
            {
                "text": text,
                "images": images,
                "source_url": doc.metadata.get("source_url"),
                "file_name": doc.metadata.get("file_name")
            }
        )
    # -- 4. Re-rank with the cross-encoder, keep the top 50 ------------------
    if structured_items:
        # Dedupe by text before scoring to avoid wasted cross-encoder calls.
        unique_items = {item["text"]: item for item in structured_items}.values()
        pairs = [[query, item["text"]] for item in unique_items]
        scores = _RERANK_MODEL.predict(pairs)
        sorted_items = sorted(
            zip(scores, unique_items),
            key=lambda x: x[0],
            reverse=True
        )
        top_items = [item for _, item in sorted_items[:50]]
    else:
        top_items = []
    # -- 5. Write the extraction to a JSON file ------------------------------
    if not top_items:
        return {"status": "no_relevant_content", "items_count": 0, "json_path": None}
    # Build a meaningful file name from the query and a timestamp.
    safe_query = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fa5]', '_', query)[:40]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f"extracted_{safe_query}_{timestamp}.json"
    # Output directory sits next to the crawl output (raw_data/../extracted).
    output_dir = os.path.join(os.path.dirname(file_paths[0]), "..", "extracted")
    os.makedirs(output_dir, exist_ok=True)
    json_path = os.path.join(output_dir, json_filename)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "query": query,
                "extracted_at": timestamp,
                "item_count": len(top_items),
                "items": top_items
            },
            f,
            ensure_ascii=False,
            indent=2
        )
    # -- 6. Return only a compact summary ------------------------------------
    return {
        "status": "success",
        "items_count": len(top_items),
        "json_path": json_path,
        "summary": f"已提取 {len(top_items)} 個高相關片段,儲存於 {json_path}"
    }
def _extract_source_from_md(content: str) -> Optional[str]:
match = re.search(r"<!--\s*Source:\s*(.*?)\s*-->", content)
return match.group(1).strip() if match else None
# =========================
# Markdown Header Split
# =========================
def _split_markdown_by_headers(
content: str,
max_chars: int = 2000,
overlap: int = 150,
):
header_re = re.compile(
r'^(#{1,6})\s+(.+?)\s*$',
re.MULTILINE
)
matches = list(header_re.finditer(content))
if not matches:
return _chunk_text(content, max_chars, overlap)
sections = []
for i, m in enumerate(matches):
start = m.start()
end = (
matches[i + 1].start()
if i + 1 < len(matches)
else len(content)
)
block = content[start:end].strip()
if block:
sections.append(block)
final_sections = []
for s in sections:
if len(s) > max_chars:
final_sections.extend(
_chunk_text(s, max_chars, overlap)
)
else:
final_sections.append(s)
return final_sections
def _chunk_text(
text: str,
max_chars: int = 2000,
overlap: int = 150
):
text = text.strip()
if len(text) <= max_chars:
return [text]
chunks = []
start = 0
while start < len(text):
end = min(len(text), start + max_chars)
chunk = text[start:end].strip()
if chunk:
chunks.append(chunk)
if end == len(text):
break
start = max(0, end - overlap)
return chunks

View File

@@ -0,0 +1,57 @@
from datetime import datetime
from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from pymongo import MongoClient
from src.core.config import MONGO_URI
# Shared MongoDB handles for user-profile storage (db "report_agent",
# collection "user_profiles"); documents are keyed by thread_id.
client = MongoClient(MONGO_URI)
db = client["report_agent"]
collection = db["user_profiles"]
@tool
def query_report_profile(config: RunnableConfig, ) -> dict:
    """
    查询用户报告画像
    """
    # Profiles are keyed by the conversation thread_id from the run config.
    thread_id = config['configurable']['thread_id']
    doc = collection.find_one({"thread_id": thread_id})
    if not doc:
        return {"profile": {}}
    doc.pop("_id", None)  # ObjectId is not JSON-serializable; drop it
    return doc
@tool
def update_report_profile(config: RunnableConfig, profile: dict) -> dict:
    """
    更新用户画像信息
    """
    # Upsert: replaces the stored profile wholesale — the calling agent is
    # expected to merge old and new fields before invoking this tool.
    thread_id = config['configurable']['thread_id']
    collection.update_one(
        {"thread_id": thread_id},
        {
            "$set": {
                "profile": profile
            }
        },
        upsert=True
    )
    return {"status": "success", "profile": profile}
@tool
def check_profile_complete(profile: dict) -> dict:
    """
    判断画像是否完整
    """
    # Fields required before a report can be generated.
    required = ["style", "room_type", "budget"]
    # Robustness fix: a field that is present but empty ("" / None) is still
    # missing for report purposes; the old `f not in profile` check would
    # report such a profile as complete.
    missing = [f for f in required if not profile.get(f)]
    return {
        "complete": len(missing) == 0,
        "missing_fields": missing
    }