Merge remote-tracking branch 'origin/dev-ltx' into develop
All checks were successful
AiDA Python develop branch build-and-deploy / scheduled_deploy (push): has been skipped

# Conflicts:
#	app/api/api_recommendation.py
#	app/service/design_fast/utils/organize.py
Author: litianxiang
Date: 2025-12-30 10:19:19 +08:00
16 changed files with 2602 additions and 429 deletions

View File

@@ -9,7 +9,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
 from fastapi import HTTPException, APIRouter
-from app.service.recommend.service import load_resources, matrix_data
 import pymysql
 from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX
 from minio import Minio

View File

@@ -0,0 +1,116 @@
import logging
import sys
from typing import Optional
from fastapi import APIRouter, HTTPException, Query
from concurrent.futures import ThreadPoolExecutor
import threading
from app.schemas.response_template import ResponseModel
from app.service.recommendation_system.import_sys_sketch_to_milvus import main as import_main

logger = logging.getLogger()
router = APIRouter()

# Run the long-lived import in a thread pool
executor = ThreadPoolExecutor(max_workers=1)
# Tracks task state
task_status = {"running": False}


def run_import_task(batch_size: int, retry_times: int, limit: Optional[int], offset: int, skip_create_collection: bool):
    """Run the import task in a background thread."""
    original_argv = None
    try:
        task_status["running"] = True
        # Save the original sys.argv
        original_argv = sys.argv.copy()
        # Simulate command-line arguments
        sys.argv = [
            "import_sys_sketch_to_milvus.py",
            "--batch-size", str(batch_size),
            "--retry-times", str(retry_times),
        ]
        if limit is not None:
            sys.argv.extend(["--limit", str(limit)])
        if offset > 0:
            sys.argv.extend(["--offset", str(offset)])
        if skip_create_collection:
            sys.argv.append("--skip-create-collection")
        import_main()
        task_status["running"] = False
        logger.info("Import task finished")
    except Exception as e:
        task_status["running"] = False
        logger.error(f"Import task failed: {e}", exc_info=True)
        raise
    finally:
        # Restore the original sys.argv
        if original_argv is not None:
            sys.argv = original_argv


@router.post("/import-sys-sketch", response_model=ResponseModel)
async def import_sys_sketch(
    batch_size: int = Query(1000, description="Batch size (default: 1000)"),
    retry_times: int = Query(3, description="Retry count on failure (default: 3)"),
    limit: Optional[int] = Query(None, description="Cap on the number of records to process (for testing; default: unlimited)"),
    offset: int = Query(0, description="Starting offset (default: 0)"),
    skip_create_collection: bool = Query(False, description="Skip collection creation (if the collection already exists)"),
):
    """
    Import system-sketch vectors from t_sys_file into Milvus.

    The endpoint launches the import asynchronously; the task runs in the background.
    """
    try:
        # Reject if a task is already running
        if task_status["running"]:
            raise HTTPException(
                status_code=409,
                detail="An import task is already running; retry after it completes"
            )
        # Execute in a background thread
        executor.submit(
            run_import_task,
            batch_size,
            retry_times,
            limit,
            offset,
            skip_create_collection
        )
        return ResponseModel(
            code=200,
            msg="Import task started; running in the background",
            data={
                "status": "started",
                "batch_size": batch_size,
                "retry_times": retry_times,
                "limit": limit,
                "offset": offset,
                "skip_create_collection": skip_create_collection
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to start import task: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to start import task: {str(e)}")


@router.get("/import-sys-sketch/status", response_model=ResponseModel)
async def get_import_status():
    """Get the status of the import task."""
    return ResponseModel(
        code=200,
        msg="OK",
        data={
            "running": task_status["running"]
        }
    )
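Editor's note: the two endpoints above pair a fire-and-forget POST with a status poll. A minimal client sketch of that flow; the host and port below are assumptions, while the `/api` prefix comes from `api_route.py` later in this diff:

```python
import time
import requests  # assumed available in the client environment

BASE = "http://localhost:8000/api"  # hypothetical host/port

# Start the import with a small smoke-test batch
resp = requests.post(f"{BASE}/import-sys-sketch", params={"batch_size": 200, "limit": 1000})
print(resp.json())  # expect data.status == "started", or HTTP 409 if a task is already running

# Poll the status endpoint until the background thread finishes
while requests.get(f"{BASE}/import-sys-sketch/status").json()["data"]["running"]:
    time.sleep(5)
```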

app/api/api_precompute.py (new file, 85 lines added)
View File

@@ -0,0 +1,85 @@
import logging
from fastapi import APIRouter, HTTPException
from concurrent.futures import ThreadPoolExecutor
from app.schemas.response_template import ResponseModel
from app.service.recommendation_system.precompute import run_precompute

logger = logging.getLogger()
router = APIRouter()

# Run the long-lived precompute in a thread pool
executor = ThreadPoolExecutor(max_workers=1)
# Tracks task state
task_status = {"running": False}


def run_precompute_task():
    """Run the precompute task in a background thread."""
    try:
        task_status["running"] = True
        logger.info("Starting precompute task...")
        run_precompute()
        task_status["running"] = False
        logger.info("Precompute task finished")
    except Exception as e:
        task_status["running"] = False
        logger.error(f"Precompute task failed: {e}", exc_info=True)
        raise


@router.post("/precompute", response_model=ResponseModel)
async def precompute():
    """
    Run the precompute task.

    The endpoint executes the precompute asynchronously; it covers:
    1. optimizing the database schema
    2. migrating historical data
    3. generating initial user-preference vectors
    The task runs in the background.
    """
    try:
        # Reject if a task is already running
        if task_status["running"]:
            raise HTTPException(
                status_code=409,
                detail="A precompute task is already running; retry after it completes"
            )
        # Execute in a background thread
        executor.submit(run_precompute_task)
        return ResponseModel(
            code=200,
            msg="Precompute task started; running in the background",
            data={
                "status": "started",
                "tasks": [
                    "optimize database schema",
                    "migrate historical data",
                    "generate initial user-preference vectors"
                ]
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to start precompute task: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to start precompute task: {str(e)}")


@router.get("/precompute/status", response_model=ResponseModel)
async def get_precompute_status():
    """Get the status of the precompute task."""
    return ResponseModel(
        code=200,
        msg="OK",
        data={
            "running": task_status["running"]
        }
    )

View File

@@ -1,207 +1,175 @@
 import io
 import logging
 import sys
-import time
-from typing import List
-import os
-import json
-import math
-import random
-import numpy as np
+from typing import List, Optional
+from fastapi import HTTPException, APIRouter, Query
 from apscheduler.schedulers.background import BackgroundScheduler
-from apscheduler.triggers.cron import CronTrigger
-from fastapi import HTTPException, APIRouter
-from app.service.recommend.service import load_resources, matrix_data
+from app.service.recommendation_system.recommendation_api import get_recommendations as get_new_recommendations
+from app.service.recommendation_system.incremental_listener import start_background_listener
+from app.service.recommendation_system.milvus_client import create_collection
 sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
 logger = logging.getLogger()
 router = APIRouter()
-@router.on_event("startup")
-async def startup_event():
-    # initial load
-    load_resources()
-    # schedule the daily refresh
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(
-        load_resources,
-        trigger=CronTrigger(hour=0, minute=30),
-        name="daily resource refresh"
-    )
-    scheduler.start()
-    logger.info("Scheduled job started")
-
-
-def softmax(scores):
-    max_score = max(scores)
-    exp_scores = [math.exp(s - max_score) for s in scores]
-    sum_exp = sum(exp_scores)
-    return [s / sum_exp for s in exp_scores]
-
-
-# def get_random_recommendations(category: str, num: int) -> List[str]:
-#     """Recommend from the preloaded heat vectors (cold start)."""
-#     try:
-#         heat_data = matrix_data.get("heat_data", {})
-#
-#         if category not in heat_data:
-#             raise ValueError(f"Heat data has no category {category}; falling back to random recommendation")
-#
-#         heat_dict = heat_data[category]  # {url: score}
-#         urls = list(heat_dict.keys())
-#         scores = list(heat_dict.values())
-#
-#         if not urls:
-#             raise ValueError("No heat records for this category; falling back to random recommendation")
-#
-#         probs = softmax(scores)
-#         sample_size = min(num, len(urls))
-#         sampled_urls = random.choices(urls, weights=probs, k=sample_size)
-#
-#         return sampled_urls
-#     except Exception as e:
-#         # fallback: fully random recommendation
-#         all_iids = list(matrix_data["iid_to_sketch"].keys())
-#         category_iids = matrix_data["category_to_iids"].get(category, all_iids)
-#         sample_size = min(num, len(category_iids))
-#         sampled = np.random.choice(category_iids, size=sample_size, replace=False)
-#         return [matrix_data["iid_to_sketch"][iid] for iid in sampled]
-
-
-def get_random_recommendations(category: str, num: int) -> List[str]:
-    """Random recommendation across all categories."""
-    all_iids = list(matrix_data["iid_to_sketch"].keys())
-    # prefer the current category
-    category_iids = matrix_data["category_to_iids"].get(category, all_iids)
-    # never exceed the available count
-    sample_size = min(num, len(category_iids))
-    sampled = np.random.choice(category_iids, size=sample_size, replace=False)
-    return [matrix_data["iid_to_sketch"][iid] for iid in sampled]
-
-
-@router.get("/recommend/{user_id}/{category}/{num_recommendations}/{brand_id}/{brand_scale}", response_model=List[str])
-async def get_recommendations(user_id: int, category: str, brand_id: int, brand_scale: float, num_recommendations: int = 10):
-    """
-    :param user_id: 4
-    :param category: female_skirt
-    :param num_recommendations: 1
-    :return:
-        [
-            "aida-sys-image/images/female/skirt/903000017.jpg"
-        ]
-    """
-    try:
-        logger.info(f"user_id:{user_id}-----category:{category}-----brand_id:{brand_id}-----brand_scale:{brand_scale}-----num_recommendations:{num_recommendations}")
-        start_time = time.time()
-        cache_key = (user_id, category)
-        # === added: user existence check ===
-        user_exists_inter = user_id in matrix_data["user_index_interaction"]
-        user_exists_feat = user_id in matrix_data["user_index_feature"]
-        # if the user is missing from either matrix, fall back to random recommendation
-        if not (user_exists_inter and user_exists_feat):
-            logger.info(f"User {user_id} has incomplete data; falling back to random recommendation")
-            return get_random_recommendations(category, num_recommendations)
-        # check the cache
-        if cache_key in matrix_data["cached_scores"]:
-            processed_inter, processed_feat = matrix_data["cached_scores"][cache_key]
-            valid_sketch_idxs_inter = matrix_data["cached_valid_idxs"][cache_key]
-        else:
-            # on-the-fly computation (same as the original code)
-            user_idx_inter = matrix_data["user_index_interaction"].get(user_id)
-            user_idx_feature = matrix_data["user_index_feature"].get(user_id)
-            category_iids = matrix_data["category_to_iids"].get(category, [])
-            valid_sketch_idxs_inter = [
-                idx for iid, idx in matrix_data["sketch_index_interaction"].items()
-                if iid in category_iids
-            ]
-            # interaction scores
-            raw_inter_scores = []
-            if user_idx_inter is not None and valid_sketch_idxs_inter:
-                raw_inter_scores = matrix_data["interaction_matrix"][user_idx_inter, valid_sketch_idxs_inter]
-            processed_inter = raw_inter_scores * 0.7
-            # feature scores
-            valid_sketch_idxs_feature = [
-                idx for iid, idx in matrix_data["sketch_index_feature"].items()
-                if iid in category_iids
-            ]
-            raw_feat_scores = []
-            if user_idx_feature is not None and valid_sketch_idxs_feature:
-                raw_feat_scores = matrix_data["feature_matrix"][user_idx_feature, valid_sketch_idxs_feature]
-                raw_feat_scores = (raw_feat_scores - np.min(raw_feat_scores)) / (
-                        np.max(raw_feat_scores) - np.min(raw_feat_scores) + 1e-8)
-                processed_feat = raw_feat_scores
-            else:
-                processed_feat = np.array([])
-            # update the cache
-            matrix_data["cached_scores"][cache_key] = (processed_inter, processed_feat)
-            matrix_data["cached_valid_idxs"][cache_key] = valid_sketch_idxs_inter
-        # combine scores
-        if brand_id is not None:
-            brand_idx_feature = matrix_data["brand_index_map"].get(brand_id)
-            brand_feat_valid = (
-                matrix_data["brand_feature_matrix"].size > 0 and  # matrix is non-empty
-                brand_idx_feature is not None and
-                valid_sketch_idxs_feature  # usable indices exist
-            )
-            if brand_feat_valid:
-                raw_brand_feat_scores = matrix_data["brand_feature_matrix"][
-                    brand_idx_feature, valid_sketch_idxs_feature
-                ]
-                raw_brand_feat_scores = (raw_brand_feat_scores - np.min(raw_brand_feat_scores)) / (
-                        np.max(raw_brand_feat_scores) - np.min(raw_brand_feat_scores) + 1e-8
-                )
-                processed_brand_feat = raw_brand_feat_scores
-                # if processed_feat is empty, replace it with zeros to avoid a shape mismatch
-                if processed_feat.size == 0:
-                    processed_feat = np.zeros_like(processed_brand_feat)
-                final_scores = processed_inter + 0.3 * (
-                        (1 - brand_scale) * processed_feat + brand_scale * processed_brand_feat
-                )
-            else:
-                # brand information is unavailable
-                final_scores = processed_inter + 0.3 * processed_feat if processed_feat.size > 0 else processed_inter
-        else:
-            final_scores = processed_inter + 0.3 * processed_feat if processed_feat.size > 0 else processed_inter
-        valid_sketch_idxs = matrix_data["cached_valid_idxs"][cache_key]
-        # probabilistic sampling
-        scores = np.array(final_scores)
-
-        # calibrated probability transform (temperature-controlled softmax)
-        def calibrated_softmax(scores, temperature=1.0):
-            scores = scores / temperature
-            scale = scores - max(scores)
-            exps = np.exp(scale)
-            return exps / np.sum(exps)
-
-        probs = calibrated_softmax(scores, 0.09)
-        chosen_indices = np.random.choice(
-            len(valid_sketch_idxs),
-            size=min(num_recommendations, len(valid_sketch_idxs)),
-            p=probs,
-            replace=False
-        )
-        recommendations = [matrix_data["iid_to_sketch"][valid_sketch_idxs[idx]] for idx in chosen_indices]
-        logger.info(f"Recommendation generated in {time.time() - start_time:.2f}s")
-        return recommendations
-    except Exception as e:
-        logger.error(f"Recommendation failed: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
+# ========== Legacy recommendation endpoint (npy-matrix based, deprecated) ==========
+# @router.get("/recommend/{user_id}/{category}/{num_recommendations}/{brand_id}/{brand_scale}", response_model=List[str])
+# async def get_recommendations(user_id: int, category: str, brand_id: int, brand_scale: float, num_recommendations: int = 10):
+#     """
+#     :param user_id: 4
+#     :param category: female_skirt
+#     :param num_recommendations: 1
+#     :return:
+#         [
+#             "aida-sys-image/images/female/skirt/903000017.jpg"
+#         ]
+#     """
+#     try:
+#         start_time = time.time()
+#         cache_key = (user_id, category)
+#         # === added: user existence check ===
+#         user_exists_inter = user_id in matrix_data["user_index_interaction"]
+#         user_exists_feat = user_id in matrix_data["user_index_feature"]
+#
+#         # if the user is missing from either matrix, fall back to random recommendation
+#         if not (user_exists_inter and user_exists_feat):
+#             logger.info(f"User {user_id} has incomplete data; falling back to random recommendation")
+#             return get_random_recommendations(category, num_recommendations)
+#
+#         # check the cache
+#         if cache_key in matrix_data["cached_scores"]:
+#             processed_inter, processed_feat = matrix_data["cached_scores"][cache_key]
+#             valid_sketch_idxs_inter = matrix_data["cached_valid_idxs"][cache_key]
+#         else:
+#             # on-the-fly computation (same as the original code)
+#             user_idx_inter = matrix_data["user_index_interaction"].get(user_id)
+#             user_idx_feature = matrix_data["user_index_feature"].get(user_id)
+#
+#             category_iids = matrix_data["category_to_iids"].get(category, [])
+#             valid_sketch_idxs_inter = [
+#                 idx for iid, idx in matrix_data["sketch_index_interaction"].items()
+#                 if iid in category_iids
+#             ]
+#
+#             # interaction scores
+#             raw_inter_scores = []
+#             if user_idx_inter is not None and valid_sketch_idxs_inter:
+#                 raw_inter_scores = matrix_data["interaction_matrix"][user_idx_inter, valid_sketch_idxs_inter]
+#             processed_inter = raw_inter_scores * 0.7
+#
+#             # feature scores
+#             valid_sketch_idxs_feature = [
+#                 idx for iid, idx in matrix_data["sketch_index_feature"].items()
+#                 if iid in category_iids
+#             ]
+#             raw_feat_scores = []
+#             if user_idx_feature is not None and valid_sketch_idxs_feature:
+#                 raw_feat_scores = matrix_data["feature_matrix"][user_idx_feature, valid_sketch_idxs_feature]
+#                 raw_feat_scores = (raw_feat_scores - np.min(raw_feat_scores)) / (
+#                         np.max(raw_feat_scores) - np.min(raw_feat_scores) + 1e-8)
+#                 processed_feat = raw_feat_scores
+#             else:
+#                 processed_feat = np.array([])
+#
+#             # update the cache
+#             matrix_data["cached_scores"][cache_key] = (processed_inter, processed_feat)
+#             matrix_data["cached_valid_idxs"][cache_key] = valid_sketch_idxs_inter
+#
+#         # combine scores
+#         if brand_id is not None:
+#             brand_idx_feature = matrix_data["brand_index_map"].get(brand_id)
+#
+#             brand_feat_valid = (
+#                 matrix_data["brand_feature_matrix"].size > 0 and  # matrix is non-empty
+#                 brand_idx_feature is not None and
+#                 valid_sketch_idxs_feature  # usable indices exist
+#             )
+#
+#             if brand_feat_valid:
+#                 raw_brand_feat_scores = matrix_data["brand_feature_matrix"][
+#                     brand_idx_feature, valid_sketch_idxs_feature
+#                 ]
+#                 raw_brand_feat_scores = (raw_brand_feat_scores - np.min(raw_brand_feat_scores)) / (
+#                         np.max(raw_brand_feat_scores) - np.min(raw_brand_feat_scores) + 1e-8
+#                 )
+#                 processed_brand_feat = raw_brand_feat_scores
+#
+#                 # if processed_feat is empty, replace it with zeros to avoid a shape mismatch
+#                 if processed_feat.size == 0:
+#                     processed_feat = np.zeros_like(processed_brand_feat)
+#
+#                 final_scores = processed_inter + 0.3 * (
+#                         (1 - brand_scale) * processed_feat + brand_scale * processed_brand_feat
+#                 )
+#             else:
+#                 # brand information is unavailable
+#                 final_scores = processed_inter + 0.3 * processed_feat if processed_feat.size > 0 else processed_inter
+#         else:
+#             final_scores = processed_inter + 0.3 * processed_feat if processed_feat.size > 0 else processed_inter
+#
+#         valid_sketch_idxs = matrix_data["cached_valid_idxs"][cache_key]
+#
+#         # probabilistic sampling
+#         scores = np.array(final_scores)
+#
+#         # calibrated probability transform (temperature-controlled softmax)
+#         def calibrated_softmax(scores, temperature=1.0):
+#             scores = scores / temperature
+#             scale = scores - max(scores)
+#             exps = np.exp(scale)
+#             return exps / np.sum(exps)
+#
+#         probs = calibrated_softmax(scores, 0.09)
+#
+#         chosen_indices = np.random.choice(
+#             len(valid_sketch_idxs),
+#             size=min(num_recommendations, len(valid_sketch_idxs)),
+#             p=probs,
+#             replace=False
+#         )
+#         recommendations = [matrix_data["iid_to_sketch"][valid_sketch_idxs[idx]] for idx in chosen_indices]
+#
+#         logger.info(f"Recommendation generated in {time.time() - start_time:.2f}s")
+#         return recommendations
+#     except Exception as e:
+#         logger.error(f"Recommendation failed: {str(e)}", exc_info=True)
+#         raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.on_event("startup")
+async def startup_event():
+    """Initialize the incremental-listener job at startup."""
+    try:
+        # Make sure the Milvus collection exists (returns immediately if it already does)
+        try:
+            create_collection()
+        except Exception as exc:
+            logger.error("Milvus collection create/check failed; service startup continues: %s", exc, exc_info=True)
+
+        # set up the scheduled job
+        scheduler = BackgroundScheduler()
+        start_background_listener(scheduler)
+        scheduler.start()
+        logger.info("Incremental-listener job started")
+    except Exception as e:
+        logger.error(f"Failed to start incremental-listener job: {e}", exc_info=True)
+
+
+@router.get("/recommend/{user_id}/{category}", response_model=List[str])
+async def recommend(
+    user_id: int,
+    category: str,
+    style: Optional[str] = Query(
+        None,
+        description="Style (optional): if provided, same-style candidates receive a score bonus in the exploit branch",
+    ),
+):
+    """New recommendation endpoint (Milvus + Redis preference vectors)."""
+    try:
+        results = get_new_recommendations(user_id, category, style)
+        path = results[0] if results else ""
+        return [path]
+    except Exception as e:
+        logger.error("New recommendation endpoint failed [user=%s, category=%s]: %s", user_id, category, e, exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
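Editor's note: unlike the legacy five-parameter route, the new endpoint takes only user and category, with style as an optional query parameter. A minimal call sketch; the user id, style value, and host are invented, and the sample path is taken from the old docstring:

```python
import requests

BASE = "http://localhost:8000/api"  # hypothetical host/port; /api prefix from api_route.py

r = requests.get(f"{BASE}/recommend/4/female_skirt", params={"style": "casual"})
print(r.json())  # e.g. ["aida-sys-image/images/female/skirt/903000017.jpg"]
```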

View File

@@ -10,8 +10,10 @@ from app.api import api_design_pre_processing
 from app.api import api_extraction_project_info
 from app.api import api_generate_image
 from app.api import api_image2sketch
+from app.api import api_import_sys_sketch
 from app.api import api_mannequins_edit
 from app.api import api_pose_transform
+from app.api import api_precompute
 from app.api import api_prompt_generation
 from app.api import api_recommendation
 from app.api import api_super_resolution
@@ -36,3 +38,5 @@ router.include_router(api_mannequins_edit.router, tags=['api_mannequins_edit'], prefix="/api")
 router.include_router(api_pose_transform.router, tags=['api_pose_transform'], prefix="/api")
 router.include_router(api_clothing_seg.router, tags=['api_clothing_seg'], prefix="/api")
 router.include_router(api_extraction_project_info.router, tags=['api_extraction_project_info'], prefix="/api")
+router.include_router(api_import_sys_sketch.router, tags=['api_import_sys_sketch'], prefix="/api")
+router.include_router(api_precompute.router, tags=['api_precompute'], prefix="/api")

View File

@@ -82,9 +82,9 @@ MILVUS_TABLE_SEG = "seg_cache"
 DB_HOST = '18.167.251.121'  # database host
 # DB_PORT = int( 33006)
 DB_PORT = 33008  # database port
-DB_USERNAME = 'aida_con_python'  # database username
+DB_USERNAME = 'aida_con'  # database username
 DB_PASSWORD = '123456'  # database password
-DB_NAME = 'aida'  # database name
+DB_NAME = 'aida_back'  # database name
 # openai
 os.environ['SERPAPI_API_KEY'] = "a793513017b0718db7966207c31703d280d12435c982f1e67bbcbffa52e7632c"

View File

@@ -11,7 +11,6 @@ from app.api.api_route import router
 from app.core.config import settings
 from app.core.record_api_count import count_api_calls
 from app.schemas.response_template import ResponseModel
-from app.service.recommend.service import load_resources
 from logging_env import LOGGER_CONFIG_DICT
 logging.config.dictConfig(LOGGER_CONFIG_DICT)

View File

@@ -1,240 +1,240 @@
+# # Preload resources
+# import logging
+# import time
+# from collections import defaultdict
+# import os
+# import json
+# import numpy as np
+#
+# from app.core.config import DB_CONFIG, RECOMMEND_PATH_PREFIX
+#
+# logger = logging.getLogger()
+# import pymysql
+# from concurrent.futures import ThreadPoolExecutor
+#
+# HEAT_VECTOR_FILE = 'heat_vectors_data/heat_vectors.json'  # can be loaded dynamically or made configurable
+#
+# matrix_data = {
+#     "interaction_matrix": None,
+#     "feature_matrix": None,
+#     "user_index_interaction": None,
+#     "sketch_index_interaction": None,
+#     "user_index_feature": None,
+#     "sketch_index_feature": None,
+#     "iid_to_sketch": None,
+#     "category_to_iids": None,
+#     "cached_scores": {},
+#     "cached_valid_idxs": {},
+#     "category_sketch_idxs_inter": None,
+#     "category_sketch_idxs_feature": None,
+#     "user_inter_full": dict(),
+#     "user_feat_full": dict(),
+#     "brand_feature_matrix": None,
+#     "brand_index_map": None,
+#     "heat_data": {},
+# }
+#
+#
+# def load_resources():
+#     """Load all matrices and mappings, then trigger pre-caching."""
+#     try:
+#         start_time = time.time()
+#
+#         # clear caches
+#         matrix_data["cached_scores"].clear()
+#         matrix_data["cached_valid_idxs"].clear()
+#
+#         # load data
+#         sketch_to_iid = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_to_iid.npy', allow_pickle=True).item()
+#         matrix_data["iid_to_sketch"] = {v: k for k, v in sketch_to_iid.items()}
+#
+#         matrix_data["interaction_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}interaction_matrix.npy", allow_pickle=True)
+#         matrix_data["user_index_interaction"] = np.load(f"{RECOMMEND_PATH_PREFIX}user_index_interaction_matrix.npy", allow_pickle=True).item()
+#         matrix_data["sketch_index_interaction"] = np.load(f"{RECOMMEND_PATH_PREFIX}sketch_index_interaction_matrix.npy",
+#                                                           allow_pickle=True).item()
+#
+#         matrix_data["feature_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}feature_matrix.npy", allow_pickle=True)
+#
+#         brand_feature_path = f"{RECOMMEND_PATH_PREFIX}brand_feature_matrix.npy"
+#         if os.path.exists(brand_feature_path):
+#             matrix_data["brand_feature_matrix"] = np.load(brand_feature_path, allow_pickle=True)
+#         else:
+#             logger.warning("brand_feature_matrix file missing; using an empty array")
+#             matrix_data["brand_feature_matrix"] = np.array([])
+#
+#         # brand_index_map
+#         brand_index_path = f"{RECOMMEND_PATH_PREFIX}brand_index_map.npy"
+#         if os.path.exists(brand_index_path):
+#             matrix_data["brand_index_map"] = np.load(brand_index_path, allow_pickle=True).item()
+#         else:
+#             logger.warning("brand_index_map file missing; using an empty dict")
+#             matrix_data["brand_index_map"] = {}
+#
+#         matrix_data["user_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", allow_pickle=True).item()
+#
+#         matrix_data["sketch_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", allow_pickle=True).item()
+#
+#         category_to_iid_map = np.load(f"{RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", allow_pickle=True).item()
+#         matrix_data["category_to_iids"] = defaultdict(list)
+#         for iid, cat in category_to_iid_map.items():
+#             matrix_data["category_to_iids"][cat].append(iid)
+#
+#         logger.info(f"Resources loaded in {time.time() - start_time:.2f}s")
+#
+#         # trigger pre-caching
+#         precache_user_category()
+#
+#         if os.path.exists(HEAT_VECTOR_FILE):
+#             with open(HEAT_VECTOR_FILE, 'r', encoding='utf-8') as f:
+#                 heat_json = json.load(f)
+#             matrix_data["heat_data"] = heat_json.get("data", {})
+#             logger.info(f"Heat-vector data loaded: {len(matrix_data['heat_data'])} categories")
+#         else:
+#             matrix_data["heat_data"] = {}
+#
+#     except Exception as e:
+#         logger.error(f"Resource loading failed: {str(e)}")
+#         raise RuntimeError("initialization failed")
+#
+#
+# def precache_user_category():
+#     """Optimized per-user/category pre-caching (with timing stats)."""
+#     if not all([
+#         matrix_data["interaction_matrix"] is not None,
+#         matrix_data["feature_matrix"] is not None,
+#         matrix_data["user_index_interaction"] is not None
+#     ]):
+#         logger.warning("Resources not fully loaded; skipping pre-cache")
+#         return
+#
+#     start_time = time.perf_counter()
+#     time_stats = {
+#         "get_all_user_categories": 0,
+#         "process_user_category": 0,
+#         "thread_execution": 0,
+#         "cache_update": 0,
+#         "total": 0,
+#     }
+#
+#     # time the user-category fetch
+#     t1 = time.perf_counter()
+#     user_categories = get_all_user_categories()
+#     time_stats["get_all_user_categories"] = time.perf_counter() - t1
+#
+#     precached_count = 0
+#
+#     def process_user_category(user_id, categories):
+#         """Per-user cache computation (timed)."""
+#         local_cache = {}
+#         local_valid_idxs = {}
+#         t_start = time.perf_counter()
+#
+#         for category in categories:
+#             cache_key = (user_id, category)
+#             if cache_key in matrix_data["cached_scores"]:
+#                 continue
+#
+#             try:
+#                 user_idx_inter = matrix_data["user_index_interaction"].get(user_id)
+#                 user_idx_feature = matrix_data["user_index_feature"].get(user_id)
+#
+#                 # time category-IID lookups
+#                 t_iid = time.perf_counter()
+#                 category_iids = matrix_data["category_to_iids"].get(category, [])
+#                 valid_sketch_idxs_inter = [matrix_data["sketch_index_interaction"][iid]
+#                                            for iid in category_iids if iid in matrix_data["sketch_index_interaction"]]
+#                 valid_sketch_idxs_feature = [matrix_data["sketch_index_feature"][iid]
+#                                              for iid in category_iids if iid in matrix_data["sketch_index_feature"]]
+#                 time_stats["process_user_category"] += time.perf_counter() - t_iid
+#
+#                 # time the matrix computation
+#                 t_matrix = time.perf_counter()
+#                 processed_inter = np.zeros(len(valid_sketch_idxs_inter))
+#                 if user_idx_inter is not None and valid_sketch_idxs_inter:
+#                     raw_inter_scores = matrix_data["interaction_matrix"][user_idx_inter, valid_sketch_idxs_inter]
+#                     processed_inter = raw_inter_scores * 0.7
+#
+#                 processed_feat = np.zeros(len(valid_sketch_idxs_feature))
+#                 if user_idx_feature is not None and valid_sketch_idxs_feature:
+#                     raw_feat_scores = matrix_data["feature_matrix"][user_idx_feature, valid_sketch_idxs_feature]
+#                     raw_feat_scores = (raw_feat_scores - np.min(raw_feat_scores)) / (
+#                             np.max(raw_feat_scores) - np.min(raw_feat_scores) + 1e-8)
+#                     processed_feat = raw_feat_scores * 0.3
+#                 time_stats["process_user_category"] += time.perf_counter() - t_matrix
+#
+#                 if len(processed_inter) == len(processed_feat):
+#                     local_cache[cache_key] = (processed_inter, processed_feat)
+#                     local_valid_idxs[cache_key] = valid_sketch_idxs_inter
+#
+#             except Exception as e:
+#                 logger.error(f"Pre-cache failed (user={user_id}, category={category}): {str(e)}")
+#
+#         return local_cache, local_valid_idxs
+#
+#     # time thread execution
+#     t2 = time.perf_counter()
+#     with ThreadPoolExecutor(max_workers=8) as executor:
+#         futures = {executor.submit(process_user_category, user_id, categories): user_id for user_id, categories in user_categories.items()}
+#         for future in futures:
+#             try:
+#                 t_cache = time.perf_counter()
+#                 cache_part, valid_idxs_part = future.result()
+#                 matrix_data["cached_scores"].update(cache_part)
+#                 matrix_data["cached_valid_idxs"].update(valid_idxs_part)
+#                 time_stats["cache_update"] += time.perf_counter() - t_cache
+#                 precached_count += len(cache_part)
+#             except Exception as e:
+#                 logger.error(f"Thread execution error: {str(e)}")
+#     time_stats["thread_execution"] = time.perf_counter() - t2
+#
+#     time_stats["total"] = time.perf_counter() - start_time
+#
+#     # log the stats
+#     logger.info(f"""
+#     Pre-cache complete: {precached_count} entries cached; timing:
+#     - fetch user categories: {time_stats["get_all_user_categories"]:.2f}s
+#     - compute per-user caches: {time_stats["process_user_category"]:.2f}s
+#     - thread execution: {time_stats["thread_execution"]:.2f}s
+#     - cache updates: {time_stats["cache_update"]:.2f}s
+#     - total: {time_stats["total"]:.2f}s
+#     """)
+#
+#
+# def get_all_user_categories():
+#     """Fetch every user and the categories they touched."""
+#     conn = None
+#     try:
+#         conn = pymysql.connect(**DB_CONFIG)
+#         cursor = conn.cursor()
+#
+#         query = """
+#             SELECT DISTINCT account_id, path
+#             FROM user_preference_log_prediction
+#         """
+#         cursor.execute(query)
+#         results = cursor.fetchall()
+#
+#         user_categories = defaultdict(set)
+#         for account_id, path in results:
+#             category = get_category_from_path(path)
+#             user_categories[account_id].add(category)
+#
+#         return dict(user_categories)
+#
+#     except Exception as e:
+#         logger.error(f"Database query failed: {str(e)}")
+#         return {}
+#     finally:
+#         if conn:
+#             conn.close()
+#
+#
+# def get_category_from_path(path: str) -> str:
+#     """Parse the category out of a path."""
+#     try:
+#         parts = path.split('/')
+#         if len(parts) >= 4:
+#             return f"{parts[2]}_{parts[3]}"
+#         return "unknown"
+#     except:
+#         return "unknown"

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,73 @@
"""
推荐系统配置
"""
import os
from app.core.config import (
DB_CONFIG, DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_NAME,
REDIS_HOST, REDIS_PORT, REDIS_DB,
MILVUS_URL, MILVUS_TOKEN, MILVUS_ALIAS,
MINIO_URL, MINIO_ACCESS, MINIO_SECRET, MINIO_SECURE
)
# Milvus 集合名称
MILVUS_COLLECTION_SKETCH_VECTORS = "sketch_vectors_norm"
# Redis key 前缀
REDIS_KEY_USER_PREF_PREFIX = "user_pref"
# 推荐系统配置参数
RECOMMENDATION_CONFIG = {
# 时间衰减半衰期(用于计算时间衰减权重)
# 值越小,最近的行为权重越大
"K_half": 20,
# 探索与利用的比例 (0.0-1.0)
# - 值越大,使用探索分支(随机推荐)的几率越大,结果更随机
# - 值越小,使用利用分支(基于用户偏好)的几率越大,结果更精准
# - 建议范围: 0.3-0.7,要增加随机性可提高到 0.6-0.8
"explore_ratio": 0.5,
# 向量检索返回的候选数量
# 值越大,候选池越大,但计算成本也越高
# 建议范围: 100-1000
"topk": 1000,
# Style 加分系数(同 style 的候选进行加分)
# 值越大,匹配 style 的候选被选中的概率越大
# 要降低某个结果的重复率,可以降低此值(如 0.1 或 0.05
"style_bonus": 0.2,
# Softmax 抽样的温度参数
# - 温度越高(>1.0),概率分布越均匀,结果更随机,重复率更低
# - 温度越低(<1.0),高分项概率越大,结果更集中,重复率更高
# - 温度=1.0 为标准 Softmax
# - 建议范围: 1.0-3.0,要增加随机性可提高到 2.0-3.0
"softmax_temperature": 0.07,
# 监听间隔(秒)
"listen_interval_sec": 30,
# 批量处理大小
"batch_size": 1000,
# Redis 过期时间30天
"redis_expire_seconds": 2592000,
# 向量维度
"vector_dim": 2048,
}
# 数据库表名
TABLE_USER_PREFERENCE_LOG = "user_preference_log_test"
TABLE_SYS_FILE = "t_sys_file"
# MySQL 连接配置(用于推荐系统)
MYSQL_CONFIG = {
"host": DB_HOST,
"port": DB_PORT,
"user": DB_USERNAME,
"password": DB_PASSWORD,
"database": DB_NAME,
"charset": "utf8mb4"
}
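Editor's note: the config comments describe softmax_temperature only qualitatively. A minimal sketch of temperature-controlled softmax sampling; the actual sampling lives in recommendation_api.py, which this diff does not include, and the scores below are invented:

```python
import numpy as np

def softmax_sample(scores: np.ndarray, temperature: float, k: int) -> np.ndarray:
    """Sample k distinct indices; higher temperature flattens the distribution."""
    logits = scores / temperature
    logits -= logits.max()  # guard against overflow in exp
    probs = np.exp(logits)
    probs /= probs.sum()
    return np.random.choice(len(scores), size=min(k, len(scores)), p=probs, replace=False)

scores = np.array([0.91, 0.88, 0.80, 0.40])           # hypothetical IP scores
print(softmax_sample(scores, temperature=0.07, k=2))  # low T concentrates on the top items
print(softmax_sample(scores, temperature=2.0, k=2))   # high T is close to uniform
```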

View File

@@ -0,0 +1,331 @@
"""
独立脚本:从 t_sys_file 导入系统图向量到 Milvus
可以单独运行,不依赖整个项目启动
使用方法:
python -m app.service.recommendation_system.import_sys_sketch_to_milvus
python app/service/recommendation_system/import_sys_sketch_to_milvus.py
"""
import sys
import os
import logging
import argparse
from pathlib import Path
# 添加项目根目录到 Python 路径
project_root = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(project_root))
import numpy as np
import pymysql
from tqdm import tqdm
from app.service.recommendation_system.config import (
MYSQL_CONFIG, TABLE_SYS_FILE,
RECOMMENDATION_CONFIG, MILVUS_COLLECTION_SKETCH_VECTORS
)
from app.service.recommendation_system.vector_utils import extract_feature_vector, normalize_vector
from app.service.recommendation_system.milvus_client import create_collection, insert_vectors
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler('import_sys_sketch.log', encoding='utf-8')
]
)
logger = logging.getLogger(__name__)
def get_sys_file_records(conn, limit=None, offset=0):
"""
从 t_sys_file 表获取系统图记录
Args:
conn: 数据库连接
limit: 限制数量None 表示不限制)
offset: 偏移量
Returns:
记录列表,每个元素为 (id, url, style, level3_type, level2_type, deprecated)
"""
cursor = conn.cursor()
query = f"""
SELECT id, url, style, level3_type, level2_type, deprecated
FROM {TABLE_SYS_FILE}
WHERE level1_type = 'Images'
AND style IS NOT NULL
AND style != ''
AND deprecated != 1
ORDER BY id
"""
if limit:
query += f" LIMIT {limit} OFFSET {offset}"
cursor.execute(query)
records = cursor.fetchall()
cursor.close()
return records
def get_total_count(conn):
"""获取总记录数"""
cursor = conn.cursor()
cursor.execute(f"""
SELECT COUNT(*)
FROM {TABLE_SYS_FILE}
WHERE level1_type = 'Images'
AND style IS NOT NULL
AND style != ''
AND deprecated != 1
""")
count = cursor.fetchone()[0]
cursor.close()
return count
def process_and_insert_batch(records, batch_size=1000, retry_times=3):
"""
处理并批量插入向量
Args:
records: 记录列表
batch_size: 批量大小
retry_times: 失败重试次数
Returns:
(成功数量, 失败数量)
"""
success_count = 0
failed_count = 0
failed_records = []
batch_data = []
# 使用 tqdm 显示进度
with tqdm(total=len(records), desc="处理记录", unit="") as pbar:
for idx, (sys_file_id, url, style, level3_type, level2_type, deprecated) in enumerate(records):
try:
# 计算 category
category = f"{level3_type.lower()}_{level2_type.lower()}"
# 提取特征向量
feature_vector = extract_feature_vector(url)
# 归一化,便于 IP≈cosine 度量
feature_vector = normalize_vector(feature_vector)
# 检查向量是否有效
if np.all(feature_vector == 0):
logger.warning(f"向量提取失败,跳过: {url} (id={sys_file_id})")
failed_count += 1
failed_records.append((sys_file_id, url))
pbar.update(1)
continue
# 准备数据
data_item = {
"path": url,
"sys_file_id": sys_file_id,
"style": style,
"category": category,
"is_system_sketch": 1,
"deprecated": deprecated if deprecated else 0,
"feature_vector": feature_vector.tolist()
}
batch_data.append(data_item)
# 批量写入
if len(batch_data) >= batch_size:
try:
insert_vectors(batch_data)
success_count += len(batch_data)
batch_data = []
logger.info(f"已成功插入 {success_count} 条记录")
except Exception as e:
logger.error(f"批量写入失败: {e}")
failed_count += len(batch_data)
failed_records.extend([(item["sys_file_id"], item["path"]) for item in batch_data])
batch_data = []
pbar.update(1)
except Exception as e:
logger.error(f"处理记录失败 [id={sys_file_id}, url={url}]: {e}")
failed_count += 1
failed_records.append((sys_file_id, url))
pbar.update(1)
# 写入剩余数据
if batch_data:
try:
insert_vectors(batch_data)
success_count += len(batch_data)
logger.info(f"写入剩余 {len(batch_data)} 条记录")
except Exception as e:
logger.error(f"写入剩余数据失败: {e}")
failed_count += len(batch_data)
failed_records.extend([(item["sys_file_id"], item["path"]) for item in batch_data])
# 重试失败记录
if failed_records and retry_times > 0:
logger.info(f"开始重试 {len(failed_records)} 条失败记录,最多重试 {retry_times} 次...")
for retry in range(retry_times):
if not failed_records:
break
retry_failed = []
with tqdm(total=len(failed_records), desc=f"重试第 {retry + 1}", unit="") as pbar:
for sys_file_id, url in failed_records:
try:
# 重新查询记录信息
conn = pymysql.connect(**MYSQL_CONFIG)
cursor = conn.cursor()
cursor.execute(f"""
SELECT id, url, style, level3_type, level2_type, deprecated
FROM {TABLE_SYS_FILE}
WHERE id = %s
""", (sys_file_id,))
record = cursor.fetchone()
cursor.close()
conn.close()
if not record:
retry_failed.append((sys_file_id, url))
pbar.update(1)
continue
sys_file_id, url, style, level3_type, level2_type, deprecated = record
category = f"{level3_type.lower()}_{level2_type.lower()}"
feature_vector = extract_feature_vector(url)
feature_vector = normalize_vector(feature_vector)
if np.all(feature_vector == 0):
retry_failed.append((sys_file_id, url))
pbar.update(1)
continue
data_item = {
"path": url,
"sys_file_id": sys_file_id,
"style": style,
"category": category,
"is_system_sketch": 1,
"deprecated": deprecated if deprecated else 0,
"feature_vector": feature_vector.tolist()
}
insert_vectors([data_item])
success_count += 1
failed_count -= 1
pbar.update(1)
except Exception as e:
logger.error(f"重试失败 [id={sys_file_id}, url={url}]: {e}")
retry_failed.append((sys_file_id, url))
pbar.update(1)
failed_records = retry_failed
if failed_records:
logger.warning(f"{retry + 1} 次重试后仍有 {len(failed_records)} 条记录失败")
return success_count, failed_count, failed_records
def main():
"""主函数"""
parser = argparse.ArgumentParser(description='从 t_sys_file 导入系统图向量到 Milvus')
parser.add_argument('--batch-size', type=int, default=1000, help='批量处理大小默认1000')
parser.add_argument('--retry-times', type=int, default=3, help='失败重试次数默认3')
parser.add_argument('--limit', type=int, default=None, help='限制处理数量(用于测试,默认:不限制)')
parser.add_argument('--offset', type=int, default=0, help='起始偏移量默认0')
parser.add_argument('--skip-create-collection', action='store_true', help='跳过创建集合(如果集合已存在)')
args = parser.parse_args()
logger.info("=" * 60)
logger.info("开始从 t_sys_file 导入系统图向量到 Milvus")
logger.info("=" * 60)
logger.info(f"配置参数:")
logger.info(f" - 批量大小: {args.batch_size}")
logger.info(f" - 重试次数: {args.retry_times}")
logger.info(f" - 限制数量: {args.limit if args.limit else '不限制'}")
logger.info(f" - 起始偏移: {args.offset}")
logger.info("=" * 60)
# 1. 创建 Milvus 集合
if not args.skip_create_collection:
logger.info("创建 Milvus 集合...")
try:
create_collection()
logger.info("Milvus 集合创建成功(或已存在)")
except Exception as e:
logger.error(f"创建 Milvus 集合失败: {e}")
return
else:
logger.info("跳过创建集合")
# 2. 连接数据库
logger.info("连接数据库...")
try:
conn = pymysql.connect(**MYSQL_CONFIG)
logger.info("数据库连接成功")
except Exception as e:
logger.error(f"数据库连接失败: {e}")
return
try:
# 3. 获取总记录数
total_count = get_total_count(conn)
logger.info(f"找到 {total_count} 条系统图记录")
if total_count == 0:
logger.warning("没有找到系统图数据")
return
# 4. 获取记录
logger.info("获取记录...")
records = get_sys_file_records(conn, limit=args.limit, offset=args.offset)
logger.info(f"获取到 {len(records)} 条记录")
if not records:
logger.warning("没有获取到记录")
return
# 5. 处理并插入
logger.info("开始处理记录...")
success_count, failed_count, failed_records = process_and_insert_batch(
records,
batch_size=args.batch_size,
retry_times=args.retry_times
)
# 6. 输出结果
logger.info("=" * 60)
logger.info("导入完成!")
logger.info(f" - 成功: {success_count}")
logger.info(f" - 失败: {failed_count}")
if failed_records:
logger.warning(f" - 失败记录列表前10条:")
for sys_file_id, url in failed_records[:10]:
logger.warning(f" ID={sys_file_id}, URL={url}")
if len(failed_records) > 10:
logger.warning(f" ... 还有 {len(failed_records) - 10} 条失败记录")
logger.info("=" * 60)
except Exception as e:
logger.error(f"处理过程中发生错误: {e}", exc_info=True)
finally:
conn.close()
logger.info("数据库连接已关闭")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,343 @@
"""
增量监听模块
实时监听 user_preference_log_test 表的新增记录,更新用户偏好向量
"""
import logging
import math
import pymysql
import numpy as np
from typing import List, Dict, Set, Tuple, Optional
from datetime import datetime
from collections import defaultdict
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from app.service.recommendation_system.config import (
MYSQL_CONFIG, TABLE_USER_PREFERENCE_LOG, TABLE_SYS_FILE,
RECOMMENDATION_CONFIG, REDIS_KEY_USER_PREF_PREFIX
)
from app.service.recommendation_system.vector_utils import extract_feature_vector, compute_weighted_average, normalize_vector
from app.service.recommendation_system.milvus_client import query_vectors_by_paths, insert_vectors
from app.service.utils.redis_utils import Redis
import json
logger = logging.getLogger(__name__)
class IncrementalListener:
"""增量监听器"""
def __init__(self):
self.last_process_time = None
self.processed_combinations: Set[Tuple[int, str]] = set() # 已处理的 (account_id, category) 组合
self.listen_interval = RECOMMENDATION_CONFIG["listen_interval_sec"]
def get_new_like_records(self) -> List[Tuple]:
"""
获取新增点赞记录
Returns:
记录列表,每个元素为 (id, account_id, path, category, style, data_time, is_system_sketch, sys_file_id)
"""
conn = None
try:
conn = pymysql.connect(**MYSQL_CONFIG)
cursor = conn.cursor()
if self.last_process_time is None:
# 第一次运行查询最近30分钟的数据
cursor.execute(f"""
SELECT id, account_id, path, category, style, data_time, is_system_sketch, sys_file_id
FROM {TABLE_USER_PREFERENCE_LOG}
WHERE data_time > DATE_SUB(NOW(), INTERVAL 30 MINUTE)
ORDER BY data_time
""")
else:
# 基于上次处理时间查询
cursor.execute(f"""
SELECT id, account_id, path, category, style, data_time, is_system_sketch, sys_file_id
FROM {TABLE_USER_PREFERENCE_LOG}
WHERE data_time > %s
ORDER BY data_time
""", (self.last_process_time,))
records = cursor.fetchall()
return records
except Exception as e:
logger.error(f"获取新增点赞记录失败: {e}", exc_info=True)
return []
finally:
if conn:
conn.close()
def process_new_records(self, records: List[Tuple]):
"""
处理新增记录
Args:
records: 记录列表
"""
if not records:
return
# 按用户+类别分组
user_category_records = defaultdict(list)
for record in records:
account_id = record[1]
category = record[3]
if category: # 只处理有类别的记录
user_category_records[(account_id, category)].append(record)
# 去重:只处理一次每个 (account_id, category) 组合
to_process = []
for (account_id, category), recs in user_category_records.items():
if (account_id, category) not in self.processed_combinations:
to_process.append((account_id, category, recs))
self.processed_combinations.add((account_id, category))
logger.info(f"需要处理 {len(to_process)} 个用户-类别组合")
# 处理每个组合
for account_id, category, recs in to_process:
try:
self.update_user_preference_vector(account_id, category)
except Exception as e:
logger.error(f"更新用户偏好向量失败 [user={account_id}, category={category}]: {e}", exc_info=True)
# 更新最后处理时间
if records:
self.last_process_time = records[-1][5] # data_time
# 重置去重集合,确保下次周期不会跳过同一用户-类别
self.processed_combinations.clear()
def update_user_preference_vector(self, account_id: int, category: str):
"""
更新用户偏好向量
Args:
account_id: 用户ID
category: 类别
"""
conn = None
try:
conn = pymysql.connect(**MYSQL_CONFIG)
cursor = conn.cursor()
# 1. 获取该用户该类别的所有点赞记录
cursor.execute(f"""
SELECT path, data_time
FROM {TABLE_USER_PREFERENCE_LOG}
WHERE account_id = %s AND category = %s
ORDER BY data_time DESC
""", (account_id, category))
like_records = cursor.fetchall()
if not like_records:
return
# 2. 批量查询点赞次数
paths = [r[0] for r in like_records]
placeholders = ','.join(['%s'] * len(paths))
cursor.execute(f"""
SELECT path, COUNT(*) as like_count
FROM {TABLE_USER_PREFERENCE_LOG}
WHERE account_id = %s AND category = %s AND path IN ({placeholders})
GROUP BY path
""", (account_id, category) + tuple(paths))
like_counts = {row[0]: row[1] for row in cursor.fetchall()}
# 3. 批量获取向量
vectors_dict = query_vectors_by_paths(paths)
# 处理查询不到的 path新用户图或异常情况
missing_paths = [p for p in paths if p not in vectors_dict]
if missing_paths:
logger.info(f"用户 {account_id} 类别 {category}{len(missing_paths)} 个 path 需要实时计算向量")
self._compute_and_insert_missing_vectors(missing_paths, conn)
# 重新查询
vectors_dict = query_vectors_by_paths(paths)
# 4. 计算权重并加权平均
vectors = []
weights = []
K_half = RECOMMENDATION_CONFIG["K_half"]
for k, (path, data_time) in enumerate(like_records, 1):
if path not in vectors_dict:
continue
vector_data = vectors_dict[path]
feature_vector = np.array(vector_data["feature_vector"])
# 时间衰减权重
d_k = 0.5 ** (k / K_half)
# 点赞次数权重
like_count = like_counts.get(path, 1)
p_i = 1 + math.log(1 + like_count)
# 综合权重
w_i = d_k * p_i
vectors.append(feature_vector)
weights.append(w_i)
if not vectors:
logger.warning(f"用户 {account_id} 类别 {category} 没有有效向量")
return
# 5. 计算加权平均并做 L2 归一化IP≈cosine
preference_vector = compute_weighted_average(vectors, weights)
preference_vector = normalize_vector(preference_vector)
# 6. 写入 Redis
key = f"{REDIS_KEY_USER_PREF_PREFIX}:{account_id}:{category}"
vector_json = json.dumps(preference_vector.tolist())
Redis.write(
key=key,
value=vector_json,
expire=RECOMMENDATION_CONFIG["redis_expire_seconds"]
)
logger.debug(f"用户偏好向量更新成功 [user={account_id}, category={category}]")
except Exception as e:
logger.error(f"更新用户偏好向量失败 [user={account_id}, category={category}]: {e}", exc_info=True)
raise
finally:
if conn:
conn.close()
def _compute_and_insert_missing_vectors(self, paths: List[str], conn: pymysql.connections.Connection):
"""
计算并插入缺失的向量
Args:
paths: 缺失的 path 列表
conn: 数据库连接
"""
cursor = conn.cursor()
data_to_insert = []
for path in paths:
try:
# 判断数据来源(查询 t_sys_file 表)
cursor.execute(f"""
SELECT id, url, style, level3_type, level2_type, deprecated
FROM {TABLE_SYS_FILE}
WHERE url = %s
LIMIT 1
""", (path,))
sys_file = cursor.fetchone()
# 提取特征向量
feature_vector = extract_feature_vector(path)
if np.all(feature_vector == 0):
logger.warning(f"向量提取失败,跳过: {path}")
continue
if sys_file:
# 系统图
sys_file_id, url, style, level3_type, level2_type, deprecated = sys_file
category = f"{level3_type.lower()}_{level2_type.lower()}"
data_item = {
"path": path,
"sys_file_id": sys_file_id,
"style": style,
"category": category,
"is_system_sketch": 1,
"deprecated": deprecated if deprecated else 0,
"feature_vector": feature_vector.tolist()
}
else:
# 用户图
# 从 user_preference_log_test 获取 category如果有
cursor.execute(f"""
SELECT category
FROM {TABLE_USER_PREFERENCE_LOG}
WHERE path = %s AND category IS NOT NULL
LIMIT 1
""", (path,))
category_result = cursor.fetchone()
category = category_result[0] if category_result else None
data_item = {
"path": path,
"sys_file_id": None,
"style": None,
"category": category,
"is_system_sketch": 0,
"deprecated": 0,
"feature_vector": feature_vector.tolist()
}
data_to_insert.append(data_item)
except Exception as e:
logger.error(f"处理缺失向量失败 [{path}]: {e}")
# 批量插入
if data_to_insert:
try:
insert_vectors(data_to_insert)
logger.info(f"成功插入 {len(data_to_insert)} 个缺失向量")
except Exception as e:
logger.error(f"插入缺失向量失败: {e}")
def process_once(self):
"""单次轮询任务,供调度器调用"""
try:
records = self.get_new_like_records()
if records:
logger.info(f"发现 {len(records)} 条新增记录")
self.process_new_records(records)
else:
logger.debug("没有新增记录")
except Exception as e:
logger.error(f"监听轮询异常: {e}", exc_info=True)
def start_background_listener(scheduler: BackgroundScheduler):
"""将增量监听任务注册到后台调度器"""
listener = IncrementalListener()
scheduler.add_job(
listener.process_once,
"interval",
seconds=listener.listen_interval,
max_instances=1,
coalesce=True,
id="recommendation_incremental_listener",
replace_existing=True,
)
logger.info("增量监听任务已注册到调度器")
def start_blocking_listener():
"""以阻塞方式启动调度器(用于独立脚本运行)"""
listener = IncrementalListener()
scheduler = BlockingScheduler()
scheduler.add_job(
listener.process_once,
"interval",
seconds=listener.listen_interval,
max_instances=1,
coalesce=True,
id="recommendation_incremental_listener",
replace_existing=True,
)
logger.info("增量监听调度器已启动BlockingScheduler")
scheduler.start()
if __name__ == "__main__":
start_blocking_listener()
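Editor's note: the weight each liked sketch contributes in update_user_preference_vector is w = d_k * p_i, with time decay d_k = 0.5^(k / K_half) and like-count term p_i = 1 + ln(1 + like_count). A small worked example; the ranks and like counts are invented:

```python
import math

K_half = 20  # RECOMMENDATION_CONFIG["K_half"]
# (recency rank k, like_count) for three liked sketches, newest first
for k, likes in [(1, 5), (10, 1), (40, 2)]:
    d_k = 0.5 ** (k / K_half)      # time decay: weight halves every K_half ranks
    p_i = 1 + math.log(1 + likes)  # diminishing returns on repeat likes
    print(f"k={k:>2} likes={likes} -> w={d_k * p_i:.3f}")
# k= 1 likes=5 -> w=2.697   (recent and often-liked dominates)
# k=10 likes=1 -> w=1.197
# k=40 likes=2 -> w=0.525   (old likes fade but never vanish)
```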

View File

@@ -0,0 +1,295 @@
"""
Milvus 客户端封装
"""
import logging
from typing import List, Dict, Optional, Any
import numpy as np
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType, connections, Collection
from app.core.config import MILVUS_URL, MILVUS_TOKEN, MILVUS_ALIAS
from app.service.recommendation_system.config import MILVUS_COLLECTION_SKETCH_VECTORS, RECOMMENDATION_CONFIG
logger = logging.getLogger(__name__)
# Milvus 客户端(单例)
_milvus_client = None
def get_milvus_client() -> MilvusClient:
"""获取 Milvus 客户端(单例模式)"""
global _milvus_client
if _milvus_client is None:
try:
_milvus_client = MilvusClient(
uri=MILVUS_URL,
token=MILVUS_TOKEN,
db_name=MILVUS_ALIAS
)
logger.info("Milvus 客户端连接成功")
except Exception as e:
logger.error(f"Milvus 客户端连接失败: {e}")
raise
return _milvus_client
def create_collection():
"""
创建 Milvus 集合 sketch_vectors
集合结构:
- path (PK, varchar(512)) - 主键MinIO 逻辑 URL
- sys_file_id (int64, 可为NULL) - 系统文件ID
- style (varchar(50), 可为NULL) - 风格样式
- category (varchar(100), 可为NULL) - 类别
- is_system_sketch (int8, 默认 1) - 标记字段1-系统图0-用户图
- deprecated (int8, 默认 0) - 是否废弃
- feature_vector (FloatVector(2048)) - 2048维特征向量
"""
client = get_milvus_client()
# 检查集合是否已存在
collections = client.list_collections()
if MILVUS_COLLECTION_SKETCH_VECTORS in collections:
logger.info(f"集合 {MILVUS_COLLECTION_SKETCH_VECTORS} 已存在")
return
try:
# 解析 Milvus URL
# 处理 http://host.docker.internal:19530 格式
url_clean = MILVUS_URL.replace("http://", "").replace("https://", "")
if ":" in url_clean:
host, port_str = url_clean.split(":", 1)
port = int(port_str)
else:
host = url_clean
port = 19530
# 使用传统 API 创建集合(更可靠)
# 连接到 Milvus如果未连接
try:
connections.connect(
alias=MILVUS_ALIAS,
host=host,
port=port,
token=MILVUS_TOKEN if MILVUS_TOKEN else None
)
logger.info(f"已连接到 Milvus: {host}:{port}")
except Exception as conn_e:
# 如果连接已存在,忽略错误
if "already exists" in str(conn_e).lower() or "Connection already exists" in str(conn_e):
logger.info("Milvus 连接已存在")
else:
logger.warning(f"连接 Milvus 时出现警告: {conn_e}")
# 定义字段
fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, is_primary=True, max_length=512),
FieldSchema(name="sys_file_id", dtype=DataType.INT64),
FieldSchema(name="style", dtype=DataType.VARCHAR, max_length=50),
FieldSchema(name="category", dtype=DataType.VARCHAR, max_length=50),
FieldSchema(name="is_system_sketch", dtype=DataType.INT8),
FieldSchema(name="deprecated", dtype=DataType.INT8),
FieldSchema(
name="feature_vector",
dtype=DataType.FLOAT_VECTOR,
dim=RECOMMENDATION_CONFIG["vector_dim"]
)
]
# 创建 schema
schema = CollectionSchema(
fields=fields,
description="Sketch vectors collection for recommendation system"
)
# 创建集合
collection = Collection(
name=MILVUS_COLLECTION_SKETCH_VECTORS,
schema=schema,
using=MILVUS_ALIAS
)
# 创建索引
# 注意:使用 IP内积作为度量类型与搜索时保持一致
# 如果向量已归一化IP 等价于 COSINE
index_params = {
"metric_type": "IP", # 内积Inner Product
"index_type": "IVF_FLAT",
"params": {"nlist": 1024}
}
collection.create_index(
field_name="feature_vector",
index_params=index_params
)
logger.info(f"集合 {MILVUS_COLLECTION_SKETCH_VECTORS} 创建成功")
except Exception as e:
logger.error(f"创建集合失败: {e}", exc_info=True)
raise
def insert_vectors(data: List[Dict[str, Any]]):
"""
批量插入向量到 Milvus
Args:
data: 数据列表,每个元素包含:
- path: str
- sys_file_id: int (可选)
- style: str (可选)
- category: str (可选)
- is_system_sketch: int (默认 1)
- deprecated: int (默认 0)
- feature_vector: List[float] (2048维)
"""
if not data:
return
client = get_milvus_client()
try:
client.insert(
collection_name=MILVUS_COLLECTION_SKETCH_VECTORS,
data=data
)
logger.info(f"成功插入 {len(data)} 条向量数据")
except Exception as e:
logger.error(f"插入向量失败: {e}", exc_info=True)
raise

def query_vectors_by_paths(paths: List[str]) -> Dict[str, Dict]:
    """
    Batch-query vectors by a list of paths.
    Args:
        paths: list of paths
    Returns:
        dict of {path: {feature_vector: [...], ...}}
    """
    if not paths:
        return {}
    client = get_milvus_client()
    try:
        # Build the query expression
        # Uses the `filter` parameter rather than `expr`, per the pymilvus MilvusClient API
        # Each string value is wrapped in single quotes (a path containing a single quote
        # would break this expression; see the escaping sketch below)
        path_list = ", ".join([f"'{p}'" for p in paths])
        filter_expr = f"path in [{path_list}]"
        results = client.query(
            collection_name=MILVUS_COLLECTION_SKETCH_VECTORS,
            filter=filter_expr,
            output_fields=["path", "feature_vector", "style", "category", "sys_file_id", "is_system_sketch", "deprecated"]
        )
        # Convert to a dict keyed by path
        result_dict = {}
        for r in results:
            result_dict[r["path"]] = r
        return result_dict
    except Exception as e:
        logger.error(f"Failed to query vectors: {e}", exc_info=True)
        return {}
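
# --- Hardening sketch (added note): the filters in this module interpolate raw strings
# into Milvus expressions, so a value containing a single quote breaks the query. A
# minimal escape helper, assuming Milvus accepts backslash-escaped quotes in string
# literals (verify against your Milvus version before relying on it):
def _quote_milvus_string(value: str) -> str:
    return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"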

def search_similar_vectors(
    query_vector: np.ndarray,
    category: str,
    topk: int = 500,
    style: Optional[str] = None
) -> List[Dict]:
    """
    Vector similarity search.
    Args:
        query_vector: query vector (2048-dim)
        category: category filter
        topk: number of results to return
        style: style filter (optional)
    Returns:
        list of hits, each containing path, score, style, category, etc.
    """
    client = get_milvus_client()
    try:
        # Build the filter expression
        # Uses the `filter` parameter rather than `expr`, per the pymilvus MilvusClient API
        filter_expr = f"category == '{category}' && deprecated == 0"
        if style:
            filter_expr += f" && style == '{style}'"
        # Search
        results = client.search(
            collection_name=MILVUS_COLLECTION_SKETCH_VECTORS,
            data=[query_vector.tolist()],
            anns_field="feature_vector",
            search_params={"metric_type": "IP", "params": {"nprobe": 10}},
            limit=topk,
            filter=filter_expr,
            output_fields=["path", "style", "category", "sys_file_id"]
        )
        # Format the results
        formatted_results = []
        if results and len(results) > 0:
            for hit in results[0]:
                formatted_results.append({
                    "path": hit.get("entity", {}).get("path", ""),
                    "score": hit.get("distance", 0.0),
                    "style": hit.get("entity", {}).get("style", ""),
                    "category": hit.get("entity", {}).get("category", ""),
                    "sys_file_id": hit.get("entity", {}).get("sys_file_id")
                })
        return formatted_results
    except Exception as e:
        logger.error(f"Vector search failed: {e}", exc_info=True)
        return []
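
# --- Usage sketch (added note): top-5 neighbours of a random unit vector. Purely
# illustrative; production callers pass a user preference vector instead.
def _example_search():
    query = np.random.rand(2048).astype(np.float32)
    query /= np.linalg.norm(query)  # normalize so IP behaves like cosine
    for hit in search_similar_vectors(query, category="female_skirt", topk=5):
        print(hit["path"], round(hit["score"], 4))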

def query_random_candidates(category: str, style: Optional[str] = None, limit: int = 10) -> List[Dict]:
    """
    Random candidate query (used by the exploration branch).
    Args:
        category: category
        style: style (optional)
        limit: number of candidates to return
    Returns:
        list of candidates
    """
    client = get_milvus_client()
    try:
        # Build the filter expression
        filter_expr = f"category == '{category}' && deprecated == 0"
        if style:
            filter_expr += f" && style == '{style}'"
        # Query all matching records (up to the cap below)
        results = client.query(
            collection_name=MILVUS_COLLECTION_SKETCH_VECTORS,
            filter=filter_expr,
            output_fields=["path", "style", "category"],
            limit=10000  # fetch a large pool first, then sample from it
        )
        # Random sample
        if len(results) > limit:
            import random
            results = random.sample(results, limit)
        return results
    except Exception as e:
        logger.error(f"Random candidate query failed: {e}", exc_info=True)
        return []
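
# Added note: the query above caps the candidate pool at 10000 rows before sampling, so
# for categories larger than that the sample is biased toward whatever Milvus returns
# first; a query iterator or a random offset would remove the bias.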

View File

@@ -0,0 +1,556 @@
"""
预计算模块
包含数据库表结构优化、Milvus集合创建、系统图向量预计算、初始用户偏好向量生成
"""
import logging
import math
import pymysql
import numpy as np
from typing import List, Dict, Tuple, Optional
from collections import defaultdict
from app.service.recommendation_system.config import (
MYSQL_CONFIG, TABLE_USER_PREFERENCE_LOG, TABLE_SYS_FILE,
RECOMMENDATION_CONFIG, REDIS_KEY_USER_PREF_PREFIX
)
from app.service.recommendation_system.vector_utils import extract_feature_vector, normalize_vector, compute_weighted_average
from app.service.recommendation_system.milvus_client import (
create_collection, insert_vectors, query_vectors_by_paths
)
from app.service.utils.redis_utils import Redis
import json
logger = logging.getLogger(__name__)

def optimize_database_table():
    """
    Optimize the user_preference_log_test table structure:
    add redundant columns and indexes.
    """
    conn = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # 1. Add redundant columns
        logger.info("Adding redundant columns...")
        alter_sqls = [
            f"ALTER TABLE {TABLE_USER_PREFERENCE_LOG} ADD COLUMN category VARCHAR(100) COMMENT 'category: lower(level3_type + \"_\" + level2_type)'",
            f"ALTER TABLE {TABLE_USER_PREFERENCE_LOG} ADD COLUMN style VARCHAR(50) COMMENT 'style'",
            f"ALTER TABLE {TABLE_USER_PREFERENCE_LOG} ADD COLUMN is_system_sketch TINYINT(1) DEFAULT 1 COMMENT 'system sketch flag (1 = system, 0 = user sketch)'",
            f"ALTER TABLE {TABLE_USER_PREFERENCE_LOG} ADD COLUMN sys_file_id BIGINT NULL COMMENT 'system file id'",
        ]
        for sql in alter_sqls:
            try:
                cursor.execute(sql)
                logger.info(f"Executed: {sql[:50]}...")
            except Exception as e:
                if "Duplicate column name" in str(e):
                    logger.info(f"Column already exists, skipping: {sql[:50]}...")
                else:
                    logger.warning(f"Execution failed: {sql[:50]}... error: {e}")
        # 2. Create indexes (MySQL has no IF NOT EXISTS for indexes, so check first)
        logger.info("Creating indexes...")
        index_definitions = [
            ("idx_account_category_time", ["account_id", "category", "data_time"]),
            ("idx_account_path", ["account_id", "path"]),
        ]
        for index_name, columns in index_definitions:
            try:
                # Check whether the index already exists
                cursor.execute(f"""
                    SELECT COUNT(*)
                    FROM information_schema.statistics
                    WHERE table_schema = DATABASE()
                    AND table_name = '{TABLE_USER_PREFERENCE_LOG}'
                    AND index_name = '{index_name}'
                """)
                exists = cursor.fetchone()[0] > 0
                if exists:
                    logger.info(f"Index already exists, skipping: {index_name}")
                else:
                    # Create the index
                    columns_str = ', '.join(columns)
                    create_sql = f"CREATE INDEX {index_name} ON {TABLE_USER_PREFERENCE_LOG}({columns_str})"
                    cursor.execute(create_sql)
                    logger.info(f"Index created: {index_name}")
            except Exception as e:
                logger.warning(f"Index creation failed: {index_name} error: {e}")
        conn.commit()
        logger.info("Database schema optimization finished")
    except Exception as e:
        logger.error(f"Database schema optimization failed: {e}", exc_info=True)
        if conn:
            conn.rollback()
    finally:
        if conn:
            conn.close()

def migrate_historical_data(batch_size: int = 1000):
    """
    Historical data migration: backfill the redundant columns in batches.
    Args:
        batch_size: rows per batch
    """
    conn = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # Count the rows that still need migration
        cursor.execute(f"""
            SELECT COUNT(*)
            FROM {TABLE_USER_PREFERENCE_LOG} u
            WHERE u.category IS NULL
        """)
        total_count = cursor.fetchone()[0]
        logger.info(f"Rows to migrate: {total_count}")
        if total_count == 0:
            logger.info("Nothing to migrate")
            return
        # Paginate by primary key rather than OFFSET: the updates below change
        # `category`, which shrinks the OFFSET window as it advances and skips rows.
        last_id = 0
        processed = 0
        while True:
            # Fetch one batch
            cursor.execute(f"""
                SELECT u.id, u.path
                FROM {TABLE_USER_PREFERENCE_LOG} u
                WHERE u.category IS NULL AND u.id > %s
                ORDER BY u.id
                LIMIT {batch_size}
            """, (last_id,))
            records = cursor.fetchall()
            if not records:
                break
            # Update each row in the batch
            for record_id, path in records:
                # Look up the t_sys_file table
                cursor.execute(f"""
                    SELECT id, url, style, level3_type, level2_type, deprecated
                    FROM {TABLE_SYS_FILE}
                    WHERE url = %s
                    LIMIT 1
                """, (path,))
                sys_file = cursor.fetchone()
                if sys_file:
                    # System sketch
                    sys_file_id, url, style, level3_type, level2_type, deprecated = sys_file
                    category = f"{level3_type.lower()}_{level2_type.lower()}"
                    cursor.execute(f"""
                        UPDATE {TABLE_USER_PREFERENCE_LOG}
                        SET category = %s,
                            style = %s,
                            is_system_sketch = 1,
                            sys_file_id = %s
                        WHERE id = %s
                    """, (category, style, sys_file_id, record_id))
                else:
                    # User sketch
                    cursor.execute(f"""
                        UPDATE {TABLE_USER_PREFERENCE_LOG}
                        SET is_system_sketch = 0,
                            category = NULL,
                            style = NULL,
                            sys_file_id = NULL
                        WHERE id = %s
                    """, (record_id,))
            conn.commit()
            processed += len(records)
            last_id = records[-1][0]
            logger.info(f"Migrated {processed}/{total_count} rows")
        logger.info("Historical data migration finished")
    except Exception as e:
        logger.error(f"Historical data migration failed: {e}", exc_info=True)
        if conn:
            conn.rollback()
    finally:
        if conn:
            conn.close()

def precompute_system_sketch_vectors(batch_size: int = 1000, retry_times: int = 3):
    """
    Precompute and import system-sketch vectors.
    Args:
        batch_size: rows per batch
        retry_times: number of retries for failed records
    """
    conn = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # 1. Select the data
        logger.info("Querying system sketch records...")
        cursor.execute(f"""
            SELECT id, url, style, level3_type, level2_type, deprecated
            FROM {TABLE_SYS_FILE}
            WHERE level1_type = 'Images'
            AND style IS NOT NULL
            AND style != ''
            AND deprecated != 1
        """)
        records = cursor.fetchall()
        logger.info(f"Found {len(records)} system sketch records")
        if not records:
            logger.warning("No system sketch data found")
            return
        # 2. Process in batches
        # Each failed record keeps its own (id, url, style, category) so the retry
        # loop does not reuse stale loop variables from the last processed record.
        failed_records = []
        batch_data = []
        for idx, (sys_file_id, url, style, level3_type, level2_type, deprecated) in enumerate(records, 1):
            category = None
            try:
                # Compute the category
                category = f"{level3_type.lower()}_{level2_type.lower()}"
                # Extract the feature vector
                feature_vector = extract_feature_vector(url)
                # Check the vector is valid
                if np.all(feature_vector == 0):
                    logger.warning(f"Vector extraction failed, skipping: {url}")
                    failed_records.append((sys_file_id, url, style, category))
                    continue
                # Prepare the record
                data_item = {
                    "path": url,
                    "sys_file_id": sys_file_id,
                    "style": style,
                    "category": category,
                    "is_system_sketch": 1,
                    "deprecated": deprecated if deprecated else 0,
                    "feature_vector": feature_vector.tolist()
                }
                batch_data.append(data_item)
                # Flush a full batch
                if len(batch_data) >= batch_size:
                    try:
                        insert_vectors(batch_data)
                        logger.info(f"Processed {idx}/{len(records)} records")
                    except Exception as e:
                        logger.error(f"Batch insert failed: {e}")
                        failed_records.extend([
                            (item["sys_file_id"], item["path"], item["style"], item["category"])
                            for item in batch_data
                        ])
                    batch_data = []
            except Exception as e:
                logger.error(f"Failed to process record [{url}]: {e}")
                failed_records.append((sys_file_id, url, style, category))
        # Flush the remaining records
        if batch_data:
            try:
                insert_vectors(batch_data)
            except Exception as e:
                logger.error(f"Failed to insert remaining records: {e}")
                failed_records.extend([
                    (item["sys_file_id"], item["path"], item["style"], item["category"])
                    for item in batch_data
                ])
        # 3. Retry failed records, using each record's own style/category
        if failed_records and retry_times > 0:
            logger.info(f"Retrying {len(failed_records)} failed records...")
            for retry in range(retry_times):
                retry_failed = []
                for sys_file_id, url, style, category in failed_records:
                    try:
                        if category is None:
                            # Category could not be derived; cannot be fixed by retrying
                            retry_failed.append((sys_file_id, url, style, category))
                            continue
                        feature_vector = extract_feature_vector(url)
                        if not np.all(feature_vector == 0):
                            data_item = {
                                "path": url,
                                "sys_file_id": sys_file_id,
                                "style": style,
                                "category": category,
                                "is_system_sketch": 1,
                                "deprecated": 0,
                                "feature_vector": feature_vector.tolist()
                            }
                            insert_vectors([data_item])
                        else:
                            retry_failed.append((sys_file_id, url, style, category))
                    except Exception as e:
                        logger.error(f"Retry failed [{url}]: {e}")
                        retry_failed.append((sys_file_id, url, style, category))
                failed_records = retry_failed
                if not failed_records:
                    break
            if failed_records:
                logger.warning(f"{len(failed_records)} records still failed after retries")
        logger.info("System sketch vector precompute finished")
    except Exception as e:
        logger.error(f"System sketch vector precompute failed: {e}", exc_info=True)
    finally:
        if conn:
            conn.close()

def compute_user_preference_vector(
    account_id: int,
    category: str,
    conn: Optional[pymysql.connections.Connection] = None,
    max_date: Optional[datetime] = None
) -> Optional[np.ndarray]:
    """
    Compute a user preference vector.
    Args:
        account_id: user id
        category: category
        conn: database connection (optional)
        max_date: cutoff date (optional; used during evaluation to restrict to training data)
    Returns:
        user preference vector (2048-dim), or None on failure
    """
    should_close = False
    if conn is None:
        conn = pymysql.connect(**MYSQL_CONFIG)
        should_close = True
    try:
        cursor = conn.cursor()
        # 1. Fetch like records (restricted to before max_date when given)
        if max_date:
            cursor.execute(f"""
                SELECT path, data_time
                FROM {TABLE_USER_PREFERENCE_LOG}
                WHERE account_id = %s AND category = %s AND style is not null
                AND data_time < %s
                ORDER BY data_time DESC
            """, (account_id, category, max_date))
        else:
            cursor.execute(f"""
                SELECT path, data_time
                FROM {TABLE_USER_PREFERENCE_LOG}
                WHERE account_id = %s AND category = %s AND style is not null
                ORDER BY data_time DESC
            """, (account_id, category))
        like_records = cursor.fetchall()
        if not like_records:
            return None
        # 2. Batch-count likes per path (again restricted to before max_date when given)
        paths = [r[0] for r in like_records]
        if not paths:
            return None
        placeholders = ','.join(['%s'] * len(paths))
        if max_date:
            cursor.execute(f"""
                SELECT path, COUNT(*) as like_count
                FROM {TABLE_USER_PREFERENCE_LOG}
                WHERE account_id = %s AND category = %s AND path IN ({placeholders})
                AND data_time < %s
                GROUP BY path
            """, (account_id, category) + tuple(paths) + (max_date,))
        else:
            cursor.execute(f"""
                SELECT path, COUNT(*) as like_count
                FROM {TABLE_USER_PREFERENCE_LOG}
                WHERE account_id = %s AND category = %s AND path IN ({placeholders})
                GROUP BY path
            """, (account_id, category) + tuple(paths))
        like_counts = {row[0]: row[1] for row in cursor.fetchall()}
        # 3. Batch-fetch the vectors
        vectors_dict = query_vectors_by_paths(paths)
        # Handle paths missing from Milvus (user sketches or anomalies)
        missing_paths = [p for p in paths if p not in vectors_dict]
        if missing_paths:
            logger.info(f"User {account_id} category {category} has {len(missing_paths)} paths needing on-the-fly vectors")
            # No non-system-sketch vectors exist yet, so skip for now.
            # A full implementation would call vector_utils.extract_feature_vector
            # here and write the result back to Milvus.
        # 4. Compute the weights and the weighted average
        vectors = []
        weights = []
        K_half = RECOMMENDATION_CONFIG["K_half"]
        for k, (path, data_time) in enumerate(like_records, 1):
            if path not in vectors_dict:
                continue
            vector_data = vectors_dict[path]
            feature_vector = np.array(vector_data["feature_vector"])
            # Time-decay weight (computed but currently not applied; see w_i below)
            d_k = 0.5 ** (k / K_half)
            # Like-count weight
            like_count = like_counts.get(path, 1)
            p_i = 1 + math.log(1 + like_count)
            # Combined weight
            # w_i = d_k * p_i
            w_i = p_i
            vectors.append(feature_vector)
            weights.append(w_i)
        if not vectors:
            return None
        # 5. Weighted average followed by L2 normalization (IP ≈ cosine)
        preference_vector = compute_weighted_average(vectors, weights)
        preference_vector = normalize_vector(preference_vector)
        return preference_vector
    except Exception as e:
        logger.error(f"Failed to compute user preference vector [user={account_id}, category={category}]: {e}", exc_info=True)
        return None
    finally:
        if should_close and conn:
            conn.close()
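
# --- Worked example (added note): with K_half = 10 the time-decay weight of the 10th
# most recent like is d_10 = 0.5 ** (10 / 10) = 0.5, and the like-count weight for a
# path liked 3 times is p = 1 + ln(1 + 3) ≈ 2.386. The decay term is currently disabled
# (w_i = p_i above), so only the like-count boost affects the average.
def _example_weights(K_half: int = 10):
    d_10 = 0.5 ** (10 / K_half)  # time decay at rank 10 -> 0.5
    p_3 = 1 + math.log(1 + 3)    # like-count boost for 3 likes -> ~2.386
    return d_10, p_3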

def generate_initial_user_preference_vectors(batch_size: int = 100):
    """
    Generate the initial user preference vectors.
    Args:
        batch_size: combinations per progress-log batch
    """
    conn = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # 1. Scan the historical data
        logger.info("Scanning user/category combinations...")
        cursor.execute(f"""
            SELECT DISTINCT account_id, category
            FROM {TABLE_USER_PREFERENCE_LOG}
            WHERE category IS NOT NULL
            AND style IS NOT NULL
        """)
        user_categories = cursor.fetchall()
        logger.info(f"Found {len(user_categories)} user-category combinations")
        if not user_categories:
            logger.warning("No user-category combinations found")
            return
        # 2. Process in batches
        processed = 0
        failed = 0
        for account_id, category in user_categories:
            try:
                # Compute the preference vector
                preference_vector = compute_user_preference_vector(account_id, category, conn)
                if preference_vector is not None:
                    # Write to Redis
                    key = f"{REDIS_KEY_USER_PREF_PREFIX}:{account_id}:{category}"
                    # Serialize the vector as JSON
                    vector_json = json.dumps(preference_vector.tolist())
                    Redis.write(
                        key=key,
                        value=vector_json,
                        expire=RECOMMENDATION_CONFIG["redis_expire_seconds"]
                    )
                    processed += 1
                else:
                    failed += 1
                if (processed + failed) % batch_size == 0:
                    logger.info(f"Processed {processed + failed}/{len(user_categories)} combinations, ok: {processed}, failed: {failed}")
            except Exception as e:
                logger.error(f"Processing failed [user={account_id}, category={category}]: {e}")
                failed += 1
        logger.info(f"Initial user preference vector generation finished, ok: {processed}, failed: {failed}")
    except Exception as e:
        logger.error(f"Initial user preference vector generation failed: {e}", exc_info=True)
    finally:
        if conn:
            conn.close()

def run_precompute():
    """
    Run all precompute tasks.
    """
    logger.info("=" * 50)
    logger.info("Starting precompute tasks")
    logger.info("=" * 50)
    # 1. Optimize the database schema
    logger.info("\n[1/5] Optimizing database schema...")
    optimize_database_table()
    # # 2. Create the Milvus collection
    # logger.info("\n[2/5] Creating Milvus collection...")
    # create_collection()
    # 3. Migrate historical data
    logger.info("\n[3/5] Migrating historical data...")
    migrate_historical_data()
    # # 4. Precompute system sketch vectors
    # logger.info("\n[4/5] Precomputing system sketch vectors...")
    # precompute_system_sketch_vectors()
    # 5. Generate initial user preference vectors
    logger.info("\n[5/5] Generating initial user preference vectors...")
    generate_initial_user_preference_vectors()
    logger.info("=" * 50)
    logger.info("Precompute tasks finished")
    logger.info("=" * 50)


if __name__ == "__main__":
    # 1. Optimize the database schema
    logger.info("\n[1/5] Optimizing database schema...")
    optimize_database_table()
    # 3. Migrate historical data
    logger.info("\n[3/5] Migrating historical data...")
    migrate_historical_data()
    # 5. Generate initial user preference vectors
    logger.info("\n[5/5] Generating initial user preference vectors...")
    generate_initial_user_preference_vectors()

View File

@@ -0,0 +1,214 @@
"""
推荐接口实现
实现探索/利用分支、向量检索、Softmax抽样等功能
"""
import logging
import math
import random
import numpy as np
from typing import List, Dict, Optional
from app.service.recommendation_system.config import RECOMMENDATION_CONFIG, REDIS_KEY_USER_PREF_PREFIX
from app.service.recommendation_system.milvus_client import search_similar_vectors, query_random_candidates
from app.service.recommendation_system.precompute import compute_user_preference_vector
from app.service.recommendation_system.vector_utils import normalize_vector
from app.service.utils.redis_utils import Redis
import json
logger = logging.getLogger(__name__)

def get_user_preference_vector(user_id: int, category: str) -> Optional[np.ndarray]:
    """
    Fetch the user preference vector.
    Args:
        user_id: user id
        category: category
    Returns:
        user preference vector (2048-dim), or None on failure
    """
    # 1. Try Redis first
    key = f"{REDIS_KEY_USER_PREF_PREFIX}:{user_id}:{category}"
    vector_json = Redis.read(key)
    if vector_json:
        try:
            vector_list = json.loads(vector_json)
            return np.array(vector_list, dtype=np.float32)
        except Exception as e:
            logger.warning(f"Failed to parse cached vector [user={user_id}, category={category}]: {e}")
    # 2. Cache miss: compute on the fly
    logger.info(f"User preference vector not in Redis, computing [user={user_id}, category={category}]")
    preference_vector = compute_user_preference_vector(user_id, category)
    if preference_vector is not None:
        # Write back to Redis
        vector_json = json.dumps(preference_vector.tolist())
        Redis.write(
            key=key,
            value=vector_json,
            expire=RECOMMENDATION_CONFIG["redis_expire_seconds"]
        )
    return preference_vector
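
# Added note: the cache key layout is f"{REDIS_KEY_USER_PREF_PREFIX}:{user_id}:{category}"
# (the actual prefix comes from config). The vector is stored as a JSON list, so one
# cached 2048-dim vector costs very roughly tens of kilobytes before Redis overhead.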

def explore_branch(category: str, style: Optional[str] = None) -> List[str]:
    """
    Exploration branch (random recommendation).
    Args:
        category: category
        style: style (optional)
    Returns:
        list of recommended paths (at most one element)
    """
    # Query a small random candidate pool
    pool_size = 10  # fetch 10 candidates, then pick one at random
    candidates = query_random_candidates(category, style, limit=pool_size)
    if not candidates:
        logger.warning(f"Explore branch: no candidates for category {category}")
        return []
    # Pick one candidate at random (random is imported at module level)
    chosen = random.choice(candidates)
    return [chosen.get("path", "")]

def exploit_branch(
    user_id: int,
    category: str,
    style: Optional[str] = None
) -> List[str]:
    """
    Exploitation branch (vector-similarity recommendation).
    Args:
        user_id: user id
        category: category
        style: style (optional; used for score bonuses)
    Returns:
        list of recommended paths (at most one element)
    """
    # 1. Fetch the user preference vector
    embedding = get_user_preference_vector(user_id, category)
    if embedding is None:
        logger.warning(f"Exploit branch: no user preference vector, falling back to explore [user={user_id}, category={category}]")
        return explore_branch(category, style)
    # 2. Milvus similarity search (inner product, IP)
    topk = RECOMMENDATION_CONFIG["topk"]
    results = search_similar_vectors(embedding, category, topk)
    if not results:
        logger.warning(f"Exploit branch: empty search result, falling back to explore [user={user_id}, category={category}]")
        return explore_branch(category, style)
    # 3. Style bonus (optional; applies when a style is passed in)
    style_bonus = RECOMMENDATION_CONFIG["style_bonus"]
    if style:
        for result in results:
            similarity = result["score"]
            if result.get("style") == style:
                # Bonus: similarity * (1 + style_bonus)
                similarity = similarity * (1 + style_bonus)
            result["final_score"] = similarity
    else:
        for result in results:
            result["final_score"] = result["score"]
    # 4. Softmax sampling (results is already known to be non-empty here)
    scores = [r["final_score"] for r in results]
    probabilities = softmax_with_temperature(scores, RECOMMENDATION_CONFIG["softmax_temperature"])
    selected_index = np.random.choice(len(results), size=1, p=probabilities, replace=False)
    selected_results = [results[int(selected_index[0])]]
    # 5. Return the paths
    return [result.get("path", "") for result in selected_results]

def softmax_with_temperature(scores: List[float], temperature: float = 1.0) -> List[float]:
    """
    Softmax with a temperature parameter.
    Args:
        scores: list of scores
        temperature: temperature
    Returns:
        list of probabilities
    """
    if not scores:
        return []
    # Scale by the temperature
    scaled_scores = [s / temperature for s in scores]
    # Subtract the max (numerical stability)
    max_score = max(scaled_scores)
    exp_scores = [math.exp(s - max_score) for s in scaled_scores]
    # Normalize
    sum_exp = sum(exp_scores)
    if sum_exp == 0:
        # If all scores are effectively -inf, fall back to a uniform distribution
        return [1.0 / len(scores)] * len(scores)
    probabilities = [exp_s / sum_exp for exp_s in exp_scores]
    return probabilities
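
# --- Worked example (added note): for scores [2.0, 1.0, 0.5] at temperature 1.0 the
# shifted exponentials are [1.0, e**-1, e**-1.5] ≈ [1.0, 0.368, 0.223], which normalize
# to probabilities ≈ [0.629, 0.231, 0.140]. Raising the temperature flattens the
# distribution; lowering it sharpens it toward the top score.
def _example_softmax():
    return softmax_with_temperature([2.0, 1.0, 0.5], temperature=1.0)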

def get_recommendations(
    user_id: int,
    category: str,
    style: Optional[str] = None
) -> List[str]:
    """
    Get recommendations (entry point).
    Args:
        user_id: user id
        category: category (e.g. female_skirt)
        style: style (optional): when given, same-style candidates get a score bonus
               in the exploitation branch
    Returns:
        list of recommended paths (at most one element)
    """
    try:
        # 1. Read the config
        explore_ratio = RECOMMENDATION_CONFIG["explore_ratio"]
        # 2. Explore/exploit decision
        r = random.random()  # uniform in (0, 1)
        if r < explore_ratio:
            logger.debug(f"Explore branch [user={user_id}, category={category}]")
            return explore_branch(category, style)
        logger.debug(f"Exploit branch [user={user_id}, category={category}]")
        return exploit_branch(user_id, category, style)
    except Exception as e:
        logger.error(f"Failed to get recommendations [user={user_id}, category={category}]: {e}", exc_info=True)
        # Fault tolerance: fall back to the exploration branch
        return explore_branch(category, style)
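
# --- Usage sketch (added note): a single call returns at most one path, so callers that
# want N recommendations loop and deduplicate. The user_id/category values are illustrative.
def _example_get_three(user_id: int = 42, category: str = "female_skirt") -> List[str]:
    seen: List[str] = []
    for _ in range(10):  # a few extra attempts to absorb duplicates/empty results
        for path in get_recommendations(user_id, category):
            if path and path not in seen:
                seen.append(path)
        if len(seen) >= 3:
            break
    return seen[:3]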

View File

@@ -0,0 +1,189 @@
"""
向量计算工具类
包含 ResNet50 特征提取、向量归一化等功能
"""
import io
import logging
import numpy as np
import torch
from torchvision import models, transforms
from PIL import Image
from minio import Minio
from app.core.config import MINIO_URL, MINIO_ACCESS, MINIO_SECRET, MINIO_SECURE
from app.service.recommendation_system.config import RECOMMENDATION_CONFIG
logger = logging.getLogger(__name__)
# 图像预处理与ResNet训练时的预处理一致
transform = transforms.Compose([
transforms.Resize((224, 224)), # ResNet 要求 224x224 的输入
transforms.ToTensor(), # 转换为 Tensor
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # 标准化
])

# Lazily loaded, pretrained ResNet50 with the final fully connected layer removed
_resnet_model = None


def get_resnet_model():
    """Get the ResNet50 model (singleton)."""
    global _resnet_model
    if _resnet_model is None:
        logger.info("Loading ResNet50 model...")
        # Note: `pretrained=True` is deprecated in newer torchvision releases in
        # favour of the `weights=` argument, but is kept here for compatibility
        _resnet_model = models.resnet50(pretrained=True)
        modules = list(_resnet_model.children())[:-1]  # drop the final fully connected layer
        _resnet_model = torch.nn.Sequential(*modules)
        _resnet_model.eval()  # evaluation mode
        logger.info("ResNet50 model loaded")
    return _resnet_model

# MinIO client (singleton)
_minio_client = None


def get_minio_client():
    """Get the MinIO client (singleton)."""
    global _minio_client
    if _minio_client is None:
        _minio_client = Minio(
            MINIO_URL,
            access_key=MINIO_ACCESS,
            secret_key=MINIO_SECRET,
            secure=MINIO_SECURE
        )
    return _minio_client

def get_image_from_minio(path: str) -> Optional[Image.Image]:
    """
    Fetch an image from MinIO.
    Args:
        path: MinIO logical URL, e.g. "bucket_name/object_name"
    Returns:
        PIL Image object, or None on failure
    """
    try:
        # Split the path into bucket name and object name
        path_parts = path.split('/', 1)
        if len(path_parts) != 2:
            logger.error(f"Malformed path: {path}")
            return None
        bucket_name, file_name = path_parts
        minio_client = get_minio_client()
        # Fetch the object, making sure the HTTP connection is released afterwards
        obj = minio_client.get_object(bucket_name, file_name)
        try:
            img_data = obj.read()  # read the image bytes
        finally:
            obj.close()
            obj.release_conn()
        img = Image.open(io.BytesIO(img_data))  # decode the bytes into an image
        return img
    except Exception as e:
        logger.error(f"Failed to fetch image from MinIO [{path}]: {e}")
        return None

def extract_feature_vector(path: str) -> np.ndarray:
    """
    Extract a 2048-dim feature vector with ResNet50.
    Args:
        path: MinIO logical URL
    Returns:
        2048-dim feature vector (numpy array); a zero vector on failure
    """
    try:
        # Fetch the image from MinIO
        img = get_image_from_minio(path)
        if img is None:
            logger.warning(f"Image unavailable, returning zero vector: {path}")
            return np.zeros(RECOMMENDATION_CONFIG["vector_dim"], dtype=np.float32)
        # Preprocess
        # Some MinIO images are RGBA/CMYK; convert to RGB to match the
        # 3-channel normalization parameters
        if img.mode != "RGB":
            try:
                img = img.convert("RGB")
            except Exception:
                logger.warning(f"Cannot convert image to RGB, returning zero vector: {path}")
                return np.zeros(RECOMMENDATION_CONFIG["vector_dim"], dtype=np.float32)
        img_tensor = transform(img).unsqueeze(0)  # add a batch dimension
        # Extract the features
        resnet_model = get_resnet_model()
        with torch.no_grad():  # inference only, no gradients needed
            feature_vector = resnet_model(img_tensor)  # ResNet output
            feature_vector = feature_vector.squeeze().cpu().numpy()  # to NumPy, batch dim removed
        # Flatten to 1-D if necessary
        if feature_vector.ndim > 1:
            feature_vector = feature_vector.flatten()
        # Verify the dimensionality
        if len(feature_vector) != RECOMMENDATION_CONFIG["vector_dim"]:
            logger.warning(f"Unexpected vector dimension: {len(feature_vector)}, expected: {RECOMMENDATION_CONFIG['vector_dim']}")
            # Truncate or zero-pad to the expected dimension
            if len(feature_vector) > RECOMMENDATION_CONFIG["vector_dim"]:
                feature_vector = feature_vector[:RECOMMENDATION_CONFIG["vector_dim"]]
            else:
                padded = np.zeros(RECOMMENDATION_CONFIG["vector_dim"], dtype=np.float32)
                padded[:len(feature_vector)] = feature_vector
                feature_vector = padded
        return feature_vector.astype(np.float32)
    except Exception as e:
        logger.error(f"Feature extraction failed [{path}]: {e}", exc_info=True)
        return np.zeros(RECOMMENDATION_CONFIG["vector_dim"], dtype=np.float32)
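
# --- Added note: each call above runs a full ResNet50 forward pass, one image at a
# time, on CPU by default; for bulk precompute it is usually worth batching. A minimal
# batched variant (illustrative; not used by the original code path):
def _extract_features_batched(paths: list) -> np.ndarray:
    imgs = [get_image_from_minio(p) for p in paths]
    tensors = [transform(i.convert("RGB")) for i in imgs if i is not None]
    if not tensors:
        return np.zeros((0, RECOMMENDATION_CONFIG["vector_dim"]), dtype=np.float32)
    batch = torch.stack(tensors)  # shape (B, 3, 224, 224)
    with torch.no_grad():
        feats = get_resnet_model()(batch)  # shape (B, 2048, 1, 1)
    return feats.squeeze(-1).squeeze(-1).cpu().numpy().astype(np.float32)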

def normalize_vector(vector: np.ndarray) -> np.ndarray:
    """
    L2-normalize a vector.
    Args:
        vector: input vector
    Returns:
        normalized vector (unchanged if its norm is zero)
    """
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector
    return vector / norm

def compute_weighted_average(vectors: list, weights: list) -> np.ndarray:
    """
    Compute a weighted-average vector.
    Args:
        vectors: list of vectors
        weights: list of weights
    Returns:
        weighted-average vector (not normalized; its norm reflects the weighted mean)
    """
    if not vectors or not weights:
        return np.zeros(RECOMMENDATION_CONFIG["vector_dim"], dtype=np.float32)
    # Make sure every vector is a numpy array
    vectors = [np.array(v) for v in vectors]
    weights = np.array(weights)
    # Weighted sum
    weighted_sum = np.zeros_like(vectors[0])
    for v, w in zip(vectors, weights):
        weighted_sum += v * w
    # Divide by the weight total (no L2 normalization here, so the norm does not
    # grow linearly with the number of records)
    weight_total = weights.sum()
    if weight_total == 0:
        return weighted_sum
    return weighted_sum / weight_total
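
# --- Worked example (added note): two 2-dim vectors (dimension reduced for readability;
# the real system uses 2048) with weights 2 and 1 give a weighted mean of
# (2*[1,0] + 1*[0,1]) / 3 = [0.667, 0.333]; applying normalize_vector then yields the
# unit-length vector the IP metric expects.
def _example_weighted_average():
    mean = compute_weighted_average([np.array([1.0, 0.0]), np.array([0.0, 1.0])], [2.0, 1.0])
    return normalize_vector(mean)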