feat : 代码梳理 移除所有敏感密钥 通过环境变量方式配置
All checks were successful
git commit AiDA python develop 分支构建部署 / scheduled_deploy (push) Has been skipped

This commit is contained in:
zcr
2025-12-30 16:49:08 +08:00
parent 1be716e414
commit 18024a2d70
167 changed files with 5283 additions and 10464 deletions

View File

@@ -18,7 +18,8 @@ import pandas as pd
from datetime import datetime, timedelta
import json
from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX
from app.core.config import TABLE_CATEGORIES, settings
from app.core.mysql_config import DB_CONFIG
# 自动选择可用字体
try:
@@ -51,7 +52,7 @@ minio_client = Minio(
)
# 预加载系统sketch特征向量
SYSTEM_FEATURES = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy', allow_pickle=True).item()
SYSTEM_FEATURES = np.load(f'{settings.RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy', allow_pickle=True).item()
# 行为权重和衰减系数
BEHAVIOR_CONFIG = {
@@ -61,6 +62,7 @@ BEHAVIOR_CONFIG = {
'sketchLike': {'weight': 4, 'decay': 0} # 不衰减
}
# 保存sketch_to_iid到文件
def save_sketch_to_iid():
"""保存sketch到iid的映射"""
@@ -147,11 +149,11 @@ def update_user_matrices():
cursor = conn.cursor()
# 修改后的查询语句移除category过滤
cursor.execute("""
SELECT account_id, path, COUNT(*) as like_count
FROM user_preference_log_test
GROUP BY account_id, path
""")
cursor.execute("""
SELECT account_id, path, COUNT(*) as like_count
FROM user_preference_log_test
GROUP BY account_id, path
""")
user_data = cursor.fetchall()
logging.info(f"成功读取{len(user_data)}条用户偏好记录")
@@ -164,17 +166,17 @@ def update_user_matrices():
feature_matrix, user_index_feature_matrix, sketch_index_feature_matrix, iid_to_category_feature_matrix = calculate_feature_matrix(user_data)
# visualize_sparse_matrix(feature_matrix, '系统sketch与用户category平均特征向量关联度矩阵', 'correlation_matrix.png')
# 存储矩阵
np.save(f"{RECOMMEND_PATH_PREFIX}interaction_matrix.npy", interaction_matrix)
np.save(f"{RECOMMEND_PATH_PREFIX}feature_matrix.npy", feature_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}interaction_matrix.npy", interaction_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}feature_matrix.npy", feature_matrix)
#
np.save(f"{RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", iid_to_category_interaction_matrix)
np.save(f"{RECOMMEND_PATH_PREFIX}user_index_interaction_matrix.npy", user_index_interaction_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", iid_to_category_interaction_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}user_index_interaction_matrix.npy", user_index_interaction_matrix)
#
np.save(f"{RECOMMEND_PATH_PREFIX}iid_to_category_feature_matrix.npy", iid_to_category_feature_matrix)
np.save(f"{RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", user_index_feature_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}iid_to_category_feature_matrix.npy", iid_to_category_feature_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", user_index_feature_matrix)
#
np.save(f"{RECOMMEND_PATH_PREFIX}sketch_index_interaction_matrix.npy", sketch_index_interaction_matrix)
np.save(f"{RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", sketch_index_feature_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}sketch_index_interaction_matrix.npy", sketch_index_interaction_matrix)
np.save(f"{settings.RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", sketch_index_feature_matrix)
# logging.info("矩阵更新完成")
except Exception as e:
@@ -235,6 +237,7 @@ def plot_interaction_count_matrix(interaction_count_matrix):
except Exception as e:
logging.error(f"绘图失败: {str(e)}", exc_info=True)
def visualize_sparse_matrix(matrix, title='Non-zero Interactions (Scatter Plot)', filename="scatter_figure_interaction.png"):
if not sparse.issparse(matrix):
# 转换为稀疏矩阵
@@ -253,6 +256,7 @@ def visualize_sparse_matrix(matrix, title='Non-zero Interactions (Scatter Plot)'
plt.ylabel('Item Index')
plt.savefig(filename)
def calculate_interaction_matrix(user_data):
"""基于新表结构的交互次数矩阵计算仅系统sketch"""
# 获取所有用户ID
@@ -475,6 +479,7 @@ def calculate_heat(row, current_date):
# 计算热度值 = 权重 * e^(-衰减系数 * 天数)
return config['weight'] * np.exp(-config['decay'] * days_passed)
def load_heat_matrix_as_array(file_path):
"""
直接加载为二维numpy数组
@@ -484,10 +489,11 @@ def load_heat_matrix_as_array(file_path):
saved = json.load(f)
return (
np.array(saved['data']), # 二维矩阵
saved['row_labels'], # 行标签列表
saved['col_labels'] # 列标签列表
saved['row_labels'], # 行标签列表
saved['col_labels'] # 列标签列表
)
def update_heat_matrices():
"""每日计算并存储热度矩阵gender_category × path"""
current_date = datetime.now()

View File

@@ -1,240 +1,241 @@
# # 预加载资源
# import logging
# import time
# from collections import defaultdict
# import os
# import json
# import numpy as np
#
# from app.core.config import DB_CONFIG, RECOMMEND_PATH_PREFIX
#
# logger = logging.getLogger()
# import pymysql
# from concurrent.futures import ThreadPoolExecutor
#
# HEAT_VECTOR_FILE = 'heat_vectors_data/heat_vectors.json' # 可动态加载或配置
#
# matrix_data = {
# "interaction_matrix": None,
# "feature_matrix": None,
# "user_index_interaction": None,
# "sketch_index_interaction": None,
# "user_index_feature": None,
# "sketch_index_feature": None,
# "iid_to_sketch": None,
# "category_to_iids": None,
# "cached_scores": {},
# "cached_valid_idxs": {},
# "category_sketch_idxs_inter": None,
# "category_sketch_idxs_feature": None,
# "user_inter_full": dict(),
# "user_feat_full": dict(),
# "brand_feature_matrix": None,
# "brand_index_map": None,
# "heat_data": {},
# }
#
#
# def load_resources():
# """加载所有矩阵和映射关系,并触发预缓存"""
# try:
# start_time = time.time()
#
# # 清空缓存
# matrix_data["cached_scores"].clear()
# matrix_data["cached_valid_idxs"].clear()
#
# # 加载数据
# sketch_to_iid = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_to_iid.npy', allow_pickle=True).item()
# matrix_data["iid_to_sketch"] = {v: k for k, v in sketch_to_iid.items()}
#
# matrix_data["interaction_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}interaction_matrix.npy", allow_pickle=True)
# matrix_data["user_index_interaction"] = np.load(f"{RECOMMEND_PATH_PREFIX}user_index_interaction_matrix.npy", allow_pickle=True).item()
# matrix_data["sketch_index_interaction"] = np.load(f"{RECOMMEND_PATH_PREFIX}sketch_index_interaction_matrix.npy",
# allow_pickle=True).item()
#
# matrix_data["feature_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}feature_matrix.npy", allow_pickle=True)
#
# brand_feature_path = f"{RECOMMEND_PATH_PREFIX}brand_feature_matrix.npy"
# if os.path.exists(brand_feature_path):
# matrix_data["brand_feature_matrix"] = np.load(brand_feature_path, allow_pickle=True)
# else:
# logger.warning("brand_feature_matrix 文件不存在,使用空数组")
# matrix_data["brand_feature_matrix"] = np.array([])
#
# # brand_index_map
# brand_index_path = f"{RECOMMEND_PATH_PREFIX}brand_index_map.npy"
# if os.path.exists(brand_index_path):
# matrix_data["brand_index_map"] = np.load(brand_index_path, allow_pickle=True).item()
# else:
# logger.warning("brand_index_map 文件不存在,使用空字典")
# matrix_data["brand_index_map"] = {}
#
# matrix_data["user_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", allow_pickle=True).item()
#
# matrix_data["sketch_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", allow_pickle=True).item()
#
# category_to_iid_map = np.load(f"{RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", allow_pickle=True).item()
# matrix_data["category_to_iids"] = defaultdict(list)
# for iid, cat in category_to_iid_map.items():
# matrix_data["category_to_iids"][cat].append(iid)
#
# logger.info(f"资源加载完成,耗时: {time.time() - start_time:.2f}秒")
#
# # 触发预缓存
# precache_user_category()
#
# if os.path.exists(HEAT_VECTOR_FILE):
# with open(HEAT_VECTOR_FILE, 'r', encoding='utf-8') as f:
# heat_json = json.load(f)
# matrix_data["heat_data"] = heat_json.get("data", {})
# logger.info(f"热度向量数据加载完成,共加载 {len(matrix_data['heat_data'])} 个类别")
# else:
# matrix_data["heat_data"] = {}
#
# except Exception as e:
# logger.error(f"资源加载失败: {str(e)}")
# raise RuntimeError("初始化失败")
#
#
# def precache_user_category():
# """优化后的用户分类预缓存(添加耗时统计)"""
# if not all([
# matrix_data["interaction_matrix"] is not None,
# matrix_data["feature_matrix"] is not None,
# matrix_data["user_index_interaction"] is not None
# ]):
# logger.warning("资源未加载完成,跳过预缓存")
# return
#
# start_time = time.perf_counter()
# time_stats = {
# "get_all_user_categories": 0,
# "process_user_category": 0,
# "thread_execution": 0,
# "cache_update": 0,
# "total": 0,
# }
#
# # 统计用户类别获取时间
# t1 = time.perf_counter()
# user_categories = get_all_user_categories()
# time_stats["get_all_user_categories"] = time.perf_counter() - t1
#
# precached_count = 0
#
# def process_user_category(user_id, categories):
# """单用户类别缓存计算(统计耗时)"""
# local_cache = {}
# local_valid_idxs = {}
# t_start = time.perf_counter()
#
# for category in categories:
# cache_key = (user_id, category)
# if cache_key in matrix_data["cached_scores"]:
# continue
#
# try:
# user_idx_inter = matrix_data["user_index_interaction"].get(user_id)
# user_idx_feature = matrix_data["user_index_feature"].get(user_id)
#
# # 统计获取类别 IID 耗时
# t_iid = time.perf_counter()
# category_iids = matrix_data["category_to_iids"].get(category, [])
# valid_sketch_idxs_inter = [matrix_data["sketch_index_interaction"][iid]
# for iid in category_iids if iid in matrix_data["sketch_index_interaction"]]
# valid_sketch_idxs_feature = [matrix_data["sketch_index_feature"][iid]
# for iid in category_iids if iid in matrix_data["sketch_index_feature"]]
# time_stats["process_user_category"] += time.perf_counter() - t_iid
#
# # 统计矩阵计算耗时
# t_matrix = time.perf_counter()
# processed_inter = np.zeros(len(valid_sketch_idxs_inter))
# if user_idx_inter is not None and valid_sketch_idxs_inter:
# raw_inter_scores = matrix_data["interaction_matrix"][user_idx_inter, valid_sketch_idxs_inter]
# processed_inter = raw_inter_scores * 0.7
#
# processed_feat = np.zeros(len(valid_sketch_idxs_feature))
# if user_idx_feature is not None and valid_sketch_idxs_feature:
# raw_feat_scores = matrix_data["feature_matrix"][user_idx_feature, valid_sketch_idxs_feature]
# raw_feat_scores = (raw_feat_scores - np.min(raw_feat_scores)) / (
# np.max(raw_feat_scores) - np.min(raw_feat_scores) + 1e-8)
# processed_feat = raw_feat_scores * 0.3
# time_stats["process_user_category"] += time.perf_counter() - t_matrix
#
# if len(processed_inter) == len(processed_feat):
# local_cache[cache_key] = (processed_inter, processed_feat)
# local_valid_idxs[cache_key] = valid_sketch_idxs_inter
#
# except Exception as e:
# logger.error(f"预缓存失败 (user={user_id}, category={category}): {str(e)}")
#
# return local_cache, local_valid_idxs
#
# # 统计线程执行时间
# t2 = time.perf_counter()
# with ThreadPoolExecutor(max_workers=8) as executor:
# futures = {executor.submit(process_user_category, user_id, categories): user_id for user_id, categories in user_categories.items()}
# for future in futures:
# try:
# t_cache = time.perf_counter()
# cache_part, valid_idxs_part = future.result()
# matrix_data["cached_scores"].update(cache_part)
# matrix_data["cached_valid_idxs"].update(valid_idxs_part)
# time_stats["cache_update"] += time.perf_counter() - t_cache
# precached_count += len(cache_part)
# except Exception as e:
# logger.error(f"线程执行错误: {str(e)}")
# time_stats["thread_execution"] = time.perf_counter() - t2
#
# time_stats["total"] = time.perf_counter() - start_time
#
# # 输出统计信息
# logger.info(f"""
# 预缓存完成,共缓存 {precached_count} 组数据,耗时统计如下:
# - 获取用户类别数据: {time_stats["get_all_user_categories"]:.2f}s
# - 计算用户类别缓存: {time_stats["process_user_category"]:.2f}s
# - 线程任务执行: {time_stats["thread_execution"]:.2f}s
# - 更新缓存数据: {time_stats["cache_update"]:.2f}s
# - 总耗时: {time_stats["total"]:.2f}s
# """)
#
#
# def get_all_user_categories():
# """获取所有用户及其对应的分类"""
# conn = None
# try:
# conn = pymysql.connect(**DB_CONFIG)
# cursor = conn.cursor()
#
# query = """
# SELECT DISTINCT account_id, path
# FROM user_preference_log_prediction
# """
# cursor.execute(query)
# results = cursor.fetchall()
#
# user_categories = defaultdict(set)
# for account_id, path in results:
# category = get_category_from_path(path)
# user_categories[account_id].add(category)
#
# return dict(user_categories)
#
# except Exception as e:
# logger.error(f"数据库查询失败: {str(e)}")
# return {}
# finally:
# if conn:
# conn.close()
#
#
# def get_category_from_path(path: str) -> str:
# """从路径解析类别"""
# try:
# parts = path.split('/')
# if len(parts) >= 4:
# return f"{parts[2]}_{parts[3]}"
# return "unknown"
# except:
# return "unknown"
# 预加载资源
import logging
import time
from collections import defaultdict
import os
import json
import numpy as np
from app.core.config import settings
from app.core.mysql_config import DB_CONFIG
logger = logging.getLogger()
import pymysql
from concurrent.futures import ThreadPoolExecutor
# Path of the precomputed heat-vector JSON file; could be made configurable.
HEAT_VECTOR_FILE = 'heat_vectors_data/heat_vectors.json'

# Module-level store for every matrix, index map and derived cache used by the
# recommender.  All entries start empty and are populated by load_resources().
matrix_data = dict(
    interaction_matrix=None,            # user x sketch interaction-count matrix
    feature_matrix=None,                # user x sketch feature-similarity matrix
    user_index_interaction=None,        # user_id -> row index in interaction matrix
    sketch_index_interaction=None,      # iid -> column index in interaction matrix
    user_index_feature=None,            # user_id -> row index in feature matrix
    sketch_index_feature=None,          # iid -> column index in feature matrix
    iid_to_sketch=None,                 # iid -> sketch identifier
    category_to_iids=None,              # category -> list of iids
    cached_scores={},                   # (user_id, category) -> (inter, feat) score vectors
    cached_valid_idxs={},               # (user_id, category) -> valid sketch column indices
    category_sketch_idxs_inter=None,
    category_sketch_idxs_feature=None,
    user_inter_full={},
    user_feat_full={},
    brand_feature_matrix=None,
    brand_index_map=None,
    heat_data={},                       # category -> heat vector (from HEAT_VECTOR_FILE)
)
def load_resources():
    """Load all matrices and index maps into ``matrix_data`` and trigger pre-caching.

    Reads the ``.npy`` artifacts produced by the offline matrix-update job from
    ``settings.RECOMMEND_PATH_PREFIX``, rebuilds the category -> iid index,
    warms the per-(user, category) cache via ``precache_user_category()``, and
    finally loads the optional heat-vector JSON from ``HEAT_VECTOR_FILE``.

    Raises:
        RuntimeError: if any resource fails to load.  The underlying exception
            is chained as ``__cause__`` so the root cause is not lost.
    """
    try:
        start_time = time.time()
        # Drop stale per-(user, category) caches before reloading.
        matrix_data["cached_scores"].clear()
        matrix_data["cached_valid_idxs"].clear()
        # sketch -> iid mapping; invert it for iid -> sketch lookups.
        sketch_to_iid = np.load(f'{settings.RECOMMEND_PATH_PREFIX}sketch_to_iid.npy', allow_pickle=True).item()
        matrix_data["iid_to_sketch"] = {v: k for k, v in sketch_to_iid.items()}
        matrix_data["interaction_matrix"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}interaction_matrix.npy", allow_pickle=True)
        matrix_data["user_index_interaction"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}user_index_interaction_matrix.npy", allow_pickle=True).item()
        matrix_data["sketch_index_interaction"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}sketch_index_interaction_matrix.npy",
                                                          allow_pickle=True).item()
        matrix_data["feature_matrix"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}feature_matrix.npy", allow_pickle=True)
        # Brand artifacts are optional; fall back to empty containers when absent.
        brand_feature_path = f"{settings.RECOMMEND_PATH_PREFIX}brand_feature_matrix.npy"
        if os.path.exists(brand_feature_path):
            matrix_data["brand_feature_matrix"] = np.load(brand_feature_path, allow_pickle=True)
        else:
            logger.warning("brand_feature_matrix 文件不存在,使用空数组")
            matrix_data["brand_feature_matrix"] = np.array([])
        # brand_index_map
        brand_index_path = f"{settings.RECOMMEND_PATH_PREFIX}brand_index_map.npy"
        if os.path.exists(brand_index_path):
            matrix_data["brand_index_map"] = np.load(brand_index_path, allow_pickle=True).item()
        else:
            logger.warning("brand_index_map 文件不存在,使用空字典")
            matrix_data["brand_index_map"] = {}
        matrix_data["user_index_feature"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", allow_pickle=True).item()
        matrix_data["sketch_index_feature"] = np.load(f"{settings.RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", allow_pickle=True).item()
        # Group iids by category for fast per-category candidate lookup.
        category_to_iid_map = np.load(f"{settings.RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", allow_pickle=True).item()
        matrix_data["category_to_iids"] = defaultdict(list)
        for iid, cat in category_to_iid_map.items():
            matrix_data["category_to_iids"][cat].append(iid)
        logger.info(f"资源加载完成,耗时: {time.time() - start_time:.2f}")
        # Warm the per-(user, category) score cache.
        precache_user_category()
        # Heat vectors are optional as well; missing file yields an empty mapping.
        if os.path.exists(HEAT_VECTOR_FILE):
            with open(HEAT_VECTOR_FILE, 'r', encoding='utf-8') as f:
                heat_json = json.load(f)
                matrix_data["heat_data"] = heat_json.get("data", {})
                logger.info(f"热度向量数据加载完成,共加载 {len(matrix_data['heat_data'])} 个类别")
        else:
            matrix_data["heat_data"] = {}
    except Exception as e:
        logger.error(f"资源加载失败: {str(e)}")
        # Chain the original exception so callers can see the real root cause.
        raise RuntimeError("初始化失败") from e
def precache_user_category():
    """Pre-compute and cache per-(user, category) recommendation score vectors.

    For every user/category pair found in the preference log, computes a
    weighted interaction-score vector (x0.7) and a min-max-normalised
    feature-score vector (x0.3), storing both in matrix_data["cached_scores"]
    alongside the valid sketch column indices.  Work fans out over a thread
    pool; per-phase wall-clock timings are accumulated and logged at the end.
    Silently skips when load_resources() has not populated the matrices yet.
    """
    if not all([
        matrix_data["interaction_matrix"] is not None,
        matrix_data["feature_matrix"] is not None,
        matrix_data["user_index_interaction"] is not None
    ]):
        logger.warning("资源未加载完成,跳过预缓存")
        return
    start_time = time.perf_counter()
    time_stats = {
        "get_all_user_categories": 0,
        "process_user_category": 0,
        "thread_execution": 0,
        "cache_update": 0,
        "total": 0,
    }
    # Time the DB fetch of user -> categories.
    t1 = time.perf_counter()
    user_categories = get_all_user_categories()
    time_stats["get_all_user_categories"] = time.perf_counter() - t1
    precached_count = 0

    def process_user_category(user_id, categories):
        """Compute cache entries for one user (runs on a worker thread)."""
        local_cache = {}
        local_valid_idxs = {}
        for category in categories:
            cache_key = (user_id, category)
            if cache_key in matrix_data["cached_scores"]:
                continue  # already cached by a previous run
            try:
                user_idx_inter = matrix_data["user_index_interaction"].get(user_id)
                user_idx_feature = matrix_data["user_index_feature"].get(user_id)
                # Time the category -> column-index resolution.
                t_iid = time.perf_counter()
                category_iids = matrix_data["category_to_iids"].get(category, [])
                valid_sketch_idxs_inter = [matrix_data["sketch_index_interaction"][iid]
                                           for iid in category_iids if iid in matrix_data["sketch_index_interaction"]]
                valid_sketch_idxs_feature = [matrix_data["sketch_index_feature"][iid]
                                             for iid in category_iids if iid in matrix_data["sketch_index_feature"]]
                time_stats["process_user_category"] += time.perf_counter() - t_iid
                # Time the score-vector computation.
                t_matrix = time.perf_counter()
                processed_inter = np.zeros(len(valid_sketch_idxs_inter))
                if user_idx_inter is not None and valid_sketch_idxs_inter:
                    raw_inter_scores = matrix_data["interaction_matrix"][user_idx_inter, valid_sketch_idxs_inter]
                    processed_inter = raw_inter_scores * 0.7  # interaction weight
                processed_feat = np.zeros(len(valid_sketch_idxs_feature))
                if user_idx_feature is not None and valid_sketch_idxs_feature:
                    raw_feat_scores = matrix_data["feature_matrix"][user_idx_feature, valid_sketch_idxs_feature]
                    # Min-max normalise before weighting (epsilon avoids div-by-zero).
                    raw_feat_scores = (raw_feat_scores - np.min(raw_feat_scores)) / (
                            np.max(raw_feat_scores) - np.min(raw_feat_scores) + 1e-8)
                    processed_feat = raw_feat_scores * 0.3  # feature weight
                time_stats["process_user_category"] += time.perf_counter() - t_matrix
                # Only cache when both vectors align column-for-column.
                if len(processed_inter) == len(processed_feat):
                    local_cache[cache_key] = (processed_inter, processed_feat)
                    local_valid_idxs[cache_key] = valid_sketch_idxs_inter
            except Exception as e:
                logger.error(f"预缓存失败 (user={user_id}, category={category}): {str(e)}")
        return local_cache, local_valid_idxs

    # Time the fan-out / result-collection phase.
    t2 = time.perf_counter()
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_user_category, user_id, categories): user_id for user_id, categories in user_categories.items()}
        for future in futures:
            try:
                t_cache = time.perf_counter()
                cache_part, valid_idxs_part = future.result()
                matrix_data["cached_scores"].update(cache_part)
                matrix_data["cached_valid_idxs"].update(valid_idxs_part)
                time_stats["cache_update"] += time.perf_counter() - t_cache
                precached_count += len(cache_part)
            except Exception as e:
                logger.error(f"线程执行错误: {str(e)}")
    time_stats["thread_execution"] = time.perf_counter() - t2
    time_stats["total"] = time.perf_counter() - start_time
    # Emit the timing summary.
    logger.info(f"""
    预缓存完成,共缓存 {precached_count} 组数据,耗时统计如下:
    - 获取用户类别数据: {time_stats["get_all_user_categories"]:.2f}s
    - 计算用户类别缓存: {time_stats["process_user_category"]:.2f}s
    - 线程任务执行: {time_stats["thread_execution"]:.2f}s
    - 更新缓存数据: {time_stats["cache_update"]:.2f}s
    - 总耗时: {time_stats["total"]:.2f}s
    """)
def get_all_user_categories():
    """Return ``{account_id: set(category)}`` for every user in the preference log.

    Queries distinct (account_id, path) pairs from
    ``user_preference_log_prediction`` and maps each path to a category via
    ``get_category_from_path``.  On any failure an empty dict is returned so
    pre-caching degrades gracefully instead of crashing.
    """
    conn = None
    try:
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
        # NOTE(review): a stray line-continuation backslash after the table name
        # (a merge artifact) was removed; it spliced indentation into the SQL text.
        query = """
            SELECT DISTINCT account_id, path
            FROM user_preference_log_prediction
        """
        cursor.execute(query)
        results = cursor.fetchall()
        user_categories = defaultdict(set)
        for account_id, path in results:
            category = get_category_from_path(path)
            user_categories[account_id].add(category)
        return dict(user_categories)
    except Exception as e:
        logger.error(f"数据库查询失败: {str(e)}")
        return {}
    finally:
        # Closing the connection also releases its cursors.
        if conn:
            conn.close()
def get_category_from_path(path: str) -> str:
"""从路径解析类别"""
try:
parts = path.split('/')
if len(parts) >= 4:
return f"{parts[2]}_{parts[3]}"
return "unknown"
except:
return "unknown"