TASK:冷启动热度推荐;

This commit is contained in:
shahaibo
2025-06-10 10:54:20 +08:00
parent a14e6051b1
commit d39dee851f
4 changed files with 400 additions and 16 deletions

View File

@@ -14,6 +14,9 @@ import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import matplotlib.font_manager as fm
from scipy import sparse
import pandas as pd
from datetime import datetime, timedelta
import json
from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX
@@ -50,6 +53,13 @@ minio_client = Minio(
# 预加载系统sketch特征向量
SYSTEM_FEATURES = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy', allow_pickle=True).item()
# Per-behavior-type weight and decay coefficient, keyed by behavior type.
BEHAVIOR_CONFIG = dict(
    portfolioClick=dict(weight=1, decay=0.3),
    portfolioLike=dict(weight=2, decay=0.2),
    secondCreation=dict(weight=3, decay=0.1),
    sketchLike=dict(weight=4, decay=0),  # does not decay
)
# 保存sketch_to_iid到文件
def save_sketch_to_iid():
@@ -418,9 +428,107 @@ def cosine_similarity(vec1, vec2):
return np.dot(vec1, vec2) / (norm + 1e-10) if norm != 0 else 0.0
def fetch_user_behavior_data(days=30):
    """Fetch recent rows of the ``user_behavior`` table from MySQL.

    Args:
        days: Size of the look-back window in days, counted back from the
            moment of the call.

    Returns:
        A DataFrame with the columns account_id, behavior_type, gender,
        category, url and create_time for every row whose create_time lies
        inside the window. If connecting or querying fails, the error is
        logged and an empty DataFrame is returned instead of raising.
    """
    conn = None
    try:
        conn = pymysql.connect(**DB_CONFIG)

        # Window bounds: [now - days, now].
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)

        # The bounds are bound as query parameters so the driver handles
        # quoting and datetime formatting, rather than interpolating them
        # into the SQL text.
        query = """
            SELECT
                account_id,
                behavior_type,
                gender,
                category,
                url,
                create_time
            FROM user_behavior
            WHERE create_time BETWEEN %s AND %s
        """
        df = pd.read_sql(query, conn, params=[start_date, end_date])
        logging.info(f"成功读取{len(df)}条用户行为记录")
        return df
    except Exception as e:
        # Best-effort: callers treat an empty frame as "nothing to process".
        logging.error(f"数据库查询失败: {str(e)}")
        return pd.DataFrame()
    finally:
        if conn:
            conn.close()
def calculate_heat(row, current_date, behavior_config=None):
    """Return the time-decayed heat value of a single behavior record.

    Each record is scored independently (no aggregation by count):

        heat = weight * exp(-decay * days_elapsed)

    Args:
        row: Mapping-like record providing ``'create_time'`` (a datetime)
            and ``'behavior_type'``.
        current_date: Reference datetime the age of the record is measured
            against.
        behavior_config: Optional mapping of behavior type to a
            ``{'weight': ..., 'decay': ...}`` dict. Defaults to the
            module-level ``BEHAVIOR_CONFIG``.

    Returns:
        The heat value. Behavior types missing from the configuration get
        weight 0 and therefore heat 0.
    """
    config_table = BEHAVIOR_CONFIG if behavior_config is None else behavior_config

    # Whole days elapsed. A record stamped after ``current_date`` (clock skew,
    # or a row inserted after the reference time was taken) would give a
    # negative age and a heat above the configured weight, so clamp at 0.
    days_passed = max((current_date - row['create_time']).days, 0)

    # Unknown behavior types contribute nothing.
    config = config_table.get(row['behavior_type'], {'weight': 0, 'decay': 0})

    return config['weight'] * np.exp(-config['decay'] * days_passed)
def load_heat_matrix_as_array(file_path):
    """Load a saved heat matrix from a JSON file as a 2-D numpy array.

    The JSON document must contain the keys ``'data'``, ``'row_labels'`` and
    ``'col_labels'``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(data_array, row_labels, col_labels)`` where ``data_array``
        is ``np.array(saved['data'])`` and the labels are returned as stored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    # Read as UTF-8 explicitly: labels may contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, encoding='utf-8') as f:
        saved = json.load(f)
    return (
        np.array(saved['data']),  # 2-D matrix
        saved['row_labels'],      # row label list
        saved['col_labels'],      # column label list
    )
def update_heat_matrices():
    """Compute heat totals per (gender_category, url) and store them as JSON.

    Reads the last 30 days of user behavior, scores every record with
    ``calculate_heat``, sums the scores per ``gender_category`` / ``url``
    pair and writes the result to ``heat_vectors_data/heat_vectors.json``
    (relative to the current working directory) together with an
    ``update_time`` stamp.

    Returns:
        A dict mapping ``gender_category`` to a ``{url: heat}`` dict, or
        ``None`` when no behavior data was available.
    """
    current_date = datetime.now()

    # Fetch the data; an empty frame means there is nothing to compute.
    df = fetch_user_behavior_data(30)
    if df.empty:
        logging.warning("无有效数据,跳过今日计算")
        return None

    # Score each record and build the grouping key "<gender>_<category>".
    df['heat'] = df.apply(calculate_heat, axis=1, current_date=current_date)
    df['gender_category'] = df['gender'] + '_' + df['category']

    # Build the heat vectors: gender_category -> {url: summed heat}.
    heat_vectors = {}
    grouped = df.groupby(['gender_category', 'url'])['heat'].sum()
    for (gender_category, url), heat in grouped.items():
        heat_vectors.setdefault(gender_category, {})[url] = heat

    # Persist the result.
    save_path = 'heat_vectors_data'
    os.makedirs(save_path, exist_ok=True)
    date_str = current_date.strftime('%Y%m%d')
    vectors_file = f"{save_path}/heat_vectors.json"

    # Write to a sibling temp file and swap it in, so a process reading
    # heat_vectors.json never observes a truncated or half-written document.
    tmp_file = f"{vectors_file}.tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump({
            'update_time': current_date.strftime('%Y-%m-%d %H:%M:%S'),
            'data': heat_vectors
        }, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, vectors_file)

    logging.info(f"成功存储热度向量,共{len(heat_vectors)}个分组,日期: {date_str}")
    return heat_vectors
if __name__ == "__main__":
try:
update_user_matrices()
# update_user_matrices()
update_heat_matrices()
# scheduler = BlockingScheduler()
# scheduler.add_job(update_user_matrices, 'cron', hour=12, timezone='Asia/Shanghai')
# logging.info("定时任务已启动每天12:00执行")

View File

@@ -2,7 +2,8 @@
import logging
import time
from collections import defaultdict
import os
import json
import numpy as np
from app.core.config import DB_CONFIG, RECOMMEND_PATH_PREFIX
@@ -11,6 +12,8 @@ logger = logging.getLogger()
import pymysql
from concurrent.futures import ThreadPoolExecutor
# Location of the heat-vector JSON file; could be loaded dynamically or made configurable.
HEAT_VECTOR_FILE = "heat_vectors_data/heat_vectors.json"
matrix_data = {
"interaction_matrix": None,
"feature_matrix": None,
@@ -26,6 +29,9 @@ matrix_data = {
"category_sketch_idxs_feature": None,
"user_inter_full": dict(),
"user_feat_full": dict(),
"brand_feature_matrix": None,
"brand_index_map": None,
"heat_data": {},
}
@@ -48,7 +54,13 @@ def load_resources():
allow_pickle=True).item()
matrix_data["feature_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}feature_matrix.npy", allow_pickle=True)
matrix_data["brand_feature_matrix"] = np.load(f"{RECOMMEND_PATH_PREFIX}brand_feature_matrix.npy", allow_pickle=True)
matrix_data["brand_index_map"] = np.load(f"{RECOMMEND_PATH_PREFIX}brand_index_map.npy",allow_pickle=True).item()
matrix_data["user_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}user_index_feature_matrix.npy", allow_pickle=True).item()
matrix_data["sketch_index_feature"] = np.load(f"{RECOMMEND_PATH_PREFIX}sketch_index_feature_matrix.npy", allow_pickle=True).item()
category_to_iid_map = np.load(f"{RECOMMEND_PATH_PREFIX}iid_to_category_interaction_matrix.npy", allow_pickle=True).item()
@@ -61,6 +73,14 @@ def load_resources():
# 触发预缓存
precache_user_category()
if os.path.exists(HEAT_VECTOR_FILE):
with open(HEAT_VECTOR_FILE, 'r', encoding='utf-8') as f:
heat_json = json.load(f)
matrix_data["heat_data"] = heat_json.get("data", {})
logger.info(f"热度向量数据加载完成,共加载 {len(matrix_data['heat_data'])} 个类别")
else:
matrix_data["heat_data"] = {}
except Exception as e:
logger.error(f"资源加载失败: {str(e)}")
raise RuntimeError("初始化失败")