TASK:冷启动热度推荐;

This commit is contained in:
shahaibo
2025-06-10 10:54:20 +08:00
parent a14e6051b1
commit d39dee851f
4 changed files with 400 additions and 16 deletions

View File

@@ -14,6 +14,9 @@ import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import matplotlib.font_manager as fm
from scipy import sparse
import pandas as pd
from datetime import datetime, timedelta
import json
from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX
@@ -50,6 +53,13 @@ minio_client = Minio(
# Preload the system sketch feature vectors once, at module import time.
# `.item()` unwraps the 0-d object array returned by np.load into the stored
# Python object (presumably a dict, judging by the file name — TODO confirm).
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code; confirm this .npy file only ever comes from a trusted source.
SYSTEM_FEATURES = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy', allow_pickle=True).item()
# Per-behavior heat parameters: `weight` is the base score of one event and
# `decay` is the per-day exponential decay coefficient applied to it.
BEHAVIOR_CONFIG = {
    behavior: {'weight': weight, 'decay': decay}
    for behavior, weight, decay in (
        ('portfolioClick', 1, 0.3),
        ('portfolioLike', 2, 0.2),
        ('secondCreation', 3, 0.1),
        ('sketchLike', 4, 0),  # decay 0: this behavior never decays
    )
}
# 保存sketch_to_iid到文件
def save_sketch_to_iid():
@@ -418,9 +428,107 @@ def cosine_similarity(vec1, vec2):
return np.dot(vec1, vec2) / (norm + 1e-10) if norm != 0 else 0.0
def fetch_user_behavior_data(days=30):
    """Fetch user behavior rows from MySQL for the trailing ``days`` days.

    Args:
        days: Size of the look-back window in days, ending at the current
            local time (``datetime.now()``).

    Returns:
        A DataFrame with the columns ``account_id``, ``behavior_type``,
        ``gender``, ``category``, ``url`` and ``create_time``. If connecting
        or querying fails, the error is logged and an empty DataFrame is
        returned, so callers can check ``df.empty`` instead of catching.
    """
    conn = None
    try:
        conn = pymysql.connect(**DB_CONFIG)
        # Bounds of the query window.
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)
        # The bounds are bound as driver parameters (%s placeholders) rather
        # than interpolated into the SQL text, so quoting and datetime
        # formatting are handled by the driver.
        query = """
            SELECT
                account_id,
                behavior_type,
                gender,
                category,
                url,
                create_time
            FROM user_behavior
            WHERE create_time BETWEEN %s AND %s
        """
        df = pd.read_sql(query, conn, params=[start_date, end_date])
        logging.info(f"成功读取{len(df)}条用户行为记录")
        return df
    except Exception as e:
        # Best-effort by design: report the failure and hand back no rows.
        logging.error(f"数据库查询失败: {str(e)}")
        return pd.DataFrame()
    finally:
        # Close the connection on both the success and the failure path.
        if conn is not None:
            conn.close()
def calculate_heat(row, current_date):
    """Return the time-decayed heat contributed by one behavior record.

    The value is ``weight * exp(-decay * days_passed)``, where ``weight`` and
    ``decay`` are looked up in ``BEHAVIOR_CONFIG`` by the record's
    ``behavior_type``. Each record is scored independently; no aggregation
    happens here.

    Args:
        row: Mapping-like record providing ``'create_time'`` and
            ``'behavior_type'``.
        current_date: Reference time the record's age is measured against.

    Returns:
        The heat value; 0 for a behavior type missing from
        ``BEHAVIOR_CONFIG`` (its default weight is 0).
    """
    # Age in whole days. A record stamped even slightly after `current_date`
    # yields `.days == -1`, which would turn the decay into growth and push
    # the heat above the configured weight, so the age is clamped at zero.
    days_passed = max((current_date - row['create_time']).days, 0)
    # Unknown behavior types contribute nothing.
    config = BEHAVIOR_CONFIG.get(row['behavior_type'], {'weight': 0, 'decay': 0})
    return config['weight'] * np.exp(-config['decay'] * days_passed)
def load_heat_matrix_as_array(file_path):
    """Load a saved heat matrix from a JSON file.

    The file must contain an object with the keys ``'data'``,
    ``'row_labels'`` and ``'col_labels'``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(data_array, row_labels, col_labels)`` where ``data_array``
        is ``np.array(saved['data'])`` and the labels are returned as stored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    # Read explicitly as UTF-8: this module writes its JSON with
    # encoding='utf-8' and ensure_ascii=False, so relying on the platform
    # default encoding would break non-ASCII labels on non-UTF-8 locales.
    with open(file_path, encoding='utf-8') as f:
        saved = json.load(f)
    return (
        np.array(saved['data']),  # matrix values
        saved['row_labels'],      # row labels
        saved['col_labels'],      # column labels
    )
def update_heat_matrices():
    """Recompute the heat vectors and persist them as JSON.

    Reads the last 30 days of behavior data, scores every record with
    ``calculate_heat``, sums the scores per (``gender_category``, ``url``)
    pair and writes the result to ``heat_vectors_data/heat_vectors.json``
    (relative to the working directory), overwriting any previous file.

    Returns:
        A dict mapping each ``gender_category`` to a ``{url: heat}`` dict,
        or ``None`` when no behavior data was available.
    """
    current_date = datetime.now()

    behavior_df = fetch_user_behavior_data(30)
    if behavior_df.empty:
        logging.warning("无有效数据,跳过今日计算")
        return None

    # Score each record, then derive the "<gender>_<category>" grouping key.
    behavior_df['heat'] = behavior_df.apply(
        lambda record: calculate_heat(record, current_date=current_date),
        axis=1,
    )
    behavior_df['gender_category'] = behavior_df['gender'] + '_' + behavior_df['category']

    # Sum heat per (gender_category, url) and nest it as {group: {url: heat}}.
    heat_totals = behavior_df.groupby(['gender_category', 'url'])['heat'].sum()
    heat_vectors = {}
    for group_key, heat in heat_totals.items():
        gender_category, url = group_key
        if gender_category not in heat_vectors:
            heat_vectors[gender_category] = {}
        heat_vectors[gender_category][url] = heat

    # Persist to a fixed file name, so each run replaces the previous result.
    save_path = 'heat_vectors_data'
    os.makedirs(save_path, exist_ok=True)
    vectors_file = f"{save_path}/heat_vectors.json"
    payload = {
        'update_time': current_date.strftime('%Y-%m-%d %H:%M:%S'),
        'data': heat_vectors,
    }
    with open(vectors_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

    date_str = current_date.strftime('%Y%m%d')
    logging.info(f"成功存储热度向量,共{len(heat_vectors)}个分组,日期: {date_str}")
    return heat_vectors
if __name__ == "__main__":
try:
update_user_matrices()
# update_user_matrices()
update_heat_matrices()
# scheduler = BlockingScheduler()
# scheduler.add_job(update_user_matrices, 'cron', hour=12, timezone='Asia/Shanghai')
# logging.info("定时任务已启动每天12:00执行")