TASK:冷启动热度推荐；

2025-06-10 10:54:20 +08:00
parent a14e6051b1
commit d39dee851f
4 changed files with 400 additions and 16 deletions
--- a/app/service/recommend/scheduled_task.py
+++ b/app/service/recommend/scheduled_task.py
@@ -14,6 +14,9 @@ import matplotlib.pyplot as plt
 from scipy.sparse import csr_matrix
 import matplotlib.font_manager as fm
 from scipy import sparse
+import pandas as pd
+from datetime import datetime, timedelta
+import json

 from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX

@@ -50,6 +53,13 @@ minio_client = Minio(
 # 预加载系统sketch特征向量
 SYSTEM_FEATURES = np.load(f'{RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy', allow_pickle=True).item()

+# Per-behavior base weight and exponential decay coefficient (used by calculate_heat)
+BEHAVIOR_CONFIG = {
+    'portfolioClick': {'weight': 1, 'decay': 0.3},
+    'portfolioLike': {'weight': 2, 'decay': 0.2},
+    'secondCreation': {'weight': 3, 'decay': 0.1},
+    'sketchLike': {'weight': 4, 'decay': 0}  # decay 0: never decays
+}

 # 保存sketch_to_iid到文件
 def save_sketch_to_iid():
@@ -418,9 +428,107 @@ def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm + 1e-10) if norm != 0 else 0.0


+def fetch_user_behavior_data(days=30):
+    """Read user_behavior rows created within the last `days` days from MySQL; returns a DataFrame (empty on any error)."""
+    conn = None
+    try:
+        conn = pymysql.connect(**DB_CONFIG)
+
+        # Time window [now - days, now], taken from the local clock
+        end_date = datetime.now()
+        start_date = end_date - timedelta(days=days)
+
+        # The bounds are bound by the driver (%s placeholders), not formatted into the SQL text
+        query = """
+        SELECT 
+            account_id,
+            behavior_type,
+            gender,
+            category,
+            url,
+            create_time
+        FROM user_behavior
+        WHERE create_time BETWEEN %s AND %s
+        """
+
+        df = pd.read_sql(query, conn, params=(start_date, end_date))
+        logging.info(f"成功读取{len(df)}条用户行为记录")
+        return df
+
+    except Exception as e:
+        logging.error(f"数据库查询失败: {str(e)}")
+        return pd.DataFrame()
+
+    finally:
+        if conn:
+            conn.close()
+
+
+def calculate_heat(row, current_date):
+    """Return weight * exp(-decay * elapsed_days) for one behavior row (each event is scored on its own, no aggregation)."""
+    # Whole days elapsed; clamped at 0 because a create_time even slightly after current_date yields .days == -1
+    days_passed = max((current_date - row['create_time']).days, 0)
+
+    # Look up the behavior's config; unknown behavior types fall back to weight 0
+    config = BEHAVIOR_CONFIG.get(row['behavior_type'], {'weight': 0, 'decay': 0})
+
+    # heat = weight * e^(-decay * days)
+    return config['weight'] * np.exp(-config['decay'] * days_passed)
+
+def load_heat_matrix_as_array(file_path):
+    """
+    Load a saved heat-matrix JSON file (read as UTF-8 so non-ASCII labels do not depend on the platform default).
+    Returns: (data_array, row_labels, col_labels), where data_array is np.array(saved['data']).
+    """
+    with open(file_path, encoding='utf-8') as f:
+        saved = json.load(f)
+    return (
+        np.array(saved['data']),  # matrix values (expected to be 2-D — TODO confirm against the file writer)
+        saved['row_labels'],      # list of row labels
+        saved['col_labels']       # list of column labels
+    )
+
+def update_heat_matrices():
+    """Compute the 30-day heat per (gender_category, url), save it as JSON, and return it (None if no data)."""
+    current_date = datetime.now()
+
+    # Fetch the last 30 days of behavior rows; an empty frame means no rows or a failed query
+    df = fetch_user_behavior_data(30)
+    if df.empty:
+        logging.warning("无有效数据，跳过今日计算")
+        return None
+
+    # Score every row, then build the grouping key "<gender>_<category>"
+    df['heat'] = df.apply(calculate_heat, axis=1, current_date=current_date)
+    df['gender_category'] = df['gender'] + '_' + df['category']
+
+    # Sum heat per (gender_category, url) into a nested dict: {gender_category: {url: heat}}
+    heat_vectors = {}
+    grouped = df.groupby(['gender_category', 'url'])['heat'].sum()
+    for (gender_category, url), heat in grouped.items():
+        heat_vectors.setdefault(gender_category, {})[url] = heat
+
+    # Persist under heat_vectors_data/ (a path relative to the current working directory)
+    save_path = 'heat_vectors_data'
+    os.makedirs(save_path, exist_ok=True)
+    date_str = current_date.strftime('%Y%m%d')
+
+    # Fixed file name, overwritten on each run (per-date alternative: heat_vectors_{date_str}.json)
+    vectors_file = f"{save_path}/heat_vectors.json"
+    with open(vectors_file, 'w', encoding='utf-8') as f:
+        json.dump({
+            'update_time': current_date.strftime('%Y-%m-%d %H:%M:%S'),
+            'data': heat_vectors
+        }, f, ensure_ascii=False, indent=2)
+
+    logging.info(f"成功存储热度向量，共{len(heat_vectors)}个分组，日期: {date_str}")
+    return heat_vectors
+
+
 if __name__ == "__main__":
    try:
-        update_user_matrices()
+        # update_user_matrices()
+        update_heat_matrices()
        # scheduler = BlockingScheduler()
        # scheduler.add_job(update_user_matrices, 'cron', hour=12, timezone='Asia/Shanghai')
        # logging.info("定时任务已启动，每天12:00执行")