TASK:冷启动热度推荐;
This commit is contained in:
@@ -14,6 +14,9 @@ import matplotlib.pyplot as plt
|
||||
from scipy.sparse import csr_matrix
|
||||
import matplotlib.font_manager as fm
|
||||
from scipy import sparse
|
||||
import pandas as pd
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
|
||||
from app.core.config import DB_CONFIG, TABLE_CATEGORIES, RECOMMEND_PATH_PREFIX
|
||||
|
||||
@@ -50,6 +53,13 @@ minio_client = Minio(
|
||||
# Preload the system sketch feature vectors once at import time.
# `.item()` unwraps the 0-d object array returned by np.load into the stored
# Python object (presumably a dict of feature vectors -- confirm against the
# code that writes sketch_feature_dict.npy).
# NOTE: allow_pickle=True unpickles arbitrary objects, so this file must only
# ever be loaded from a trusted location.
SYSTEM_FEATURES = np.load(
    f'{RECOMMEND_PATH_PREFIX}sketch_feature_dict.npy',
    allow_pickle=True,
).item()
|
||||
|
||||
# Heat-scoring parameters per behavior type.
# `weight` is the base score of a single event; `decay` is the per-day
# exponential decay rate applied to that score (0 means the score never fades).
BEHAVIOR_CONFIG = dict(
    portfolioClick=dict(weight=1, decay=0.3),
    portfolioLike=dict(weight=2, decay=0.2),
    secondCreation=dict(weight=3, decay=0.1),
    sketchLike=dict(weight=4, decay=0),  # never decays
)
|
||||
|
||||
# 保存sketch_to_iid到文件
|
||||
def save_sketch_to_iid():
|
||||
@@ -418,9 +428,107 @@ def cosine_similarity(vec1, vec2):
|
||||
return np.dot(vec1, vec2) / (norm + 1e-10) if norm != 0 else 0.0
|
||||
|
||||
|
||||
def fetch_user_behavior_data(days=30):
    """Fetch recent user-behavior rows from MySQL.

    Args:
        days: Size of the look-back window in days, counted back from the
            moment of the call.

    Returns:
        A DataFrame with the columns account_id, behavior_type, gender,
        category, url and create_time for every `user_behavior` row whose
        create_time lies inside the window. If connecting or querying fails,
        the error is logged and an empty DataFrame is returned instead of
        raising.
    """
    # The window bounds are bound as query parameters instead of being
    # formatted into the SQL text, so quoting/escaping is left to the driver.
    query = """
        SELECT
            account_id,
            behavior_type,
            gender,
            category,
            url,
            create_time
        FROM user_behavior
        WHERE create_time BETWEEN %s AND %s
    """

    conn = None
    try:
        conn = pymysql.connect(**DB_CONFIG)

        # Computed inside the try block so that a bad `days` value still
        # results in the logged, empty-DataFrame fallback.
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)

        df = pd.read_sql(query, conn, params=(start_date, end_date))
        logging.info(f"成功读取{len(df)}条用户行为记录")
        return df

    except Exception as e:
        # Deliberate best-effort: callers treat an empty frame as "no data".
        logging.error(f"数据库查询失败: {str(e)}")
        return pd.DataFrame()

    finally:
        if conn:
            conn.close()
|
||||
|
||||
|
||||
def calculate_heat(row, current_date):
    """Return the time-decayed heat contributed by a single behavior event.

    The value is ``weight * exp(-decay * days_passed)``, where ``weight`` and
    ``decay`` are looked up in BEHAVIOR_CONFIG by the event's behavior type.
    Every event is scored on its own; repeated events are not aggregated
    here.

    Args:
        row: Record supporting ``row['create_time']`` and
            ``row['behavior_type']`` (a DataFrame row when called through
            ``DataFrame.apply(..., axis=1)``).
        current_date: Reference datetime that the event's age is measured
            against.

    Returns:
        The heat value; 0 for behavior types missing from BEHAVIOR_CONFIG.
    """
    # Age in whole days. An event stamped even slightly after `current_date`
    # gives a negative timedelta, which normalizes to days == -1 and would
    # push the heat above the base weight, so the age is clamped at 0.
    days_passed = max((current_date - row['create_time']).days, 0)

    # Unknown behavior types fall back to weight 0 and therefore add no heat.
    config = BEHAVIOR_CONFIG.get(row['behavior_type'], {'weight': 0, 'decay': 0})

    return config['weight'] * np.exp(-config['decay'] * days_passed)
|
||||
|
||||
def load_heat_matrix_as_array(file_path):
    """Load a labelled matrix stored as JSON.

    The file must contain a JSON object with the keys 'data', 'row_labels'
    and 'col_labels'.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(data_array, row_labels, col_labels)`` where ``data_array``
        is ``np.array(saved['data'])`` and the two label values are returned
        exactly as stored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the three expected keys is missing.
    """
    # NOTE(review): update_heat_matrices in this module writes
    # {'update_time', 'data'} with a nested dict under 'data', not this
    # layout -- confirm which producer this loader is meant to read.

    # Decode explicitly as UTF-8: JSON written by this module uses
    # encoding='utf-8' with ensure_ascii=False, so the platform default
    # encoding would mis-decode non-ASCII labels on non-UTF-8 locales.
    with open(file_path, encoding='utf-8') as f:
        saved = json.load(f)

    return (
        np.array(saved['data']),
        saved['row_labels'],
        saved['col_labels'],
    )
|
||||
|
||||
def update_heat_matrices():
    """Recompute the per-group heat vectors and persist them as JSON.

    Pulls the last 30 days of behavior rows, scores each row with
    calculate_heat, sums the scores per ('<gender>_<category>', url) pair and
    writes the result to heat_vectors_data/heat_vectors.json together with an
    update timestamp. The same file is overwritten on every run.

    Returns:
        A dict mapping each '<gender>_<category>' key to a ``{url: heat}``
        dict, or None when no behavior data was available.
    """
    now = datetime.now()

    behavior_df = fetch_user_behavior_data(30)
    if behavior_df.empty:
        logging.warning("无有效数据,跳过今日计算")
        return None

    # Score every event individually, then tag it with its group key.
    behavior_df['heat'] = behavior_df.apply(
        lambda record: calculate_heat(record, now), axis=1
    )
    behavior_df['gender_category'] = (
        behavior_df['gender'] + '_' + behavior_df['category']
    )

    # Sum heat per (group, url) and fold the flat result into nested dicts.
    heat_totals = behavior_df.groupby(['gender_category', 'url'])['heat'].sum()
    heat_vectors = {}
    for (group_key, url), total in heat_totals.items():
        if group_key not in heat_vectors:
            heat_vectors[group_key] = {}
        heat_vectors[group_key][url] = total

    # Persist the vectors alongside the time they were computed.
    save_path = 'heat_vectors_data'
    os.makedirs(save_path, exist_ok=True)
    vectors_file = f"{save_path}/heat_vectors.json"
    payload = {
        'update_time': now.strftime('%Y-%m-%d %H:%M:%S'),
        'data': heat_vectors,
    }
    with open(vectors_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

    date_str = now.strftime('%Y%m%d')
    logging.info(f"成功存储热度向量,共{len(heat_vectors)}个分组,日期: {date_str}")
    return heat_vectors
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
update_user_matrices()
|
||||
# update_user_matrices()
|
||||
update_heat_matrices()
|
||||
# scheduler = BlockingScheduler()
|
||||
# scheduler.add_job(update_user_matrices, 'cron', hour=12, timezone='Asia/Shanghai')
|
||||
# logging.info("定时任务已启动,每天12:00执行")
|
||||
|
||||
Reference in New Issue
Block a user