# File: lc_stylist_agent/app/test/chromadb/embedding_query.py
# (header reconstructed — the original paste carried file-viewer UI chrome:
#  "Files", line/size counts, "Raw Permalink Normal View History")
import random
from functools import lru_cache
from typing import Any, Dict, List, Set, Union

import chromadb
import torch
from PIL import Image
from torch import no_grad
from transformers import CLIPModel, CLIPProcessor

from app.server.utils.minio_client import minio_client, oss_get_image
from app.server.utils.minio_config import MINIO_LC_DATA_PATH
# --- 你的配置 ---
DB_PATH = "/workspace/lc_stylist_agent/db"
COLLECTION_NAME = 'lc_clothing_embedding'
# 设置一个足够大的限制来获取所有记录,或者使用分页(如果记录数非常庞大)
MAX_LIMIT = 1000000
client = chromadb.PersistentClient(path=DB_PATH)
try:
collection = client.get_collection(name=COLLECTION_NAME)
print(f"✅ 连接到 Collection: {COLLECTION_NAME}")
except ValueError:
print(f"⚠️ Collection '{COLLECTION_NAME}' 不存在。")
# 如果 collection 不存在,我们将跳过后续操作
collection = None
from transformers import CLIPModel, CLIPProcessor
def get_clip_embedding(data: str | Image.Image) -> List[float]:
"""生成图像或文本的 CLIP 嵌入,并进行 L2 归一化。"""
embedding_model_name = "openai/clip-vit-base-patch32"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(embedding_model_name).to(device)
processor = CLIPProcessor.from_pretrained(embedding_model_name)
# 强制截断,解决序列长度问题
inputs = processor(
text=[data],
return_tensors="pt",
padding=True,
truncation=True
).to(device)
with no_grad():
features = model.get_text_features(**inputs)
# L2 归一化
features = features / features.norm(p=2, dim=-1, keepdim=True)
return features.cpu().numpy().flatten().tolist()
def query_local_db(embedding: List[float], category: str, n_results: int = 3) -> List[Dict[str, Any]]:
"""
基于嵌入向量在本地数据库中查询相似单品
实际应执行 ChromaDB 查询,并根据 category 进行过滤(metadatas)
"""
# 实际应执行向量查询
# 为了演示流程,返回一个模拟结果
results = collection.query(
query_embeddings=[embedding],
n_results=n_results,
where={
"$and": [
{"category": category},
{"modality": "image"},
]
},
include=['documents', 'metadatas', 'distances']
)
return results
if __name__ == '__main__':
embedding = get_clip_embedding("watch")
print(embedding)
result = query_local_db(embedding, "Watches", 20)
print(result)
ids = result['ids'][0]
random_single_id = random.choices(ids, k=2)
print(random_single_id)
# for id in ids:
# path = id.replace("_img", ".jpg")
# img = oss_get_image(oss_client=minio_client, path=f"{MINIO_LC_DATA_PATH}/{path}", data_type="PIL").convert('RGB')
# img.save(path)