TASK:lookbook上传

This commit is contained in:
shahaibo
2024-10-21 16:57:57 +08:00
parent fdaf6a72f3
commit 7f9caf9d0e
2 changed files with 93 additions and 39 deletions

View File

@@ -1,11 +1,17 @@
import json
import os
import logging
import tqdm
import aiofiles
from openai import OpenAI
from app.service.lookbooks.utils.image_utils import base64_encode_image, generate_text_id
from app.service.lookbooks.utils.openai_utils import wait_for_job_completion
# Logging setup
logger = logging.getLogger()
# OpenAI configuration
# SECURITY: an API key is committed in this file. The environment variables
# below take precedence so deployments can supply their own credentials; the
# literal values are kept only as backward-compatible fallbacks. The committed
# key should be rotated and removed from source control.
OPENAI_API_KEY = os.environ.get(
    "OPENAI_API_KEY",
    "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",
)
OPENAI_API_BASE = os.environ.get(
    "OPENAI_API_BASE",
    "https://pangkaichen-openai-prox-98.deno.dev/v1",
)
client = OpenAI(
@@ -84,7 +90,8 @@ def create_image_batch_requests(
}
tasks.append(task)
id2img[current_id] = image_filename
print(f"In total {len(tasks)} images")
logger.info(f"In total {len(tasks)} images to process")
if tasks:
batch_file_name = os.path.join(output_path, "image_batch_requests.jsonl")
with open(batch_file_name, 'w', encoding='utf-8') as file:
@@ -120,9 +127,77 @@ def create_image_batch_requests(
})
f.write(output + '\n')
except json.JSONDecodeError as error:
print(f"Error parsing: {error} -- at line: {line}")
logger.error(f"Error parsing: {error} -- at line: {line}")
else:
print("Job failed")
logger.error("Job failed")
return os.path.join(output_path, "image_description_results.jsonl")
else:
return None
async def process_lookbook_task(lookbook_list, tag, year):
    """Process lookbook PDFs and save their image descriptions to the vector database.

    For each lookbook path, a per-lookbook directory is created under
    ``fashion_documents/lookbook/images``. If that directory is empty,
    ``partition_pdf`` is asked to write the PDF's extracted image blocks
    into it. Every file found in the directory afterwards is collected,
    the collected paths are passed to ``create_image_batch_requests``, and
    when that call returns a results file it is handed to
    ``save_to_vector_db``.

    NOTE(review): the function is declared ``async`` but contains no
    ``await``; all work below runs synchronously once it is scheduled.

    Args:
        lookbook_list: Iterable of lookbook PDF file paths.
        tag: Tag forwarded to ``save_to_vector_db``.
        year: Year forwarded to ``save_to_vector_db``.

    Raises:
        Exception: Any error raised while processing is logged and re-raised.
    """
    images_root = "fashion_documents/lookbook/images"
    results_dir = "fashion_documents/lookbook/results"
    image_list = []
    try:
        for look_book_path in tqdm.tqdm(lookbook_list):
            lookbook_name = os.path.splitext(os.path.basename(look_book_path))[0]
            output_dir = os.path.join(images_root, lookbook_name)
            os.makedirs(output_dir, exist_ok=True)

            # Only extract when nothing has been extracted for this lookbook yet.
            if not os.listdir(output_dir):
                # Imported here so the dependency is only loaded when a PDF
                # actually needs extracting.
                from unstructured.partition.pdf import partition_pdf

                partition_pdf(
                    filename=look_book_path,
                    extract_images_in_pdf=True,
                    infer_table_structure=False,
                    chunking_strategy="by_title",
                    max_characters=4000,
                    new_after_n_chars=3800,
                    combine_text_under_n_chars=2000,
                    extract_image_block_output_dir=output_dir,
                )

            # Collect the images whether they were just extracted or were
            # already present. Previously freshly extracted images were never
            # collected, so a first-time upload produced nothing to save.
            image_list.extend(
                os.path.join(output_dir, image_name)
                for image_name in os.listdir(output_dir)
            )

        # 1. Describe the images through a batch request.
        image_description_results_file = create_image_batch_requests(
            image_list, results_dir
        )

        # 2. Save the results to the vector database.
        if image_description_results_file:
            save_to_vector_db(image_description_results_file, tag, year)
    except Exception as e:
        # logger.exception keeps the traceback in the log; the bare raise
        # re-raises the original exception unchanged.
        logger.exception("Error processing lookbooks: %s", e)
        raise
def save_to_vector_db(image_description_results_file, tag, year):
    """Save image descriptions from a JSONL results file to the vector database.

    Each non-blank line of the file is parsed as a JSON object and must
    provide the keys ``custom_id``, ``summary`` and ``url``. A record whose
    ``custom_id`` has already been seen in the file is skipped. The ids,
    summaries and metadata are kept as parallel lists in file order, so each
    id is stored with its own summary and metadata.

    Args:
        image_description_results_file: Path to the JSONL results file.
        tag: Value stored under ``"tag"`` in every record's metadata.
        year: Value stored under ``"year"`` in every record's metadata.

    Raises:
        Exception: Any error while reading the file or adding to the
            collection is logged and re-raised.
    """
    seen_ids = set()  # O(1) duplicate detection
    image_ids = []  # ordered, parallel to image_summaries / image_metadatas
    image_summaries = []
    image_metadatas = []
    try:
        with open(image_description_results_file, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                image_content = json.loads(line)
                custom_id = image_content["custom_id"]
                # Make sure ids are not duplicated.
                if custom_id in seen_ids:
                    continue
                seen_ids.add(custom_id)
                image_ids.append(custom_id)
                image_summaries.append(image_content["summary"])
                image_metadatas.append({
                    "data_type": "image",
                    # Normalise Windows-style separators to forward slashes.
                    "url": image_content["url"].replace("\\", "/"),
                    "source": "mitu",
                    "tag": tag,
                    "year": year,
                    "gender": "female",
                })

        if not image_ids:
            logger.warning("No image descriptions found; nothing saved to vector database")
            return

        # Add the descriptions and metadata to the vector database. The ids
        # come from the ordered list, not the set: set iteration order is
        # arbitrary and would pair ids with the wrong summaries.
        collection.add_texts(
            texts=image_summaries,
            metadatas=image_metadatas,
            ids=image_ids,
        )
        logger.info("Successfully saved data to vector database")
    except Exception as e:
        logger.exception("Error saving to vector database: %s", e)
        raise