TASK:lookbook上传

This commit is contained in:
shahaibo
2024-10-21 16:57:57 +08:00
parent fdaf6a72f3
commit 7f9caf9d0e
2 changed files with 93 additions and 39 deletions

View File

@@ -4,52 +4,31 @@ import shutil
from typing import List from typing import List
import tqdm import tqdm
from fastapi import UploadFile, File, APIRouter from fastapi import UploadFile, File, APIRouter, BackgroundTasks
from app.service.lookbooks.service import create_image_batch_requests, process_lookbook_task # 引入服务逻辑
from app.service.lookbooks.service import create_image_batch_requests
logger = logging.getLogger() logger = logging.getLogger()
router = APIRouter() router = APIRouter()
@router.post("/process_lookbooks/") @router.post("/process-lookbooks/")
async def process_lookbooks(files: List[UploadFile] = File(...)): async def process_lookbooks(
background_tasks: BackgroundTasks,
files: List[UploadFile] = File(...),
tag: str = Form(...),
year: str = Form(...)
):
lookbook_dir = "service/lookbooks/temp_lookbooks" lookbook_dir = "service/lookbooks/temp_lookbooks"
os.makedirs(lookbook_dir, exist_ok=True) os.makedirs(lookbook_dir, exist_ok=True)
lookbook_list = [] lookbook_list = []
for file in files: for file in files:
file_path = os.path.join(lookbook_dir, file.filename) file_path = os.path.join(lookbook_dir, file.filename)
with open(file_path, "wb") as f: async with aiofiles.open(file_path, "wb") as f:
shutil.copyfileobj(file.file, f) await f.write(await file.read())
lookbook_list.append(file_path) lookbook_list.append(file_path)
image_list = [] # 将任务放入后台异步执行
for look_book_path in tqdm.tqdm(lookbook_list): background_tasks.add_task(process_lookbook_task, lookbook_list, tag, year)
lookbook_name = os.path.splitext(os.path.basename(look_book_path))[0]
output_dir = os.path.join("app/service/lookbooks/fashion_documents/lookbook/images", lookbook_name)
os.makedirs(output_dir, exist_ok=True)
if not os.listdir(output_dir):
from unstructured.partition.pdf import partition_pdf
partition_pdf(
filename=look_book_path,
extract_images_in_pdf=True,
infer_table_structure=False,
chunking_strategy="by_title",
max_characters=4000,
new_after_n_chars=3800,
combine_text_under_n_chars=2000,
extract_image_block_output_dir=output_dir,
)
else:
current_images = os.listdir(output_dir)
image_list.extend([os.path.join(output_dir, x) for x in current_images])
image_description_results_file = create_image_batch_requests(image_list, "app/service/lookbooks/fashion_documents/lookbook/results") return {"message": "Lookbooks are being processed in the background."}
shutil.rmtree(lookbook_dir)
if image_description_results_file:
return {"message": "Lookbooks processed successfully", "result_file": image_description_results_file}
else:
return {"message": "No new images to process"}

View File

@@ -1,11 +1,17 @@
import json import json
import os import os
import logging
import tqdm
import aiofiles
from openai import OpenAI from openai import OpenAI
from app.service.lookbooks.utils.image_utils import base64_encode_image, generate_text_id from app.service.lookbooks.utils.image_utils import base64_encode_image, generate_text_id
from app.service.lookbooks.utils.openai_utils import wait_for_job_completion from app.service.lookbooks.utils.openai_utils import wait_for_job_completion
# 设置日志
logger = logging.getLogger()
# OpenAI 配置
OPENAI_API_KEY = "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" OPENAI_API_KEY = "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"
OPENAI_API_BASE = "https://pangkaichen-openai-prox-98.deno.dev/v1" OPENAI_API_BASE = "https://pangkaichen-openai-prox-98.deno.dev/v1"
client = OpenAI( client = OpenAI(
@@ -84,7 +90,8 @@ def create_image_batch_requests(
} }
tasks.append(task) tasks.append(task)
id2img[current_id] = image_filename id2img[current_id] = image_filename
print(f"In total {len(tasks)} images") logger.info(f"In total {len(tasks)} images to process")
if tasks: if tasks:
batch_file_name = os.path.join(output_path, "image_batch_requests.jsonl") batch_file_name = os.path.join(output_path, "image_batch_requests.jsonl")
with open(batch_file_name, 'w', encoding='utf-8') as file: with open(batch_file_name, 'w', encoding='utf-8') as file:
@@ -120,9 +127,77 @@ def create_image_batch_requests(
}) })
f.write(output + '\n') f.write(output + '\n')
except json.JSONDecodeError as error: except json.JSONDecodeError as error:
print(f"Error parsing: {error} -- at line: {line}") logger.error(f"Error parsing: {error} -- at line: {line}")
else: else:
print("Job failed") logger.error("Job failed")
return os.path.join(output_path, "image_description_results.jsonl") return os.path.join(output_path, "image_description_results.jsonl")
else: else:
return None return None
async def process_lookbook_task(lookbook_list, tag, year):
    """Background task: extract images from lookbook PDFs, request batch
    image descriptions, and persist the results to the vector database.

    Args:
        lookbook_list: filesystem paths of the uploaded lookbook PDF files.
        tag: tag metadata attached to every stored image description.
        year: year metadata attached to every stored image description.

    Raises:
        Exception: re-raised after logging so the task runner records failure.
    """
    image_list = []
    try:
        for look_book_path in tqdm.tqdm(lookbook_list):
            lookbook_name = os.path.splitext(os.path.basename(look_book_path))[0]
            output_dir = os.path.join("fashion_documents/lookbook/images", lookbook_name)
            os.makedirs(output_dir, exist_ok=True)
            if not os.listdir(output_dir):
                # Lazy import: `unstructured` is heavy and only needed when a
                # lookbook has not been extracted before.
                from unstructured.partition.pdf import partition_pdf
                partition_pdf(
                    filename=look_book_path,
                    extract_images_in_pdf=True,
                    infer_table_structure=False,
                    chunking_strategy="by_title",
                    max_characters=4000,
                    new_after_n_chars=3800,
                    combine_text_under_n_chars=2000,
                    extract_image_block_output_dir=output_dir,
                )
            # Collect the images unconditionally.  The original only collected
            # them in the "already extracted" branch, so images produced by a
            # fresh partition_pdf run were silently skipped.
            current_images = os.listdir(output_dir)
            image_list.extend(os.path.join(output_dir, x) for x in current_images)
        # 1. Build and submit the batch request for image descriptions.
        image_description_results_file = create_image_batch_requests(
            image_list, "fashion_documents/lookbook/results"
        )
        # 2. Persist the descriptions to the vector database.
        if image_description_results_file:
            save_to_vector_db(image_description_results_file, tag, year)
    except Exception:
        # logger.exception records the full traceback; bare `raise` preserves it.
        logger.exception("Error processing lookbooks")
        raise
def save_to_vector_db(image_description_results_file, tag, year):
    """Save image descriptions from a JSONL results file into the vector database.

    Each line of the file is a JSON object expected to carry at least
    "custom_id", "summary" and "url" keys (schema produced upstream by the
    batch-description step — confirm against create_image_batch_requests).
    Duplicate custom_ids are skipped; the first occurrence wins.

    Args:
        image_description_results_file: path to the JSONL results file.
        tag: tag metadata stored with every description.
        year: year metadata stored with every description.

    Raises:
        Exception: re-raised after logging on any parse or DB failure.
    """
    seen_ids = set()        # O(1) duplicate check only
    image_ids = []          # ordered: must stay aligned with summaries/metadatas
    image_summaries = []
    image_metadatas = []
    try:
        with open(image_description_results_file, "r", encoding="utf-8") as f:
            for line in f:
                image_content = json.loads(line)
                custom_id = image_content["custom_id"]
                if custom_id in seen_ids:
                    continue
                seen_ids.add(custom_id)
                image_ids.append(custom_id)
                image_summaries.append(image_content["summary"])
                image_metadatas.append({
                    "data_type": "image",
                    "url": image_content["url"].replace("\\", "/"),
                    "source": "mitu",
                    "tag": tag,
                    "year": year,
                    "gender": "female",
                })
        # Bug fix: the original passed `list(image_ids)` built from a set, whose
        # iteration order is not guaranteed to match the append order of
        # summaries/metadatas — ids could be attached to the wrong texts.
        # NOTE(review): `collection` is a module-level object defined elsewhere
        # in this project; assumed to expose add_texts() — confirm.
        collection.add_texts(texts=image_summaries, metadatas=image_metadatas, ids=image_ids)
        logger.info("Successfully saved data to vector database")
    except Exception:
        # logger.exception records the traceback; bare `raise` preserves it.
        logger.exception("Error saving to vector database")
        raise