具有ID信息的文本图像对数据集制作-MuQYY的博客

1. 图像下载（Image Downloading）:

首先，列出了一个名人名单，这些名单可以从VoxCeleb和VGGFace等公开的名人面部数据集中获取。
根据名单，使用搜索引擎爬取数据，大约为每个名字下载100张图片。
在下载过程中，按分辨率过滤图像：仅保留最短边大于512像素的图像，以获得高质量的肖像图像。

2. 人脸检测与过滤（Face Detection & Filtering）:

使用RetinaNet进行人脸检测，过滤掉尺寸过小（小于256×256像素）的检测框。
如果图像中没有符合要求的检测框，则该图像将被过滤掉。

3. 身份验证（ID Verification）:

由于一张图像可能包含多张脸，需要确定其中哪一张脸属于当前身份组。
使用ArcFace提取所有检测框中的人脸区域的身份嵌入，并计算每对人脸之间的L2相似度。
将每个人脸嵌入与其他所有人脸嵌入的相似度求和，作为对应边界框的总分数；对于包含多张人脸的图像，选择总分数最高的边界框。
完成边界框选择后，重新计算每个保留边界框的总分数，并按身份组计算总分数的标准差δ，以δ为基准设定阈值，过滤掉ID不一致的图像。

4. 裁剪与分割（Cropping & Segmentation）:

根据检测到的人脸区域，使用更大的正方形框裁剪图像，确保裁剪后的脸部区域占图像的10%以上。
使用Mask2Former进行全景分割（panoptic segmentation），提取“人”类别的掩码，并保留与对应ID的人脸检测框重叠度最高的掩码。
如果没有检测到掩码或掩码区域与边界框没有重叠，则丢弃该图像。

import os
from PIL import Image
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
import torch
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.patches as mpatches

# Root directory of the source dataset. A raw string keeps the Windows-style
# backslashes literal, so "\V" and "\d" are no longer parsed as (invalid)
# escape sequences; the resulting path value is unchanged.
root_dir = r'Dataset\VGG-Face2\data\vggface2_train'
# Directory that receives the masked images.
output_dir = 'processed_images'
os.makedirs(output_dir, exist_ok=True)  # create the output directory if it is missing

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")

# Load the image processor and the Mask2Former model (make sure the checkpoint
# has been downloaded, or point these calls at a valid pretrained-model path),
# then move the model onto the selected device.
processor = AutoImageProcessor.from_pretrained(
    "facebook/mask2former-swin-base-coco-panoptic"
)
model = Mask2FormerForUniversalSegmentation.from_pretrained(
    "facebook/mask2former-swin-base-coco-panoptic"
)
model = model.to(device)

# Walk the dataset tree and, for every JPEG that contains exactly one "person"
# segment, save a copy in which everything outside that segment is black.
for subdir, _, files in os.walk(root_dir):
    for file in files:
        # Only JPEG files are processed. The leading dot keeps names that
        # merely end in the letters "jpg"/"jpeg" from being picked up.
        if not file.lower().endswith(('.jpg', '.jpeg')):
            continue

        file_path = os.path.join(subdir, file)
        # Release the file handle as soon as the pixels are decoded; convert()
        # returns a separate copy that stays usable after the file is closed.
        with Image.open(file_path) as opened_image:
            original_image = opened_image.convert("RGB")

        # Prepare the model inputs on the selected device.
        inputs = processor(images=original_image, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # PIL reports (width, height); the reversed tuple is passed as the
        # target size so the segmentation map matches the original image.
        results = processor.post_process_panoptic_segmentation(
            outputs, target_sizes=[original_image.size[::-1]]
        )
        segments_info = results[0]["segments_info"]
        segmentation = results[0]["segmentation"].to('cpu')

        # Ids of all segments whose class label is "person".
        person_segment_ids = [
            segment['id']
            for segment in segments_info
            if model.config.id2label[segment['label_id']] == 'person'
        ]

        # Keep only images with exactly one person: images without any person
        # segment and images with several people are both skipped.
        if len(person_segment_ids) != 1:
            continue
        person_mask = segmentation == person_segment_ids[0]

        # Paste the original pixels onto a black canvas through the mask.
        person_image = Image.new("RGB", original_image.size)
        person_mask_pil = Image.fromarray((person_mask.squeeze().numpy() * 255).astype('uint8'), mode='L')
        person_image.paste(original_image, mask=person_mask_pil)

        # Mirror the input directory layout below the output directory.
        relative_path = os.path.relpath(subdir, root_dir)
        save_dir_path = os.path.join(output_dir, relative_path)
        os.makedirs(save_dir_path, exist_ok=True)
        save_file_path = os.path.join(save_dir_path, file)

        person_image.save(save_file_path)

        print(f"Processed and saved: {save_file_path}")

5. 字幕生成与标记（Captioning & Marking）:

使用BLIP2为每个裁剪后的图像生成字幕。
为了标记输入字幕中的类词（例如“男人”、“女人”、“男孩”），对于不包含任何类词的字幕会重新生成，直到字幕中出现类词为止。
对于包含多个类词的字幕，根据每个身份组的类词出现次数来确定当前身份组的类词，并用该类词标记该身份组中的每个字幕。

import os
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import re
from tqdm import tqdm  # 导入tqdm

# Pick the compute device: CUDA when present, CPU otherwise.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")

# Load the BLIP-2 processor and the captioning model (weights in float16),
# then move the model onto the selected device.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
)
model = model.to(device)

# Root directory that is scanned for images.
root_path = 'Dataset'

# Directory that receives the generated caption files.
output_path = 'descriptions2'
os.makedirs(output_path, exist_ok=True)

# Class words that get the "img" marker appended in the captions.
keywords = [
    'woman',
    'man',
    'child',
    'boy',
    'girl',
]

def add_img_to_keywords(text, keyword):
    """Append the marker ``img`` after each whole-word occurrence of *keyword*.

    A match must be a complete word (case-insensitive) that is followed by
    whitespace, by one of ``. , ; ! ?``, or by the end of the text. The end
    of the text is accepted so that a caption whose last word is the keyword
    (e.g. ``"a photo of a man"``) is marked as well.

    Args:
        text: Caption to process.
        keyword: Class word to mark; it is matched literally.

    Returns:
        The caption with every match replaced by ``"<keyword> img"`` (the
        keyword is written as given, regardless of the matched casing), or
        the unchanged caption when nothing matches.
    """
    pattern = rf'\b{re.escape(keyword)}\b(?=\s|[.,;!?]|$)'
    # A callable replacement inserts the keyword verbatim instead of having it
    # interpreted as a replacement template.
    return re.sub(pattern, lambda _match: f'{keyword} img', text, flags=re.IGNORECASE)

# Number of images captioned per forward pass; tune to the available GPU memory.
batch_size = 128

# Name of the sub-folder at which processing starts.
target_subdir = 'n000002'

# Becomes True once target_subdir has been reached; every directory visited
# from then on is processed.
flag = False

# Walk every sub-directory below the root.
for subdir, dirs, files in os.walk(root_path):
    # os.walk yields sub-directories in an arbitrary, filesystem-dependent
    # order; sorting in place makes "start from target_subdir" deterministic.
    dirs.sort()

    if not (flag or os.path.basename(subdir) == target_subdir):
        continue
    flag = True

    image_files = [
        os.path.join(subdir, f)
        for f in files
        if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))
    ]

    # Captions of one source folder go into a folder of the same name.
    subdir_name = os.path.basename(subdir)
    caption_dir = os.path.join(output_path, subdir_name)

    # Process the images of this directory in batches.
    for i in range(0, len(image_files), batch_size):
        batch_images = image_files[i:i+batch_size]

        # Close each file handle right after decoding; convert() returns a
        # separate copy that stays usable after the file is closed.
        images = []
        for image_path in batch_images:
            with Image.open(image_path) as opened_image:
                images.append(opened_image.convert('RGB'))

        # Release cached GPU memory that is no longer in use.
        torch.cuda.empty_cache()

        # Generate captions for the whole batch without tracking gradients.
        with torch.no_grad():
            inputs = processor(images=images, return_tensors="pt", padding=True).to(device, torch.float16)
            generated_ids = model.generate(**inputs)

        # Decode the generated token ids into text.
        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        # Mark every keyword in every caption. The result is kept for all
        # captions; whether anything changed is checked when saving.
        modified_texts = []
        for text in generated_texts:
            modified_text = text
            for keyword in keywords:
                modified_text = add_img_to_keywords(modified_text, keyword)
            modified_texts.append(modified_text)

        os.makedirs(caption_dir, exist_ok=True)

        # Save only the captions that were actually modified and remember
        # their image names for the per-folder caption list.
        captioned_names = []
        for img_file, text, modified_text in zip(batch_images, generated_texts, modified_texts):
            file_name = os.path.basename(img_file)
            save_path = os.path.join(caption_dir, f"{os.path.splitext(file_name)[0]}.txt")

            if modified_text != text:
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(modified_text)
                print(f"Modified description for {file_name} saved to {save_path}")
                captioned_names.append(file_name)
            else:
                print(f"No modification needed for {file_name}")

        # Append the names in one write; the list file is only touched when
        # this batch produced at least one caption.
        if captioned_names:
            with open(os.path.join(caption_dir, 'caption_list.txt'), 'a', encoding='utf-8') as f:
                f.writelines(name + '\n' for name in captioned_names)

print("Descriptions generation complete.")

通过这个自动化管道，可以构建一个包含大量ID的数据集，其中每个ID都有多张表情、属性、场景等各不相同的图片。最终，这个数据集包含大约112K张图像，归类在大约13,000个ID名称下，每张图像都有对应的掩码和注释字幕。