6.1 视觉模型
学习时长:3-4 周
视觉模型是多模态 AI 应用的核心组件,使 AI 能够"看懂"图像和视频。本节覆盖从图像分类到视觉语言模型(VLM)的完整技术栈。
6.1.1 计算机视觉基础
核心任务分类
| 任务类型 | 描述 | 输入 | 输出 | 典型应用 |
|---|---|---|---|---|
| 图像分类 | 判断图像属于哪个类别 | 图像 | 类别标签 | 商品识别、医疗诊断 |
| 目标检测 | 定位并识别图像中的物体 | 图像 | 边界框 + 类别 | 自动驾驶、安防监控 |
| 语义分割 | 像素级分类 | 图像 | 分割掩码 | 医学影像、卫星图像 |
| 实例分割 | 区分同类别的不同实例 | 图像 | 实例掩码 | 机器人抓取、人群计数 |
| 图像生成 | 从文本/噪声生成图像 | 文本/噪声 | 图像 | DALL-E、Midjourney |
| 图像理解 | 描述图像内容 | 图像 | 文本描述 | 视觉问答、图像字幕 |
视觉模型发展历程
2012: AlexNet(深度学习视觉元年)
↓
2014: VGG、GoogLeNet(更深的网络)
↓
2015: ResNet(残差连接,突破深度限制)
↓
2017: Transformer 提出(为 ViT 奠基)
↓
2020: Vision Transformer(ViT)(纯 Attention 架构)
↓
2021: CLIP(视觉-语言对齐)
↓
2022: Stable Diffusion(文生图爆发)
↓
2023: GPT-4V、LLaVA(视觉语言模型)
↓
2024: Qwen-VL、InternVL(开源 VLM 崛起)6.1.2 图像分类与特征提取
1. 使用预训练模型进行图像分类
方法 1:使用 Transformers 库(推荐)
python
# pip install transformers pillow torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch
class ImageClassifier:
"""图像分类器"""
def __init__(self, model_name: str = "microsoft/resnet-50"):
"""
Args:
model_name: 预训练模型名称
- microsoft/resnet-50: ResNet-50(ImageNet)
- google/vit-base-patch16-224: Vision Transformer
- facebook/convnext-base-224: ConvNeXt
"""
self.processor = AutoImageProcessor.from_pretrained(model_name)
self.model = AutoModelForImageClassification.from_pretrained(model_name)
self.model.eval()
def classify(self, image_path: str, top_k: int = 5) -> list[dict]:
"""
分类图像
Args:
image_path: 图像路径
top_k: 返回前 k 个预测结果
Returns:
[{"label": "猫", "score": 0.95}, ...]
"""
# 1. 加载图像
image = Image.open(image_path).convert("RGB")
# 2. 预处理
inputs = self.processor(images=image, return_tensors="pt")
# 3. 推理
with torch.no_grad():
outputs = self.model(**inputs)
logits = outputs.logits
# 4. 解析结果
probs = torch.nn.functional.softmax(logits, dim=-1)[0]
top_probs, top_indices = torch.topk(probs, top_k)
results = []
for prob, idx in zip(top_probs, top_indices):
label = self.model.config.id2label[idx.item()]
results.append({
"label": label,
"score": prob.item()
})
return results
# 使用示例
classifier = ImageClassifier(model_name="microsoft/resnet-50")
# 分类图像
results = classifier.classify("cat.jpg", top_k=3)
print("🖼️ 图像分类结果:")
for i, result in enumerate(results, 1):
print(f"{i}. {result['label']}: {result['score']:.2%}")输出示例
🖼️ 图像分类结果:
1. tabby cat: 68.50%
2. Egyptian cat: 18.30%
3. tiger cat: 9.20%方法 2:使用 timm 库(更多模型选择)
python
# pip install timm
import timm
from PIL import Image
import torch
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
class TimmClassifier:
"""基于 timm 的图像分类器"""
def __init__(self, model_name: str = "resnet50.a1_in1k"):
"""
Args:
model_name: timm 模型名称
- resnet50.a1_in1k: ResNet-50
- vit_base_patch16_224.augreg_in21k_ft_in1k: ViT-Base
- convnext_base.fb_in22k_ft_in1k: ConvNeXt-Base
- efficientnet_b0.ra_in1k: EfficientNet-B0
"""
self.model = timm.create_model(model_name, pretrained=True)
self.model.eval()
# 获取预处理配置
config = resolve_data_config({}, model=self.model)
self.transform = create_transform(**config)
def classify(self, image_path: str, top_k: int = 5) -> list[dict]:
"""分类图像"""
# 加载并预处理
image = Image.open(image_path).convert("RGB")
input_tensor = self.transform(image).unsqueeze(0)
# 推理
with torch.no_grad():
output = self.model(input_tensor)
probs = torch.nn.functional.softmax(output, dim=1)[0]
# 解析结果
top_probs, top_indices = torch.topk(probs, top_k)
# 获取 ImageNet 类别名称
labels = timm.data.ImageNetInfo().index_to_label_name
results = []
for prob, idx in zip(top_probs, top_indices):
label = labels[idx.item()]
results.append({
"label": label,
"score": prob.item()
})
return results
# 使用示例
classifier = TimmClassifier(model_name="vit_base_patch16_224.augreg_in21k_ft_in1k")
results = classifier.classify("dog.jpg")
for result in results:
print(f"• {result['label']}: {result['score']:.2%}")2. 特征提取(用于相似度搜索)
python
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import numpy as np
class ImageFeatureExtractor:
"""图像特征提取器"""
def __init__(self, model_name: str = "facebook/dinov2-base"):
"""
Args:
model_name: 特征提取模型
- facebook/dinov2-base: DINOv2(推荐)
- openai/clip-vit-base-patch32: CLIP
- google/vit-base-patch16-224: ViT
"""
self.processor = AutoImageProcessor.from_pretrained(model_name)
self.model = AutoModel.from_pretrained(model_name)
self.model.eval()
def extract_features(self, image_path: str) -> np.ndarray:
"""
提取图像特征向量
Returns:
特征向量(768 维或其他维度)
"""
# 加载图像
image = Image.open(image_path).convert("RGB")
# 预处理
inputs = self.processor(images=image, return_tensors="pt")
# 提取特征
with torch.no_grad():
outputs = self.model(**inputs)
# 使用 [CLS] token 的输出作为图像特征
features = outputs.last_hidden_state[:, 0, :]
# 归一化(用于余弦相似度计算)
features = F.normalize(features, p=2, dim=1)
return features.cpu().numpy()[0]
def compute_similarity(
self,
image1_path: str,
image2_path: str
) -> float:
"""
计算两张图像的相似度
Returns:
相似度分数(0-1,越高越相似)
"""
feat1 = self.extract_features(image1_path)
feat2 = self.extract_features(image2_path)
# 余弦相似度
similarity = np.dot(feat1, feat2)
return float(similarity)
# 使用示例
extractor = ImageFeatureExtractor()
# 提取特征
features = extractor.extract_features("image1.jpg")
print(f"特征维度: {features.shape}") # (768,)
# 计算相似度
similarity = extractor.compute_similarity("cat1.jpg", "cat2.jpg")
print(f"相似度: {similarity:.4f}") # 0.85233. 以图搜图系统
python
# pip install faiss-cpu
import faiss
import numpy as np
from pathlib import Path
from typing import List, Tuple
import pickle
class ImageSearchEngine:
"""以图搜图引擎"""
def __init__(self, feature_extractor: ImageFeatureExtractor):
self.extractor = feature_extractor
self.index = None
self.image_paths = []
def build_index(self, image_dir: str):
"""
构建图像索引
Args:
image_dir: 图像目录路径
"""
print(f"🔨 构建索引:{image_dir}")
# 1. 收集所有图像
image_dir = Path(image_dir)
image_files = list(image_dir.glob("*.jpg")) + \
list(image_dir.glob("*.png")) + \
list(image_dir.glob("*.jpeg"))
print(f"📁 找到 {len(image_files)} 张图像")
# 2. 提取特征
features_list = []
for i, img_path in enumerate(image_files, 1):
if i % 100 == 0:
print(f" 处理进度: {i}/{len(image_files)}")
try:
features = self.extractor.extract_features(str(img_path))
features_list.append(features)
self.image_paths.append(str(img_path))
except Exception as e:
print(f" ⚠️ 跳过 {img_path.name}: {e}")
# 3. 构建 FAISS 索引
features_matrix = np.array(features_list).astype('float32')
dimension = features_matrix.shape[1]
# 使用内积索引(因为特征已归一化,内积等于余弦相似度)
self.index = faiss.IndexFlatIP(dimension)
self.index.add(features_matrix)
print(f"✅ 索引构建完成:{len(self.image_paths)} 张图像")
def search(
self,
query_image: str,
top_k: int = 5
) -> List[Tuple[str, float]]:
"""
搜索相似图像
Args:
query_image: 查询图像路径
top_k: 返回前 k 个结果
Returns:
[(图像路径, 相似度分数), ...]
"""
if self.index is None:
raise ValueError("请先调用 build_index() 构建索引")
# 提取查询图像特征
query_features = self.extractor.extract_features(query_image)
query_features = query_features.reshape(1, -1).astype('float32')
# 搜索
distances, indices = self.index.search(query_features, top_k)
# 整理结果
results = []
for dist, idx in zip(distances[0], indices[0]):
results.append((self.image_paths[idx], float(dist)))
return results
def save_index(self, save_path: str):
"""保存索引"""
faiss.write_index(self.index, f"{save_path}.index")
with open(f"{save_path}.paths", "wb") as f:
pickle.dump(self.image_paths, f)
print(f"💾 索引已保存:{save_path}")
def load_index(self, load_path: str):
"""加载索引"""
self.index = faiss.read_index(f"{load_path}.index")
with open(f"{load_path}.paths", "rb") as f:
self.image_paths = pickle.load(f)
print(f"📂 索引已加载:{len(self.image_paths)} 张图像")
# 使用示例
extractor = ImageFeatureExtractor()
search_engine = ImageSearchEngine(extractor)
# 构建索引
search_engine.build_index("./image_dataset")
# 保存索引
search_engine.save_index("./image_index")
# 搜索相似图像
results = search_engine.search("query.jpg", top_k=5)
print("\n🔍 搜索结果:")
for i, (path, score) in enumerate(results, 1):
print(f"{i}. {Path(path).name}: {score:.4f}")输出示例
🔨 构建索引:./image_dataset
📁 找到 1000 张图像
处理进度: 100/1000
处理进度: 200/1000
...
✅ 索引构建完成:1000 张图像
💾 索引已保存:./image_index
🔍 搜索结果:
1. cat_001.jpg: 0.9823
2. cat_045.jpg: 0.9156
3. cat_089.jpg: 0.8934
4. cat_123.jpg: 0.8712
5. kitten_007.jpg: 0.85016.1.3 目标检测
1. 使用 YOLO 进行目标检测
python
# pip install ultralytics
from ultralytics import YOLO
from PIL import Image
import cv2
import numpy as np
class ObjectDetector:
"""目标检测器(基于 YOLO)"""
def __init__(self, model_size: str = "yolov8n"):
"""
Args:
model_size: 模型大小
- yolov8n: Nano(最快)
- yolov8s: Small
- yolov8m: Medium
- yolov8l: Large
- yolov8x: XLarge(最准)
"""
self.model = YOLO(f"{model_size}.pt")
def detect(
self,
image_path: str,
conf_threshold: float = 0.25,
save_path: str = None
) -> list[dict]:
"""
检测图像中的物体
Args:
image_path: 图像路径
conf_threshold: 置信度阈值
save_path: 保存标注图像的路径(可选)
Returns:
[{"class": "person", "confidence": 0.95, "bbox": [x1, y1, x2, y2]}, ...]
"""
# 推理
results = self.model(image_path, conf=conf_threshold)
# 解析结果
detections = []
for result in results:
boxes = result.boxes
for box in boxes:
detection = {
"class": result.names[int(box.cls)],
"confidence": float(box.conf),
"bbox": box.xyxy[0].tolist() # [x1, y1, x2, y2]
}
detections.append(detection)
# 保存标注图像
if save_path:
annotated = results[0].plot()
cv2.imwrite(save_path, annotated)
print(f"💾 标注图像已保存:{save_path}")
return detections
def detect_video(
self,
video_path: str,
output_path: str,
conf_threshold: float = 0.25
):
"""
检测视频中的物体
Args:
video_path: 视频路径
output_path: 输出视频路径
conf_threshold: 置信度阈值
"""
# 打开视频
cap = cv2.VideoCapture(video_path)
# 获取视频信息
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# 创建输出视频
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
print(f"🎥 处理视频:{total_frames} 帧")
frame_count = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# 检测
results = self.model(frame, conf=conf_threshold, verbose=False)
annotated = results[0].plot()
# 写入输出
out.write(annotated)
frame_count += 1
if frame_count % 30 == 0:
print(f" 进度: {frame_count}/{total_frames}")
cap.release()
out.release()
print(f"✅ 视频处理完成:{output_path}")
# 使用示例
detector = ObjectDetector(model_size="yolov8n")
# 检测图像
detections = detector.detect("street.jpg", save_path="street_annotated.jpg")
print("🎯 检测结果:")
for det in detections:
print(f"• {det['class']}: {det['confidence']:.2%} at {det['bbox']}")
# 检测视频
detector.detect_video("traffic.mp4", "traffic_annotated.mp4")输出示例
🎯 检测结果:
• person: 95.30% at [120.5, 200.3, 250.8, 450.2]
• car: 89.20% at [300.1, 250.5, 500.3, 400.7]
• bicycle: 78.50% at [50.2, 300.1, 150.6, 420.3]
💾 标注图像已保存:street_annotated.jpg2. 使用 Transformers 进行目标检测
python
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw, ImageFont
import torch
class TransformersDetector:
"""基于 Transformers 的目标检测器"""
def __init__(self, model_name: str = "facebook/detr-resnet-50"):
"""
Args:
model_name: 模型名称
- facebook/detr-resnet-50: DETR
- microsoft/conditional-detr-resnet-50: Conditional DETR
"""
self.processor = DetrImageProcessor.from_pretrained(model_name)
self.model = DetrForObjectDetection.from_pretrained(model_name)
self.model.eval()
def detect(
self,
image_path: str,
threshold: float = 0.9
) -> list[dict]:
"""检测物体"""
# 加载图像
image = Image.open(image_path).convert("RGB")
# 预处理
inputs = self.processor(images=image, return_tensors="pt")
# 推理
with torch.no_grad():
outputs = self.model(**inputs)
# 后处理
target_sizes = torch.tensor([image.size[::-1]])
results = self.processor.post_process_object_detection(
outputs,
target_sizes=target_sizes,
threshold=threshold
)[0]
# 解析结果
detections = []
for score, label, box in zip(
results["scores"],
results["labels"],
results["boxes"]
):
detections.append({
"class": self.model.config.id2label[label.item()],
"confidence": score.item(),
"bbox": box.tolist()
})
return detections
def visualize(
self,
image_path: str,
detections: list[dict],
save_path: str
):
"""可视化检测结果"""
image = Image.open(image_path).convert("RGB")
draw = ImageDraw.Draw(image)
for det in detections:
bbox = det["bbox"]
label = f"{det['class']}: {det['confidence']:.2f}"
# 绘制边界框
draw.rectangle(bbox, outline="red", width=3)
# 绘制标签
draw.text((bbox[0], bbox[1] - 10), label, fill="red")
image.save(save_path)
print(f"💾 可视化结果已保存:{save_path}")
# 使用示例
detector = TransformersDetector()
detections = detector.detect("image.jpg", threshold=0.9)
print("🎯 检测到的物体:")
for det in detections:
print(f"• {det['class']}: {det['confidence']:.2%}")
detector.visualize("image.jpg", detections, "result.jpg")6.1.4 视觉语言模型(VLM)
1. 使用 CLIP 进行零样本分类
python
# pip install transformers pillow torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
class CLIPClassifier:
"""CLIP 零样本图像分类器"""
def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
"""
Args:
model_name: CLIP 模型
- openai/clip-vit-base-patch32: CLIP ViT-B/32
- openai/clip-vit-large-patch14: CLIP ViT-L/14
"""
self.processor = CLIPProcessor.from_pretrained(model_name)
self.model = CLIPModel.from_pretrained(model_name)
self.model.eval()
def classify(
self,
image_path: str,
candidate_labels: list[str]
) -> list[dict]:
"""
零样本分类
Args:
image_path: 图像路径
candidate_labels: 候选类别列表
Returns:
[{"label": "猫", "score": 0.95}, ...]
"""
# 加载图像
image = Image.open(image_path).convert("RGB")
# 构建文本提示
texts = [f"a photo of a {label}" for label in candidate_labels]
# 预处理
inputs = self.processor(
text=texts,
images=image,
return_tensors="pt",
padding=True
)
# 推理
with torch.no_grad():
outputs = self.model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)[0]
# 解析结果
results = []
for label, prob in zip(candidate_labels, probs):
results.append({
"label": label,
"score": prob.item()
})
# 按分数排序
results.sort(key=lambda x: x["score"], reverse=True)
return results
def search_text_in_images(
self,
image_paths: list[str],
text_query: str
) -> list[tuple[str, float]]:
"""
文本搜索图像
Args:
image_paths: 图像路径列表
text_query: 文本查询
Returns:
[(图像路径, 相似度分数), ...]
"""
# 加载所有图像
images = [Image.open(path).convert("RGB") for path in image_paths]
# 预处理
inputs = self.processor(
text=[text_query],
images=images,
return_tensors="pt",
padding=True
)
# 推理
with torch.no_grad():
outputs = self.model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=0)[:, 0]
# 整理结果
results = list(zip(image_paths, probs.tolist()))
results.sort(key=lambda x: x[1], reverse=True)
return results
# 使用示例
clip = CLIPClassifier()
# 零样本分类
labels = ["cat", "dog", "bird", "car", "tree"]
results = clip.classify("animal.jpg", candidate_labels=labels)
print("🏷️ 分类结果:")
for result in results:
print(f"• {result['label']}: {result['score']:.2%}")
# 文本搜索图像
image_list = ["img1.jpg", "img2.jpg", "img3.jpg"]
search_results = clip.search_text_in_images(image_list, "a cat sleeping on a sofa")
print("\n🔍 搜索结果:")
for path, score in search_results:
print(f"• {path}: {score:.4f}")输出示例
🏷️ 分类结果:
• cat: 78.50%
• dog: 12.30%
• bird: 5.20%
• car: 2.80%
• tree: 1.20%
🔍 搜索结果:
• img2.jpg: 0.8923
• img1.jpg: 0.3456
• img3.jpg: 0.12342. 使用 LLaVA 进行视觉问答
python
# pip install transformers torch pillow
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import torch
class VisionQA:
"""视觉问答系统(基于 LLaVA)"""
def __init__(self, model_name: str = "llava-hf/llava-v1.6-mistral-7b-hf"):
"""
Args:
model_name: VLM 模型
- llava-hf/llava-v1.6-mistral-7b-hf: LLaVA-1.6 (7B)
- llava-hf/llava-v1.6-vicuna-13b-hf: LLaVA-1.6 (13B)
"""
self.processor = LlavaNextProcessor.from_pretrained(model_name)
self.model = LlavaNextForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map="auto"
)
def answer(
self,
image_path: str,
question: str,
max_new_tokens: int = 200
) -> str:
"""
回答关于图像的问题
Args:
image_path: 图像路径
question: 问题
max_new_tokens: 最大生成长度
Returns:
答案文本
"""
# 加载图像
image = Image.open(image_path).convert("RGB")
# 构建提示
prompt = f"USER: <image>\n{question}\nASSISTANT:"
# 预处理
inputs = self.processor(
text=prompt,
images=image,
return_tensors="pt"
).to(self.model.device)
# 生成答案
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=False
)
# 解码
answer = self.processor.decode(
outputs[0],
skip_special_tokens=True
)
# 提取 ASSISTANT 后的内容
answer = answer.split("ASSISTANT:")[-1].strip()
return answer
def describe_image(self, image_path: str) -> str:
"""生成图像描述"""
return self.answer(
image_path,
"Describe this image in detail."
)
def count_objects(self, image_path: str, object_name: str) -> str:
"""计数图像中的物体"""
return self.answer(
image_path,
f"How many {object_name}s are in this image?"
)
# 使用示例
vqa = VisionQA()
# 视觉问答
answer = vqa.answer("kitchen.jpg", "What is on the table?")
print(f"Q: What is on the table?")
print(f"A: {answer}\n")
# 图像描述
description = vqa.describe_image("landscape.jpg")
print(f"图像描述:\n{description}\n")
# 物体计数
count = vqa.count_objects("parking_lot.jpg", "car")
print(f"Q: How many cars?")
print(f"A: {count}")输出示例
Q: What is on the table?
A: There is a laptop, a coffee cup, and some books on the table.
图像描述:
This image shows a beautiful mountain landscape during sunset. The sky is painted with vibrant orange and pink hues, while snow-capped peaks stand majestically in the background. In the foreground, there's a serene lake reflecting the colorful sky.
Q: How many cars?
A: There are 7 cars visible in the parking lot.3. 使用 Qwen-VL 进行多模态理解
python
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
class QwenVL:
"""Qwen-VL 多模态模型"""
def __init__(self, model_name: str = "Qwen/Qwen-VL-Chat"):
"""
Args:
model_name: 模型名称
- Qwen/Qwen-VL-Chat: Qwen-VL 对话版
- Qwen/Qwen-VL: Qwen-VL 基础版
"""
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True
)
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
trust_remote_code=True,
torch_dtype=torch.float16
).eval()
def chat(
self,
image_path: str,
query: str,
history: list = None
) -> tuple[str, list]:
"""
多轮对话
Args:
image_path: 图像路径
query: 用户问题
history: 对话历史
Returns:
(回答, 更新后的历史)
"""
if history is None:
history = []
# 构建输入(Qwen-VL 特殊格式)
query_with_image = f"<img>{image_path}</img>\n{query}"
# 生成回答
response, history = self.model.chat(
self.tokenizer,
query=query_with_image,
history=history
)
return response, history
def analyze_image(self, image_path: str) -> dict:
"""
全面分析图像
Returns:
{
"description": "图像描述",
"objects": ["物体1", "物体2"],
"scene": "场景类型",
"text": "图像中的文字"
}
"""
results = {}
# 1. 图像描述
desc, _ = self.chat(image_path, "详细描述这张图片")
results["description"] = desc
# 2. 物体识别
objects, _ = self.chat(image_path, "列出图片中的所有物体")
results["objects"] = objects
# 3. 场景识别
scene, _ = self.chat(image_path, "这是什么场景?")
results["scene"] = scene
# 4. OCR(如果有文字)
text, _ = self.chat(image_path, "图片中有什么文字?")
results["text"] = text
return results
# 使用示例
qwen_vl = QwenVL()
# 单轮问答
response, history = qwen_vl.chat("product.jpg", "这个产品是什么?")
print(f"Q: 这个产品是什么?")
print(f"A: {response}\n")
# 多轮对话
response, history = qwen_vl.chat(
"product.jpg",
"它的主要特点是什么?",
history=history
)
print(f"Q: 它的主要特点是什么?")
print(f"A: {response}\n")
# 全面分析
analysis = qwen_vl.analyze_image("complex_scene.jpg")
print("📊 图像分析:")
for key, value in analysis.items():
print(f"• {key}: {value}")6.1.5 图像生成(文生图)
1. 使用 Stable Diffusion
python
# pip install diffusers transformers accelerate
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
import torch
class TextToImage:
"""文生图系统(Stable Diffusion)"""
def __init__(
self,
model_id: str = "stabilityai/stable-diffusion-2-1"
):
"""
Args:
model_id: 模型 ID
- stabilityai/stable-diffusion-2-1: SD 2.1
- runwayml/stable-diffusion-v1-5: SD 1.5
- stabilityai/stable-diffusion-xl-base-1.0: SDXL
"""
self.pipe = StableDiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch.float16,
safety_checker=None # 禁用安全检查器(可选)
)
# 使用更快的调度器
self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
self.pipe.scheduler.config
)
# 移动到 GPU
self.pipe = self.pipe.to("cuda")
# 启用内存优化
self.pipe.enable_attention_slicing()
def generate(
self,
prompt: str,
negative_prompt: str = None,
num_images: int = 1,
num_inference_steps: int = 30,
guidance_scale: float = 7.5,
width: int = 512,
height: int = 512,
seed: int = None
) -> list:
"""
生成图像
Args:
prompt: 正向提示词
negative_prompt: 负向提示词
num_images: 生成图像数量
num_inference_steps: 推理步数(越多越精细,但越慢)
guidance_scale: 引导强度(越高越符合提示词)
width, height: 图像尺寸
seed: 随机种子(用于复现)
Returns:
PIL Image 列表
"""
# 设置随机种子
if seed is not None:
generator = torch.Generator(device="cuda").manual_seed(seed)
else:
generator = None
# 生成
with torch.autocast("cuda"):
images = self.pipe(
prompt=prompt,
negative_prompt=negative_prompt,
num_images_per_prompt=num_images,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
width=width,
height=height,
generator=generator
).images
return images
# 使用示例
t2i = TextToImage()
# 生成图像
prompt = "a beautiful sunset over the ocean, golden hour, photorealistic, 4k"
negative_prompt = "ugly, blurry, low quality, distorted"
images = t2i.generate(
prompt=prompt,
negative_prompt=negative_prompt,
num_images=2,
num_inference_steps=50,
guidance_scale=7.5,
seed=42
)
# 保存图像
for i, image in enumerate(images):
image.save(f"generated_{i}.png")
print(f"💾 已保存:generated_{i}.png")2. 图像编辑(Inpainting)
python
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import torch
class ImageInpainter:
"""图像修复/编辑"""
def __init__(self):
self.pipe = StableDiffusionInpaintPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-inpainting",
torch_dtype=torch.float16
).to("cuda")
def inpaint(
self,
image_path: str,
mask_path: str,
prompt: str,
save_path: str = "inpainted.png"
):
"""
修复图像
Args:
image_path: 原始图像
mask_path: 掩码图像(白色区域将被重绘)
prompt: 重绘内容的描述
save_path: 保存路径
"""
# 加载图像和掩码
image = Image.open(image_path).convert("RGB")
mask = Image.open(mask_path).convert("L")
# 生成
result = self.pipe(
prompt=prompt,
image=image,
mask_image=mask,
num_inference_steps=50
).images[0]
result.save(save_path)
print(f"💾 修复结果已保存:{save_path}")
# 使用示例
inpainter = ImageInpainter()
inpainter.inpaint(
image_path="photo.jpg",
mask_path="mask.png",
prompt="a red sports car",
save_path="edited.png"
)6.1.6 实战案例:智能图像分析系统
python
"""
综合案例:智能图像分析系统
功能:
1. 物体检测
2. 场景分类
3. 视觉问答
4. 图像描述生成
"""
from typing import Dict, List
import json
class IntelligentImageAnalyzer:
"""智能图像分析系统"""
def __init__(self):
print("🔧 初始化模型...")
# 1. 物体检测
self.detector = ObjectDetector(model_size="yolov8n")
# 2. 场景分类
self.classifier = ImageClassifier(model_name="microsoft/resnet-50")
# 3. 视觉问答
self.vqa = VisionQA()
print("✅ 初始化完成")
def analyze(self, image_path: str) -> Dict:
"""
全面分析图像
Returns:
{
"objects": [...],
"scene": {...},
"description": "...",
"qa_results": {...}
}
"""
print(f"\n📸 分析图像:{image_path}\n")
results = {}
# 1. 物体检测
print("🎯 步骤 1/4:物体检测")
objects = self.detector.detect(image_path, conf_threshold=0.5)
results["objects"] = objects
print(f"✅ 检测到 {len(objects)} 个物体")
# 2. 场景分类
print("\n🏷️ 步骤 2/4:场景分类")
scene = self.classifier.classify(image_path, top_k=3)
results["scene"] = scene
print(f"✅ 场景:{scene[0]['label']} ({scene[0]['score']:.2%})")
# 3. 图像描述
print("\n📝 步骤 3/4:生成描述")
description = self.vqa.describe_image(image_path)
results["description"] = description
print(f"✅ 描述:{description[:100]}...")
# 4. 自动问答
print("\n❓ 步骤 4/4:自动问答")
qa_results = self._auto_qa(image_path, objects)
results["qa_results"] = qa_results
print(f"✅ 回答了 {len(qa_results)} 个问题")
return results
def _auto_qa(
self,
image_path: str,
detected_objects: List[Dict]
) -> Dict[str, str]:
"""基于检测结果自动生成问答"""
qa_results = {}
# 问题 1:主要物体是什么?
if detected_objects:
main_object = detected_objects[0]["class"]
qa_results["main_object"] = f"The main object is a {main_object}"
# 问题 2:有多少人?
person_count = sum(1 for obj in detected_objects if obj["class"] == "person")
if person_count > 0:
qa_results["person_count"] = f"There are {person_count} person(s)"
# 问题 3:室内还是室外?
answer = self.vqa.answer(image_path, "Is this indoors or outdoors?")
qa_results["location_type"] = answer
return qa_results
def generate_report(self, image_path: str, save_path: str = None):
"""生成分析报告"""
results = self.analyze(image_path)
# 生成 Markdown 报告
report = f"""# 图像分析报告
## 基本信息
- 图像路径:{image_path}
## 物体检测
检测到 {len(results['objects'])} 个物体:
"""
for i, obj in enumerate(results['objects'][:10], 1):
report += f"{i}. **{obj['class']}** (置信度: {obj['confidence']:.2%})\n"
report += f"""
## 场景分类
- 主要场景:**{results['scene'][0]['label']}** ({results['scene'][0]['score']:.2%})
- 次要场景:{results['scene'][1]['label']} ({results['scene'][1]['score']:.2%})
## 图像描述
{results['description']}
## 自动问答
"""
for question, answer in results['qa_results'].items():
report += f"- **{question}**: {answer}\n"
# 保存报告
if save_path:
with open(save_path, "w", encoding="utf-8") as f:
f.write(report)
print(f"\n💾 报告已保存:{save_path}")
return report
# 使用示例
analyzer = IntelligentImageAnalyzer()
# 分析图像
report = analyzer.generate_report(
image_path="test_image.jpg",
save_path="analysis_report.md"
)
print("\n" + "="*60)
print(report)6.1.7 学习资源
推荐课程
- Stanford CS231n: 计算机视觉经典课程
- Fast.ai Practical Deep Learning: 实战导向
- DeepLearning.AI Computer Vision: 吴恩达视觉课程
开源项目
- Ultralytics YOLOv8: 最先进的目标检测
- Segment Anything (SAM): Meta 的分割模型
- Grounding DINO: 开放词汇目标检测
论文阅读
- Vision Transformer (ViT): An Image is Worth 16x16 Words
- CLIP: Learning Transferable Visual Models
- LLaVA: Visual Instruction Tuning
实战练习
- 构建商品识别系统(分类 + 检测)
- 开发智能监控系统(视频目标检测)
- 实现以图搜图引擎(FAISS + 特征提取)
- 构建图像问答机器人(VLM + RAG)
关键要点
- ✅ 预训练模型优先:不要从头训练,使用 HuggingFace 模型
- ✅ 选择合适的模型:速度 vs 精度的权衡
- ✅ 批量处理优化:使用 DataLoader 提升吞吐量
- ✅ GPU 加速必备:视觉模型计算密集,CPU 太慢
- ✅ 关注最新进展:VLM 领域快速发展,持续学习
常见错误
- ❌ 忽视图像预处理(尺寸、归一化)
- ❌ 置信度阈值设置不当
- ❌ 显存溢出(batch size 过大)
- ❌ 忽视推理速度优化
下一步 学完本节后,结合语音模型(6.2)和多模态融合(6.3),你将能够构建完整的多模态 AI 应用。