prompt工程起步

1.手工提示词

CLIP 和 ActionCLIP 中使用的手工提示词（hand-crafted prompt）是一个进步：把类别标签填入不同的模板句中，组成一句完整的话，以增强语义理解。

 def text_prompt(data):
    """Tokenize every class label under each hand-written prompt template.

    Args:
        data: Object whose ``classes`` attribute iterates as 2-item entries;
            the second item of each entry is the label text that gets
            formatted into the templates.

    Returns:
        A tuple ``(classes, num_text_aug, text_dict)`` where

        * ``text_dict`` maps a template index to the concatenation of the
          tokenized prompts of all labels for that template,
        * ``num_text_aug`` is the number of templates,
        * ``classes`` is all ``text_dict`` values concatenated in template
          order.
    """
    # Plain strings: the templates contain a single ``{}`` placeholder that is
    # filled in later through ``str.format``.
    text_aug = [
        "a photo of action {}", "a picture of action {}", "Human action of {}", "{}, an action",
        "{} this is an action", "{}, a video of action", "Playing action of {}", "{}",
        "Playing a kind of action, {}", "Doing a kind of action, {}", "Look, the human is {}",
        "Can you recognize the action of {}?", "Video classification of {}", "A video of {}",
        "The man is {}", "The woman is {}",
    ]
    num_text_aug = len(text_aug)

    text_dict = {}
    for ii, txt in enumerate(text_aug):
        # One tokenized prompt per label, stacked along the first dimension.
        text_dict[ii] = torch.cat([clip.tokenize(txt.format(c)) for _, c in data.classes])

    # Dict insertion order is template order, so the values can be joined directly.
    classes = torch.cat(list(text_dict.values()))

    return classes, num_text_aug, text_dict

这就是一个针对特定任务设计的手工提示词。text_dict 以模板编号为键，对应的值是所有类别标签填入该模板后经 CLIP 分词得到的张量。

应用为：

 # Draw one template index per sample: integers in [0, num_text_aug).
 text_id = numpy.random.randint(num_text_aug, size=len(list_id))
 # text_dict[j][i, :] takes the token tensor of template j and selects row i,
 # where i comes from list_id (presumably the class index of the sample).
 texts = torch.stack([text_dict[j][i, :] for i, j in zip(list_id, text_id)])

为每个输入标签随机选取一个模板，生成的文本再与视频进行对比学习。

2.自动生成提示

(1) 视觉特征的提取
首先，从视频中提取视觉特征。这些特征可以是视频帧的特征，也可以是视频的整体特征。视觉特征的提取通常使用预训练的视觉模型（如 CLIP 的视觉编码器）来完成。

(2) 文本特征的初始化
文本特征可以是类别标签的嵌入，也可以是其他与任务相关的文本信息。这些文本特征将作为提示的初始输入。

(3)跨模态注意力机制

 class MulitHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with separate q / k / v inputs.

    Queries, keys and values each go through their own linear projection, are
    split into ``num_heads`` heads, attended, merged back and passed through
    an output projection.

    Args:
        dim: Channel size of the inputs and of the output.
        num_heads: Number of heads; each head gets ``dim // num_heads`` channels.
        qkv_bias: Whether the q / k / v projections carry a bias term.
        qk_scale: Multiplier for the attention logits. Falsy values (``None``
            or ``0``) fall back to ``head_dim ** -0.5``.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.scale = qk_scale or head_dim ** -0.5

        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: Query tensor of shape ``(B, N, C)``.
            k: Key tensor of shape ``(B, M, C)``.
            v: Value tensor, reshaped with the same ``(B, M, C)`` layout as ``k``.

        Returns:
            Tensor of shape ``(B, N, C)``.
        """
        B, N, C = q.shape
        M = k.shape[1]
        heads = self.num_heads

        def split_heads(t, length):
            # (B, length, C) -> (B, heads, length, C // heads)
            return t.reshape(B, length, heads, C // heads).permute(0, 2, 1, 3)

        q = split_heads(self.q_proj(q), N)
        k = split_heads(self.k_proj(k), M)
        v = split_heads(self.v_proj(v), M)

        # (B, heads, N, M) attention weights, normalised over the key axis.
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # Merge the heads back into the channel dimension.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

（4） PromptGeneratorLayer（本质上是一个只含交叉注意力和 MLP 的 Transformer 层）

 class PromptGeneratorLayer(nn.Module):
    """One prompt-generation layer: cross-attention followed by an MLP.

    ``x`` is layer-normalised and used as the query; ``visual`` is used as
    both key and value. Both sub-blocks are wrapped in residual connections.

    Args:
        d_model: Channel size of ``x`` and ``visual``.
        nhead: Number of attention heads.
        dropout: Dropout probability used after the attention output
            projection, inside the MLP and on the MLP output.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dropout=0.,
    ):
        super().__init__()
        self.cross_attn = MulitHeadAttention(d_model, nhead, proj_drop=dropout)

        # Attribute names (norm1 / norm3) are kept as-is so parameter names
        # of existing checkpoints still match.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

        self.mlp = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            QuickGELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x, visual):
        """Update ``x`` by attending over ``visual``; returns a tensor shaped like ``x``."""
        # Only the query is normalised here; ``visual`` is passed through
        # unchanged as key and value.
        q = self.norm1(x)
        x = x + self.cross_attn(q, visual, visual)
        x = x + self.dropout(self.mlp(self.norm3(x)))
        return x

（5）融合

 class VideoSpecificPrompt(nn.Module):
    """Stack of ``PromptGeneratorLayer`` modules that conditions text on video.

    Args:
        layers: Number of ``PromptGeneratorLayer`` modules in the stack.
        embed_dim: Channel size shared by the text and visual inputs.
        alpha: Initial value of every entry of the learnable per-channel
            scale applied to the output.
    """

    def __init__(self, layers=2, embed_dim=512, alpha=0.1,):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        # embed_dim // 64 heads, i.e. 64 channels per head when embed_dim is a
        # multiple of 64.
        self.decoder = nn.ModuleList([PromptGeneratorLayer(embed_dim, embed_dim//64) for _ in range(layers)])
        self.alpha = nn.Parameter(torch.ones(embed_dim) * alpha)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear weights (truncated normal, std 0.02) and LayerNorm to identity."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, text, visual):
        """Refine ``text`` with ``visual`` and return it scaled by ``alpha``.

        Args:
            text: Text features used as the attention queries.
            visual: Visual features; layer-normalised once, then used as key
                and value in every layer.

        Returns:
            ``self.alpha * text`` after all layers have been applied.
        """
        visual = self.norm(visual)
        for layer in self.decoder:
            text = layer(text, visual)

        # The snippet stopped here without returning anything and never used
        # ``alpha``; returning the alpha-scaled text follows the upstream
        # X-CLIP implementation.
        return self.alpha * text

应用：

 def generate_text(data):
    """Tokenize the bare class labels (no prompt template around them).

    Args:
        data: Object whose ``classes`` attribute iterates as 2-item entries;
            the second item of each entry is the label text.

    Returns:
        The tokenized labels concatenated along the first dimension, one row
        per label with ``context_length=77`` tokens each.
    """
    text_aug = "{}"
    classes = torch.cat([clip.tokenize(text_aug.format(c), context_length=77) for _, c in data.classes])

    return classes

   def classes(self):
        """Return the rows of the CSV at ``self.labels_file`` as a list of lists.

        NOTE(review): other snippets iterate ``data.classes`` without calling
        it, so this is presumably decorated with ``@property`` in the
        enclosing class -- confirm against the full source.
        """
        classes_all = pd.read_csv(self.labels_file)
        return classes_all.values.tolist()

generate_text 返回的 classes 是一个 torch.Tensor 类型的对象，其形状为 (n, 77)，其中 n 是类别名称的数量，每一行代表一个类别名称经过 CLIP 分词后的结果。

prompt工程起步

1.手工提示词

2.自动生成提示

Python Selenium全栈指南：从自动化入门到企业级实战

【一篇搞定配置】一篇带你从配置到使用（PyCharm远程）完成服务器运行项目（配置、使用一条龙）【全网最详细版】

父子进程的故事：解读Linux中的fork机制

超详细Python教程——用Python解析HTML页面

HTML 音频(Audio)

jQuery 选择器

Html5学习教程，从入门到精通，HTML5 简介语法知识点及案例代码（1）

CSS3 字体

【CSS 面经】display 属性值及其作用详解

CSS语言的区块链

前端哥

【Linux笔记】基础IO(上)

Windows11离线安装wsl2(适用于 Linux 的 Windows 子系统)和Ubuntu

Linux 服务器运维常用命令大全

深入了解 Ubuntu 中的 build-essential：开发者的必备工具

RagFlow Docker Desktop Ollama DeepSeek-R1本地部署自己的本地AI大模型工具

Linux 学习必杀技：从菜鸟到高手的蜕变密码

Trae AI 全新功能 SSHremote助力开发者个人产品打造，高效构建全栈应用

Python Selenium全栈指南：从自动化入门到企业级实战

Spring Cloud之负载均衡之LoadBalance

【一篇搞定配置】一篇带你从配置到使用（PyCharm远程）完成服务器运行项目（配置、使用一条龙）【全网最详细版】

1
【Echarts系列】—— 实现电池图、3D立体圆形柱状图

2024-03-03 11:03:011001

2
CSS 动画效果（5种） - 附完整示例

2025-02-28 12:02:481000

3
在Vue中实现与OpenAI对话的功能

2025-02-27 11:02:161000

4
jQuery.flowchart 项目常见问题解决方案

2025-02-24 13:02:021000

5
CSS常用属性（文本属性）

2024-11-04 09:11:111000

6
TypeScript 中的 Number 类型，Number 类型的特性、常见操作和注意事项

2024-09-30 23:09:061000

7
CSS写代码使页面划分为左右两个区域

2024-09-09 00:09:071000

8
vue使用datav echarts

2024-09-06 00:09:381000

9
使用TweenMax.js和CSS3创建冰球运动员动画效果教程

2024-09-04 23:09:411000

10
使用CDN提高jQuery加载速度

2024-08-24 23:08:211000

	def text_prompt(data):
	    """Tokenize every class label under each hand-written prompt template.

	    Args:
	        data: Object whose ``classes`` attribute iterates as 2-item entries;
	            the second item of each entry is the label text that gets
	            formatted into the templates.

	    Returns:
	        A tuple ``(classes, num_text_aug, text_dict)``: ``text_dict`` maps a
	        template index to the concatenated tokenized prompts of all labels,
	        ``num_text_aug`` is the number of templates, and ``classes`` is all
	        ``text_dict`` values concatenated in template order.
	    """
	    # Plain strings: the single ``{}`` placeholder is filled via ``str.format``.
	    text_aug = [
	        "a photo of action {}", "a picture of action {}", "Human action of {}", "{}, an action",
	        "{} this is an action", "{}, a video of action", "Playing action of {}", "{}",
	        "Playing a kind of action, {}", "Doing a kind of action, {}", "Look, the human is {}",
	        "Can you recognize the action of {}?", "Video classification of {}", "A video of {}",
	        "The man is {}", "The woman is {}",
	    ]
	    num_text_aug = len(text_aug)

	    text_dict = {}
	    for ii, txt in enumerate(text_aug):
	        # One tokenized prompt per label, stacked along the first dimension.
	        text_dict[ii] = torch.cat([clip.tokenize(txt.format(c)) for _, c in data.classes])

	    # Dict insertion order is template order, so the values can be joined directly.
	    classes = torch.cat(list(text_dict.values()))

	    return classes, num_text_aug, text_dict

	# Draw one template index per sample: integers in [0, num_text_aug).
	text_id = numpy.random.randint(num_text_aug, size=len(list_id))
	# text_dict[j][i, :] takes the token tensor of template j and selects row i,
	# where i comes from list_id (presumably the class index of the sample).
	texts = torch.stack([text_dict[j][i, :] for i, j in zip(list_id, text_id)])

	class MulitHeadAttention(nn.Module):
	    """Multi-head scaled dot-product attention with separate q / k / v inputs.

	    Queries, keys and values each go through their own linear projection, are
	    split into ``num_heads`` heads, attended, merged back and passed through
	    an output projection.

	    Args:
	        dim: Channel size of the inputs and of the output.
	        num_heads: Number of heads; each head gets ``dim // num_heads`` channels.
	        qkv_bias: Whether the q / k / v projections carry a bias term.
	        qk_scale: Multiplier for the attention logits. Falsy values (``None``
	            or ``0``) fall back to ``head_dim ** -0.5``.
	        attn_drop: Dropout probability applied to the attention weights.
	        proj_drop: Dropout probability applied after the output projection.
	    """

	    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
	        super().__init__()
	        self.num_heads = num_heads
	        head_dim = dim // num_heads

	        self.scale = qk_scale or head_dim ** -0.5

	        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
	        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
	        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

	        self.attn_drop = nn.Dropout(attn_drop)
	        self.proj = nn.Linear(dim, dim)
	        self.proj_drop = nn.Dropout(proj_drop)

	    def forward(self, q, k, v):
	        """Attend from ``q`` (B, N, C) over ``k`` / ``v`` (B, M, C); returns (B, N, C)."""
	        B, N, C = q.shape
	        M = k.shape[1]
	        heads = self.num_heads

	        def split_heads(t, length):
	            # (B, length, C) -> (B, heads, length, C // heads)
	            return t.reshape(B, length, heads, C // heads).permute(0, 2, 1, 3)

	        q = split_heads(self.q_proj(q), N)
	        k = split_heads(self.k_proj(k), M)
	        v = split_heads(self.v_proj(v), M)

	        # (B, heads, N, M) attention weights, normalised over the key axis.
	        attn = (q @ k.transpose(-2, -1)) * self.scale
	        attn = attn.softmax(dim=-1)
	        attn = self.attn_drop(attn)

	        # Merge the heads back into the channel dimension.
	        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
	        x = self.proj(x)
	        x = self.proj_drop(x)
	        return x

	class PromptGeneratorLayer(nn.Module):
	    """One prompt-generation layer: cross-attention followed by an MLP.

	    ``x`` is layer-normalised and used as the query; ``visual`` is used as
	    both key and value. Both sub-blocks are wrapped in residual connections.

	    Args:
	        d_model: Channel size of ``x`` and ``visual``.
	        nhead: Number of attention heads.
	        dropout: Dropout probability used after the attention output
	            projection, inside the MLP and on the MLP output.
	    """

	    def __init__(
	        self,
	        d_model,
	        nhead,
	        dropout=0.,
	    ):
	        super().__init__()
	        self.cross_attn = MulitHeadAttention(d_model, nhead, proj_drop=dropout)

	        # Attribute names (norm1 / norm3) are kept as-is so parameter names
	        # of existing checkpoints still match.
	        self.norm1 = nn.LayerNorm(d_model)
	        self.norm3 = nn.LayerNorm(d_model)

	        self.dropout = nn.Dropout(dropout)

	        self.mlp = nn.Sequential(
	            nn.Linear(d_model, d_model * 4),
	            QuickGELU(),
	            nn.Dropout(dropout),
	            nn.Linear(d_model * 4, d_model)
	        )

	    def forward(self, x, visual):
	        """Update ``x`` by attending over ``visual``; returns a tensor shaped like ``x``."""
	        # Only the query is normalised here; ``visual`` is passed through
	        # unchanged as key and value.
	        q = self.norm1(x)
	        x = x + self.cross_attn(q, visual, visual)
	        x = x + self.dropout(self.mlp(self.norm3(x)))
	        return x

	class VideoSpecificPrompt(nn.Module):
	    """Stack of ``PromptGeneratorLayer`` modules that conditions text on video.

	    Args:
	        layers: Number of ``PromptGeneratorLayer`` modules in the stack.
	        embed_dim: Channel size shared by the text and visual inputs.
	        alpha: Initial value of every entry of the learnable per-channel
	            scale applied to the output.
	    """

	    def __init__(self, layers=2, embed_dim=512, alpha=0.1,):
	        super().__init__()
	        self.norm = nn.LayerNorm(embed_dim)
	        # embed_dim // 64 heads, i.e. 64 channels per head when embed_dim is a
	        # multiple of 64.
	        self.decoder = nn.ModuleList([PromptGeneratorLayer(embed_dim, embed_dim//64) for _ in range(layers)])
	        self.alpha = nn.Parameter(torch.ones(embed_dim) * alpha)
	        self.apply(self._init_weights)

	    def _init_weights(self, m):
	        """Initialise Linear weights (truncated normal, std 0.02) and LayerNorm to identity."""
	        if isinstance(m, nn.Linear):
	            trunc_normal_(m.weight, std=.02)
	            if m.bias is not None:
	                nn.init.constant_(m.bias, 0)
	        elif isinstance(m, nn.LayerNorm):
	            nn.init.constant_(m.bias, 0)
	            nn.init.constant_(m.weight, 1.0)

	    def forward(self, text, visual):
	        """Refine ``text`` with ``visual`` and return it scaled by ``alpha``."""
	        # ``visual`` is layer-normalised once, then used as key and value in
	        # every layer.
	        visual = self.norm(visual)
	        for layer in self.decoder:
	            text = layer(text, visual)

	        # The snippet stopped here without returning anything and never used
	        # ``alpha``; returning the alpha-scaled text follows the upstream
	        # X-CLIP implementation.
	        return self.alpha * text

	def generate_text(data):
	    """Tokenize the bare class labels (no prompt template around them).

	    Args:
	        data: Object whose ``classes`` attribute iterates as 2-item entries;
	            the second item of each entry is the label text.

	    Returns:
	        The tokenized labels concatenated along the first dimension, one row
	        per label with ``context_length=77`` tokens each.
	    """
	    text_aug = "{}"
	    classes = torch.cat([clip.tokenize(text_aug.format(c), context_length=77) for _, c in data.classes])

	    return classes

	def classes(self):
	    """Return the rows of the CSV at ``self.labels_file`` as a list of lists.

	    NOTE(review): other snippets iterate ``data.classes`` without calling
	    it, so this is presumably decorated with ``@property`` in the
	    enclosing class -- confirm against the full source.
	    """
	    classes_all = pd.read_csv(self.labels_file)
	    return classes_all.values.tolist()

prompt工程起步

1.手工提示词

2.自动生成提示

微信扫一扫：分享