An Intuitive Explanation of What Q, K, and V Mean in the Attention Mechanism
Take translation as an example.
For example, when the target word being generated is "I", the query Q is "I" (more concretely, the decoder state at that step),
while each word in the source sentence, "我", "是", "中国人", is a key K.
Q is then aligned against every K in the source via a similarity computation: the similarity of "I" with "我", of "I" with "是", and of "I" with "中国人".
Normalizing these similarities gives the alignment probabilities (the similarity of "I" with each source word, summing to 1), which are also called the attention weights.
V is the output vector at each source position; in an RNN model it is the corresponding hidden state, so key and value are the same.
Each V is then weighted by its corresponding probability P and the results are summed, which gives the context vector.
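A minimal numeric sketch of this computation (the vectors below are made-up toy values, not taken from any real model): the query for "I" is scored against every source key, the scores are softmax-normalized into alignment probabilities, and those probabilities weight the values to form the context vector.

import torch
import torch.nn.functional as F

# toy query vector for the target word "I" (made-up values)
q = torch.tensor([1.0, 0.0, 1.0, 0.0])

# toy keys for the source words "我", "是", "中国人"
# (with an RNN encoder, key and value are the same hidden state)
K = torch.tensor([[1.0, 0.2, 0.9, 0.1],   # "我"
                  [0.1, 1.0, 0.0, 0.8],   # "是"
                  [0.3, 0.4, 0.2, 0.9]])  # "中国人"
V = K.clone()

scores = K @ q                       # similarity of the query with each key
weights = F.softmax(scores, dim=0)   # alignment probabilities, sum to 1
context = weights @ V                # weighted sum of values = context vector
print(weights)                       # highest weight falls on "我", the word aligned with "I"
print(context)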
I found a figure online that captures this understanding well.
The v in that figure is not the same as the V above: what is shown is a single-hidden-layer feedforward network, and v is the weight vector that turns the hidden layer's activation into a scalar score,
while w is the weight matrix applied before the activation.
The network's output is a single score, so the same scoring network has to be run once for every h_i. I had expected a single forward pass to output all the scores at once, i.e. an output dimension equal to the length of the input sequence, but that would tie the network to one fixed sequence length; instead, one shared scoring network is applied to each position in turn (my current understanding).
All the scores are then normalized, usually with softmax, so that the weights sum to 1.
Stage two: each h_i is multiplied by its weight and the results are summed, giving the context vector, i.e. the attention value.
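Here is a sketch of that two-stage computation in the style of Bahdanau-type additive attention (the shapes and the names W_h, W_s, v are my own illustrative assumptions, not read off the figure): the weight matrices project the decoder state s and each encoder state h_i into a hidden layer, tanh activates it, and v maps each hidden vector to a scalar score, so one shared feedforward net scores every h_i; softmax then normalizes the scores and the h_i are summed with those weights.

import torch
import torch.nn.functional as F

hidden = 8                          # encoder/decoder hidden size (assumed)
H = torch.randn(5, hidden)          # encoder states h_1 .. h_5
s = torch.randn(hidden)             # current decoder state

W_h = torch.randn(hidden, hidden)   # pre-activation weights applied to each h_i
W_s = torch.randn(hidden, hidden)   # pre-activation weights applied to s
v = torch.randn(hidden)             # post-activation scoring vector

# stage 1: one scalar score per encoder state, all from the same small network
scores = torch.tanh(H @ W_h + s @ W_s) @ v   # shape (5,)

# stage 2: softmax-normalize the scores and take the weighted sum of the h_i
weights = F.softmax(scores, dim=0)           # sums to 1
context = weights @ H                        # the context vector, i.e. the attention value
print(weights, context.shape)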
[PS: the Q, K, and V discussed in this post are limited to the seq2seq setting]
Reading this far, you might just want to exclaim: what on earth is all this? Don't panic; let's look at an example first.
Since we are discussing seq2seq tasks, let's look at machine translation.
Suppose we want to translate 我喜欢看电影 into I like watching movies; decoding would then follow the same Q/K/V alignment steps described above, one target word at a time.
1. transformer
Where do the Q, K, and V in deep learning's attention mechanism come from? - lllltdaf's answer on Zhihu https://www.zhihu.com/question/325839123/answer/1903376265
Understanding self-attention as weights over the relationships within the data:
https://zhuanlan.zhihu.com/p/338817680
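A minimal self-attention sketch along the lines of that post (the dimensions and layer names below are illustrative assumptions): Q, K and V are all linear projections of the same input sequence, so each token's output is a weighted mix of every token in that sequence, and the weights express how strongly the tokens relate to one another.

import torch
import torch.nn.functional as F
from torch import nn

seq_len, d_model = 4, 16
x = torch.randn(1, seq_len, d_model)   # one sequence of 4 tokens

to_q = nn.Linear(d_model, d_model)
to_k = nn.Linear(d_model, d_model)
to_v = nn.Linear(d_model, d_model)

Q, K, V = to_q(x), to_k(x), to_v(x)    # all three come from the same x

scores = Q @ K.transpose(-2, -1) / d_model ** 0.5   # token-to-token relation weights
attn = F.softmax(scores, dim=-1)                    # each row sums to 1
out = attn @ V                                      # (1, 4, 16): every token is a mix of all tokens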
2. ViT
https://blog.csdn.net/EMIvv/article/details/122764606
The image is cut into patches that are flattened into a 1D sequence of vectors, which, together with their position indices, is fed into the transformer.
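A small sketch of that flattening step (the 16x16 patch size and 224x224 image size are assumptions matching the code below): each patch is flattened into one vector, so the image becomes a sequence of patch tokens that the transformer can treat like words.

import torch
from einops import rearrange

img = torch.randn(1, 3, 224, 224)   # (batch, channels, height, width)

# cut the image into 16x16 patches and flatten each patch into a vector
patches = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=16, p2=16)
print(patches.shape)                # torch.Size([1, 196, 768]): 196 patch tokens of dim 768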
Code walkthrough
https://blog.csdn.net/level_code/article/details/126173408
# coding=utf-8
import os
import sys
path = os.path.dirname(__file__)
sys.path.append(path)
'''
Author:Don
date:2022/10/10 17:36
desc:
'''
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary
img=Image.open('image/15_16_09_39.jpg')
# resize to imagenet size
transform = Compose([Resize((224, 224)), ToTensor()])
x = transform(img)
x = x.unsqueeze(0) # add batch dim
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224):
        super().__init__()
        self.patch_size = patch_size
self.projection = nn.Sequential(
# using a conv layer instead of a linear one -> performance gains
nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
Rearrange('b e (h) (w) -> b (h w) e'),
)
self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        # img_size is square, so (img_size // patch_size) ** 2 is the number of patches, plus 1 for the class token at position 0
self.positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, emb_size))
def forward(self, x: Tensor) -> Tensor:
b, _, _, _ = x.shape
x = self.projection(x)
cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b)
# prepend the cls token to the input
x = torch.cat([cls_tokens, x], dim=1)
# add position embedding
x += self.positions
return x
class MultiHeadAttention(nn.Module):
def __init__(self, emb_size: int = 768, num_heads: int = 8, dropout: float = 0):
super().__init__()
self.emb_size = emb_size
self.num_heads = num_heads
# fuse the queries, keys and values in one matrix
self.qkv = nn.Linear(emb_size, emb_size * 3)
self.att_drop = nn.Dropout(dropout)
self.projection = nn.Linear(emb_size, emb_size)
def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
# split keys, queries and values in num_heads
qkv = rearrange(self.qkv(x), "b n (h d qkv) -> (qkv) b h n d", h=self.num_heads, qkv=3)
queries, keys, values = qkv[0], qkv[1], qkv[2]
# sum up over the last axis
energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) # batch, num_heads, query_len, key_len
if mask is not None:
fill_value = torch.finfo(torch.float32).min
            energy = energy.masked_fill(~mask, fill_value)
        scaling = self.emb_size ** (1 / 2)
        # scale the raw scores before softmax so the attention weights still sum to 1
        att = F.softmax(energy / scaling, dim=-1)
att = self.att_drop(att)
# sum up over the third axis
out = torch.einsum('bhal, bhlv -> bhav ', att, values)
out = rearrange(out, "b h n d -> b n (h d)")
out = self.projection(out)
return out
class ResidualAdd(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
res = x
x = self.fn(x, **kwargs)
x += res
return x
class FeedForwardBlock(nn.Sequential):
def __init__(self, emb_size: int, expansion: int = 4, drop_p: float = 0.):
super().__init__(
nn.Linear(emb_size, expansion * emb_size),
nn.GELU(),
nn.Dropout(drop_p),
nn.Linear(expansion * emb_size, emb_size),
)
class TransformerEncoderBlock(nn.Sequential):
def __init__(self,
emb_size: int = 768,
drop_p: float = 0.,
forward_expansion: int = 4,
forward_drop_p: float = 0.,
                 **kwargs):
super().__init__(
ResidualAdd(nn.Sequential(
nn.LayerNorm(emb_size),
MultiHeadAttention(emb_size, **kwargs),
nn.Dropout(drop_p)
)),
ResidualAdd(nn.Sequential(
nn.LayerNorm(emb_size),
FeedForwardBlock(
emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
nn.Dropout(drop_p)
)
))
class ClassificationHead(nn.Sequential):
def __init__(self, emb_size: int = 768, n_classes: int = 1000):
super().__init__(
Reduce('b n e -> b e', reduction='mean'),
nn.LayerNorm(emb_size),
nn.Linear(emb_size, n_classes))
class ViT(nn.Sequential):
def __init__(self,
in_channels: int = 3,
patch_size: int = 16,
emb_size: int = 768,
img_size: int = 224,
depth: int = 12,
n_classes: int = 1000,
**kwargs):
super().__init__(
PatchEmbedding(in_channels, patch_size, emb_size, img_size),
            # stack `depth` transformer encoder blocks
            *[TransformerEncoderBlock(emb_size=emb_size, **kwargs) for _ in range(depth)],
ClassificationHead(emb_size, n_classes)
)
print(ViT()(x).shape)
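Since torchsummary is already imported above, a layer-by-layer summary can also be printed; this assumes the summary(model, input_size, device=...) signature of the torchsummary package and is only a quick inspection aid.

# print a layer-by-layer summary of the ViT defined above
summary(ViT(), (3, 224, 224), device='cpu')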
3. swin transformer
https://zhuanlan.zhihu.com/p/367111046