transformer

Posted 2023-02-19 东东就是我

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了transformer相关的知识，希望对你有一定的参考价值。

1. transformer

深度学习attention机制中的Q,K,V分别是从哪来的？ - lllltdaf的回答 - 知乎 https://www.zhihu.com/question/325839123/answer/1903376265

自注意力可以理解为：根据数据内部各元素之间的相互关系计算权重，再按这些权重对数据加权求和。

https://zhuanlan.zhihu.com/p/338817680

2. ViT

https://blog.csdn.net/EMIvv/article/details/122764606
把图片切分成若干个 patch，将每个 patch 展开成一维向量，并与对应的位置索引（index）一起输入到 transformer 中
代码梳理
https://blog.csdn.net/level_code/article/details/126173408

# coding=utf-8
import os
import sys

# Add the directory containing this script to the module search path so that
# modules located next to it can be imported.
path = os.path.dirname(__file__)
sys.path.append(path)

'''
Author:Don
date:2022/10/10 17:36
desc:
'''
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary
from PIL  import Image

# Load the demo image. Converting to RGB guarantees three channels, which is
# what PatchEmbedding's default ``in_channels=3`` expects (a grayscale, CMYK
# or RGBA file would otherwise fail inside the convolution). The ``with``
# block closes the underlying file once the pixel data has been decoded.
with Image.open('image/15_16_09_39.jpg') as _raw_img:
    img = _raw_img.convert('RGB')
# resize to imagenet size
transform = Compose([Resize((224, 224)), ToTensor()])
x = transform(img)
x = x.unsqueeze(0)  # add batch dim -> (1, 3, 224, 224)


class PatchEmbedding(nn.Module):
    """Convert a batch of images into a sequence of token embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to ``emb_size`` features. The
    resulting grid is flattened into a sequence, a learnable class token is
    prepended, and a learnable position embedding is added.

    Args:
        in_channels: Number of channels of the input images.
        patch_size: Side length of the square patches.
        emb_size: Size of each token embedding.
        img_size: Side length of the (square) input images. It fixes the
            number of position embeddings, so inputs are expected to have
            this spatial size.
    """

    def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224):
        # nn.Module has to be initialised before any attribute is assigned.
        super().__init__()
        self.patch_size = patch_size
        self.projection = nn.Sequential(
            # using a conv layer instead of a linear one -> performance gains
            nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e (h) (w) -> b (h w) e'),
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        # The image is square, so there are (img_size // patch_size) ** 2
        # patches; the extra +1 is the slot for the class token at position 0.
        self.positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, emb_size))

    def forward(self, x: Tensor) -> Tensor:
        """Embed ``x`` of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, num_patches + 1, emb_size).
        """
        b, _, _, _ = x.shape
        x = self.projection(x)
        # One copy of the class token per sample in the batch.
        cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b)
        # prepend the cls token to the input
        x = torch.cat([cls_tokens, x], dim=1)
        # add position embedding (out of place, so no tensor is mutated)
        return x + self.positions


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are produced by a single fused linear layer,
    split into ``num_heads`` heads, combined as
    ``softmax(Q @ K^T / sqrt(head_dim)) @ V`` and projected back to
    ``emb_size``.

    Args:
        emb_size: Size of the token embeddings (input and output).
        num_heads: Number of attention heads; must divide ``emb_size``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``emb_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, emb_size: int = 768, num_heads: int = 8, dropout: float = 0):
        super().__init__()
        if emb_size % num_heads != 0:
            raise ValueError(
                f"emb_size ({emb_size}) must be divisible by num_heads ({num_heads})"
            )
        self.emb_size = emb_size
        self.num_heads = num_heads
        # fuse the queries, keys and values in one matrix
        self.qkv = nn.Linear(emb_size, emb_size * 3)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)

    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
        """Apply self-attention to ``x`` of shape (batch, seq_len, emb_size).

        Args:
            x: Input token embeddings.
            mask: Optional boolean tensor broadcastable to
                (batch, num_heads, seq_len, seq_len). ``True`` marks
                positions that may be attended to; ``False`` positions are
                excluded.

        Returns:
            Tensor of shape (batch, seq_len, emb_size).
        """
        batch, seq_len, _ = x.shape
        head_dim = self.emb_size // self.num_heads

        # Split the fused projection into heads. The last axis is laid out as
        # (head, head_dim, qkv), i.e. the einops pattern
        # "b n (h d qkv) -> (qkv) b h n d".
        qkv = self.qkv(x).reshape(batch, seq_len, self.num_heads, head_dim, 3)
        queries, keys, values = qkv.permute(4, 0, 2, 1, 3).unbind(0)

        # batch, num_heads, query_len, key_len
        energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys)
        # Scale BEFORE the softmax and by the per-head dimension, so the
        # attention weights still sum to one along the key axis.
        energy = energy * head_dim ** -0.5

        if mask is not None:
            # masked_fill is out of place: its result has to be kept.
            fill_value = torch.finfo(energy.dtype).min
            energy = energy.masked_fill(~mask, fill_value)

        att = F.softmax(energy, dim=-1)
        att = self.att_drop(att)

        # Weighted sum of the values over the key axis.
        out = torch.einsum('bhal, bhlv -> bhav', att, values)
        # Merge the heads back: "b h n d -> b n (h d)".
        out = out.permute(0, 2, 1, 3).reshape(batch, seq_len, self.emb_size)
        return self.projection(out)


class ResidualAdd(nn.Module):
    """Wrap a module with a skip connection.

    Calling the wrapper returns ``fn(x, **kwargs) + x``.

    Args:
        fn: The module (or callable) whose output is added to its input.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``self.fn(x, **kwargs) + x``; extra kwargs go to ``fn``."""
        # Out-of-place add on purpose. An in-place ``+=`` would overwrite the
        # caller's tensor whenever ``fn`` returns its input unchanged, and
        # would break backward for ops that save their output (e.g. ReLU).
        return self.fn(x, **kwargs) + x




class FeedForwardBlock(nn.Sequential):
    """Two-layer MLP applied to every token.

    Layers, in order: a linear expansion from ``emb_size`` to
    ``expansion * emb_size``, GELU, dropout with probability ``drop_p``, and
    a linear projection back to ``emb_size``.
    """

    def __init__(self, emb_size: int, expansion: int = 4, drop_p: float = 0.):
        hidden_size = expansion * emb_size
        layers = [
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, emb_size),
        ]
        super().__init__(*layers)


class TransformerEncoderBlock(nn.Sequential):
    """One encoder layer: attention and feed-forward, each in a residual branch.

    Both sub-layers follow the same recipe inside their ``ResidualAdd``:
    LayerNorm, then the sub-layer itself, then dropout.

    Args:
        emb_size: Size of the token embeddings.
        drop_p: Dropout probability applied after each sub-layer.
        forward_expansion: Hidden-size multiplier of the feed-forward block.
        forward_drop_p: Dropout probability inside the feed-forward block.
        **kwargs: Forwarded to ``MultiHeadAttention``.
    """

    def __init__(self,
                 emb_size: int = 768,
                 drop_p: float = 0.,
                 forward_expansion: int = 4,
                 forward_drop_p: float = 0.,
                 **kwargs):
        attention_branch = ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            MultiHeadAttention(emb_size, **kwargs),
            nn.Dropout(drop_p),
        ))
        feed_forward_branch = ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            FeedForwardBlock(
                emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
            nn.Dropout(drop_p),
        ))
        super().__init__(attention_branch, feed_forward_branch)

class ClassificationHead(nn.Sequential):
    """Map a token sequence to class scores.

    The tokens are averaged over the sequence axis, normalised with
    LayerNorm, and projected to ``n_classes`` outputs.
    """

    def __init__(self, emb_size: int = 768, n_classes: int = 1000):
        pool_tokens = Reduce('b n e -> b e', reduction='mean')
        normalise = nn.LayerNorm(emb_size)
        classifier = nn.Linear(emb_size, n_classes)
        super().__init__(pool_tokens, normalise, classifier)


class ViT(nn.Sequential):
    """Vision Transformer: patch embedding, encoder stack, classification head.

    Args:
        in_channels: Number of channels of the input images.
        patch_size: Side length of the square patches.
        emb_size: Size of the token embeddings.
        img_size: Side length of the (square) input images.
        depth: Number of ``TransformerEncoderBlock`` layers in the encoder.
        n_classes: Number of output classes.
        **kwargs: Forwarded to every ``TransformerEncoderBlock``.
    """

    def __init__(self,
                 in_channels: int = 3,
                 patch_size: int = 16,
                 emb_size: int = 768,
                 img_size: int = 224,
                 depth: int = 12,
                 n_classes: int = 1000,
                 **kwargs):
        super().__init__(
            PatchEmbedding(in_channels, patch_size, emb_size, img_size),
            # Stack ``depth`` independent encoder blocks. Previously the
            # ``depth`` argument was ignored and a single block was used.
            nn.Sequential(*[
                TransformerEncoderBlock(emb_size=emb_size, **kwargs)
                for _ in range(depth)
            ]),
            ClassificationHead(emb_size, n_classes),
        )

# Smoke test: build a ViT with its default arguments, run it on the image
# tensor `x` prepared above, and print the shape of the model output.
print(ViT()(x).shape)

3. swin transformer

https://zhuanlan.zhihu.com/p/367111046

以上是关于transformer的主要内容，如果未能解决你的问题，请参考以下文章