# NOTE(review): this fragment is the tail of the model constructor; the
# enclosing `def __init__` (and any surrounding variable scope) lies above
# this chunk.  Indentation reconstructed — the pasted source had none.
# Look up the word embeddings for `input_ids`.
(self.embedding_output, self.embedding_table) = embedding_lookup(
    input_ids=input_ids,
    vocab_size=config.vocab_size,
    embedding_size=config.hidden_size,
    initializer_range=config.initializer_range,
    word_embedding_name="word_embeddings",
    use_one_hot_embeddings=use_one_hot_embeddings)

# Add positional embeddings and token type embeddings, then layer
# normalize and perform dropout.
self.embedding_output = embedding_postprocessor(
    input_tensor=self.embedding_output,
    use_token_type=True,
    token_type_ids=token_type_ids,
    token_type_vocab_size=config.type_vocab_size,
    token_type_embedding_name="token_type_embeddings",
    use_position_embeddings=True,
    position_embedding_name="position_embeddings",
    initializer_range=config.initializer_range,
    max_position_embeddings=config.max_position_embeddings,
    dropout_prob=config.hidden_dropout_prob)

with tf.variable_scope("encoder"):
    # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
    # mask of shape [batch_size, seq_length, seq_length] which is used
    # for the attention scores.
    attention_mask = create_attention_mask_from_input_mask(
        input_ids, input_mask)

    # Run the stacked transformer.
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers = transformer_model(
        input_tensor=self.embedding_output,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)

    # The last list element is the final encoder layer's output.
    self.sequence_output = self.all_encoder_layers[-1]

# The "pooler" converts the encoded sequence tensor of shape
# [batch_size, seq_length, hidden_size] to a tensor of shape
# [batch_size, hidden_size].  This is necessary for segment-level
# (or segment-pair-level) classification tasks where we need a fixed
# dimensional representation of the segment.
with tf.variable_scope("pooler"):
    # We "pool" the model by simply taking the hidden state corresponding
    # to the first token.  We assume that this has been pre-trained.
    # tf.squeeze removes the size-1 sequence dimension left by the slice.
    first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
    self.pooled_output = tf.layers.dense(
        first_token_tensor,
        config.hidden_size,
        activation=tf.tanh,  # hyperbolic tangent (not arctangent)
        kernel_initializer=create_initializer(config.initializer_range))
# End of constructor body.
def get_pooled_output(self):
    """Gets the pooled (first-token) output of shape [batch_size, hidden_size]."""
    return self.pooled_output
def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output
def get_all_encoder_layers(self):
    """Gets the outputs of all transformer encoder layers.

    Returns:
      A list of float Tensors, one per encoder layer, each of shape
      [batch_size, seq_length, hidden_size].
    """
    return self.all_encoder_layers
def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    # NOTE(review): the pasted source was truncated here — the docstring was
    # unterminated and the return statement was missing; restored both.
    return self.embedding_output
# (Removed: an unrelated blog "recommended reading" footer — links about UDP,
# Java AQS, Nacos, Seata, etc. — accidentally pasted into this source file.)