Overview
Sequence labeling is the task of assigning a label to every token in a given input sequence. It is commonly used for information extraction from text, including word segmentation, part-of-speech (POS) tagging, named entity recognition (NER), and so on.
Conditional Random Field (CRF)
Labeling a sequence amounts to predicting a label for each of its tokens, which at first glance could be treated as a simple multi-class classification problem. However, sequence labeling must account not only for the classification of individual tokens, but also for the dependencies between adjacent tokens. For example, under the B/I/O tagging scheme used later in this section, an I tag is only valid after a B or I tag, so the prediction for one token constrains its neighbors.
Let $x = \{x_0, \ldots, x_n\}$ be the input sequence and $y = \{y_0, \ldots, y_n\}$ the output tag sequence. The probability of the output sequence $y$ is

$$P(y|x) = \frac{\exp(\text{Score}(x, y))}{\sum_{y' \in Y} \exp(\text{Score}(x, y'))}$$

where $Y$ is the set of all possible tag sequences. We define two probability functions:
1. the emission probability function $\psi_\text{EMIT}$, representing the probability $P(x_i \rightarrow y_i)$;
2. the transition probability function $\psi_\text{TRANS}$, representing the probability $P(y_{i-1} \rightarrow y_i)$.
From these, the formula for computing the Score follows:

$$\text{Score}(x, y) = \sum_{i} \log \psi_\text{EMIT}(x_i \rightarrow y_i) + \log \psi_\text{TRANS}(y_{i-1} \rightarrow y_i)$$

Let the tag set be $T$. We construct a matrix $P$ of size $|T| \times |T|$ to store the transition probabilities between tags.
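As a concrete illustration (a minimal sketch with made-up log-domain numbers: two tags and a length-3 sequence), the Score of one candidate tag sequence is just a sum of start, emission, transition, and end terms:

# toy example: 2 tags, sequence length 3; all values are hypothetical log-domain scores
start = [0.1, 0.2]                           # start transition per tag
end = [0.0, 0.1]                             # end transition per tag
trans = [[0.3, 0.1], [0.2, 0.4]]             # trans[prev][curr]
emit = [[1.0, 0.5], [0.2, 0.8], [0.6, 0.3]]  # emit[i][tag]

y = [0, 1, 1]  # one candidate tag sequence
score = start[y[0]] + emit[0][y[0]]
for i in range(1, len(y)):
    score += trans[y[i - 1]][y[i]] + emit[i][y[i]]
score += end[y[-1]]
print(score)  # 0.1 + 1.0 + 0.1 + 0.8 + 0.4 + 0.3 + 0.1 = 2.8 (up to rounding)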
We implement the forward (training) part of the CRF layer with the loss function folded in. Choosing the negative log-likelihood commonly used for classification problems gives

$$\text{Loss} = -\log P(y|x) = \log \sum_{y' \in Y} \exp(\text{Score}(x, y')) - \text{Score}(x, y)$$

where the first term is called the Normalizer and the second the Score, so the loss is Normalizer minus Score.

Score computation
def compute_score(emissions, tags, seq_ends, mask, trans, start_trans, end_trans):
    # emissions: (seq_length, batch_size, num_tags)
    # tags: (seq_length, batch_size)
    # mask: (seq_length, batch_size)
    seq_length, batch_size = tags.shape
    mask = mask.astype(emissions.dtype)

    # Initialize score with the start transition probability
    # shape: (batch_size,)
    score = start_trans[tags[0]]
    # Add the first emission probability
    # shape: (batch_size,)
    score += emissions[0, mnp.arange(batch_size), tags[0]]

    for i in range(1, seq_length):
        # Transition probability from tags[i-1] to tags[i], valid when mask == 1
        # shape: (batch_size,)
        score += trans[tags[i - 1], tags[i]] * mask[i]
        # Emission probability of tags[i], valid when mask == 1
        # shape: (batch_size,)
        score += emissions[i, mnp.arange(batch_size), tags[i]] * mask[i]

    # End transition
    # shape: (batch_size,)
    last_tags = tags[seq_ends, mnp.arange(batch_size)]
    # Add the end transition probability
    # shape: (batch_size,)
    score += end_trans[last_tags]

    return score
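As a quick sanity check, compute_score should reproduce that hand computation when fed the same toy numbers as MindSpore tensors (a sketch; all values are hypothetical, and the imports from the CRF layer section below are assumed):

import numpy as np
import mindspore as ms

# toy tensors mirroring the hand computation above (batch_size = 1)
emissions = ms.Tensor(np.array([[[1.0, 0.5]], [[0.2, 0.8]], [[0.6, 0.3]]]), ms.float32)
tags = ms.Tensor(np.array([[0], [1], [1]]), ms.int64)
mask = ms.Tensor(np.ones((3, 1)), ms.int64)
trans = ms.Tensor(np.array([[0.3, 0.1], [0.2, 0.4]]), ms.float32)
start_trans = ms.Tensor(np.array([0.1, 0.2]), ms.float32)
end_trans = ms.Tensor(np.array([0.0, 0.1]), ms.float32)
seq_ends = ms.Tensor(np.array([2]), ms.int64)

print(compute_score(emissions, tags, seq_ends, mask, trans, start_trans, end_trans))
# ≈ [2.8], matching the hand computation above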
Normalizer computation
The Normalizer is the log-sum-exp of the Scores of all possible output sequences; enumerating them directly grows exponentially with sequence length. It can instead be rewritten in the following recursive form and evaluated token by token with dynamic programming:

$$\text{Score}_i(y_i) = \log \sum_{y_{i-1}} \exp\big(\text{Score}_{i-1}(y_{i-1}) + \log \psi_\text{TRANS}(y_{i-1} \rightarrow y_i)\big) + \log \psi_\text{EMIT}(x_i \rightarrow y_i)$$

The Normalizer implementation is as follows:
def compute_normalizer(emissions, mask, trans, start_trans, end_trans):
    # emissions: (seq_length, batch_size, num_tags)
    # mask: (seq_length, batch_size)
    seq_length = emissions.shape[0]

    # Initialize score with the start transition probability plus
    # the first emission probability
    # shape: (batch_size, num_tags)
    score = start_trans + emissions[0]

    for i in range(1, seq_length):
        # Expand score for broadcasting over the next tag
        # shape: (batch_size, num_tags, 1)
        broadcast_score = score.expand_dims(2)
        # Expand emission for broadcasting over the previous tag
        # shape: (batch_size, 1, num_tags)
        broadcast_emissions = emissions[i].expand_dims(1)
        # Compute score_i following the recursion above;
        # broadcast_score holds the log-sum-exp of the scores of all
        # possible paths from token 0 up to the current token
        # shape: (batch_size, num_tags, num_tags)
        next_score = broadcast_score + trans + broadcast_emissions
        # log-sum-exp over the previous tag, used for the next token's score
        # shape: (batch_size, num_tags)
        next_score = ops.logsumexp(next_score, axis=1)
        # The score only changes where mask == 1
        # shape: (batch_size, num_tags)
        score = mnp.where(mask[i].expand_dims(1), next_score, score)

    # Finally, add the end transition probability
    # shape: (batch_size, num_tags)
    score += end_trans
    # log-sum-exp over the scores of all possible paths
    # shape: (batch_size,)
    return ops.logsumexp(score, axis=1)
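Because the recursion is easy to get wrong, it can be sanity-checked by brute force in plain Python: with the hypothetical toy numbers from before (two tags, length 3), the dynamic program must agree with the log-sum-exp over all 2**3 explicit paths.

import math
from itertools import product

start = [0.1, 0.2]
end = [0.0, 0.1]
trans = [[0.3, 0.1], [0.2, 0.4]]
emit = [[1.0, 0.5], [0.2, 0.8], [0.6, 0.3]]
num_tags, seq_len = 2, 3

# dynamic program, mirroring compute_normalizer for batch size 1
score = [start[t] + emit[0][t] for t in range(num_tags)]
for i in range(1, seq_len):
    score = [math.log(sum(math.exp(score[p] + trans[p][t]) for p in range(num_tags)))
             + emit[i][t] for t in range(num_tags)]
normalizer = math.log(sum(math.exp(score[t] + end[t]) for t in range(num_tags)))

# brute force over all num_tags ** seq_len paths
def path_score(y):
    s = start[y[0]] + emit[0][y[0]] + end[y[-1]]
    for i in range(1, seq_len):
        s += trans[y[i - 1]][y[i]] + emit[i][y[i]]
    return s

brute = math.log(sum(math.exp(path_score(y))
                     for y in product(range(num_tags), repeat=seq_len)))
print(abs(normalizer - brute) < 1e-9)  # True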
Viterbi algorithm
Having completed the forward training part, we now implement decoding. Like the Normalizer computation, the Viterbi algorithm uses dynamic programming to score all possible prediction sequences. The difference is that, during decoding, the tag that maximizes the score at each token i must also be saved, to be used later when the Viterbi algorithm recovers the optimal prediction sequence.
After obtaining the maximum probability Score and the tag History for each token, the Viterbi algorithm gives the recurrence

$$P_{0,i} = \max(P_{0,i-1}) + P_{i-1,i}$$

Code implementation:
def viterbi_decode(emissions, mask, trans, start_trans, end_trans):
    # emissions: (seq_length, batch_size, num_tags)
    # mask: (seq_length, batch_size)
    seq_length = mask.shape[0]

    score = start_trans + emissions[0]
    history = ()

    for i in range(1, seq_length):
        broadcast_score = score.expand_dims(2)
        broadcast_emission = emissions[i].expand_dims(1)
        next_score = broadcast_score + trans + broadcast_emission

        # Record the tag that maximizes the current token's score for later use
        indices = next_score.argmax(axis=1)
        history += (indices,)

        next_score = next_score.max(axis=1)
        score = mnp.where(mask[i].expand_dims(1), next_score, score)

    score += end_trans

    return score, history


def post_decode(score, history, seq_length):
    # Compute the best prediction sequence from Score and History
    batch_size = seq_length.shape[0]
    seq_ends = seq_length - 1
    # shape: (batch_size,)
    best_tags_list = []

    # Decode each sample in the batch in turn
    for idx in range(batch_size):
        # Find the tag with the highest prediction probability for the last
        # token and add it to the list storing the best sequence
        best_last_tag = score[idx].argmax(axis=0)
        best_tags = [int(best_last_tag.asnumpy())]

        # Walk backwards, repeatedly looking up the most probable tag for
        # each token and appending it to the list
        for hist in reversed(history[:seq_ends[idx]]):
            best_last_tag = hist[idx][best_tags[-1]]
            best_tags.append(int(best_last_tag.asnumpy()))

        # The tags were recovered in reverse order; restore the forward order
        best_tags.reverse()
        best_tags_list.append(best_tags)

    return best_tags_list
CRF layer
The CRF input must account for each sequence's true length. In addition to the emission matrix and tags, a seq_length parameter is therefore added to pass in the pre-padding length of each sequence, and a sequence_mask method is implemented to generate the mask matrix.
Code implementation:
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
import mindspore.numpy as mnp
from mindspore.common.initializer import initializer, Uniform


def sequence_mask(seq_length, max_length, batch_first=False):
    """Generate the mask matrix from the actual and maximum sequence lengths."""
    range_vector = mnp.arange(0, max_length, 1, seq_length.dtype)
    result = range_vector < seq_length.view(seq_length.shape + (1,))
    if batch_first:
        return result.astype(ms.int64)
    return result.astype(ms.int64).swapaxes(0, 1)


class CRF(nn.Cell):
    def __init__(self, num_tags: int, batch_first: bool = False, reduction: str = 'sum') -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.reduction = reduction
        self.start_transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags,)),
                                              name='start_transitions')
        self.end_transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags,)),
                                            name='end_transitions')
        self.transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags, num_tags)),
                                        name='transitions')

    def construct(self, emissions, tags=None, seq_length=None):
        if tags is None:
            return self._decode(emissions, seq_length)
        return self._forward(emissions, tags, seq_length)

    def _forward(self, emissions, tags=None, seq_length=None):
        if self.batch_first:
            batch_size, max_length = tags.shape
            emissions = emissions.swapaxes(0, 1)
            tags = tags.swapaxes(0, 1)
        else:
            max_length, batch_size = tags.shape

        if seq_length is None:
            seq_length = mnp.full((batch_size,), max_length, ms.int64)

        mask = sequence_mask(seq_length, max_length)

        # shape: (batch_size,)
        numerator = compute_score(emissions, tags, seq_length - 1, mask,
                                  self.transitions, self.start_transitions, self.end_transitions)
        # shape: (batch_size,)
        denominator = compute_normalizer(emissions, mask, self.transitions,
                                         self.start_transitions, self.end_transitions)
        # shape: (batch_size,)
        llh = denominator - numerator

        if self.reduction == 'none':
            return llh
        if self.reduction == 'sum':
            return llh.sum()
        if self.reduction == 'mean':
            return llh.mean()
        # 'token_mean'
        return llh.sum() / mask.astype(emissions.dtype).sum()

    def _decode(self, emissions, seq_length=None):
        if self.batch_first:
            batch_size, max_length = emissions.shape[:2]
            emissions = emissions.swapaxes(0, 1)
        else:
            max_length, batch_size = emissions.shape[:2]

        if seq_length is None:
            seq_length = mnp.full((batch_size,), max_length, ms.int64)

        mask = sequence_mask(seq_length, max_length)

        return viterbi_decode(emissions, mask, self.transitions,
                              self.start_transitions, self.end_transitions)
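As a quick check of the mask semantics (a sketch with toy lengths):

lengths = ms.Tensor([3, 1], ms.int64)
print(sequence_mask(lengths, 4, batch_first=True))
# [[1 1 1 0]
#  [1 0 0 0]]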
BiLSTM+CRF model
In this model, the LSTM extracts sequence features; a Dense layer then transforms them into the emission probability matrix, which is finally fed into the CRF layer. The implementation is as follows:
class BiLSTM_CRF(nn.Cell):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Dense(hidden_dim, num_tags, 'he_uniform')
        self.crf = CRF(num_tags, batch_first=True)

    def construct(self, inputs, seq_length, tags=None):
        embeds = self.embedding(inputs)
        outputs, _ = self.lstm(embeds, seq_length=seq_length)
        feats = self.hidden2tag(outputs)

        crf_outs = self.crf(feats, tags, seq_length)
        return crf_outs
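The same construct serves both training and inference, dispatching on whether tags is passed. A minimal sketch with hypothetical sizes and inputs:

import numpy as np

toy_model = BiLSTM_CRF(vocab_size=10, embedding_dim=16, hidden_dim=32, num_tags=3)
inputs = ms.Tensor(np.array([[1, 2, 3, 0]]), ms.int64)  # one sample, padded to length 4
lengths = ms.Tensor(np.array([3]), ms.int64)            # true length before padding
tags = ms.Tensor(np.array([[0, 1, 2, 2]]), ms.int64)

loss = toy_model(inputs, lengths, tags)      # training mode: scalar NLL loss
score, history = toy_model(inputs, lengths)  # decode mode: inputs to post_decode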
With the model designed, we create two example sentences with their labels and build the vocabulary and the tag table.
embedding_dim = 16
hidden_dim = 32

training_data = [(
    "清 华 大 学 坐 落 于 首 都 北 京".split(),
    "B I I I O O O O O B I".split()
), (
    "重 庆 是 一 个 魔 幻 城 市".split(),
    "B I O O O O O O O".split()
)]

word_to_idx = {}
word_to_idx['<pad>'] = 0
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)

tag_to_idx = {"B": 0, "I": 1, "O": 2}
Next, instantiate the model, choose an optimizer, and wrap the model and optimizer into a training step.
model = BiLSTM_CRF(len(word_to_idx), embedding_dim, hidden_dim, len(tag_to_idx))
optimizer = nn.SGD(model.trainable_params(), learning_rate=0.01, weight_decay=1e-4)

grad_fn = ms.value_and_grad(model, None, optimizer.parameters)

def train_step(data, seq_length, label):
    loss, grads = grad_fn(data, seq_length, label)
    optimizer(grads)
    return loss
Pack the generated data into a batch: pad sequences shorter than the maximum length, and return Tensors for the input sequences, the output labels, and the sequence lengths.
def prepare_sequence(seqs, word_to_idx, tag_to_idx):
    seq_outputs, label_outputs, seq_length = [], [], []
    max_len = max([len(i[0]) for i in seqs])

    for seq, tag in seqs:
        seq_length.append(len(seq))
        idxs = [word_to_idx[w] for w in seq]
        labels = [tag_to_idx[t] for t in tag]
        idxs.extend([word_to_idx['<pad>'] for i in range(max_len - len(seq))])
        labels.extend([tag_to_idx['O'] for i in range(max_len - len(seq))])
        seq_outputs.append(idxs)
        label_outputs.append(labels)

    return ms.Tensor(seq_outputs, ms.int64), \
           ms.Tensor(label_outputs, ms.int64), \
           ms.Tensor(seq_length, ms.int64)
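The training loop below consumes data, label, and seq_length tensors; a sketch of producing them from the example sentences above:

data, label, seq_length = prepare_sequence(training_data, word_to_idx, tag_to_idx)
print(data.shape, label.shape, seq_length.shape)
# (2, 11) (2, 11) (2,)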
After the model is pre-compiled on the first call, train for 500 steps.
from tqdm import tqdm

steps = 500
with tqdm(total=steps) as t:
    for i in range(steps):
        loss = train_step(data, seq_length, label)
        t.set_postfix(loss=loss)
        t.update(1)
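The predict variable used in the next step is not produced in the excerpt; a minimal sketch, running the trained model in decode mode and recovering the best paths with post_decode:

score, history = model(data, seq_length)
predict = post_decode(score, history, seq_length)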
Finally, convert the predicted index sequences back into tag sequences and print the result to inspect the output.
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

def sequence_to_tag(sequences, idx_to_tag):
    outputs = []
    for seq in sequences:
        outputs.append([idx_to_tag[i] for i in seq])
    return outputs

sequence_to_tag(predict, idx_to_tag)
The resulting output tags:
[['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'B', 'I'],
 ['B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Summary
The LSTM extracts sequence features and the CRF performs the sequence labeling, together achieving semantic segmentation of the text.