实验二:基于HMM的拼音转汉字程序|自然语言

Posted 桃陉

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了实验二:基于HMM的拼音转汉字程序|自然语言相关的知识,希望对你有一定的参考价值。


分步实现

主要就是训练HMM模型,实现维特比算法。我的HMM模型包含以下7个函数:

初始化函数:

 '''初始化函数''' 
def __init__(self,):
    """Create an untrained model: every probability table starts empty."""
    # Initial, emission and transition probability tables of the HMM.
    self.init_pro, self.emiss_pro, self.trans_pro = dict(), dict(), dict()
    # Pinyin dictionary: syllable -> string of candidate characters.
    self.pinyin_to_chinese = dict()

主要包含HMM模型必备的三大概率(初始概率、发射概率、转移概率),以及为了实现拼音转汉字功能而定义的拼音字典。

训练初始概率、转移概率函数:

 '''训练初始概率、转移概率函数''' 
def train_init_trans_pro(self,taotiao_path,):
    """Estimate initial and transition probabilities from a text corpus.

    Every maximal run of Han characters (U+4E00..U+9FA5) in the UTF-8 file
    at ``taotiao_path`` is treated as one sequence; bigrams never span two
    runs.

    Results are written into the model:

    * ``self.init_pro[ch]``  = count(ch) / total number of Han characters
    * ``self.trans_pro[a+b]`` = count(bigram "ab") / count(a)

    :param taotiao_path: path of the UTF-8 encoded training corpus.
    """
    from collections import Counter

    han_run = re.compile(r'[\u4e00-\u9fa5]+')
    single_word = Counter()  # frequency of each character
    double_word = Counter()  # frequency of each adjacent character pair

    # ``with`` guarantees the corpus is closed even if decoding fails.
    with open(taotiao_path,encoding='utf-8') as f:
        for line in f:
            for words in han_run.findall(line):
                single_word.update(words)
                double_word.update(a + b for a, b in zip(words, words[1:]))

    # Normalise by the number of characters (not the number of runs) so
    # that the initial probabilities form a distribution summing to 1.
    num = sum(single_word.values())

    for ch, cnt in single_word.items():
        self.init_pro[ch] = cnt / num

    for pair, cnt in double_word.items():
        self.trans_pro[pair] = cnt / single_word[pair[0]]

传入的是头条的一个语料库进行训练,格式如下图:

初始概率统计的是每个字占总体字数的比例(字典格式:{字:概率,…}),转移概率统计的是 由前一个字转换为后一个字的概率(字典格式:{词组:概率,…})。

汉字转拼音函数(将汉字转化为对应的拼音,训练发射概率时使用):

def word2pinyin(self,text):
    """Convert ``text`` to pinyin by delegating to ``pypinyin.lazy_pinyin``.

    The result of ``lazy_pinyin`` is returned unchanged; the emission
    training code indexes it position by position against the input text.
    """
    return pypinyin.lazy_pinyin(text)

训练发射概率函数:

'''训练发射概率函数''' 
    def train_emiss_pro(self,taotiao_path):
        """Estimate emission probabilities P(character | pinyin) from a corpus.

        Each run of Han characters in the UTF-8 file at ``taotiao_path`` is
        converted with ``self.word2pinyin`` and the (pinyin, character) pairs
        are counted position by position.  Counts are then normalised per
        pinyin, so ``self.emiss_pro`` ends up as
        ``{pinyin: {character: probability, ...}, ...}``.

        :param taotiao_path: path of the UTF-8 encoded training corpus.
        """
        han_run = re.compile(r'[\u4e00-\u9fa5]+')

        # ``with`` guarantees the corpus is closed even if conversion fails.
        with open(taotiao_path,encoding='utf-8') as f:
            for line in f:
                for words in han_run.findall(line):
                    # zip pairs syllables with characters and cannot run
                    # past the end of either sequence.
                    for py, ch in zip(self.word2pinyin(words), words):
                        bucket = self.emiss_pro.setdefault(py, {})
                        bucket[ch] = bucket.get(ch, 0) + 1

        # Turn the raw counts of every pinyin into relative frequencies.
        for bucket in self.emiss_pro.values():
            s = sum(bucket.values())
            for ch in bucket:
                bucket[ch] = bucket[ch] / s

发射概率指的是一个字在它这个读音下所占的比例(因为同一个读音会对应许多不同的字,这里不考虑音调),字典格式为:{拼音:{字1:概率1,字2:概率2,…},…}。

获得拼音字典函数:

'''获得拼音字典函数''' 
def get_pinyin_dict(self,pinyin2hanzi_path,):
    """Load the pinyin -> candidate characters table.

    Each useful line of the UTF-8 file holds two whitespace-separated
    fields: the pinyin and a string of characters sharing that reading.
    They are stored as ``self.pinyin_to_chinese[pinyin] = characters``.
    Lines with fewer than two fields (e.g. blank lines) are skipped.

    :param pinyin2hanzi_path: path of the UTF-8 encoded dictionary file.
    """
    # ``with`` guarantees the file is closed even if a line is malformed.
    with open(pinyin2hanzi_path,encoding='utf-8') as f:
        for line in f:
            # Remove U+FEFF (BOM / zero-width no-break space) only.  A regex
            # character class built from the escaped text "\\ufeff" would
            # also delete the letters u, f and e from the pinyin itself.
            fields = line.replace('\ufeff', '').split()
            if len(fields) < 2:
                continue
            self.pinyin_to_chinese[fields[0]] = fields[1]

将每个读音对应的字分别存储起来,在最后拼音转汉字的时候用来生成隐藏序列。

维特比算法:

'''维特比算法''' 
    def viterbi(self,word_list, pinyin_list, n, id2word,label_list,):
        """Decode the most probable character sequence with the Viterbi algorithm.

        The score of a candidate at position 0 is its initial probability;
        at later positions it is ``emission * max(previous score *
        transition)``.  Probabilities missing from the trained tables count
        as 0.  The decoded sentence and its per-character accuracy against
        ``label_list`` are printed.

        :param word_list:   one list of candidate character ids per pinyin
        :param pinyin_list: the observed pinyin, aligned with ``word_list``
        :param n:           number of distinct candidate ids (table width)
        :param id2word:     mapping from id to character
        :param label_list:  reference sentence; positions beyond its length
                            simply count as mismatches
        :return: the decoded sentence (empty string for empty input)
        """
        T = len(word_list)  # length of the observation sequence
        if T == 0:
            # Nothing to decode: avoid indexing word_list[0] and dividing by 0.
            print('转换:','')
            print('准确率:',0.0)
            return ''

        delta = np.zeros((T, n))           # best path score ending in each id
        psi = np.zeros((T, n), dtype=int)  # back-pointer: best previous id

        # Position 0: score is the stand-alone probability of the character.
        for w in word_list[0]:
            delta[0][w] = self.init_pro.get(id2word[w], 0)

        # Dynamic programming over the remaining positions.
        for idx in range(1, T):
            pre_words = word_list[idx-1]
            emissions = self.emiss_pro.get(pinyin_list[idx], {})
            for w in word_list[idx]:
                current = id2word[w]
                max_value = 0
                # When no transition scores above 0, point back at the first
                # previous candidate.
                best_prev = pre_words[0] if pre_words else 0
                for p in pre_words:
                    prob = self.trans_pro.get(id2word[p] + current, 0)
                    tmp_value = delta[idx-1][p] * prob
                    if max_value < tmp_value:
                        max_value = tmp_value
                        best_prev = p

                delta[idx][w] = emissions.get(current, 0) * max_value
                psi[idx][w] = best_prev

        # Choose the best final state.  If every score is 0 (unseen
        # transitions), fall back to the first candidate of the last pinyin
        # rather than an arbitrary id that may not belong to it.
        last_words = word_list[T - 1]
        path = np.zeros(T, dtype=int)
        path[T - 1] = last_words[0] if last_words else 0
        prob = 0
        for i in range(n):
            if prob < delta[T - 1][i]:
                prob = delta[T - 1][i]
                path[T - 1] = i

        # Follow the back-pointers to recover the whole path.
        for t in range(T - 2, -1, -1):
            path[t] = psi[t+1][path[t+1]]

        # Build the result and count the positions that match the reference.
        decoded = [id2word[i] for i in path]
        final_word = "".join(decoded)
        a = sum(1 for got, want in zip(decoded, label_list) if got == want)

        print('转换:',final_word)
        print('准确率:',a*1.0/T)
        return final_word

使用了动态规划的思想,生成概率最大的序列作为输出序列。

输入测试集函数:

 def accury(self,test_path,):
    """Decode every pinyin sentence of a test file and print the results.

    The GB2312 file alternates lines: a line of whitespace-separated pinyin
    followed by a line holding the reference sentence.  For each pair the
    candidate characters of every pinyin are looked up in
    ``self.pinyin_to_chinese``, mapped to integer ids, and handed to
    ``self.viterbi``, which prints the decoded sentence and its accuracy.

    :param test_path: path of the GB2312 encoded test file.
    :raises KeyError: if a pinyin is missing from ``self.pinyin_to_chinese``.
    """
    # Read everything up front so the handle is closed before decoding.
    with open(test_path,encoding='gb2312') as f:
        lines = f.readlines()

    # Even-indexed lines hold pinyin, odd-indexed lines the reference text.
    pinyin = [line.lower().split() for line in lines[0::2]]
    # Drop the line terminator so it is neither printed nor compared.
    label = [line.rstrip('\r\n') for line in lines[1::2]]

    han_char = re.compile(r'[\u4e00-\u9fa5]')

    # zip ignores a trailing pinyin line that has no reference line.
    for pinyin_list, label_list in zip(pinyin, label):
        if not pinyin_list:
            continue

        # Candidate characters for each pinyin of the sentence.
        word_list = [han_char.findall(self.pinyin_to_chinese[py])
                     for py in pinyin_list]

        # Number the distinct candidates in first-seen order so that ids
        # (and therefore tie-breaking in viterbi) are deterministic.
        word2idx = {}
        for wl in word_list:
            for w in wl:
                word2idx.setdefault(w, len(word2idx))
        id2word = {idx: w for w, idx in word2idx.items()}

        # Represent every candidate by its id.
        word_id_list = [[word2idx[w] for w in wl] for wl in word_list]

        print('原句:',label_list)
        self.viterbi(word_id_list, pinyin_list, len(word2idx), id2word,label_list, )

测试集格式如下(一行拼音,一行汉字):


全代码

import re 
import numpy as np 
import pypinyin 
 
 
'''HMM模型''' 
class HMM(object): 
     
    '''初始化函数''' 
    def __init__(self,): 
        self.init_pro = {} #初始概率 
        self.emiss_pro = {} #发射概率 
        self.trans_pro = {} #转移概率 
        self.pinyin_to_chinese= {} #拼音字典 
         
    '''获得拼音字典函数''' 
    def get_pinyin_dict(self,pinyin2hanzi_path,):
        """Load the pinyin -> candidate characters table.

        Each useful line of the UTF-8 file holds two whitespace-separated
        fields: the pinyin and a string of characters sharing that reading.
        They are stored as ``self.pinyin_to_chinese[pinyin] = characters``.
        Lines with fewer than two fields (e.g. blank lines) are skipped.

        :param pinyin2hanzi_path: path of the UTF-8 encoded dictionary file.
        """
        # ``with`` guarantees the file is closed even if a line is malformed.
        with open(pinyin2hanzi_path,encoding='utf-8') as f:
            for line in f:
                # Remove U+FEFF (BOM / zero-width no-break space) only.  A
                # regex character class built from the escaped text "\\ufeff"
                # would also delete the letters u, f and e from the pinyin.
                fields = line.replace('\ufeff', '').split()
                if len(fields) < 2:
                    continue
                self.pinyin_to_chinese[fields[0]] = fields[1]
         
    '''训练初始概率、转移概率函数''' 
    def train_init_trans_pro(self,taotiao_path,):
        """Estimate initial and transition probabilities from a text corpus.

        Every maximal run of Han characters (U+4E00..U+9FA5) in the UTF-8
        file at ``taotiao_path`` is treated as one sequence; bigrams never
        span two runs.

        Results are written into the model:

        * ``self.init_pro[ch]``  = count(ch) / total number of Han characters
        * ``self.trans_pro[a+b]`` = count(bigram "ab") / count(a)

        :param taotiao_path: path of the UTF-8 encoded training corpus.
        """
        from collections import Counter

        han_run = re.compile(r'[\u4e00-\u9fa5]+')
        single_word = Counter()  # frequency of each character
        double_word = Counter()  # frequency of each adjacent character pair

        # ``with`` guarantees the corpus is closed even if decoding fails.
        with open(taotiao_path,encoding='utf-8') as f:
            for line in f:
                for words in han_run.findall(line):
                    single_word.update(words)
                    double_word.update(a + b for a, b in zip(words, words[1:]))

        # Normalise by the number of characters (not the number of runs) so
        # that the initial probabilities form a distribution summing to 1.
        num = sum(single_word.values())

        for ch, cnt in single_word.items():
            self.init_pro[ch] = cnt / num

        for pair, cnt in double_word.items():
            self.trans_pro[pair] = cnt / single_word[pair[0]]
     
    '''汉字转拼音函数'''     
    def word2pinyin(self,text):
        """Convert ``text`` to pinyin by delegating to ``pypinyin.lazy_pinyin``.

        The result of ``lazy_pinyin`` is returned unchanged; the emission
        training code indexes it position by position against the input text.
        """
        return pypinyin.lazy_pinyin(text)
     
    '''训练发射概率函数''' 
    def train_emiss_pro(self,taotiao_path):
        """Estimate emission probabilities P(character | pinyin) from a corpus.

        Each run of Han characters in the UTF-8 file at ``taotiao_path`` is
        converted with ``self.word2pinyin`` and the (pinyin, character) pairs
        are counted position by position.  Counts are then normalised per
        pinyin, so ``self.emiss_pro`` ends up as
        ``{pinyin: {character: probability, ...}, ...}``.

        :param taotiao_path: path of the UTF-8 encoded training corpus.
        """
        han_run = re.compile(r'[\u4e00-\u9fa5]+')

        # ``with`` guarantees the corpus is closed even if conversion fails.
        with open(taotiao_path,encoding='utf-8') as f:
            for line in f:
                for words in han_run.findall(line):
                    # zip pairs syllables with characters and cannot run
                    # past the end of either sequence.
                    for py, ch in zip(self.word2pinyin(words), words):
                        bucket = self.emiss_pro.setdefault(py, {})
                        bucket[ch] = bucket.get(ch, 0) + 1

        # Turn the raw counts of every pinyin into relative frequencies.
        for bucket in self.emiss_pro.values():
            s = sum(bucket.values())
            for ch in bucket:
                bucket[ch] = bucket[ch] / s
     
    '''维特比算法''' 
    def viterbi(self,word_list, pinyin_list, n, id2word,label_list,): 
        """ 
        维特比算法求解最大路径问题 
        :param word_list:   每个拼音对应的隐藏状态矩阵 
        :param n:   可能观察到的状态数, 对应为汉字数量 
        :param id2word:    id到汉字的映射 
        :return: 
        """ 
        T = len(word_list)  # 观察状态的长度 
 
        delta = np.zeros((T, n)) 
        # 保存转移下标值 
        psi = np.zeros((T, n), dtype=int) 
 
        # 初始化第一个字符的转移概率, 设置为每个词在词典中的单独出现的概率 
        words = word_list[0] 
        for w in words: 
            if id2word[w] not in self.init_pro: 
                delta[0][w] = 0 
            else: 
                delta[0][w]以上是关于实验二:基于HMM的拼音转汉字程序|自然语言的主要内容,如果未能解决你的问题,请参考以下文章

c语言--将中文转化为拼音

.NET 拼音汉字转化(全面)

汉字转拼音

ASP.NET Core使用微软官方类库实现汉字转拼音

ASP.NET Core使用微软官方类库实现汉字转拼音

ASP.NET Core使用微软官方类库实现汉字转拼音