搜狗词库转txt

Posted 2020-10-20 在路上-UP
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了搜狗词库转txt相关的知识，希望对你有一定的参考价值。
# 运行环境要求 python2
#!/usr/bin/python
# -*- coding: utf-8 -*-

import binascii
import io
import pdb
import struct
import sys
  8 #搜狗的scel词库就是保存的文本的unicode编码，每两个字节一个字符（中文汉字或者英文字母）
  9 #找出其每部分的偏移位置即可
 10 #主要两部分
 11 #1.全局拼音表，貌似是所有的拼音组合，字典序
 12 #       格式为(index,len,pinyin)的列表
 13 #       index: 两个字节的整数 代表这个拼音的索引
 14 #       len: 两个字节的整数 拼音的字节长度
 15 #       pinyin: 当前的拼音，每个字符两个字节，总长len
 16 #       
 17 #2.汉语词组表
 18 #       格式为(same,py_table_len,py_table,{word_len,word,ext_len,ext})的一个列表
 19 #       same: 两个字节 整数 同音词数量
 20 #       py_table_len:  两个字节 整数
 21 #       py_table: 整数列表，每个整数两个字节,每个整数代表一个拼音的索引
 22 #
 23 #       word_len:两个字节 整数 代表中文词组字节数长度
 24 #       word: 中文词组,每个中文汉字两个字节，总长度word_len
 25 #       ext_len: 两个字节 整数 代表扩展信息的长度，好像都是10
 26 #       ext: 扩展信息 前两个字节是一个整数(不知道是不是词频) 后八个字节全是0
 27 #
 28 #      {word_len,word,ext_len,ext} 一共重复same次 同音词 相同拼音表
 29 
 30 
 31 #拼音表偏移，
 32 startPy = 0x1540;
 33 
 34 #汉语词组表偏移
 35 startChinese = 0x2628;
 36 
 37 #全局拼音表
 38 
 39 GPy_Table ={}
 40 
 41 #解析结果
 42 #元组(词频,拼音,中文词组)的列表
 43 GTable = []
 44 
 45 def byte2str(data):
 46     ‘‘‘将原始字节码转为字符串‘‘‘
 47     i = 0;
 48     length = len(data)
 49     ret = u‘‘
 50     while i < length:
 51         x = data[i] + data[i+1]
 52         t = unichr(struct.unpack(‘H‘,x)[0])
 53         if t == u‘r‘:
 54             ret += u‘n‘
 55         elif t != u‘ ‘:
 56             ret += t
 57         i += 2
 58     return ret
 59 #获取拼音表
 60 def getPyTable(data):
 61 
 62     if data[0:4] != "x9Dx01x00x00":
 63         return None
 64     data = data[4:]
 65     pos = 0
 66     length = len(data)
 67     while pos < length:
 68         index = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
 69         #print index,
 70         pos += 2
 71         l = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
 72         #print l,
 73         pos += 2
 74         py = byte2str(data[pos:pos+l])
 75         #print py
 76         GPy_Table[index]=py
 77         pos += l
 78 
 79 
 80 #获取一个词组的拼音
 81 def getWordPy(data):
 82     pos = 0
 83     length = len(data)
 84     ret = u‘‘
 85     while pos < length:
 86         
 87         index = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
 88         ret += GPy_Table[index]
 89         pos += 2    
 90     return ret
 91 
 92 
 93 #获取一个词组
 94 def getWord(data):
 95     pos = 0
 96     length = len(data)
 97     ret = u‘‘
 98     while pos < length:
 99         
100         index = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
101         ret += GPy_Table[index]
102         pos += 2    
103     return ret
104 
105 #读取中文表    
106 def getChinese(data):
107     #import pdb
108     #pdb.set_trace()
109     
110     pos = 0
111     length = len(data)
112     while pos < length:
113         #同音词数量
114         same = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
115         #print ‘[same]:‘,same,
116         
117         #拼音索引表长度
118         pos += 2
119         py_table_len = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
120         #拼音索引表
121         pos += 2
122         py = getWordPy(data[pos: pos+py_table_len])
123 
124         #中文词组
125         pos += py_table_len
126         for i in xrange(same):
127             #中文词组长度
128             c_len = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
129             #中文词组
130             pos += 2  
131             word = byte2str(data[pos: pos + c_len])
132             #扩展数据长度
133             pos += c_len        
134             ext_len = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
135             #词频
136             pos += 2
137             count  = struct.unpack(‘H‘,data[pos]+data[pos+1])[0]
138 
139             #保存
140             GTable.append((count,py,word))
141         
142             #到下个词的偏移位置
143             pos +=  ext_len
144 
145 def deal(file_name):
146     print ‘-‘*60
147     f = open(file_name,‘rb‘)
148     data = f.read()
149     f.close()
150         
151     if data[0:12] !="x40x15x00x00x44x43x53x01x01x00x00x00":
152         print "确认你选择的是搜狗(.scel)词库?"
153         sys.exit(0)
154     #pdb.set_trace()
155     
156     print "词库名：" ,byte2str(data[0x130:0x338])#.encode(‘GB18030‘)
157     print "词库类型：" ,byte2str(data[0x338:0x540])#.encode(‘GB18030‘)
158     print "描述信息：" ,byte2str(data[0x540:0xd40])#.encode(‘GB18030‘)
159     print "词库示例：",byte2str(data[0xd40:startPy])#.encode(‘GB18030‘)
160     
161     getPyTable(data[startPy:startChinese])
162     getChinese(data[startChinese:])
163             
164 if __name__ == ‘__main__‘:
165 
166     #将要转换的词库添加在这里就可以了
167     o = [‘计算机词汇大全【官方推荐】.scel‘,
168     ‘IT计算机.scel‘,
169     ‘计算机词汇大全【官方推荐】.scel‘,
170     ‘北京市城市信息精选.scel‘,
171     ‘常用餐饮词汇.scel‘,
172     ‘成语.scel‘,
173     ‘成语俗语【官方推荐】.scel‘,
174     ‘法律词汇大全【官方推荐】.scel‘,
175     ‘房地产词汇大全【官方推荐】.scel‘,
176     ‘手机词汇大全【官方推荐】.scel‘,
177     ‘网络流行新词【官方推荐】.scel‘,
178     ‘歇后语集锦【官方推荐】.scel‘,
179     ‘饮食大全【官方推荐】.scel‘,
180     ]
181     
182     for f in o:
183         deal(f)
184         
185     #保存结果  
186     f = open(‘sougou.txt‘,‘w‘)
187     for count,py,word in GTable:
188         #GTable保存着结果，是一个列表，每个元素是一个元组(词频,拼音,中文词组)，有需要的话可以保存成自己需要个格式
189         #我没排序，所以结果是按照上面输入文件的顺序
190         f.write( unicode(‘{%(count)s}‘ %{‘count‘:count}+py+‘ ‘+ word).encode(‘GB18030‘) )#最终保存文件的编码，可以自给改
191         f.write(‘n‘)
192     f.close()
以上是关于搜狗词库转txt的主要内容，如果未能解决你的问题，请参考以下文章