Using LightGBM for ranking
Posted by little-horse
1. Introduction
LTR (learning to rank) is widely used in search ranking. Among open-source tools the best known is probably Microsoft's ranklib, but it appears to be single-machine only and has not been updated for a long time. So I decided to use LightGBM for ranking instead; the problem is that there is very little code online showing LightGBM used for ranking, while examples for regression and classification are everywhere. Below I post Python code for ranking with LightGBM, covering training, extracting leaf nodes, NDCG evaluation, prediction and feature importance. Feel free to use it as a reference or adapt it to your needs.
In practice I also compared ranklib's LambdaMART with LightGBM. The most striking difference is LightGBM's training speed: it is extremely fast. LambdaMART may need several hours to train, while LightGBM only needs a few minutes, yet the NDCG on the test set comes out about the same; I did not observe the small accuracy edge for LightGBM that the paper reports. I suspect the main reasons LightGBM trains so fast are: a. histogram-based node splitting instead of pre-sorting; b. gradient-based one-side sampling (GOSS), i.e. row sampling; c. exclusive feature bundling (EFB), i.e. column bundling; d. its leaf-wise tree growth strategy; e. native support for categorical features. A rough sketch of how these map to LightGBM parameters follows.
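The minimal sketch below (not from the original code) shows where these mechanisms surface in the LightGBM parameter dictionary; the parameter names come from the LightGBM documentation, and the values are illustrative assumptions rather than tuned settings.

# Sketch only: maps the speed-related mechanisms above to LightGBM parameters.
import lightgbm as lgb

speed_related_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'max_bin': 255,          # a. histogram-based splitting: feature values bucketed into at most 255 bins
    'boosting': 'goss',      # b. gradient-based one-side sampling (row sampling); the default boosting is 'gbdt'
    'enable_bundle': True,   # c. exclusive feature bundling (column bundling); enabled by default
    'num_leaves': 31,        # d. leaf-wise growth is bounded by a leaf budget rather than a depth limit
}
# e. categorical features are declared when building the Dataset, e.g.
#    lgb.Dataset(X, label=y, group=q, categorical_feature=[0, 3])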
2. Code
The first code block below is the main script; the three blocks after it are the data-loading and NDCG utilities it uses. Run the main script with a single command-line flag, e.g. `python lgb.py -train` to train the model.
The main script (lgb.py):

import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import datasets as ds
from sklearn.preprocessing import OneHotEncoder

import letor  # LETOR-format reader, listed below
import ndcg   # NDCG utilities, listed below


def split_data_from_keyword(data_read, data_group, data_feats):
    '''
    Use pandas to convert the raw data into the format LightGBM expects and save it.
    :param data_read: path of the raw data
    :param data_group: path to save the group file
    :param data_feats: path to save the feature file
    :return:
    '''
    with open(data_group, 'w', encoding='utf-8') as group_path:
        with open(data_feats, 'w', encoding='utf-8') as feats_path:
            dataframe = pd.read_csv(data_read,
                                    sep=' ',
                                    header=None,
                                    encoding="utf-8",
                                    engine='python')
            current_keyword = ''
            current_data = []
            group_size = 0
            for _, row in dataframe.iterrows():
                feats_line = [str(row[0])]
                for i in range(2, len(dataframe.columns) - 1):
                    feats_line.append(str(row[i]))
                if current_keyword == '':
                    current_keyword = row[1]
                if row[1] == current_keyword:
                    current_data.append(feats_line)
                    group_size += 1
                else:
                    for line in current_data:
                        feats_path.write(' '.join(line))
                        feats_path.write('\n')
                    group_path.write(str(group_size) + '\n')

                    group_size = 1
                    current_data = []
                    current_keyword = row[1]
                    current_data.append(feats_line)

            for line in current_data:
                feats_path.write(' '.join(line))
                feats_path.write('\n')
            group_path.write(str(group_size) + '\n')


def save_data(group_data, output_feature, output_group):
    '''
    Save the group sizes and the features to separate files.
    :param group_data:
    :param output_feature:
    :param output_group:
    :return:
    '''
    if len(group_data) == 0:
        return
    output_group.write(str(len(group_data)) + '\n')
    for data in group_data:
        # keep only non-zero features
        # feats = [p for p in data[2:] if float(p.split(":")[1]) != 0.0]
        feats = [p for p in data[2:]]
        output_feature.write(data[0] + ' ' + ' '.join(feats) + '\n')  # data[0] => label ; data[2:] => feats


def process_data_format(test_path, test_feats, test_group):
    '''
    Convert the data into the format LightGBM expects and save it.
    '''
    with open(test_path, 'r', encoding='utf-8') as fi:
        with open(test_feats, 'w', encoding='utf-8') as output_feature:
            with open(test_group, 'w', encoding='utf-8') as output_group:
                group_data = []
                group = ''
                for line in fi:
                    if not line:
                        break
                    if '#' in line:
                        line = line[:line.index('#')]
                    splits = line.strip().split()
                    if splits[1] != group:  # qid => splits[1]
                        save_data(group_data, output_feature, output_group)
                        group_data = []
                        group = splits[1]
                    group_data.append(splits)
                save_data(group_data, output_feature, output_group)


def load_data(feats, group):
    '''
    Load features, labels and query group sizes.
    '''
    x_train, y_train = ds.load_svmlight_file(feats)
    q_train = np.loadtxt(group)
    return x_train, y_train, q_train


def load_data_from_raw(raw_data):
    with open(raw_data, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
    return test_X, test_y, test_qids, comments


def train(x_train, y_train, q_train, model_save_path):
    '''
    Train and save the model.
    '''
    train_data = lgb.Dataset(x_train, label=y_train, group=q_train)
    params = {
        'task': 'train',            # task type
        'boosting_type': 'gbrt',    # base learner (alias of gbdt)
        'objective': 'lambdarank',  # ranking objective
        'metric': 'ndcg',           # evaluation metric
        'max_position': 10,         # truncation position for NDCG optimisation
        'metric_freq': 1,           # output the metric every N iterations
        'train_metric': True,       # also report the metric on the training set
        'ndcg_at': [10],
        'max_bin': 255,             # maximum number of histogram bins (default 255); LightGBM compresses
                                    # memory accordingly, e.g. with max_bin=255 each feature value fits in uint8
        'num_iterations': 500,      # number of boosting iterations
        'learning_rate': 0.01,      # learning rate
        'num_leaves': 31,           # number of leaves
        # 'max_depth': 6,
        'tree_learner': 'serial',   # for parallel learning; 'serial' = single machine
        'min_data_in_leaf': 30,     # minimum number of samples per leaf
        'verbose': 2                # verbosity of the training log
    }
    gbm = lgb.train(params, train_data, valid_sets=[train_data])
    gbm.save_model(model_save_path)


def predict(x_test, comments, model_input_path):
    '''
    Predict scores and sort the documents.
    '''
    gbm = lgb.Booster(model_file=model_input_path)  # load the model

    ypred = gbm.predict(x_test)

    predicted_sorted_indexes = np.argsort(ypred)[::-1]  # indexes sorted by score, descending

    t_results = comments[predicted_sorted_indexes]  # the corresponding comments, highest score first

    return t_results


def test_data_ndcg(model_path, test_path):
    '''
    Evaluate NDCG on the test data.
    '''
    with open(test_path, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)

    gbm = lgb.Booster(model_file=model_path)
    test_predict = gbm.predict(test_X)

    average_ndcg, _ = ndcg.validate(test_qids, test_y, test_predict, 60)
    # average NDCG over all qids
    print("all qid average ndcg: ", average_ndcg)
    print("job done!")


def plot_print_feature_importance(model_path):
    '''
    Print feature importances.
    '''
    # the model names features Column_<i>; map them back to real feature names here
    feats_dict = {
        'Column_0': 'name of feature 0',
        'Column_1': 'name of feature 1',
        'Column_2': 'name of feature 2',
        'Column_3': 'name of feature 3',
        'Column_4': 'name of feature 4',
        'Column_5': 'name of feature 5',
        'Column_6': 'name of feature 6',
        'Column_7': 'name of feature 7',
        'Column_8': 'name of feature 8',
        'Column_9': 'name of feature 9',
        'Column_10': 'name of feature 10',
    }
    if not os.path.exists(model_path):
        print("file {} does not exist!".format(model_path))
        sys.exit(0)

    gbm = lgb.Booster(model_file=model_path)

    # print and save feature importances
    importances = gbm.feature_importance(importance_type='split')
    feature_names = gbm.feature_name()

    total = 0.
    for value in importances:
        total += value

    for feature_name, importance in zip(feature_names, importances):
        if importance != 0:
            feat_id = int(feature_name.split('_')[1]) + 1
            print('{} : {} : {} : {}'.format(feat_id, feats_dict[feature_name], importance, importance / total))


def get_leaf_index(data, model_path):
    '''
    Get the leaf index of each sample in every tree and one-hot encode it.
    '''
    gbm = lgb.Booster(model_file=model_path)
    ypred = gbm.predict(data, pred_leaf=True)

    one_hot_encoder = OneHotEncoder()
    x_one_hot = one_hot_encoder.fit_transform(ypred)
    print(x_one_hot.toarray()[0])


if __name__ == '__main__':
    model_path = "path to save the model"

    if len(sys.argv) != 2:
        print("Usage: python main.py [-process | -train | -predict | -ndcg | -feature | -leaf]")
        sys.exit(0)

    if sys.argv[1] == '-process':
        # The raw training samples use the same format as ranklib, but they must be
        # converted into the format LightGBM's ranking task needs.
        # LightGBM keeps the sample features and the groups in two separate txt files:
        '''
        feats:
        1 1:0.2 2:0.4 ...
        2 1:0.2 2:0.4 ...
        1 1:0.2 2:0.4 ...
        3 1:0.2 2:0.4 ...
        group:
        2
        2
        In the group file, the first 2 means the first 2 rows of feats belong to one qid,
        and the second 2 means the next 2 rows belong to the next qid.
        '''
        raw_data_path = 'path of the raw training data'
        data_feats = 'path to save the features'
        data_group = 'path to save the groups'
        process_data_format(raw_data_path, data_feats, data_group)

    elif sys.argv[1] == '-train':
        # train
        train_start = datetime.now()
        data_feats = 'path to save the features'
        data_group = 'path to save the groups'
        x_train, y_train, q_train = load_data(data_feats, data_group)
        train(x_train, y_train, q_train, model_path)
        train_end = datetime.now()
        consume_time = (train_end - train_start).seconds
        print("consume time : {}".format(consume_time))

    elif sys.argv[1] == '-predict':
        train_start = datetime.now()
        raw_data_path = 'path of the data to predict'  # same format as ranklib data
        test_X, test_y, test_qids, comments = load_data_from_raw(raw_data_path)
        t_results = predict(test_X, comments, model_path)
        train_end = datetime.now()
        consume_time = (train_end - train_start).seconds
        print("consume time : {}".format(consume_time))

    elif sys.argv[1] == '-ndcg':
        # average NDCG over the test data
        test_path = 'path of the test data'
        test_data_ndcg(model_path, test_path)

    elif sys.argv[1] == '-feature':
        plot_print_feature_importance(model_path)

    elif sys.argv[1] == '-leaf':
        # use the model to get a one-hot representation of each sample's leaf indexes
        raw_data = 'path of the test data'
        with open(raw_data, 'r', encoding='utf-8') as testfile:
            test_X, test_y, test_qids, comments = letor.read_dataset(testfile)
        get_leaf_index(test_X, model_path)
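To make the group format concrete, here is a small sketch (not part of the original script) of how the group vector pairs with the feature matrix when building the Dataset; all the numbers are invented toy values.

# Hedged toy example: 5 documents split into two queries (2 + 3).
import numpy as np
import lightgbm as lgb

x_toy = np.random.rand(5, 3)        # 5 documents, 3 features each
y_toy = np.array([2, 0, 1, 1, 0])   # graded relevance labels
q_toy = np.array([2, 3])            # first 2 rows belong to query 1, next 3 rows to query 2

toy_set = lgb.Dataset(x_toy, label=y_toy, group=q_toy)
toy_params = {'objective': 'lambdarank', 'metric': 'ndcg',
              'min_data_in_leaf': 1, 'verbose': -1}  # tiny leaf size because the toy data is tiny
gbm = lgb.train(toy_params, toy_set, num_boost_round=5)
print(gbm.predict(x_toy))           # one relevance score per document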
The LETOR-format loader (letor.py):

"""
Various utilities for converting data from/to Microsoft's LETOR format.
"""

import numpy as np


def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):
    """Transforms an iterator of lines to an iterator of LETOR rows.

    Each row is represented by a (x, y, qid, comment) tuple.

    Parameters
    ----------
    lines : iterable of lines
        Lines to parse.
    has_targets : bool, optional
        Whether the file contains targets. If True, will expect the first token
        of every line to be a real representing the sample's target (i.e.
        score). If False, will use -1 as a placeholder for all targets.
    one_indexed : bool, optional
        Whether feature ids are one-indexed. If True, will subtract 1 from each
        feature id (so they become zero-indexed).
    missing : float, optional
        Placeholder to use if a feature value is not provided for a sample.

    Yields
    ------
    x : array of floats
        Feature vector of the sample.
    y : float
        Target value (score) of the sample, or -1 if no target was parsed.
    qid : object
        Query id of the sample. This is currently guaranteed to be a string.
    comment : str
        Comment accompanying the sample.
    """
    for line in lines:
        data, _, comment = line.rstrip().partition('#')
        toks = data.strip().split()
        num_features = 0  # number of features seen on this line
        x = np.repeat(missing, 8)
        y = -1.0
        if has_targets:
            y = float(toks[0].strip())  # relevance label
            toks = toks[1:]
        # qid:1 => 1
        qid = _parse_qid_tok(toks[0].strip())

        # feature (id:value)
        for tok in toks[1:]:
            fid, val = tok.split(":")  # featureID:featureValue
            fid = int(fid)
            val = float(val)
            if one_indexed:
                fid -= 1
            assert fid >= 0
            while len(x) <= fid:
                orig = len(x)
                x.resize(len(x) * 2)
                x[orig:orig * 2] = missing
            x[fid] = val
            num_features = max(fid + 1, num_features)

        assert num_features > 0
        x.resize(num_features)

        yield (x, y, qid, comment)


def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):
    """Parses a LETOR dataset from `source`.

    Parameters
    ----------
    source : string or iterable of lines
        String, file, or other file-like object to parse.
    has_targets : bool, optional
        See `iter_lines`.
    one_indexed : bool, optional
        See `iter_lines`.
    missing : float, optional
        See `iter_lines`.

    Returns
    -------
    X : array of arrays of floats
        Feature matrix (see `iter_lines`).
    y : array of floats
        Target vector (see `iter_lines`).
    qids : array of objects
        Query id vector (see `iter_lines`).
    comments : array of strs
        Comment vector (see `iter_lines`).
    """
    if isinstance(source, str):  # str check replaces the removed sklearn.externals.six dependency
        source = source.splitlines(True)

    max_width = 0  # largest number of features on any line
    xs, ys, qids, comments = [], [], [], []
    iter_content = iter_lines(source, has_targets=has_targets,
                              one_indexed=one_indexed, missing=missing)
    # x: feature vector; y: relevance label in [0-4]; qid: query id string; comment: text after '#'
    for x, y, qid, comment in iter_content:
        xs.append(x)
        ys.append(y)
        qids.append(qid)
        comments.append(comment)
        max_width = max(max_width, len(x))

    assert max_width > 0
    # X.shape = [len(xs), max_width]
    X = np.ndarray((len(xs), max_width), dtype=np.float64)
    X.fill(missing)
    for i, x in enumerate(xs):
        X[i, :len(x)] = x
    ys = np.array(ys) if has_targets else None
    qids = np.array(qids)
    comments = np.array(comments)

    return (X, ys, qids, comments)


def _parse_qid_tok(tok):
    assert tok.startswith('qid:')
    return tok[4:]
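For reference, a quick sketch (not in the original post) of what `read_dataset` expects and returns; the documents, qids and feature values below are invented.

# Hypothetical three-document LETOR sample: "<label> qid:<id> <fid>:<value> ... # <comment>"
sample = (
    "2 qid:10 1:0.7 2:0.3 3:0.0 # doc-A\n"
    "0 qid:10 1:0.1 2:0.9 3:0.5 # doc-B\n"
    "1 qid:11 1:0.4 2:0.2 3:0.8 # doc-C\n"
)
X, y, qids, comments = read_dataset(sample)
print(X.shape)   # (3, 3): one row per document, one column per (zero-indexed) feature id
print(y)         # [2. 0. 1.]  relevance labels
print(qids)      # ['10' '10' '11']
print(comments)  # [' doc-A' ' doc-B' ' doc-C']  text after '#'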
A second reader for qid / true-label / predicted-score rows:

import numpy as np


def iter_lines(lines):
    for line in lines:
        toks = line.split()
        qid = toks[0]
        target = float(toks[4])
        pred = float(toks[5])
        yield (qid, target, pred)


def read_dataset(source):
    if isinstance(source, str):  # str check replaces the removed sklearn.externals.six dependency
        source = source.splitlines(True)

    qids, targets, preds = [], [], []
    iter_content = iter_lines(source)
    for qid, target, pred in iter_content:
        qids.append(qid)
        targets.append(target)
        preds.append(pred)

    qids = np.array(qids)
    targets = np.array(targets)
    preds = np.array(preds)

    return (qids, targets, preds)
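This reader is not called by the main script above; it looks like a helper for parsing a dump of per-document predictions. A hedged guess at the row layout it assumes (only columns 0, 4 and 5 are read, so the middle columns below are invented):

# Hypothetical prediction-dump line; column 0 = qid, column 4 = true label, column 5 = predicted score.
line = "qid:101 doc:7 0 0 2.0 0.83"
qid, target, pred = next(iter_lines([line]))
print(qid, target, pred)  # qid:101 2.0 0.83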
The NDCG utilities (ndcg.py):

import collections

import numpy as np


def validate(qids, targets, preds, k):
    """
    Computes the NDCG value of the predictions on the test dataset.

    Parameters
    ----------
    qids : Numpy array
        Query id of each document.
    targets : Numpy array
        True relevance label of each document.
    preds : Numpy array
        Predicted score of each document.
    k : int
        This is used to compute the NDCG@k.

    Returns
    -------
    average_ndcg : float
        This is the average NDCG value over all the queries.
    every_qid_ndcg : OrderedDict
        The NDCG value of each individual query, keyed by qid.
    """
    query_groups = get_groups(qids)  # (qid, from, to) tuples: the index range of each qid's samples
    all_ndcg = []
    every_qid_ndcg = collections.OrderedDict()

    for qid, a, b in query_groups:
        predicted_sorted_indexes = np.argsort(preds[a:b])[::-1]  # indexes sorted by score, descending
        t_results = targets[a:b]  # true relevance of this query's documents
        t_results = t_results[predicted_sorted_indexes]  # relevance reordered by the predicted ranking

        dcg_val = dcg_k(t_results, k)
        idcg_val = ideal_dcg_k(t_results, k)
        ndcg_val = (dcg_val / idcg_val)
        all_ndcg.append(ndcg_val)
        every_qid_ndcg.setdefault(qid, ndcg_val)

    average_ndcg = np.nanmean(all_ndcg)
    return average_ndcg, every_qid_ndcg


def get_groups(qids):
    """Makes an iterator of query groups on the provided list of query ids.

    Parameters
    ----------
    qids : array_like of shape = [n_samples]
        List of query ids.

    Yields
    ------
    row : (qid, int, int)
        Tuple of query id, from, to.
        ``[i for i, q in enumerate(qids) if q == qid] == range(from, to)``
    """
    prev_qid = None
    prev_limit = 0
    total = 0

    for i, qid in enumerate(qids):
        total += 1
        if qid != prev_qid:
            if i != prev_limit:
                yield (prev_qid, prev_limit, i)
            prev_qid = qid
            prev_limit = i

    if prev_limit != total:
        yield (prev_qid, prev_limit, total)


def group_queries(training_data, qid_index):
    """
    Returns a dictionary that groups the documents by their query ids.

    Parameters
    ----------
    training_data : Numpy array of lists
        Contains a list of document information. Each document's format is
        [relevance score, query index, feature vector].
    qid_index : int
        This is the index where the qid is located in the training data.

    Returns
    -------
    query_indexes : dictionary
        The keys are the different query ids and the values are the indexes in the
        training data associated with those keys.
    """
    # e.g. for qid=1020 the value lists the indexes of that qid's samples in the training data
    query_indexes = {}
    index = 0
    for record in training_data:
        query_indexes.setdefault(record[qid_index], [])
        query_indexes[record[qid_index]].append(index)
        index += 1
    return query_indexes


def dcg_k(scores, k):
    """
    Returns the DCG value of the list of scores, truncated to k values.

    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order.
    k : int
        The number of values to look at when computing DCG.

    Returns
    -------
    DCG_val : float
        The DCG value of the given scores.
    """
    return np.sum([
        (np.power(2, scores[i]) - 1) / np.log2(i + 2)
        for i in range(len(scores[:k]))
    ])


def ideal_dcg_k(scores, k):
    """
    The DCG of the ideal ordering of the first k scores.

    Returns the ideal DCG value of the list of scores, truncated to k values.

    Parameters
    ----------
    scores : list
        Contains labels in a certain ranked order.
    k : int
        The number of values to look at when computing DCG.

    Returns
    -------
    Ideal_DCG_val : float
        The ideal DCG value of the given scores.
    """
    # sort relevance labels in descending order
    scores = [score for score in sorted(scores)[::-1]]
    return dcg_k(scores, k)
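As a quick sanity check (again not part of the original post), here is how `validate` can be called on a couple of toy queries; the labels and scores are invented.

# Toy NDCG check: q1 is ranked perfectly by the scores, q2 is ranked in reverse order.
import numpy as np

qids    = np.array(['q1', 'q1', 'q1', 'q2', 'q2'])
targets = np.array([2.0, 0.0, 1.0, 1.0, 0.0])   # true graded relevance
preds   = np.array([0.9, 0.2, 0.5, 0.1, 0.7])   # model scores

avg_ndcg, per_qid = validate(qids, targets, preds, k=10)
print(avg_ndcg)       # q1 gets NDCG 1.0, q2 gets about 0.63, so the average is roughly 0.82
print(dict(per_qid))  # per-query NDCG keyed by qid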