使用朴素贝叶斯做多分类
Posted
技术标签:
【中文标题】使用朴素贝叶斯做多分类【英文标题】:Using Naive Bayes to do multi classification 【发布时间】:2020-06-02 01:03:34 【问题描述】:我有一个数据集如下:
# Feature rows: 100 variable-length lists of non-negative integers
# (between 1 and 10 entries per row). The discussion further down calls them
# tokenized sequences, so the values are presumably token ids -- confirm.
# Because the rows differ in length, this nested list is ragged and cannot
# be turned into a rectangular 2D array as-is.
data = [[92, 155],
[56, 186, 117, 210, 224],
[247, 202, 189, 210, 65, 3, 270, 224],
[20, 14, 157, 224],
[17, 89, 158, 224],
[263, 283, 68, 224],
[182, 166, 224],
[176, 37, 100, 224],
[33, 102, 41, 269, 177, 224],
[0, 260, 49, 207, 278, 217, 35],
[119],
[118],
[142, 185, 7, 246, 224],
[104, 22, 101, 224],
[84, 205, 224],
[225, 93, 54, 224],
[98, 32, 78, 224],
[159, 217, 212, 198, 224],
[178, 94, 187, 224],
[211, 149, 193, 149, 66, 139, 67, 28, 106, 224],
[133, 151],
[259, 109, 29, 224],
[215, 241, 73, 255, 77, 144, 224],
[36, 254, 19, 268, 183, 224],
[47, 234, 203, 111, 231, 141, 30],
[127, 275, 220, 161],
[214, 267, 22, 90, 224],
[46, 217, 103],
[17, 89, 128, 224],
[225, 22, 101, 224],
[285, 265, 151],
[215, 206, 264, 43, 224],
[244, 21, 224],
[82, 122, 240, 5, 224],
[259, 136, 162, 194, 224],
[176, 208, 112, 224],
[172, 19, 146, 276, 31, 246, 51, 224],
[45, 10],
[229, 24, 224],
[143, 108, 239, 224],
[225, 282, 83, 224],
[110, 267, 171],
[176, 245, 95, 123, 270, 224],
[248, 195, 139, 261, 173, 281, 232, 80, 18, 224],
[61, 60, 233],
[211, 120, 1, 23],
[225, 267, 249, 224],
[247, 202, 86, 196, 224],
[15, 127, 222, 224],
[247, 202, 186, 226, 145, 224],
[174, 242, 196, 224],
[259, 152, 71, 224],
[235, 44, 230, 224],
[69, 96, 50, 99, 116],
[259, 279, 224],
[228, 70],
[39, 139, 201, 190, 224],
[132, 40, 219, 81, 224],
[159, 221, 224],
[267, 16, 6, 62],
[143, 59, 175, 129, 48, 224],
[280, 140, 224],
[284, 124, 167, 150, 274],
[113, 265, 184],
[179, 4, 257, 145, 224],
[247, 202, 72, 11, 224],
[64],
[192, 125, 105],
[174, 134, 224],
[58, 139, 85, 160, 209, 224],
[130, 169, 137, 256, 224],
[215, 163, 265, 185, 26],
[176, 147, 74, 224],
[0, 266],
[143, 34, 153, 188, 224],
[121],
[243, 75, 135],
[38, 218, 199, 253, 224],
[178, 271, 224],
[154, 164, 180, 27, 270, 224],
[176, 189, 148, 139, 277, 224],
[57, 62],
[91, 168, 251, 224],
[172, 19, 146, 276, 53, 97, 200, 224],
[64],
[8, 237, 224],
[138, 107, 224],
[176, 238, 224],
[204, 217, 63, 165, 224],
[215, 216, 272, 62, 170, 2, 55, 224],
[247, 273, 202, 223, 9, 148, 224],
[258, 267, 181, 224],
[262, 76, 126, 12, 224],
[36, 254, 19, 268, 250, 213, 48, 224],
[227, 42],
[79, 197, 52, 87, 224],
[143, 131, 224],
[156, 88, 115, 236, 224],
[259, 13, 252, 224],
[114, 25, 191, 224]]
# Class labels: one integer per row of `data` (100 entries), with 35 distinct
# values, 0 through 34. Used as the second argument of `mnb.fit(data, target)`.
target = [9,
31,
20,
9,
3,
26,
16,
11,
28,
0,
9,
9,
9,
9,
7,
1,
33,
9,
13,
15,
9,
21,
9,
34,
9,
9,
9,
9,
3,
1,
9,
27,
14,
22,
21,
11,
17,
9,
6,
8,
1,
9,
11,
9,
9,
9,
1,
20,
29,
20,
23,
21,
9,
9,
21,
9,
18,
9,
9,
30,
8,
9,
9,
9,
9,
20,
9,
32,
23,
9,
24,
9,
11,
9,
8,
9,
9,
9,
13,
10,
11,
9,
12,
17,
9,
5,
9,
11,
9,
2,
20,
9,
25,
34,
9,
9,
8,
4,
21,
19]
我希望使用朴素贝叶斯或任何其他可用的最佳算法进行分类。但是,我在使用朴素贝叶斯时收到如下错误:
# Fit a multinomial Naive Bayes classifier on the rows of `data` against the
# labels in `target`.
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): class_prior lists 2 priors, but `target` holds 35 distinct
# labels; scikit-learn documents class_prior as one value per class, so this
# would presumably fail once the input-shape problem is fixed -- confirm.
mnb = MultinomialNB(class_prior=[.25,.75])
# This call raises the ValueError quoted below: the rows of `data` have
# different lengths, so the input is seen as a 1D array of lists.
mnb.fit(data, target)
错误:
ValueError: Expected 2D array, got 1D array instead:
array=[list([92, 155]) list([56, 186, 117, 210, 224])
list([247, 202, 189, 210, 65, 3, 270, 224]) list([20, 14, 157, 224])
list([17, 89, 158, 224]) list([263, 283, 68, 224]) list([182, 166, 224])
list([176, 37, 100, 224]) list([33, 102, 41, 269, 177, 224])
list([0, 260, 49, 207, 278, 217, 35]) list([119]) list([118])
list([142, 185, 7, 246, 224]) list([104, 22, 101, 224])
list([84, 205, 224]) list([225, 93, 54, 224]) list([98, 32, 78, 224])
list([159, 217, 212, 198, 224]) list([178, 94, 187, 224])
list([211, 149, 193, 149, 66, 139, 67, 28, 106, 224]) list([133, 151])
list([259, 109, 29, 224]) list([215, 241, 73, 255, 77, 144, 224])
list([36, 254, 19, 268, 183, 224])
list([47, 234, 203, 111, 231, 141, 30]) list([127, 275, 220, 161])
list([214, 267, 22, 90, 224]) list([46, 217, 103])
list([17, 89, 128, 224]) list([225, 22, 101, 224]) list([285, 265, 151])
list([215, 206, 264, 43, 224]) list([244, 21, 224])
list([82, 122, 240, 5, 224]) list([259, 136, 162, 194, 224])
list([176, 208, 112, 224]) list([172, 19, 146, 276, 31, 246, 51, 224])
list([45, 10]) list([229, 24, 224]) list([143, 108, 239, 224])
list([225, 282, 83, 224]) list([110, 267, 171])
list([176, 245, 95, 123, 270, 224])
list([248, 195, 139, 261, 173, 281, 232, 80, 18, 224])
list([61, 60, 233]) list([211, 120, 1, 23]) list([225, 267, 249, 224])
list([247, 202, 86, 196, 224]) list([15, 127, 222, 224])
list([247, 202, 186, 226, 145, 224]) list([174, 242, 196, 224])
list([259, 152, 71, 224]) list([235, 44, 230, 224])
list([69, 96, 50, 99, 116]) list([259, 279, 224]) list([228, 70])
list([39, 139, 201, 190, 224]) list([132, 40, 219, 81, 224])
list([159, 221, 224]) list([267, 16, 6, 62])
list([143, 59, 175, 129, 48, 224]) list([280, 140, 224])
list([284, 124, 167, 150, 274]) list([113, 265, 184])
list([179, 4, 257, 145, 224]) list([247, 202, 72, 11, 224]) list([64])
list([192, 125, 105]) list([174, 134, 224])
list([58, 139, 85, 160, 209, 224]) list([130, 169, 137, 256, 224])
list([215, 163, 265, 185, 26]) list([176, 147, 74, 224]) list([0, 266])
list([143, 34, 153, 188, 224]) list([121]) list([243, 75, 135])
list([38, 218, 199, 253, 224]) list([178, 271, 224])
list([154, 164, 180, 27, 270, 224]) list([176, 189, 148, 139, 277, 224])
list([57, 62]) list([91, 168, 251, 224])
list([172, 19, 146, 276, 53, 97, 200, 224]) list([64])
list([8, 237, 224]) list([138, 107, 224]) list([176, 238, 224])
list([204, 217, 63, 165, 224]) list([215, 216, 272, 62, 170, 2, 55, 224])
list([247, 273, 202, 223, 9, 148, 224]) list([258, 267, 181, 224])
list([262, 76, 126, 12, 224]) list([36, 254, 19, 268, 250, 213, 48, 224])
list([227, 42]) list([79, 197, 52, 87, 224]) list([143, 131, 224])
list([156, 88, 115, 236, 224]) list([259, 13, 252, 224])
list([114, 25, 191, 224])].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
有人可以帮我解决这个问题吗?或者,有人可以向我展示一些其他适用于机器学习的算法示例吗,例如决策树、支持向量机或其他任何算法?
【问题讨论】:
数据中每一行的长度必须相同。 是的,但我想做的是对整个 tokenized_sequences 行进行分类并将其映射到目标。这不可能吗?如果没记错的话,这就像文本分类一样。 您可以尝试用零填充数据,以确保每一行的长度相同。 【参考方案1】:您收到该错误的直接原因是:您传入的参数是一个由列表组成的数组,因此 Sklearn 认为您传入的是一个由列表构成的一维数组。由于各个列表中值的数量不一致,无法将您的 data 转换为 2D 矩阵。
根据我的理解(这可能是错误的),输入特征矩阵的每一行都需要具有相同数量的数值。一旦满足这一点,您应该就能顺利地将数据输入 MultinomialNB。
考虑用零填充:
import numpy as np

# Pad every variable-length row of `data` with trailing zeros so that all rows
# share one length, producing a rectangular 2D float matrix `data1`.
# The width is taken from the longest row instead of being hard-coded to 10,
# so the snippet keeps working if a longer row is added.
# NOTE(review): 0 also occurs as a real value in `data`, so padded zeros are
# indistinguishable from it -- consider shifting the ids by 1 if that matters.
max_len = max((len(row) for row in data), default=0)
data1 = np.zeros((len(data), max_len))
for i, row in enumerate(data):
    # Copy the row into the leading columns; the remaining columns stay 0.
    data1[i, :len(row)] = row
【讨论】:
是的,但我想做的是对整个 tokenized_sequences 行进行分类并将其映射到目标。这不可能吗?如果没记错的话,这就像文本分类一样。 您是如何生成标记化序列的? 我发现问题就在这里。array([array([ 92, 155, 0, 0, 0, 0, 0, 0, 0, 0]), array([ 56, 186, 117, 210, 224, 0, 0, 0, 0, 0]), array([247, 202, 189, 210, 65, 3, 270, 224, 0, 0])], dtype=object) 有没有办法将其重塑为 (3, 10)? 我进行了编辑,向您展示了如何将整个 2D data 数组用零填充到一个 numpy 矩阵中。希望能帮助到你。这是你的问题吗?
以上是关于使用朴素贝叶斯做多分类的主要内容,如果未能解决你的问题,请参考以下文章:
朴素贝叶斯:朴素贝叶斯定义、朴素贝叶斯公式分解、朴素贝叶斯分类流程、高斯型朴素贝叶斯、多项式朴素贝叶斯、伯努利型朴素贝叶斯、朴素贝叶斯预测概率校准、朴素贝叶斯优缺点