如何在 R 中的线性判别分析图上绘制分类边界

Posted

技术标签:

【中文标题】如何在 R 中的线性判别分析图上绘制分类边界【英文标题】:How to plot classification borders on an Linear Discrimination Analysis plot in R 【发布时间】:2015-08-17 15:14:09 【问题描述】:

我使用线性判别分析 (LDA) 来研究一组变量在 3 个组之间的区分程度。然后,我使用plot.lda() 函数将我的数据绘制在两个线性判别式(x 轴上的 LD1 和 y 轴上的 LD2)上。我现在想将 LDA 的分类边界添加到绘图中。我在函数中看不到允许这样做的参数。 partimat() 函数允许可视化 LD 分类边界,但在这种情况下,变量用作 x 和 y 轴,而不是线性判别式。任何有关如何向plot.lda 添加分类边界的建议将不胜感激。下面是一些示例代码:

library(MASS)

# LDA
t.lda = lda(Group ~ Var1 + Var2, data=mydata, 
                na.action="na.omit", CV=TRUE) 

# Scatter plot using the two discriminant dimensions 
plot(t.lda, 
     panel = function(x, y, ...)  points(x, y, ...) ,
     col = c(4,2,3)[factor(mydata$Group)], 
     pch = c(17,19,15)[factor(mydata$Group)],
     ylim=c(-3,3), xlim=c(-5,5))

以下是一些示例数据(3 个组,2 个变量):

> dput(mydata)
structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a", 
"b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", 
"c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8, 
8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12, 
11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1, 
-8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10, 
-6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group", 
"Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
))

> head(mydata)

  Group Var1 Var2
1     a  7.5 -6.5
2     a  6.9 -6.2
3     a  6.5 -6.7
4     a  7.3 -6.9
5     a  8.1 -7.1
6     a  8.0 -8.0

编辑:根据 Roman 的回答,我尝试更改代码以在线性判别尺度上绘制分类边界(这是我想要实现的),而不是在原始变量的尺度上。但是,边界并不位于应有的位置。任何关于我在这里做错的建议将不胜感激:

#create new data
np = 300
nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
nd = expand.grid(Var1 = nd.x, Var2 = nd.y)

#run lda and predict using new data
new.lda = lda(Group ~ Var1 + Var2, data=mydata) 
prd = as.numeric(predict(new.lda, newdata = nd)$class)

#create LD sequences from min - max values 
p = predict(new.lda, newdata= nd)
p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores

#create original plot 
quartz()
plot(t.lda, panel = function(x, y, ...)  points(x, y, ...) ,
     col = c(4,2,3)[factor(mydata$Group)], 
     pch = c(17,19,15)[factor(mydata$Group)],
     ylim=c(-3,3), xlim=c(-5,5))

#add classification border on scale of linear discriminants (NOTE: this step currently doesn't work)
contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np), 
         levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)

【问题讨论】:

您也可以查看 [这里][1] 的 ggplot2 解决方案。 [1]:***.com/questions/24260576/… 【参考方案1】:

我根据here 找到的示例修改了我的代码。

require(MASS)

# generate data
set.seed(357)
Ng <- 100 # number of cases per group
group.a.x <- rnorm(n = Ng, mean = 2, sd = 3)
group.a.y <- rnorm(n = Ng, mean = 2, sd = 3)

group.b.x <- rnorm(n = Ng, mean = 11, sd = 3)
group.b.y <- rnorm(n = Ng, mean = 11, sd = 3)

group.a <- data.frame(x = group.a.x, y = group.a.y, group = "A")
group.b <- data.frame(x = group.b.x, y = group.b.y, group = "B")

my.xy <- rbind(group.a, group.b)

# construct the model
mdl <- lda(group ~ x + y, data = my.xy)

# draw discrimination line
np <- 300
nd.x <- seq(from = min(my.xy$x), to = max(my.xy$x), length.out = np)
nd.y <- seq(from = min(my.xy$y), to = max(my.xy$y), length.out = np)
nd <- expand.grid(x = nd.x, y = nd.y)

prd <- as.numeric(predict(mdl, newdata = nd)$class)

plot(my.xy[, 1:2], col = my.xy$group)
points(mdl$means, pch = "+", cex = 3, col = c("black", "red"))
contour(x = nd.x, y = nd.y, z = matrix(prd, nrow = np, ncol = np), 
        levels = c(1, 2), add = TRUE, drawlabels = FALSE)

编辑

如果我尝试

library(MASS)

mydata <- structure(list(Group = c("a", "a", "a", "a", "a", "a", "a", "a", 
                                   "b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", 
                                   "c", "c", "c"), Var1 = c(7.5, 6.9, 6.5, 7.3, 8.1, 8, 7.4, 7.8, 
                                                            8.3, 8.7, 8.9, 9.3, 8.5, 9.6, 9.8, 9.7, 11.2, 10.9, 11.5, 12, 
                                                            11, 11.6, 11.7, 11.3), Var2 = c(-6.5, -6.2, -6.7, -6.9, -7.1, 
                                                                                            -8, -6.5, -6.3, -9.3, -9.5, -9.6, -9.1, -8.9, -8.7, -9.9, -10, 
                                                                                            -6.7, -6.4, -6.8, -6.1, -7.1, -8, -6.9, -6.6)), .Names = c("Group", 
                                                                                                                                                       "Var1", "Var2"), class = "data.frame", row.names = c(NA, -24L
                                                                                                                                                   ))

np <- 300    

nd.x = seq(from = min(mydata$Var1), to = max(mydata$Var1), length.out = np)
nd.y = seq(from = min(mydata$Var2), to = max(mydata$Var2), length.out = np)
nd = expand.grid(Var1 = nd.x, Var2 = nd.y)

#run lda and predict using new data
new.lda = lda(Group ~ Var1 + Var2, data=mydata) 
prd = as.numeric(predict(new.lda, newdata = nd)$class)

#create LD sequences from min - max values 
p = predict(new.lda, newdata= nd)
p.x = seq(from = min(p$x[,1]), to = max(p$x[,1]), length.out = np) #LD1 scores
p.y = seq(from = min(p$x[,2]), to = max(p$x[,2]), length.out = np) #LD2 scores

# notice I don't use t.lda for first variable
plot(new.lda, panel = function(x, y, ...)  points(x, y, ...) ,
     col = c(4,2,3)[factor(mydata$Group)], 
     pch = c(17,19,15)[factor(mydata$Group)],
     ylim=c(-3,3), xlim=c(-5,5))

contour(x = p.x, y = p.y, z = matrix(prd, nrow = np, ncol = np), 
        levels = c(1, 2, 3), add = TRUE, drawlabels = FALSE)

我明白了

【讨论】:

@Roman:谢谢你的回答。我对如何将生成的数据输入绘图有点困惑(即 plot.lda() 函数在 y 轴和 x 轴上绘制 LD1 和 LD2 分数),但我认为您的代码绘制了原始变量值?有没有办法改为绘制 LD 分数?我尝试用 LD 分数补充生成的数据,但无法让它工作。我现在已经包含了一些包含 3 个组的示例数据,以使事情更容易转移。非常感谢您的帮助! @Roman:我现在添加了更改代码以在线性判别分数图上绘制分类边界的尝试(这是我想要实现的)。任何建议将不胜感激! @jjulip 看看我的编辑,如果那是你要找的? @罗曼:谢谢!这很奇怪。它适用于上面的简单示例,但不适用于我的大型数据集。我的数据中一定有一些东西丢失了!谢谢。【参考方案2】:

这里也是使用ggplot2的方法:

library(MASS)
library(ggplot2)
fit <- lda(Species ~ ., data = iris, prior = rep(1, 3)/3)
datPred <- data.frame(Species=predict(fit)$class,predict(fit)$x)
#Create decision boundaries
fit2 <- lda(Species ~ LD1 + LD2, data=datPred, prior = rep(1, 3)/3)
ld1lim <- expand_range(c(min(datPred$LD1),max(datPred$LD1)),mul=0.05)
ld2lim <- expand_range(c(min(datPred$LD2),max(datPred$LD2)),mul=0.05)
ld1 <- seq(ld1lim[[1]], ld1lim[[2]], length.out=300)
ld2 <- seq(ld2lim[[1]], ld1lim[[2]], length.out=300)
newdat <- expand.grid(list(LD1=ld1,LD2=ld2))
preds <-predict(fit2,newdata=newdat)
predclass <- preds$class
postprob <- preds$posterior
df <- data.frame(x=newdat$LD1, y=newdat$LD2, class=predclass)
df$classnum <- as.numeric(df$class)
df <- cbind(df,postprob)
head(df)

           x        y     class classnum       setosa   versicolor virginica
1 -10.122541 -2.91246 virginica        3 5.417906e-66 1.805470e-10         1
2 -10.052563 -2.91246 virginica        3 1.428691e-65 2.418658e-10         1
3  -9.982585 -2.91246 virginica        3 3.767428e-65 3.240102e-10         1
4  -9.912606 -2.91246 virginica        3 9.934630e-65 4.340531e-10         1
5  -9.842628 -2.91246 virginica        3 2.619741e-64 5.814697e-10         1
6  -9.772650 -2.91246 virginica        3 6.908204e-64 7.789531e-10         1

colorfun <- function(n,l=65,c=100)  hues = seq(15, 375, length=n+1); hcl(h=hues, l=l, c=c)[1:n]  # default ggplot2 colours
colors <- colorfun(3)
colorslight <- colorfun(3,l=90,c=50)
ggplot(datPred, aes(x=LD1, y=LD2) ) +
    geom_raster(data=df, aes(x=x, y=y, fill = factor(class)),alpha=0.7,show_guide=FALSE) +
    geom_contour(data=df, aes(x=x, y=y, z=classnum), colour="red2", alpha=0.5, breaks=c(1.5,2.5)) +
    geom_point(data = datPred, size = 3, aes(pch = Species,  colour=Species)) +
    scale_x_continuous(limits = ld1lim, expand=c(0,0)) +
    scale_y_continuous(limits = ld2lim, expand=c(0,0)) +
    scale_fill_manual(values=colorslight,guide=F)

(不完全确定这种在 1.5 和 2.5 处使用等高线/断点显示分类边界的方法总是正确的 - 它对于物种 1 和 2 以及物种 2 和 3 之间的边界是正确的,但如果物种区域不正确1 将在物种 3 旁边,因为那时我会在那里得到两个边界 - 也许我必须使用 here 使用的方法,其中每个物种对之间的每个边界都被单独考虑)

【讨论】:

以上是关于如何在 R 中的线性判别分析图上绘制分类边界的主要内容,如果未能解决你的问题,请参考以下文章

R语言如何进行线性判别分析?

线性判别分析(LDA),二次判别分析(QDA)和正则判别分析(RDA)

再现Fisher线性判别图

R语言中多分类问题 multicalss classification 的性能测量

线性和二次判别分析

R语言线性判别分析(LDA),二次判别分析(QDA)和正则判别分析(RDA)