r 来自网络研讨会的R演示
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 来自网络研讨会的R演示相关的知识,希望对你有一定的参考价值。
## ----rmarkdown install---------------------------------------------------
if (!require("rmarkdown")) install.packages("rmarkdown"
, repos='https://cran.revolutionanalytics.com')
library(rmarkdown)
## ----setup, include=TRUE-------------------------------------------------
knitr::opts_chunk$set(cache = TRUE,
echo = TRUE)
options(scipen=1, digits=4)
## ----package management--------------------------------------------------
# ensure package 'pacman' for package management is installed
if (!require("pacman")) install.packages("pacman")
library(pacman)
# add any required packages to the p_load() function parameters
pacman::p_load(rmarkdown # markdown functionality
, knitr # for table formatting
, dplyr # for data wrangling
, memisc # for convenient control structures like 'case'
, ggplot2 # for better visualizations
, ggthemes # color themes for ggplot2
, ggsci # Scientific Journal and
# Sci-Fi Themed Color Palettes
, grid # enhance ggplot2
, gridExtra # enhance ggplot2
, corrplot # for corrleation plots
, reshape2 # for data wrangling
, randomForest # machine learning pkg
, tree # regression tree
, dendextend # for hierarchical clustering
, cluster # for clustering
, amap # for clustering
, fclust # for fuzzy clustering
, factoextra # fancy cluster plots
, fpc # cluster stats
, mclust # advanced clustering
, plyr # data wrangling
, update=FALSE)
pacman::p_loaded() # check which packages are loaded
## ----data access---------------------------------------------------------
getwd() # current working directory
data("iris")
dfS <- data.frame(cbind(scale(iris[1:4]), (iris[,"Species"]))) # scaled data frame
dfS <- dplyr::select(dfS, c(feature1 = Sepal.Length
, feature2 = Sepal.Width
, feature3 = Petal.Length
, feature4 = Petal.Width
, cluster = V5))
dfS$cluster <- as.factor(dfS$cluster)
rm(iris)
## ----function MultiPlot--------------------------------------------------
# This code creates a correlation matrix with r and p values
# Code from: http://bit.ly/24hV4Xz
# Help page for the function pairs() gives you example how to define panels to plot.
panel.cor <- function(x, y, digits=2, cex.cor) {
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
test <- cor.test(x,y)
Signif <- ifelse(round(test$p.value,3) < 0.001,
"p<0.001",
paste("p=", round(test$p.value,3))
)
#text(0.5, 0.25, paste("r=",txt))
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.25, txt, cex = cex.cor * (1 + r) /3)
text(.5, .75, Signif)
}
# For panel.smooth() function defined cex=, col= and pch= arguments.
panel.smooth <- function (x, y, col = "cornflowerblue", bg = NA, pch = 18,
cex = 0.8, col.smooth = "red", span = 2/3, iter = 3, ...) {
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
ok <- is.finite(x) & is.finite(y)
if (any(ok))
lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
col = col.smooth, ...)
}
# To add histograms, panel.hist() functions should be
# defined (taken from help file of pairs())
panel.hist <- function(x, ...) {
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="coral", ...)
}
# pairs plot of distribution, scatter, r, and p-values between variables
MultiPlot <- function(myCols, myMain) {
pairs(dfS[, myCols],
lower.panel=panel.smooth, upper.panel=panel.cor,
diag.panel=panel.hist, main= myMain)
}
## ----function CorrPlot---------------------------------------------------
# create correlation matrix
# from R Graphics Cookbook
CorrPlot <- function(myCols, order, myTitle) {
d.cor <- dfS[, myCols]
mcor <- cor(d.cor, use = "complete.obs")
mcor <- round(mcor, digits=2)
# generate a lighter color palette
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
# order parameter options
# the ordering method of the correlation matrix.
# "original" for original order (default).
# "AOE" for the angular order of the eigenvectors.
# "FPC" for the first principal component order.
# "hclust" for the hierarchical clustering order.
# "alphabet" for alphabetical order.
# plot using corrplot package
corrplot(mcor,
method="shade",
shade.col=NA,
tl.col="black",
tl.srt=20,
col=col(200),
addCoef.col="black",
addCoefasPercent = TRUE,
order=order,
type="lower",
tl.cex = .90,
cl.cex = 0.90,
pch.cex = 0.5,
title = title(myTitle, line = -1)
)
}
## ----function more plots-------------------------------------------------
# barplot
# legend location may be specified by setting x to a single keyword from the list
# "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right"
# and "center".
myBarplot <- function(myCenters, k, title) {
barplot(t(myCenters)
, beside = TRUE
, xlab="cluster"
, ylab="value"
, legend.text = TRUE
, col = 2:6
, args.legend = list(x = "bottom")
, main = title)
}
# CreateDenPlot to create a density plot (smooth histogram), with params
# myData : data frame (e.g. metal)
# xString : string vector of variable name to use for
# x axis of density plot (e.g. "Mill.Head.FeCuR..")
# myFill : string vector of variable to use for fill (multiple series)
# (e.g. "CuCluster")
CreateDenPlot <- function(myData, xString, myFill) {
ggplot(myData, aes_string(x = xString, fill = myFill, colour=myFill)) +
geom_density(alpha = 0.3) +
theme_bw() +
theme(legend.position="bottom") +
scale_color_jco() +
scale_fill_jco() +
theme(axis.title.x = element_blank()) +
ggtitle(paste0(xString, collapse = NULL))
}
## ----fn viz cluster plots------------------------------------------------
# function CreateBoxPlot is boxplot with following params
# myData : data frame
# xString : string vector of variable name to use for
# x axis of box plot (e.g. "cluster")
# yString : string value of variable name to use for
# y axis of box plot (e.g. "feature1")
CreateBoxPlot <- function(myData, xString, yString) {
ggplot(myData,
aes_string(x = xString
, y = yString
, colour = xString
, fill = xString)) +
geom_boxplot(notch = FALSE
, width = 0.50
, outlier.size = 3
, lwd = .90
, fatten = 0
, alpha = 0.3) +
stat_summary(fun.y = "mean"
, geom = "point"
, shape = 20
, size = 4
, fill = "white") +
stat_summary(fun.y = "mean"
, geom = "text"
, vjust = -1.85
, hjust = 0.0
, colour = "black"
, aes(label = round(..y.., digits = 3))) +
# stat_boxplot(geom = "errorbar", lwd = 1, width = 0.5) +
theme_bw() +
scale_fill_jco() +
scale_color_jco() +
theme(legend.position = "right") +
theme(axis.title.x = element_blank()) +
theme(axis.title.y = element_blank()) +
ggtitle(paste0(yString, collapse = NULL))
}
# grid_arrange_shared_legend helper function to create gtable object of
# multiple plots with one legend at the bottom, with parameters
# ... : ggplot objects (multiple plot objects)
grid_arrange_shared_legend <- function(..., position = c("bottom", "right")) {
plots <- list(...)
position <- match.arg(position)
g <- ggplotGrob(plots[[1]] + theme(legend.position=position))$grobs
legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
lheight <- sum(legend$height)
lwidth <- sum(legend$width)
gl <- lapply(plots, function(x) x + theme(legend.position="none"))
combined <- switch(position,
"bottom" = arrangeGrob(do.call(arrangeGrob, gl),
legend,
ncol = 1,
heights = unit.c(unit(1, "npc") - lheight, lheight)),
"right" = arrangeGrob(do.call(arrangeGrob, gl),
legend,
ncol = 2,
widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
grid.newpage()
# grid.draw(combined) # original code, draws the plot (but I want an object)
arrangeGrob(combined) # this returns a gtable object
}
# CreateDenPlot to create a density plot (smooth histogram), with params
# myData : data frame (e.g. metal)
# xString : string vector of variable name to use for
# x axis of density plot (e.g. "feature1")
# myFill : string vector of variable to use for fill (multiple series)
# (e.g. "cluster")
CreateDenPlot <- function(myData, xString, myFill) {
ggplot(myData, aes_string(x = xString, fill = myFill, colour=myFill)) +
geom_density(alpha = 0.3) +
theme_bw() +
theme(legend.position="bottom") +
scale_color_jco() +
scale_fill_jco() +
theme(axis.title.x = element_blank()) +
ggtitle(paste0(xString, collapse = NULL))
}
## ----eda MultiPlot-------------------------------------------------------
# MultiPlot is a pairs plot of distribution,
# scatter, r, and p-values between variables
# MultiPlot function has params
# myCols : columns to include in correlation matrix
# myMain : plot title
MultiPlot(c("feature1", "feature2", "feature3", "feature4")
, "MultiPlot from Scaled Data Set")
## ----eda CorrPlot--------------------------------------------------------
# create correlation matrix
# CorrPlot function has params
# myCOls : columns to include in correlation matrix
# order : paramter to control ordering of cols in matrix, in quotes
# myTitle : chart title, in quotes
# order parameter options
# the ordering method of the correlation matrix.
# "original" for original order (default).
# "AOE" for the angular order of the eigenvectors.
# "FPC" for the first principal component order.
# "hclust" for the hierarchical clustering order.
# "alphabet" for alphabetical order.
order <- "original"
# par(mfrow=c(1,1)
# # mar = c(0,0,2,0)
# )
# features
dfSFeat <- c("feature1"
, "feature2"
, "feature3"
, "feature4")
myCols <- c(dfSFeat)
# call plotting function
# break up plots b/c of so many variables
myTitle <- "Correlation Matrix"
CorrPlot(myCols
, order
, myTitle
)
## ----clustering fit------------------------------------------------------
set.seed(93)
# set # of clusters
k <- 3
# K means
fitKM <- kmeans(scale(dfS[,1:4]), iter.max = 500, k)
fitKM
summary(fitKM)
table(fitKM$cluster) # num obs by cluster
fitKM$centers # center of cluster
# medoids
# PAM (partitioning around medoids)
fitPAM <- cluster::pam(dfS[,1:4], k)
fitPAM
summary(fitPAM)
table(fitPAM$clustering)
fitPAM$medoids # center of cluster
# FKM w medoids and noise (fuzzy clustering)
fitPAM.n <- FKM.med.noise(dfS[,1:4]
, k = k
, m = 1.5
, delta = 4
, RS = 3)
table(fitPAM.n$clus[,1])
# calc primary and seconday cluster assignment and probability
clusProbs <- data.frame(round(fitPAM.n$U, 3)) # cluster probabilities
colnames(clusProbs) <- paste("prob", colnames(clusProbs), sep = "_") # rename columns
clusProbs$primClus <- fitPAM.n$clus[,1] # primary clster number
clusProbs$primClusProb <- round(fitPAM.n$clus[,2],3) # primary cluster probability
clusProbs$secClus <- apply(clusProbs # secondary cluster
, 1
, function(x) which(rank(x, ties.method = "first")==3)) # secondary cluster
clusProbs$secClusProb <- apply(clusProbs # secondary cluster probability
, 1
, function(x) x[which(rank(x, ties.method = "first")==3)]) # sec cluster prob
# table of results
dResults <- dfS # initialize results table with original chem values
dResults <- cbind(dResults, clusProbs) # cluster membership; probability by cluster
dResults$tot <- rowSums(round(fitPAM.n$U, 3)) # calc sum of probability across clusters
dResults$probOut <- round(1 - dResults$tot, 3) # prob of being in outlier cluster
# calc principal components
myPrincomp <- prcomp(dfS[1:4])
# myPrincomp$x # matrix of principal component values
# plot prin comp
p1 <- fviz_pca_var(myPrincomp) +
theme_bw()
p2 <- fviz_pca_ind(myPrincomp) +
theme_bw()
grid.arrange(p1,p2, ncol = 2)
dResults$prin1 <- round(myPrincomp$x[,1],4)
dResults$prin2 <- round(myPrincomp$x[,2],4)
# identify outliers (probability of being in the outlier cluster > 20%)
myOut <- which(dResults$probOut > 0.20)
# refit without outliers
fitPAM.n2 <- FKM.med.noise(dfS[-myOut,1:4]
, k = k
, m = 1.5
, delta = 3
, RS = 1)
fitPAM.n$medoid # which points are medoids
fitPAM.n$H # medoid center
fitPAM.n$clus[1:150,] # which cluster assigned with associated probability
SIL(dfS[,1:4], fitPAM.n$U)
SIL.F(dfS[,1:4], fitPAM.n$U, 1)
plot.fclust(fitPAM.n
, pca = TRUE
#, ucex = TRUE # magnify by max membership
, umin = 0.40) # membership threshhold
#for assignment to cluster
print.fclust(fitPAM.n)
summary(fitPAM.n)
## ----clustering visualizations-------------------------------------------
# barplot of cluster composition by feature
par(mfrow = c(1, 1), xpd = TRUE)
myBarplot(fitKM$centers, k, "K means") # K means
myBarplot(fitPAM$medoids, k, "PAM") # PAM
myBarplot(fitPAM.n$H, k, "PAM w fuzzy clustering") # PAM w fuzzy clustering
myBarplot(fitPAM.n$H, k, "PAM w fuzzy clustering, outliers removed") # PAM w fuzzy clustering, outliers removed
# matrix of scatter plots by feature, colored by cluster
par(mfrow = c(1, 1))
plot(dfS[,1:4], col = fitKM$cluster, main = "KM") # KM
plot(dfS[,1:4], col = fitPAM$clustering, main = "PAM") # PAM matrix
plot(dfS[,1:4], col = fitPAM.n$clus[,1], main = "PAM w fuzzy clustering") # PAM.n matrix
plot(dfS[-myOut,1:4], col = fitPAM.n2$clus[,1], main = "PAM w fuzzy clustering, outliers removed") # PAM.n2 matrix
par(mfrow = c(2, 2))
plot(dfS[,2:3], col = fitKM$cluster, main = "KM") # KM Var v Vol
plot(dfS[,2:3], col = fitPAM$clustering, main = "PAM") # PAM Var v Vol
plot(dfS[,2:3], col = fitPAM.n$clus[,1], main = "PAM w fuzzy clustering") # PAM.n Var v Vol
plot(dfS[-myOut,2:3], col = fitPAM.n2$clus[,1], main = "PAM w fuzzy clustering, outliers removed") # PAM.n Var v Vol
par(mfrow = c(1, 1))
# calc principal components
myPrincomp <- prcomp(dfS[,1:4])
myPrincomp$x[1:10,] # matrix of principal component values
p1 <- fviz_pca_var(myPrincomp) +
theme_bw()
p2 <- fviz_pca_ind(myPrincomp) +
theme_bw()
grid.arrange(p1,p2, ncol = 2)
# bivariate cluster plot on prin comp scale
par(mfrow = c(1, 1))
clusplot(dfS[,1:4], fitKM$cluster, color=TRUE, shade=TRUE, main = "KM") # KM
clusplot(dfS[,1:4], fitPAM$clustering, color=TRUE, shade=TRUE, main = "PAM") # PAM
clusplot(dfS[,1:4], fitPAM.n$clus[,1], color=TRUE, shade=TRUE, main = "PAM w fuzzy clustering") # PAM.n
clusplot(dfS[-myOut,1:4], fitPAM.n2$clus[,1], color=TRUE, shade=TRUE, main = "PAM w fuzzy clustering, outliers removed") # PAM.n2
par(mfrow = c(1, 1))
# scatter plot of cluster by principal components 1 and 2
# KM
p1 <- fviz_cluster(fitKM, data = myPrincomp$x[,c(1,2)]) +
theme_bw() + ggtitle("KM")
p1
# PAM
p2 <- fviz_cluster(fitPAM) +
theme_bw() + ggtitle("PAM")
p2
# PAM.n
myList <- list(data = dfS[,1:4], cluster = fitPAM.n$clus[,1] )
p3 <- fviz_cluster(myList) +
theme_bw() + ggtitle("PAM w fuzzy clustering")
p3
# PAM.n2
myList <- list(data = dfS[-myOut,1:4], cluster = fitPAM.n2$clus[,1] )
p4 <- fviz_cluster(myList) +
theme_bw() + ggtitle("PAM w fuzzy clustering, outliers removed")
p4
grid.arrange(p1, p2, p3, p4, ncol = 2)
# scatter plot of cluster by 2 features at a time
# currenlty only setup with K means
fviz_cluster(fitKM, data = dfS[,c(1,3)]) +
theme_bw()
fviz_cluster(fitKM, data = dfS[,c(2,3)]) +
theme_bw()
fviz_cluster(fitKM, data = dfS[,c(1,3)]) +
theme_bw()
############################################################
# silhouette metric for within and between cluster distances
# KM
silKM = silhouette(fitKM$cluster, dist(dfS[,1:4]))
summary(silKM)
plot(silKM, col=2:(k+1), main = "KM")
# Objects with negative silhouette
neg_sil_index.KM <- which(silKM[, 'sil_width'] < 0)
silKM[neg_sil_index.KM, , drop = FALSE]
# PAM
silPAM <- silhouette(fitPAM)
summary(silPAM)
plot(silPAM, col=2:(k+1), main = "PAM")
# alternate silhouette plot
fviz_silhouette(fitPAM, label = TRUE) +
theme_bw()
# Objects with negative silhouette
neg_sil_index.PAM <- which(silPAM[, 'sil_width'] < 0)
silPAM[neg_sil_index.PAM, , drop = FALSE]
# PAM.n
silPAM.n = silhouette(fitPAM.n$clus[,1], dist(dfS[,1:4]))
summary(silPAM.n)
plot(silPAM.n, col=2:(k+1), main = "PAM w fuzzy clustering")
# Objects with negative silhouette
neg_sil_index.PAM.n <- which(silPAM.n[, 'sil_width'] < 0)
silPAM.n[neg_sil_index.PAM.n, , drop = FALSE]
# PAM.n2
silPAM.n2 = silhouette(fitPAM.n2$clus[,1], dist(dfS[-myOut,1:4]))
summary(silPAM.n2)
plot(silPAM.n2, col=2:(k+1), main = "PAM w fuzzy clustering, outliers removed")
# Objects with negative silhouette
neg_sil_index.PAM.n2 <- which(silPAM.n2[, 'sil_width'] < 0)
silPAM.n2[neg_sil_index.PAM.n2, , drop = FALSE]
# multiple sihouette plots on page
par(mfrow = c(1, 1))
plot(silKM, col=2:(k+1), main = "KM")
plot(silPAM, col=2:(k+1), main = "PAM")
plot(silPAM.n, col=2:(k+1), main = "PAM w fuzzy clustering")
plot(silPAM.n2, col=2:(k+1), main = "PAM w fuzzy clustering, outliers removed")
###
# density plot
# CreateDenPlot to create a density plot (smooth histogram), with params
# myData : data frame (e.g. metal)
# xString : string vector of variable name to use for
# x axis of density plot (e.g. "feature1")
# myFill : string vector of variable to use for fill (multiple series)
# (e.g. "cluster")
myFill <- "cluster"
denFtr1 <- CreateDenPlot(dfS, "feature1", myFill)
denFtr2 <- CreateDenPlot(dfS, "feature2", myFill)
denFtr3 <- CreateDenPlot(dfS, "feature3", myFill)
denFtr4 <- CreateDenPlot(dfS, "feature4", myFill)
# grid plot of all individual density plots
# intermediate function to create a gtable object with a shared legend
# grid_arrange_shared_legend helper function to create gtable object of
# multiple plots with one legend at the bottom, with parameters
# ... : ggplot objects (multiple plot objects)
myGtable.den <- grid_arrange_shared_legend(denFtr1
, denFtr2
, denFtr3
, denFtr4)
# create grid plot of density plots
grid.arrange(myGtable.den
, top = textGrob("Density Plots of Cluster Features"
, gp = gpar(fontsize=16)))
## ----export results------------------------------------------------------
# write resuts table
#write.csv(dResults, "Myoutput.csv")
以上是关于r 来自网络研讨会的R演示的主要内容,如果未能解决你的问题,请参考以下文章