实验报告聚类分析报告
- 1、下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
- 2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
- 3、如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。
实验报告聚类分析
实验原理:K均值聚类、中心点聚类、系统聚类和EM算法聚类分析技术。
实验题目:用鸢尾花的数据集,进行聚类挖掘分析。
实验要求:探索鸢尾花数据的基本特征,利用不同的聚类挖掘方法,获得基本结论并简明解释。
实验题目--分析报告:data(iris)
> rm(list=ls())
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 431730 23.1 929718 49.7 607591 32.5
Vcells 787605 6.1 8388608 64.0 1592403 12.2
> data(iris)
> data<-iris
> head(data)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.
2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.
5 0.2 setosa
5 5.0 3.
6 1.4 0.2 setosa
6 5.4 3.9 1.
7 0.4 setosa
#Kmean聚类分析
> newiris <- iris
> newiris$Species <- NULL
> (kc <- kmeans(newiris, 3))
K-means clustering with 3 clusters of sizes 62, 50, 38
Cluster means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.901613 2.748387 4.393548 1.433871
2 5.006000 3.428000 1.462000 0.246000
3 6.850000 3.07368
4 5.74210
5 2.071053
Clustering vector:
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2
[41] 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
[81] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1
[121] 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3 3 3 1 3 3 3 1 3 3 1
Within cluster sum of squares by cluster:
[1] 39.82097 15.15100 23.87947
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
> table(iris$Species, kc$cluster)
1 2 3
setosa 0 50 0
versicolor 48 0 2
virginica 14 0 36
> plot(newiris[c("Sepal.Length", "Sepal.Width")], col = kc$cluster) > points(kc$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pc h = 8, cex=2)
#K-Mediods 进行聚类分析
> install.packages("cluster")
> library(cluster)
> iris.pam<-pam(iris,3)
> table(iris$Species,iris.pam$clustering)
1 2 3
setosa 50 0 0
versicolor 0 3 47
virginica 0 49 1
> layout(matrix(c(1,2),1,2))
> plot(iris.pam)
> layout(matrix(1))
#hc
> iris.hc <- hclust( dist(iris[,1:4]))
> plot( iris.hc, hang = -1)
> plclust( iris.hc, labels = FALSE, hang = -1)
> re <- rect.hclust(iris.hc, k = 3)
> iris.id <- cutree(iris.hc, 3)
#利用剪枝函数cutree()参数h控制输出height=18时的系谱类别> sapply(unique(iris.id),
+ function(g)iris$Species[iris.id==g])
[[1]]
[1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [12] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [23] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [34] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [45] setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
[[2]]
[1] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[8] versicolor versicolor versicolor versicolor versicolor versicolor versicolor [15] versicolor versicolor versicolor versicolor versicolor versicolor versicolor [22] versicolor versicolor virginica virginica virginica virginica virginica [29] virginica virginica virginica virginica virginica virginica virginica
[36] virginica virginica virginica virginica virginica virginica virginica
[43] virginica virginica virginica virginica virginica virginica virginica
[50] virginica virginica virginica virginica virginica virginica virginica
[57] virginica virginica virginica virginica virginica virginica virginica
[64] virginica virginica virginica virginica virginica virginica virginica
[71] virginica virginica
Levels: setosa versicolor virginica
[[3]]
[1] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[8] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[15] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[22] versicolor versicolor versicolor versicolor versicolor versicolor virginica
Levels: setosa versicolor virginica
> plot(iris.hc)
> rect.hclust(iris.hc,k=4,border="light grey")#用浅灰色矩形框出4分类聚类结果
> rect.hclust(iris.hc,k=3,border="dark grey")#用浅灰色矩形框出3分类聚类结果
> rect.hclust(iris.hc,k=7,which=c(2,6),border="dark grey")
# DBSCAN #基于密度的聚类
> install.packages("fpc")
> library(fpc)
> ds1=dbscan(iris[,1:4],eps=1,MinPts=5)#半径参数为1,密度阈值为5
> ds1
dbscan Pts=150 MinPts=5 eps=1
1 2
border 0 1
seed 50 99
total 50 100
> ds2=dbscan(iris[,1:4],eps=4,MinPts=5)
> ds3=dbscan(iris[,1:4],eps=4,MinPts=2)
> ds4=dbscan(iris[,1:4],eps=8,MinPts=2)
> par(mfcol=c(2,2))
> plot(ds1,iris[,1:4],main="1: MinPts=5 eps=1")
> plot(ds3,iris[,1:4],main="3: MinPts=2 eps=4")
> plot(ds2,iris[,1:4],main="2: MinPts=5 eps=4")
> plot(ds4,iris[,1:4],main="4: MinPts=2 eps=8")
> d=dist(iris[,1:4])#计算数据集的距离矩阵d
> max(d);min(d)#计算数据集样本的距离的最值
[1] 7.085196
[1] 0
> install.packages("ggplot2")
> library(ggplot2)
> interval=cut_interval(d,30)
> table(interval)
interval
[0,0.236] (0.236,0.472] (0.472,0.709] (0.709,0.945] (0.945,1.18] (1.18,1.42] 88 585 876 891 831 688 (1.42,1.65] (1.65,1.89] (1.89,2.13] (2.13,2.36] (2.36,2.6] (2.6,2.83] 543 369 379 339 335 406 (2.83,3.07] (3.07,3.31] (3.31,3.54] (3.54,3.78] (3.78,4.01] (4.01,4.25] 458 459 465 480 468 505 (4.25,4.49] (4.49,4.72] (4.72,4.96] (4.96,5.2] (5.2,5.43] (5.43,5.67] 349 385 321 291 187 138 (5.67,5.9] (5.9,6.14] (6.14,6.38] (6.38,6.61] (6.61,6.85] (6.85,7.09]
97 92 78 50 18 4 > which.max(table(interval))
(0.709,0.945]
4
> for(i in 3:5)
+ { for(j in 1:10)
+ { ds=dbscan(iris[,1:4],eps=i,MinPts=j)
+ print(ds)
+ }
+ }
dbscan Pts=150 MinPts=1 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=2 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=3 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=4 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=5 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=6 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=7 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=8 eps=3
1
seed 150
total 150
dbscan Pts=150 MinPts=9 eps=3
1
total 150
dbscan Pts=150 MinPts=10 eps=3 1
seed 150
total 150
dbscan Pts=150 MinPts=1 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=2 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=3 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=4 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=5 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=6 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=7 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=8 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=9 eps=4 1
seed 150
total 150
dbscan Pts=150 MinPts=10 eps=4 1
total 150
dbscan Pts=150 MinPts=1 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=2 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=3 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=4 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=5 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=6 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=7 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=8 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=9 eps=5
1
seed 150
total 150
dbscan Pts=150 MinPts=10 eps=5
1
seed 150
total 150
#30次dbscan的聚类结果
> ds5=dbscan(iris[,1:4],eps=3,MinPts=2)
> ds6=dbscan(iris[,1:4],eps=4,MinPts=5)
> ds7=dbscan(iris[,1:4],eps=5,MinPts=9)
> par(mfcol=c(1,3))
> plot(ds5,iris[,1:4],main="1: MinPts=2 eps=3")
> plot(ds6,iris[,1:4],main="3: MinPts=5 eps=4")
> plot(ds7,iris[,1:4],main="2: MinPts=9 eps=5")
# EM 期望最大化聚类
> install.packages("mclust")
> library(mclust)
> fit_EM=Mclust(iris[,1:4])
fitting ...
|===========================================================================| 100% > summary(fit_EM)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 2 components:
log.likelihood n df BIC ICL
-215.726 150 26 -561.7285 -561.7289
Clustering table:
1 2
50 100
> summary(fit_EM,parameters=TRUE)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 2 components:
log.likelihood n df BIC ICL
-215.726 150 26 -561.7285 -561.7289
Clustering table:
1 2
50 100
Mixing probabilities:
1 2
0.3333319 0.6666681
Means:
[,1] [,2]
Sepal.Length 5.0060022 6.261996
Sepal.Width 3.4280049 2.871999
Petal.Length 1.4620007 4.905992
Petal.Width 0.2459998 1.675997
Variances:
[,,1]
Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length 0.15065114 0.13080115 0.02084463 0.01309107 Sepal.Width 0.13080115 0.17604529 0.01603245 0.01221458 Petal.Length 0.02084463 0.01603245 0.02808260 0.00601568 Petal.Width 0.01309107 0.01221458 0.00601568 0.01042365
[,,2]
Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length 0.4000438 0.10865444 0.3994018 0.14368256 Sepal.Width 0.1086544 0.10928077 0.1238904 0.07284384 Petal.Length 0.3994018 0.12389040 0.6109024 0.25738990 Petal.Width 0.1436826 0.07284384 0.2573899 0.16808182
> plot(fit_EM)#对EM聚类结果作图Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection:(下面显示选项)
#选1
Selection: 0
> iris_BIC=mclustBIC(iris[,1:4])
fitting ...
|===========================================================================| 100%
> iris_BICsum=summary(iris_BIC,data=iris[,1:4])
> iris_BICsum #获取数1据集iris在各模型和类别数下的BIC值
Best BIC values:
VEV,2 VEV,3 VVV,2
BIC -561.7285 -562.5522369 -574.01783
BIC diff 0.0000 -0.8237748 -12.28937
Classification table for model (VEV,2):
1 2
50 100
> iris_BIC
Bayesian Information Criterion (BIC):
EII VII EEI VEI EVI VVI EEE
1 -1804.0854 -1804.0854 -1522.120
2 -1522.1202 -1522.1202 -1522.1202 -829.9782
2 -1123.4117 -1012.2352 -1042.9679 -956.282
3 -1007.3082 -857.5515 -688.0972
3 -878.7650 -853.814
4 -813.0504 -779.1566 -797.8342 -744.6382 -632.9647
4 -893.6140 -812.6048 -827.4036 -748.4529 -837.5452 -751.0198 -646.0258
5 -782.6441 -742.6083 -741.9185 -688.3463 -766.8158 -711.4502 -604.8131
6 -715.7136 -705.7811 -693.7908 -676.169
7 -774.0673 -707.2901 -609.8543
7 -731.8821 -698.5413 -713.1823 -680.7377 -813.5220 -766.6500 -632.4947
8 -725.0805 -701.4806 -691.4133 -679.4640 -740.4068 -764.1969 -639.2640
9 -694.5205 -700.0276 -696.2607 -702.0143 -767.8044 -755.8290 -653.0878 EVE VEE VVE EEV VEV EVV VVV
1 -829.978
2 -829.9782 -829.9782 -829.9782 -829.9782 -829.9782 -829.9782
2 -657.226
3 -656.3270 -605.1841 -644.5997 -561.7285 -658.3306 -574.0178
3 -666.5491 -605.3982 -636.4259 -644.7810 -562.5522 -656.0359 -580.8396
4 -705.543
5 -604.8371 -639.7078 -699.8684 -602.0104 -725.2925 -630.6000
5 -723.7199 NA -632.205
6 -652.2959 -634.2890 NA -676.6061
6 -661.949
7 -609.5584 -664.8224 -664.4537 -679.5116 NA -754.7938
7 -699.5102 NA -690.6108 -709.9530 -704.7699 -809.8276 -806.9277
8 -700.4277 -654.8237 -709.9392 -735.4463 -712.8788 -831.7520 -830.6373
9 -729.6651 NA -734.2997 -758.9348 -748.8237 -882.4391 -883.6931
Top 3 models based on the BIC criterion:
VEV,2 VEV,3 VVV,2
-561.7285 -562.5522 -574.0178
> par(mfcol=c(1,1))
> plot(iris_BIC,G=1:7,col="yellow")
> mclust2Dplot(iris[,1:2],
+ classification=iris_BICsum$classification,
+ parameters=iris_BICsum$parameters,col="yellow")
> iris_Dens=densityMclust(iris[,1:2])# 对每一个样本进行密度估计fitting ...
|===========================================================================| 100%
> iris_Dens
'densityMclust' model object: (VEV,2)
Available components:
[1] "call" "data" "modelName" "n"
[5] "d" "G" "BIC" "bic"
[9] "loglik" "df" "hypvol" "parameters"
[13] "z" "classification" "uncertainty" "density" > plot(iris_Dens,iris[,1:2],col="yellow",nlevels=55) ##输入1或2 Model-based density estimation plots:
1: BIC
2: density
Selection:(下面显示选项)
#选1
#选2
Selection: 0
> plot(iris_Dens,type = "persp",col = grey(0.8)) Model-based density estimation plots:
1: BIC
2: density
Selection:(下面显示选项)
#选1
#选2
Selection: 0。