聚类分析的主要应用 (Main applications of cluster analysis):

Objective function
Parameter setting
Performance
> library(datasets)
> str(USArrests)
'data.frame': 50 obs. of 4 variables:
$ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
> row.names(USArrests)
[1] "Alabama" "Alaska" "Arizona" "Arkansas" "California" "Colorado"
[7] "Connecticut" "Delaware" "Florida" "Georgia" "Hawaii" "Idaho"
[13] "Illinois" "Indiana" "Iowa" "Kansas" "Kentucky" "Louisiana"
[19] "Maine" "Maryland" "Massachusetts" "Michigan" "Minnesota" "Mississippi"
[25] "Missouri" "Montana" "Nebraska" "Nevada" "New Hampshire" "New Jersey"
[31] "New Mexico" "New York" "North Carolina" "North Dakota" "Ohio" "Oklahoma"
[37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina" "South Dakota" "Tennessee"
[43] "Texas" "Utah" "Vermont" "Virginia" "Washington" "West Virginia"
[49] "Wisconsin" "Wyoming"
> sum(!complete.cases(USArrests))
[1] 0
> summary(USArrests)
Murder Assault UrbanPop Rape
Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
Median : 7.250 Median :159.0 Median :66.00 Median :20.10
Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
> df <- na.omit(USArrests)
> df <- scale(df, center = T, scale = T)
> summary(df)
Murder Assault UrbanPop Rape
Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
> library(ggplot2)
> library(factoextra)
> distance <- get_dist(df, method = "euclidean")
> fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
> km_output <- kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
> str(km_output)
List of 9
$ cluster : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ centers : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "1" "2"
.. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
$ totss : num 196
$ withinss : num [1:2] 46.7 56.1
$ tot.withinss: num 103
$ betweenss : num 93.1
$ size : int [1:2] 20 30
$ iter : int 1
$ ifault : int 0
- attr(*, "class")= chr "kmeans"
> km_output$totss
[1] 196
> km_output$withinss
[1] 46.74796 56.11445
> km_output$betweenss
[1] 93.1376
> sum(c(km_output$withinss, km_output$betweenss))
[1] 196
# Visualize the 2-cluster k-means solution in PCA space
fviz_cluster(km_output, data = df)

# Join the cluster labels onto the US state map and draw a choropleth.
# NOTE: `%>%` and left_join() come from dplyr, which the original never
# loaded — map_data() needs the maps package, the join needs dplyr.
library(dplyr)
cluster_df <- data.frame(state = tolower(row.names(USArrests)),
                         cluster = unname(km_output$cluster))
library(maps)
states <- map_data("state")
states %>%
  left_join(cluster_df, by = c("region" = "state")) %>%
  ggplot() +
  geom_polygon(
    aes(
      x = long,
      y = lat,
      fill = as.factor(cluster),
      group = group  # keeps multi-polygon states from being connected
    ),
    color = "white") +
  coord_fixed(1.3) +
  guides(fill = "none") +  # was `fill = F`: never use T/F, and logical guides are deprecated
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()
  )
set.seed(8)
# Total within-cluster sum of squares for a given number of clusters k.
# `data` defaults to the scaled matrix `df` built earlier, but may be
# supplied explicitly so the helper is reusable in isolation.
# Returns a single numeric value (kmeans' tot.withinss).
wss <- function(k, data = df) {
  # last expression is returned; explicit return() is non-idiomatic here
  kmeans(data, k, nstart = 25)$tot.withinss
}
# Elbow plot: total within-cluster SS for k = 1..15.
k_values <- 1:15
# base vapply() replaces purrr::map_dbl — no extra dependency, and the
# numeric(1) template guarantees a numeric vector result.
wss_values <- vapply(k_values, wss, numeric(1))
plot(x = k_values,
     y = wss_values,
     type = "b", frame = FALSE,  # spell out FALSE; T/F can be reassigned
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of square")
# Hierarchical agglomerative clustering with complete linkage.
# NOTE(review): this clusters the RAW USArrests while kmeans above used the
# scaled `df`, so Assault's large scale dominates the distances — confirm
# whether the unscaled data was intended.
hac_output <- hclust(dist(USArrests, method = "euclidean"), method = "complete")
plot(hac_output)
# Cut the tree into 2 groups to match the 2-cluster k-means solution.
# `hac_cut` is used in the comparison loop below but was never defined
# in the original script — this line supplies the missing definition.
hac_cut <- cutree(hac_output, k = 2)
> for (i in 1:length(hac_cut)){
+ if(hac_cut[i] != km_output$cluster[i]) print(names(hac_cut)[i])
+ }
[1] "Colorado"
[1] "Delaware"
[1] "Georgia"
[1] "Missouri"
[1] "Tennessee"
[1] "Texas"
#install.packages("ggplot2")
#install.packages("factoextra")
# Load the package (factoextra provides the fviz_* clustering helpers)
library(factoextra)
# Load the built-in USArrests data set
data("USArrests")
# Standardize the data (each column centered to mean 0, scaled to sd 1)
df <- scale(USArrests)
# Inspect the first five rows of the scaled data
head(df, n = 5)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144 1.7589234 2.067820292
# Determine the optimal number of clusters (elbow / within-SS method)
fviz_nbclust(df, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2)
# From the plot, pick the point where the slope stops changing noticeably
# as the best number of clusters; four clusters looks most appropriate here.
# Set the random seed so the experiment is reproducible
set.seed(123)
# Cluster with k-means (4 centers, 25 random starts — see nstart below)
km_result <- kmeans(df, 4, nstart = 24)
# Look at the clustering results
print(km_result)
# Extract the cluster labels and merge them with the original data
dd <- cbind(USArrests, cluster = km_result$cluster)
head(dd)
Murder Assault UrbanPop Rape cluster
Alabama 13.2 236 58 21.2 4
Alaska 10.0 263 48 44.5 3
Arizona 8.1 294 80 31.0 3
Arkansas 8.8 190 50 19.5 4
California 9.0 276 91 40.6 3
Colorado 7.9 204 78 38.7 3
# Count the number of observations in each cluster
table(dd$cluster)
1 2 3 4
13 16 13 8
# Visualize the k-means clusters in the first two principal components
fviz_cluster(km_result, data = df,
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "euclid",
star.plot = TRUE,
repel = TRUE,
ggtheme = theme_minimal()
)
# First compute the pairwise (Euclidean) distances between samples
result <- dist(df, method = "euclidean")
# Build the hierarchical clustering tree (Ward's criterion)
result_hc <- hclust(d = result, method = "ward.D2")
# Preliminary dendrogram display
fviz_dend(result_hc, cex = 0.6)
# Cut into 4 groups and color the dendrogram accordingly
fviz_dend(result_hc, k = 4,
cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE,
rect = TRUE
)
# Load libraries
library(ggplot2)
library(factoextra)
# Load the built-in USArrests data
data("USArrests")
# Standardize the data: mean 0 / sd 1 per column, so Assault's large
# scale does not dominate the Euclidean distances.
df <- scale(USArrests)
# Top 10 rows of the scaled data
head(df, n = 10)
# Determining the optimal number of clusters (elbow / wss method).
# Seed BEFORE fviz_nbclust too: it runs kmeans() internally with random
# starts, so without a seed the elbow curve is not reproducible.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2)
# From the plot, the best number of clusters is where the gradient stops
# changing noticeably; four clusters looks most appropriate.
# Re-set the seed immediately before kmeans so km_result is identical
# whether or not the elbow step above was executed.
set.seed(123)
# Clustering by k-means
km_result <- kmeans(df, 4, nstart = 24)
# Look at some of the results of clustering
print(km_result)
# Extract class labels and merge them with the original data
dd <- cbind(USArrests, cluster = km_result$cluster)
head(dd)
# View the number of observations in each cluster
table(dd$cluster)
# Visual display of the clusters in PCA space
fviz_cluster(km_result, data = df,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
             ellipse.type = "euclid",
             star.plot = TRUE,
             repel = TRUE,
             ggtheme = theme_minimal()
)
# Pairwise Euclidean distances between samples
result <- dist(df, method = "euclidean")
# Generating the hierarchy (Ward's criterion)
result_hc <- hclust(d = result, method = "ward.D2")
# Preliminary display of the dendrogram
fviz_dend(result_hc, cex = 0.6)
# According to this graph it is convenient to decide how many groups to
# keep; here we cut into four categories and display them visually.
fviz_dend(result_hc, k = 4,
          cex = 0.5,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
          color_labels_by_k = TRUE,
          rect = TRUE
)
Option 1:
Pick a dataset of your choice, apply both the K-means and HAC algorithms to identify the underlying cluster structures, and compare the difference between the two outputs. (If you are using a labeled dataset, you can also evaluate the performance of the cluster assignments by comparing them to the true class labels.)
Submit your R codes with the cluster assignment outputs.
Option 2:
Identify a successful real world application of cluster analysis algorithm and discuss how it works. Please include a discussion of your understanding of clustering model, and how clustering model helps discover hidden data patterns for the application. The application could be in any kind of industries: banking, insurance, education, public administration, technology, healthcare management, etc.
Submit your essay which should be no less than 300 words
一些相关资料
因篇幅问题不能全部显示,请点此查看更多更全内容
怀疑对方AI换脸可以让对方摁鼻子 真人摁下去鼻子会变形
女子野生动物园下车狼悄悄靠近 后车司机按喇叭提醒
睡前玩8分钟手机身体兴奋1小时 还可能让你“变丑”
惊蛰为啥吃梨?倒春寒来不来就看惊蛰
男子高速犯困开智能驾驶出事故 60万刚买的奔驰严重损毁