This is a simple demo written in R
to compare performances of graph clustering algorithms used in the paper:
Congyuan Yang, Carey E. Priebe, Youngser Park, David J. Marchette,
Simultaneous Dimensionality and Complexity Model Selection for Spectral Graph Clustering,
Journal of Computational and Graphical Statistics, in revision, 2020.
To run a simple working live demo, please run these lines
if (!require(RCurl)) install.packages("RCurl")
library(RCurl)
script <- getURL("https://raw.githubusercontent.com/youngser/dhatkhat/master/R/demo.R", ssl.verifypeer = FALSE)
eval(parse(text = script))
and the result (each plot on its own window) should look like below.
(This may take around 10 minutes on a typical laptop.)
## required packages and sources
options(repos=structure(c(CRAN="http://cloud.r-project.org/")))
requiredPkg <- c("tidyverse", "igraph", "Matrix", "mclust", "irlba")
newPkg <- requiredPkg[!(requiredPkg %in% installed.packages()[, "Package"])]
if (length(newPkg)) install.packages(newPkg, dependencies = TRUE)
update.packages(newPkg, ask = FALSE)
invisible(sapply(requiredPkg, require, character.only = TRUE))
util.file <- url("http://www.cis.jhu.edu/~parky/dhatKhat/R/utils.R")
source(util.file)
close(util.file)
This is exactly the same setting as Figures 8 and 10 in the paper (with \(p = 0.09\)) except that the graph is smaller, e.g., \(n = 100\) as opposed to \(n = 500\) in the paper.
## parameters
n <- 100 # number of vertices
p <- 0.09 # a probability that between blocks can have edges
B <- cbind( c(.2, p), c(p, .1) ) # blockconnectivityprobabilitymatrix
rho <- c(.5, .5) # block membership probabilities, \pi in the paper
K <- length(rho) # number of blocks
Y <- rep(1:K, times=rho*n) # block membership vector, vertex labels
## generate a graph
set.seed(42+49182)
g <- sbm.game(n, pref.matrix = as.matrix(B),
block.sizes = rho*n, directed = F, loops = F); summary(g)
## IGRAPH 4a1aaa8 U--- 100 622 -- Stochastic block-model
## + attr: name (g/c), loops (g/l)
## embed graph
dmax <- 8 # maximum embedding dimension
ase <- embed.graph(A=g[,,sparse=F], dim=dmax, scaling = TRUE)
#dev.new()
elb <- getElbows(abs(ase$D), n=3, plot=TRUE); title("Scree plot with 3 suggested elbows")
(Note that it may take ~30 seconds to run on a typical personal computer.)
## run our algorithms
Kmax <- 4 # maximum number of clusters
bic <- ari <- rep(-Inf, dmax-1)
Khat <- rep(0, dmax-1)
out.SMS <- NULL; out.SMS$bic <- -Inf
for (d in 1 : (dmax - 1)) {
for (k in 1 : Kmax) {
# SMS_Reduced
mout.SMS_R <- bicGmmVVV(ase$X[, 1:d, drop=FALSE], k)
if (mout.SMS_R$bic > bic[d]) {
bic[d] <- mout.SMS_R$bic
ari[d] <- adjustedRandIndex(Y, mout.SMS_R$classification)
Khat[d] <- k
}
# SMS
mout.SMS <- bicGmmBlock2(ase$X, d, k)
if (mout.SMS$bic > out.SMS$bic) {
out.SMS <- mout.SMS
}
}
}
## calculate ARIs
## SMS
ari_SMS <- adjustedRandIndex(Y, out.SMS$classification)
dhat_SMS <- out.SMS$d
khat_SMS <- out.SMS$k
## SMS_Reduced
ari_SMS_R <- ari[dhat_SMS]
dhat_SMS_R <- dhat_SMS
khat_SMS_R <- khat_SMS
## ZG1
dhat_ZG1 <- min(elb[1], dmax-1)
ari_ZG1 <- ari[dhat_ZG1]
khat_ZG1 <- Khat[dhat_ZG1]
## ZG2
dhat_ZG2 <- min(elb[2], dmax-1)
ari_ZG2 <- ari[dhat_ZG2]
khat_ZG2 <- Khat[dhat_ZG2]
## ZG3
dhat_ZG3 <- min(elb[3], dmax-1)
ari_ZG3 <- ari[dhat_ZG3]
khat_ZG3 <- Khat[dhat_ZG3]
(Note that it may take ~600 seconds to run on a typical personal computer.)
## Louvain
result_Louvain <- cluster_louvain(g)
ari_Louvain <- adjustedRandIndex(Y, result_Louvain$membership)
khat_Louvain <- max(membership(result_Louvain))
## Walktrap
result_Rw <- cluster_walktrap(g)
ari_Walktrap <- adjustedRandIndex(Y, result_Rw$membership)
khat_Walktrap <- max(membership(result_Rw))
## IRM
if (!file.exists("RData/irm_out.RData")) {
system.time(result_IRM <- irm(g[,,sparse=FALSE], sweeps = 1000)) # ~10 min to run on a macbook
ari_IRM <- max(sapply(1:nrow(result_IRM), function(x) adjustedRandIndex(Y, result_IRM[x,])))
khat_IRM <- max(mode.irm(result_IRM))
dir.create("RData", showWarnings = FALSE)
save(result_IRM, ari_IRM, khat_IRM, file="RData/irm_out.RData")
} else {
load("RData/irm_out.RData")
}
algs <- c("SMS", "SMS_R", "ZG1", "ZG2", "ZG3", "Louvain", "Walktrap", "IRM")
df.out <- data.frame(method = factor(algs, levels=algs),
ARI = c(ari_SMS, ari_SMS_R, ari_ZG1, ari_ZG2, ari_ZG3,
ari_Louvain, ari_Walktrap, ari_IRM),
dhat = c(dhat_SMS, dhat_SMS_R, dhat_ZG1, dhat_ZG2, dhat_ZG3,
NA, NA, NA),
Khat = c(khat_SMS, khat_SMS_R, khat_ZG1, khat_ZG2, khat_ZG3,
khat_Louvain, khat_Walktrap, khat_IRM))
df.out
## method ARI dhat Khat
## 1 SMS 0.45694975 1 2
## 2 SMS_R 0.28486473 1 2
## 3 ZG1 0.28486473 1 2
## 4 ZG2 0.42984314 4 2
## 5 ZG3 0.00000000 6 1
## 6 Louvain 0.00650671 NA 6
## 7 Walktrap 0.05559344 NA 7
## 8 IRM 0.20046771 NA 7
p <- df.out %>% ggplot(aes(method, ARI, color=method, fill=method)) +
geom_col(alpha=0.5, show.legend = FALSE) +
geom_text(aes(label=paste0("hat(K) == ", Khat)), parse=TRUE, vjust=-0.2, show.legend = FALSE) +
theme(axis.title=element_text(size=15)) +
theme(axis.text.x=element_text(size=12)) +
# theme(axis.ticks = element_line(size = 5)) +
theme(axis.text.y=element_text(size=15)) +
theme(strip.text=element_text(size=rel(1.2))) +
theme(legend.text = element_text(colour="black", size = 15, face = "plain"))
#grid::grid.newpage()
print(p)