This is a simple demo written in R to compare performances of graph clustering algorithms used in the paper:

Congyuan Yang, Carey E. Priebe, Youngser Park, David J. Marchette,
Simultaneous Dimensionality and Complexity Model Selection for Spectral Graph Clustering,
Journal of Computational and Graphical Statistics, in revision, 2020.

To run a simple working live demo, please run these lines

if (!require(RCurl)) install.packages("RCurl")
library(RCurl)

script <- getURL("https://raw.githubusercontent.com/youngser/dhatkhat/master/R/demo.R", ssl.verifypeer = FALSE)
eval(parse(text = script))

and the result (each plot on its own window) should look like below.
(This may take around 10 minutes on a typical laptop.)

1 Install/Load required packages and utility functions

## required packages and sources
options(repos=structure(c(CRAN="http://cloud.r-project.org/")))
requiredPkg <- c("tidyverse", "igraph", "Matrix", "mclust", "irlba")
newPkg <- requiredPkg[!(requiredPkg %in% installed.packages()[, "Package"])]
if (length(newPkg)) install.packages(newPkg, dependencies = TRUE)
update.packages(newPkg, ask = FALSE)
invisible(sapply(requiredPkg, require, character.only = TRUE))

util.file <- url("http://www.cis.jhu.edu/~parky/dhatKhat/R/utils.R")
source(util.file)
close(util.file)

2 Set parameters

This is exactly the same setting as Figures 8 and 10 in the paper (with \(p = 0.09\)) except that the graph is smaller, e.g., \(n = 100\) as opposed to \(n = 500\) in the paper.

## parameters
n <- 100 # number of vertices
p <- 0.09 # a probability that between blocks can have edges
B <- cbind( c(.2, p), c(p, .1) ) # blockconnectivityprobabilitymatrix
rho <- c(.5, .5) # block membership probabilities, \pi in the paper
K <- length(rho) # number of blocks
Y <- rep(1:K, times=rho*n) # block membership vector, vertex labels

3 Generate graph

## generate a graph
set.seed(42+49182)
g <- sbm.game(n, pref.matrix = as.matrix(B),
             block.sizes = rho*n, directed = F, loops = F); summary(g)

## IGRAPH 4a1aaa8 U--- 100 622 -- Stochastic block-model
## + attr: name (g/c), loops (g/l)

#image(as_adjacency_matrix(g), main="Adjacency Matrix")
#dev.new(); 
plotA(g)

4 Embed graph

## embed graph
dmax <- 8 # maximum embedding dimension
ase <- embed.graph(A=g[,,sparse=F], dim=dmax, scaling = TRUE)
#dev.new()
elb <- getElbows(abs(ase$D), n=3, plot=TRUE); title("Scree plot with 3 suggested elbows")

5 Cluster graph

(Note that it may take ~30 seconds to run on a typical personal computer.)

## run our algorithms
Kmax <- 4 # maximum number of clusters
bic <- ari <- rep(-Inf, dmax-1)
Khat <- rep(0, dmax-1)
out.SMS <- NULL; out.SMS$bic <- -Inf
for (d in 1 : (dmax - 1)) {
  for (k in 1 : Kmax) {
    # SMS_Reduced
    mout.SMS_R <- bicGmmVVV(ase$X[, 1:d, drop=FALSE], k)
    if (mout.SMS_R$bic > bic[d]) {
      bic[d] <- mout.SMS_R$bic
      ari[d] <- adjustedRandIndex(Y, mout.SMS_R$classification)
      Khat[d] <- k
    }

    # SMS
    mout.SMS <- bicGmmBlock2(ase$X, d, k)
    if (mout.SMS$bic > out.SMS$bic) {
      out.SMS <- mout.SMS
    }
  }
}

## calculate ARIs
## SMS
ari_SMS <- adjustedRandIndex(Y, out.SMS$classification)
dhat_SMS <- out.SMS$d
khat_SMS <- out.SMS$k

## SMS_Reduced
ari_SMS_R <- ari[dhat_SMS]
dhat_SMS_R <- dhat_SMS
khat_SMS_R <- khat_SMS

## ZG1
dhat_ZG1 <- min(elb[1], dmax-1)
ari_ZG1 <- ari[dhat_ZG1]
khat_ZG1 <- Khat[dhat_ZG1]

## ZG2
dhat_ZG2 <- min(elb[2], dmax-1)
ari_ZG2 <- ari[dhat_ZG2]
khat_ZG2 <- Khat[dhat_ZG2]

## ZG3
dhat_ZG3 <- min(elb[3], dmax-1)
ari_ZG3 <- ari[dhat_ZG3]
khat_ZG3 <- Khat[dhat_ZG3]

6 Try other algorithms

(Note that it may take ~600 seconds to run on a typical personal computer.)

## Louvain
result_Louvain <- cluster_louvain(g)
ari_Louvain <- adjustedRandIndex(Y, result_Louvain$membership)
khat_Louvain <- max(membership(result_Louvain))

## Walktrap
result_Rw <- cluster_walktrap(g)
ari_Walktrap <- adjustedRandIndex(Y, result_Rw$membership)
khat_Walktrap <- max(membership(result_Rw))

## IRM
if (!file.exists("RData/irm_out.RData")) {
  system.time(result_IRM <- irm(g[,,sparse=FALSE], sweeps = 1000)) # ~10 min to run on a macbook
  ari_IRM <- max(sapply(1:nrow(result_IRM), function(x) adjustedRandIndex(Y, result_IRM[x,])))
  khat_IRM <- max(mode.irm(result_IRM))
  dir.create("RData", showWarnings = FALSE)
  save(result_IRM, ari_IRM, khat_IRM, file="RData/irm_out.RData")
} else {
  load("RData/irm_out.RData")
}

7 Print/Plot Output

algs <- c("SMS", "SMS_R", "ZG1", "ZG2", "ZG3", "Louvain", "Walktrap", "IRM")
df.out <- data.frame(method = factor(algs, levels=algs),
                     ARI = c(ari_SMS, ari_SMS_R, ari_ZG1, ari_ZG2, ari_ZG3,
                             ari_Louvain, ari_Walktrap, ari_IRM),
                     dhat = c(dhat_SMS, dhat_SMS_R, dhat_ZG1, dhat_ZG2, dhat_ZG3,
                              NA, NA, NA),
                     Khat = c(khat_SMS, khat_SMS_R, khat_ZG1, khat_ZG2, khat_ZG3,
                              khat_Louvain, khat_Walktrap, khat_IRM))
df.out

##     method        ARI dhat Khat
## 1      SMS 0.45694975    1    2
## 2    SMS_R 0.28486473    1    2
## 3      ZG1 0.28486473    1    2
## 4      ZG2 0.42984314    4    2
## 5      ZG3 0.00000000    6    1
## 6  Louvain 0.00650671   NA    6
## 7 Walktrap 0.05559344   NA    7
## 8      IRM 0.20046771   NA    7

p <- df.out %>% ggplot(aes(method, ARI, color=method, fill=method)) +
  geom_col(alpha=0.5, show.legend = FALSE) +
  geom_text(aes(label=paste0("hat(K) == ", Khat)), parse=TRUE, vjust=-0.2, show.legend = FALSE) +
  theme(axis.title=element_text(size=15)) +
  theme(axis.text.x=element_text(size=12)) +
  #  theme(axis.ticks = element_line(size = 5)) +
  theme(axis.text.y=element_text(size=15)) +
  theme(strip.text=element_text(size=rel(1.2))) +
  theme(legend.text = element_text(colour="black", size = 15, face = "plain"))
#grid::grid.newpage()
print(p)

Simultaneous Dimensionality and Complexity Model Selection for Spectral Graph Clustering

Department of Applied Mathematics and Statistics, Johns Hopkins University

2020-05-16