if (!require(pacman)) {
install.packages("pacman")
}
if (!require(devtools)) {
install.packages("devtools")
}
if (!require(gmmase)) {
devtools::install_github("youngser/gmmase")
}
pacman::p_load(igraph, irlba, Matrix, lubridate, scales, vegan, broom, tidyverse, ggrepel, doParallel)
registerDoParallel(detectCores()/2)
theme_update(axis.text = element_text(size=12),
strip.text = element_text(size = 12, face="bold"),
axis.title = element_text(size=14, face="bold"))
gg_color_hue <- function(n, l=65, c=100) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = l, c = c)[1:n]
}
rdpg.sample <- function(X, rdpg_scale=FALSE) {
P <- X %*% t(X)
if (rdpg_scale) {
P <- scales::rescale(P)
}
n <- nrow(P)
U <- matrix(0, nrow = n, ncol = n)
U[col(U) > row(U)] <- runif(n*(n-1)/2)
U <- (U + t(U))
diag(U) <- runif(n)
A <- (U < P) + 0 ;
diag(A) <- 0
return(graph.adjacency(A,"undirected"))
}
full.ase <- function(A, d, diagaug=TRUE, doptr=FALSE) {
# require(irlba)
# doptr
if (doptr) {
g <- ptr(A)
A <- g[]
} else {
A <- A[]
}
# diagaug
if (diagaug) {
diag(A) <- rowSums(A) / (nrow(A)-1)
}
A.svd <- irlba(A,d)
Xhat <- A.svd$u %*% diag(sqrt(A.svd$d))
Xhat.R <- NULL
if (!isSymmetric(A)) {
Xhat.R <- A.svd$v %*% diag(sqrt(A.svd$d))
}
return(list(eval=A.svd$d, Xhat=Matrix(Xhat), Xhat.R=Xhat.R))
}
procrustes2 <- function(X, Y) {
tmp <- t(X) %*% Y
tmp.svd <- svd(tmp)
W <- tmp.svd$u %*% t(tmp.svd$v)
newX <- X %*% W
return(list(newX = newX, error = norm(newX-Y, type="F"), W = W))
}
Avanti Athreya, Zachary Lubberts, Youngser Park, and Carey Priebe, “Discovering underlying dynamics in time series of networks”, submitted, 2022.
1 Abstract
Analyzing changes in network evolution is central to statistical network inference, as underscored by recent challenges of predicting and distinguishing pandemic-induced transformations in organizational and communication networks. We consider a joint network model in which each node has an associated time-varying low-dimensional latent vector of feature data, and connection probabilities are functions of these vectors. Under mild assumptions, the time-varying evolution of the latent vectors exhibits low-dimensional manifold structure under a suitable notion of distance. This distance can be approximated by a measure of separation between the observed networks themselves, and there exist Euclidean representations for underlying network structure, as characterized by this distance, at any given time. These Euclidean representations, called Euclidean mirrors, permit the visualization of network evolution and transform network inference questions such as change-point and anomaly detection into a classical setting. We illustrate our methodology with real and synthetic data, and identify change points corresponding to massive shifts in pandemic policies in a communication network of a large organization.
2 MSFT TSG Data
We consider a time series of weighted communication networks, arising from the email communications between 32277 entities in a large organization, with one network being generated each month from January 2019 to December 2020, a period of 24 months. We obtain a clustering by applying Leiden clustering to the January 2019 network, obtaining 33 clusters that we retain throughout the two year period. We make use of this clustering to compute the Graph Encoder Embedding (GEE, which produces spectrally-derived estimates of invertible transformations of the original latent positions. For each time \(t=1, \cdots, 24\), we obtain a matrix \(\hat{Z}_t\in\mathbb{R}^{32277\times 33}\), each row of which provides an estimate of these transformed latent positions. Constructing the distance matrix \(\mathcal{D}_{\hat{\psi}}=[\hat{d}_{MV}(\hat{Z}_t,\hat{Z}_{t'})]\in\mathbb{R}^{24\times24}\), we apply CMDS to obtain the estimated curve \(\hat{\psi}\), where the choice of dimension \(c=2\) is based on the scree plot of \(\mathcal{D}_{\hat{\psi}}\). The nonlinear dimensionality reduction technique ISOMAP, which relies on a spectral decomposition of geodesic distances, can be applied to these points to extract an estimated 1-dimensional curve. This one-dimensional curve exhibits some changes from the previous trend in Spring 2020 and a much sharper qualitative transformation in July 2020. What is striking is that both these qualitative shifts correspond to policy changes: in Spring 2020, there was an initial shift in operations, widely regarded at the time as temporary. In mid-summer 2020, nearly the peak of the second wave of of COVID-19, it was much clearer that these organizational shifts were likely permanent, or at least significantly longer-lived.
- take MSFT OrgSci tsg, weighted (noptr’ed), symmetric, hollow, connected.
- use JL’s root partition (into 33 classes).
- run AEE, which yields 24 x 32277 x 33 embeddings.
- then do our manifold learning procedure, i.e.,
- get 24 x 24 of D via mean(L2_norm)^2 o AEE. note d_aee = 33.
- run mds(sqrt(D)), find dhat_mds = 2.
- run isomap(mds), find dhat_isomap = 1.
Fig 1. Evidence of structural network dissimilarity across time. Visualizations are of anonymized and aggregated Microsoft communications networks in April and July 2020, with two communities highlighted. During this time period, our analysis demonstrates that the network as a whole experiences a major structural shock coincident with the pandemic work-from-home order. However, this shock is experienced by all network communities: the green community highlighted here experiences just a constant drift, while the blue community experiences a major shock. (See Figure 2.3, wherein the overall network behavior, as well as the two highlighted communities with their different temporal behavior, are depicted.) The figures here are two-dimensional renderings of temporal snapshots of a large (n=32277) complex network; hence, any conclusions based on this visualization alone are entirely notional.
load("~/Dropbox/OrgSci/df.vY.RData")
Yhat <- unlist(df.v$Y)
if (min(Yhat) == 0) Yhat <- Yhat + 1 # fix 0-base
Laplacian <- params$Laplacian
Correlation <- params$Correlation
if (params$rerun) {
source("~/Dropbox/RFiles/GraphEncoder.R")
if (FALSE) {
renameV <- function(g, df.v)
{
el <- igraph::as_data_frame(g, what = "edges") %>% as_tibble()
el <- el %>% select(1:3) %>%
mutate(from2 = map_int(from, function(x) df.v$i[match(x, df.v$v)]),
to2 = map_int(to, function(x) df.v$i[match(x, df.v$v)])) %>%
select(from=from2, to=to2, weight)
el <- el %>% distinct_all() %>% as.matrix()
}
load(paste0("~/Dropbox/OrgSci/df.lcc-",params$weighted,"-",params$ptr,"-emb.RData"))
pacman::p_load(furrr, wordspace)
plan(multisession, workers = 24)
opts <- furrr_options(packages=c("igraph", "tidyverse"))
system.time(df.gb2 <- future_map(df.gb$tb, function(x) renameV(x, df.v), .options = opts, .progress=TRUE))
df.gb3 <- tibble(date=df.gb$date, el=df.gb2)
save(df.gb3, file="df.gb-el.RData")
} else {
load("df.gb-el.RData")
}
system.time(df.gb3 <- df.gb3 %>% mutate(aee = map(el, ~GraphEncoder(., Yhat,
Laplacian=Laplacian,
Correlation=Correlation))))
save(df.gb3, file=paste0("df.gb3-el-aee-L",Laplacian,"-C",Correlation,".RData")) # weighted, noptr
} else {
load(paste0("df.gb3-el-aee-L",Laplacian,"-C",Correlation,".RData")) # weighted, noptr
}
n <- max(df.gb3$el[[1]][,1:2])
m <- nrow(df.gb3)
registerDoParallel(m)
getD <- function(Xlist, k=0) {
if (k==0) {
ind <- 1:n
} else {
ind <- which(Yhat==k)
}
comb <- combn(m,2)
Dout <- foreach (k = 1:ncol(comb), .combine='rbind') %dopar% {
i <- comb[1,k]
j <- comb[2,k]
cat("i = ", i, ", j = ", j, "\n")
Xhati <- Xlist[[i]]$Z[ind,] # 32277 x 33
Xhatj <- Xlist[[j]]$Z[ind,]
D <- norm(Xhati - Xhatj, type="2")^2/n
tibble(i=i, j=j, D=D)
}
D2 <- matrix(0,m,m)
D2[t(comb)] <- Dout$D
D2 <- (D2 + t(D2)) / 1
#as.dist(D2)
D2 <- sqrt(D2)
D2
}
D2 <- getD(df.gb3$aee)
doMDS <- function(D, type = "AEE", doplot=FALSE)
{
if (doplot) print(image(Matrix(D), colorkey=T, sub="", main="", xlab="time_j", ylab="time_i")) #paste0("sqrt(D) o ", type)))
mds <- cmdscale(D, m-1)
df.mds <- tibble(date=df.gb3$date, x=mds[,1], y=mds[,2], z=mds[,3], w=mds[,4])
if (df.mds$y[1] > 0) df.mds$y <- -df.mds$y
if (doplot) plot(apply(mds,2,sd), type="b", main="", xlab="dimension", ylab="column stdev")
p1 <- df.mds %>% ggplot(aes(x=ym(date), y=x, color=date, group=1)) +
geom_point(size=3) + geom_line() +
theme(legend.position = "none") + labs(x="date",y="mds1") #+
if (doplot) print(p1)
p2 <- df.mds %>% ggplot(aes(x=x, y=y, color=date)) +
geom_point(size=3) +
geom_label_repel(aes(label=date), size=2) + #, max.overlaps = 20) +
theme(legend.position = "none") + labs(x="mds1",y="mds2") #+
if (doplot) print(p2)
return(list(mds=mds, df.mds=df.mds))
}
embedtype <- ifelse(params$Laplacian, "LEE", "AEE")
df.mds <- doMDS(D2, embedtype, doplot=TRUE)
mds <- df.mds$mds
df.mds <- df.mds$df.mds
Figure 2.1: Top two dimensions from multidimensional scaling of pairwise network distance matrix for anonymized and aggregated Microsoft communications networks from January 2019 to December 2020, demonstrating that individual networks follow a curve that progresses smoothly until spring 2020 and then exhibits a major shock.
knn = 3 # large enough to be connected
isod <- 1
# Screeplot of Isomap
if (Correlation==TRUE) {
dis21 <- vegdist(mds, "euclidean")
isoD = isomap(dis21, k=knn, ndim=m-1, path="shortest")$points
plot(apply(isoD,2,sd), type="b", main="", xlab="dimension", ylab="column stdev")
}
doIso <- function(mds, mdsd=2, isod=1, doplot=FALSE, dolm=FALSE)
{
df.iso <- NULL
dis <- vegdist(mds[,1:mdsd], "euclidean")
knn <- 1
success <- FALSE
while(!success) {
tryCatch({
iso = isomap(dis, k=knn, ndim=isod, path="shortest")$points
success <- TRUE
},
error = function(e) {
knn <<- knn + 1
})
}
iso2 <- tibble(iso=iso[,1]) %>% mutate(i=1:nrow(mds), date=df.gb3$date, knn=knn)
df.iso <- rbind(df.iso, cbind(iso2, mdsd=mdsd))
df.iso <- df.iso %>% group_by(mdsd) %>% mutate(iso = if(iso[1] > 0) {-iso} else {iso}) %>% ungroup()
p <- df.iso %>% filter(mdsd==mdsd) %>%
ggplot(aes(x=ym(date), y=iso, color=date, group=1)) +
geom_point(size=3) + geom_line() +
theme(legend.position = "none") +
labs(x="time", y="iso-mirror") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short()) +
theme(axis.text.x=element_text(hjust=0.7))
if (doplot) print(p)
if (dolm) {
df.isok <- df.iso %>% filter(mdsd==mdsd) %>% mutate(date2 = format(ymd(paste0(date,"-01")),"%m/%y"))
row.names(df.isok) <- df.isok$date2
fit <- lm(iso ~ i, data=df.isok)
myfor <- augment(fit)
myfor2 <- myfor %>% mutate(date=.rownames,
ranks = rank(.sigma),
mycol=sprintf("%2d",rank(.fitted)))
p <- myfor2 %>%
ggplot(aes(.fitted, .resid)) +
geom_point(aes(color=mycol)) +
geom_hline(yintercept = 0, linetype="dashed", color="grey") +
geom_smooth(method="loess", se=FALSE) +
labs(x="Fitted Values", y="Residuals") +
theme(legend.position = "none",
axis.title = element_text(size=14, face="bold"))
p <- p + geom_text_repel(aes(label=date), data=myfor2 %>% filter(ranks %in% 1:3))
print(p)
}
return(df.iso)
}
df.iso <- doIso(mds, mdsd=2, doplot=TRUE)
Figure 2.2: ISOMAP manifold learning representation and provides clear and concise anomaly and changepoint information.
Khat <- max(Yhat)
nk <- table(Yhat)
df.K <- NULL
df.K <- foreach (k = 0:Khat, .combine='rbind') %dopar% {
# cat("k = ", k, "\n")
Dk <- getD(df.gb3$aee, k)
df.mds.k <- doMDS(Dk, embedtype)
mds.k <- df.mds.k$mds
doIso(mds.k, mdsd=2) %>% mutate(k=k, nk=ifelse(k==0, n, nk[k]))
}
pk <- df.K %>% filter(k!=0) %>% #mutate(labk=paste0(k," (",nk,")")) %>%
ggplot(aes(x=ym(date), y=iso, group=k)) +
geom_line(color="grey50", alpha=0.2) +
labs(x="time", y="iso-mirror") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short()) +
theme(legend.position = "none") +
theme(axis.text.x=element_text(hjust=0.7))
labK <- paste0(1:Khat, " (", as.numeric(table(Yhat)),")")
names(labK) <- 1:Khat
df.slope <- df.K %>% filter(k!=0) %>%
group_by(k) %>% summarize(slope=lm(iso~i)$coefficient[2]) %>%
summarize(flat=which.min(slope), steep=which.max(slope))
pk2 <- pk + geom_line(data=df.K %>% filter(k %in% as.numeric(df.slope)), size=1)
df.bent <- df.K %>% group_by(k) %>% summarize(bent = max(diff(iso))) %>%
filter(k!=0) %>%
summarize(flat=which.min(bent), steep=which.max(bent))
df.slope[2] <- df.bent[2]
df.slope <- c(19,14)
mycol <- gg_color_hue(Khat)[df.slope]
df.Kslope <- df.K %>% filter(k %in% df.slope) %>% mutate(col=rep(rev(mycol), each=m))
pk2 <- pk + geom_line(data=df.Kslope, size=1.2, color=df.Kslope$col)
df.iso0 <- df.K %>% filter(k==0)
pk2 + geom_line(aes(color=date, group=1), data=df.iso0, size=1.1) +
geom_point(aes(color=date, group=1), data=df.iso0, size=3)
Figure 2.3: Differential pandemic effect on 33 subcommunities of the larger organization. For each subcommunity, we apply our methods to obtain an ISOMAP representation, yielding a single grey curve. The green and blue highlighted curves correspond to the communities highlighted in Figure 1 which exhibit constant drift over the two years (green) versus major shock during the pandemic (blue). The overall network ISOMAP embedding from Figure 2.1 is overlaid for context.
df.isok <- df.iso
tvec <- 5:(m-1)
getSigmage <- function(df) {
df.fit <- NULL
for (j in tvec) {
fit <- lm(iso ~ i, data=df %>% filter(i %in% 1:j))
newdat <- df %>% filter(i==j+1)
pred <- predict(fit, newdat, se.fit=T)
df.sig <- newdat %>% mutate(pred.iso=as.numeric(pred$fit),
diff=as.numeric(abs(iso - pred$fit)),
pred.se=pred$se.fit) %>%
mutate(sigmage = diff/pred.se)
df.fit <- rbind(df.fit, df.sig)
}
df.fit
}
df.fit <- getSigmage(df.isok)
df.fit2 <- full_join(df.isok, df.fit)
df.fit2 %>% mutate(sigmages=diff/pred.se) %>%
ggplot(aes(x=ym(date), y=sigmages, color=date, group=1)) +
geom_point() + geom_line() +
geom_hline(yintercept = 5, linetype=2) +
labs(x="time") +
theme(legend.position = "none") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short()) +
theme(axis.text.x=element_text(hjust=0.7))
p <- df.fit2 %>% select(date, iso, pred.iso) %>%
gather(key="type", value="value", -date) %>%
ggplot(aes(x=ym(date), y=value, color=type)) +
geom_point(size=2) + geom_line() +
labs(x="time", y="iso-mirror") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short()) +
theme(axis.text.x=element_text(hjust=0.7))
df.rib <- df.fit %>% select(date, value=pred.iso, se=pred.se) %>% mutate(type="pred.iso")
df.rib <- rbind(df.rib, tibble(date=df.rib$date, value=df.fit$iso, se=0, type="iso"))
mycol <- gg_color_hue(2)
th <- 5
p + geom_ribbon(data=df.rib, aes(ymin=value-th*se, ymax=value+th*se, fill=type), alpha=0.2, color=NA) +
theme(legend.position = c(0.15,0.9), legend.title=element_blank(), legend.background = element_rect("grey95"))
Figure 2.4: Changepoint detection. Left panel: Comparing the ISOMAP embedding at a given time to the mean of the previous 5 months’ emeddings, measured in terms of their standard deviation. The plot of the sigmages, indicates clear outliers in March-April of 2020, and again in July-September. Right panel: We plot the observed ISOMAP embedding along with a running confidence interval for the predicted value of the ISOMAP embedding generated from linear regression applied to the embeddings for the previous 5 months, with width 5 standard deviations. Once again, we detect the changepoints in Spring and Summer of 2020.
3 Synthetic Data
We use real data to obtain a distribution from which we may resample. Such a network bootstrap permits us to test our asymptotic results through replicable simulations that are grounded in actual data. To this end, we consider the true latent position distribution at each time to be equally likely to be any row of the GEE-obtained estimates from the real data, \(\hat{Z}_t\in\mathbb{R}^{32277\times 33}\), for \(t=1,\ldots,24.\) Given a sample size \(n_s\), for each time, we sample these rows uniformly and with replacement to get a matrix of latent positions \(X_t\in\mathbb{R}^{n_s\times 33}\). We treat this matrix as the generating latent position matrix for independent adjacency matrices \(A_t\sim\mathrm{RDPG}(X_t)\). Note that if for sample \(i\), we choose row \(j\) of \(\hat{Z}_1\) at time \(t=1\), then the same row \(j\) of \(\hat{Z}_t\) will be used for all times \(t=1,\ldots,24\) for that sample, so that the original dependence structure is preserved. We may now apply the methods described in our theorems, namely ASE of the adjacency matrices followed by Procrustes alignment, to obtain the estimates \(\hat{X}_t\), along with the associated distance estimates.
- take 32277 x 33 AEE embedding for each time,
- given sample size \(n_s\), sample rows (with replacement) for each time (use the same sampled rows for all time),
- generate a tsg of RDPG’s,
- run ASE into d=33, do procrustes to align them,
- then do our manifold learning procedure, i.e., isomap o cmds o D.
Figure 3.1: Pandemic effect recovered from synthetic data. For each of 100 replicates of bootstrapped data, with ns = 30000 for each replicate, we repeat the procedure of Figure 2.4 left panel. We then plot these sigmages using a box and whisker plot. The pandemic effect in summer of 2020 is clearly visible in all but a few replicates, while the effect in March-April is still identified in the majority of replicates.
load("~/Dropbox/AEE/df.fit2-aee.RData")
load(paste0("~/Dropbox/OrgSci/df.iso-rdpg-aee-ns1000-mc1.RData"))
df.fit2 <- df.fit2 %>% mutate(iso1000 = df.iso$iso)
load(paste0("~/Dropbox/OrgSci/df.iso-rdpg-aee-ns10000-mc1.RData"))
df.fit2 <- df.fit2 %>% mutate(iso10000 = df.iso$iso)
load(paste0("~/Dropbox/OrgSci/df.iso-rdpg-aee-ns20000-mc1.RData"))
df.fit2 <- df.fit2 %>% mutate(iso20000 = df.iso$iso)
load(paste0("~/Dropbox/OrgSci/df.iso-rdpg-aee-ns30000-mc1.RData"))
df.fit2 <- df.fit2 %>% mutate(iso30000 = df.iso$iso)
p <- df.fit2 %>% select(date, iso, pred.iso) %>%
gather(key="type", value="value", -date) %>%
ggplot(aes(x=ym(date), y=value, color=type)) +
geom_point(size=2) + geom_line() +
labs(x="time", y="iso-mirror") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short())
mycol <- gg_color_hue(5)
df.fit2 %>% select(date, iso, iso1000, iso10000, iso20000, iso30000) %>%
gather(key="type", value="value", -date) %>%
ggplot(aes(x=ym(date), y=value, color=type)) +
geom_point(size=2) + geom_line() +
scale_color_manual(labels = c(expression(true~(n==32277)),
expression(bootstrap~(n[s]==1000)),
expression(bootstrap~(n[s]==10000)),
expression(bootstrap~(n[s]==20000)),
expression(bootstrap~(n[s]==30000))),
values=gg_color_hue(5)) +
labs(x="time", y="iso-mirror") +
scale_x_date(breaks = scales::breaks_pretty(8), labels=label_date_short()) +
theme(legend.position = c(0.25,0.8), legend.title=element_blank(), legend.background = element_rect("grey92"), # 95
axis.text.x=element_text(hjust=0.7),
legend.text.align = 0)
Figure 3.2: Convergence of bootstrapped estimates. We collect bootstrapped data samples with replacement from the real data of the previous section, at various sample sizes. Applying ISOMAP to the resulting network embeddings, we observe that the bootstrapped curve converges to the original ISOMAP curve as the number of samples ns increases.
4 Simulations
4.1 Brownian Motion
In this section, we provide estimation results for a time series of networks with latent position process in which the latent positions process follows \(X_t=\gamma(t)+B_t\), where \(B_t\) is a \(2\)-dimensional Brownian motion, and \(\gamma:[0,T] \rightarrow \mathbb{R}^d\) is a Lipschitz continuous function of the form \(\gamma(t)=a(t)v\).
These plots are for a linear drift shown in Figures 7 and 8 in Section A.3 in the manuscript.
set.seed(12345)
n = 2000
d = 2
m = 30
atv = function(x) ((x/50)+1/10)*rep(1/sqrt(2),2)
Bt <- c(0,0)
scaleB = 1/(180*sqrt(30))
Xt<- glist <- NULL
for(i in 1:m){
Bt=Bt+matrix(rnorm(n*2),ncol=2)*scaleB
Xt[[i]] = atv(i)+Bt
glist[[i]]<-rdpg.sample(Xt[[i]],rdpg_scale=FALSE)
}
df.X <- do.call('rbind', Xt) %>% as_tibble() %>%
rename(x=V1,y=V2) %>%
mutate(time = rep(1:m, each=n)) %>%
mutate(v = sprintf("%2d", rep(1:n, times=m)))
rangeX = range(df.X %>% select(x,y))
# do ase
dmax = 10
df.sim <- tibble(t=1:m, g=glist) %>%
mutate(emb = map(g, ~full.ase(., dmax, doptr=FALSE))) %>%
mutate(elb = map(emb, function(x) getElbows(x$eval, plot=F)))
df.elb <- df.sim %>% select(t, emb) %>%
mutate(eval = map(emb, function(x) x$eval)) %>%
select(-emb) %>%
unnest(eval) %>%
group_by(t) %>%
mutate(dim = row_number())
round(colMeans(t(do.call('cbind', df.sim$elb))))
# [1] 1 5 8
df.elb %>% ggplot(aes(x=dim, y=eval, color=factor(t), group=factor(t))) +
geom_point() + geom_line() + theme(legend.position = "none") +
xlab("dimension") + ylab("eigenvalues") +
scale_x_continuous(breaks=seq(1,10,by=2))
docenter <- FALSE
dproc <- 2
comb <- combn(m,2)
Dsim <- foreach (k = 1:ncol(comb), .combine='c') %dopar% {
i <- comb[1,k]
j <- comb[2,k]
cat("i = ", i, ", j = ", j, "\n")
Xhati <- df.sim$emb[[i]]$Xhat[,1:dproc,drop=F]
Xhatj <- df.sim$emb[[j]]$Xhat[,1:dproc,drop=F]
proc <- procrustes2(as.matrix(Xhati), as.matrix(Xhatj))
Xhati <- Xhati %*% proc$W
norm(Xhati - Xhatj, type="2")^2/n
}
D <- matrix(0,m,m)
D[t(comb)] <- Dsim
D <- (D + t(D)) / 1
D2 <- sqrt(D)
image(Matrix(D2), main="", sub="", xlab="time_j", ylab="time_i")#, main="sqrt(D) o proc o ase")
simmds <- cmdscale(D2, m-1)
#getElbows(simmds, main="Scree of mds o sqrt(D)")
plot(apply(simmds,2,sd), type="b", xlab="dimension", ylab="column stdev", main="Scree of mds o sqrt(D)")#,par(font.axis = 2),
df.simmds <- tibble(t=df.sim$t, "mds(sqrt(D)) dim 1"=simmds[,1], "mds(sqrt(D)) dim 2"=simmds[,2])
df.simmds %>%
ggplot(aes(x=`mds(sqrt(D)) dim 1`, y=`mds(sqrt(D)) dim 2`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("mds1") + ylab("mds2") +
theme(legend.position = "none")
df.simmds %>%
ggplot(aes(x=1:30, y=`mds(sqrt(D)) dim 1`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("time") + ylab("mds1") +
theme(legend.position = "none")
df.simmds %>%
ggplot(aes(x=`mds(sqrt(D)) dim 1`, y=`mds(sqrt(D)) dim 2`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("mds1") + ylab("mds2") +
theme(legend.position = "none")
knn = 1 # ifelse(params$rdpg_scale, 7, 3) # 7 for rescale # large enough to be connected
isod <- 10
# Screeplot of Isomap
dissim <- vegdist(simmds, "euclidean")
success <- FALSE
while(!success) {
tryCatch({
isosim <- isomap(dissim, k=knn, ndim=isod, path="shortest")$points
success <- TRUE
},
error = function(e) {
knn <<- knn + 1
})
}
# scree of isomap
#getElbows(isosim, main="Scree of isomap into maxd")
plot(apply(isosim,2,sd), type="b", xlab="dimension", ylab="column stdev", main="Scree of isomap into maxd")
isod <- 1
# Screeplot of Isomap
dissim <- vegdist(simmds, "euclidean")
success <- FALSE
while(!success) {
tryCatch({
isosim <- isomap(dissim, k=knn, ndim=isod, path="shortest")$points
success <- TRUE
},
error = function(e) {
knn <<- knn + 1
})
}
df.isosim <- tibble(iso=isosim[,1]) %>% mutate(i=1:nrow(simmds), t=1:nrow(simmds))
p <- df.isosim %>% ggplot(aes(x=t, y=iso, color=factor(t), group=1)) +
geom_point(size=3) + geom_line() + #geom_vline(xintercept=c(ts1,ts2), linetype="dashed") +
theme(legend.position = "none") + labs(y="iso-mirror",x="time") #+
# ggtitle("first dim of Isomap")
p
Figure 4.1: Mirror estimates for deterministic-drift-plus-noise latent position processes
4.2 Stochastic Block Model
To illustrate the estimation of a mirror and its localization properties in a concrete case, we consider a time-series of two-community stochastic blockmodels with varying block connectivity matrix \(B_t\).
Part of these plots are shown in Figures 9 and 10 in Section A.4 in the manuscript.
set.seed(12345)
n1=1000
n = 2*n1
d = 2
m1= 10
m = 3*m1+1
B1 <- matrix(c(1/2,1/3,1/3,1/2),nrow=2)
B2 <- matrix(c(1/2,1/2,1/2,1/2),nrow=2)
B3 <- matrix(c(1/2,1/3,1/3,1/3),nrow=2)
Xt<- glist <- NULL
for(i in 0:m1){
Bt= ((m1-i)*B1+i*B2)/m1
Xt[[i+1]] = kronecker(t(chol(Bt)),matrix(rep(1,n1),nrow=n1))
glist[[i+1]]<-rdpg.sample(Xt[[i+1]],rdpg_scale=FALSE)
}
for(i in (m1+1):(2*m1)){
Bt= ((2*m1-i)*B2+(i-m1)*B3)/m1
Xt[[i+1]] = kronecker(t(chol(Bt)),matrix(rep(1,n1),nrow=n1))
glist[[i+1]]<-rdpg.sample(Xt[[i+1]],rdpg_scale=FALSE)
}
for(i in (2*m1+1):(3*m1)){
Bt= ((3*m1-i)*B3+(i-2*m1)*B1)/m1
Xt[[i+1]] = kronecker(t(chol(Bt)),matrix(rep(1,n1),nrow=n1))
glist[[i+1]]<-rdpg.sample(Xt[[i+1]],rdpg_scale=FALSE)
}
df.X <- do.call('rbind', Xt) %>% as_tibble() %>%
rename(x=V1,y=V2) %>%
mutate(time = rep(1:m, each=n)) %>%
mutate(v = sprintf("%2d", rep(1:n, times=m)))
rangeX = range(df.X %>% select(x,y))
dmax = 10
df.sim <- tibble(t=1:m, g=glist) %>%
mutate(emb = map(g, ~full.ase(., dmax, doptr=FALSE))) %>%
mutate(elb = map(emb, function(x) getElbows(x$eval, plot=F)))
df.elb <- df.sim %>% select(t, emb) %>%
mutate(eval = map(emb, function(x) x$eval)) %>%
select(-emb) %>%
unnest(eval) %>%
group_by(t) %>%
mutate(dim = row_number())
round(colMeans(t(do.call('cbind', df.sim$elb))))
# [1] 1 2 5
df.elb %>% ggplot(aes(x=dim, y=eval, color=factor(t), group=factor(t))) +
geom_point() + geom_line() + theme(legend.position = "none") +
xlab("dimension") + ylab("eigenvalues") +
scale_x_continuous(breaks=seq(1,10,by=2))
docenter <- FALSE
dproc <- 2
comb <- combn(m,2)
Dsim <- foreach (k = 1:ncol(comb), .combine='c') %dopar% {
i <- comb[1,k]
j <- comb[2,k]
cat("i = ", i, ", j = ", j, "\n")
Xhati <- df.sim$emb[[i]]$Xhat[,1:dproc,drop=F]
Xhatj <- df.sim$emb[[j]]$Xhat[,1:dproc,drop=F]
proc <- procrustes2(as.matrix(Xhati), as.matrix(Xhatj))
Xhati <- Xhati %*% proc$W
norm(Xhati - Xhatj, type="2")^2/n
}
D <- matrix(0,m,m)
D[t(comb)] <- Dsim
D <- (D + t(D)) / 1
D2 <- sqrt(D)
image(Matrix(D2), main="", sub="", xlab="time_j", ylab="time_i")#, main="sqrt(D) o proc o ase")
simmds <- cmdscale(D2, m-1)
#getElbows(simmds, main="Scree of mds o sqrt(D)")
plot(apply(simmds,2,sd), type="b", xlab="dimension", ylab="column stdev", main="Scree of mds o sqrt(D)")
df.simmds <- tibble(t=df.sim$t, "mds(sqrt(D)) dim 1"=simmds[,1], "mds(sqrt(D)) dim 2"=simmds[,2])
df.simmds %>%
ggplot(aes(x=`mds(sqrt(D)) dim 1`, y=`mds(sqrt(D)) dim 2`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("mds1") + ylab("mds2") +
theme(legend.position = "none")
df.simmds %>%
ggplot(aes(x=1:m, y=`mds(sqrt(D)) dim 1`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("time") + ylab("mds1") +
theme(legend.position = "none")
df.simmds %>%
ggplot(aes(x=1:m, y=`mds(sqrt(D)) dim 2`, color=factor(t), group=1)) +
geom_point(size=3) + geom_path() +
xlab("time") + ylab("mds2") +
theme(legend.position = "none")
knn = 1 # ifelse(params$rdpg_scale, 7, 3) # 7 for rescale # large enough to be connected
isod <- 10
# Screeplot of Isomap
dissim <- vegdist(simmds, "euclidean")
success <- FALSE
while(!success) {
tryCatch({
isosim <- isomap(dissim, k=knn, ndim=isod, path="shortest")$points
success <- TRUE
},
error = function(e) {
knn <<- knn + 1
})
}
# scree of isomap
#getElbows(isosim, main="Scree of isomap into maxd")
plot(apply(isosim,2,sd), type="b", xlab="dimension", ylab="column stdev", main="Scree of isomap into maxd")
isod <- 1
# Screeplot of Isomap
dissim <- vegdist(simmds, "euclidean")
success <- FALSE
while(!success) {
tryCatch({
isosim <- isomap(dissim, k=knn, ndim=isod, path="shortest")$points
success <- TRUE
},
error = function(e) {
knn <<- knn + 1
})
}
df.isosim <- tibble(iso=isosim[,1]) %>% mutate(i=1:nrow(simmds), t=1:nrow(simmds))
p <- df.isosim %>% ggplot(aes(x=t, y=iso, color=factor(t), group=1)) +
geom_point(size=3) + geom_line() + #geom_vline(xintercept=c(ts1,ts2), linetype="dashed") +
theme(legend.position = "none") + labs(y="iso-mirror",x="time") #+
# ggtitle("first dim of Isomap")
p
Figure 4.2: Mirror estimation for evolving stochastic blockmodels with varying connectivity
# R version 4.2.2 (2022-10-31)
# Platform: aarch64-apple-darwin20 (64-bit)
# Running under: macOS Ventura 13.5
#
# Matrix products: default
# BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] parallel stats graphics grDevices utils datasets methods
# [8] base
#
# other attached packages:
# [1] doParallel_1.0.17 iterators_1.0.14 foreach_1.5.2 ggrepel_0.9.3
# [5] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1
# [9] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2
# [13] tidyverse_2.0.0 broom_1.0.5 vegan_2.6-4 lattice_0.20-45
# [17] permute_0.9-7 scales_1.2.1 lubridate_1.9.2 irlba_2.3.5.1
# [21] Matrix_1.5-3 igraph_1.5.0 gmmase_0.1.0 devtools_2.4.5
# [25] usethis_2.1.6 pacman_0.5.1 knitr_1.42
#
# loaded via a namespace (and not attached):
# [1] colorspace_2.1-0 ellipsis_0.3.2 class_7.3-20 modeltools_0.2-23
# [5] mclust_6.0.0 fs_1.6.2 rstudioapi_0.15.0 farver_2.1.1
# [9] remotes_2.4.2 flexmix_2.3-19 fansi_1.0.4 codetools_0.2-18
# [13] splines_4.2.2 cachem_1.0.6 robustbase_0.99-0 pkgload_1.3.2.1
# [17] jsonlite_1.8.7 cluster_2.1.4 kernlab_0.9-32 shiny_1.7.2
# [21] compiler_4.2.2 backports_1.4.1 fastmap_1.1.0 cli_3.6.1
# [25] later_1.3.0 htmltools_0.5.4 prettyunits_1.1.1 tools_4.2.2
# [29] gtable_0.3.3 glue_1.6.2 Rcpp_1.0.11 jquerylib_0.1.4
# [33] vctrs_0.6.3 nlme_3.1-160 fpc_2.2-10 xfun_0.37
# [37] ps_1.7.5 timechange_0.1.1 mime_0.12 miniUI_0.1.1.1
# [41] lifecycle_1.0.3 DEoptimR_1.0-14 MASS_7.3-58.1 hms_1.1.2
# [45] promises_1.2.0.1 yaml_2.3.7 memoise_2.0.1 sass_0.4.4
# [49] stringi_1.7.12 highr_0.10 pkgbuild_1.4.0 rlang_1.1.1
# [53] pkgconfig_2.0.3 prabclus_2.3-2 evaluate_0.21 htmlwidgets_1.6.0
# [57] labeling_0.4.2 processx_3.8.2 tidyselect_1.2.0 magrittr_2.0.3
# [61] bookdown_0.31 R6_2.5.1 generics_0.1.3 profvis_0.3.7
# [65] pillar_1.9.0 withr_2.5.0 mgcv_1.8-41 nnet_7.3-18
# [69] crayon_1.5.2 utf8_1.2.3 tzdb_0.3.0 rmarkdown_2.20
# [73] urlchecker_1.0.1 grid_4.2.2 callr_3.7.3 rmdformats_1.0.4
# [77] digest_0.6.33 diptest_0.76-0 xtable_1.8-4 httpuv_1.6.7
# [81] stats4_4.2.2 munsell_0.5.0 bslib_0.4.1 sessioninfo_1.2.2