JOFC on CxP data
========================================================
### YP, MT, JV, CEP, ...
`r date()`
Goal
----
The long-term goal is to discover structure that is meaningful in terms of treatment outcomes.
Data
-----
1. **C**-data ([time_series_cc200_20130402](./Data/time_series_cc200_20130402)): **C** is for *connectome*, which in this case refers to the fMRI data. The 197 regions are obtained from the all-voxel data by parcellating voxels into regions; the mapping from the all-voxel data to the 197 regions is known, because we wrote it.
It is in the form of $(region=197)\times (time=194)\times (subj=42)$.
Unfortunately, the time-series across subjects are *not* aligned relative to one another.
2. **P**-data ([NEO2012.csv](./Data/NEO2012.csv)): **P** is short for *phenotype*; in this case, it is the five-factor model ($(subj=42)\times (factor=5)$), with the five factors E, N, O, A, C (sometimes ordered differently and called OCEAN).
Supplements
-----------
1. [NEO2012.csv](./Data/NEO2012.csv) has 42 subjects and the following columns:
ParticipantID,Sex,Age,E,N,O,A,C;
the last 5 are the "5 factors".
2. [NEO2012_linkage.csv](./Data/NEO2012_linkage.csv) has 42 subjects and 2 columns:
the first is the ParticipantID, which is listed in the other file;
the second is the time-series file name.
So it maps participant id to file, which we need (see the sketch after this list).
3. _ScaledScales.xls_: the file from which *NEO2012.csv* was extracted. You shouldn't need it, but it is the raw data, so it was kept rather than discarded.
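
As a sanity check, the linkage file can be merged with *NEO2012.csv* on ParticipantID. A minimal sketch: the linkage file's header is not shown above, so its column names are assigned below, and both csvs are assumed to have header rows.

```{r linkage-sketch, echo=TRUE, eval=FALSE}
## Sketch: join the phenotype table to the time-series file names via the linkage file.
neo  <- read.csv("Data/NEO2012.csv")           # ParticipantID, Sex, Age, E, N, O, A, C
link <- read.csv("Data/NEO2012_linkage.csv")   # ParticipantID, file name
names(link) <- c("ParticipantID", "FileName")  # assigned here; adjust to the actual header
merged <- merge(neo, link, by = "ParticipantID")
stopifnot(nrow(merged) == 42)                  # every subject should map to exactly one file
head(merged[, c("ParticipantID", "FileName", "E", "N", "O", "A", "C")])
```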
```{r,echo=FALSE,eval=TRUE,warning=FALSE,message=FALSE}
#utils::globalVariables(c("scales", "obj2"))
suppressMessages(require(knitr,quietly=TRUE))
suppressMessages(require(mclust,quietly=TRUE))
suppressMessages(require(jhusmacof,quietly=TRUE))
suppressMessages(require(Matrix,quietly=TRUE))
suppressMessages(require(parallel,quietly=TRUE))
suppressMessages(require(foreach,quietly=TRUE))
suppressMessages(require(doMC,quietly=TRUE))
suppressMessages(require(ggplot2,quietly=TRUE))
suppressMessages(require(reshape2,quietly=TRUE))
registerDoMC(cores=detectCores()-1)
#library(IM,silent=TRUE)
source("cgputils.r")
source("~/RFolder/fastoos.R")
source("~/RFolder/oosIM.R")
opts_chunk$set(cache=TRUE,echo=FALSE,warning=FALSE,message=FALSE,comment="#",tidy=FALSE,
fig.width=6,fig.height=6,fig.path='Figure/cgp-',dpi=72)
hook_output <- knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
# this hook is used only when the linewidth option is not NULL
if (!is.null(n <- options$linewidth)) {
x = knitr:::split_lines(x)
# any lines wider than n should be wrapped
if (any(nchar(x) > n)) x = strwrap(x, width = n)
x = paste(x, collapse = '\n')
}
hook_output(x, options)
})
```
## *C* Data
```{r cdata, echo=TRUE, eval=FALSE}
require(XML)       # readHTMLTable, to list the .mat files in the directory
require(R.matlab)  # readMat, to load each subject's time series
require(flexmix)   # KLdiv
dname <- "http://www.cis.jhu.edu/~parky/CGP/Data/time_series_cc200_20130402/"
urlfiles <- readHTMLTable(dname,skip.rows=1:2)[[1]]$V2
urlfiles <- urlfiles[!is.na(urlfiles)]
files <- paste0(dname,urlfiles)
(n <- length(files)) # 42
coh <- ts <- spect <- spect_norm <- NULL
for (i in 1:length(files)) {
ts[[i]] <- readMat(files[i])[[1]] # region x time = 197 x 194
spect[[i]] <- spectrum(t(ts[[i]]))$spec[6:40,] # freq x region => (bandpass) 35 x 197
spect_norm[[i]] <- scale(spect[[i]],center=FALSE,scale=colSums(spect[[i]])) # each region's spectrum rescaled to sum to 1
coh[[i]] <- KLdiv(spect_norm[[i]]) # Delta: region x region = 197 x 197
}
```
```{r dissmatata,echo=FALSE}
load("outC.Rbin")
attach(outC)
plotmat(ts[[1]],"region","time","ts")
```
### run spectrum (FFT) on ts for each region
```{r}
spec <- spectrum(t(ts[[1]]))$spec
plotmat(spec,"frequency","region","spec")
```
### bandpass and normalize it
```{r}
plotmat(spect[[1]],"frequency","region","bandpass(spec)")
plotmat(spect_norm[[1]],"frequency","region","spect_norm")
```
### calc KLdiv among regions
```{r}
#dim(coh[[1]])
plotmat(coh[[1]],"region","region","KLdiv")
```
### calc normalized Hellinger distance among subjects
```{r,echo=TRUE,eval=FALSE}
dissmat <- matrix(0,n,n)
for (i in 1:(n-1)) {
for (j in (i+1):n) {
dissmat[i,j] <- helldist(coh[[i]],coh[[j]]) # hellinger distance
}
}
dissmat <- dissmat + t(dissmat)
diag(dissmat) <- 0
distC <- normalize(dissmat)
```
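
`helldist` and `normalize` come from `cgputils.r` and are not shown here. For orientation only, a minimal sketch of what they might look like, assuming `helldist` treats each coherence matrix as a discrete distribution and `normalize` rescales a dissimilarity matrix to $[0,1]$; the sourced implementations may differ.

```{r helpers-sketch, echo=TRUE, eval=FALSE}
## Sketch of a Hellinger-type distance between two nonnegative matrices,
## each rescaled to sum to one; the sourced helldist() may be defined differently.
helldist_sketch <- function(A, B) {
  p <- A / sum(A)
  q <- B / sum(B)
  sqrt(sum((sqrt(p) - sqrt(q))^2)) / sqrt(2)
}
## One plausible reading of normalize(): rescale a dissimilarity matrix to [0,1].
normalize_sketch <- function(D) D / max(D)
```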
```{r plotC,echo=FALSE}
distC <- normalize(dissmat)
dim(distC)
plotmat(distC,"subject","subject","Hellinger")
hist(distC)
```
### embed and cluster it
```{r,echo=TRUE,eval=TRUE}
n <- nrow(distC)
mdsC <- cmdscale(distC,n-1)
elbowC <- getElbows(mdsC,3)
mcC <- Mclust(mdsC[,1:elbowC[2]])
plot(mcC,"BIC")
```
## *P* Data
```{r pdata,echo=TRUE,eval=FALSE}
require(gdata)
fname <- "http://www.cis.jhu.edu/~parky/CGP/Data/NEO2012.csv"
neo <- read.csv(fname)
pdat <- neo[,4:8]
distP <- as.matrix(dist(pdat))
plotmat(as.matrix(pdat),"subject","factor")
distP <- normalize(distP)
dim(distP)
plotmat(distP,"subject","subject")
hist(distP)
mdsP <- cmdscale(distP,n-1)
elbowP <- getElbows(mdsP,3)
mcP <- Mclust(mdsP[,1:elbowP[3]])
plot(mcP,"BIC")
```
```{r distPdata,echo=FALSE}
#outP <- doPdata()
load("outP.Rbin")
attach(outP)
plotmat(as.matrix(pdat),"subject","factor","pdat")
distP <- dmat
distP <- normalize(distP)
#dim(distP)
```
```{r plotP,echo=FALSE}
stopifnot(n==nrow(distP))
plotmat(distP,"subject","subject","dist(pdat)")
hist(distP)
mdsP <- cmdscale(distP,n-1)
elbowP <- getElbows(mdsP,3)
mcP <- Mclust(mdsP[,1:elbowP[3]])
plot(mcP,"BIC")
```
## JOFC
```{r JOFC, echo=TRUE}
## form omnibus matrix
meanD <- (distC+distP)/2
omni <- rbind(cbind(distC,meanD),cbind(t(meanD),distP))
dim(omni)
## embed
n2 <- n*2
mds <- cmdscale(omni,n2-1)
```
```{r JOFC-omni-w, echo=FALSE}
plotmat(omni,"column","row","omni")
```
```{r JOFC-mds, echo=TRUE}
elbow <- getElbows(mds,3)
knee <- elbow[1]
dat <- mds[,1:knee]
```
```{r JOFC-scatter, echo=FALSE}
lab <- factor(rep(c("C","P"),each=n))
## plot dim 1 vs dim 2, linking each C to its matched P
i <- 1; j <- 2
plot(dat[,i],dat[,j],col=as.numeric(lab),pch=as.numeric(lab),main=paste("dim ",i, " vs dim ", j))
legend("bottomleft", c("C","P"),col=1:2,pch=1:2)
segments(dat[1:n,i],dat[1:n,j],dat[(n+1):n2,i],dat[(n+1):n2,j],col="orange")
pairs(dat[,1:5],col=as.numeric(lab),pch=as.numeric(lab))
```
## Validation
We estimate the null (i.e., truly matched subjects) and alternative holdout pdfs of the out-of-sample (oos) distances:
1. loop through the data, embedding while holding out two matched pairs;
call these two matched pairs (i1,i2) and (j1,j2).
* oos embed (i1,i2) under the null -- that is, assuming they are a matched pair -- and record d(i1,i2) in the embedded space.
* oos embed (j1,j2) under the null -- that is, assuming they are a matched pair -- and record d(j1,j2) in the embedded space.
* oos embed (i1,j2) under the null -- that is, assuming they are a matched pair -- and record d(i1,j2) in the embedded space.
* oos embed (j1,i2) under the null -- that is, assuming they are a matched pair -- and record d(j1,i2) in the embedded space.
2. calc null pdf estimates of the first two oos distances.
calc alternative pdf estimates of the last two oos distances.
3. plot these two kernel estimates together on the same plot -- null in blue, alt in red.
```{r oos, echo=TRUE, eval=FALSE}
set.seed(12345)
nmc <- 1000
out <- foreach(mc=1:nmc, .combine="rbind") %dopar% {
ij1 <- sample(1:n,2,replace=FALSE) # i1, j1
ij2 <- ij1+n # i2, j2
ij <- c(ij1,ij2)
## "joint" oos
Y <- fast.oos2(omni[-ij,-ij], knee, omni[ij,-ij])
DY <- as.matrix(dist(Y$X.oos))
D0 <- c(DY[1,3],DY[2,4]) # d(i1,i2), d(j1,j2)
DA <- c(DY[1,4],DY[2,3]) # d(i1,j2), d(j1,i2)
c(D0,DA)
}
```
```{r loadpwr, echo=FALSE, eval=TRUE}
load("mcout-power-mc1000.Rbin")
```
```{r plotpdf, echo=FALSE, eval=TRUE}
adj <- 2
plot(density(out[,1],adjust=adj),type="l",lwd=2,col="blue",main="pdf of H0(blue) and HA(red)",xlim=c(0,max(out))) # (i1, i2)
lines(density(out[,2],adjust=adj),type="l",lwd=2,lty=2,col="blue") # (j1, j2)
lines(density(out[,3],adjust=adj),type="l",lwd=2,lty=1,col="red") # (i1, j2)
lines(density(out[,4],adjust=adj),type="l",lwd=2,lty=2,col="red") # (j1, i2)
leg <- c("d(i1,i2)","d(j1,j2)","d(i1,j2)","d(j1,i2)")
legend("topright",leg,col=c(4,4,2,2),lty=c(1,2,1,2))
```
```{r plotpwr, echo=FALSE, eval=TRUE}
size <- seq(0,1,0.01)
p1 <- get_power2(out[,1], out[,3], size) # (i1,*)
p2 <- get_power2(out[,2], out[,4], size) # (j1,*)
plot(size,p1,type="l",xlab="alpha",ylab="power")
lines(size,p2,lty=2)
lines(size,(p1+p2)/2,lty=3,col="blue")
abline(a=0,b=1,col=2)
```
```{r W2, echo=FALSE, eval=TRUE}
## O2: between-condition block imputed as (1 - I), i.e. ones off the diagonal, instead of the mean dissimilarity
offD <- matrix(1,n,n) - diag(1,n,n)
O2 <- rbind(cbind(distC,offD), cbind(t(offD),distP))
## W2 (Wb): within-condition weight w = 1, matched pairs weighted via the identity off-diagonal block
w <- 1
Wdiag <- matrix(w, n, n)
Woff <- matrix(0, n, n)
diag(Woff) <- 1
Wb <- rbind(cbind(Wdiag, Woff), cbind(t(Woff), Wdiag)) # (2n) x (2n) = 84 x 84
```
## Predicting P from C
```{r W1, echo=TRUE, eval=TRUE}
n2 <- n*2
## O1: omnibus matrix, between-condition block imputed by the average dissimilarity
meanD <- (distC + distP)/2
O1 <- rbind(cbind(distC, meanD), cbind(t(meanD), distP))
```
```{r plotO2, echo=FALSE, eval=TRUE}
plotmat(O1,main=expression(O[1]))
plotmat(O2,main=expression(O[2]))
```
```{r O1, echo=TRUE, eval=TRUE}
## smacof weight matrix W1: within-condition weight w, matched-pair (commensurability) weight 1 - w
w <- 0.01
Wdiag <- matrix(w, n, n)
Woff <- matrix(0, n, n)
diag(Woff) <- 1 - w
W1 <- rbind(cbind(Wdiag, Woff), cbind(t(Woff), Wdiag))
```
```{r plotW2, echo=FALSE, eval=TRUE}
plotmat(W1,main=expression(W[1]))
plotmat(Wb,main=expression(W[2]))
```
```{r perf, echo=TRUE, eval=TRUE}
## smacof embedding of all 2*n points
knee <- 2
set.seed(12345)
smacof <- smacofSym(O1, knee, weightmat = W1)$conf
lab <- factor(rep(c("C","P"),each=n))
plot(smacof[,1],smacof[,2],col=as.numeric(lab),pch=as.numeric(lab),main="C vs P via smacof embedding")
segments(smacof[1:n,1],smacof[1:n,2],smacof[(n+1):n2,1],smacof[(n+1):n2,2],col="orange")
legend("topright", c("C","P"),col=1:2,pch=1:2)
```
### Estimating P for a C for which we have no P
for each left-out (_C_,_P_) in turn
1. we choose _Phat[C]_ = weighted avg of _P_'s associated with _C_'s nearby to embedded _C_.
2. we look at _d( P[C], Phat[C] )_.
3. we decide what type of person the subject is, based on their connectome.
```{r ex1, echo=TRUE, eval=FALSE}
K <- 1 # 1-NN
Pjhat <- Cihat <- matrix(0,n,knee)
for(i in 1:n) {
j <- i+n # matching P
ij <- c(i,j) ## (Ci,Pj) pair
## in sample without (Ci,Pj) pair
Dij <- O1[-ij,-ij]
Wij <- W1[-ij,-ij]
Ci <- smacof[i,]
Xij <- smacof[-ij,]
X <- smacofSym(Dij, knee, weightmat = Wij, init=Xij)$conf # (n2-2) x knee = 82 x 2
Chat <- X[1:(n-1),]
Phat <- X[-c(1:(n-1)),]
## oos Ci only
Di <- O1[-j,-j]
Wi <- matrix(0, nrow(O1), ncol = ncol(O1))
Wi[i,1:n] <- 1
Wi[1:n,i] <- 1
Wi <- Wi[-j,-j]
within <- rep(1,n2)
within[ij] <- 0
Cihat[i,] <- oosIM(Di, X, W=Wi, init=Ci, isWithin=within[-j])
## estimate Pj
dChat <- outer(sum(Cihat[i,]^2), rowSums(Chat^2), "+") - 2 * (Cihat[i,,drop=FALSE] %*% t(Chat)) # squared distances, 1 x (n-1)
dChat <- sqrt((dChat+abs(dChat))/2) # clip tiny negative rounding errors, then sqrt
ind <- order(dChat) # index of k-NN from Cihat[i,] to the in-sample Chat
Pjhat[i,] <- colMeans(Phat[ind[1:K],,drop=FALSE])
}
```
```{r load1, echo=FALSE, eval=TRUE}
load("smacof-oos-outP-new.Rbin")
xrng <- range(c(smacof[,1],Cihat[,1]))
yrng <- range(c(smacof[,2],Cihat[,2]))
plot(smacof[1:n,],xlim=xrng,ylim=yrng, main="i = 42") # C
points(smacof[n,1],smacof[n,2],pch=19,col=1,cex=2) # Ci
points(Cihat[,1],Cihat[,2],pch=8,col=1,cex=2) # Cihat
points(Chat,col=1,pch=21,bg="grey") # Chat
points(Chat[ind[1],1],Chat[ind[1],2],col=1,pch=21,bg=3) # 1NN Chat
points(smacof[-c(1:n),1],smacof[-c(1:n),2],col=2,pch=2) # P
points(smacof[n2,1],smacof[n2,2],pch=17,col=2,cex=2) # Pj
points(Phat,col=2,pch=24,bg="pink") # Phat
points(Pjhat[n,1],Pjhat[n,2],col=2,pch=8,cex=2) # Pjhat
segments(Pjhat[n,1],Pjhat[n,2],smacof[n2,1],smacof[n2,2],col="blue",lwd=2)
segments(Cihat[,1],Cihat[,2],smacof[n,1],smacof[n,2],col="blue",lwd=2)
legend("bottomright", c("C","Ci","Chat","Cihat","P","Pj","Phat","Pjhat"),
col=c(1,1,1,1,2,2,2,2),pch=c(1,19,21,8,2,17,24,8), ncol=2,
pt.bg=c("white","white","grey","white","white","white","pink","white"))
plot(smacof[-c(1:n),],pch=2,col=2, main="P vs Pjhat")#,xlim=c(-1,1),ylim=c(-1,1))
points(Pjhat,col=2,pch=8)
segments(smacof[-c(1:n),1],smacof[-c(1:n),2],Pjhat[1:n,1],Pjhat[1:n,2],col="orange")
legend("topright", c("P","Pjhat"),col=2,pch=c(2,8))
diffP <- abs(Pjhat - smacof[-c(1:n),])
plot(density(diffP),main="d(P,Pjhat)")
```
### Performance
With
$d_1 = d(\hat{P}_j, P_j)$ and
$d_2 = d(P_j, \mathrm{mean}(\hat{P}))$, where $P_j$ comes from the joint embedding of $(C,P)$ and $\hat{P}_j$ is obtained from $C_j$ either without leave-one-out ("no loo") or with it ("with loo"),
we can define $\mathrm{Perf} = \sum_j I(d_1 < d_2)$.
Omni | Weight | no loo | with loo
-----|-----|--------|---------
$O_1$ | $W_1$ | 41 | 34
$O_1$ | $W_2$ | 19 | 19
$O_2$ | $W_1$ | 42 | 25
$O_2$ | $W_2$ | 15 | 16
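
For concreteness, a minimal sketch of how one cell of this table could be computed, assuming `Pjhat` ($n \times knee$) comes from the loop above, the embedded $P$'s are the last $n$ rows of `smacof`, and $\mathrm{mean}(\hat{P})$ is the column mean of the predictions.

```{r perf-sketch, echo=TRUE, eval=FALSE}
## Sketch: Perf = number of subjects whose prediction Phat_j is closer to P_j
## than the mean prediction is (d1 < d2).
P  <- smacof[(n+1):n2, ]                               # embedded P's
d1 <- sqrt(rowSums((Pjhat - P)^2))                     # d(Phat_j, P_j)
d2 <- sqrt(rowSums(sweep(P, 2, colMeans(Pjhat))^2))    # d(P_j, mean(Phat))
(Perf <- sum(d1 < d2))                                 # out of n = 42
```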
Let $dP$ be the distance in the *original* $P$-space.
Let $dC$ be the distance in the *original* $C$-space.
Let $dJOFC$ be the distance in the *joint embedding* (JOFC) space.
Consider the $j^{th}$ subject,
then we care about $dP(\hat{P}_j,P_j)$ as our figure of merit.
We observe $C_j$, as well as all the other $(C,P)$ pairs.
1. we estimate $P_j$ via the $P$'s associated with the $C$'s which are $dC$-close to $C_j$.
2. we estimate $P_j$ via the $P$'s associated with the $C$'s which are $JOFC$-close to $C_j$.
Both approaches are trying to approximate:
3. we estimate $P_j$ via the $P$'s which are $dP$-close to $P_j$.
Let's look at which $P$'s are close, in all three cases,
that is, for each $j$,
1. $dist_0 = dP(P_j, \hat{P})$, where $\hat{P} = P_{\arg\min_{k \neq j} dP(P_j,P_k)}$
2. $dist_1 = dP(P_j, \hat{P}_{dC})$, where $\hat{P}_{dC} = P_{\arg\min_{k \neq j} dC(C_j,C_k)}$
3. $dist_2 = dP(P_j, \hat{P}_{dJOFC})$, where $\hat{P}_{dJOFC} = P_{\arg\min_{k \neq j} dJOFC(C_j,C_k)}$
If $dist_2$ looks more like $dist_0$ than $dist_1$ looks like $dist_0$, then JOFC wins!
```{r pval0, echo=FALSE, eval=TRUE}
require(smacof)
N <- nrow(distC)
N2 <- N*2
knee <- 2
K <- 1
dP <- distP
dC <- distC
mds <- smacofSym(O1, knee, weightmat = W1)$conf
dmat <- as.matrix(dist(mds))[1:N,1:N] # for C
## (1) we estimate P_j via the P's associated with the C's which are dC-close to C_j.
## (2) we estimate P_j via the P's associated with the C's which are jofc-close to C_j.
## both approaches are trying to approximate
## (0) we estimate P_j via the P's which are dP-close to P_j.
order0 <- order1 <- order2 <- matrix(0,N,N)
for (j in 1:N) {
tmp <- order(dP[j,])
order0[j,] <- tmp
tmp <- order(dC[j,])
order1[j,] <- tmp
tmp <- order(dmat[j,])
order2[j,] <- tmp
}
whichj <- matrix(0,N,2)
for (j in 1:N) {
# print(rbind(order0[j,1:10],order1[j,1:10],order2[j,1:10]))
whichj[j,1] <- which(order0[j,2] == order1[j,])
whichj[j,2] <- which(order0[j,2] == order2[j,])
}
## dP(Pj,Phat_jofcnn(Cj)) - dP(Pj,Phat_dCnn(Cj))
diffD <- rep(0,N)
for (j in 1:N) {
diffD[j] <- dP[j,order2[j,2]] - dP[j,order1[j,2]]
# diffD[j] <- dP[j,whichj[j,2]] - dP[j,whichj[j,1]]
}
## sign test
# pval <- 1-pbinom(sum(diffD<0)-1,sum(diffD!=0),.5)
# main=expression(dP(P[j],hat(P)[jofcnn](C[j])) - dP(P[j],hat(P)[dCnn](C[j])))
# plot(density(diffD),main=main,xlab=paste("p-value = ",round(pval,2), "using alpha = 0.5"))
# abline(v=0,lty=2)
# alp <- seq(0,0.5,0.01)
# pvec <- rep(0,length(alp))
# ind <- 1
# for (i in alp) {
# pvec[ind] <- 1-pbinom(sum(diffD<0)-1,sum(diffD!=0),alp[ind])
# ind <- ind+1
# }
# plot(x=alp,y=pvec,type="l",xlab=expression(alpha),ylab="p-value")
diffD <- dist0 <- dist1 <- dist2 <- rep(0,N)
for (j in 1:N) {
# diffD[j] <- dP[j,whichj[j,2]] - dP[j,whichj[j,1]]
dist0[j] <- dP[j,order0[j,K+1]]
dist1[j] <- dP[j,order1[j,K+1]]
dist2[j] <- dP[j,order2[j,K+1]]
diffD[j] <- dist2[j] - dist1[j]
}
dist3 <- data.frame(dP=dist0,dC=dist1,dJOFC=dist2)
dist3.df <- melt(dist3)
diff.df <- melt(diffD)
p <- ggplot(dist3.df, aes(value, fill = variable))
p <- p + geom_density(alpha = 0.2)
# p <- p + geom_bar(alpha = 0.5)
p <- p + scale_fill_discrete(name="method",labels=c("dist0","dist1","dist2"))
p <- p + theme(legend.position=c(0.9,0.5))
p <- p + scale_x_continuous(name="phenotype prediction distance")
p <- p + scale_y_continuous(name="kernel density estimate")
p + ggtitle("Nearest Neighbor Phenotype Prediction based on Connectome")
p <- ggplot(diff.df,aes(x=value)) + geom_density(fill="cyan",alpha=0.2)
# p <- p + geom_vline(xintercept = mean(diffD),color="red",linetype="dashed")
p <- p + scale_x_continuous(name="dist2 - dist1") #, red vline: mean(d_JOFC-d_C)")
p <- p + scale_y_continuous(name="kernel density estimate")
p <- p + ggtitle("Nearest Neighbor Phenotype Prediction based on Connectome")
p <- p + geom_point(x = mean(diffD), y = 0, color="red",size=5)
p + annotate("text", x = mean(diffD), y = 0.1, label = "mean", color="red")
```
Now, let's do sign tests for $dP(P_j,\hat{P}_{jofcnn}(C_j)) - dP(P_j,\hat{P}_{dCnn}(C_j))$ by varying $w$ in $W_1$. Note that $w=0.5$ gives $W_1 = W_2/2$, which is proportional to $W_2$ and hence yields the same smacof solution.
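
A minimal sketch of one such sign test for a single $w$, following the one-sided binomial form used in the comments above, and assuming `diffD` holds $dP(P_j,\hat{P}_{jofcnn}(C_j)) - dP(P_j,\hat{P}_{dCnn}(C_j))$ for that $w$.

```{r signtest-sketch, echo=TRUE, eval=FALSE}
## Sketch: one-sided sign test that the JOFC-NN prediction beats the dC-NN
## prediction more often than chance, i.e. P(diffD < 0) > 0.5, for a single w.
nneg <- sum(diffD < 0)       # subjects where the JOFC prediction is strictly closer
ntot <- sum(diffD != 0)      # drop ties
pval <- 1 - pbinom(nneg - 1, ntot, 0.5)   # P(X >= nneg) under Binom(ntot, 0.5)
## equivalently: binom.test(nneg, ntot, p = 0.5, alternative = "greater")$p.value
```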
```{r pval, echo=FALSE, eval=TRUE}
dat <- dget("alpha-pval.RData")
plot(x=dat[1,], y=dat[2,], type="l", xlab="w in W1, w=0.5 => W2", ylab="p-value")
abline(h=0.05,lty=2,col=2)
```
**Prediction of phenotype based on connectome is better via JOFC than via the raw $C$-distances!**