# Overfitting and optimism R code for figures

## Fig 5.2

```for (mort in c(.1)) # 10% true mortality
plot(x=seq(from=-.025,to=.975,by=.05), dbinom(x=0:20,20,mort), axes=F,  type = "s",
xlim=c(-.05,.35), ylim=c(0,.33),
xlab=paste('Observed mortality, true mortality ',round(100*mort,0),"%",sep=""), ylab='probability density')
axis(side=1,at=c(0,.1,.2,.3),labels=c("0%","10%","20%","30%"))
axis(side=2,at=c(0,0.1,.2,.3,.4,.5,.6,.7),labels=c("0%","10%","20%","30%","40%","50%","60%","70%"))
text(x=mort,y=.02+ dbinom(x=max(round(mort*20),0),20,mort),labels=paste("n=20"))

for (i in c(50,200)) {  # add 2 more sample sizes
lines(x=seq(from=0-(0.5*1/i),to=1-(0.5*1/i),by=1/i), dbinom(x=0:i,i,mort),
type = "s", lty=ifelse(i==50,2,3))
text(x=mort,y=.02+dbinom(x=max(round(mort*i),0),i,mort),labels=paste("n=",i, sep=""))
} # end loop n=50,200
} # end loop mort
```

## Fig 5.3 and 5.4

```# Plot 4 situations: no heterogeneity to much heterogeneity
# Fig 5.3 first
par(mfrow = c(2,2), pty='s', mar=c(3,2,1,1))
# Make data set with 100 centers, each 20 patients, 10% mortality, variability sd 0 to 0.03
for (sdtau in c(0,.01,.02,.03)) {
truemort  <- rnorm(n=100,mean=0.1,sd=sdtau)
mortmat <- as.matrix(cbind(1:100, sapply(truemort,FUN=function(x)rbinom(n=1,20,x))/20,truemort))

plot(x=0, y=0, pch='', xlim=c(-.2,1.2), ylim=c(-.03,.35),axes=F, xlab="", ylab='mortality')
axis(side=2,at=c(0,.1,.2,.3),labels=c("0%","10%","20%","30%"))
axis(side=1,at=c(0,1),labels=c("Observed","True mortality"))
for (i in (1:100)) {
lines(x=c(0,1), y=mortmat[i,c(2,3)])
points(x=c(0+runif(1,min=-.07,max=.07),1), y=mortmat[i,c(2,3)], pch="+") } }

## Same, for n=200 per center Fig 5.4
# Plot 4 situations: no heterogeneity to much heterogeneity
par(mfrow = c(2,2), pty='s', mar=c(3,2,1,1))
# Make data set with 100 centers, each 200 patients, 10% mortality, variability sd 0 to 0.03
for (sdtau in c(0,.01,.02,.03)) {
truemort  <- rnorm(n=100,mean=0.1,sd=sdtau)
mortmat <- as.matrix(cbind(1:100, sapply(truemort,FUN=function(x)rbinom(n=1,200,x))/200,truemort))
plot(x=0, y=0, pch='', xlim=c(-.2,1.2), ylim=c(-.03,.35),axes=F, xlab="", ylab='mortality')
axis(side=2,at=c(0,.1,.2,.3),labels=c("0%","10%","20%","30%"))
axis(side=1,at=c(0,1),labels=c("Observed","True mortality"))
for (i in (1:100)) {
lines(x=c(0,1), y=mortmat[i,c(2,3)])
points(x=c(0+runif(1,min=-.07,max=.07),1), y=mortmat[i,c(2,3)], pch="+") } }
```

## Fig 5.5

```## Under the NULL
plot(function(x) dnorm(x, mean=0, sd = 1, log = F), -4, 4,
xlab="Estimated effect", ylab = "Density")
lines(x=c(0,0), y=c(0,.5), lty=1)
x95 <- qnorm(p=(1-0.025), mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
x995 <- qnorm(p=(1-0.0025), mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)

# Polygon for p<.05
xpol <- seq(x95,4,by=0.01)
ypol <- dnorm(x=xpol, mean=0, sd = 1, log = F)
xpol <- c(x95,xpol,4)
ypol <- c(0,ypol,0)
polygon(xpol,ypol, col = "gray")
# lower tail
xpol <- seq(-x95,-4,by=-0.01)
ypol <- dnorm(x=xpol, mean=0, sd = 1, log = F)
xpol <- c(-x95,xpol,-4)
ypol <- c(0,ypol,0)
polygon(xpol,ypol, col = "gray")

##############
## mean = 1
## range -3, 5
plot(function(x) dnorm(x, mean=1, sd = 1, log = F), -3, 5,
xlab="Estimated effect", ylab = "Density")
lines(x=c(1,1), y=c(0,.5), lty=2)

x95 <- qnorm(p=(1-0.025), mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
x995 <- qnorm(p=(1-0.0025), mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)

# Polygon for p<.05
xpol <- seq(x95,5,by=0.01)
ypol <- dnorm(x=xpol, mean=1, sd = 1, log = F)
xpol <- c(x95,xpol,5)
ypol <- c(0,ypol,0)
polygon(xpol,ypol, col = "gray")
# lower tail
xpol <- seq(-x95,-3,by=-0.01)
ypol <- dnorm(x=xpol, mean=1, sd = 1, log = F)
xpol <- c(-x95,xpol,-3)
ypol <- c(0,ypol,0)
polygon(xpol,ypol, col = "gray")

```

## Fig 5.6

```# Illustrate model performance development / test / external: Fig 5.6
library(rms)

# create x vars
n <- 100000
x1  <- rnorm(n,sd=1)
x2  <- rnorm(n,sd=.9)
x3  <- rnorm(n,sd=.8)
x4  <- rnorm(n,sd=.7)
x5  <- rnorm(n,sd=.6)
x6  <- rnorm(n,sd=.5)
x7  <- rnorm(n,sd=.4)
x8  <- rnorm(n,sd=.3)
x9  <- rnorm(n,sd=.2)
x10  <- rnorm(n,sd=.1)
y <- x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 +rnorm(n,sd=3) # y in developement data
y2 <- .5*x1 + 1.5*x2 + .5*x3 + 1.5*x4 + 0.5*x5 + 1.5*x6 + 0.5*x7 + 1.5*x8 + 0.5*x9 + 1.5*x10 +rnorm(n,sd=3)

datax <- as.data.frame(cbind(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,y,y2))
datax[1:5,]
describe(y)
describe(y2)

colnames(datax) <- Cs(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y,Y2)
# gold standard fits
ols(y~x1)
ols(y~x1 + x2 + x3 + x4 + x5 + x6 + x6 + x7 + x8 + x9 + x9 + x10)
ols(y2~x1 + x2 + x3 + x4 + x5 + x6 + x6 + x7 + x8 + x9 + x9 + x10)

# function for MSE
MSE <- function(y, fit,newdata)   {mean((y-predict(fit,newdata=newdata))^2)}

#######################
## Start simulations ##
#######################
sim <- 1000
nstudy  <- 50
resultsmat  <- matrix(nrow=sim,ncol=45)

for (i in 1:sim) {
cat("\nSimulation #", i)

# create x vars
n <- 10000
x1  <- rnorm(n,sd=1)
x2  <- rnorm(n,sd=.9)
x3  <- rnorm(n,sd=.8)
x4  <- rnorm(n,sd=.7)
x5  <- rnorm(n,sd=.6)
x6  <- rnorm(n,sd=.5)
x7  <- rnorm(n,sd=.4)
x8  <- rnorm(n,sd=.3)
x9  <- rnorm(n,sd=.2)
x10  <- rnorm(n,sd=.1)
y <- x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 +rnorm(n,sd=3)
y2 <- .5*x1 + 1.5*x2 + .5*x3 + 1.5*x4 + 0.5*x5 + 1.5*x6 + 0.5*x7 + 1.5*x8 + 0.5*x9 + 1.5*x10 +rnorm(n,sd=3)

datax <- as.data.frame(cbind(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,y,y2))
colnames(datax) <- Cs(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y,Y2)

j <- sample(x=1:n, size=nstudy)
dataj <- datax[j,]
datamj <- datax[-j,]
Yj <- datax\$Y[j]
Ymj <- datax\$Y[-j]
Y2mj <- datax\$Y2[-j]

fit1  <- ols(Y~X1, data=dataj)
fit2  <- ols(Y~X1 + X2, data=dataj)
fit3  <- ols(Y~X1 + X2 + X3, data=dataj)
fit4  <- ols(Y~X1 + X2 + X3 + X4, data=dataj)
fit5  <- ols(Y~X1 + X2 + X3 + X4 + X5, data=dataj)
fit6  <- ols(Y~X1 + X2 + X3 + X4 + X5 + X6, data=dataj)
fit7  <- ols(Y~X1 + X2 + X3 + X4 + X5 + X6 + X7, data=dataj)
fit8  <- ols(Y~X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data=dataj)
fit9  <- ols(Y~X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 , data=dataj)
fit10  <- ols(Y~X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, data=dataj)

# coefs in matrix
resultsmat[i,1:15]  <- c(coef(fit5)[2:6],coef(fit10)[2:11])
# test MSE
resultsmat[i,16:25]  <-
c(MSE(y=Ymj, fit=fit1, newdata=datamj),
MSE(y=Ymj, fit=fit2, newdata=datamj),
MSE(y=Ymj, fit=fit3, newdata=datamj),
MSE(y=Ymj, fit=fit4, newdata=datamj),
MSE(y=Ymj, fit=fit5, newdata=datamj),
MSE(y=Ymj, fit=fit6, newdata=datamj),
MSE(y=Ymj, fit=fit7, newdata=datamj),
MSE(y=Ymj, fit=fit8, newdata=datamj),
MSE(y=Ymj, fit=fit9, newdata=datamj),
MSE(y=Ymj, fit=fit10, newdata=datamj))

# external validation
resultsmat[i,26:35]  <-
c(MSE(y=Y2mj, fit=fit1, newdata=datamj),
MSE(y=Y2mj, fit=fit2, newdata=datamj),
MSE(y=Y2mj, fit=fit3, newdata=datamj),
MSE(y=Y2mj, fit=fit4, newdata=datamj),
MSE(y=Y2mj, fit=fit5, newdata=datamj),
MSE(y=Y2mj, fit=fit6, newdata=datamj),
MSE(y=Y2mj, fit=fit7, newdata=datamj),
MSE(y=Y2mj, fit=fit8, newdata=datamj),
MSE(y=Y2mj, fit=fit9, newdata=datamj),
MSE(y=Y2mj, fit=fit10, newdata=datamj))

# apparent validation
resultsmat[i,36:45]  <-
c(MSE(y=Yj, fit=fit1, newdata=dataj),
MSE(y=Yj, fit=fit2, newdata=dataj),
MSE(y=Yj, fit=fit3, newdata=dataj),
MSE(y=Yj, fit=fit4, newdata=dataj),
MSE(y=Yj, fit=fit5, newdata=dataj),
MSE(y=Yj, fit=fit6, newdata=dataj),
MSE(y=Yj, fit=fit7, newdata=dataj),
MSE(y=Yj, fit=fit8, newdata=dataj),
MSE(y=Yj, fit=fit9, newdata=dataj),
MSE(y=Yj, fit=fit10, newdata=dataj))

cat(".. end")
} # end simulations sim

## With n=50
resultsmatn50 <- resultsmat
resultsn50 <- apply(resultsmatn50,2,mean)
perfmat <- matrix(resultsn50[16:45],nrow=10,ncol=3)
plot(spline(x=1:10,y=perfmat[,3]),lty=1,lwd=1, xlim=c(0.5,10.5),ylim=c(5,15), type="l",
xlab="Number of predictors", ylab="Mean squared error",cex.lab=1.2) # Apparent
lines(spline(x=1:10,y=perfmat[,1]),lty=2,lwd=2)                   # Internal
lines(smooth.spline(x=1:10,y=perfmat[,2]),lty=3,lwd=3)                   # External
text(x=c(7,7,7),y=c(8.4,10.4,12.5),label=Cs(Apparent, Internal, External)) \\

```