require(knitr)
require(dplyr)
read_chunk("ex2/ex2_chunks.R")
read_chunk("ex4/ex4_chunks.R")
set.seed(12345)
ex3data1 <- as.matrix(read.csv("data/ex3data1.csv"))
colnames(ex3data1) <- paste("X", seq(ncol(ex3data1)), sep = "")
colnames(ex3data1)[ncol(ex3data1)] <- "Y"
ex3data1 <- cbind(X0 = 1, ex3data1)
sig <- function(x){1 / (1 + exp(-x))}
# One-hot encode the labels: newy becomes a 5000 x 10 logical matrix with one TRUE per row
newy <- vector()
for(i in 1:10){
  newy <- cbind(newy, ex3data1[, "Y"] == i)
}
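As a quick sanity check (my addition, not part of the original exercise), every row of newy should contain exactly one TRUE:
stopifnot(all(rowSums(newy) == 1))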
This is the same visualization as in Exercise 3
This is largely based on kaleko’s Python script on GitHub.
I read in the weights and save them as a list because this mimics what readMat returns. Since I already processed them and saved them as separate CSVs, this is a roundabout way to do it, but changing it would mean going back and modifying some of the structure used in the computations below. I may tidy this up in the future.
ex3weights <- list(Theta1 = as.matrix(read.csv("data/ex3weights_Theta1.csv")),
Theta2 = as.matrix(read.csv("data/ex3weights_Theta2.csv")))
input_layer_size <- 400
hidden_layer_size <- 25
output_layer_size <- 10
n_training_samples <- 5000
unlist() does what flattenParams does in kaleko’s Python script.
# Reshape a flattened parameter vector back into the two Theta matrices
reshapeParams <- function(flattened_array){
  theta1 <- matrix(flattened_array[1:((input_layer_size + 1) * hidden_layer_size)],
                   nrow = hidden_layer_size,
                   ncol = input_layer_size + 1,
                   byrow = FALSE)
  theta2 <- matrix(flattened_array[((input_layer_size + 1) * hidden_layer_size + 1):
                                     length(flattened_array)],
                   nrow = output_layer_size,
                   ncol = hidden_layer_size + 1,
                   byrow = FALSE)
  return(list(theta1 = theta1, theta2 = theta2))
}

# Reshape a flattened X back into the (5000 x 401) design matrix
reshapeX <- function(flattenedX){
  xReshaped <- matrix(flattenedX,
                      nrow = n_training_samples,
                      ncol = input_layer_size + 1,
                      byrow = FALSE)
  return(xReshaped)
}
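As a quick sanity check (mine, not part of the exercise), flattening the provided weights with unlist() and reshaping them should recover the original matrices:
rt <- reshapeParams(unlist(ex3weights))
all.equal(unname(rt$theta1), unname(ex3weights$Theta1))  # should be TRUE
all.equal(unname(rt$theta2), unname(ex3weights$Theta2))  # should be TRUE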
computeCost <- function(mythetas_flattened, myX_flattened, myy, mylambda = 0){
  # Modified to take an (m x k) dimensional y matrix
  # First unroll the parameters
  mythetas <- reshapeParams(mythetas_flattened)
  # Now unroll X
  myX <- reshapeX(myX_flattened)
  # This accumulates the total cost
  total_cost <- 0
  m <- n_training_samples
  for(irow in 1:m){
    myrow <- myX[irow, ]
    # Output-layer activations (the hypothesis) for this example
    myhs <- propagateForward(myrow, mythetas)[[2]][, 2]
    tmpy <- myy[irow, ]
    # Cross-entropy cost for this example
    mycost <- -crossprod(c(tmpy, 1 - tmpy), c(log(myhs), log(1 - myhs)))
    total_cost <- total_cost + mycost
  }
  total_cost <- total_cost / m
  # Regularization term. Note: this sums over ALL elements of each Theta,
  # including the bias column; the assignment excludes the bias terms, which is
  # likely why the regularized cost below comes out slightly high.
  total_reg <- 0
  for(mytheta in mythetas){
    total_reg <- total_reg + sum(mytheta * mytheta)
  }
  total_reg <- total_reg * mylambda / (2 * m)
  return(total_cost + total_reg)
}
propagateForward <- function(row, Thetas){
  features <- row
  zs_as_per_layer <- list()
  for(i in 1:length(Thetas)){
    Theta <- Thetas[[i]]
    # Theta1 is (25, 401), features are (401, 1),
    # so "z" comes out to be (25, 1):
    # one "z" value for each unit in the hidden layer, not counting the bias unit
    z <- Theta %*% features
    a <- sig(z)
    zs_as_per_layer[[i]] <- cbind(z, a)
    if(i == length(Thetas)){
      return(zs_as_per_layer)
    }
    # Add the bias unit and feed the activations into the next layer
    a <- c(1, a)
    features <- a
  }
}
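As an illustration (my own check, not part of the exercise), forward-propagating the first training example through the provided weights yields one activation per digit class:
h <- propagateForward(ex3data1[1, 1:401], ex3weights)[[2]][, 2]
length(h)      # 10
which.max(h)   # index of the most probable class for the first example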
With the provided weights, we should expect an initial (unregularized) cost of 0.287629.
computeCost(unlist(ex3weights), unlist(ex3data1[, 1:401]), newy)
## [,1]
## [1,] 0.2876292
With the regularization term, we should have an initial cost of 0.383770. As of 11/21/2016, this is slightly off, most likely because the regularization sum above includes the bias column of each Theta, which the assignment excludes.
computeCost(unlist(ex3weights), unlist(ex3data1[, 1:401]), newy, 1)
## [,1]
## [1,] 0.3844878
sigmoidGradient <- function(z){
sig(z) * (1 - sig(z))
}
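A quick spot check (mine): the sigmoid gradient peaks at 0.25 at z = 0 and shrinks toward 0 for large |z|:
sigmoidGradient(0)           # 0.25
sigmoidGradient(c(-10, 10))  # both close to 0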
# Random initialization: weights uniform in (-epsilon_init, epsilon_init)
genRandThetas <- function(epsilon_init = 0.12){
  t1 <- matrix(runif(hidden_layer_size * (input_layer_size + 1), -1),
               hidden_layer_size,
               input_layer_size + 1)
  t2 <- matrix(runif(output_layer_size * (hidden_layer_size + 1), -1),
               output_layer_size,
               hidden_layer_size + 1)
  return(list(Theta1 = t1 * epsilon_init, Theta2 = t2 * epsilon_init))
}
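As a check (my own), the unrolled parameter vector should have 25 * 401 + 10 * 26 = 10285 elements:
length(unlist(genRandThetas()))  # 10285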
backPropagate <- function(mythetas_flattened, myX_flattened, myy, mylambda = 0){
  # First unroll the parameters
  mythetas <- reshapeParams(mythetas_flattened)
  # Now unroll X
  myX <- reshapeX(myX_flattened)
  # Accumulators for the gradients
  Delta1 <- matrix(0, hidden_layer_size, input_layer_size + 1)
  Delta2 <- matrix(0, output_layer_size, hidden_layer_size + 1)
  m <- n_training_samples
  for(irow in 1:m){
    myrow <- myX[irow, ]
    a1 <- myrow
    # propagateForward returns (zs, activations) for each layer excluding the input layer
    temp <- propagateForward(myrow, mythetas)
    z2 <- temp[[1]][, 1]
    a2 <- temp[[1]][, 2]
    z3 <- temp[[2]][, 1]
    a3 <- temp[[2]][, 2]
    tmpy <- myy[irow, ]
    # Output-layer error, then back-propagate it through the hidden layer
    delta3 <- a3 - tmpy
    delta2 <- (t(mythetas[[2]])[-1, ] %*% delta3) * sigmoidGradient(z2)
    a2 <- c(1, a2)
    Delta1 <- Delta1 + delta2 %*% t(a1)
    Delta2 <- Delta2 + delta3 %*% t(a2)
  }
  D1 <- Delta1 / m
  D2 <- Delta2 / m
  # Regularization (bias columns excluded)
  D1[, -1] <- D1[, -1] + (mylambda / m) * mythetas[[1]][, -1]
  D2[, -1] <- D2[, -1] + (mylambda / m) * mythetas[[2]][, -1]
  return(unlist(list(D1, D2)))
}
# Actually compute the D matrices for the Thetas provided
flattenedD1D2 <- backPropagate(unlist(ex3weights),
                               unlist(ex3data1[, 1:401]),
                               newy,
                               mylambda = 0)
deltas <- reshapeParams(flattenedD1D2)
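The unrolled gradients should reshape back to the same dimensions as the corresponding Theta matrices (my own sanity check):
dim(deltas$theta1)  # 25 401
dim(deltas$theta2)  # 10 26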
checkGradient <- function(mythetas, myDs, myX, myy, mylambda = 0){
  myeps <- 0.0001
  flattened <- unlist(mythetas)
  flattenedDs <- unlist(myDs)
  myX_flattened <- unlist(myX)
  n_elems <- length(flattened)
  # Pick a random element, compute the numerical gradient there, and compare it
  # to the corresponding backpropagation gradient
  x <- sample(n_elems, 1)
  epsvec <- rep(0, times = n_elems)
  epsvec[x] <- myeps
  cost_high <- computeCost(flattened + epsvec, myX_flattened, myy, mylambda)
  cost_low <- computeCost(flattened - epsvec, myX_flattened, myy, mylambda)
  mygrad <- (cost_high - cost_low) / (2 * myeps)
  return(list(element = x, num.grad = mygrad, backprop.grad = flattenedDs[x]))
}
checkGradient(ex3weights, deltas, ex3data1[, 1:401], newy)
## $element
## [1] 7414
##
## $num.grad
## [,1]
## [1,] -0.0001192694
##
## $backprop.grad
## theta17414
## -0.0001192694
I train the network with optim, using the BFGS method with the cost from computeCost and the gradients from backPropagate.
trainNN <- function(mylambda = 0, it = 50){
  randomThetas_unrolled <- unlist(genRandThetas())
  result <- optim(par = randomThetas_unrolled,
                  fn = function(x){computeCost(x, unlist(ex3data1[, 1:401]), newy, mylambda)},
                  gr = function(x){backPropagate(x, unlist(ex3data1[, 1:401]), newy, mylambda)},
                  method = "BFGS", control = list(maxit = it))
  return(result)
}
start_time <- Sys.time()
learned_Thetas <- trainNN()
print(Sys.time() - start_time)
## Time difference of 2.187779 mins
Clearly this is not the most efficient implementation of a neural network.
# Predict the class of each row: the index of the largest output-layer activation
NNpred <- function(myX, myThetas){
  apply(myX, 1, function(x){
    which.max(propagateForward(x, myThetas)[[2]][, 2])
  })
}
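For comparison, here is a hedged sketch (mine, not from the assignment) of a fully vectorized forward pass that predicts all 5000 examples with two matrix multiplications instead of a row-by-row loop; it should agree with NNpred:
vectorizedPredict <- function(X, Thetas){
  a1 <- X                                      # 5000 x 401, bias column already included
  a2 <- cbind(1, sig(a1 %*% t(Thetas[[1]])))   # 5000 x 26, add bias unit
  a3 <- sig(a2 %*% t(Thetas[[2]]))             # 5000 x 10
  max.col(a3, ties.method = "first")           # predicted class per row
}
# Usage: vectorizedPredict(ex3data1[, 1:401], reshapeParams(learned_Thetas$par))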
pred <- NNpred(ex3data1[, 1:401],
               reshapeParams(learned_Thetas$par))
sum(pred== ex3data1[, ncol(ex3data1)]) / length(pred)
## [1] 0.925
Unfortunately, there’s no way to check the exact solution numerically. From the assignment:
If your implementation is correct, you should see a reported training accuracy of about 95.3% (this may vary by about 1% due to the random initialization)
It’s close, but I suspect the error in how regularization was implemented keeps it from performing quite as well.