require(knitr)
require(dplyr)
read_chunk("ex2/ex2_chunks.R")
read_chunk("ex4/ex4_chunks.R")

1 Neural Networks

set.seed(12345)
ex3data1 <- as.matrix(read.csv("data/ex3data1.csv"))
colnames(ex3data1) <- paste("X", seq(ncol(ex3data1)), sep = "")
colnames(ex3data1)[ncol(ex3data1)] <- "Y"
# Add the bias column of ones
ex3data1 <- cbind(X0 = 1, ex3data1)
# Sigmoid activation
sig <- function(x){1 / (1 + exp(-x))}
# Recode the labels 1-10 as an indicator matrix with one column per class
newy <- vector()
for(i in 1:10){
  newy <- cbind(newy, ex3data1[, "Y"] == i)
}
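
As a quick sanity check (assuming the full 5,000-example data set from the exercise), newy should be a 5000 x 10 indicator matrix with exactly one TRUE per row:

dim(newy)                # should be 5000 10
all(rowSums(newy) == 1)  # every example belongs to exactly one class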

1.1 Visualizing the data

This is the same visualization as in Exercise 3.

1.2 Model representation

This is largely based on kaleko’s Python script on GitHub.

I read the weights back in and store them as a list because that mimics what readMat returns. Since I had already processed them into separate CSVs, this is a roundabout way to do it, but changing it would mean reworking some of the structure the following computations rely on. I may tidy this up in the future.

ex3weights <- list(Theta1 = as.matrix(read.csv("data/ex3weights_Theta1.csv")),
                   Theta2 = as.matrix(read.csv("data/ex3weights_Theta2.csv")))
input_layer_size <- 400
hidden_layer_size <- 25
output_layer_size <- 10
n_training_samples <- 5000
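
The loaded weights should match the architecture above: Theta1 maps the 400 inputs (plus the bias unit) to the 25 hidden units, and Theta2 maps the 25 hidden units (plus bias) to the 10 outputs.

dim(ex3weights$Theta1)   # should be 25 401
dim(ex3weights$Theta2)   # should be 10 26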

1.3 Feedforward and cost function

unlist() does what flattenParams does in kaleko’s Python script: it flattens the list of weight matrices into a single vector in column-major order, which reshapeParams below reverses.

reshapeParams <- function(flattened_array){
    theta1 <- matrix(flattened_array[1:((input_layer_size+1)*hidden_layer_size)],
                     nrow = hidden_layer_size,
                     ncol = input_layer_size + 1,
                     byrow = FALSE)

    theta2 <- matrix(flattened_array[((input_layer_size+1)*hidden_layer_size + 1):
                                         length(flattened_array)],
                     nrow = output_layer_size,
                     ncol = hidden_layer_size + 1,
                     byrow = FALSE)

    return(list(theta1 = theta1, theta2 = theta2))
}
reshapeX <- function(flattenedX){
    xReshaped <- matrix(flattenedX,
                        nrow = n_training_samples,
                        ncol = (input_layer_size+1),
                        byrow = FALSE)
    return(xReshaped)
}
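# Sanity check: reshapeParams undoes unlist, since both follow R's
# column-major ordering. For example (with ex3weights as loaded above),
#   all.equal(reshapeParams(unlist(ex3weights))$theta1, ex3weights$Theta1,
#             check.attributes = FALSE)
# should be TRUE.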
computeCost <- function(mythetas_flattened, myX_flattened, myy, mylambda = 0){
    # Modified to take (m X k) dimensional y matrix

    # First unroll the parameters
    mythetas <- reshapeParams(mythetas_flattened)

    # Now unroll X
    myX <- reshapeX(myX_flattened)

    #This is what will accumulate the total cost
    total_cost <- 0

    m <- n_training_samples

    for(irow in 1:m){
        myrow <- myX[irow, ]
        myhs <- propagateForward(myrow,mythetas)[[2]][,2]
        tmpy <- myy[irow, ]
        mycost <- - crossprod(c(tmpy, 1 - tmpy), c(log(myhs), log(1 - myhs)))
        total_cost <- total_cost + mycost
    }
    total_cost <- total_cost / m

    total_reg <- 0
    for(mytheta in mythetas){
        total_reg <- total_reg + sum(mytheta * mytheta)
    }
    total_reg <- total_reg * mylambda / (2 * m)
    return(total_cost + total_reg)
}
propagateForward <- function(row, Thetas){
    features <- row
    zs_as_per_layer <- list()

    for(i in 1:length(Thetas)){
        Theta <- Thetas[[i]]

        #Theta1 is (25,401), features are (401, 1)
        #so "z" comes out to be (25, 1)
        #this is one "z" value for each unit in the hidden layer
        #not counting the bias unit
        z <- Theta %*% features
        a <- sig(z)
        zs_as_per_layer[[i]] <- cbind(z, a)
        if(i == length(Thetas)) {
            return(zs_as_per_layer)
        }
        a <- c(1, a)
        features <- a
    }
}
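
For a single example, propagateForward returns one list element per layer past the input; each element is a two-column matrix holding that layer’s z values and activations, which is why computeCost takes [[2]][, 2] as the hypothesis. For instance:

ff <- propagateForward(ex3data1[1, 1:401], ex3weights)
dim(ff[[1]])   # should be 25 2 (hidden layer: z and a)
dim(ff[[2]])   # should be 10 2 (output layer: z and a)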

We should expect an initial cost of 0.287629.

computeCost(unlist(ex3weights), unlist(ex3data1[, 1:401]), newy)
##           [,1]
## [1,] 0.2876292

1.4 Regularized cost function

With the regularization term, we should have an initial cost of 0.383770. As of 11/21/2016, this is slightly off.

computeCost(unlist(ex3weights), unlist(ex3data1[, 1:401]), newy, 1)
##           [,1]
## [1,] 0.3844878
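
The gap comes from the penalty in computeCost summing over the bias column of each Theta, which the exercise excludes. As a quick check (not part of the pipeline above), recomputing the penalty without the first column of each weight matrix and adding it to the unregularized cost should recover roughly 0.383770.

corrected_reg <- sum(sapply(ex3weights, function(theta) sum(theta[, -1]^2))) /
    (2 * n_training_samples)
computeCost(unlist(ex3weights), unlist(ex3data1[, 1:401]), newy) + corrected_reg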

2 Backpropagation

2.1 Sigmoid gradient

sigmoidGradient <- function(z){
    sig(z) * (1 - sig(z))
}
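
The gradient peaks at z = 0, where it equals 0.25, and vanishes for large |z|:

sigmoidGradient(0)              # 0.25
sigmoidGradient(c(-10, 0, 10))  # near 0 at the extremes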

2.2 Random initialization

genRandThetas <- function(epsilon_init = 0.12){
    # Draw both weight matrices uniformly from (-1, 1), then scale into
    # (-epsilon_init, epsilon_init)
    t1 <- matrix(runif(hidden_layer_size * (input_layer_size+1), -1),
                 hidden_layer_size,
                 input_layer_size + 1)
    t2 <- matrix(runif(output_layer_size * (hidden_layer_size+1), -1),
                 output_layer_size,
                 hidden_layer_size + 1)
    return(list(Theta1 = t1 * epsilon_init, Theta2 = t2 * epsilon_init))
}
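
Both matrices end up uniformly distributed in (-epsilon_init, epsilon_init), which a quick range check confirms:

range(unlist(genRandThetas()))   # should fall inside (-0.12, 0.12)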

2.3 Backpropagation

backPropagate <- function(mythetas_flattened, myX_flattened, myy, mylambda = 0){
    # First unroll the parameters
    mythetas <- reshapeParams(mythetas_flattened)

    # Now unroll X
    myX <- reshapeX(myX_flattened)

    Delta1 <- matrix(0, hidden_layer_size, input_layer_size + 1)
    Delta2 <- matrix(0, output_layer_size, hidden_layer_size + 1)

    m <- n_training_samples
    for(irow in 1:m){
        myrow <- myX[irow, ]
        a1 <- myrow
        # propagateForward returns (zs, activations) for each layer excluding the input layer
        temp <- propagateForward(myrow, mythetas)
        z2 <- temp[[1]][, 1]
        a2 <- temp[[1]][, 2]
        z3 <- temp[[2]][, 1]
        a3 <- temp[[2]][, 2]
        tmpy <- myy[irow, ]
        delta3 <- a3 - tmpy

        delta2 <- (t(mythetas[[2]])[-1,] %*% delta3) * sigmoidGradient(z2)
        a2 <- c(1,a2)
        Delta1 <- Delta1 + delta2 %*% t(a1)
        Delta2 <- Delta2 + delta3 %*% t(a2)
    }
    D1 <- Delta1 / m
    D2 <- Delta2 / m

    #Regularization:
    D1[, -1] <- D1[, -1] + (mylambda / m) * mythetas[[1]][,-1]
    D2[, -1] <- D2[, -1] + (mylambda / m) * mythetas[[2]][,-1]

    return(unlist(list(D1, D2)))
}
#Actually compute D matrices for the Thetas provided
flattenedD1D2 <- backPropagate(unlist(ex3weights),
                              unlist(ex3data1[, 1:401]),
                              newy,
                              mylambda = 0)

deltas <- reshapeParams(flattenedD1D2)
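
The reshaped gradients have the same dimensions as the corresponding weight matrices, so individual entries can be compared one for one in the gradient check below:

dim(deltas$theta1)   # should be 25 401
dim(deltas$theta2)   # should be 10 26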

2.4 Gradient checking

checkGradient <- function(mythetas, myDs, myX, myy, mylambda = 0){
    myeps <- 0.0001
    flattened <- unlist(mythetas)
    flattenedDs <- unlist(myDs)
    myX_flattened <- unlist(myX)
    n_elems <- length(flattened)
    # Pick a random element, compute its numerical gradient, and compare it
    # to the corresponding backpropagated D
    x <- sample(n_elems, 1)
    epsvec <- rep(0, times = n_elems)
    epsvec[x] <- myeps
    cost_high <- computeCost(flattened + epsvec, myX_flattened, myy, mylambda)
    cost_low <- computeCost(flattened - epsvec, myX_flattened, myy, mylambda)
    mygrad <- (cost_high - cost_low) / (2 * myeps)
    return(list(element = x, num.grad = mygrad, backprop.grad = flattenedDs[x]))
}
checkGradient(ex3weights, deltas, ex3data1[, 1:401], newy)
## $element
## [1] 7414
## 
## $num.grad
##               [,1]
## [1,] -0.0001192694
## 
## $backprop.grad
##    theta17414 
## -0.0001192694

2.5 Regularized Neural Networks
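
The regularization terms are already folded into computeCost and backPropagate above through their mylambda arguments. As a quick exercise of that path (not run here), the gradient check can be repeated with a nonzero lambda; note that the penalty in computeCost also covers the bias columns while backPropagate leaves them unregularized, so the two gradients can disagree at bias-column entries.

flatD1D2_reg <- backPropagate(unlist(ex3weights), unlist(ex3data1[, 1:401]),
                              newy, mylambda = 1)
checkGradient(ex3weights, reshapeParams(flatD1D2_reg), ex3data1[, 1:401],
              newy, mylambda = 1)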

2.6 Learning parameters using optim

trainNN <- function(mylambda = 0, it = 50){
    randomThetas_unrolled <- unlist(genRandThetas())
    result <- optim(par = randomThetas_unrolled,
                    fn = function(x){computeCost(x, unlist(ex3data1[, 1:401]), newy, mylambda)},
                    gr = function(x){backPropagate(x, unlist(ex3data1[, 1:401]), newy, mylambda)},
                    method = "BFGS", control = list(maxit = it))
    return(result)
}
start_time <- Sys.time()
learned_Thetas <- trainNN()
print(Sys.time() - start_time)
## Time difference of 2.187779 mins

Clearly not the most efficient implementation of neural networks.
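
Much of that comes from looping over the 5,000 training examples one at a time inside computeCost and backPropagate. A vectorized forward pass (a sketch, not used above; it assumes X still carries the bias column X0) pushes all examples through the network with two matrix products:

forwardAll <- function(X, Thetas){
    a2 <- cbind(1, sig(X %*% t(Thetas[[1]])))   # hidden activations plus bias unit
    sig(a2 %*% t(Thetas[[2]]))                  # output activations, one row per example
}
# e.g. max.col(forwardAll(ex3data1[, 1:401], reshapeParams(learned_Thetas$par)),
#              ties.method = "first") reproduces the predictions computed below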

NNpred <- function(myX, myThetas){
    # Predicted class = index of the output unit with the largest activation
    apply(myX, 1, function(x){
        which.max(propagateForward(x, myThetas)[[2]][, 2])
    })
}
pred <- NNpred(ex3data1[, 1:401],
               reshapeParams(learned_Thetas$par))
sum(pred == ex3data1[, ncol(ex3data1)]) / length(pred)
## [1] 0.925

Unfortunately, there’s no exact value to check against numerically, since the reported accuracy depends on the random initialization. From the assignment:

If your implementation is correct, you should see a reported training accuracy of about 95.3% (this may vary by about 1% due to the random initialization)

It’s close, but I suspect the error in how regularization was implemented (noted in 1.4) is keeping it from performing quite as well.

3 Visualizing the hidden layer

Again, displayData was already written in the included Matlab files, so I’ve skipped it here.

3.1 Optional (ungraded) exercise

I may revisit the optional exercises once I’m finished with the “main” content.