Charlie Homewood

Monte Carlo Simulations Project R Scripts



The following are the R scripts I wrote for this Monte Carlo simulations project. Each script corresponds to each part of the project, producing the visualisations seen in the report.

These scripts were submitted in completion of the Monte Carlo Simulations (L7) (865G1) module.



Part 1

    # initialise values

    seed = XXX
    n = 30000
    alpha = 3
    k = 1
    mu = 0

    # ensure plots folder is created
    if (!dir.exists("plots")) dir.create("plots")

    # define a function to ensure constraints mentioned in part 1 are held

    constraints = function(seed, n, alpha, k, mu) {
    
        # begin by assuming all constraints are met, then test them
        constraints = TRUE
        
        # ensure all inputs are numeric, alpha & k are positive reals, and mu is real
        
        ### if any input is not a numeric value, return FALSE
        if (!is.numeric(seed) || !is.numeric(n) || !is.numeric(alpha) || 
            !is.numeric(k) || !is.numeric(mu)) {
            constraints = FALSE
            stop("All arguments must be numeric")
        } 
        ### if alpha or k are <= 0 OR infinite, return FALSE
        if (alpha <= 0 || k <= 0 || !is.finite(alpha) || !is.finite(k)) {
            constraints = FALSE
            stop("alpha and k must be positive real numbers")
        } 
        if (!is.finite(mu)) {
            constraints = FALSE
            stop("mu must be a finite real number")
        }
        
        return(constraints)
    
    }

    # generate U ~ U[0,1]

    U = function(seed, n) {
        set.seed(seed)
        runif(n, min = 0, max = 1)
    }

    # define theoretical pdf, with equivalent parameters to the MC estimate

    pdf = function(x, alpha, k, mu) {
    
        # apply constraints logic
        constraints(seed, n, alpha, k, mu)
        
        # define pdf as shown in Part 1 instructions
        pdf = ifelse(
            x >= mu, 
            (alpha/k) * ((k-mu+x)/k)^(-alpha-1),
            0
        )

        return(pdf)
    
    } 

    # define cdf 

    cdf = function(x, alpha, k, mu) {
    
        # apply constraints logic
        constraints(seed, n, alpha, k, mu)
        
        # using the cdf defined in eq. 1
        ifelse(
            x >= mu, 
            1 - ((k - mu + x) / k)^-alpha, 
            0
        )
    
    }

    # define quantile function

    inverse_cdf = function(seed, n, alpha, k, mu) {
    
        # apply constraints logic
        constraints(seed, n, alpha, k, mu)
        
        # using the quantile function defined in eq. 2, with x = U
        inv_cdf = k * (1/(1-U(seed, n)))^(1/alpha) - k + mu
        return(inv_cdf)
    
    }

    # define histogram plot function

    plot_hist = function() {
    
        hist(
            x = inverse_cdf(seed, n, alpha, k, mu),
            breaks = "Freedman-Diaconis", # h = 2 * (IQR(x) / n^(1/3))
            freq = FALSE,
            main = "",
            cex = 2,
            xlab = "x",
            xlim = c(0, 3),
            ylim = c(0, 3)
        )
        
        curve(pdf(x, alpha, k, mu), from = 0, to = 5, col = "red", lwd = 2, add = TRUE)

        legend(
            "topright",                    
            legend = expression(paste(f(x), ": ", alpha == 3, ", ", italic(k) == 1, ", ", mu == 0)), 
            col = c("red"), 
            lwd = 2, lty = 1, bty = "n", cex = 1.5
        ) 
    
    }

    # plot histogram with theoretical pdf
    png("plots/histogram.png", width = 8, height = 6, units = "in", res = 300)

    par(mar = c(4, 4, 1, 1))

    plot_hist()

    dev.off()

    # goodness-of-fit test, as a function of sample size: k-s test

    goodness_of_fit_test = function() {
    
        # initialise dataframe to store test results in. 
        test_results = data.frame(
            sample_size = numeric(),
            statistic = numeric(),
            p_value = numeric()
        )
        
        # to avoid having each ks-test run on subsets of the same set of generated values from f(x), we need to change the seed each time we generate samples from inverse_cdf(). To do this, I am going to generate values using U(seed, n). Then, each of these values will be used as a seed for each KS test. This ensures that each KS test runs on independent samples, but is still reproducible as the seeds are generated using the global seed (i.e. 576).
        
        list_of_seeds = U(seed, n)
        
        # for each number between 1 and n, run the ks-test and store the sample size and p-value in the test_results dataframe
        progress_bar = txtProgressBar(min = 1, max = n, style = 3)
        print("Performing KS Tests...")
        
        for (i in 1:n) {
            
            if (i == 1 || i %% 50 == 0) {
            
            result = ks.test(
                inverse_cdf(list_of_seeds[i], i, alpha, k, mu), 
                function(x) cdf(x, alpha, k, mu)
            )
            
            test_results = rbind(test_results, data.frame(
                sample_size = i,
                statistic = result$statistic,
                p_value = result$p.value
            ))
            
            setTxtProgressBar(progress_bar, i)
        }
        
    }
    
    # plot the results on line graphs
    
    par(mfrow = c(1, 2))
    
    plot(
        test_results$sample_size,
        test_results$statistic,
        type = "l", col = "darkorange", cex = 2,
        main = "Test Statistic vs Sample size",
        xlab = "Sample Size", xlim = c(1, n),
        ylab = "Test Statistic", ylim = c(0, 1)
    )
    
    plot(
        test_results$sample_size,
        test_results$p_value,
        type = "l", col = "darkgreen", cex = 2,
        main = expression(bold(bolditalic(p)*"-value vs Sample size")),
        xlab = "Sample Size", xlim = c(1, n),
        ylab = expression(italic(p)*"-value"), ylim = c(0, 1)
    )
    
    # significance threshold = 0.05
    abline(h = 0.05, col = "red", lty = 2, lwd = 5)
    
    }

    png("plots/goodness_of_fit.png", width = 16, height = 8, units = "in", res = 300)

    par(mar = c(4, 4, 1, 1))

    goodness_of_fit_test()

    dev.off()

    # importance sampling

    ## plot proposal distribution(s)

    proposal_plot = function() {
    
        plot(
            0, type = "n", xlab = "X", ylab = "Density", 
            xlim = c(0, 3), ylim = c(0, 3), cex = 2
        )
        
        curve(pdf(x, alpha, k, mu), col = "black", lwd = 2, from = 0, to = 3, ylab = "Density")
        curve(pdf(x, alpha = 4, k = 2, mu), col = "red", lwd = 2, from = 0, to = 3, add = TRUE)
        
        legend(
            "topright",                    
            legend = c(
            expression(paste(f(x), ": ", alpha == 3, ", ", italic(k) == 1, ", ", mu == 0)), 
            expression(paste(g(x), ": ", alpha == 4, ", ", italic(k) == 2, ", ", mu == 0))
            ), 
            col = c("black", "red"), 
            lwd = 2, lty = 1, bty = "n", cex = 1.5
        )  
    
    }

    png("plots/proposal_plot.png", width = 9, height = 5, units = "in", res = 300)

    par(mar = c(4, 4, 1, 1))

    proposal_plot()

    dev.off()

    # perform importance sampling algorithm

    importance_sampling = function() {
    
        gx_samples = inverse_cdf(seed, n, alpha = 4, k = 2, mu)
        
        weights = pdf(gx_samples, alpha, k, mu) / pdf(gx_samples, alpha = 4, k = 2, mu)
        
        options(scipen = 999)
        
        simple_mean = mean(inverse_cdf(seed, n, alpha, k, mu))
        simple_variance = var(inverse_cdf(seed, n, alpha, k, mu))
        IS_mean = mean(gx_samples * weights)
        IS_variance = var(gx_samples * weights) / n
        
        results = data.frame(
            Mean = c(simple_mean, IS_mean),
            Variance = c(simple_variance, IS_variance)
        )
        
        rownames(results) = c("Simple", "IS")

        print(results)
    
    }

    importance_sampling()

  

Part 2

    # initialise values

    seed = XXX            # random seed
    n_states = 5          # number of states in Markov chain
    n = 100000            # (max) sample size for empirical distribution
    options(scipen = 999) # avoids printing in scientific notation

    # ensure plots folder is created
    if (!dir.exists("plots")) dir.create("plots")

    # generate transition matrix

    generate_transition_matrix = function(seed, n_states) {
    
    set.seed(seed)
    
    transition_matrix = matrix(runif(n_states * n_states), nrow = n_states)
    transition_matrix = t(apply(transition_matrix, 1, function(x) x / sum(x))) # Normalize each row to sum to 1
    
    return(transition_matrix)
    
    }

    P = generate_transition_matrix(seed, n_states)

    # check if P is irreducible

    is_irreducible = function(matrix, max_iterations) {

        irreducible = FALSE
        
        if (sqrt(length(matrix)) %% 1 != 0) {
            stop("P must be a square matrix")
        }

        Pk = matrix

        for (k in 1:max_iterations) {

            Pk = Pk %*% matrix

            if (all(Pk > 0)) {
            # if all the elements are positive, the transition matrix is irreducible
            irreducible = TRUE
            return(list(
                irreducible = irreducible,
                message = paste("Matrix is irreducible (k = ", k, ")", sep = "")
            ))
            } else {
            return(list(
                irreducible = irreducible,
                message = paste("Matrix is still reducible after", max_iterations, "iterations")
            ))
            }
        }
    }

    is_irreducible(matrix = P, max_iterations = 100000)

    # check if P is aperiodic

    is_aperiodic = function(matrix) {
    
        # if P is irreducible, then we just need to check that at least 1 P[i,i]
        # is > 0. If P is not irreducible, we would need to correct that.
        if (is_irreducible(matrix = matrix, max_iterations = 1000)$irreducible == TRUE) {
            for (i in 1:length(diag(matrix))) {
            if (i > 0) {
                return(TRUE)
            }
            }
        } else {
            return(FALSE)
        }
    
    }

    is_aperiodic(matrix = P)

    # check if P is positive recurrent

    is_positive_recurrent = function(matrix) {
    
        # if P is aperiodic, is.aperiodic(P) == TRUE also means P is irreducible (see
        # function code). as such, P simply needs to have a finite number of states
        # to be positive recurrent
        
        if (is_aperiodic(matrix = matrix) == TRUE) {
            return(is.finite(nrow(matrix)))
        }
    
    }

    is_positive_recurrent(matrix = P)

    # calculate invariant distribution

    invariant_distribution = function(matrix) {
    
        # eigen(matrix) gives the right-eigenvectors of a matrix. So, to get the
        # left-eigenvectors, we can transpose the matrix eigen(t(matrix)). Since the
        # right-eigenvectors of t(matrix) == left-eigenvectors of matrix
        
        # get eigenvalues and eigenvectors of transposed transition matrix
        eigen = eigen(t(matrix))
        
        # extract real components of eigenvalues
        eigenvalues = Re(round(eigen$values, 8))
        
        # extract real components of eigenvectors
        left_eigenvectors = Re(eigen$vectors)
        
        # obtain pi (the left-eigenvector with eigenvalue = 1)
        pi = left_eigenvectors[, which(eigenvalues == 1)]
        
        # normalise pi
        invariant_dist  = pi / sum(pi)
        
        return(invariant_dist)
    
    }

    invariant_distribution(matrix = P)

    # simulate Markov chain

    simulate_Markov_chain = function(n_states, matrix, sample_size) {
    
        set.seed(seed)
        
        samples = sample(
            1:n_states, 
            size = sample_size, 
            replace = TRUE, 
            prob = invariant_distribution(matrix = matrix)
        )
        
        empirical_distribution = table(samples, dnn = "") / sample_size
        
        return(empirical_distribution)
    
    }

    simulate_Markov_chain(n_states = n_states, matrix = P, sample_size = n)

    # compare empirical v theoretical invariant distributions

    empirical_vs_theoretical = function() {
    
        differences = data.frame()
        
        for (i in seq(100, n, by = 100)) {
            
            sample_size = i
            empirical = simulate_Markov_chain(n_states = n_states, matrix = P, sample_size = i)
            theoretical = invariant_distribution(matrix = P)
            
            difference = empirical - theoretical
            
            differences = rbind(
            differences, 
            data.frame(
                sample_size = sample_size,
                state_1 = difference[1],
                state_2 = difference[2],
                state_3 = difference[3],
                state_4 = difference[4],
                state_5 = difference[5]
            )
            )
            
        }
        
        matplot(
            
            x = differences$sample_size, 
            y = differences[,2:ncol(differences)], 
            
            type = "l", 
            lty = 1, 
            lwd = 2, 
            col = rainbow(ncol(differences)-1), 
            
            main = "",
            xlab = "Sample Size",
            ylab = "Difference (Empirical - Theoretical)",
            
            ylim = c(-0.1, 0.1)
            
        )
        
        abline(h = 0, col = "black", lty = 2, lwd = 2)

        legend(
            "topright", 
            legend = paste("State", 2:ncol(differences)-1), 
            col = rainbow(ncol(differences)-1), 
            lty = 1, 
            lwd = 2
        )

    }

    png("plots/empirical_vs_theoretical.png", width = 8, height = 6, units = "in", res = 300)

    par(mar = c(4, 4, 1, 1))

    empirical_vs_theoretical()

    dev.off()
  

Part 3

    # Initialise values

    seed = XXX
    N = 1.1 * 10^4
    N_I = floor(N * (2000 / (9.2 * 10^6)))
    N_RC = N * 0.05
    L_0 = 124
    t_span = 57

    options(scipen = 999)

    # ensure plots folder is created
    if (!dir.exists("plots")) dir.create("plots")

    initialise_particle_df = function(N, N_I, N_RC, L_0, seed) {
        
        set.seed(seed)
        
        N_SI = N - N_RC
        N_S = N_SI - N_I
        
        IDs = sample(1:N)
        
        N_SI_IDs = IDs[1:N_S]
        N_I_IDs = IDs[(N_S + 1):(N_S + N_I)]
        N_RC_IDs = IDs[(N_S + N_I + 1):(N_S + N_I + N_RC)]
        
        particle_df = data.frame(
            x = runif(N, 0, L_0),
            y = runif(N, 0, L_0),
            SIR_status = rep(NA, N),
            day_vaccinated = rep(Inf, N),
            vaccine_effectiveness = rep(0, N),
            day_infected = rep(NA, N),
            is_infectious = rep(FALSE, N),
            age_group = cut(
            1:N,
            breaks = c(0, N * 0.2, N * 0.83, N),
            labels = c("0-11", "12-59", "60+")
            )
        )
        
        particle_df$SIR_status[N_SI_IDs] = "susceptible"
        particle_df$SIR_status[N_I_IDs] = "infected"
        particle_df$day_infected[N_I_IDs] = -3
        particle_df$SIR_status[N_RC_IDs] = "recovered"
        
        vaccinated_12_59 = sample(
            which(particle_df$age_group == "12-59"), 
            size = round(0.15 * length(which(particle_df$age_group == "12-59")))
        )
        vaccinated_60plus = sample(
            which(particle_df$age_group == "60+"), 
            size = round(0.70 * length(which(particle_df$age_group == "60+")))
        )
        
        particle_df$day_vaccinated[c(vaccinated_12_59, vaccinated_60plus)] = 1
        
        return(particle_df)
    
    }

    init_df = initialise_particle_df(N = N, N_I = N_I, N_RC = N_RC, L_0 = L_0, seed = seed)

    vaccine_effectiveness = function(particle_df, t, VE_days = c(7, 28, 150, 180), VE_level) {
    
        T_V = particle_df$day_vaccinated
        VE = numeric(length(T_V))  
        is_vaccinated = is.finite(T_V)
        
        VE[is_vaccinated & (t - T_V) <= VE_days[1]] = 0
        
        days_8_to_28 = is_vaccinated & (t - T_V) > VE_days[1] & (t - T_V) <= VE_days[2]
        VE[days_8_to_28] = VE_level * (((t - T_V)[days_8_to_28] - VE_days[1]) / (VE_days[2] - VE_days[1]))
        
        days_29_to_150 = is_vaccinated & (t - T_V) > VE_days[2] & (t - T_V) <= VE_days[3]
        VE[days_29_to_150] = VE_level
        
        days_151_180 = is_vaccinated & (t - T_V) > VE_days[3] & (t - T_V) <= VE_days[4]
        VE[days_151_180] = VE_level * (1 - (((t - T_V)[days_151_180] - VE_days[3]) / (3 * (VE_days[4] - VE_days[3]))))
        
        days_181_plus = is_vaccinated & (t - T_V) > VE_days[4]
        VE[days_181_plus] = 2 * (VE_level / 3)
        
        particle_df$vaccine_effectiveness = VE

        return(particle_df)

    }

    is_infectious = function(particle_df, t) {
    
        infectious_status = rep(FALSE, nrow(particle_df))
        
        infected = (particle_df$SIR_status == "infected")
        
        infectious_now = infected & (t - particle_df$day_infected >= 4)
        
        infectious_status[infectious_now] = TRUE
        particle_df$is_infectious = infectious_status
        
        return(particle_df)
    
    }

    box_muller = function(n, mu, sigma, seed) {

        set.seed(seed)
        
        U1 = runif(n, min = 0, max = 1)
        U2 = runif(n, min = 0, max = 1)
        
        Z0 = sqrt(-2 * log(U1)) * cos(2 * pi * U2)
        Z1 = sqrt(-2 * log(U1)) * sin(2 * pi * U2)
        
        X0 = mu + sigma * Z0
        X1 = mu + sigma * Z1
        
        return(list(X0 = X0, X1 = X1))

    }

    infection_probability = function(particle_df, t, sigma) {
        
        susceptible_idx = which(particle_df$SIR_status == "susceptible")
        infectious_idx = which(particle_df$is_infectious)
        
        if (length(susceptible_idx) == 0 || length(infectious_idx) == 0) {
            return(particle_df)
        }
        
        dist_sq = outer(particle_df$x[susceptible_idx], particle_df$x[infectious_idx], "-")^2 + outer(particle_df$y[susceptible_idx], particle_df$y[infectious_idx], "-")^2
        
        decay = exp(- (dist_sq / (2 * sigma^2))^2)
        
        ve_matrix = matrix(1 - particle_df$vaccine_effectiveness[susceptible_idx], nrow = length(susceptible_idx), ncol = length(infectious_idx))
        infection_matrix = ve_matrix * decay
        
        infection_prob = rowSums(infection_matrix)
        
        newly_infected = which(infection_prob > 0)
        if (length(newly_infected) > 0) {
            infected_ids = susceptible_idx[newly_infected]
            particle_df$SIR_status[infected_ids] = "infected"
            particle_df$day_infected[infected_ids] = t
        }
        
        return(particle_df)

    }

    simulation = function(N, N_I, N_RC, L_0, seed, t_span, R_t, sigma, VE_level) {

        particle_df = initialise_particle_df(N = N, N_I = N_I, N_RC = N_RC, L_0 = L_0, seed = seed)
        L = (L_0 * sqrt(1.25)) / R_t
        daily_new_infections = numeric(t_span)
        
        pb = txtProgressBar(min = 0, max = t_span, style = 3)
        
        for (t in 1:t_span) {
            
            particle_df = vaccine_effectiveness(particle_df = particle_df, t, VE_level = VE_level)
            particle_df = is_infectious(particle_df, t)
            
            particle_df = infection_probability(particle_df, t, sigma = sigma)
            
            newly_infected_today = sum(particle_df$day_infected == t, na.rm = TRUE)
            daily_new_infections[t] = newly_infected_today
            
            k = 1
            total_target_vaccinated = floor(N * 1 - exp(-k * t))
            
            current_vaccinated = sum(is.finite(particle_df$day_vaccinated))
            new_vaccinations_needed = total_target_vaccinated - current_vaccinated
            
            if (new_vaccinations_needed > 0) {
            eligible = which(!is.finite(particle_df$day_vaccinated))
            
            if (length(eligible) > 0) {
                n_to_vaccinate = min(new_vaccinations_needed, length(eligible))
                chosen = sample(eligible, n_to_vaccinate)
                particle_df$day_vaccinated[chosen] = t
            }
        }
            
            bm = box_muller(n = nrow(particle_df), mu = 0, sigma = L_0 / 2, seed = seed + t)
            particle_df$x = (particle_df$x + bm$X0) %% L
            particle_df$y = (particle_df$y + bm$X1) %% L
            
            setTxtProgressBar(pb, t)
            
        }
        
        close(pb)

        return(list(particle_df = particle_df, daily_new_infections = daily_new_infections))
    
    }

    results = simulation(N, N_I, N_RC, L_0, seed, t_span = t_span, R_t = 1.1, sigma = 0.1, VE_level = 0.9)
    moving_average = stats::filter(results$daily_new_infections, rep(1/7, 7), sides = 1)

    png("plots/reproduced_figure1B.png", width = 8, height = 6, units = "in", res = 300)

    par(mar = c(4, 4, 1, 1))

    plot(
        moving_average, 
        type = "l", 
        col = "blue", 
        lwd = 2,
        ylab = "Daily new cases, All (7-days moving average)",
        ylim = c(0, 1200),
        xlab = "Day",
        main = ""
    )

    dev.off()