Executive Summary

This analysis applies stochastic modeling techniques to insurance claims data, focusing on:

  • Frequency Modeling: Fitting discrete distributions (Poisson, Negative Binomial, Geometric) to claim frequencies
  • Severity Modeling: Fitting continuous distributions to claim amounts
  • Monte Carlo Simulation: Estimating risk premiums with variance reduction techniques
  • Risk Assessment: Calculating Value-at-Risk and Combined Ratios for reinsurance decisions

Data Loading and Preprocessing

# Load and examine the dataset
data <- read.csv("DATA_SET_4.csv", sep = ";")
cat("Original dataset dimensions:", dim(data), "\n")
## Original dataset dimensions: 1002 17
head(data) %>% kable(caption = "Sample of Raw Data")
Sample of Raw Data
id CLM_FREQ CLM_AMT_1 CLM_AMT_2 CLM_AMT_3 CLM_AMT_4 CLM_AMT_5 CLM_AMT_6 CLM_AMT_7 CLM_AMT_8 CAR_USE CAR_TYPE AGE GENDER AREA PREMIUM X
1 6 580 591 640 569 652 617 - - Private Sedan 60 M Urban 1 727 NA
2 4 578 585 610 667 - - - - Commercial Sedan 43 M Urban 1 803 NA
3 3 656 527 671 - - - - - Private Van 48 M Urban 1 527 NA
4 3 640 668 597 - - - - - Private SUV 35 F Urban 1 451 NA
5 3 624 490 601 - - - - - Private Sedan 51 M Urban 1 895 NA
6 1 610 - - - - - - - Private SUV 50 F Urban 1 812 NA
# Data cleaning pipeline
data$X <- NULL  # Remove the empty trailing column

# Remove policies with zero claims (no claims experience)
data <- data[data$CLM_FREQ != 0, ]

# Clean premium column: strip the space used as a thousands separator (e.g. "1 727")
data$PREMIUM <- as.numeric(gsub(" ", "", data$PREMIUM))

# Clean claim amount columns: "-" marks an absent claim, so recode it as 0
for (i in 1:8) {
  col_name <- paste0("CLM_AMT_", i)
  data[[col_name]] <- gsub(" ", "", data[[col_name]])
  data[[col_name]][data[[col_name]] %in% c("-", "")] <- "0"
  data[[col_name]] <- as.integer(data[[col_name]])
  # Drop rows with negative or non-numeric (NA after coercion) amounts
  data <- data[!is.na(data[[col_name]]) & data[[col_name]] >= 0, ]
}

# Calculate total claim amounts
data$TOTAL_CLM_AMT <- rowSums(data[, paste0("CLM_AMT_", 1:8)], na.rm = TRUE)

cat("Cleaned dataset dimensions:", dim(data), "\n")
## Cleaned dataset dimensions: 946 17
summary(data[, c("CLM_FREQ", "PREMIUM", "TOTAL_CLM_AMT")]) %>% kable(caption = "Summary Statistics")
Summary Statistics
CLM_FREQ PREMIUM TOTAL_CLM_AMT
Min. :1.000 Min. :1400 Min. : 469
1st Qu.:2.000 1st Qu.:1538 1st Qu.:1169
Median :3.000 Median :1681 Median :1760
Mean :3.094 Mean :1688 Mean :1853
3rd Qu.:4.000 3rd Qu.:1841 3rd Qu.:2428
Max. :8.000 Max. :1999 Max. :5059
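
As a quick sanity check on the cleaned data, each policy's CLM_FREQ should equal its number of non-zero claim columns; a minimal sketch:

# Sanity check (sketch): claim frequency should match the count of non-zero amounts
n_nonzero <- rowSums(data[, paste0("CLM_AMT_", 1:8)] > 0)
cat("Rows where CLM_FREQ matches the claim count:", sum(n_nonzero == data$CLM_FREQ),
    "of", nrow(data), "\n")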

Claim Frequency Analysis

Distribution Fitting

# Visualize claim frequency distribution
ggplot(data, aes(x = CLM_FREQ)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Claim Frequencies",
       x = "Number of Claims", y = "Frequency") +
  theme_minimal()

# Fit discrete distributions to claim frequency
fit_poisson <- fitdist(data$CLM_FREQ, "pois", method = "mle")
fit_nbinom <- fitdist(data$CLM_FREQ, "nbinom", method = "mle")
fit_geom <- fitdist(data$CLM_FREQ, "geom", method = "mle")

# Compare model fits
fit_list <- list(Poisson = fit_poisson, 
                 NegBinomial = fit_nbinom,
                 Geometric = fit_geom)

gof <- gofstat(fit_list)

# Create comparison table
comparison_table <- data.frame(
  AIC = round(gof$aic, 2),
  Chi_Squared = round(gof$chisq, 2)
) %>% arrange(AIC)

kable(comparison_table, caption = "Model Comparison for Claim Frequency")
Model Comparison for Claim Frequency
AIC Chi_Squared
1-mle-pois 3525.73 12.59
2-mle-nbinom 3527.73 12.60
3-mle-geom 4308.27 569.91

Result: The Poisson distribution provides the best fit based on the AIC criterion (3525.73, marginally below the Negative Binomial's 3527.73); the Geometric is clearly rejected.
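
The Poisson choice can be corroborated by checking equidispersion (for a Poisson, the variance equals the mean, whereas the Negative Binomial targets overdispersed counts). A minimal sketch:

# Equidispersion check (sketch): a dispersion index near 1 supports the Poisson
dispersion_index <- var(data$CLM_FREQ) / mean(data$CLM_FREQ)
cat("Dispersion index (variance / mean):", round(dispersion_index, 3), "\n")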

# Visualize fitted vs empirical distributions
obs_counts <- table(data$CLM_FREQ)
obs_probs <- obs_counts / sum(obs_counts)
x_vals <- as.integer(names(obs_counts))

# Calculate fitted probabilities
pois_probs <- dpois(x_vals, lambda = fit_poisson$estimate)
nbinom_probs <- dnbinom(x_vals, size = fit_nbinom$estimate["size"],
                        mu = fit_nbinom$estimate["mu"])
# Evaluate at the observed values: the fitted geometric has support 0, 1, 2, ...,
# so no shift is applied (consistent with the gofstat comparison above)
geom_probs <- dgeom(x_vals, prob = fit_geom$estimate)

# Create visualization
plot_data <- data.frame(
  x = rep(x_vals, 4),
  probability = c(obs_probs, pois_probs, nbinom_probs, geom_probs),
  distribution = rep(c("Empirical", "Poisson", "Neg. Binomial", "Geometric"), 
                     each = length(x_vals))
)

ggplot(plot_data, aes(x = x, y = probability, color = distribution)) +
  geom_point(size = 3) + geom_line() +
  labs(title = "Empirical vs Fitted Discrete Distributions",
       x = "Claim Frequency", y = "Probability") +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red", "darkgreen"))

Monte Carlo Estimation with Variance Reduction

set.seed(2025)

# Standard Monte Carlo estimation
observed_mean <- mean(data$CLM_FREQ)

plot_monte_carlo <- function(max_simulations = 50000, step = 1000, sample_size = 100) {
  results <- data.frame(Simulation = integer(), Estimator = numeric(), 
                        Lower_CI = numeric(), Upper_CI = numeric())
  
  for (num_sims in seq(100, max_simulations, by = step)) {
    sims <- replicate(num_sims, mean(rpois(sample_size, lambda = observed_mean)))
    
    mean_est <- mean(sims)
    sd_est <- sd(sims)
    alpha <- 0.05
    z_value <- qnorm(1 - alpha/2)
    lower_ci <- mean_est - z_value * (sd_est / sqrt(num_sims))
    upper_ci <- mean_est + z_value * (sd_est / sqrt(num_sims))
    
    results <- rbind(results, data.frame(
      Simulation = num_sims, Estimator = mean_est,
      Lower_CI = lower_ci, Upper_CI = upper_ci
    ))
  }
  
  # sims here holds the draws from the final (largest) run
  return(list(results = results, final_variance = var(sims), final_estimate = mean(sims)))
}

mc_standard <- plot_monte_carlo()

# Visualization
ggplot(mc_standard$results, aes(x = Simulation)) +
  geom_line(aes(y = Estimator), color = "blue", size = 1) +
  geom_ribbon(aes(ymin = Lower_CI, ymax = Upper_CI), alpha = 0.2, fill = "red") +
  geom_hline(yintercept = observed_mean, linetype = "dashed", 
             color = "darkgreen", size = 1) +
  labs(title = "Monte Carlo Convergence: Claim Frequency Estimation",
       subtitle = paste("True mean =", round(observed_mean, 3)),
       x = "Number of Simulations", y = "Estimated Mean") +
  theme_minimal()

set.seed(2025)
# Antithetic variates for variance reduction: pair each uniform u with 1 - u
# so that the two inverse-CDF draws are negatively correlated

sample_size <- 100
num_simulations <- 50000
antithetic_sim <- replicate(num_simulations, {
  u <- runif(sample_size)
  x1 <- qpois(u, lambda = observed_mean)
  x2 <- qpois(1 - u, lambda = observed_mean)
  mean(c(x1, x2))  # average over the antithetic pair (2 * sample_size draws)
})


# Compare variance reduction effectiveness
variance_comparison <- data.frame(
  Method = c("Standard MC", "Antithetic Variates"),
  Estimate = c(mc_standard$final_estimate, mean(anthitetic_sim)),
  Variance = c(mc_standard$final_variance, var(anthitetic_sim))
)

kable(variance_comparison, digits = 6, 
      caption = "Variance Reduction Comparison")
Variance Reduction Comparison
Method Estimate Variance
Standard MC 3.094309 0.031058
Antithetic Variates 3.093793 0.000962
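
The table implies a variance reduction of roughly a factor of 32. Part of this comes from each antithetic replicate averaging 2 * sample_size draws (which alone halves the variance); the rest comes from the negative correlation between the paired inverse-CDF draws, which a minimal sketch can estimate directly:

# Sketch: reduction factor from the table, and the pairwise correlation driving it
reduction_factor <- mc_standard$final_variance / var(antithetic_sim)
u_check <- runif(10000)
rho <- cor(qpois(u_check, lambda = observed_mean),
           qpois(1 - u_check, lambda = observed_mean))
cat("Variance reduction factor:", round(reduction_factor, 1),
    "| antithetic correlation:", round(rho, 3), "\n")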

Claim Severity Analysis

# Extract individual claim amounts (severity data)
selected_columns <- data[, paste0("CLM_AMT_", 1:8)]
severity_matrix <- as.matrix(selected_columns)
severity_vector <- as.vector(severity_matrix)
severity_vector <- severity_vector[severity_vector > 0]

# Visualize severity distribution
ggplot(data.frame(severity = severity_vector), aes(x = severity)) +
  geom_histogram(bins = 50, fill = "lightcoral", alpha = 0.7) +
  labs(title = "Distribution of Claim Severities",
       x = "Claim Amount", y = "Frequency") +
  theme_minimal()

# Fit continuous distributions to claim severity
fit_exp <- fitdist(severity_vector, "exp", method = "mle")
fit_gamma <- fitdist(severity_vector, "gamma", method = "mle")
fit_lnorm <- fitdist(severity_vector, "lnorm", method = "mle")
fit_norm <- fitdist(severity_vector, "norm", method = "mle")

# Density comparison plot
denscomp(
  list(fit_exp, fit_gamma, fit_lnorm, fit_norm),
  legendtext = c("Exponential", "Gamma", "Lognormal", "Normal"),
  main = "Density Comparison: Claim Severity Models",
  fitlwd = 2
)

# Goodness-of-fit comparison
gof_severity <- gofstat(
  list(fit_exp, fit_gamma, fit_lnorm, fit_norm),
  fitnames = c("Exponential", "Gamma", "Lognormal", "Normal")
)

severity_results <- data.frame(
  AIC = c(fit_exp$aic, fit_gamma$aic, fit_lnorm$aic, fit_norm$aic),
  KS_Statistic = gof_severity$ks
) %>% arrange(KS_Statistic)

kable(severity_results, digits = 2, 
      caption = "Model Comparison for Claim Severity")
Model Comparison for Claim Severity
AIC KS_Statistic
Normal 30889.67 0.01
Gamma 30885.07 0.02
Lognormal 30891.61 0.03
Exponential 43291.18 0.55

Result: The Normal distribution provides the best fit based on the Kolmogorov-Smirnov statistic (0.01). The Gamma fit has a marginally lower AIC (30885.07 vs 30889.67), but we retain the Normal given its smaller KS distance.

# Stratified sampling for variance reduction
set.seed(2025)
mean_norm <- fit_norm$estimate["mean"]
sd_norm <- fit_norm$estimate["sd"]

sample_size <- 100
num_simulations <- 10000

# Standard Monte Carlo
sims_standard <- replicate(num_simulations, {
  x <- rnorm(sample_size, mean = mean_norm, sd = sd_norm)
  mean(x)
})

# Stratified sampling: one uniform draw per equal-probability stratum
# ((i-1)/n, i/n], pushed through the fitted normal quantile function
sims_stratified <- replicate(num_simulations, {
  u <- (1:sample_size - runif(sample_size)) / sample_size  # u_i in ((i-1)/n, i/n]
  x <- qnorm(u, mean = mean_norm, sd = sd_norm)
  mean(x)
})

# Compare results
severity_mc_results <- data.frame(
  Method = c("Standard MC", "Stratified Sampling"),
  Estimate = c(mean(sims_standard), mean(sims_stratified)),
  Variance = c(var(sims_standard), var(sims_stratified))
)


kable(severity_mc_results, digits = 6, 
      caption = "Severity Estimation: Variance Reduction Results")
Severity Estimation: Variance Reduction Results
Method Estimate Variance
Standard MC 598.7039 22.594135
Stratified Sampling 598.7203 0.050343

Risk Premium Calculation

In the simulation, the aggregate claims amount \(S\) is defined as:

\[ S = \sum_{i=1}^{N} X_i \]

where:

  • \(N \sim \text{Poisson}(\lambda)\), representing the number of claims, where \(\lambda\) is the empirical mean claim frequency estimated above.
  • \(X_i \sim \mathcal{N}(\mu, \sigma^2)\), representing the claim sizes, assumed independent and identically distributed (i.i.d.).

To estimate the risk premium, we run a Monte Carlo simulation of the aggregate loss \(S\) and report the sample mean.

# Compound Poisson model: S = X₁ + X₂ + ... + Xₙ where N ~ Poisson(λ)
set.seed(2025)
n_simulations <- 10000
risk_premiums <- numeric(n_simulations)

for (i in 1:n_simulations) {
  n_claims <- rpois(1, lambda = observed_mean)  # Frequency
  if (n_claims > 0) {
    claim_amounts <- rnorm(n_claims, mean = mean_norm, sd = sd_norm)
    risk_premiums[i] <- sum(claim_amounts)
  } else {
    risk_premiums[i] <- 0
  }
}
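
The simulated mean can be cross-checked analytically: for a compound Poisson sum, \(E[S] = \lambda \mu\) and \(\text{Var}(S) = \lambda (\mu^2 + \sigma^2)\). A minimal sketch using the fitted parameters:

# Analytical cross-check (sketch): compound Poisson moments
analytic_mean <- observed_mean * mean_norm                      # λ·μ ≈ 3.094 * 598.7
analytic_sd <- sqrt(observed_mean * (mean_norm^2 + sd_norm^2))  # sqrt(λ·(μ² + σ²))
cat("Analytical E[S]:", round(analytic_mean, 2),
    "vs simulated mean:", round(mean(risk_premiums), 2), "\n")
cat("Analytical SD(S):", round(analytic_sd, 2), "\n")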

# Visualize risk premium distribution
ggplot(data.frame(premium = risk_premiums), aes(x = premium)) +
  geom_histogram(bins = 50, fill = "lightgreen", alpha = 0.7) +
  labs(title = "Simulated Risk Premium Distribution",
       x = "Total Claim Amount", y = "Frequency") +
  theme_minimal()

# Compare simulated vs empirical
risk_comparison <- data.frame(
  Metric = c("Simulated Risk Premium", "Empirical Risk Premium", "Average Premium Paid"),
  Value = c(mean(risk_premiums), mean(data$TOTAL_CLM_AMT), mean(data$PREMIUM))
)

kable(risk_comparison, digits = 2, 
      caption = "Risk Premium Comparison")
Risk Premium Comparison
Metric Value
Simulated Risk Premium 1857.53
Empirical Risk Premium 1852.50
Average Premium Paid 1688.25

Insight: The close alignment between simulated and empirical risk premiums validates our fitted distributions. However, average premiums paid are below the expected risk premium, suggesting potential underpricing.

Risk Management Metrics

Value-at-Risk (VaR)

# Calculate 99.5th percentile VaR
confidence_level <- 0.995
var_99_5 <- quantile(risk_premiums, confidence_level)

cat("99.5% Value-at-Risk:", round(var_99_5, 2), "\n")
## 99.5% Value-at-Risk: 5311.71
cat("This represents the economic capital needed to cover claims 99.5% of the time.\n")
## This represents the economic capital needed to cover claims 99.5% of the time.
cat("Probability of ruin:", (1 - confidence_level) * 100, "%\n")
## Probability of ruin: 0.5 %
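
VaR reports only a quantile of the loss distribution; a complementary tail statistic often reported alongside it is the expected shortfall, i.e. the average loss given that losses exceed the VaR. A minimal sketch from the same simulations:

# Expected shortfall (sketch): mean simulated loss beyond the 99.5% VaR
expected_shortfall <- mean(risk_premiums[risk_premiums > var_99_5])
cat("99.5% Expected Shortfall:", round(expected_shortfall, 2), "\n")
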
# Visualize VaR
ggplot(data.frame(premium = risk_premiums), aes(x = premium)) +
  geom_histogram(bins = 50, fill = "lightblue", alpha = 0.7) +
  geom_vline(xintercept = var_99_5, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Risk Premium Distribution with 99.5% VaR",
       x = "Total Claim Amount", y = "Frequency",
       subtitle = paste("VaR₉₉.₅% =", round(var_99_5, 2))) +
  theme_minimal()

Combined Ratio Analysis

# Ratio of average total claims to average premium (admin costs enter below)
combined_ratio <- mean(data$TOTAL_CLM_AMT) / mean(data$PREMIUM)

cat("Combined Ratio:", round(combined_ratio, 3), "\n")
## Combined Ratio: 1.097
if (combined_ratio > 1) {
  cat("Result: Reinsurance is recommended (Combined Ratio > 1)\n")
  cat("Claims exceed premiums by", round((combined_ratio - 1) * 100, 1), "%\n")
} else {
  cat("Result: No reinsurance needed (Combined Ratio ≤ 1)\n")
}
## Result: Reinsurance is recommended (Combined Ratio > 1)
## Claims exceed premiums by 9.7 %
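
Note that the combined ratio maps directly to a pricing adjustment: scaling premiums by the ratio itself restores break-even. A minimal sketch:

# Implied pricing adjustment (sketch): uplift needed to bring the ratio to 1
required_uplift <- (combined_ratio - 1) * 100
cat("Required average premium increase:", round(required_uplift, 1), "%\n")  # ≈ 9.7%
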
# Reinsurance threshold analysis
admin_cost_rate <- 0.10  # 10% administrative costs
admin_costs <- admin_cost_rate * mean(data$PREMIUM)

sorted_claims <- sort(data$TOTAL_CLM_AMT)
combined_ratios <- (sorted_claims + admin_costs) / mean(data$PREMIUM)

# Smallest claim level at which claims plus admin costs first exceed the average premium
reinsurance_threshold <- sorted_claims[which(combined_ratios > 1)[1]]

cat("\nRecommended reinsurance threshold (including 10% admin costs):", 
    round(reinsurance_threshold, 2), "\n")
## 
## Recommended reinsurance threshold (including 10% admin costs): 1578

Conclusions

Key Findings

  1. Frequency Model: Poisson distribution (λ = 3.094) best fits claim frequencies
  2. Severity Model: Normal distribution (μ = 598.73, σ = 47.33) best fits claim amounts
  3. Risk Premium: Simulated premium (1857.53) closely matches empirical data
  4. VaR₉₉.₅%: Economic capital requirement of 5311.71 per policy
  5. Combined Ratio: 1.097 indicates need for reinsurance

Recommendations

  • Pricing Adjustment: Increase premiums to achieve combined ratio ≤ 1.0
  • Reinsurance: Implement coverage for claims exceeding 1578
  • Risk Management: Maintain capital reserves of 5311.71 per policy for 99.5% confidence

Technical Notes

  • Monte Carlo simulations used variance reduction techniques (antithetic variates, stratified sampling)
  • All models validated using AIC and Kolmogorov-Smirnov tests
  • Analysis assumes independence between claim frequency and severity