Executive Summary
This analysis applies stochastic modeling techniques to insurance claims data, focusing on:

- Frequency Modeling: Fitting discrete distributions (Poisson, Negative Binomial, Geometric) to claim frequencies
- Severity Modeling: Fitting continuous distributions to claim amounts
- Monte Carlo Simulation: Estimating risk premiums with variance reduction techniques
- Risk Assessment: Calculating Value-at-Risk and Combined Ratios for reinsurance decisions
Data Loading and Preprocessing
# Load and examine the dataset
data <- read.csv("DATA_SET_4.csv", sep = ";")
cat("Original dataset dimensions:", dim(data), "\n")
## Original dataset dimensions: 1002 17
head(data) %>% kable(caption = "Sample of Raw Data")
Sample of Raw Data

| ID | CLM_FREQ | CLM_AMT_1 | CLM_AMT_2 | CLM_AMT_3 | CLM_AMT_4 | CLM_AMT_5 | CLM_AMT_6 | CLM_AMT_7 | CLM_AMT_8 | Use | Vehicle | Age | Gender | Area | PREMIUM | X |
|----|----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|------------|---------|-----|--------|-------|---------|----|
| 1 | 6 | 580 | 591 | 640 | 569 | 652 | 617 | - | - | Private | Sedan | 60 | M | Urban | 1 727 | NA |
| 2 | 4 | 578 | 585 | 610 | 667 | - | - | - | - | Commercial | Sedan | 43 | M | Urban | 1 803 | NA |
| 3 | 3 | 656 | 527 | 671 | - | - | - | - | - | Private | Van | 48 | M | Urban | 1 527 | NA |
| 4 | 3 | 640 | 668 | 597 | - | - | - | - | - | Private | SUV | 35 | F | Urban | 1 451 | NA |
| 5 | 3 | 624 | 490 | 601 | - | - | - | - | - | Private | Sedan | 51 | M | Urban | 1 895 | NA |
| 6 | 1 | 610 | - | - | - | - | - | - | - | Private | SUV | 50 | F | Urban | 1 812 | NA |
# Data cleaning pipeline
data$X <- NULL # Remove unnecessary column
# Remove policies with zero claims (no claims experience)
data <- data[data$CLM_FREQ != 0, ]
# Clean premium column
data$PREMIUM <- as.numeric(gsub(" ", "", data$PREMIUM))
# Clean claim amount columns
for (i in 1:8) {
  col_name <- paste0("CLM_AMT_", i)
  data[[col_name]] <- gsub(" ", "", data[[col_name]])
  data[[col_name]][data[[col_name]] %in% c("-", "")] <- "0"
  data[[col_name]] <- as.integer(data[[col_name]])
  data <- data[data[[col_name]] >= 0, ]
}
# Calculate total claim amounts
data$TOTAL_CLM_AMT <- rowSums(data[, paste0("CLM_AMT_", 1:8)], na.rm = TRUE)
cat("Cleaned dataset dimensions:", dim(data), "\n")
## Cleaned dataset dimensions: 946 17
summary(data[, c("CLM_FREQ", "PREMIUM", "TOTAL_CLM_AMT")]) %>% kable(caption = "Summary Statistics")
Summary Statistics

| CLM_FREQ | PREMIUM | TOTAL_CLM_AMT |
|---------------|--------------|---------------|
| Min. :1.000 | Min. :1400 | Min. : 469 |
| 1st Qu.:2.000 | 1st Qu.:1538 | 1st Qu.:1169 |
| Median :3.000 | Median :1681 | Median :1760 |
| Mean :3.094 | Mean :1688 | Mean :1853 |
| 3rd Qu.:4.000 | 3rd Qu.:1841 | 3rd Qu.:2428 |
| Max. :8.000 | Max. :1999 | Max. :5059 |
Claim Severity Analysis
# Extract individual claim amounts (severity data)
selected_columns <- data[, paste0("CLM_AMT_", 1:8)]
severity_matrix <- as.matrix(selected_columns)
severity_vector <- as.vector(severity_matrix)
severity_vector <- severity_vector[severity_vector > 0]
# Visualize severity distribution
ggplot(data.frame(severity = severity_vector), aes(x = severity)) +
  geom_histogram(bins = 50, fill = "lightcoral", alpha = 0.7) +
  labs(title = "Distribution of Claim Severities",
       x = "Claim Amount", y = "Frequency") +
  theme_minimal()

# Fit continuous distributions to claim severity
fit_exp <- fitdist(severity_vector, "exp", method = "mle")
fit_gamma <- fitdist(severity_vector, "gamma", method = "mle")
fit_lnorm <- fitdist(severity_vector, "lnorm", method = "mle")
fit_norm <- fitdist(severity_vector, "norm", method = "mle")
# Density comparison plot
denscomp(
  list(fit_exp, fit_gamma, fit_lnorm, fit_norm),
  legendtext = c("Exponential", "Gamma", "Lognormal", "Normal"),
  main = "Density Comparison: Claim Severity Models",
  fitlwd = 2
)

# Goodness-of-fit comparison
gof_severity <- gofstat(
  list(fit_exp, fit_gamma, fit_lnorm, fit_norm),
  fitnames = c("Exponential", "Gamma", "Lognormal", "Normal")
)
severity_results <- data.frame(
  Model = c("Exponential", "Gamma", "Lognormal", "Normal"),
  AIC = c(fit_exp$aic, fit_gamma$aic, fit_lnorm$aic, fit_norm$aic),
  KS_Statistic = gof_severity$ks
) %>% arrange(KS_Statistic)  # keep an explicit Model column; arrange() drops row names
kable(severity_results, digits = 2,
      caption = "Model Comparison for Claim Severity")
Model Comparison for Claim Severity

| Model | AIC | KS_Statistic |
|-------------|----------|--------------|
| Normal | 30889.67 | 0.01 |
| Gamma | 30885.07 | 0.02 |
| Lognormal | 30891.61 | 0.03 |
| Exponential | 43291.18 | 0.55 |
Result: The Normal distribution provides the best fit based on the Kolmogorov-Smirnov statistic, although the Gamma fit has a marginally lower AIC.
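The simulations that follow reuse the fitted Normal parameters as mean_norm and sd_norm; they can be inspected directly from the fitdistrplus fit object:

# Parameters of the best-fitting Normal model (reused below as mean_norm and sd_norm)
fit_norm$estimate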
# Stratified sampling for variance reduction
set.seed(2025)
mean_norm <- fit_norm$estimate["mean"]
sd_norm <- fit_norm$estimate["sd"]
sample_size <- 100
num_simulations <- 10000
# Standard Monte Carlo
sims_standard <- replicate(num_simulations, {
  x <- rnorm(sample_size, mean = mean_norm, sd = sd_norm)
  mean(x)
})
# Stratified sampling
sims_stratified <- replicate(num_simulations, {
  u <- (1:sample_size - runif(sample_size)) / sample_size
  x <- qnorm(u, mean = mean_norm, sd = sd_norm)
  mean(x)
})
# Compare results
severity_mc_results <- data.frame(
  Method = c("Standard MC", "Stratified Sampling"),
  Estimate = c(mean(sims_standard), mean(sims_stratified)),
  Variance = c(var(sims_standard), var(sims_stratified))
)
kable(severity_mc_results, digits = 6,
      caption = "Severity Estimation: Variance Reduction Results")
Severity Estimation: Variance Reduction Results

| Method | Estimate | Variance |
|---------------------|----------|-----------|
| Standard MC | 598.7039 | 22.594135 |
| Stratified Sampling | 598.7203 | 0.050343 |
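As a short illustrative follow-up, the implied variance-reduction factor can be computed directly from the two simulation vectors:

# Illustrative follow-up: ratio of estimator variances (standard MC vs. stratified)
variance_reduction_factor <- var(sims_standard) / var(sims_stratified)
cat("Variance reduction factor:", round(variance_reduction_factor, 1), "\n")

With the figures in the table this is roughly 450, i.e. stratification reduces the Monte Carlo variance of the mean-severity estimate by more than two orders of magnitude.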
Risk Premium Calculation
In the simulation, the aggregate claims amount \(S\) is defined as:
\[
S = \sum_{i=1}^{N} X_i
\]
where:

- \(N \sim \text{Poisson}(\lambda)\) is the number of claims, with \(\lambda\) set to the empirical mean of the claim-frequency distribution estimated earlier;
- \(X_i \sim \mathcal{N}(\mu, \sigma^2)\) are the claim sizes, assumed independent and identically distributed (i.i.d.).
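Under these assumptions, the mean and variance of \(S\) follow from the standard compound Poisson identities:
\[
\mathbb{E}[S] = \lambda \mu, \qquad \operatorname{Var}(S) = \lambda\left(\sigma^2 + \mu^2\right).
\]
With the values fitted above (\(\lambda \approx 3.09\), \(\mu \approx 599\)), the expected aggregate loss is roughly 1850, consistent with the simulation results reported below.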
To estimate the risk premium, we implemented a Monte Carlo simulation of the aggregate loss \(S\) and computed its average.
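The simulation uses observed_mean, the empirical mean claim frequency from the frequency-modeling step, which is not reproduced in this section. A minimal sketch of how it could be obtained, assuming the cleaned data frame above and the fitdistrplus package:

# Sketch (assumption): lambda taken as the empirical mean claim frequency,
# which coincides with the Poisson maximum-likelihood estimate
fit_pois <- fitdist(data$CLM_FREQ, "pois", method = "mle")
observed_mean <- mean(data$CLM_FREQ)  # equals fit_pois$estimate["lambda"]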
# Compound Poisson model: S = X₁ + X₂ + ... + Xₙ where N ~ Poisson(λ)
set.seed(2025)
n_simulations <- 10000
risk_premiums <- numeric(n_simulations)
for (i in 1:n_simulations) {
  n_claims <- rpois(1, lambda = observed_mean) # Frequency
  if (n_claims > 0) {
    claim_amounts <- rnorm(n_claims, mean = mean_norm, sd = sd_norm)
    risk_premiums[i] <- sum(claim_amounts)
  } else {
    risk_premiums[i] <- 0
  }
}
# Visualize risk premium distribution
ggplot(data.frame(premium = risk_premiums), aes(x = premium)) +
  geom_histogram(bins = 50, fill = "lightgreen", alpha = 0.7) +
  labs(title = "Simulated Risk Premium Distribution",
       x = "Total Claim Amount", y = "Frequency") +
  theme_minimal()

# Compare simulated vs empirical
risk_comparison <- data.frame(
  Metric = c("Simulated Risk Premium", "Empirical Risk Premium", "Average Premium Paid"),
  Value = c(mean(risk_premiums), mean(data$TOTAL_CLM_AMT), mean(data$PREMIUM))
)
kable(risk_comparison, digits = 2,
      caption = "Risk Premium Comparison")
Risk Premium Comparison

| Metric | Value |
|------------------------|---------|
| Simulated Risk Premium | 1857.53 |
| Empirical Risk Premium | 1852.50 |
| Average Premium Paid | 1688.25 |
Insight: The close alignment between simulated and
empirical risk premiums validates our fitted distributions. However,
average premiums paid are below the expected risk premium, suggesting
potential underpricing.
Risk Management Metrics
Value-at-Risk (VaR)
# Calculate 99.5th percentile VaR
confidence_level <- 0.995
var_99_5 <- quantile(risk_premiums, confidence_level)
cat("99.5% Value-at-Risk:", round(var_99_5, 2), "\n")
## 99.5% Value-at-Risk: 5311.71
cat("This represents the economic capital needed to cover claims 99.5% of the time.\n")
## This represents the economic capital needed to cover claims 99.5% of the time.
cat("Probability of ruin:", (1 - confidence_level) * 100, "%\n")
## Probability of ruin: 0.5 %
# Visualize VaR
ggplot(data.frame(premium = risk_premiums), aes(x = premium)) +
  geom_histogram(bins = 50, fill = "lightblue", alpha = 0.7) +
  geom_vline(xintercept = var_99_5, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Risk Premium Distribution with 99.5% VaR",
       x = "Total Claim Amount", y = "Frequency",
       subtitle = paste("VaR₉₉.₅% =", round(var_99_5, 2))) +
  theme_minimal()

Combined Ratio Analysis
combined_ratio <- mean(data$TOTAL_CLM_AMT) / mean(data$PREMIUM)
cat("Combined Ratio:", round(combined_ratio, 3), "\n")
## Combined Ratio: 1.097
if (combined_ratio > 1) {
  cat("Result: Reinsurance is recommended (Combined Ratio > 1)\n")
  cat("Claims exceed premiums by", round((combined_ratio - 1) * 100, 1), "%\n")
} else {
  cat("Result: No reinsurance needed (Combined Ratio ≤ 1)\n")
}
## Result: Reinsurance is recommended (Combined Ratio > 1)
## Claims exceed premiums by 9.7 %
# Reinsurance threshold analysis
admin_cost_rate <- 0.10 # 10% administrative costs
admin_costs <- admin_cost_rate * mean(data$PREMIUM)
sorted_claims <- sort(data$TOTAL_CLM_AMT)
combined_ratios <- (sorted_claims + admin_costs) / mean(data$PREMIUM)
reinsurance_threshold <- sorted_claims[which(combined_ratios > 1)[1]]
cat("\nRecommended reinsurance threshold (including 10% admin costs):",
round(reinsurance_threshold, 2), "\n")
##
## Recommended reinsurance threshold (including 10% admin costs): 1578
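Equivalently, with \(\bar{P}\) denoting the average premium, the threshold is the smallest observed total claim amount \(s\) for which the loaded combined ratio exceeds one:
\[
\frac{s + 0.10\,\bar{P}}{\bar{P}} > 1 \quad\Longleftrightarrow\quad s > 0.90\,\bar{P} \approx 0.90 \times 1688.25 \approx 1519,
\]
and the first sorted claim above this value is 1578.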