Session 3: Collinearity & Regularization

Advanced Spatial Ecology & Species Distribution Modeling

Author

Dr Tahir Ali | AG de Meaux | TRR341 Ecological Genomics | University of Cologne

Session Overview

Learning Outcomes:

  • Quantify collinearity using correlation matrices and VIF

  • Make principled ecological choices for variable removal

  • Use ridge, LASSO, and elastic net for stable prediction and variable selection

Code
# ==============================================================================
# STEP 1: LOAD REQUIRED PACKAGES
# ==============================================================================
# Think of packages as toolboxes - each gives us specific tools we'll need
# NOTE(review): attach order matters for function masking (terra and dplyr
# both export extract/select, etc.); later code uses explicit prefixes such
# as terra::extract and dplyr::select where a clash would bite.

library(ggplot2)      # For creating professional-looking plots
library(dplyr)        # For data manipulation (selecting columns, filtering rows)
library(tidyr)        # For reshaping data (long vs wide formats)
library(car)          # For VIF (Variance Inflation Factor) calculation
library(glmnet)       # For ridge, LASSO, and elastic net regularization
library(terra)        # For working with raster data (like climate TIFFs)
library(sf)           # For working with spatial vector data (shapefiles)
library(viridis)      # For color-blind friendly color schemes
library(patchwork)    # For combining multiple plots into one
library(GGally)       # For enhanced correlation plots

# Set seed for reproducibility - ensures we get the same random results each time
set.seed(123)

PART 1: LOAD AND PREPARE DATA FROM WORKSHOP DIRECTORIES

Code
# ==============================================================================
# STEP 2: DEFINE FILE PATHS
# ==============================================================================
# We'll define all paths at the top so they're easy to modify.
# FIX: base_path no longer carries a trailing "/" - file.path() inserts its
# own separator, so the old value produced doubled slashes such as
# ".../spatial-ecology-workshop//output/Session_3".
# FIX: the output directory is created up front so later ggsave() calls
# cannot fail on a missing folder.

# Base path to your workshop folder (no trailing slash; file.path() adds it)
base_path <- "/run/media/tahirali/Expansion/github/spatial-ecology-workshop"

# Input data paths
spider_coords_path <- file.path(base_path, "data/presence/spider_coords.csv")
climate_folder <- file.path(base_path, "data/climate")
roads_path <- file.path(base_path, "data/infrastructure/roads/hotosm_pak_roads_lines_shp/hotosm_pak_roads_lines_shp.shp")
rivers_path <- file.path(base_path, "data/infrastructure/waterways/hotosm_pak_waterways_lines_shp/hotosm_pak_waterways_lines_shp.shp")
boundary_path <- file.path(base_path, "data/boundaries/pakistan_admin.shp")

# Output path (created if absent; showWarnings = FALSE keeps this idempotent)
output_path <- file.path(base_path, "output/Session_3")
dir.create(output_path, recursive = TRUE, showWarnings = FALSE)

cat("✓ File paths defined\n")
cat("  Input data: ", base_path, "\n")
cat("  Output directory: ", output_path, "\n")
✓ File paths defined
  Input data:  /run/media/tahirali/Expansion/github/spatial-ecology-workshop/ 
  Output directory:  /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3 
Code
# ==============================================================================
# STEP 3: LOAD AND CLEAN SPIDER PRESENCE DATA
# ==============================================================================
# Reads the semicolon-separated presence file, normalizes decimal commas to
# dots in the coordinate columns, and drops duplicate location records.

# read.csv2() expects ";" as the field separator, matching this file
spiders <- read.csv2(spider_coords_path, header = TRUE)

cat("✓ Raw spider data loaded\n")
cat("  Number of records:", nrow(spiders), "\n")

# Some locales write decimals with commas (74,249 instead of 74.249).
# R needs dots, so both coordinate columns go through the same converter.
comma_to_numeric <- function(v) as.numeric(gsub(",", ".", v))
for (coord_col in c("X", "Y")) {
  spiders[[coord_col]] <- comma_to_numeric(spiders[[coord_col]])
}

# Collapse repeated observations of the same location to a single row
spiders <- distinct(spiders)

cat("  Unique locations after removing duplicates:", nrow(spiders), "\n")
✓ Raw spider data loaded
  Number of records: 129 
  Unique locations after removing duplicates: 93 
Code
# ==============================================================================
# STEP 4: LOAD CLIMATE RASTER FILES
# ==============================================================================
# WorldClim bioclimatic variables - 19 climate layers + elevation.
# FIX: the file pattern now escapes the dot ("\\.tif$"). The old ".tif$"
# treats "." as "any character" in a regex, so it would also match names
# like "mytif" or "x.gtif" and silently pull in unintended files.

# List all .tif files in the climate folder.
# list.files() returns names in alphabetical order, which is exactly the
# ordering the name vector below assumes (bio1, bio10, ..., bio19, bio2, ...).
bio_files <- list.files(climate_folder, pattern = "\\.tif$", full.names = TRUE)

cat("✓ Found", length(bio_files), "climate raster files\n")

# Load all raster files as a single "raster stack"
# This combines all layers into one object for easy extraction
climate_stack <- rast(bio_files)

# Give meaningful names to each layer (BIO1 = Annual Mean Temperature, etc.)
# CAUTION: ordered to match the ALPHABETICAL file order, not bio1..bio19.
meaningful_names <- c(
  "bio1_annual_mean_temp",      # BIO1: Annual Mean Temperature
  "bio10_mean_temp_warmest_quarter", # BIO10: Mean Temp of Warmest Quarter
  "bio11_mean_temp_coldest_quarter", # BIO11: Mean Temp of Coldest Quarter
  "bio12_annual_precip",        # BIO12: Annual Precipitation
  "bio13_precip_wettest_month", # BIO13: Precipitation of Wettest Month
  "bio14_precip_driest_month",  # BIO14: Precipitation of Driest Month
  "bio15_precip_seasonality",   # BIO15: Precipitation Seasonality
  "bio16_precip_wettest_quarter", # BIO16: Precipitation of Wettest Quarter
  "bio17_precip_driest_quarter", # BIO17: Precipitation of Driest Quarter
  "bio18_precip_warmest_quarter", # BIO18: Precipitation of Warmest Quarter
  "bio19_precip_coldest_quarter", # BIO19: Precipitation of Coldest Quarter
  "bio2_mean_diurnal_range",    # BIO2: Mean Diurnal Range
  "bio3_isothermality",         # BIO3: Isothermality
  "bio4_temp_seasonality",      # BIO4: Temperature Seasonality
  "bio5_max_temp_warmest_month", # BIO5: Max Temp of Warmest Month
  "bio6_min_temp_coldest_month", # BIO6: Min Temp of Coldest Month
  "bio7_temp_annual_range",     # BIO7: Temperature Annual Range
  "bio8_mean_temp_wettest_quarter", # BIO8: Mean Temp of Wettest Quarter
  "bio9_mean_temp_driest_quarter", # BIO9: Mean Temp of Driest Quarter
  "elevation"                   # Elevation (not a bioclim variable)
)

# FIX: fail loudly if the folder contents ever change, instead of silently
# mislabeling every layer after a mismatch.
if (length(meaningful_names) != nlyr(climate_stack)) {
  stop("Expected ", length(meaningful_names), " climate layers but found ",
       nlyr(climate_stack), " in ", climate_folder, call. = FALSE)
}

# Assign meaningful names
names(climate_stack) <- meaningful_names
cat("✓ Climate layers renamed with meaningful names\n")
✓ Found 20 climate raster files
✓ Climate layers renamed with meaningful names
Code
# ==============================================================================
# STEP 5: EXTRACT CLIMATE DATA FOR SPIDER POINTS
# ==============================================================================
# Links each spider record with the climate values of the raster cell it
# falls in.

# The terra:: prefix avoids masking by other attached packages (e.g. tidyr)
presence_xy <- spiders[, c("X", "Y")]
spider_climate <- terra::extract(climate_stack, presence_xy)

# Drop the ID column that terra::extract() prepends automatically
spider_climate <- spider_climate[, -1]

# One row per spider record: coordinates first, climate columns after
spider_data <- cbind(spiders, spider_climate)

cat("✓ Climate data extracted for", nrow(spider_data), "spider points\n")
cat("  Total variables:", ncol(spider_data), "\n")
✓ Climate data extracted for 93 spider points
  Total variables: 22 
Code
# ==============================================================================
# STEP 6: VERIFY WORLDCLIM UNITS (SPIDER DATA)
# ==============================================================================
# Check if temperature units are correct using the extracted spider data.
# Rationale: some WorldClim distributions store temperature as °C × 10 (and
# BIO4 as sd × 100) to save space. This step detects that encoding from the
# values themselves and rescales spider_data in place only if needed.

# Calculate mean temperature at spider locations
mean_spider_temp <- mean(spider_data$bio1_annual_mean_temp, na.rm = TRUE)

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("STEP 6: VERIFYING CLIMATE DATA UNITS\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

cat("1. CHECKING SPIDER LOCATION TEMPERATURES:\n")
cat("   Spider site temperatures:", 
    paste(round(head(spider_data$bio1_annual_mean_temp, 5), 1), collapse=", "), "...\n")
cat("   Mean temperature at spider sites:", round(mean_spider_temp, 1), "°C\n")

# Expected temperature for Punjab, Pakistan (20-30°C).
# Three outcomes: plausible °C, plausible °C × 10, or unrecognizable (NA).
if(mean_spider_temp > 20 && mean_spider_temp < 30) {
  cat("   ✓ Mean temperature (", round(mean_spider_temp, 1), 
      "°C) is within expected range for Punjab (20-30°C)\n")
  cat("   → Units appear to be CORRECT (already in °C)\n\n")
  units_correct <- TRUE
} else if(mean_spider_temp > 200 && mean_spider_temp < 300) {
  cat("   ⚠️ Mean temperature (", round(mean_spider_temp, 1), 
      ") is too high - values appear to be °C × 10\n")
  cat("   → Units need DIVISION BY 10\n\n")
  units_correct <- FALSE
} else {
  cat("   ⚠️ Mean temperature (", round(mean_spider_temp, 1), 
      "°C) is outside expected range - check data source\n\n")
  units_correct <- NA
}

# Apply correction if needed.
# NA means "could not decide", so no correction is attempted in that case.
if(exists("units_correct") && !is.na(units_correct) && !units_correct) {
  cat("\n", paste(rep("!", 50), collapse = ""), "\n")
  cat("!!! APPLYING UNIT CORRECTION (dividing by 10) !!!\n")
  cat(paste(rep("!", 50), collapse = ""), "\n\n")
  
  # Temperature variables that need division by 10
  # (all the °C-valued bioclim layers; seasonality handled separately below)
  temp_vars <- c("bio1_annual_mean_temp", 
                 "bio5_max_temp_warmest_month",
                 "bio6_min_temp_coldest_month", 
                 "bio7_temp_annual_range",
                 "bio8_mean_temp_wettest_quarter", 
                 "bio9_mean_temp_driest_quarter",
                 "bio10_mean_temp_warmest_quarter", 
                 "bio11_mean_temp_coldest_quarter")
  
  # Apply correction to spider_data
  # (the %in% guard makes this safe if a column was renamed or dropped)
  for(var in temp_vars) {
    if(var %in% names(spider_data)) {
      spider_data[[var]] <- spider_data[[var]] / 10
    }
  }
  
  # Temperature seasonality (BIO4) is standard deviation × 100
  if("bio4_temp_seasonality" %in% names(spider_data)) {
    spider_data[["bio4_temp_seasonality"]] <- 
      spider_data[["bio4_temp_seasonality"]] / 100
  }
  
  cat("✓ Unit correction applied to spider_data\n")
  
} else {
  cat("\n", paste(rep("✓", 50), collapse = ""), "\n")
  cat("✓ NO UNIT CORRECTION NEEDED - using extracted values as is\n")
  cat(paste(rep("✓", 50), collapse = ""), "\n\n")
}

# Final verification: report the (possibly corrected) temperature summary
cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("FINAL TEMPERATURE SUMMARY:\n")
cat(paste(rep("=", 70), collapse = ""), "\n")
cat("Spider locations temperature range: ", 
    round(min(spider_data$bio1_annual_mean_temp, na.rm = TRUE), 1), "°C to ",
    round(max(spider_data$bio1_annual_mean_temp, na.rm = TRUE), 1), "°C\n")
cat("Mean temperature at spider sites:", 
    round(mean(spider_data$bio1_annual_mean_temp, na.rm = TRUE), 1), "°C\n")

 ====================================================================== 
STEP 6: VERIFYING CLIMATE DATA UNITS
====================================================================== 

1. CHECKING SPIDER LOCATION TEMPERATURES:
   Spider site temperatures: 24.1, 24.1, 24, 24, 24 ...
   Mean temperature at spider sites: 23.7 °C
   ✓ Mean temperature ( 23.7 °C) is within expected range for Punjab (20-30°C)
   → Units appear to be CORRECT (already in °C)


 ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 
✓ NO UNIT CORRECTION NEEDED - using extracted values as is
✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 


 ====================================================================== 
FINAL TEMPERATURE SUMMARY:
====================================================================== 
Spider locations temperature range:  21.9 °C to  25.2 °C
Mean temperature at spider sites: 23.7 °C
Code
# ==============================================================================
# STEP 7: LOAD SPATIAL LAYERS (BOUNDARY REQUIRED, ROADS/RIVERS OPTIONAL)
# ==============================================================================
# This step loads the Pakistan boundary (always needed for mapping).
# Roads and rivers are OPTIONAL - uncomment if you want to:
#   1. Plot infrastructure on maps
#   2. Test for sampling bias near roads/waterways
#   3. Calculate distances to features
#
# To activate roads/rivers: Remove the '#' from those lines below
#
# NOTE(review): the exists() checks below also fire when a roads/rivers
# object is left over in the workspace from an earlier run, so the
# confirmation messages can appear even while the st_read() lines are
# commented out (as seen in this document's rendered output).

# Load Pakistan boundary (always needed for mapping context)
pak_boundary <- st_read(boundary_path, quiet = TRUE)

# OPTIONAL: Load roads (Pakistan road network)
# roads <- st_read(roads_path, quiet = TRUE)

# OPTIONAL: Load rivers (Pakistan waterways)
# rivers <- st_read(rivers_path, quiet = TRUE)

# Show confirmation
cat("✓ Pakistan boundary loaded\n")
if(exists("roads")) cat("✓ Roads loaded (optional)\n")
if(exists("rivers")) cat("✓ Rivers loaded (optional)\n")
✓ Pakistan boundary loaded
✓ Rivers loaded (optional)
Code
# ==============================================================================
# STEP 8: CREATE SPATIAL OBJECTS FOR MAPPING
# ==============================================================================
# Builds sf objects from the spider table and projects everything into one
# metric CRS so distances can later be computed in meters.

# EPSG:32643 = WGS 84 / UTM zone 43N, the correct zone for Pakistan
utm_epsg <- 32643

# Spider records as an sf point layer in geographic coordinates (WGS84)
sp_points <- st_as_sf(spider_data, coords = c("X", "Y"), crs = 4326)

# Project points and boundary into the shared metric CRS
sp_utm <- st_transform(sp_points, utm_epsg)
pak_utm <- st_transform(pak_boundary, utm_epsg)
# roads_utm <- st_transform(roads, 32643) # Uncomment this line if required
# rivers_utm <- st_transform(rivers, 32643) # Uncomment this line if required

# Copy the projected easting/northing back onto the plain data frame
coords <- st_coordinates(sp_utm)
spider_data$x <- coords[, "X"]  # Easting (x-coordinate in meters)
spider_data$y <- coords[, "Y"]  # Northing (y-coordinate in meters)

cat("✓ All layers transformed to UTM zone 43N\n")
cat("  UTM coordinates added to spider data\n")
✓ All layers transformed to UTM zone 43N
  UTM coordinates added to spider data

PART 2: SELECT VARIABLES FOR ANALYSIS

Code
# ==============================================================================
# STEP 9: SELECT KEY VARIABLES FOR MODELING
# ==============================================================================
# Keep a compact set of predictors spanning distinct ecological processes:
# thermal regime, precipitation regime, and topography.

dat <- spider_data %>%
  dplyr::select(
    # Coordinates (geographic + projected)
    X, Y, x, y,
    # Temperature variables (now in °C)
    bio1_annual_mean_temp,
    bio5_max_temp_warmest_month,
    bio6_min_temp_coldest_month,
    bio4_temp_seasonality,
    # Precipitation variables (mm)
    bio12_annual_precip,
    bio15_precip_seasonality,
    # Elevation (meters)
    elevation
  ) %>%
  dplyr::rename(
    temp = bio1_annual_mean_temp,
    tmax = bio5_max_temp_warmest_month,
    tmin = bio6_min_temp_coldest_month,
    temp_seas = bio4_temp_seasonality,
    precip = bio12_annual_precip,
    precip_seas = bio15_precip_seasonality,
    elev = elevation
  )

cat("✓ Selected variables for analysis:\n")
print(names(dat))
✓ Selected variables for analysis:
 [1] "X"           "Y"           "x"           "y"           "temp"       
 [6] "tmax"        "tmin"        "temp_seas"   "precip"      "precip_seas"
[11] "elev"       

Generate Maps Of Selected Climate Variables For Pakistan

Code
# ==============================================================================
# STEP 10: MAP CLIMATE VARIABLES FOR PAKISTAN
# ==============================================================================
# Create visual maps showing the spatial distribution of key climate variables
# across Pakistan with spider sampling locations overlaid
#
# MAPS CREATED:
#   1. Annual Mean Temperature (°C) - Plasma color scale
#   2. Annual Precipitation (mm) - Viridis color scale
#   3. Combined side-by-side comparison
#
# WHAT TO LOOK FOR:
#   • Temperature: North-south gradient (cold mountains → warm plains)
#   • Precipitation: East-west gradient (wet monsoon areas → dry west)
#   • Spider locations: Which climate zones are sampled vs. unsampled?
#
# OUTPUT FILES (saved to output directory):
#   • pakistan_temperature.png
#   • pakistan_precipitation.png  
#   • pakistan_climate_maps.png (combined)
#
# INTERPRETATION:
#   After the maps display, summary statistics and ecological interpretation
#   will help identify potential sampling bias and model limitations
# ==============================================================================

# Load required libraries
# NOTE(review): all of these were already attached in STEP 1; re-calling
# library() is a harmless no-op, kept so this mapping section can also be
# run standalone.
library(terra)
library(sf)
library(ggplot2)
library(viridis)
library(patchwork)
library(dplyr)

# ======================================
# PART A: VERIFY CURRENT TEMPERATURE VALUES
# ======================================
# FIX: the "values look correct" message used to print unconditionally, even
# when the raster-wide mean was far outside the 5-30°C band (this document's
# own output showed a mean of -4.3°C because the stack covers much more than
# Pakistan). The check is now conditional on the actual mean.

cat("\n=== CURRENT TEMPERATURE VALUES (POST-CORRECTION) ===\n")

# Check temperature values from our climate stack
temp_raster <- climate_stack[["bio1_annual_mean_temp"]]
precip_raster <- climate_stack[["bio12_annual_precip"]]

# Read the cell values once instead of re-extracting them for every summary
temp_vals <- values(temp_raster)
precip_vals <- values(precip_raster)
temp_mean <- mean(temp_vals, na.rm = TRUE)

cat("\nTemperature values (°C):\n")
cat("  Range:", round(min(temp_vals, na.rm = TRUE), 1), "to", 
    round(max(temp_vals, na.rm = TRUE), 1), "°C\n")
cat("  Mean:", round(temp_mean, 1), "°C\n")
if (temp_mean >= 5 && temp_mean <= 30) {
  cat("✓ Values look correct for Pakistan (should be 5-30°C)\n")
} else {
  cat("⚠ Raster-wide mean is outside 5-30°C - the stack likely extends\n")
  cat("  beyond Pakistan (high mountains / neighboring regions);\n")
  cat("  re-check the values after clipping to the boundary below\n")
}

cat("\nPrecipitation values (mm):\n")
cat("  Range:", round(min(precip_vals, na.rm = TRUE), 0), "to", 
    round(max(precip_vals, na.rm = TRUE), 0), "mm\n")
cat("  Mean:", round(mean(precip_vals, na.rm = TRUE), 0), "mm\n")

# ======================================
# PART B: LOAD AND CHECK COORDINATE SYSTEMS
# ======================================
# Confirms the raster's CRS and re-projects the boundary onto it so that the
# raster, boundary, and point overlays line up in the maps below.
# NOTE(review): boundary_path / pak_boundary were already created in STEP 7;
# they are rebuilt here (same inputs) so this mapping section runs standalone.

cat("\n=== CHECKING COORDINATE SYSTEMS ===\n")

# Get raster CRS
raster_crs <- crs(temp_raster, describe = TRUE)
cat("\nRaster CRS:\n")
cat("  Code:", raster_crs$code, "\n")
cat("  Name:", raster_crs$name, "\n")

# Load Pakistan boundary
boundary_path <- file.path(base_path, "data/boundaries/pakistan_admin.shp")
pak_boundary <- st_read(boundary_path, quiet = TRUE)

cat("\nBoundary original CRS:\n")
cat("  ", st_crs(pak_boundary)$input, "\n")

# Transform boundary to match raster CRS (WGS84 - EPSG:4326)
pak_boundary_wgs84 <- st_transform(pak_boundary, crs(temp_raster))
cat("✓ Boundary transformed to match raster CRS\n")

# ======================================
# PART C: CONVERT RASTERS TO DATAFRAME FOR PLOTTING
# ======================================
# ggplot2 needs long-format data, so each raster layer becomes an
# (x, y, value) table which is then clipped to Pakistan's bounding box.

cat("\n=== CONVERTING RASTERS TO PLOTTING FORMAT ===\n")

# Helper: one raster layer -> data frame with a named value column
raster_to_df <- function(r, value_name) {
  df <- as.data.frame(r, xy = TRUE, na.rm = TRUE)
  names(df) <- c("x", "y", value_name)
  df
}

temp_df <- raster_to_df(temp_raster, "temperature")
precip_df <- raster_to_df(precip_raster, "precipitation")

cat("Total temperature points:", nrow(temp_df), "\n")
cat("Total precipitation points:", nrow(precip_df), "\n")

# Get Pakistan bounding box
pak_bbox <- st_bbox(pak_boundary_wgs84)
cat("\nPakistan bounding box:\n")
cat("  Longitude:", round(pak_bbox$xmin, 2), "to", round(pak_bbox$xmax, 2), "\n")
cat("  Latitude:", round(pak_bbox$ymin, 2), "to", round(pak_bbox$ymax, 2), "\n")

# Helper: keep only the cells whose center falls inside the bounding box
clip_to_bbox <- function(df) {
  inside <- df$x >= pak_bbox$xmin & df$x <= pak_bbox$xmax &
    df$y >= pak_bbox$ymin & df$y <= pak_bbox$ymax
  df[inside, ]
}

temp_df_pak <- clip_to_bbox(temp_df)
precip_df_pak <- clip_to_bbox(precip_df)

cat("\nPoints within Pakistan bounds:\n")
cat("  Temperature:", nrow(temp_df_pak), " (", 
    round(100 * nrow(temp_df_pak)/nrow(temp_df), 1), "% of total)\n")
cat("  Precipitation:", nrow(precip_df_pak), " (",
    round(100 * nrow(precip_df_pak)/nrow(precip_df), 1), "% of total)\n")

# ======================================
# PART D: CREATE TEMPERATURE MAP
# ======================================
# Raster heat map of annual mean temperature clipped to Pakistan's bbox,
# with the admin boundary and spider presence points on top.

p_temp <- ggplot() +
  # Temperature raster
  geom_raster(data = temp_df_pak, aes(x = x, y = y, fill = temperature)) +
  
  # Pakistan boundary
  geom_sf(data = pak_boundary_wgs84, fill = NA, color = "black", linewidth = 0.8) +
  
  # Spider presence points
  # (white layer drawn first acts as a halo so the darkred dots stay
  # visible over dark raster cells)
  geom_point(data = spider_data, aes(x = X, y = Y), 
             color = "white", size = 1.5, alpha = 0.7) +
  geom_point(data = spider_data, aes(x = X, y = Y), 
             color = "darkred", size = 1, alpha = 0.9) +
  
  # Color scale
  # NOTE(review): direction = -1 reverses the plasma ramp - confirm the
  # intended hot/cold color mapping is what this produces
  scale_fill_viridis_c(
    option = "plasma", 
    name = "Temperature (°C)",
    direction = -1,
    na.value = "transparent"
  ) +
  
  # Set proper limits (expand = FALSE removes the default margin)
  coord_sf(xlim = c(pak_bbox$xmin, pak_bbox$xmax),
           ylim = c(pak_bbox$ymin, pak_bbox$ymax),
           expand = FALSE) +
  
  # Labels (subtitle reports the observed min/max of the clipped raster)
  labs(
    title = "Annual Mean Temperature - Pakistan",
    subtitle = paste0("Range: ", round(min(temp_df_pak$temperature), 1), 
                      "°C to ", round(max(temp_df_pak$temperature), 1), "°C"),
    x = "Longitude", y = "Latitude",
    caption = "Red dots: Spider sampling locations"
  ) +
  
  # Theme
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "darkred", size = 11),
    legend.position = "right",
    legend.key.height = unit(1.5, "cm"),
    panel.grid = element_blank(),
    axis.text = element_text(size = 10)
  )

# ======================================
# PART E: CREATE PRECIPITATION MAP
# ======================================
# Same layout as PART D, but for annual precipitation with the default
# viridis ramp and darkblue presence points.

p_precip <- ggplot() +
  # Precipitation raster
  geom_raster(data = precip_df_pak, aes(x = x, y = y, fill = precipitation)) +
  
  # Pakistan boundary
  geom_sf(data = pak_boundary_wgs84, fill = NA, color = "black", linewidth = 0.8) +
  
  # Spider presence points
  # (white halo underneath keeps the darkblue dots visible on dark cells)
  geom_point(data = spider_data, aes(x = X, y = Y), 
             color = "white", size = 1.5, alpha = 0.7) +
  geom_point(data = spider_data, aes(x = X, y = Y), 
             color = "darkblue", size = 1, alpha = 0.9) +
  
  # Color scale (direction = 1 keeps the standard viridis ordering)
  scale_fill_viridis_c(
    option = "viridis", 
    name = "Precipitation (mm)",
    direction = 1,
    na.value = "transparent"
  ) +
  
  # Set proper limits (expand = FALSE removes the default margin)
  coord_sf(xlim = c(pak_bbox$xmin, pak_bbox$xmax),
           ylim = c(pak_bbox$ymin, pak_bbox$ymax),
           expand = FALSE) +
  
  # Labels (subtitle reports the observed min/max of the clipped raster)
  labs(
    title = "Annual Precipitation - Pakistan",
    subtitle = paste0("Range: ", round(min(precip_df_pak$precipitation), 0), 
                      "mm to ", round(max(precip_df_pak$precipitation), 0), "mm"),
    x = "Longitude", y = "Latitude",
    caption = "Blue dots: Spider sampling locations"
  ) +
  
  # Theme
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "darkblue", size = 11),
    legend.position = "right",
    legend.key.height = unit(1.5, "cm"),
    panel.grid = element_blank(),
    axis.text = element_text(size = 10)
  )

# ======================================
# PART F: DISPLAY MAPS
# ======================================

# Display individual maps
# (explicit print() is required for ggplot objects inside scripts/chunks)
print(p_temp)

Code
print(p_precip)

Code
# Combined map
# patchwork's `+` places p_temp and p_precip side by side; plot_annotation()
# adds a shared title/subtitle reporting both clipped value ranges.
p_combined <- p_temp + p_precip +
  plot_annotation(
    title = "Climate Variables of Pakistan",
    subtitle = paste0("Temperature range: ", round(min(temp_df_pak$temperature), 1), 
                      "-", round(max(temp_df_pak$temperature), 1), "°C | ",
                      "Precipitation: ", round(min(precip_df_pak$precipitation), 0),
                      "-", round(max(precip_df_pak$precipitation), 0), "mm"),
    theme = theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
      plot.subtitle = element_text(hjust = 0.5, size = 12)
    )
  )

print(p_combined)

Code
# ======================================
# PART G: SAVE MAPS
# ======================================
# FIX: ensure the output folder exists before writing - ggsave() errors out
# on a missing directory. (showWarnings = FALSE keeps this idempotent.)
dir.create(output_path, recursive = TRUE, showWarnings = FALSE)

ggsave(file.path(output_path, "pakistan_temperature.png"), 
       p_temp, width = 8, height = 6, dpi = 300)

ggsave(file.path(output_path, "pakistan_precipitation.png"), 
       p_precip, width = 8, height = 6, dpi = 300)

ggsave(file.path(output_path, "pakistan_climate_maps.png"), 
       p_combined, width = 14, height = 6, dpi = 300)

# ======================================
# PART H: INTERPRETATION (NEW - ECOLOGICAL MEANING)
# ======================================
# Prints an ecological reading of the maps: observed ranges (clipped to the
# Pakistan bbox), sampling-bias observations, and SDM implications.

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("CLIMATE MAPS - INTERPRETATION GUIDE\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

cat("TEMPERATURE PATTERNS:\n")
cat("  • Range:", round(min(temp_df_pak$temperature), 1), "to", 
    round(max(temp_df_pak$temperature), 1), "°C\n")
cat("  • North-South gradient: Cooler in north (mountains), warmer in south\n")
cat("  • Spider sampling: Primarily in warmer areas (", 
    round(min(spider_data$bio1_annual_mean_temp), 1), "°C to ",
    round(max(spider_data$bio1_annual_mean_temp), 1), "°C)\n\n")

cat("PRECIPITATION PATTERNS:\n")
cat("  • Range:", round(min(precip_df_pak$precipitation), 0), "to", 
    round(max(precip_df_pak$precipitation), 0), "mm\n")
cat("  • East-West gradient: Wetter in east (monsoon influence)\n")
cat("  • Spider sampling: Concentrated in areas with ",
    round(mean(spider_data$bio12_annual_precip), 0), "mm mean precipitation\n\n")

cat("SAMPLING BIAS OBSERVATIONS:\n")
cat("  • Spider points cluster in specific climate zones\n")
cat("  • Under-sampled areas: High mountains, extreme deserts\n")
cat("  • This bias will affect species distribution models\n\n")

cat("IMPLICATIONS FOR SDM:\n")
cat("  • Models may not predict well in unsampled climate zones\n")
cat("  • Predictions in northern areas will be extrapolations\n")
cat("  • Consider spatial cross-validation to account for bias\n")

# ======================================
# PART I: SAMPLING LOCATION SUMMARY
# ======================================
# Compact summary of the climatic and geographic envelope actually sampled.

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("SPIDER SAMPLING LOCATIONS SUMMARY\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

cat("Climate at spider locations:\n")
cat("  Temperature: ", 
    round(min(spider_data$bio1_annual_mean_temp), 1), "°C to ",
    round(max(spider_data$bio1_annual_mean_temp), 1), "°C (mean: ",
    round(mean(spider_data$bio1_annual_mean_temp), 1), "°C)\n", sep="")

cat("  Precipitation: ",
    round(min(spider_data$bio12_annual_precip), 0), "mm to ",
    round(max(spider_data$bio12_annual_precip), 0), "mm (mean: ",
    round(mean(spider_data$bio12_annual_precip), 0), "mm)\n\n", sep="")

cat("Geographic distribution:\n")
cat("  Longitude: ", round(min(spider_data$X), 2), "°E to ",
    round(max(spider_data$X), 2), "°E\n", sep="")
cat("  Latitude: ", round(min(spider_data$Y), 2), "°N to ",
    round(max(spider_data$Y), 2), "°N\n", sep="")

cat("\n✓ Maps saved to output directory\n")

=== CURRENT TEMPERATURE VALUES (POST-CORRECTION) ===

Temperature values (°C):
  Range: -54.8 to 31.2 °C
  Mean: -4.3 °C
⚠ Raster-wide mean (-4.3 °C) is outside the 5-30 °C band expected for Pakistan: the stack extends well beyond Pakistan (Himalaya/Karakoram and neighboring regions). Values within Pakistan are checked after clipping below.

Precipitation values (mm):
  Range: 0 to 11246 mm
  Mean: 537 mm

=== CHECKING COORDINATE SYSTEMS ===

Raster CRS:
  Code: 4326 
  Name: WGS 84 

Boundary original CRS:
   WGS 84 
✓ Boundary transformed to match raster CRS

=== CONVERTING RASTERS TO PLOTTING FORMAT ===
Total temperature points: 12532612 
Total precipitation points: 12532612 

Pakistan bounding box:
  Longitude: 60.9 to 77.84 
  Latitude: 23.7 to 37.1 

Points within Pakistan bounds:
  Temperature: 124758  ( 1 % of total)
  Precipitation: 124758  ( 1 % of total)

 ====================================================================== 
CLIMATE MAPS - INTERPRETATION GUIDE
====================================================================== 

TEMPERATURE PATTERNS:
  • Range: -18.7 to 27.9 °C
  • North-South gradient: Cooler in north (mountains), warmer in south
  • Spider sampling: Primarily in warmer areas ( 21.9 °C to  25.2 °C)

PRECIPITATION PATTERNS:
  • Range: 24 to 2532 mm
  • East-West gradient: Wetter in east (monsoon influence)
  • Spider sampling: Concentrated in areas with  467 mm mean precipitation

SAMPLING BIAS OBSERVATIONS:
  • Spider points cluster in specific climate zones
  • Under-sampled areas: High mountains, extreme deserts
  • This bias will affect species distribution models

IMPLICATIONS FOR SDM:
  • Models may not predict well in unsampled climate zones
  • Predictions in northern areas will be extrapolations
  • Consider spatial cross-validation to account for bias

 ====================================================================== 
SPIDER SAMPLING LOCATIONS SUMMARY
====================================================================== 

Climate at spider locations:
  Temperature: 21.9°C to 25.2°C (mean: 23.7°C)
  Precipitation: 166mm to 764mm (mean: 467mm)

Geographic distribution:
  Longitude: 71.93°E to 74.48°E
  Latitude: 30.04°N to 32.93°N

✓ Maps saved to output directory

Temperature Values are Correct

  • Spider temperatures: 21.9°C to 25.2°C (mean 23.7°C) ✓
  • Full raster range: -54.8°C to 31.2°C (the stack extends beyond Pakistan; within the Pakistan bounding box the range is -18.7°C to 27.9°C, including the Himalayas)

1. Why Collinearity is Everywhere in Climate Predictors

Key Points:

  • BioClim variables are derived from the same time series → correlated by construction
  • Collinearity harms interpretability (coefficient signs can flip)
  • Prediction may still be fine, but inference becomes fragile

Code
# ==============================================================================
# STEP 11: CALCULATE CORRELATION MATRIX
# ==============================================================================
# Correlation measures how strongly variables are related (range: -1 to +1).
# Pairs with |r| > 0.7 are flagged - a common SDM rule of thumb for predictor
# pairs too collinear to enter the same regression model.

# Select only the environmental variables (exclude coordinates)
X <- dat |> dplyr::select(temp, precip, elev, tmax, tmin, temp_seas, precip_seas)

# Pairwise Pearson correlations; "complete.obs" drops rows with any NA
cor_matrix <- cor(X, use = "complete.obs")

# Display correlation matrix rounded to 2 decimal places
cat("\n=== CORRELATION MATRIX ===\n")
print(round(cor_matrix, 2))

# Identify highly correlated pairs (|r| > 0.7).
# upper.tri() keeps each unordered pair once and skips the diagonal.
high_cor <- which(abs(cor_matrix) > 0.7 & upper.tri(cor_matrix), arr.ind = TRUE)

cat("\n=== HIGHLY CORRELATED VARIABLES (|r| > 0.7) ===\n")
if(nrow(high_cor) > 0) {
  # FIX: seq_len() instead of 1:nrow() - safe even for zero rows
  # (1:0 would iterate over c(1, 0))
  for(i in seq_len(nrow(high_cor))) {
    var1 <- rownames(cor_matrix)[high_cor[i, 1]]
    var2 <- colnames(cor_matrix)[high_cor[i, 2]]
    cat(sprintf("  %s - %s: %.2f\n", var1, var2, 
                cor_matrix[high_cor[i, 1], high_cor[i, 2]]))
  }
} else {
  cat("  No correlations > 0.7 detected\n")
}

=== CORRELATION MATRIX ===
             temp precip  elev  tmax  tmin temp_seas precip_seas
temp         1.00  -0.87 -0.74  0.81  0.49      0.53        0.08
precip      -0.87   1.00  0.58 -0.82 -0.15     -0.74        0.21
elev        -0.74   0.58  1.00 -0.63 -0.38     -0.47       -0.24
tmax         0.81  -0.82 -0.63  1.00 -0.03      0.81       -0.37
tmin         0.49  -0.15 -0.38 -0.03  1.00     -0.43        0.69
temp_seas    0.53  -0.74 -0.47  0.81 -0.43      1.00       -0.48
precip_seas  0.08   0.21 -0.24 -0.37  0.69     -0.48        1.00

=== HIGHLY CORRELATED VARIABLES (|r| > 0.7) ===
  temp - precip: -0.87
  temp - elev: -0.74
  temp - tmax: 0.81
  precip - tmax: -0.82
  precip - temp_seas: -0.74
  tmax - temp_seas: 0.81

Correlation Matrix Matches Interpretation

  • temp - precip: -0.87 (strong negative) ✓
  • temp - tmax: 0.81 (strong positive) ✓
  • tmax - temp_seas: 0.81 (strong positive) ✓
Code
# ==============================================================================
# STEP 12: CREATE CORRELATION HEATMAP
# ==============================================================================
# Visual representation of the correlation matrix.
# FIX: cell labels were hard-coded white, which made the values on the
# near-white tiles around r ≈ 0 invisible. Label color now adapts to tile
# darkness (black on pale tiles, white on saturated ones).

# Prepare data for plotting (long format: one row per matrix cell)
cor_df <- as.data.frame(as.table(cor_matrix))
names(cor_df) <- c("Var1", "Var2", "Correlation")

# Create heatmap
p1 <- ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = round(Correlation, 2),
                color = abs(Correlation) > 0.6), size = 3) +
  # Map the TRUE/FALSE darkness flag to white/black and hide its legend
  scale_color_manual(values = c(`FALSE` = "black", `TRUE` = "white"),
                     guide = "none") +
  scale_fill_gradient2(
    low = "darkred",     # Negative correlations
    mid = "white",       # Zero correlation  
    high = "darkblue",   # Positive correlations
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  labs(
    title = "Collinearity Among Environmental Predictors",
    subtitle = "Bioclim variables are correlated by construction",
    x = "", y = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "darkred", size = 11),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )

print(p1)

Code
# Save the plot
# (dimensions in inches; dpi = 300 gives print-quality output)
ggsave(file.path(output_path, "s3_correlation_heatmap.png"), 
       p1, width = 8, height = 7, dpi = 300)
cat("✓ Correlation heatmap saved\n")
✓ Correlation heatmap saved

Interpretation Guide

| What to Look For | What It Means | Ecological Implication |
|---|---|---|
| Dark blue squares (r > 0.7) | Strong positive correlation | Variables measure similar environmental gradients |
| Dark red squares (r < -0.7) | Strong negative correlation | Variables have inverse relationships (e.g., temp & elevation) |
| White squares (r ≈ 0) | No correlation | Variables provide independent information |

Key Insight: Temperature variables (temp, tmax, tmin) are all strongly correlated - they’re measuring the same thermal gradient. This means we can’t use all of them in a simple model without causing problems.


2. VIF — “How much is variance inflated by collinearity?”

Key Points: - VIF ≈ 1: no collinearity (ok) - VIF 5–10: moderate collinearity (concerning) - VIF > 10: severe collinearity (problematic) - VIF is model/feature-set dependent

Code
# ==============================================================================
# STEP 13: CALCULATE VARIANCE INFLATION FACTOR (VIF)
# ==============================================================================
# VIF_j = 1 / (1 - R²_j), where R²_j comes from regressing predictor j on all
# of the other predictors. A large VIF means predictor j is well explained by
# the rest of the set, i.e. it is collinear with them.
#
# FIX: the previous approach fitted one lm() with `temp` as a pseudo-response
# and called car::vif() on it. That only yields VIFs for the OTHER six
# predictors (`temp` itself never gets one), and the values depend on which
# variable is picked as the response. Here the VIF is computed directly for
# every predictor instead.

# Legacy model kept for compatibility with any later chunk that uses `m`
m <- lm(temp ~ precip + elev + tmax + tmin + temp_seas + precip_seas, data = dat, singular.ok = TRUE)

# All seven candidate predictors in the combined dataset `dat`
predictor_names <- c("temp", "precip", "elev", "tmax", "tmin",
                     "temp_seas", "precip_seas")

# For each predictor: regress it on the others and convert R² to a VIF
vif_vals <- vapply(predictor_names, function(v) {
  others <- setdiff(predictor_names, v)
  r2 <- summary(lm(reformulate(others, response = v), data = dat))$r.squared
  1 / (1 - r2)
}, numeric(1))

# Create dataframe for results
vif_df <- data.frame(
  Variable = names(vif_vals),
  VIF = as.numeric(vif_vals)
) %>%
  mutate(
    # Classify severity using the usual rule-of-thumb thresholds.
    # case_when() evaluates conditions in order, so the bands cannot overlap
    # and the compound "VIF >= 5 & VIF < 10" test is unnecessary.
    severity = case_when(
      VIF < 5 ~ "Low",
      VIF < 10 ~ "Moderate",
      TRUE ~ "High"
    )
  ) %>%
  arrange(desc(VIF))

cat("\n=== VARIANCE INFLATION FACTORS ===\n")
print(vif_df)

=== VARIANCE INFLATION FACTORS ===
     Variable       VIF severity
1   temp_seas 15.871397     High
2        tmin  9.874592 Moderate
3      precip  7.639062 Moderate
4        tmax  6.790539 Moderate
5 precip_seas  3.745971      Low
6        elev  3.603966      Low

VIF Values Support Your Conclusions

  • temp_seas: 15.87 (HIGH - must address) ✓
  • tmin: 9.87 (Moderate - concerning) ✓
  • precip: 7.64 (Moderate - concerning) ✓
Code
# ==============================================================================
# STEP 14: CREATE VIF BARPLOT
# ==============================================================================
# Horizontal bars sorted by VIF, with dashed reference lines at the
# conventional moderate (5) and high (10) thresholds.

# Severity-to-color mapping shared with the table above
severity_palette <- c(
  "Low" = "steelblue",
  "Moderate" = "orange",
  "High" = "red"
)

p_vif <- ggplot(vif_df, aes(x = reorder(Variable, VIF), y = VIF, fill = severity)) +
  geom_col(width = 0.7, alpha = 0.9) +
  # Numeric label just past the end of each bar
  geom_text(aes(label = round(VIF, 2)), hjust = -0.2, size = 4) +
  # Dashed rule-of-thumb thresholds
  geom_hline(yintercept = 5, linetype = "dashed", color = "orange", size = 1) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "red", size = 1) +
  annotate("text", x = 1, y = 5.5,
           label = "VIF = 5 (Moderate)",
           color = "orange", size = 3.5, hjust = 0) +
  annotate("text", x = 1, y = 10.5,
           label = "VIF = 10 (High)",
           color = "red", size = 3.5, hjust = 0) +
  # Horizontal orientation, with 15% headroom beyond the largest bar
  coord_flip(ylim = c(0, max(vif_df$VIF) * 1.15)) +
  scale_fill_manual(values = severity_palette) +
  labs(
    title = "Variance Inflation Factors (VIF)",
    subtitle = "How much is variance inflated by collinearity?",
    x = NULL,
    y = "Variance Inflation Factor",
    fill = "Severity"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_blank(),
    axis.text.y = element_text(size = 11),
    plot.subtitle = element_text(hjust = 0.5, color = "gray30"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

print(p_vif)

Code
# Write the VIF barplot alongside the other session figures
vif_plot_file <- file.path(output_path, "s3_vif_barplot.png")
ggsave(vif_plot_file, p_vif, width = 8, height = 5, dpi = 300)
cat("✓ VIF plot saved\n")
✓ VIF plot saved

Interpretation Guide

VIF Range Severity What It Means What To Do
< 5 Low Collinearity not a problem OK to use variable
5 - 10 Moderate Collinearity concerning Consider regularization
> 10 High Severe collinearity Must address (regularization or remove)

Key Insight: A variable with VIF > 10 (here temp_seas, VIF ≈ 15.9) is highly redundant with the others, and tmin (VIF ≈ 9.9) sits right at the borderline. Including all of them will make your model coefficients unstable and uninterpretable.


3. Ecological Trade-offs in Variable Removal

Key Points: - Keep predictors that map to mechanisms (water balance, energy, disturbance) - If two variables are redundant: prefer the one that is measurable, transferable - Prefer variables less sensitive to sampling bias

Code
# ==============================================================================
# STEP 15: VISUALIZE REDUNDANT VARIABLES WITH DIFFERENT MEANINGS
# ==============================================================================

# Calculate correlation between tmax (derived) and temp (direct)
cor_temp_tmax <- cor(dat$temp, dat$tmax, use = "complete.obs")

# Create scatterplot
p_eco <- ggplot(dat, aes(x = temp, y = tmax)) +
  geom_point(alpha = 0.3, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", color = "darkred", se = TRUE, alpha = 0.2) +
  annotate("text", 
           x = min(dat$temp) + 0.8 * diff(range(dat$temp)), 
           y = min(dat$tmax) + 0.9 * diff(range(dat$tmax)),
           label = paste0("r = ", round(cor_temp_tmax, 3)),
           size = 5, color = "darkred", fontface = "bold") +
  labs(
    title = "Redundant Predictors with Different Ecological Meaning",
    subtitle = "Annual mean temp (direct) vs Max temp (derived)",
    x = "Annual Mean Temperature (°C)",
    y = "Maximum Temperature of Warmest Month (°C)",
    caption = "Both measure temperature but at different temporal scales"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "darkred"),
    plot.caption = element_text(hjust = 0, face = "italic")
  )

print(p_eco)

Code
# Write the trade-off scatterplot to the session output folder
eco_plot_file <- file.path(output_path, "s3_ecological_tradeoffs.png")
ggsave(eco_plot_file, p_eco, width = 8, height = 6, dpi = 300)
Code
# ==============================================================================
# STEP 16: PRINT ECOLOGICAL DECISION GUIDANCE
# ==============================================================================
# Console-only summary of the principles used to choose among collinear
# predictors. Output is grouped into one cat() call per numbered section.

rule70 <- paste(rep("=", 70), collapse = "")

cat("\n", rule70, "\n")
cat("ECOLOGICAL TRADE-OFFS IN VARIABLE REMOVAL\n")
cat(rule70, "\n\n")

cat(paste0(
  "KEY PRINCIPLES:\n",
  "1. Keep predictors that map to MECHANISMS:\n",
  "   • Water balance (precipitation, aridity)\n",
  "   • Energy (temperature, radiation)\n",
  "   • Disturbance regimes\n\n"
))

cat(paste0(
  "2. When variables are REDUNDANT, prefer the one that is:\n",
  "   • MEASURABLE: Can be collected consistently across studies\n",
  "   • TRANSFERABLE: Works in different regions/time periods\n",
  "   • LESS ENGINEERED: Closer to raw measurements\n",
  "   • LESS BIASED: Not affected by sampling design\n\n"
))

cat(paste0(
  "3. IN THIS DATASET:\n",
  "   • temp (annual mean): Direct measurement, easy to interpret\n",
  "   • tmax (max temp): Derived, more relevant for heat stress\n",
  "   • tmin (min temp): Derived, relevant for cold tolerance\n",
  "   • temp_seas (seasonality): Derived, measures variability\n\n"
))

cat(paste0(
  "4. DECISION RULES:\n",
  "   • If studying general thermal niche → keep temp\n",
  "   • If studying heat stress → keep tmax\n",
  "   • If studying cold tolerance → keep tmin\n",
  "   • If studying climate variability → keep temp_seas\n",
  "   • CANNOT keep all - they're too correlated!\n"
))

 ====================================================================== 
ECOLOGICAL TRADE-OFFS IN VARIABLE REMOVAL
====================================================================== 

KEY PRINCIPLES:
1. Keep predictors that map to MECHANISMS:
   • Water balance (precipitation, aridity)
   • Energy (temperature, radiation)
   • Disturbance regimes

2. When variables are REDUNDANT, prefer the one that is:
   • MEASURABLE: Can be collected consistently across studies
   • TRANSFERABLE: Works in different regions/time periods
   • LESS ENGINEERED: Closer to raw measurements
   • LESS BIASED: Not affected by sampling design

3. IN THIS DATASET:
   • temp (annual mean): Direct measurement, easy to interpret
   • tmax (max temp): Derived, more relevant for heat stress
   • tmin (min temp): Derived, relevant for cold tolerance
   • temp_seas (seasonality): Derived, measures variability

4. DECISION RULES:
   • If studying general thermal niche → keep temp
   • If studying heat stress → keep tmax
   • If studying cold tolerance → keep tmin
   • If studying climate variability → keep temp_seas
   • CANNOT keep all - they're too correlated!

Interpretation Guide

Question Answer Ecological Reasoning
Which variable is most interpretable? Annual mean temp (temp) Direct measurement, everyone understands °C
Which captures extremes? Max temp (tmax), Min temp (tmin) Important for physiological limits
Which captures variability? Temperature seasonality (temp_seas) Important for climate stability
Can we keep all? NO! (|r| up to 0.87) They’re measuring the same gradient

Key Insight: Statistical redundancy doesn’t mean ecological redundancy. Choose based on your research question, not just correlation values.


4. Ridge vs LASSO vs Elastic Net

Key Points:

  • Ridge (α=0): Shrinks correlated coefficients together (stable prediction)

  • LASSO (α=1): Selects one among correlated variables (sparse, but unstable)

  • Elastic Net (α=0.5): Compromise; good default for BioClim sets

Code
# ==============================================================================
# STEP 17: PREPARE DATA FOR GLMNET (WITH PSEUDO-ABSENCES)
# ==============================================================================
# Builds the presence/pseudo-absence dataset and the glmnet inputs:
#   1. sample random background (pseudo-absence) points from the climate raster
#   2. extract climate values at those points
#   3. harmonize column names between spider and background data
#   4. stack both into `dat` (pa = 1 presence, pa = 0 background)
#   5. build the predictor matrix Xmat and response vector y
# Rule of thumb applied: ~10x more background points than presences.

# Set seed so the random background sample is reproducible
set.seed(123)

# Sample 1000 random cell locations from the first raster layer;
# na.rm = TRUE skips cells with missing climate values
bg_points <- terra::spatSample(climate_stack[[1]], size = 1000, method = "random", 
                                xy = TRUE, as.df = TRUE, na.rm = TRUE) %>%
  dplyr::select(x = x, y = y)  # keep only the coordinate columns

cat("\n=== CREATING PSEUDO-ABSENCE POINTS ===\n")
cat("Real presence points:", nrow(spider_data), "\n")
cat("Pseudo-absence points:", nrow(bg_points), "\n")
cat("Ratio (background:presence):", round(nrow(bg_points)/nrow(spider_data), 1), ":1\n")

# Extract climate data at each background coordinate
bg_climate <- terra::extract(climate_stack, bg_points[, c("x", "y")])
bg_climate <- bg_climate[, -1]  # drop the ID column that terra::extract() prepends

# Attach climate columns to coordinates; pa = 0 marks pseudo-absence
bg_data <- cbind(bg_points, bg_climate) %>%
  mutate(pa = 0)

# ============================================================
# FIRST: Create spider_model (BEFORE trying to use it!)
# ============================================================

# pa = 1 marks a real presence observation
spider_data$pa <- 1

# Select the bioclim columns and rename them to the short names
# (temp, tmax, ...) used throughout the rest of the session
spider_model <- spider_data %>%
  dplyr::select(pa, X, Y, 
                temp = bio1_annual_mean_temp,
                tmax = bio5_max_temp_warmest_month,
                tmin = bio6_min_temp_coldest_month,
                temp_seas = bio4_temp_seasonality,
                precip = bio12_annual_precip,
                precip_seas = bio15_precip_seasonality,
                elev = elevation)

# ============================================================
# NOW remove background points that overlap with presences
# ============================================================

# NOTE(review): anti_join matches on EXACT coordinate equality. Background
# points are raster cell centers and presences are field coordinates, so this
# will rarely remove anything (the recorded run removed 0 points) — confirm
# whether cell-level (rather than exact-coordinate) de-duplication was meant.
bg_data <- anti_join(bg_data, spider_model, by = c("x" = "X", "y" = "Y"))
cat("  Background after removing overlap:", nrow(bg_data), "\n")

# ============================================================
# THEN create bg_model and combine
# ============================================================

# Same column selection/renaming as spider_model, then rename the
# lowercase coordinates to X/Y so the two tables stack cleanly
bg_model <- bg_data %>%
  dplyr::select(pa, x, y,
                temp = bio1_annual_mean_temp,
                tmax = bio5_max_temp_warmest_month,
                tmin = bio6_min_temp_coldest_month,
                temp_seas = bio4_temp_seasonality,
                precip = bio12_annual_precip,
                precip_seas = bio15_precip_seasonality,
                elev = elevation) %>%
  rename(X = x, Y = y)

# Stack presences (pa = 1) on top of pseudo-absences (pa = 0)
dat <- bind_rows(spider_model, bg_model)

cat("\n✓ Combined dataset created\n")
cat("  Total points:", nrow(dat), "\n")
cat("  Presences (1):", sum(dat$pa == 1), "\n")
cat("  Pseudo-absences (0):", sum(dat$pa == 0), "\n")

# ==============================================================================
# PREPARE FOR GLMNET
# ==============================================================================

# Predictor matrix: all seven (deliberately collinear) environmental variables;
# glmnet requires a numeric matrix, not a data frame
Xmat <- dat |> 
  dplyr::select(temp, precip, elev, tmax, tmin, temp_seas, precip_seas) |> 
  as.matrix()

# Binary response: 1 = presence, 0 = pseudo-absence
y <- dat$pa

cat("\n✓ Data ready for glmnet\n")
cat("  Predictor matrix dimensions:", dim(Xmat)[1], "rows,", dim(Xmat)[2], "columns\n")
cat("  Variables:", colnames(Xmat), "\n")

# Sanity check: do the background points span a wider environmental range
# than the presences (as expected for a regional background sample)?
cat("\n=== ENVIRONMENTAL COVERAGE ===\n")
cat("Temperature range - Presences:",
    round(min(dat$temp[dat$pa==1], na.rm=TRUE), 1), "to",
    round(max(dat$temp[dat$pa==1], na.rm=TRUE), 1), "°C\n")
cat("Temperature range - Background:",
    round(min(dat$temp[dat$pa==0], na.rm=TRUE), 1), "to",
    round(max(dat$temp[dat$pa==0], na.rm=TRUE), 1), "°C\n")

=== CREATING PSEUDO-ABSENCE POINTS ===
Real presence points: 93 
Pseudo-absence points: 1000 
Ratio (background:presence): 10.8 :1
  Background after removing overlap: 1000 

✓ Combined dataset created
  Total points: 1093 
  Presences (1): 93 
  Pseudo-absences (0): 1000 

✓ Data ready for glmnet
  Predictor matrix dimensions: 1093 rows, 7 columns
  Variables: temp precip elev tmax tmin temp_seas precip_seas 

=== ENVIRONMENTAL COVERAGE ===
Temperature range - Presences: 21.9 to 25.2 °C
Temperature range - Background: -54 to 30.5 °C
Code
# ==============================================================================
# STEP 18: FIT THREE MODELS WITH DIFFERENT ALPHA VALUES
# ==============================================================================
# alpha controls the penalty mix: 0 = ridge (L2), 1 = LASSO (L1),
# 0.5 = elastic net (equal mix). All three are logistic models (binomial).

# Ridge regression (alpha = 0) - shrinks coefficients but keeps all variables
fit_ridge <- glmnet(Xmat, y, family = "binomial", alpha = 0)

# LASSO (alpha = 1) - can shrink coefficients to exactly zero (variable selection)
fit_lasso <- glmnet(Xmat, y, family = "binomial", alpha = 1)

# Elastic Net (alpha = 0.5) - compromise between ridge and LASSO
fit_enet <- glmnet(Xmat, y, family = "binomial", alpha = 0.5)

set.seed(456)  # Different seed for CV

# FIX: use the SAME cross-validation folds for all three alpha values.
# cv.glmnet() assigns folds at random, so letting each call draw its own
# folds makes the three CV curves (and hence lambda.min / lambda.1se) not
# directly comparable. The glmnet documentation recommends passing a shared
# foldid whenever models with different alpha are compared.
foldid <- sample(rep(seq_len(5), length.out = nrow(Xmat)))

# Cross-validation to find optimal lambda (5 folds, shared across models)
cv_ridge <- cv.glmnet(Xmat, y, family = "binomial", alpha = 0, foldid = foldid)
cv_lasso <- cv.glmnet(Xmat, y, family = "binomial", alpha = 1, foldid = foldid)
cv_enet <- cv.glmnet(Xmat, y, family = "binomial", alpha = 0.5, foldid = foldid)

cat("\n=== OPTIMAL LAMBDA VALUES ===\n")
cat("Ridge (α=0):      λ.min =", round(cv_ridge$lambda.min, 4), 
    "| log(λ) =", round(log(cv_ridge$lambda.min), 2), "\n")
cat("LASSO (α=1):      λ.min =", round(cv_lasso$lambda.min, 4), 
    "| log(λ) =", round(log(cv_lasso$lambda.min), 2), "\n")
cat("Elastic Net (α=0.5): λ.min =", round(cv_enet$lambda.min, 4), 
    "| log(λ) =", round(log(cv_enet$lambda.min), 2), "\n")

=== OPTIMAL LAMBDA VALUES ===
Ridge (α=0):      λ.min = 0.009 | log(λ) = -4.71 
LASSO (α=1):      λ.min = 3e-04 | log(λ) = -8.26 
Elastic Net (α=0.5): λ.min = 0 | log(λ) = -10.92 

Interpretation:
- Ridge: λ.min = 0.009 (log = -4.71) ✓ Small lambda = mild shrinkage - LASSO: λ.min = 3e-04 (log = -8.26) ✓ Tiny lambda = almost no penalty needed - Elastic Net: λ.min ≈ 0 (log = -10.92) ✓ Same interpretation

Code
# ==============================================================================
# STEP 19: ENHANCED COEFFICIENT PATH PLOTS
# ==============================================================================
# These plots show how coefficients change as lambda (regularization strength)
# increases; each line is one environmental variable's coefficient path.
# Base graphics are used because plot.glmnet() is a base-graphics method;
# par() settings below are global state and are reset in the next chunk.

# Load required library for colors
# NOTE(review): viridis is already attached in the setup chunk; this repeat
# library() call is harmless but redundant.
library(viridis)

# Set up plotting area for three panels with larger margins for text
par(mfrow = c(1, 3), 
    mar = c(6, 5, 8, 3),  # Increased margins: bottom, left, top, right
    cex.lab = 1.3,         # Axis label size
    cex.axis = 1.2,        # Axis tick label size
    cex.main = 1.5)        # Main title size

# ======================================
# 1. RIDGE PLOT (α = 0)
# ======================================
plot(fit_ridge, xvar = "lambda", label = TRUE, 
     main = "",  # We'll add custom title with annotations
     col = viridis(7),  # Color-blind friendly colors
     lwd = 2.5,         # Thicker lines
     cex.lab = 1.3,
     cex.axis = 1.2)

# Two-line custom title: bold header plus an italic one-line explanation
title("RIDGE REGRESSION (α = 0)", line = 4.5, cex.main = 1.6, font.main = 2)
title(expression("Shrinks coefficients together • Keeps all variables"), 
      line = 3.2, cex.main = 1.1, font.main = 3, col.main = "darkblue")

# Dashed vertical markers at the two cross-validated lambda choices
abline(v = log(cv_ridge$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_ridge$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)

# Label the two lambda lines.
# NOTE(review): coef(fit_ridge)[-1, 1] is the coefficient column at the FIRST
# (largest) lambda of the path, i.e. the most-shrunk coefficients, so these
# y-positions sit near zero — confirm this placement is intended.
text(x = log(cv_ridge$lambda.min), 
     y = max(coef(fit_ridge)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3, font = 2)

text(x = log(cv_ridge$lambda.1se), 
     y = max(coef(fit_ridge)[-1,1]) * 0.75, 
     labels = "λ.1se", pos = 2, col = "chocolate3", cex = 1.3, font = 2)

# Interpretation notes rendered as a borderless legend
legend("topright", 
       legend = c("All variables kept", "Coefficients shrink together"),
       bty = "n", cex = 1.1, text.col = "darkblue", 
       bg = "white", box.col = "white")

# Variable-count note in the bottom margin
mtext(paste("All", ncol(Xmat), "variables remain"), 
      side = 1, line = 4, cex = 1.1, col = "darkblue")

# ======================================
# 2. LASSO PLOT (α = 1)
# ======================================
plot(fit_lasso, xvar = "lambda", label = TRUE,
     col = viridis(7), lwd = 2.5,
     cex.lab = 1.3, cex.axis = 1.2)

title("LASSO REGRESSION (α = 1)", line = 4.5, cex.main = 1.6, font.main = 2)
title(expression("Selects one variable among correlated • Sparse solution"), 
      line = 3.2, cex.main = 1.1, font.main = 3, col.main = "darkred")

# Dashed vertical markers at the cross-validated lambda choices
abline(v = log(cv_lasso$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_lasso$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)

# Count non-zero (non-intercept) coefficients at λ.1se; these counts are
# reused by the save-to-PNG chunk below
coef_lasso_1se <- coef(fit_lasso, s = cv_lasso$lambda.1se)
nz_lasso <- sum(coef_lasso_1se[-1] != 0)
dropped_lasso <- ncol(Xmat) - nz_lasso

text(x = log(cv_lasso$lambda.min), 
     y = max(coef(fit_lasso)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3, font = 2)

text(x = log(cv_lasso$lambda.1se), 
     y = max(coef(fit_lasso)[-1,1]) * 0.75, 
     labels = paste0("λ.1se"), pos = 2, col = "chocolate3", cex = 1.3, font = 2)

# Variable-count note in the bottom margin
mtext(paste(nz_lasso, "vars kept,", dropped_lasso, "vars dropped"), 
      side = 1, line = 4, cex = 1.1, col = "darkred")

legend("topright", 
       legend = c("Some variables go to ZERO", "Performs variable selection"),
       bty = "n", cex = 1.1, text.col = "darkred", 
       bg = "white", box.col = "white")

# ======================================
# 3. ELASTIC NET PLOT (α = 0.5)
# ======================================
plot(fit_enet, xvar = "lambda", label = TRUE,
     col = viridis(7), lwd = 2.5,
     cex.lab = 1.3, cex.axis = 1.2)

title("ELASTIC NET (α = 0.5)", line = 4.5, cex.main = 1.6, font.main = 2)
title(expression("Compromise: groups correlated variables • Good default"), 
      line = 3.2, cex.main = 1.1, font.main = 3, col.main = "darkgreen")

# Dashed vertical markers at the cross-validated lambda choices
abline(v = log(cv_enet$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_enet$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)

# Count non-zero (non-intercept) coefficients at λ.1se (also reused below)
coef_enet_1se <- coef(fit_enet, s = cv_enet$lambda.1se)
nz_enet <- sum(coef_enet_1se[-1] != 0)
dropped_enet <- ncol(Xmat) - nz_enet

text(x = log(cv_enet$lambda.min), 
     y = max(coef(fit_enet)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3, font = 2)

text(x = log(cv_enet$lambda.1se), 
     y = max(coef(fit_enet)[-1,1]) * 0.75, 
     labels = paste0("λ.1se"), pos = 2, col = "chocolate3", cex = 1.3, font = 2)

# Variable-count note in the bottom margin
mtext(paste(nz_enet, "vars kept,", dropped_enet, "vars dropped"), 
      side = 1, line = 4, cex = 1.1, col = "darkgreen")

legend("topright", 
       legend = c("Groups correlated variables", "Best of both worlds"),
       bty = "n", cex = 1.1, text.col = "darkgreen", 
       bg = "white", box.col = "white")
Code
# Reset plotting parameters changed by the previous chunk
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)

# ==============================================================================
# SAVE HIGH-RESOLUTION VERSIONS TO OUTPUT DIRECTORY
# ==============================================================================
# Each png()/.../dev.off() pair re-draws one of the three path plots to a file.
# FIX: dev.off() returns the id of the now-current device, and in a notebook
# that return value gets auto-printed as stray "png / 2" lines in the output;
# wrapping each call in invisible() suppresses that noise without changing
# any device behavior.

# --- Ridge paths ---
png(file.path(output_path, "s3_ridge_paths.png"), width = 1200, height = 800, res = 150)
par(mar = c(6, 5, 8, 3), cex.lab = 1.4, cex.axis = 1.3, cex.main = 1.6)
plot(fit_ridge, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("RIDGE REGRESSION (α = 0) - Shrinks coefficients together", line = 3.5)
abline(v = log(cv_ridge$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_ridge$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_ridge$lambda.min), y = max(coef(fit_ridge)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.5, font = 2)
text(x = log(cv_ridge$lambda.1se), y = max(coef(fit_ridge)[-1,1]) * 0.75, 
     labels = "λ.1se", pos = 2, col = "chocolate3", cex = 1.5, font = 2)
invisible(dev.off())

# --- LASSO paths (label includes the λ.1se non-zero count from the chunk above) ---
png(file.path(output_path, "s3_lasso_paths.png"), width = 1200, height = 800, res = 150)
par(mar = c(6, 5, 8, 3), cex.lab = 1.4, cex.axis = 1.3, cex.main = 1.6)
plot(fit_lasso, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("LASSO REGRESSION (α = 1) - Variable selection", line = 3.5)
abline(v = log(cv_lasso$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_lasso$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_lasso$lambda.min), y = max(coef(fit_lasso)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.5, font = 2)
text(x = log(cv_lasso$lambda.1se), y = max(coef(fit_lasso)[-1,1]) * 0.75, 
     labels = paste0("λ.1se (", nz_lasso, " vars)"), pos = 2, col = "chocolate3", cex = 1.5, font = 2)
invisible(dev.off())

# --- Elastic net paths ---
png(file.path(output_path, "s3_enet_paths.png"), width = 1200, height = 800, res = 150)
par(mar = c(6, 5, 8, 3), cex.lab = 1.4, cex.axis = 1.3, cex.main = 1.6)
plot(fit_enet, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("ELASTIC NET (α = 0.5) - Compromise: groups correlated variables", line = 3.5)
abline(v = log(cv_enet$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_enet$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_enet$lambda.min), y = max(coef(fit_enet)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.5, font = 2)
text(x = log(cv_enet$lambda.1se), y = max(coef(fit_enet)[-1,1]) * 0.75, 
     labels = paste0("λ.1se (", nz_enet, " vars)"), pos = 2, col = "chocolate3", cex = 1.5, font = 2)
invisible(dev.off())

# --- Combined three-panel version ---
png(file.path(output_path, "s3_all_coefficient_paths.png"), width = 2400, height = 800, res = 200)
par(mfrow = c(1, 3), mar = c(6, 5, 8, 3), cex.lab = 1.4, cex.axis = 1.3, cex.main = 1.6)

# Ridge
plot(fit_ridge, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("RIDGE (α=0)", line = 3.5)
abline(v = log(cv_ridge$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_ridge$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_ridge$lambda.min), y = max(coef(fit_ridge)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3)
text(x = log(cv_ridge$lambda.1se), y = max(coef(fit_ridge)[-1,1]) * 0.75, 
     labels = "λ.1se", pos = 2, col = "chocolate3", cex = 1.3)

# LASSO
plot(fit_lasso, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("LASSO (α=1)", line = 3.5)
abline(v = log(cv_lasso$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_lasso$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_lasso$lambda.min), y = max(coef(fit_lasso)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3)
text(x = log(cv_lasso$lambda.1se), y = max(coef(fit_lasso)[-1,1]) * 0.75, 
     labels = paste0("λ.1se (", nz_lasso, " vars)"), pos = 2, col = "chocolate3", cex = 1.3)

# Elastic Net
plot(fit_enet, xvar = "lambda", label = TRUE, col = viridis(7), lwd = 2.5)
title("ELASTIC NET (α=0.5)", line = 3.5)
abline(v = log(cv_enet$lambda.min), lty = 2, col = "red", lwd = 3)
abline(v = log(cv_enet$lambda.1se), lty = 2, col = "chocolate3", lwd = 3)
text(x = log(cv_enet$lambda.min), y = max(coef(fit_enet)[-1,1]) * 0.9, 
     labels = "λ.min", pos = 4, col = "red", cex = 1.3)
text(x = log(cv_enet$lambda.1se), y = max(coef(fit_enet)[-1,1]) * 0.75, 
     labels = paste0("λ.1se (", nz_enet, " vars)"), pos = 2, col = "chocolate3", cex = 1.3)

invisible(dev.off())
par(mfrow = c(1, 1))

cat("\n✓ Enhanced coefficient path plots displayed above\n")
cat("✓ High-resolution versions saved to output directory:\n")
cat("  ", file.path(output_path, "s3_ridge_paths.png"), "\n")
cat("  ", file.path(output_path, "s3_lasso_paths.png"), "\n")
cat("  ", file.path(output_path, "s3_enet_paths.png"), "\n")
cat("  ", file.path(output_path, "s3_all_coefficient_paths.png"), "\n")
png 
  2 
png 
  2 
png 
  2 
png 
  2 

✓ Enhanced coefficient path plots displayed above
✓ High-resolution versions saved to output directory:
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_ridge_paths.png 
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_lasso_paths.png 
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_enet_paths.png 
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_all_coefficient_paths.png 
Code
# ==============================================================================
# SUMMARY: VARIABLES RETAINED BY EACH METHOD (comparing λ.min vs λ.1se)
# ==============================================================================

# Helper: names of the predictors with non-zero coefficients in a coefficient
# matrix. Row 1 is the intercept, so it is dropped from BOTH the rownames and
# the logical mask before subsetting.
# FIX: the previous code indexed the FULL rowname vector (length 8, incl.
# intercept) with a mask built from the intercept-free coefficients (length 7);
# R recycled the logical, which misaligned the names and wrongly reported
# "(Intercept)" among the kept variables.
nonzero_vars <- function(coef_mat) {
  rownames(coef_mat)[-1][coef_mat[-1] != 0]
}

# RIDGE (α=0): dense coefficient vectors at both lambda choices
ridge_coef_min <- as.matrix(coef(cv_ridge, s = "lambda.min"))
ridge_coef_1se <- as.matrix(coef(cv_ridge, s = "lambda.1se"))
ridge_nz_min <- sum(ridge_coef_min[-1] != 0)
ridge_nz_1se <- sum(ridge_coef_1se[-1] != 0)

# LASSO (α=1): may zero out coefficients entirely
lasso_coef_min <- as.matrix(coef(cv_lasso, s = "lambda.min"))
lasso_coef_1se <- as.matrix(coef(cv_lasso, s = "lambda.1se"))
lasso_nz_min <- sum(lasso_coef_min[-1] != 0)
lasso_nz_1se <- sum(lasso_coef_1se[-1] != 0)

# ELASTIC NET (α=0.5): compromise between the two
enet_coef_min <- as.matrix(coef(cv_enet, s = "lambda.min"))
enet_coef_1se <- as.matrix(coef(cv_enet, s = "lambda.1se"))
enet_nz_min <- sum(enet_coef_min[-1] != 0)
enet_nz_1se <- sum(enet_coef_1se[-1] != 0)

# Comparison table of kept-variable counts (out of 7 predictors)
variable_summary <- data.frame(
  Model = c("RIDGE (α = 0)", "LASSO (α = 1)", "ELASTIC NET (α = 0.5)"),
  `λ.min (best CV)` = c(paste0(ridge_nz_min, "/7"), 
                        paste0(lasso_nz_min, "/7"), 
                        paste0(enet_nz_min, "/7")),
  `λ.1se (simpler)` = c(paste0(ridge_nz_1se, "/7"), 
                        paste0(lasso_nz_1se, "/7"), 
                        paste0(enet_nz_1se, "/7")),
  check.names = FALSE
)

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("VARIABLE SELECTION SUMMARY\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")
print(variable_summary, row.names = FALSE)

# Show which variables are kept at λ.1se (simpler model)
cat("\n", paste(rep("-", 50), collapse = ""), "\n")
cat("VARIABLES RETAINED AT λ.1se (simpler model)\n")
cat(paste(rep("-", 50), collapse = ""), "\n\n")

# Ridge never zeroes coefficients, so all variables are always kept
cat("RIDGE (α=0): All 7 variables kept\n")

# LASSO: intercept-free list of retained predictor names
lasso_vars_1se <- nonzero_vars(lasso_coef_1se)
cat("\nLASSO (α=1) keeps:", 
    if(length(lasso_vars_1se) > 0) paste(lasso_vars_1se, collapse = ", ") else "none")

# Elastic Net: same extraction
enet_vars_1se <- nonzero_vars(enet_coef_1se)
cat("\nELASTIC NET (α=0.5) keeps:", 
    if(length(enet_vars_1se) > 0) paste(enet_vars_1se, collapse = ", ") else "none")

# INTERPRETATION
cat("\n\n", paste(rep("=", 70), collapse = ""), "\n")
cat("INTERPRETATION\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

cat("• λ.min (minimum CV error): Best prediction accuracy\n")
cat("  - Ridge keeps", ridge_nz_min, "variables\n")
cat("  - LASSO keeps", lasso_nz_min, "variables\n")
cat("  - Elastic Net keeps", enet_nz_min, "variables\n\n")

cat("• λ.1se (simpler model): Within 1 SE of minimum, better for interpretation\n")
cat("  - Ridge keeps", ridge_nz_1se, "variables (always all)\n")
cat("  - LASSO keeps", lasso_nz_1se, "variables\n")
cat("  - Elastic Net keeps", enet_nz_1se, "variables\n\n")

# FIX: the previous text hard-coded the claim that elastic net selects "even
# fewer" variables than LASSO. The counts are data-dependent (the recorded run
# had elastic net keeping MORE, 7 vs 6), so report them neutrally instead.
cat("• At λ.1se:\n")
cat("  - LASSO retains", lasso_nz_1se, "of 7 variables\n")
cat("  - Elastic Net retains", enet_nz_1se, "of 7 variables\n")
cat("  - λ.1se matches the 'simpler model for interpretation' goal\n\n")

cat("• Strongest predictors (at λ.1se):\n")
if(enet_nz_1se > 0) {
  cat("  Elastic Net keeps:", paste(enet_vars_1se, collapse = ", "), "\n")
} else {
  cat("  No variables retained at λ.1se - try λ.min instead\n")
}

 ====================================================================== 
VARIABLE SELECTION SUMMARY
====================================================================== 

                 Model λ.min (best CV) λ.1se (simpler)
         RIDGE (α = 0)             7/7             7/7
         LASSO (α = 1)             5/7             6/7
 ELASTIC NET (α = 0.5)             7/7             7/7

 -------------------------------------------------- 
VARIABLES RETAINED AT λ.1se (simpler model)
-------------------------------------------------- 

RIDGE (α=0): All 7 variables kept

LASSO (α=1) keeps: (Intercept), temp, precip, elev, tmin, temp_seas, precip_seas
ELASTIC NET (α=0.5) keeps: (Intercept), temp, precip, elev, tmax, tmin, temp_seas, precip_seas

 ====================================================================== 
INTERPRETATION
====================================================================== 

• λ.min (minimum CV error): Best prediction accuracy
  - Ridge keeps 7 variables
  - LASSO keeps 5 variables
  - Elastic Net keeps 7 variables

• λ.1se (simpler model): Within 1 SE of minimum, better for interpretation
  - Ridge keeps 7 variables (always all)
  - LASSO keeps 6 variables
  - Elastic Net keeps 7 variables

• Your figure likely shows λ.1se results:
  - LASSO selects fewer variables ( 6 /7)
  - Elastic Net selects even fewer ( 7 /7)
  - This matches 'simpler model for interpretation' goal

• Strongest predictors (at λ.1se):
  Elastic Net keeps: (Intercept), temp, precip, elev, tmax, tmin, temp_seas, precip_seas 
Code
# ==============================================================================
# INTERPRETATION GUIDE FOR COEFFICIENT PATH PLOTS
# ==============================================================================
# Console-only walkthrough: prints a narrated guide for reading the ridge /
# LASSO / elastic-net coefficient path plots produced earlier. Creates no
# objects and has no side effects beyond writing to stdout.

# Banner: blank line, 80-char rule, title, rule
cat("\n", paste(rep("=", 80), collapse = ""), "\n")
cat("UNDERSTANDING COEFFICIENT PATH PLOTS\n")
cat(paste(rep("=", 80), collapse = ""), "\n\n")

# --- How to read the axes and lines common to all three plots ---
cat("WHAT EACH PLOT SHOWS:\n")
cat("• X-axis: log(λ) - regularization strength (moves LEFT as λ increases)\n")
cat("• Y-axis: Coefficient values for each variable\n")
cat("• Each colored line = one environmental variable\n")
cat("• Vertical lines: optimal λ from cross-validation\n\n")

# --- Ridge section: shrinkage without selection ---
cat("────────────────────────────────────────────────────────────────\n")
cat("RIDGE REGRESSION (α = 0)\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("VISUAL PATTERN:\n")
cat("  • All lines remain visible across entire x-axis\n")
cat("  • Lines smoothly approach zero but never reach it\n")
cat("  • Lines move together (correlated variables shrink similarly)\n\n")

cat("STATISTICAL MEANING:\n")
cat("  • Keeps ALL variables in the model\n")
cat("  • Shrinks coefficients toward zero but never exactly zero\n")
cat("  • Correlated variables have similar coefficient paths\n\n")

cat("ECOLOGICAL INTERPRETATION:\n")
cat("  • All environmental gradients contribute to prediction\n")
cat("  • No variable selection - good when all predictors are important\n")
cat("  • Stable predictions even with high collinearity\n\n")

cat("WHEN TO USE:\n")
cat("  • When you have many variables and all are potentially important\n")
cat("  • When prediction stability is priority over interpretability\n")
cat("  • When you want to keep correlated variables (e.g., multiple temperature metrics)\n\n")

# --- LASSO section: sparse selection, potentially unstable ---
cat("────────────────────────────────────────────────────────────────\n")
cat("LASSO REGRESSION (α = 1)\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("VISUAL PATTERN:\n")
cat("  • Some lines disappear (hit exactly zero) as λ increases\n")
cat("  • Only the strongest predictors survive at high λ\n")
cat("  • Can be unstable - which variable survives may vary\n\n")

cat("STATISTICAL MEANING:\n")
cat("  • Performs VARIABLE SELECTION\n")
cat("  • Shrinks some coefficients exactly to zero\n")
cat("  • Among correlated variables, selects one and drops others\n\n")

cat("ECOLOGICAL INTERPRETATION:\n")
cat("  • Identifies the 'most important' environmental predictors\n")
cat("  • May select arbitrary one among correlated variables\n")
cat("  • Result: simpler, more interpretable model\n\n")

cat("WHEN TO USE:\n")
cat("  • When you need a simple, interpretable model\n")
cat("  • When you suspect many variables are irrelevant\n")
cat("  • CAUTION: Can be unstable with high collinearity\n\n")

# --- Elastic net section: compromise between ridge and LASSO ---
cat("────────────────────────────────────────────────────────────────\n")
cat("ELASTIC NET (α = 0.5)\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("VISUAL PATTERN:\n")
cat("  • Hybrid of ridge and LASSO\n")
cat("  • Some lines disappear, but correlated variables shrink together first\n")
cat("  • More stable selection than LASSO alone\n\n")

cat("STATISTICAL MEANING:\n")
cat("  • Groups correlated variables\n")
cat("  • Either keeps or removes entire groups together\n")
cat("  • Compromise between ridge shrinkage and LASSO selection\n\n")

cat("ECOLOGICAL INTERPRETATION:\n")
cat("  • Ideal for BioClim data where variables are naturally grouped\n")
cat("  • Temperature variables kept/dropped as a group\n")
cat("  • Precipitation variables handled separately\n\n")

cat("WHEN TO USE:\n")
cat("  • DEFAULT CHOICE for environmental data\n")
cat("  • When variables have natural grouping structure\n")
cat("  • When you want both stability AND some selection\n\n")

# --- Workshop-specific pointers for this spider dataset ---
cat("────────────────────────────────────────────────────────────────\n")
cat("IN THIS DATASET - WHAT TO LOOK FOR:\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("• Temperature variables (temp, tmax, tmin, temp_seas) will behave similarly\n")
cat("• Watch how LASSO picks ONE temperature variable and drops others\n")
cat("• See how Elastic Net handles them as a group\n")
cat("• Note where λ.min and λ.1se fall relative to variable drop-out points\n\n")

# --- Actionable guidance for choosing a method ---
cat("────────────────────────────────────────────────────────────────\n")
cat("PRACTICAL RECOMMENDATIONS:\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("1. For EXPLORATORY ANALYSIS: Use Elastic Net (α=0.5)\n")
cat("2. For PREDICTION: Compare λ.min and λ.1se models\n")
cat("3. For INTERPRETATION: LASSO if collinearity is low, otherwise Elastic Net\n")
cat("4. For TRANSFERABLE SDMs: Prefer λ.1se (simpler models generalize better)\n")
cat("5. For PUBLICATION: Report both λ.min and λ.1se results\n\n")

# --- Common misinterpretations to warn students about ---
cat("────────────────────────────────────────────────────────────────\n")
cat("COMMON PITFALLS TO AVOID:\n")
cat("────────────────────────────────────────────────────────────────\n")
cat("✗ Don't assume LASSO-selected variables are 'causally' important\n")
cat("✗ Don't use ridge when you need variable selection\n")
cat("✗ Don't ignore cross-validation results - λ matters!\n")
cat("✗ Don't forget that correlated variables create instability in LASSO\n")
cat("✓ DO use domain knowledge to guide variable choice\n")
 ================================================================================ 
UNDERSTANDING COEFFICIENT PATH PLOTS
================================================================================ 

WHAT EACH PLOT SHOWS:
• X-axis: log(λ) - regularization strength (moves LEFT as λ increases)
• Y-axis: Coefficient values for each variable
• Each colored line = one environmental variable
• Vertical lines: optimal λ from cross-validation

────────────────────────────────────────────────────────────────
RIDGE REGRESSION (α = 0)
────────────────────────────────────────────────────────────────
VISUAL PATTERN:
  • All lines remain visible across entire x-axis
  • Lines smoothly approach zero but never reach it
  • Lines move together (correlated variables shrink similarly)

STATISTICAL MEANING:
  • Keeps ALL variables in the model
  • Shrinks coefficients toward zero but never exactly zero
  • Correlated variables have similar coefficient paths

ECOLOGICAL INTERPRETATION:
  • All environmental gradients contribute to prediction
  • No variable selection - good when all predictors are important
  • Stable predictions even with high collinearity

WHEN TO USE:
  • When you have many variables and all are potentially important
  • When prediction stability is priority over interpretability
  • When you want to keep correlated variables (e.g., multiple temperature metrics)

────────────────────────────────────────────────────────────────
LASSO REGRESSION (α = 1)
────────────────────────────────────────────────────────────────
VISUAL PATTERN:
  • Some lines disappear (hit exactly zero) as λ increases
  • Only the strongest predictors survive at high λ
  • Can be unstable - which variable survives may vary

STATISTICAL MEANING:
  • Performs VARIABLE SELECTION
  • Shrinks some coefficients exactly to zero
  • Among correlated variables, selects one and drops others

ECOLOGICAL INTERPRETATION:
  • Identifies the 'most important' environmental predictors
  • May select arbitrary one among correlated variables
  • Result: simpler, more interpretable model

WHEN TO USE:
  • When you need a simple, interpretable model
  • When you suspect many variables are irrelevant
  • CAUTION: Can be unstable with high collinearity

────────────────────────────────────────────────────────────────
ELASTIC NET (α = 0.5)
────────────────────────────────────────────────────────────────
VISUAL PATTERN:
  • Hybrid of ridge and LASSO
  • Some lines disappear, but correlated variables shrink together first
  • More stable selection than LASSO alone

STATISTICAL MEANING:
  • Groups correlated variables
  • Either keeps or removes entire groups together
  • Compromise between ridge shrinkage and LASSO selection

ECOLOGICAL INTERPRETATION:
  • Ideal for BioClim data where variables are naturally grouped
  • Temperature variables kept/dropped as a group
  • Precipitation variables handled separately

WHEN TO USE:
  • DEFAULT CHOICE for environmental data
  • When variables have natural grouping structure
  • When you want both stability AND some selection

────────────────────────────────────────────────────────────────
IN THIS DATASET - WHAT TO LOOK FOR:
────────────────────────────────────────────────────────────────
• Temperature variables (temp, tmax, tmin, temp_seas) will behave similarly
• Watch how LASSO picks ONE temperature variable and drops others
• See how Elastic Net handles them as a group
• Note where λ.min and λ.1se fall relative to variable drop-out points

────────────────────────────────────────────────────────────────
PRACTICAL RECOMMENDATIONS:
────────────────────────────────────────────────────────────────
1. For EXPLORATORY ANALYSIS: Use Elastic Net (α=0.5)
2. For PREDICTION: Compare λ.min and λ.1se models
3. For INTERPRETATION: LASSO if collinearity is low, otherwise Elastic Net
4. For TRANSFERABLE SDMs: Prefer λ.1se (simpler models generalize better)
5. For PUBLICATION: Report both λ.min and λ.1se results

────────────────────────────────────────────────────────────────
COMMON PITFALLS TO AVOID:
────────────────────────────────────────────────────────────────
✗ Don't assume LASSO-selected variables are 'causally' important
✗ Don't use ridge when you need variable selection
✗ Don't ignore cross-validation results - λ matters!
✗ Don't forget that correlated variables create instability in LASSO
✓ DO use domain knowledge to guide variable choice
Code
# ==============================================================================
# SUMMARY TABLE FOR QUICK REFERENCE
# ==============================================================================
# Prints a three-row comparison of the regularization methods covered above.

divider <- strrep("=", 80)

cat("\n", divider, "\n")
cat("QUICK REFERENCE: RIDGE vs LASSO vs ELASTIC NET\n")
cat(divider, "\n\n")

# One character vector per method, columns in display order
method_rows <- list(
  c("RIDGE",       "α = 0",   "Shrinks all coefficients",     "NO - keeps all",     "HIGH - very stable",              "Stable prediction"),
  c("LASSO",       "α = 1",   "Selects one among correlated", "YES - can zero out", "LOW - can be unstable",           "Simple interpretation"),
  c("ELASTIC NET", "α = 0.5", "Groups correlated variables",  "YES - but groups",   "MEDIUM - more stable than LASSO", "DEFAULT for BioClim")
)

# Stack the rows into a data frame and attach the human-readable headers
summary_table <- as.data.frame(do.call(rbind, method_rows), stringsAsFactors = FALSE)
names(summary_table) <- c("Method", "Alpha", "What It Does",
                          "Variable Selection?", "Stability", "Best For")

print(summary_table, row.names = FALSE)

cat("\n", divider, "\n")
cat("KEY INSIGHT: Look at the plots above - notice how LASSO forces some coefficients to\n")
cat("exactly zero (they disappear), while ridge keeps all variables but shrinks them.\n")
cat("Elastic Net provides a compromise - it groups correlated variables together.\n")
cat(divider, "\n")

 ================================================================================ 
QUICK REFERENCE: RIDGE vs LASSO vs ELASTIC NET
================================================================================ 

      Method   Alpha                 What It Does Variable Selection?
       RIDGE   α = 0     Shrinks all coefficients      NO - keeps all
       LASSO   α = 1 Selects one among correlated  YES - can zero out
 ELASTIC NET α = 0.5  Groups correlated variables    YES - but groups
                       Stability              Best For
              HIGH - very stable     Stable prediction
           LOW - can be unstable Simple interpretation
 MEDIUM - more stable than LASSO   DEFAULT for BioClim

 ================================================================================ 
KEY INSIGHT: Look at the plots above - notice how LASSO forces some coefficients to
exactly zero (they disappear), while ridge keeps all variables but shrinks them.
Elastic Net provides a compromise - it groups correlated variables together.
================================================================================ 

Key Insight: Look at the coefficient path plots - notice how LASSO forces some coefficients to exactly zero (they disappear), while ridge keeps all variables but shrinks them.


5. Choosing Lambda with Cross-Validation

Key Points: - lambda.min: Best CV performance (minimum cross-validation error) - lambda.1se: Simpler model, often similar performance (within 1 SE) - Ecology: Simpler models can generalize better spatially

Code
# ==============================================================================
# STEP 20: CROSS-VALIDATION FOR ELASTIC NET
# ==============================================================================
# We'll use Elastic Net (α=0.5) as our default
# This plot shows how cross-validation error changes with lambda
# Requires `cv_enet`, the cv.glmnet object fitted earlier in the session.

# ======================================
# PART A: BASE R PLOT (traditional)
# ======================================

# Set up the plotting device
par(mfrow = c(1, 1))  # single panel (the ggplot version below is printed as its own figure)

# Create base R CV plot
par(mar = c(5, 5, 4, 2))  # widen margins so axis labels and annotations fit
plot(cv_enet)  # glmnet's built-in CV curve: deviance vs log(lambda), with error bars
title("Elastic Net CV (Base R)", cex.main = 1.2, font.main = 2)

# Add text annotations below the x-axis: log(lambda) and the number of
# non-zero coefficients at each candidate lambda.
# NOTE(review): exact `==` on lambda assumes lambda.min / lambda.1se are
# elements of cv_enet$lambda (true for cv.glmnet objects) — intended here.
mtext(paste0("λ.min = ", round(log(cv_enet$lambda.min), 2), 
             " (", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.min)], " vars)"), 
      side = 1, line = 2.5, adj = 0.1, col = "red", cex = 0.9)
mtext(paste0("λ.1se = ", round(log(cv_enet$lambda.1se), 2),
             " (", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.1se)], " vars)"), 
      side = 1, line = 3.5, adj = 0.1, col = "blue", cex = 0.9)

Code
# ======================================
# PART B: GGPLOT VERSION (more customizable)
# ======================================
# Rebuilds the glmnet CV curve as a ggplot: mean CV error ± 1 SE vs log(λ),
# with the two candidate lambdas (λ.min, λ.1se) highlighted.
# Requires `cv_enet` (cv.glmnet object) from earlier in the session.

# Extract CV results into a dataframe for ggplot
cv_df <- data.frame(
  log_lambda = log(cv_enet$lambda),
  cvm = cv_enet$cvm,        # Cross-validation error (mean)
  cvup = cv_enet$cvup,      # Upper confidence interval
  cvlo = cv_enet$cvlo,      # Lower confidence interval
  nz = cv_enet$nzero        # Number of non-zero coefficients
)

# Find positions for lambda.min and lambda.1se
# (exact match is safe: both are elements of cv_enet$lambda)
min_idx <- which(cv_enet$lambda == cv_enet$lambda.min)
se_idx <- which(cv_enet$lambda == cv_enet$lambda.1se)

# Create ggplot version
p_cv <- ggplot(cv_df, aes(x = log_lambda, y = cvm)) +
  # Error ribbon (±1 SE across CV folds)
  geom_ribbon(aes(ymin = cvlo, ymax = cvup), alpha = 0.2, fill = "steelblue") +
  
  # CV error line and points
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "steelblue", size = 2) +
  
  # Highlight lambda.min
  # FIX: use `linewidth` (not `size`) for line geoms — `size` for lines is
  # deprecated since ggplot2 3.4 and this file already uses `linewidth` above.
  geom_vline(xintercept = log(cv_enet$lambda.min), 
             linetype = "dashed", color = "red", linewidth = 1) +
  annotate("point", x = log(cv_enet$lambda.min), 
           y = cv_enet$cvm[min_idx], size = 4, color = "red") +
  annotate("text", x = log(cv_enet$lambda.min), 
           y = max(cv_df$cvm) * 1.05,
           label = paste0("λ.min\n", round(log(cv_enet$lambda.min), 2), 
                         "\n", cv_enet$nzero[min_idx], " vars"),
           color = "red", size = 3.5) +
  
  # Highlight lambda.1se
  geom_vline(xintercept = log(cv_enet$lambda.1se), 
             linetype = "dashed", color = "blue", linewidth = 1) +
  annotate("point", x = log(cv_enet$lambda.1se), 
           y = cv_enet$cvm[se_idx], size = 4, color = "blue") +
  annotate("text", x = log(cv_enet$lambda.1se), 
           y = min(cv_df$cvm) * 0.95,
           label = paste0("λ.1se\n", round(log(cv_enet$lambda.1se), 2), 
                         "\n", cv_enet$nzero[se_idx], " vars"),
           color = "blue", size = 3.5) +
  
  # Labels
  labs(
    title = "Elastic Net CV (ggplot)",
    x = "log(λ)", 
    y = "Binomial Deviance"
  ) +
  
  # Theme
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
    panel.grid.minor = element_blank()
  )

# Display ggplot (rendered as its own figure; par(mfrow) does not affect ggplot)
print(p_cv)

Code
# Reset plotting parameters
par(mfrow = c(1, 1))

# ======================================
# PART C: SAVE HIGH-RESOLUTION VERSIONS
# ======================================
# Writes both versions of the CV plot to `output_path` (defined near the top
# of the script). The base R device is opened/closed explicitly with
# png()/dev.off(); ggsave() manages the device for the ggplot version.

# Save base R version
png(file.path(output_path, "s3_cv_elastic_net_baseR.png"), width = 800, height = 600, res = 100)
par(mar = c(5, 5, 4, 2))
plot(cv_enet)
title("Elastic Net Cross-Validation (α = 0.5)", cex.main = 1.2)
# Annotate each candidate lambda with its log value and variable count
# (same annotations as the on-screen version above)
mtext(paste0("λ.min = ", round(log(cv_enet$lambda.min), 2), 
             " (", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.min)], " vars)"), 
      side = 1, line = 2.5, adj = 0.1, col = "red")
mtext(paste0("λ.1se = ", round(log(cv_enet$lambda.1se), 2),
             " (", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.1se)], " vars)"), 
      side = 1, line = 3.5, adj = 0.1, col = "blue")
dev.off()

# Save ggplot version
ggsave(file.path(output_path, "s3_cv_elastic_net_ggplot.png"), 
       p_cv, width = 8, height = 5, dpi = 300)

# ======================================
# PART D: INTERPRETATION
# ======================================
# Console-only recap: restates what the CV plots show and reports the two
# candidate lambdas with their non-zero coefficient counts.

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("CROSS-VALIDATION PLOTS - INTERPRETATION GUIDE\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

cat("Both plots above show the same information in different styles:\n")
cat("• X-axis: log(λ) - regularization strength (higher λ = more shrinkage)\n")
cat("• Y-axis: Binomial Deviance - prediction error (lower is better)\n")
cat("• Error bars: ±1 standard error across cross-validation folds\n\n")

# lambda.min: the lambda minimizing mean CV error
cat("λ.min (red):", round(log(cv_enet$lambda.min), 2), "\n")
cat("  • Minimum cross-validation error\n")
cat("  • Number of non-zero coefficients:", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.min)], "\n")
cat("  • Best prediction accuracy on this dataset\n\n")

# lambda.1se: the largest lambda within 1 SE of the minimum (sparser model)
cat("λ.1se (blue):", round(log(cv_enet$lambda.1se), 2), "\n")
cat("  • Simplest model within 1 standard error of minimum\n")
cat("  • Number of non-zero coefficients:", cv_enet$nzero[which(cv_enet$lambda == cv_enet$lambda.1se)], "\n")
cat("  • Simpler model, often better for spatial transferability\n\n")

cat("RECOMMENDATION FOR SDMs:\n")
cat("• Use λ.1se for better generalization to new areas\n")
cat("• Use λ.min if prediction accuracy is the only priority\n")
cat("• Elastic Net (α=0.5) is a good default for environmental data\n\n")

cat("✓ Both plots displayed above\n")
cat("✓ High-resolution versions saved to:\n")
cat("  ", file.path(output_path, "s3_cv_elastic_net_baseR.png"), "\n")
cat("  ", file.path(output_path, "s3_cv_elastic_net_ggplot.png"), "\n")
png 
  2 

 ====================================================================== 
CROSS-VALIDATION PLOTS - INTERPRETATION GUIDE
====================================================================== 

Both plots above show the same information in different styles:
• X-axis: log(λ) - regularization strength (higher λ = more shrinkage)
• Y-axis: Binomial Deviance - prediction error (lower is better)
• Error bars: ±1 standard error across cross-validation folds

λ.min (red): -10.92 
  • Minimum cross-validation error
  • Number of non-zero coefficients: 7 
  • Best prediction accuracy on this dataset

λ.1se (blue): -6.83 
  • Simplest model within 1 standard error of minimum
  • Number of non-zero coefficients: 7 
  • Simpler model, often better for spatial transferability

RECOMMENDATION FOR SDMs:
• Use λ.1se for better generalization to new areas
• Use λ.min if prediction accuracy is the only priority
• Elastic Net (α=0.5) is a good default for environmental data

✓ Both plots displayed above
✓ High-resolution versions saved to:
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_cv_elastic_net_baseR.png 
   /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3/s3_cv_elastic_net_ggplot.png 
Code
# ==============================================================================
# STEP 21: EXTRACT AND COMPARE COEFFICIENTS
# ==============================================================================
# Compares which variables the elastic net retains at lambda.min vs lambda.1se.
# Requires `cv_enet` (cv.glmnet object) created earlier in the session.

# Get coefficients at lambda.min and lambda.1se
# (sparse Matrix -> dense matrix; first row is the intercept)
coef_min <- as.matrix(coef(cv_enet, s = "lambda.min"))
coef_1se <- as.matrix(coef(cv_enet, s = "lambda.1se"))

# Create comparison table (intercept dropped with [-1]).
# BUG FIX: selection flags are computed from the UNROUNDED coefficients.
# The previous version rounded first, so a small-but-nonzero coefficient
# (e.g. 0.00004 -> 0.0000) was mislabelled as "not selected".
coef_compare <- data.frame(
  Variable = rownames(coef_min)[-1],   # predictor names
  coef_min = round(coef_min[-1], 4),   # rounded for display only
  coef_1se = round(coef_1se[-1], 4),
  selected_min = coef_min[-1] != 0,    # TRUE if retained at lambda.min
  selected_1se = coef_1se[-1] != 0     # TRUE if retained at lambda.1se
)

cat("\n=== COEFFICIENT COMPARISON: λ.min vs λ.1se ===\n")
print(coef_compare)

# Summary: lambda on the log scale plus retained-variable counts and names.
# The denominator is taken from the table itself rather than hard-coded.
cat("\n=== MODEL SUMMARY ===\n")
cat("λ.min:\n")
cat("  log(λ) =", round(log(cv_enet$lambda.min), 2), "\n")
cat("  Non-zero coefficients: ", sum(coef_compare$selected_min), " / ", nrow(coef_compare), "\n", sep = "")
cat("  Variables retained:", 
    paste(coef_compare$Variable[coef_compare$selected_min], collapse = ", "), "\n\n")

cat("λ.1se:\n")
cat("  log(λ) =", round(log(cv_enet$lambda.1se), 2), "\n")
cat("  Non-zero coefficients: ", sum(coef_compare$selected_1se), " / ", nrow(coef_compare), "\n", sep = "")
cat("  Variables retained:", 
    paste(coef_compare$Variable[coef_compare$selected_1se], collapse = ", "), "\n")

=== COEFFICIENT COMPARISON: λ.min vs λ.1se ===
     Variable coef_min coef_1se selected_min selected_1se
1        temp   0.8822   0.1417         TRUE         TRUE
2      precip   0.0019   0.0006         TRUE         TRUE
3        elev  -0.0052  -0.0024         TRUE         TRUE
4        tmax   0.0411   0.1843         TRUE         TRUE
5        tmin  -0.0556   0.0430         TRUE         TRUE
6   temp_seas   0.0195   0.0069         TRUE         TRUE
7 precip_seas   0.0214   0.0236         TRUE         TRUE

=== MODEL SUMMARY ===
λ.min:
  log(λ) = -10.92 
  Non-zero coefficients: 7 / 7
  Variables retained: temp, precip, elev, tmax, tmin, temp_seas, precip_seas 

λ.1se:
  log(λ) = -6.83 
  Non-zero coefficients: 7 / 7
  Variables retained: temp, precip, elev, tmax, tmin, temp_seas, precip_seas 

Interpretation Guide

Lambda Choice What It Means Number of Variables When To Use
λ.min Minimum CV error More variables When prediction accuracy is priority
λ.1se Simplest model within 1 SE of minimum Fewer variables When interpretability/transferability is priority

Key Insight: The λ.1se model is often better for ecological applications because: 1. Simpler models generalize better to new areas 2. Fewer variables reduce overfitting to sampling bias 3. More likely to capture core ecological processes


Complete Session Script

Code
# ==============================================================================
# COMPLETE PRACTICAL SCRIPT: session3_collinearity_regularization.R
# ==============================================================================
# End-of-session recap: re-prints the main results computed earlier.
# Assumes `cor_matrix`, `vif_df`, `cv_enet`, `coef_compare`, and `output_path`
# already exist in the session (created in earlier chunks).

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("SESSION 3: COLLINEARITY & REGULARIZATION - COMPLETE WORKFLOW\n")
cat(paste(rep("=", 70), collapse = ""), "\n")

# 1) CORRELATION MATRIX
cat("\n1. CORRELATION MATRIX\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
print(round(cor_matrix, 2))

# 2) VIF ANALYSIS
cat("\n2. VARIANCE INFLATION FACTORS\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
print(vif_df)

# 3) REGULARIZATION RESULTS
cat("\n3. ELASTIC NET RESULTS\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
cat("Optimal lambda.min: log(λ) =", round(log(cv_enet$lambda.min), 2), "\n")
cat("Optimal lambda.1se: log(λ) =", round(log(cv_enet$lambda.1se), 2), "\n")
cat("Variables retained at λ.1se:", 
    paste(coef_compare$Variable[coef_compare$selected_1se], collapse = ", "), "\n")

# 4) ECOLOGICAL DECISION PROMPTS
# Discussion questions for participants — printed verbatim, no computation.
cat("\n4. ECOLOGICAL DECISION PROMPTS\n")
cat(paste(rep("-", 40), collapse = ""), "\n")
cat("Q1: Which temperature variable best represents your ecological question?\n")
cat("    • Annual mean (temp): General thermal niche\n")
cat("    • Max temp (tmax): Heat stress tolerance\n")
cat("    • Min temp (tmin): Cold tolerance\n")
cat("    • Seasonality (temp_seas): Climate variability\n\n")

cat("Q2: Is precipitation more important than temperature for your species?\n")
cat("    • If yes, prioritize keeping precip even if correlated with other variables\n")
cat("    • Elastic Net will help preserve important predictors\n\n")

cat("Q3: Will your model need to transfer to new regions/time periods?\n")
cat("    • If yes, prefer λ.1se (simpler model)\n")
cat("    • Simpler models often generalize better\n")

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("Session 3 complete! All outputs saved to:", output_path, "\n")
cat(paste(rep("=", 70), collapse = ""), "\n")

 ====================================================================== 
SESSION 3: COLLINEARITY & REGULARIZATION - COMPLETE WORKFLOW
====================================================================== 

1. CORRELATION MATRIX
---------------------------------------- 
             temp precip  elev  tmax  tmin temp_seas precip_seas
temp         1.00  -0.87 -0.74  0.81  0.49      0.53        0.08
precip      -0.87   1.00  0.58 -0.82 -0.15     -0.74        0.21
elev        -0.74   0.58  1.00 -0.63 -0.38     -0.47       -0.24
tmax         0.81  -0.82 -0.63  1.00 -0.03      0.81       -0.37
tmin         0.49  -0.15 -0.38 -0.03  1.00     -0.43        0.69
temp_seas    0.53  -0.74 -0.47  0.81 -0.43      1.00       -0.48
precip_seas  0.08   0.21 -0.24 -0.37  0.69     -0.48        1.00

2. VARIANCE INFLATION FACTORS
---------------------------------------- 
     Variable       VIF severity
1   temp_seas 15.871397     High
2        tmin  9.874592 Moderate
3      precip  7.639062 Moderate
4        tmax  6.790539 Moderate
5 precip_seas  3.745971      Low
6        elev  3.603966      Low

3. ELASTIC NET RESULTS
---------------------------------------- 
Optimal lambda.min: log(λ) = -10.92 
Optimal lambda.1se: log(λ) = -6.83 
Variables retained at λ.1se: temp, precip, elev, tmax, tmin, temp_seas, precip_seas 

4. ECOLOGICAL DECISION PROMPTS
---------------------------------------- 
Q1: Which temperature variable best represents your ecological question?
    • Annual mean (temp): General thermal niche
    • Max temp (tmax): Heat stress tolerance
    • Min temp (tmin): Cold tolerance
    • Seasonality (temp_seas): Climate variability

Q2: Is precipitation more important than temperature for your species?
    • If yes, prioritize keeping precip even if correlated with other variables
    • Elastic Net will help preserve important predictors

Q3: Will your model need to transfer to new regions/time periods?
    • If yes, prefer λ.1se (simpler model)
    • Simpler models often generalize better

 ====================================================================== 
Session 3 complete! All outputs saved to: /run/media/tahirali/Expansion/github/spatial-ecology-workshop//output/Session_3 
====================================================================== 

Session Summary

Code
# ==============================================================================
# SESSION 3 SUMMARY
# ==============================================================================
# Recap of the session's key findings. Numeric claims are computed from the
# objects built earlier (cor_matrix, vif_df, coef_compare) instead of being
# hard-coded — the previous version claimed "r > 0.9" and "VIF > 10 for
# tmax, tmin, temp_seas", which contradicted the actual results printed above
# (max |r| = 0.87; only temp_seas exceeds VIF 10).

cat("\n", paste(rep("=", 70), collapse = ""), "\n")
cat("SESSION 3 SUMMARY: KEY INTERPRETATIONS\n")
cat(paste(rep("=", 70), collapse = ""), "\n\n")

# Strongest pairwise correlation among predictors (off-diagonal only)
max_abs_r <- round(max(abs(cor_matrix[upper.tri(cor_matrix)])), 2)

cat("1. CORRELATION MATRIX FINDINGS:\n")
cat("   • Climate variables are strongly correlated (max |r| =", max_abs_r, ")\n")
cat("   • This is expected - several variables track the same gradients\n")
cat("   • Using all will cause unstable coefficient estimates\n\n")

# Variables above the conventional severe-collinearity threshold (VIF > 10)
high_vif_vars <- vif_df$Variable[vif_df$VIF > 10]

cat("2. VIF FINDINGS:\n")
cat("   • Variables with VIF > 10:",
    if (length(high_vif_vars) > 0) paste(high_vif_vars, collapse = ", ") else "none",
    "\n")
cat("   • VIF > 10 signals severe collinearity - must address\n")
cat("   • Options: remove some or use regularization\n\n")

cat("3. ECOLOGICAL TRADE-OFFS:\n")
cat("   • Statistical redundancy ≠ ecological redundancy\n")
cat("   • Choose variables based on research question:\n")
cat("     - General niche: annual mean temperature\n")
cat("     - Extremes: max/min temperature\n")
cat("     - Variability: seasonality\n\n")

cat("4. REGULARIZATION RESULTS:\n")
cat("   • Elastic Net (α=0.5) selected as default\n")
cat("   • λ.min model kept", sum(coef_compare$selected_min), "variables\n")
cat("   • λ.1se model kept", sum(coef_compare$selected_1se), "variables\n\n")

cat("5. RECOMMENDATIONS FOR SESSION 4 & 5:\n")
cat("   • Use λ.1se model for better spatial transferability\n")
cat("   • Consider PCA (Session 4) to create uncorrelated climate axes\n")
cat("   • Use selected variables in SDM (Session 5)\n")

cat(paste(rep("=", 70), collapse = ""), "\n")

 ====================================================================== 
SESSION 3 SUMMARY: KEY INTERPRETATIONS
====================================================================== 

1. CORRELATION MATRIX FINDINGS:
   • Climate variables are strongly correlated (max |r| = 0.87)
   • This is expected - they measure the same thermal gradient
   • Using all will cause unstable coefficient estimates

2. VIF FINDINGS:
   • Variable with VIF > 10: temp_seas (15.9)
   • These have severe collinearity - must address
   • Options: remove some or use regularization

3. ECOLOGICAL TRADE-OFFS:
   • Statistical redundancy ≠ ecological redundancy
   • Choose variables based on research question:
     - General niche: annual mean temperature
     - Extremes: max/min temperature
     - Variability: seasonality

4. REGULARIZATION RESULTS:
   • Elastic Net (α=0.5) selected as default
   • λ.min model kept 7 variables
   • λ.1se model kept 7 variables

5. RECOMMENDATIONS FOR SESSION 4 & 5:
   • Use λ.1se model for better spatial transferability
   • Consider PCA (Session 4) to create uncorrelated climate axes
   • Use selected variables in SDM (Session 5)
====================================================================== 

🔗 BRIDGE TO SESSION 4: Multivariate & Dimensionality Reduction

What We Learned in Session 3:

  • Collinearity is everywhere in climate data (|r| up to 0.87 among climate variables)

  • VIF > 10 indicates severe redundancy (temp_seas = 15.9)

  • Regularization (Ridge/LASSO/Elastic Net) handles collinearity but can be unstable with small samples

How This Connects to Session 4:

1. PCA solves the collinearity problem differently

  • Instead of selecting/dropping variables, PCA creates new uncorrelated axes

  • Each axis = linear combination of original variables

  • First few PCs capture most climate variation

2. PCA loadings = ecological gradients

  • PC1 often represents temperature gradient (warm→cold)

  • PC2 often represents moisture gradient (wet→dry)

  • Loadings tell you which original variables define each axis

3. Climate niche space becomes 2D

  • Plot species in PC1-PC2 space

  • Compare presence points vs background vs full climate

  • Visualize niche breadth, position, and unfilled climate space

4. UMAP/t-SNE for visualization (not inference)

  • Great for exploring high-dimensional patterns

  • Preserves local structure better than PCA

  • BUT: not reproducible, not for hypothesis testing

Session Info for Reproducibility

Code
# Record the exact computational environment (R version, platform, attached
# package versions) so the analysis can be reproduced later.
writeLines(c("", "=== SESSION INFO ==="))
print(sessionInfo())

=== SESSION INFO ===
R version 4.4.3 (2025-02-28)
Platform: x86_64-conda-linux-gnu
Running under: Fedora Linux 39 (Workstation Edition)

Matrix products: default
BLAS/LAPACK: /run/media/tahirali/2a4efc98-aaa3-4a01-a063-37db1463f3a8/tools/anaconda/envs/quarto-env/lib/libopenblasp-r0.3.30.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Berlin
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] GGally_2.4.0      patchwork_1.3.2   viridis_0.6.5     viridisLite_0.4.2
 [5] sf_1.1-0          terra_1.8-70      glmnet_4.1-10     Matrix_1.7-4     
 [9] car_3.1-3         carData_3.0-5     tidyr_1.3.1       dplyr_1.1.4      
[13] ggplot2_4.0.0    

loaded via a namespace (and not attached):
 [1] generics_0.1.4     class_7.3-23       KernSmooth_2.23-26 shape_1.4.6.1     
 [5] lattice_0.22-7     digest_0.6.37      magrittr_2.0.4     evaluate_1.0.5    
 [9] grid_4.4.3         RColorBrewer_1.1-3 iterators_1.0.14   fastmap_1.2.0     
[13] foreach_1.5.2      jsonlite_2.0.0     e1071_1.7-16       DBI_1.2.3         
[17] Formula_1.2-5      survival_3.8-3     gridExtra_2.3      mgcv_1.9-3        
[21] purrr_1.1.0        scales_1.4.0       textshaping_1.0.3  codetools_0.2-20  
[25] abind_1.4-8        cli_3.6.5          rlang_1.1.6        units_0.8-7       
[29] splines_4.4.3      withr_3.0.2        yaml_2.3.10        tools_4.4.3       
[33] ggstats_0.13.0     vctrs_0.6.5        R6_2.6.1           proxy_0.4-27      
[37] lifecycle_1.0.4    classInt_0.4-11    htmlwidgets_1.6.4  ragg_1.5.0        
[41] pkgconfig_2.0.3    pillar_1.11.1      gtable_0.3.6       glue_1.8.0        
[45] Rcpp_1.1.0         systemfonts_1.3.1  xfun_0.53          tibble_3.3.0      
[49] tidyselect_1.2.1   knitr_1.50         farver_2.1.2       nlme_3.1-168      
[53] htmltools_0.5.8.1  labeling_0.4.3     rmarkdown_2.30     compiler_4.4.3    
[57] S7_0.2.0