This notebook presents a probabilistic approach to forecasting Germany’s hourly solar power generation for June 2025. It retains the most effective methods from the first modelling attempt on this data set: a persistence model as the baseline and quantile regression with LightGBM as the main model. The forecast is written to two files:
forecast_q1.csv for a deterministic forecast
forecast_q1_probabilistic.csv for a probabilistic forecast
2. Setup and Data Loading
Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Tuple, List, Optional
import warnings
warnings.filterwarnings("ignore")

# ML Models
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Set style and seeds
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100
np.random.seed(42)

# Load data
print("Loading data...")
solar_obs = pd.read_csv("../data/germany_solar_observation_q1.csv", parse_dates=['DateTime'])
atm_features = pd.read_csv("../data/germany_atm_features_q1.csv", parse_dates=['DateTime'])

# Merge and index by DateTime
data = pd.merge(solar_obs, atm_features, on="DateTime", how="inner")
data = data.set_index('DateTime')
data = data.sort_index()

print(f"Dataset shape: {data.shape}")
print(f"Date range: {data.index.min()} to {data.index.max()}")
print(f"Missing values: {data.isnull().sum().sum()}")
Loading data...
Dataset shape: (29928, 11)
Date range: 2022-01-01 00:00:00+00:00 to 2025-05-31 23:00:00+00:00
Missing values: 0
3. Feature Engineering
We implement physically motivated solar-position calculations (declination, hour angle, elevation, zenith angle, and air mass) alongside cyclical time encodings and weather interaction features.
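For reference, the solar geometry implemented in calculate_solar_position below uses the standard approximations for declination, hour angle, and elevation (day of year $n$, hour $t$, latitude $\phi = 51.5^{\circ}$N):

$$\delta = 23.45^{\circ}\,\sin\!\left(\frac{360^{\circ}}{365}\,(284 + n)\right), \qquad h = 15^{\circ}\,(t - 12),$$
$$\sin\alpha = \sin\delta\,\sin\phi + \cos\delta\,\cos\phi\,\cos h,$$

where $\alpha$ is the solar elevation (clipped at $0^{\circ}$ below the horizon) and the zenith angle is $90^{\circ} - \alpha$.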
Code
def calculate_solar_position(df: pd.DataFrame, latitude: float = 51.5) -> pd.DataFrame:
    """
    Calculate accurate solar position features.
    Germany's approximate latitude: 51.5°N
    """
    df = df.copy()

    # Basic temporal features
    df['hour'] = df.index.hour
    df['day_of_year'] = df.index.dayofyear
    df['month'] = df.index.month
    df['is_weekend'] = df.index.dayofweek.isin([5, 6]).astype(int)

    # Solar declination angle (simplified but more accurate)
    df['declination'] = 23.45 * np.sin(np.radians(360 * (284 + df['day_of_year']) / 365))

    # Hour angle (degrees from solar noon)
    df['hour_angle'] = 15 * (df['hour'] - 12)

    # Solar elevation angle (in radians first)
    elevation_rad = np.arcsin(
        np.sin(np.radians(df['declination'])) * np.sin(np.radians(latitude))
        + np.cos(np.radians(df['declination'])) * np.cos(np.radians(latitude))
        * np.cos(np.radians(df['hour_angle']))
    )

    # Convert to degrees and clip negative values
    df['solar_elevation'] = np.degrees(elevation_rad)
    df['solar_elevation'] = np.maximum(0, df['solar_elevation'])

    # Solar zenith angle (complement of elevation)
    df['solar_zenith'] = 90 - df['solar_elevation']

    # Air mass (simplified Kasten-Young formula)
    # Only calculate when the sun is above the horizon
    df['air_mass'] = np.where(
        df['solar_elevation'] > 0,
        1 / (np.sin(np.radians(df['solar_elevation']))
             + 0.50572 * (df['solar_elevation'] + 6.07995) ** -1.6364),
        0
    )
    # Cap air mass at a reasonable value
    df['air_mass'] = np.minimum(df['air_mass'], 40)

    return df


def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create all features including solar position and weather interactions.
    """
    # Add solar position features
    df = calculate_solar_position(df)

    # Cyclical encoding for temporal features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Weather-solar interactions
    df['clear_sky_index'] = df['surface_solar_radiation_downwards'] * (1 - df['total_cloud_cover'])
    df['cloud_impact'] = df['total_cloud_cover'] * df['solar_elevation']

    # Temperature efficiency (solar panels are less efficient at high temperatures)
    # Optimal around 25°C, decreasing efficiency beyond
    df['temp_efficiency'] = np.exp(-((df['temperature_2m'] - 25) / 20) ** 2)

    # Wind cooling effect (helps panel efficiency)
    df['wind_cooling'] = np.log1p(df['wind_speed_100m']) * df['temperature_2m']

    # Is daylight (based on solar elevation)
    df['is_daylight'] = (df['solar_elevation'] > 0).astype(int)

    # Potential clear sky radiation (theoretical maximum, simplified)
    df['theoretical_max'] = df['solar_elevation'] * 1000 / 90

    return df


# Apply feature engineering
print("Creating features...")
data = create_features(data)

# Display feature statistics
feature_cols = [col for col in data.columns if col != 'power']
print(f"Total features created: {len(feature_cols)}")
print("\nSample of created features:")
print(data[['solar_elevation', 'air_mass', 'clear_sky_index', 'temp_efficiency']].describe())
Creating features...
Total features created: 29
Sample of created features:
solar_elevation air_mass clear_sky_index temp_efficiency
count 29928.000000 29928.000000 29928.000000 29928.000000
mean 12.519540 2.385316 51.411227 0.578002
std 17.154595 4.876787 106.776787 0.260896
min 0.000000 0.000000 0.000000 0.053851
25% 0.000000 0.000000 0.000000 0.353674
50% 0.000000 0.000000 0.730650 0.561030
75% 21.948322 2.629839 47.845200 0.813739
max 61.949783 37.744706 823.994200 1.000000
4. Data Splitting Strategy
We use a chronological train/validation split, holding out June 2025 as the unseen test period, to avoid data leakage; the feature scaler is fitted on the training data only.
Code
def create_data_splits(data: pd.DataFrame, val_start: str = '2025-01-01',
                       test_start: str = '2025-06-01') -> Dict:
    """
    Create proper train/validation/test splits for time series data.
    """
    # Define feature columns (exclude target)
    feature_cols = [
        'surface_solar_radiation_downwards', 'temperature_2m', 'total_cloud_cover',
        'relative_humidity_2m', 'apparent_temperature', 'wind_speed_100m',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'solar_elevation', 'solar_zenith', 'air_mass',
        'clear_sky_index', 'cloud_impact', 'temp_efficiency', 'wind_cooling',
        'is_daylight', 'theoretical_max'
    ]
    # Filter to available features
    feature_cols = [col for col in feature_cols if col in data.columns]
    target_col = 'power'

    # Split data chronologically
    train_data = data[data.index < val_start].copy()
    val_data = data[(data.index >= val_start) & (data.index < test_start)].copy()

    # For early stopping within training data (last 20% of training)
    early_stop_start = train_data.index[-int(len(train_data) * 0.2)]
    train_early = train_data[train_data.index < early_stop_start].copy()
    val_early = train_data[train_data.index >= early_stop_start].copy()

    splits = {
        'X_train': train_data[feature_cols],
        'y_train': train_data[target_col],
        'X_val': val_data[feature_cols],
        'y_val': val_data[target_col],
        'X_train_early': train_early[feature_cols],
        'y_train_early': train_early[target_col],
        'X_val_early': val_early[feature_cols],
        'y_val_early': val_early[target_col],
        'feature_cols': feature_cols,
        'train_data': train_data,
        'val_data': val_data
    }
    return splits


# Create splits
splits = create_data_splits(data)

print("Data split summary:")
print(f"Training set: {len(splits['X_train'])} samples ({splits['X_train'].index.min()} to {splits['X_train'].index.max()})")
print(f"Validation set: {len(splits['X_val'])} samples ({splits['X_val'].index.min()} to {splits['X_val'].index.max()})")
print(f"Early stopping set: {len(splits['X_val_early'])} samples")

# Initialize and fit scaler on training data only
scaler = StandardScaler()
scaler.fit(splits['X_train'])  # Fit only once on training data

# Transform all sets
X_train_scaled = scaler.transform(splits['X_train'])
X_val_scaled = scaler.transform(splits['X_val'])
X_train_early_scaled = scaler.transform(splits['X_train_early'])
X_val_early_scaled = scaler.transform(splits['X_val_early'])
Data split summary:
Training set: 26304 samples (2022-01-01 00:00:00+00:00 to 2024-12-31 23:00:00+00:00)
Validation set: 3624 samples (2025-01-01 00:00:00+00:00 to 2025-05-31 23:00:00+00:00)
Early stopping set: 5260 samples
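As an illustrative sanity check (a sketch assuming the splits and scaler objects created in the cell above), the chronology of the splits and the origin of the scaler statistics can be verified directly:

# Sketch of a leakage check, assuming `splits` and `scaler` from the cell above
assert splits['X_train'].index.max() < splits['X_val'].index.min()              # training ends before validation starts
assert splits['X_train_early'].index.max() < splits['X_val_early'].index.min()  # early-stopping split is chronological too
# StandardScaler statistics should come from the training window only
assert np.allclose(scaler.mean_, splits['X_train'].mean().to_numpy())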
5. Baseline Model: Persistence
Code
class PersistenceModel:
    """
    Enhanced persistence model using weighted historical patterns
    with consideration for recent trends.
    """

    def __init__(self, decay_factor: float = 365):
        self.decay_factor = decay_factor
        self.patterns = {}
        self.monthly_trends = {}

    def fit(self, data: pd.DataFrame, target_col: str = 'power'):
        """Fit the model using exponentially weighted historical averages."""
        # Calculate days from start for weighting
        days_from_start = (data.index - data.index.min()).days
        weights = np.exp(days_from_start / self.decay_factor)
        # Normalize weights to sum to the length of the data (for numerical stability)
        weights = weights * len(weights) / weights.to_numpy().sum()

        # Calculate weighted patterns for each (month, hour) combination
        data_with_weights = data.copy()
        data_with_weights['weight'] = weights
        data_with_weights['weighted_power'] = data_with_weights[target_col] * weights

        # Group by month and hour
        grouped = data_with_weights.groupby(
            [data_with_weights.index.month, data_with_weights.index.hour]
        )
        self.patterns = (grouped['weighted_power'].sum() / grouped['weight'].sum()).to_dict()

        # Calculate monthly trends (for adjustment)
        monthly_avg = data_with_weights.groupby(data_with_weights.index.month)[target_col].mean()
        self.monthly_trends = monthly_avg.to_dict()
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Generate predictions based on historical patterns."""
        predictions = []
        for timestamp in X.index:
            month = timestamp.month
            hour = timestamp.hour
            # Get base prediction from patterns
            base_pred = self.patterns.get((month, hour), 0)
            # Apply slight random variation (±5%) to avoid identical predictions
            # This simulates natural variability
            variation = np.random.normal(1.0, 0.05)
            pred = base_pred * variation
            predictions.append(max(0, pred))  # Ensure non-negative
        return np.array(predictions)


# Train baseline model
baseline_model = PersistenceModel(decay_factor=365)
baseline_model.fit(splits['train_data'], 'power')

# Generate predictions
y_pred_baseline = baseline_model.predict(splits['X_val'])

# Calculate metrics
baseline_rmse = np.sqrt(mean_squared_error(splits['y_val'], y_pred_baseline))
baseline_mae = mean_absolute_error(splits['y_val'], y_pred_baseline)
baseline_r2 = r2_score(splits['y_val'], y_pred_baseline)

print("Baseline Model Performance:")
print(f"RMSE: {baseline_rmse:.2f} MWh")
print(f"MAE: {baseline_mae:.2f} MWh")
print(f"R²: {baseline_r2:.4f}")
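The feature-importance analysis that follows, and the final forecast in section 10, call a QuantileLightGBM wrapper (quantile_model / final_model). As a minimal sketch of the interface that code relies on, assuming one LGBMRegressor per quantile with objective='quantile' (the class body and hyperparameters below are illustrative placeholders, not the notebook's actual implementation):

# Minimal sketch of the QuantileLightGBM interface used below; the actual implementation
# (hyperparameters, early stopping, post-processing) may differ.
class QuantileLightGBM:
    def __init__(self, quantiles=(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), **lgb_params):
        self.quantiles = list(quantiles)
        self.lgb_params = lgb_params or {'n_estimators': 500, 'learning_rate': 0.05}  # placeholder values
        self.models = {}  # one fitted LGBMRegressor per quantile, keyed by level (e.g. self.models[0.5])

    def fit(self, X, y):
        for q in self.quantiles:
            print(f"Training quantile {q:.2f}...", end=' ')
            model = lgb.LGBMRegressor(objective='quantile', alpha=q, **self.lgb_params)
            model.fit(X, y)
            self.models[q] = model
            print("Done")
        return self

    def predict(self, X):
        # Keys match those used later in the notebook: 'q5', 'q10', ..., 'q95'
        return {f"q{int(round(q * 100))}": self.models[q].predict(X) for q in self.quantiles}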
Code
def analyze_feature_importance(model: QuantileLightGBM, feature_names: List[str], top_n: int = 15):
    """
    Analyze and visualize feature importance across quantiles.
    """
    # Get importance for the median model
    median_model = model.models[0.5]
    importance = median_model.feature_importances_

    # Create importance dataframe
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False).head(top_n)

    # Visualize
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=importance_df, y='feature', x='importance', ax=ax)
    ax.set_title('Top 15 Most Important Features (Median Model)')
    ax.set_xlabel('Feature Importance')
    return importance_df


# Analyze feature importance
importance_df = analyze_feature_importance(quantile_model, splits['feature_cols'])
plt.show()

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))
[Figure: Feature Importance from Quantile Models]
Top 10 Most Important Features:
feature importance
wind_speed_100m 3164
relative_humidity_2m 2916
total_cloud_cover 2261
surface_solar_radiation_downwards 2032
wind_cooling 1921
apparent_temperature 1919
temperature_2m 1847
month_sin 1422
clear_sky_index 1374
month_cos 1226
10. Final Model Training and June 2025 Forecast
Code
print("Training final model on all available data...")# Combine all historical data for final trainingX_full = pd.concat([splits['X_train'], splits['X_val']])y_full = pd.concat([splits['y_train'], splits['y_val']])# Train final quantile modelfinal_model = QuantileLightGBM(quantiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])final_model.fit(X_full, y_full)print("\nPreparing June 2025 data...")# Load and prepare June 2025 datajune_features = atm_features[(atm_features['DateTime'] >='2025-06-01') & (atm_features['DateTime'] <'2025-07-01')].copy()june_features = june_features.set_index('DateTime')june_features = create_features(june_features)# Use the same features as trainingX_june = june_features[splits['feature_cols']]# Generate probabilistic predictionsprint("Generating probabilistic forecast for June 2025...")june_predictions = final_model.predict(X_june)# Create submission dataframe with probabilistic outputssubmission_df = pd.DataFrame({'DateTime': X_june.index,'power': june_predictions['q50'], # Median as point forecast'power_q5': june_predictions['q5'],'power_q10': june_predictions['q10'],'power_q25': june_predictions['q25'],'power_q75': june_predictions['q75'],'power_q90': june_predictions['q90'],'power_q95': june_predictions['q95']})# Save primary submission (median forecast)submission_df[['DateTime', 'power']].to_csv('forecast_q1.csv', index=False)# Save full probabilistic forecastsubmission_df.to_csv('forecast_q1_probabilistic.csv', index=False)print(f"\nForecast generated successfully!")print(f"Point forecast saved to: forecast_q1.csv")print(f"Probabilistic forecast saved to: forecast_q1_probabilistic.csv")print(f"\nForecast summary for June 2025:")print(f"Mean predicted power: {submission_df['power'].mean():.2f} MWh")print(f"Max predicted power: {submission_df['power'].max():.2f} MWh")print(f"Total predicted generation: {submission_df['power'].sum():.2f} MWh")
Training final model on all available data...
Training quantile 0.05... Done (best iteration: 0)
Training quantile 0.10... Done (best iteration: 0)
Training quantile 0.25... Done (best iteration: 0)
Training quantile 0.50... Done (best iteration: 0)
Training quantile 0.75... Done (best iteration: 0)
Training quantile 0.90... Done (best iteration: 0)
Training quantile 0.95... Done (best iteration: 0)
Preparing June 2025 data...
Generating probabilistic forecast for June 2025...
Forecast generated successfully!
Point forecast saved to: forecast_q1.csv
Probabilistic forecast saved to: forecast_q1_probabilistic.csv
Forecast summary for June 2025:
Mean predicted power: 11782.88 MWh
Max predicted power: 41810.84 MWh
Total predicted generation: 8483676.80 MWh
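As an optional sanity check on the saved probabilistic file (a sketch assuming the column names written above), the quantile columns can be inspected for negative values and for quantile crossings, which independently trained quantile models can occasionally produce:

# Sketch of a post-hoc check on forecast_q1_probabilistic.csv (column names assumed from the cell above)
check = pd.read_csv('forecast_q1_probabilistic.csv', parse_dates=['DateTime'])
q_cols = ['power_q5', 'power_q10', 'power_q25', 'power', 'power_q75', 'power_q90', 'power_q95']

neg_rows = (check[q_cols] < 0).any(axis=1).sum()
crossing_rows = (check[q_cols].diff(axis=1).iloc[:, 1:] < 0).any(axis=1).sum()
print(f"Rows with negative values:   {neg_rows} of {len(check)}")
print(f"Rows with quantile crossing: {crossing_rows} of {len(check)}")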