655 lines
24 KiB
Python
655 lines
24 KiB
Python
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.linear_model import Ridge
|
||
from sklearn.model_selection import GridSearchCV
|
||
from sklearn.preprocessing import StandardScaler
|
||
from sklearn.metrics import r2_score, mean_squared_error
|
||
import joblib
|
||
import os
|
||
from datetime import datetime
|
||
from dateutil.relativedelta import relativedelta
|
||
|
||
|
||
class RidgePredictor:
|
||
def __init__(self, base_model_path='models'):
|
||
"""
|
||
初始化预测器
|
||
base_model_path: 模型存储的基础路径
|
||
"""
|
||
self.base_model_path = base_model_path
|
||
self.scaler = StandardScaler()
|
||
os.makedirs(base_model_path, exist_ok=True)
|
||
|
||
def get_project_features(self, df, project):
|
||
"""获取项目对应的特征列名"""
|
||
try:
|
||
features = [col for col in df.columns if col.startswith(f'{project}_')]
|
||
if not features:
|
||
raise ValueError(f"No features found for project {project}")
|
||
return features
|
||
except Exception as e:
|
||
print(f"Error in get_project_features: {str(e)}")
|
||
raise
|
||
|
||
def print_ridge_formula(self, model, feature_names, scaler):
|
||
"""打印岭回归公式"""
|
||
try:
|
||
coefficients = model.coef_
|
||
intercept = model.intercept_
|
||
|
||
formula = "y = "
|
||
for i, (coef, name) in enumerate(zip(coefficients, feature_names)):
|
||
if i > 0:
|
||
formula += " + " if coef >= 0 else " - "
|
||
else:
|
||
formula += "-" if coef < 0 else ""
|
||
formula += f"{abs(coef):.4f} * {name}"
|
||
|
||
formula += f" + {intercept:.4f}"
|
||
|
||
# 添加标准化说明
|
||
formula += "注:特征已进行标准化处理,使用以下参数:"
|
||
for i, name in enumerate(feature_names):
|
||
formula += f"{name}: mean={scaler.mean_[i]:.4f}, scale={scaler.scale_[i]:.4f}"
|
||
return formula
|
||
except Exception as e:
|
||
print(f"Error in print_ridge_formula: {str(e)}")
|
||
raise
|
||
|
||
def get_feature_average(self, df, area, project, year, month, n_months=3):
|
||
"""获取前n个月的特征平均值"""
|
||
try:
|
||
feature_cols = self.get_project_features(df, project)
|
||
|
||
# 计算前n个月的日期范围
|
||
dates = []
|
||
curr_year, curr_month = year, month
|
||
for _ in range(n_months):
|
||
curr_month -= 1
|
||
if curr_month == 0:
|
||
curr_month = 12
|
||
curr_year -= 1
|
||
dates.append((curr_year, curr_month))
|
||
|
||
# 获取特征值
|
||
features_list = []
|
||
for year, month in dates:
|
||
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
|
||
if sum(mask) > 0:
|
||
features_list.append(df[mask][feature_cols].values[0])
|
||
|
||
if not features_list:
|
||
raise ValueError(f"No feature data found for {area} in specified months")
|
||
|
||
return np.mean(features_list, axis=0)
|
||
except Exception as e:
|
||
print(f"Error in get_feature_average: {str(e)}")
|
||
raise
|
||
|
||
def train_and_evaluate(self, df, area, project, target_year, target_month):
|
||
"""训练模型并评估性能"""
|
||
try:
|
||
feature_cols = self.get_project_features(df, project)
|
||
|
||
# 准备训练数据
|
||
dates = []
|
||
curr_year, curr_month = target_year, target_month
|
||
for _ in range(12): # 使用前12个月的数据
|
||
curr_month -= 1
|
||
if curr_month == 0:
|
||
curr_month = 12
|
||
curr_year -= 1
|
||
dates.append((curr_year, curr_month))
|
||
|
||
train_data = []
|
||
for year, month in dates:
|
||
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
|
||
if sum(mask) > 0:
|
||
train_data.append(df[mask])
|
||
|
||
if not train_data:
|
||
raise ValueError(f"No training data found for {area}")
|
||
|
||
train_df = pd.concat(train_data)
|
||
|
||
X = train_df[feature_cols].values
|
||
y = train_df[project].values
|
||
|
||
# 数据标准化
|
||
X_scaled = self.scaler.fit_transform(X)
|
||
|
||
# 网格搜索最佳参数
|
||
param_grid = {'alpha': np.logspace(-3, 3, 7)}
|
||
ridge = Ridge()
|
||
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
|
||
grid_search.fit(X_scaled, y)
|
||
|
||
# 获取最佳模型
|
||
best_model = grid_search.best_estimator_
|
||
|
||
# 计算评估指标
|
||
y_pred = best_model.predict(X_scaled)
|
||
r2 = r2_score(y, y_pred)
|
||
mse = mean_squared_error(y, y_pred)
|
||
|
||
# 保存模型和标准化器
|
||
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
|
||
os.makedirs(model_dir, exist_ok=True)
|
||
|
||
model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
|
||
scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")
|
||
|
||
joblib.dump(best_model, model_path)
|
||
joblib.dump(self.scaler, scaler_path)
|
||
|
||
# 生成岭回归公式
|
||
formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)
|
||
|
||
return best_model, self.scaler, r2, mse, formula
|
||
|
||
except Exception as e:
|
||
print(f"Error in train_and_evaluate: {str(e)}")
|
||
raise
|
||
|
||
def get_feature_average(self, df, area, project, year, month, n_months=3):
|
||
"""获取前n个月的特征平均值"""
|
||
try:
|
||
feature_cols = self.get_project_features(df, project)
|
||
|
||
# 计算前n个月的日期范围
|
||
dates = []
|
||
curr_year, curr_month = year, month
|
||
for _ in range(n_months):
|
||
curr_month -= 1
|
||
if curr_month == 0:
|
||
curr_month = 12
|
||
curr_year -= 1
|
||
dates.append((curr_year, curr_month))
|
||
|
||
# 获取特征值
|
||
features_list = []
|
||
for year, month in dates:
|
||
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
|
||
if sum(mask) > 0:
|
||
features_list.append(df[mask][feature_cols].values[0])
|
||
|
||
if not features_list:
|
||
raise ValueError(f"No feature data found for {area} in specified months")
|
||
|
||
return np.mean(features_list, axis=0)
|
||
except Exception as e:
|
||
print(f"Error in get_feature_average: {str(e)}")
|
||
raise
|
||
|
||
def train_and_evaluate(self, df, area, project, target_year, target_month):
|
||
"""训练模型并评估性能"""
|
||
try:
|
||
feature_cols = self.get_project_features(df, project)
|
||
|
||
# 准备训练数据
|
||
dates = []
|
||
curr_year, curr_month = target_year, target_month
|
||
for _ in range(12): # 使用前12个月的数据
|
||
curr_month -= 1
|
||
if curr_month == 0:
|
||
curr_month = 12
|
||
curr_year -= 1
|
||
dates.append((curr_year, curr_month))
|
||
|
||
train_data = []
|
||
for year, month in dates:
|
||
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
|
||
if sum(mask) > 0:
|
||
train_data.append(df[mask])
|
||
|
||
if not train_data:
|
||
raise ValueError(f"No training data found for {area}")
|
||
|
||
train_df = pd.concat(train_data)
|
||
|
||
X = train_df[feature_cols].values
|
||
y = train_df[project].values
|
||
|
||
# 数据标准化
|
||
X_scaled = self.scaler.fit_transform(X)
|
||
|
||
# 网格搜索最佳参数
|
||
param_grid = {'alpha': np.logspace(-3, 3, 7)}
|
||
ridge = Ridge()
|
||
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
|
||
grid_search.fit(X_scaled, y)
|
||
print(f"区域: {area}, 项目: {project} - 当前模型的最佳参数: {grid_search.best_params_}")
|
||
|
||
# 获取最佳模型
|
||
best_model = grid_search.best_estimator_
|
||
|
||
# 计算评估指标
|
||
y_pred = best_model.predict(X_scaled)
|
||
r2 = r2_score(y, y_pred)
|
||
mse = mean_squared_error(y, y_pred)
|
||
|
||
# 保存模型和标准化器
|
||
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
|
||
os.makedirs(model_dir, exist_ok=True)
|
||
|
||
model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
|
||
scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")
|
||
|
||
joblib.dump(best_model, model_path)
|
||
joblib.dump(self.scaler, scaler_path)
|
||
|
||
# 生成岭回归公式
|
||
formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)
|
||
|
||
return best_model, self.scaler, r2, mse, formula
|
||
|
||
except Exception as e:
|
||
print(f"Error in train_and_evaluate: {str(e)}")
|
||
raise
|
||
|
||
def predict_with_history(self, df, area, project, start_year=2023, start_month=1, end_month=6):
|
||
"""使用历史数据进行预测(2023.1-6)"""
|
||
try:
|
||
results = []
|
||
feature_importance = []
|
||
model_metrics = []
|
||
|
||
for month in range(start_month, end_month + 1):
|
||
print(f"Predicting {area} - {project} for {start_year}/{month}")
|
||
|
||
# 训练模型并评估
|
||
model, scaler, r2, mse, formula = self.train_and_evaluate(
|
||
df, area, project, start_year, month
|
||
)
|
||
|
||
# 获取预测用的特征(前3个月平均)
|
||
test_features = self.get_feature_average(
|
||
df, area, project, start_year, month
|
||
)
|
||
|
||
# 标准化特征并预测
|
||
test_features_scaled = scaler.transform(test_features.reshape(1, -1))
|
||
prediction = model.predict(test_features_scaled)[0]
|
||
|
||
# 获取特征重要性
|
||
feature_cols = self.get_project_features(df, project)
|
||
importance = abs(model.coef_)
|
||
|
||
# 修改特征重要性的存储格式
|
||
importance_dict = {
|
||
'area': area,
|
||
'year': start_year,
|
||
'month': month,
|
||
}
|
||
# 将特征重要性存储为 project_feature: value 的格式
|
||
for feat, imp in zip(feature_cols, importance):
|
||
importance_dict[f'{feat}'] = imp
|
||
|
||
# 查找是否已存在相同area/year/month的记录
|
||
existing_idx = next((
|
||
i for i, x in enumerate(feature_importance)
|
||
if x['area'] == area and x['year'] == start_year and x['month'] == month
|
||
), None)
|
||
|
||
if existing_idx is not None:
|
||
# 如果存在,更新现有记录
|
||
feature_importance[existing_idx].update(importance_dict)
|
||
else:
|
||
# 如果不存在,添加新记录
|
||
feature_importance.append(importance_dict)
|
||
|
||
results.append({
|
||
'area': area,
|
||
'project': project,
|
||
'year': start_year,
|
||
'month': month,
|
||
'prediction': prediction,
|
||
'actual': df[(df['area_name'] == area) &
|
||
(df['year'] == start_year) &
|
||
(df['month'] == month)][project].values[0]
|
||
})
|
||
|
||
model_metrics.append({
|
||
'area': area,
|
||
'project': project,
|
||
'year': start_year,
|
||
'month': month,
|
||
'r2': r2,
|
||
'mse': mse,
|
||
'formula': formula
|
||
})
|
||
|
||
return results, feature_importance, model_metrics
|
||
|
||
except Exception as e:
|
||
print(f"Error in predict_with_history: {str(e)}")
|
||
raise
|
||
def predict_future(self, df, area, project, start_month=7, end_month=9, year=2023):
|
||
"""预测未来月份(2023.7-9)"""
|
||
try:
|
||
results = []
|
||
|
||
# 加载6月的模型
|
||
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
|
||
model = joblib.load(os.path.join(model_dir, f"model_{year}_6.pkl"))
|
||
scaler = joblib.load(os.path.join(model_dir, f"scaler_{year}_6.pkl"))
|
||
|
||
for month in range(start_month, end_month + 1):
|
||
print(f"Predicting future {area} - {project} for {year}/{month}")
|
||
|
||
if month == 7:
|
||
# 使用4,5,6月的特征平均值
|
||
test_features = np.mean([
|
||
self.get_feature_average(df, area, project, year, m)
|
||
for m in [4, 5, 6]
|
||
], axis=0)
|
||
else:
|
||
# 使用前三个预测结果的特征
|
||
test_features = np.mean([
|
||
self.get_feature_average(df, area, project, year, m)
|
||
for m in range(month - 3, month)
|
||
], axis=0)
|
||
|
||
test_features_scaled = scaler.transform(test_features.reshape(1, -1))
|
||
prediction = model.predict(test_features_scaled)[0]
|
||
|
||
results.append({
|
||
'area': area,
|
||
'project': project,
|
||
'year': year,
|
||
'month': month,
|
||
'prediction': prediction
|
||
})
|
||
|
||
return results
|
||
|
||
except Exception as e:
|
||
print(f"Error in predict_future: {str(e)}")
|
||
raise
|
||
|
||
def format_all_predictions(all_results, all_future_results):
|
||
"""
|
||
将所有预测结果格式化为指定格式
|
||
"""
|
||
# 创建一个字典来存储所有结果
|
||
formatted_data = {}
|
||
|
||
# 合并所有结果
|
||
all_predictions = all_results + all_future_results
|
||
|
||
# 处理每个预测结果
|
||
for result in all_predictions:
|
||
key = (result['area'], result['year'], result['month'])
|
||
|
||
if key not in formatted_data:
|
||
formatted_data[key] = {
|
||
'area_name': result['area'],
|
||
'year': result['year'],
|
||
'month': result['month'],
|
||
'xxzs': 0,
|
||
'zfjc': 0,
|
||
'tsjb': 0,
|
||
'xzcf': 0,
|
||
'cckh': 0,
|
||
'cjjc': 0,
|
||
'zhxzb': 0
|
||
}
|
||
|
||
# 将预测值转换为百分比并保存
|
||
project = result['project']
|
||
formatted_data[key][project] = round(result['prediction'] * 100, 8)
|
||
|
||
# 转换为列表并排序
|
||
formatted_list = list(formatted_data.values())
|
||
formatted_list.sort(key=lambda x: (x['area_name'], x['year'], x['month']))
|
||
|
||
return formatted_list
|
||
|
||
def update_predictions(original_data_path, new_data_path):
|
||
"""
|
||
使用新数据更新预测
|
||
original_data_path: 原始数据文件路径
|
||
new_data_path: 新数据文件路径
|
||
"""
|
||
try:
|
||
# 读取原始数据和新数据
|
||
original_df = pd.read_excel(original_data_path)
|
||
new_df = pd.read_excel(new_data_path)
|
||
|
||
# 确保新数据格式与原始数据一致
|
||
required_cols = set(original_df.columns)
|
||
if not set(new_df.columns).issuperset(required_cols):
|
||
raise ValueError("New data missing required columns")
|
||
|
||
# 合并数据并去重(保留最新)
|
||
updated_df = pd.concat([original_df, new_df], ignore_index=True)
|
||
updated_df = updated_df.drop_duplicates(
|
||
subset=['area_name', 'year', 'month'],
|
||
keep='last'
|
||
).reset_index(drop=True)
|
||
|
||
# 初始化预测器
|
||
predictor = RidgePredictor()
|
||
|
||
# 存储所有结果
|
||
all_results = []
|
||
all_importance = []
|
||
all_metrics = []
|
||
all_future_results = []
|
||
|
||
# 获取所有地区和项目
|
||
areas = updated_df['area_name'].unique()
|
||
projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
|
||
|
||
# 获取最新的时间点
|
||
latest_date = pd.to_datetime(
|
||
updated_df['year'].astype(str) + '-' +
|
||
updated_df['month'].astype(str) + '-01'
|
||
).max()
|
||
|
||
# 对每个地区进行预测
|
||
for area in areas:
|
||
print(f"Updating predictions for {area}")
|
||
current_area_importance = None
|
||
|
||
for project in projects:
|
||
print(f"Processing project: {project}")
|
||
|
||
# 使用更新后的数据重新训练并预测
|
||
results, importance, metrics = predictor.predict_with_history(
|
||
updated_df,
|
||
area,
|
||
project,
|
||
start_year=latest_date.year,
|
||
start_month=latest_date.month,
|
||
end_month=latest_date.month
|
||
)
|
||
all_results.extend(results)
|
||
|
||
# 更新特征重要性
|
||
if current_area_importance is None:
|
||
current_area_importance = importance
|
||
else:
|
||
current_area_importance = [
|
||
dict(current, **new)
|
||
for current, new in zip(current_area_importance, importance)
|
||
]
|
||
|
||
all_metrics.extend(metrics)
|
||
|
||
# 预测未来3个月
|
||
future_results = predictor.predict_future(
|
||
updated_df,
|
||
area,
|
||
project,
|
||
start_month=latest_date.month + 1,
|
||
end_month=latest_date.month + 3,
|
||
year=latest_date.year
|
||
)
|
||
all_future_results.extend(future_results)
|
||
|
||
if current_area_importance:
|
||
all_importance.extend(current_area_importance)
|
||
|
||
# 保存更新后的结果
|
||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||
|
||
# 保存更新后的完整数据集
|
||
updated_df.to_excel(f'updated_dataset_{timestamp}.xlsx', index=False)
|
||
|
||
# 保存新的预测结果
|
||
pd.DataFrame(all_results).to_excel(
|
||
f'updated_predictions_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存新的特征重要性
|
||
importance_df = pd.DataFrame(all_importance)
|
||
cols = ['area', 'year', 'month']
|
||
feature_cols = [col for col in importance_df.columns if col not in cols]
|
||
importance_df = importance_df[cols + sorted(feature_cols)]
|
||
importance_df.to_excel(
|
||
f'updated_feature_importance_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存新的模型评估指标
|
||
pd.DataFrame(all_metrics).to_excel(
|
||
f'updated_model_metrics_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存新的未来预测结果
|
||
pd.DataFrame(all_future_results).to_excel(
|
||
f'updated_future_predictions_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
print("Update and prediction completed successfully!")
|
||
return updated_df, all_results, all_future_results
|
||
|
||
except Exception as e:
|
||
print(f"Error in update_predictions: {str(e)}")
|
||
raise
|
||
|
||
def main():
|
||
try:
|
||
# 读取数据
|
||
# df = pd.read_excel('test.xlsx')
|
||
df = pd.read_csv('H:\公司-文件\公司-维域-文件\buliang-main\buliang\src\main\resources\python\data.csv')
|
||
|
||
# 初始化预测器
|
||
predictor = RidgePredictor()
|
||
|
||
# 存储所有结果
|
||
all_results = []
|
||
all_importance = []
|
||
all_metrics = []
|
||
all_future_results = []
|
||
|
||
# 获取所有地区和项目
|
||
areas = df['area_name'].unique()
|
||
projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
|
||
|
||
# 对每个地区进行预测
|
||
for area in areas:
|
||
print(f"Processing{area}")
|
||
# 初始化当前地区的特征重要性列表
|
||
current_area_importance = None
|
||
|
||
# 对每个项目进行预测
|
||
for project in projects:
|
||
print(f"Processing project: {project}")
|
||
|
||
# 预测2023.1-6
|
||
results, importance, metrics = predictor.predict_with_history(
|
||
df, area, project
|
||
)
|
||
all_results.extend(results)
|
||
|
||
# 更新当前地区的特征重要性
|
||
if current_area_importance is None:
|
||
current_area_importance = importance
|
||
else:
|
||
# 合并特征重要性
|
||
current_area_importance = [
|
||
dict(current, **new)
|
||
for current, new in zip(current_area_importance, importance)
|
||
]
|
||
|
||
all_metrics.extend(metrics)
|
||
|
||
# 预测2023.7-9
|
||
future_results = predictor.predict_future(df, area, project)
|
||
all_future_results.extend(future_results)
|
||
|
||
# 将该地区的所有特征重要性添加到总结果中
|
||
if current_area_importance:
|
||
all_importance.extend(current_area_importance)
|
||
|
||
# 保存结果
|
||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||
# 转换预测结果格式
|
||
def format_predictions(results_list):
|
||
formatted_data = {}
|
||
for result in results_list:
|
||
key = (result['area'], result['year'], result['month'])
|
||
if key not in formatted_data:
|
||
formatted_data[key] = {
|
||
'area_name': result['area'],
|
||
'year': result['year'],
|
||
'month': result['month'],
|
||
'xxzs': 0,
|
||
'zfjc': 0,
|
||
'tsjb': 0,
|
||
'xzcf': 0,
|
||
'cckh': 0,
|
||
'cjjc': 0,
|
||
'zhxzb': 0
|
||
}
|
||
formatted_data[key][result['project']] = round(result['prediction'] , 8)
|
||
return list(formatted_data.values())
|
||
|
||
# 合并并保存所有预测结果
|
||
all_predictions = format_predictions(all_results + all_future_results)
|
||
predictions_df = pd.DataFrame(all_predictions)
|
||
predictions_df = predictions_df[['area_name', 'year', 'month',
|
||
'xxzs', 'zfjc', 'tsjb', 'xzcf',
|
||
'cckh', 'cjjc', 'zhxzb']]
|
||
predictions_df.to_excel(f'all_predictions_{timestamp}.xlsx', index=False)
|
||
|
||
|
||
# 保存1-6月预测结果
|
||
pd.DataFrame(all_results).to_excel(
|
||
f'predictions_1_6_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存特征重要性
|
||
importance_df = pd.DataFrame(all_importance)
|
||
# 确保列的顺序
|
||
cols = ['area', 'year', 'month']
|
||
feature_cols = [col for col in importance_df.columns if col not in cols]
|
||
importance_df = importance_df[cols + sorted(feature_cols)]
|
||
importance_df.to_excel(
|
||
f'feature_importance_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存模型评估指标
|
||
pd.DataFrame(all_metrics).to_excel(
|
||
f'model_metrics_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
# 保存7-9月预测结果
|
||
pd.DataFrame(all_future_results).to_excel(
|
||
f'predictions_7_9_{timestamp}.xlsx', index=False
|
||
)
|
||
|
||
print("Prediction completed successfully!")
|
||
|
||
# 当有新数据时,可以运行更新预测
|
||
# update_predictions('test.xlsx', 'new_data.xlsx')
|
||
except Exception as e:
|
||
print(f"Error in main function: {str(e)}")
|
||
raise
|
||
|
||
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main() |