response/target/classes/python/PredictedScores.py
2025-07-17 10:41:24 +08:00

655 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
class RidgePredictor:
def __init__(self, base_model_path='models'):
"""
初始化预测器
base_model_path: 模型存储的基础路径
"""
self.base_model_path = base_model_path
self.scaler = StandardScaler()
os.makedirs(base_model_path, exist_ok=True)
def get_project_features(self, df, project):
"""获取项目对应的特征列名"""
try:
features = [col for col in df.columns if col.startswith(f'{project}_')]
if not features:
raise ValueError(f"No features found for project {project}")
return features
except Exception as e:
print(f"Error in get_project_features: {str(e)}")
raise
def print_ridge_formula(self, model, feature_names, scaler):
"""打印岭回归公式"""
try:
coefficients = model.coef_
intercept = model.intercept_
formula = "y = "
for i, (coef, name) in enumerate(zip(coefficients, feature_names)):
if i > 0:
formula += " + " if coef >= 0 else " - "
else:
formula += "-" if coef < 0 else ""
formula += f"{abs(coef):.4f} * {name}"
formula += f" + {intercept:.4f}"
# 添加标准化说明
formula += "注:特征已进行标准化处理,使用以下参数:"
for i, name in enumerate(feature_names):
formula += f"{name}: mean={scaler.mean_[i]:.4f}, scale={scaler.scale_[i]:.4f}"
return formula
except Exception as e:
print(f"Error in print_ridge_formula: {str(e)}")
raise
def get_feature_average(self, df, area, project, year, month, n_months=3):
"""获取前n个月的特征平均值"""
try:
feature_cols = self.get_project_features(df, project)
# 计算前n个月的日期范围
dates = []
curr_year, curr_month = year, month
for _ in range(n_months):
curr_month -= 1
if curr_month == 0:
curr_month = 12
curr_year -= 1
dates.append((curr_year, curr_month))
# 获取特征值
features_list = []
for year, month in dates:
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
if sum(mask) > 0:
features_list.append(df[mask][feature_cols].values[0])
if not features_list:
raise ValueError(f"No feature data found for {area} in specified months")
return np.mean(features_list, axis=0)
except Exception as e:
print(f"Error in get_feature_average: {str(e)}")
raise
def train_and_evaluate(self, df, area, project, target_year, target_month):
"""训练模型并评估性能"""
try:
feature_cols = self.get_project_features(df, project)
# 准备训练数据
dates = []
curr_year, curr_month = target_year, target_month
for _ in range(12): # 使用前12个月的数据
curr_month -= 1
if curr_month == 0:
curr_month = 12
curr_year -= 1
dates.append((curr_year, curr_month))
train_data = []
for year, month in dates:
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
if sum(mask) > 0:
train_data.append(df[mask])
if not train_data:
raise ValueError(f"No training data found for {area}")
train_df = pd.concat(train_data)
X = train_df[feature_cols].values
y = train_df[project].values
# 数据标准化
X_scaled = self.scaler.fit_transform(X)
# 网格搜索最佳参数
param_grid = {'alpha': np.logspace(-3, 3, 7)}
ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)
# 获取最佳模型
best_model = grid_search.best_estimator_
# 计算评估指标
y_pred = best_model.predict(X_scaled)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
# 保存模型和标准化器
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")
joblib.dump(best_model, model_path)
joblib.dump(self.scaler, scaler_path)
# 生成岭回归公式
formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)
return best_model, self.scaler, r2, mse, formula
except Exception as e:
print(f"Error in train_and_evaluate: {str(e)}")
raise
def get_feature_average(self, df, area, project, year, month, n_months=3):
"""获取前n个月的特征平均值"""
try:
feature_cols = self.get_project_features(df, project)
# 计算前n个月的日期范围
dates = []
curr_year, curr_month = year, month
for _ in range(n_months):
curr_month -= 1
if curr_month == 0:
curr_month = 12
curr_year -= 1
dates.append((curr_year, curr_month))
# 获取特征值
features_list = []
for year, month in dates:
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
if sum(mask) > 0:
features_list.append(df[mask][feature_cols].values[0])
if not features_list:
raise ValueError(f"No feature data found for {area} in specified months")
return np.mean(features_list, axis=0)
except Exception as e:
print(f"Error in get_feature_average: {str(e)}")
raise
def train_and_evaluate(self, df, area, project, target_year, target_month):
"""训练模型并评估性能"""
try:
feature_cols = self.get_project_features(df, project)
# 准备训练数据
dates = []
curr_year, curr_month = target_year, target_month
for _ in range(12): # 使用前12个月的数据
curr_month -= 1
if curr_month == 0:
curr_month = 12
curr_year -= 1
dates.append((curr_year, curr_month))
train_data = []
for year, month in dates:
mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
if sum(mask) > 0:
train_data.append(df[mask])
if not train_data:
raise ValueError(f"No training data found for {area}")
train_df = pd.concat(train_data)
X = train_df[feature_cols].values
y = train_df[project].values
# 数据标准化
X_scaled = self.scaler.fit_transform(X)
# 网格搜索最佳参数
param_grid = {'alpha': np.logspace(-3, 3, 7)}
ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)
print(f"区域: {area}, 项目: {project} - 当前模型的最佳参数: {grid_search.best_params_}")
# 获取最佳模型
best_model = grid_search.best_estimator_
# 计算评估指标
y_pred = best_model.predict(X_scaled)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
# 保存模型和标准化器
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")
joblib.dump(best_model, model_path)
joblib.dump(self.scaler, scaler_path)
# 生成岭回归公式
formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)
return best_model, self.scaler, r2, mse, formula
except Exception as e:
print(f"Error in train_and_evaluate: {str(e)}")
raise
def predict_with_history(self, df, area, project, start_year=2023, start_month=1, end_month=6):
"""使用历史数据进行预测2023.1-6"""
try:
results = []
feature_importance = []
model_metrics = []
for month in range(start_month, end_month + 1):
print(f"Predicting {area} - {project} for {start_year}/{month}")
# 训练模型并评估
model, scaler, r2, mse, formula = self.train_and_evaluate(
df, area, project, start_year, month
)
# 获取预测用的特征前3个月平均
test_features = self.get_feature_average(
df, area, project, start_year, month
)
# 标准化特征并预测
test_features_scaled = scaler.transform(test_features.reshape(1, -1))
prediction = model.predict(test_features_scaled)[0]
# 获取特征重要性
feature_cols = self.get_project_features(df, project)
importance = abs(model.coef_)
# 修改特征重要性的存储格式
importance_dict = {
'area': area,
'year': start_year,
'month': month,
}
# 将特征重要性存储为 project_feature: value 的格式
for feat, imp in zip(feature_cols, importance):
importance_dict[f'{feat}'] = imp
# 查找是否已存在相同area/year/month的记录
existing_idx = next((
i for i, x in enumerate(feature_importance)
if x['area'] == area and x['year'] == start_year and x['month'] == month
), None)
if existing_idx is not None:
# 如果存在,更新现有记录
feature_importance[existing_idx].update(importance_dict)
else:
# 如果不存在,添加新记录
feature_importance.append(importance_dict)
results.append({
'area': area,
'project': project,
'year': start_year,
'month': month,
'prediction': prediction,
'actual': df[(df['area_name'] == area) &
(df['year'] == start_year) &
(df['month'] == month)][project].values[0]
})
model_metrics.append({
'area': area,
'project': project,
'year': start_year,
'month': month,
'r2': r2,
'mse': mse,
'formula': formula
})
return results, feature_importance, model_metrics
except Exception as e:
print(f"Error in predict_with_history: {str(e)}")
raise
def predict_future(self, df, area, project, start_month=7, end_month=9, year=2023):
"""预测未来月份2023.7-9"""
try:
results = []
# 加载6月的模型
model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
model = joblib.load(os.path.join(model_dir, f"model_{year}_6.pkl"))
scaler = joblib.load(os.path.join(model_dir, f"scaler_{year}_6.pkl"))
for month in range(start_month, end_month + 1):
print(f"Predicting future {area} - {project} for {year}/{month}")
if month == 7:
# 使用4,5,6月的特征平均值
test_features = np.mean([
self.get_feature_average(df, area, project, year, m)
for m in [4, 5, 6]
], axis=0)
else:
# 使用前三个预测结果的特征
test_features = np.mean([
self.get_feature_average(df, area, project, year, m)
for m in range(month - 3, month)
], axis=0)
test_features_scaled = scaler.transform(test_features.reshape(1, -1))
prediction = model.predict(test_features_scaled)[0]
results.append({
'area': area,
'project': project,
'year': year,
'month': month,
'prediction': prediction
})
return results
except Exception as e:
print(f"Error in predict_future: {str(e)}")
raise
def format_all_predictions(all_results, all_future_results):
"""
将所有预测结果格式化为指定格式
"""
# 创建一个字典来存储所有结果
formatted_data = {}
# 合并所有结果
all_predictions = all_results + all_future_results
# 处理每个预测结果
for result in all_predictions:
key = (result['area'], result['year'], result['month'])
if key not in formatted_data:
formatted_data[key] = {
'area_name': result['area'],
'year': result['year'],
'month': result['month'],
'xxzs': 0,
'zfjc': 0,
'tsjb': 0,
'xzcf': 0,
'cckh': 0,
'cjjc': 0,
'zhxzb': 0
}
# 将预测值转换为百分比并保存
project = result['project']
formatted_data[key][project] = round(result['prediction'] * 100, 8)
# 转换为列表并排序
formatted_list = list(formatted_data.values())
formatted_list.sort(key=lambda x: (x['area_name'], x['year'], x['month']))
return formatted_list
def update_predictions(original_data_path, new_data_path):
"""
使用新数据更新预测
original_data_path: 原始数据文件路径
new_data_path: 新数据文件路径
"""
try:
# 读取原始数据和新数据
original_df = pd.read_excel(original_data_path)
new_df = pd.read_excel(new_data_path)
# 确保新数据格式与原始数据一致
required_cols = set(original_df.columns)
if not set(new_df.columns).issuperset(required_cols):
raise ValueError("New data missing required columns")
# 合并数据并去重(保留最新)
updated_df = pd.concat([original_df, new_df], ignore_index=True)
updated_df = updated_df.drop_duplicates(
subset=['area_name', 'year', 'month'],
keep='last'
).reset_index(drop=True)
# 初始化预测器
predictor = RidgePredictor()
# 存储所有结果
all_results = []
all_importance = []
all_metrics = []
all_future_results = []
# 获取所有地区和项目
areas = updated_df['area_name'].unique()
projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
# 获取最新的时间点
latest_date = pd.to_datetime(
updated_df['year'].astype(str) + '-' +
updated_df['month'].astype(str) + '-01'
).max()
# 对每个地区进行预测
for area in areas:
print(f"Updating predictions for {area}")
current_area_importance = None
for project in projects:
print(f"Processing project: {project}")
# 使用更新后的数据重新训练并预测
results, importance, metrics = predictor.predict_with_history(
updated_df,
area,
project,
start_year=latest_date.year,
start_month=latest_date.month,
end_month=latest_date.month
)
all_results.extend(results)
# 更新特征重要性
if current_area_importance is None:
current_area_importance = importance
else:
current_area_importance = [
dict(current, **new)
for current, new in zip(current_area_importance, importance)
]
all_metrics.extend(metrics)
# 预测未来3个月
future_results = predictor.predict_future(
updated_df,
area,
project,
start_month=latest_date.month + 1,
end_month=latest_date.month + 3,
year=latest_date.year
)
all_future_results.extend(future_results)
if current_area_importance:
all_importance.extend(current_area_importance)
# 保存更新后的结果
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# 保存更新后的完整数据集
updated_df.to_excel(f'updated_dataset_{timestamp}.xlsx', index=False)
# 保存新的预测结果
pd.DataFrame(all_results).to_excel(
f'updated_predictions_{timestamp}.xlsx', index=False
)
# 保存新的特征重要性
importance_df = pd.DataFrame(all_importance)
cols = ['area', 'year', 'month']
feature_cols = [col for col in importance_df.columns if col not in cols]
importance_df = importance_df[cols + sorted(feature_cols)]
importance_df.to_excel(
f'updated_feature_importance_{timestamp}.xlsx', index=False
)
# 保存新的模型评估指标
pd.DataFrame(all_metrics).to_excel(
f'updated_model_metrics_{timestamp}.xlsx', index=False
)
# 保存新的未来预测结果
pd.DataFrame(all_future_results).to_excel(
f'updated_future_predictions_{timestamp}.xlsx', index=False
)
print("Update and prediction completed successfully!")
return updated_df, all_results, all_future_results
except Exception as e:
print(f"Error in update_predictions: {str(e)}")
raise
def main():
try:
# 读取数据
# df = pd.read_excel('test.xlsx')
df = pd.read_csv('H:\公司-文件\公司-维域-文件\buliang-main\buliang\src\main\resources\python\data.csv')
# 初始化预测器
predictor = RidgePredictor()
# 存储所有结果
all_results = []
all_importance = []
all_metrics = []
all_future_results = []
# 获取所有地区和项目
areas = df['area_name'].unique()
projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
# 对每个地区进行预测
for area in areas:
print(f"Processing{area}")
# 初始化当前地区的特征重要性列表
current_area_importance = None
# 对每个项目进行预测
for project in projects:
print(f"Processing project: {project}")
# 预测2023.1-6
results, importance, metrics = predictor.predict_with_history(
df, area, project
)
all_results.extend(results)
# 更新当前地区的特征重要性
if current_area_importance is None:
current_area_importance = importance
else:
# 合并特征重要性
current_area_importance = [
dict(current, **new)
for current, new in zip(current_area_importance, importance)
]
all_metrics.extend(metrics)
# 预测2023.7-9
future_results = predictor.predict_future(df, area, project)
all_future_results.extend(future_results)
# 将该地区的所有特征重要性添加到总结果中
if current_area_importance:
all_importance.extend(current_area_importance)
# 保存结果
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# 转换预测结果格式
def format_predictions(results_list):
formatted_data = {}
for result in results_list:
key = (result['area'], result['year'], result['month'])
if key not in formatted_data:
formatted_data[key] = {
'area_name': result['area'],
'year': result['year'],
'month': result['month'],
'xxzs': 0,
'zfjc': 0,
'tsjb': 0,
'xzcf': 0,
'cckh': 0,
'cjjc': 0,
'zhxzb': 0
}
formatted_data[key][result['project']] = round(result['prediction'] , 8)
return list(formatted_data.values())
# 合并并保存所有预测结果
all_predictions = format_predictions(all_results + all_future_results)
predictions_df = pd.DataFrame(all_predictions)
predictions_df = predictions_df[['area_name', 'year', 'month',
'xxzs', 'zfjc', 'tsjb', 'xzcf',
'cckh', 'cjjc', 'zhxzb']]
predictions_df.to_excel(f'all_predictions_{timestamp}.xlsx', index=False)
# 保存1-6月预测结果
pd.DataFrame(all_results).to_excel(
f'predictions_1_6_{timestamp}.xlsx', index=False
)
# 保存特征重要性
importance_df = pd.DataFrame(all_importance)
# 确保列的顺序
cols = ['area', 'year', 'month']
feature_cols = [col for col in importance_df.columns if col not in cols]
importance_df = importance_df[cols + sorted(feature_cols)]
importance_df.to_excel(
f'feature_importance_{timestamp}.xlsx', index=False
)
# 保存模型评估指标
pd.DataFrame(all_metrics).to_excel(
f'model_metrics_{timestamp}.xlsx', index=False
)
# 保存7-9月预测结果
pd.DataFrame(all_future_results).to_excel(
f'predictions_7_9_{timestamp}.xlsx', index=False
)
print("Prediction completed successfully!")
# 当有新数据时,可以运行更新预测
# update_predictions('test.xlsx', 'new_data.xlsx')
except Exception as e:
print(f"Error in main function: {str(e)}")
raise
if __name__ == "__main__":
main()