"""Ridge-regression forecasting of project indicators per area.

For every ``area_name`` / project pair a Ridge model is trained on the twelve
months preceding a target month, evaluated, persisted with joblib and then
used to predict both historical and future months.  Results are exported to
Excel workbooks.
"""
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Target columns handled by the pipeline; feature columns are expected to be
# named ``<project>_<something>`` (see ``RidgePredictor.get_project_features``).
PROJECTS = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']


class RidgePredictor:
    """Train, persist and apply Ridge models for one area/project at a time."""

    def __init__(self, base_model_path='models'):
        """Initialise the predictor.

        Args:
            base_model_path: Directory under which models and scalers are
                stored.  It is created if it does not exist.
        """
        self.base_model_path = base_model_path
        self.scaler = StandardScaler()
        os.makedirs(base_model_path, exist_ok=True)

    @staticmethod
    def _normalize_month(year, month):
        """Return ``(year, month)`` with ``month`` folded into 1..12.

        Months outside 1..12 roll over into neighbouring years, e.g.
        ``(2023, 13) -> (2024, 1)`` and ``(2023, 0) -> (2022, 12)``.
        """
        offset = month - 1
        return year + offset // 12, offset % 12 + 1

    @classmethod
    def _previous_months(cls, year, month, count):
        """Return the ``count`` months before ``(year, month)``, newest first."""
        return [
            cls._normalize_month(year, month - back)
            for back in range(1, count + 1)
        ]

    @staticmethod
    def _row_mask(df, area, year, month):
        """Return a boolean mask selecting the rows of one area and month."""
        return (
            (df['area_name'] == area)
            & (df['year'] == year)
            & (df['month'] == month)
        )

    def get_project_features(self, df, project):
        """Return the feature column names belonging to ``project``.

        A column is a feature of the project when its name starts with
        ``f"{project}_"``.

        Raises:
            ValueError: If no such column exists in ``df``.
        """
        try:
            features = [
                col for col in df.columns if col.startswith(f'{project}_')
            ]
            if not features:
                raise ValueError(f"No features found for project {project}")
            return features
        except Exception as e:
            print(f"Error in get_project_features: {str(e)}")
            raise

    def print_ridge_formula(self, model, feature_names, scaler):
        """Build a human-readable description of a fitted Ridge model.

        Despite its name the method does not print; it returns the text.

        Args:
            model: Fitted estimator exposing ``coef_`` and ``intercept_``.
            feature_names: Feature names, in the order of ``model.coef_``.
            scaler: Fitted scaler exposing ``mean_`` and ``scale_``.

        Returns:
            A multi-line string: the regression formula, a blank line, a note
            that the features were standardised, and one line per feature
            with its mean and scale.
        """
        try:
            terms = []
            for index, (coef, name) in enumerate(
                zip(model.coef_, feature_names)
            ):
                if index == 0:
                    sign = "-" if coef < 0 else ""
                else:
                    sign = " + " if coef >= 0 else " - "
                terms.append(f"{sign}{abs(coef):.4f} * {name}")

            # Render the intercept sign-aware so a negative value does not
            # come out as "+ -1.2345".
            intercept = model.intercept_
            intercept_sign = " + " if intercept >= 0 else " - "
            lines = [
                "y = " + "".join(terms)
                + f"{intercept_sign}{abs(intercept):.4f}",
                "",
                "注:特征已进行标准化处理,使用以下参数:",
            ]
            lines.extend(
                f"{name}: mean={scaler.mean_[i]:.4f}, "
                f"scale={scaler.scale_[i]:.4f}"
                for i, name in enumerate(feature_names)
            )
            # One entry per line; without separators the parameter list is
            # unreadable.
            return "\n".join(lines)
        except Exception as e:
            print(f"Error in print_ridge_formula: {str(e)}")
            raise

    def get_feature_average(self, df, area, project, year, month, n_months=3):
        """Average the project's features over the preceding months.

        Args:
            df: Data with ``area_name``, ``year``, ``month`` and the
                project's feature columns.
            area: Value of ``area_name`` to select.
            project: Project whose feature columns are averaged.
            year, month: Reference month; it is NOT included in the average.
            n_months: Number of months before the reference month to use.

        Returns:
            1-D array with the per-feature mean over the months that have
            data (the first matching row of each month is used).

        Raises:
            ValueError: If none of the months has data for ``area``.
        """
        try:
            feature_cols = self.get_project_features(df, project)

            features_list = []
            for past_year, past_month in self._previous_months(
                year, month, n_months
            ):
                mask = self._row_mask(df, area, past_year, past_month)
                if mask.any():
                    features_list.append(df[mask][feature_cols].values[0])

            if not features_list:
                raise ValueError(
                    f"No feature data found for {area} in specified months"
                )

            return np.mean(features_list, axis=0)
        except Exception as e:
            print(f"Error in get_feature_average: {str(e)}")
            raise

    def train_and_evaluate(self, df, area, project, target_year, target_month):
        """Train a Ridge model on the 12 months before the target month.

        The features are standardised, ``alpha`` is chosen by grid search
        with cross-validation, and the best model and the scaler are saved
        under ``<base_model_path>/<area>_<project>/``.

        Returns:
            Tuple ``(model, scaler, r2, mse, formula)``.  ``r2`` and ``mse``
            are computed on the training data itself.

        Raises:
            ValueError: If no training rows exist for ``area`` or there are
                too few rows to cross-validate.
        """
        try:
            feature_cols = self.get_project_features(df, project)

            # Collect the training rows, newest month first.
            train_data = []
            for past_year, past_month in self._previous_months(
                target_year, target_month, 12
            ):
                mask = self._row_mask(df, area, past_year, past_month)
                if mask.any():
                    train_data.append(df[mask])

            if not train_data:
                raise ValueError(f"No training data found for {area}")

            train_df = pd.concat(train_data)
            X = train_df[feature_cols].values
            y = train_df[project].values

            n_samples = len(y)
            if n_samples < 2:
                raise ValueError(
                    f"Not enough training rows for {area} ({n_samples}); "
                    "cross-validation needs at least 2"
                )

            # Use a fresh scaler per training run so scalers returned by
            # earlier calls are not refit behind the caller's back.
            self.scaler = StandardScaler()
            X_scaled = self.scaler.fit_transform(X)

            # Grid search over alpha.  The fold count is capped at the number
            # of rows because KFold rejects n_splits > n_samples.
            param_grid = {'alpha': np.logspace(-3, 3, 7)}
            grid_search = GridSearchCV(
                Ridge(),
                param_grid,
                cv=min(5, n_samples),
                scoring='neg_mean_squared_error',
            )
            grid_search.fit(X_scaled, y)
            print(f"区域: {area}, 项目: {project} - 当前模型的最佳参数: {grid_search.best_params_}")

            best_model = grid_search.best_estimator_

            # In-sample evaluation metrics.
            y_pred = best_model.predict(X_scaled)
            r2 = r2_score(y, y_pred)
            mse = mean_squared_error(y, y_pred)

            # Persist the model and its scaler.
            model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
            os.makedirs(model_dir, exist_ok=True)
            model_path = os.path.join(
                model_dir, f"model_{target_year}_{target_month}.pkl"
            )
            scaler_path = os.path.join(
                model_dir, f"scaler_{target_year}_{target_month}.pkl"
            )
            joblib.dump(best_model, model_path)
            joblib.dump(self.scaler, scaler_path)

            formula = self.print_ridge_formula(
                best_model, feature_cols, self.scaler
            )

            return best_model, self.scaler, r2, mse, formula
        except Exception as e:
            print(f"Error in train_and_evaluate: {str(e)}")
            raise

    def predict_with_history(self, df, area, project, start_year=2023,
                             start_month=1, end_month=6):
        """Train and predict for each month that already has an actual value.

        For every month in ``start_month..end_month`` of ``start_year`` a
        model is trained on the preceding 12 months and applied to the mean
        of the features of the preceding 3 months.

        Returns:
            Tuple ``(results, feature_importance, model_metrics)``; each is
            a list with one dict per month.  Feature importance is the
            absolute value of the model coefficients.

        Raises:
            IndexError: If ``df`` has no row for ``area`` in a predicted
                month (the actual value cannot be looked up).
        """
        try:
            results = []
            feature_importance = []
            model_metrics = []

            feature_cols = self.get_project_features(df, project)

            for month in range(start_month, end_month + 1):
                print(f"Predicting {area} - {project} for {start_year}/{month}")

                model, scaler, r2, mse, formula = self.train_and_evaluate(
                    df, area, project, start_year, month
                )

                # Prediction input: mean of the previous three months.
                test_features = self.get_feature_average(
                    df, area, project, start_year, month
                )
                test_features_scaled = scaler.transform(
                    test_features.reshape(1, -1)
                )
                prediction = model.predict(test_features_scaled)[0]

                # One importance record per month, keyed by feature name.
                # Months are unique within this call, so records never need
                # to be merged here.
                importance_dict = {
                    'area': area,
                    'year': start_year,
                    'month': month,
                }
                for feat, imp in zip(feature_cols, np.abs(model.coef_)):
                    importance_dict[f'{feat}'] = imp
                feature_importance.append(importance_dict)

                actual_mask = self._row_mask(df, area, start_year, month)
                results.append({
                    'area': area,
                    'project': project,
                    'year': start_year,
                    'month': month,
                    'prediction': prediction,
                    'actual': df[actual_mask][project].values[0]
                })

                model_metrics.append({
                    'area': area,
                    'project': project,
                    'year': start_year,
                    'month': month,
                    'r2': r2,
                    'mse': mse,
                    'formula': formula
                })

            return results, feature_importance, model_metrics
        except Exception as e:
            print(f"Error in predict_with_history: {str(e)}")
            raise

    def predict_future(self, df, area, project, start_month=7, end_month=9,
                       year=2023, model_month=6):
        """Predict months for which no actual value exists yet.

        A previously saved model/scaler pair is loaded and applied, for each
        target month, to the mean of ``get_feature_average`` evaluated at
        the three months before the target.

        Args:
            df: Data with the project's feature columns.
            area: Value of ``area_name`` to predict for.
            project: Project (target column) to predict.
            start_month, end_month: Inclusive month range relative to
                ``year``.  Values above 12 roll over into the next year.
            year: Year the month numbers refer to.
            model_month: Month (relative to ``year``) whose saved model is
                loaded; defaults to 6.

        Returns:
            List with one dict per predicted month
            (``area``, ``project``, ``year``, ``month``, ``prediction``).
        """
        try:
            results = []

            # NOTE: joblib files are pickles; only load files written by
            # ``train_and_evaluate`` into a trusted directory.
            model_year, model_mon = self._normalize_month(year, model_month)
            model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
            model = joblib.load(
                os.path.join(model_dir, f"model_{model_year}_{model_mon}.pkl")
            )
            scaler = joblib.load(
                os.path.join(model_dir, f"scaler_{model_year}_{model_mon}.pkl")
            )

            for month in range(start_month, end_month + 1):
                target_year, target_month = self._normalize_month(year, month)
                print(f"Predicting future {area} - {project} for {target_year}/{target_month}")

                # Reference months month-3, month-2, month-1 (oldest first),
                # normalised so year boundaries are handled.
                reference_months = reversed(
                    self._previous_months(target_year, target_month, 3)
                )
                test_features = np.mean([
                    self.get_feature_average(df, area, project, ref_y, ref_m)
                    for ref_y, ref_m in reference_months
                ], axis=0)

                test_features_scaled = scaler.transform(
                    test_features.reshape(1, -1)
                )
                prediction = model.predict(test_features_scaled)[0]

                results.append({
                    'area': area,
                    'project': project,
                    'year': target_year,
                    'month': target_month,
                    'prediction': prediction
                })

            return results
        except Exception as e:
            print(f"Error in predict_future: {str(e)}")
            raise


def _pivot_predictions(results, scale=1):
    """Pivot per-project prediction records into one row per area/month.

    Each row starts with every project set to 0; the prediction of each
    record is multiplied by ``scale`` and rounded to 8 decimals.  Rows are
    returned in order of first appearance.
    """
    formatted = {}
    for result in results:
        key = (result['area'], result['year'], result['month'])
        if key not in formatted:
            row = {
                'area_name': result['area'],
                'year': result['year'],
                'month': result['month'],
            }
            row.update(dict.fromkeys(PROJECTS, 0))
            formatted[key] = row
        formatted[key][result['project']] = round(
            result['prediction'] * scale, 8
        )
    return list(formatted.values())


def _collect_predictions(predictor, df, areas, projects, area_message,
                         history_kwargs=None, future_kwargs=None):
    """Run history and future predictions for every area/project pair.

    ``area_message`` is a ``str.format`` template with an ``{area}`` field,
    printed once per area.  Returns the tuple
    ``(all_results, all_importance, all_metrics, all_future_results)``.
    """
    history_kwargs = history_kwargs or {}
    future_kwargs = future_kwargs or {}

    all_results = []
    all_importance = []
    all_metrics = []
    all_future_results = []

    for area in areas:
        print(area_message.format(area=area))
        area_importance = None

        for project in projects:
            print(f"Processing project: {project}")

            results, importance, metrics = predictor.predict_with_history(
                df, area, project, **history_kwargs
            )
            all_results.extend(results)

            # Merge this project's importance columns into the area's
            # per-month records.
            if area_importance is None:
                area_importance = importance
            else:
                area_importance = [
                    {**current, **new}
                    for current, new in zip(area_importance, importance)
                ]

            all_metrics.extend(metrics)

            future_results = predictor.predict_future(
                df, area, project, **future_kwargs
            )
            all_future_results.extend(future_results)

        if area_importance:
            all_importance.extend(area_importance)

    return all_results, all_importance, all_metrics, all_future_results


def _save_feature_importance(all_importance, path):
    """Write feature-importance records to Excel, id columns first."""
    importance_df = pd.DataFrame(all_importance)
    cols = ['area', 'year', 'month']
    feature_cols = [col for col in importance_df.columns if col not in cols]
    importance_df = importance_df[cols + sorted(feature_cols)]
    importance_df.to_excel(path, index=False)


def format_all_predictions(all_results, all_future_results):
    """Format all predictions as one row per area/month, in percent.

    Predictions are multiplied by 100 and rounded to 8 decimals; projects
    without a prediction stay 0.  Rows are sorted by area, year and month.
    """
    formatted_list = _pivot_predictions(
        all_results + all_future_results, scale=100
    )
    formatted_list.sort(key=lambda x: (x['area_name'], x['year'], x['month']))
    return formatted_list


def update_predictions(original_data_path, new_data_path):
    """Merge new data into the original data and refresh the predictions.

    Rows are de-duplicated on (area_name, year, month), keeping the new
    data.  The model is retrained for the latest month in the merged data
    and the following three months are predicted with it.  The merged data
    and all results are written to timestamped Excel files in the current
    directory.

    Args:
        original_data_path: Path of the original Excel data file.
        new_data_path: Path of the Excel file with the new data.

    Returns:
        Tuple ``(updated_df, all_results, all_future_results)``.

    Raises:
        ValueError: If the new data lacks columns of the original data.
    """
    try:
        original_df = pd.read_excel(original_data_path)
        new_df = pd.read_excel(new_data_path)

        # The new data must provide every column of the original data.
        required_cols = set(original_df.columns)
        if not set(new_df.columns).issuperset(required_cols):
            raise ValueError("New data missing required columns")

        # Merge and de-duplicate, keeping the newest row.
        updated_df = pd.concat([original_df, new_df], ignore_index=True)
        updated_df = updated_df.drop_duplicates(
            subset=['area_name', 'year', 'month'],
            keep='last'
        ).reset_index(drop=True)

        predictor = RidgePredictor()
        areas = updated_df['area_name'].unique()

        # Latest month present in the merged data.
        latest_date = pd.to_datetime(
            updated_df['year'].astype(str) + '-' +
            updated_df['month'].astype(str) + '-01'
        ).max()

        (all_results, all_importance,
         all_metrics, all_future_results) = _collect_predictions(
            predictor, updated_df, areas, PROJECTS,
            "Updating predictions for {area}",
            history_kwargs={
                'start_year': latest_date.year,
                'start_month': latest_date.month,
                'end_month': latest_date.month,
            },
            # Predict the next three months with the model that was just
            # trained for the latest month; months above 12 roll over into
            # the next year inside predict_future.
            future_kwargs={
                'start_month': latest_date.month + 1,
                'end_month': latest_date.month + 3,
                'year': latest_date.year,
                'model_month': latest_date.month,
            },
        )

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        updated_df.to_excel(f'updated_dataset_{timestamp}.xlsx', index=False)

        pd.DataFrame(all_results).to_excel(
            f'updated_predictions_{timestamp}.xlsx',
            index=False
        )

        _save_feature_importance(
            all_importance, f'updated_feature_importance_{timestamp}.xlsx'
        )

        pd.DataFrame(all_metrics).to_excel(
            f'updated_model_metrics_{timestamp}.xlsx',
            index=False
        )

        pd.DataFrame(all_future_results).to_excel(
            f'updated_future_predictions_{timestamp}.xlsx',
            index=False
        )

        print("Update and prediction completed successfully!")

        return updated_df, all_results, all_future_results
    except Exception as e:
        print(f"Error in update_predictions: {str(e)}")
        raise


def main():
    """Run the full pipeline on the CSV data set and export the results.

    Uses the default ranges of ``predict_with_history`` (2023, months 1-6)
    and ``predict_future`` (2023, months 7-9) and writes timestamped Excel
    files to the current directory.
    """
    try:
        # Raw string: the path contains "\b" and "\r", which a normal string
        # literal would turn into backspace / carriage-return characters.
        df = pd.read_csv(r'H:\公司-文件\公司-维域-文件\buliang-main\buliang\src\main\resources\python\data.csv')

        predictor = RidgePredictor()
        areas = df['area_name'].unique()

        (all_results, all_importance,
         all_metrics, all_future_results) = _collect_predictions(
            predictor, df, areas, PROJECTS, "Processing{area}"
        )

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Combined history + future predictions, one row per area/month.
        all_predictions = _pivot_predictions(all_results + all_future_results)
        predictions_df = pd.DataFrame(all_predictions)
        predictions_df = predictions_df[['area_name', 'year', 'month'] + PROJECTS]
        predictions_df.to_excel(f'all_predictions_{timestamp}.xlsx', index=False)

        # History predictions (months 1-6).
        pd.DataFrame(all_results).to_excel(
            f'predictions_1_6_{timestamp}.xlsx',
            index=False
        )

        _save_feature_importance(
            all_importance, f'feature_importance_{timestamp}.xlsx'
        )

        pd.DataFrame(all_metrics).to_excel(
            f'model_metrics_{timestamp}.xlsx',
            index=False
        )

        # Future predictions (months 7-9).
        pd.DataFrame(all_future_results).to_excel(
            f'predictions_7_9_{timestamp}.xlsx',
            index=False
        )

        print("Prediction completed successfully!")

        # When new data arrives, refresh the predictions with e.g.
        # update_predictions('test.xlsx', 'new_data.xlsx').
    except Exception as e:
        print(f"Error in main function: {str(e)}")
        raise


if __name__ == "__main__":
    main()