import pandas as pd import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import Ridge from sklearn.model_selection import GridSearchCV, TimeSeriesSplit from sklearn.metrics import r2_score import plotly.graph_objects as go from plotly.subplots import make_subplots import os import joblib import json import sys # 重新配置标准输出流的编码为 UTF-8 sys.stdout.reconfigure(encoding='utf-8') class ComprehensiveScoreAnalyzer: def __init__(self): self.project_models = {} self.final_model = None self.feature_scalers = {} self.score_scaler = StandardScaler() self.project_polys = {} self.score_poly = None self.project_names = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb'] self.feature_importance = {} # 定义每个项目的特征前缀 self.project_features = { 'xxzs': 'xxzs_', 'zfjc': 'zfjc_', 'tsjb': 'tsjb_', 'xzcf': 'xzcf_', 'cckh': 'cckh_', 'cjjc': 'cjjc_', 'zhxzb': 'zhxzb_' } # # def preprocess_data(self, df): # df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str)) # features = {} # for project, prefix in self.project_features.items(): # project_cols = [col for col in df.columns if col.startswith(prefix) and col != project] # if project_cols: # features[project] = df[project_cols] # valid_projects = [] # project_scores = pd.DataFrame() # for project in self.project_names: # if df[project].std() > 0 and project in features and not features[project].empty: # valid_projects.append(project) # project_scores[project] = df[project] # else: # # print(f"警告: {project} 项目得分无变化或无子特征,已剔除") # self.project_names = valid_projects # total_score = df['all_score'] # return features, project_scores, total_score, df['date'] def preprocess_data(self, df): df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str)) features = {} for project, prefix in self.project_features.items(): project_cols = [col for col in df.columns if col.startswith(prefix) and col != project] if project_cols: features[project] = df[project_cols] project_scores = pd.DataFrame() for project in self.project_names: if project in features and not features[project].empty: project_scores[project] = df[project] # else: # print(f"警告: {project} 项目无子特征或没有数据") noise = np.random.normal(0, 0.01, project_scores.shape) project_scores += noise total_score = df['all_score'] return features, project_scores, total_score, df['date'] def optimize_model(self, X, y, project_name=None): param_grid = { 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_intercept': [True], 'solver': ['auto'] } tscv = TimeSeriesSplit(n_splits=5) grid_search = GridSearchCV( Ridge(), param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=1, verbose=0 ) grid_search.fit(X, y) # print(f"{'最终' if project_name is None else project_name} 模型最佳参数:", grid_search.best_params_) return grid_search.best_estimator_ def fit_project_models(self, features, project_scores, train_mask): for project in self.project_names: # print(f"\n训练 {project} 项目模型:") project_features = features[project] self.feature_scalers[project] = StandardScaler() self.project_polys[project] = PolynomialFeatures(degree=2, include_bias=False) X_train = self.feature_scalers[project].fit_transform(project_features[train_mask]) X_train_poly = self.project_polys[project].fit_transform(X_train) y_train = project_scores[project][train_mask] model = self.optimize_model(X_train_poly, y_train, project) self.project_models[project] = model importance = pd.DataFrame({ 'feature': self.project_polys[project].get_feature_names_out(project_features.columns), 'coefficient': abs(model.coef_) }).sort_values('coefficient', ascending=False) self.feature_importance[project] = importance def fit_final_model(self, project_scores, total_score, train_mask): # print("\n训练最终总分模型:") X_scores = self.score_scaler.fit_transform(project_scores[train_mask]) self.score_poly = PolynomialFeatures(degree=2, include_bias=False) X_scores_poly = self.score_poly.fit_transform(X_scores) self.final_model = self.optimize_model(X_scores_poly, total_score[train_mask]) def predict(self, features, project_scores, test_mask): project_predictions = pd.DataFrame() for project in self.project_names: project_features = features[project] X_test = self.feature_scalers[project].transform(project_features[test_mask]) X_test_poly = self.project_polys[project].transform(X_test) project_predictions[project] = self.project_models[project].predict(X_test_poly) X_scores_test = self.score_scaler.transform(project_scores[test_mask]) X_scores_test_poly = self.score_poly.transform(X_scores_test) total_score_pred = self.final_model.predict(X_scores_test_poly) return project_predictions, total_score_pred def plot_project_comparisons(self, features, project_scores, train_mask): n_projects = len(self.project_names) n_cols = 3 n_rows = (n_projects + n_cols - 1) // n_cols fig = make_subplots( rows=n_rows, cols=n_cols, subplot_titles=self.project_names, vertical_spacing=0.22, horizontal_spacing=0.1 ) for i, project in enumerate(self.project_names): row = i // n_cols + 1 col = i % n_cols + 1 project_features = features[project] y_true = project_scores[project][train_mask] y_pred = self.project_models[project].predict( self.project_polys[project].transform( self.feature_scalers[project].transform(project_features[train_mask]) ) ) r2 = r2_score(y_true, y_pred) fig.add_trace( go.Scatter( x=y_true, y=y_pred, mode='markers', name=project, marker=dict( size=8, color='rgba(8, 81, 156, 0.6)', line=dict( color='rgba(8, 81, 156, 0.9)', width=1 ) ), showlegend=False ), row=row, col=col ) min_val = min(min(y_true), min(y_pred)) max_val = max(max(y_true), max(y_pred)) fig.add_trace( go.Scatter( x=[min_val, max_val], y=[min_val, max_val], mode='lines', line=dict(color='red', dash='dash'), showlegend=False ), row=row, col=col ) fig.add_annotation( text=f'R² = {r2:.4f}', xref=f'x{i + 1}', yref=f'y{i + 1}', x=0.05, y=0.95, showarrow=False, font=dict(size=10), xanchor='left', yanchor='top' ) pad = 0.1 * (max_val - min_val) fig.update_xaxes(range=[min_val - pad, max_val + pad], title_text="实际值", row=row, col=col) fig.update_yaxes(range=[min_val - pad, max_val + pad], title_text="预测值", row=row, col=col) fig.update_layout( height=300 * n_rows, width=1000, title_text="各项目得分预测结果对比", showlegend=False, template='plotly_white', ) return fig def plot_total_score_comparison(self, project_scores, total_score, train_mask): y_true = total_score[train_mask] y_pred = self.final_model.predict( self.score_poly.transform( self.score_scaler.transform(project_scores[train_mask]))) r2 = r2_score(y_true, y_pred) fig = go.Figure() fig.add_trace( go.Scatter( x=y_true, y=y_pred, mode='markers', name='预测值', marker=dict( size=10, color='rgba(255, 0, 0, 0.6)', line=dict( color='rgba(255, 0, 0, 0.9)', width=1 ) ) ) ) min_val = min(min(y_true), min(y_pred)) max_val = max(max(y_true), max(y_pred)) fig.add_trace( go.Scatter( x=[min_val, max_val], y=[min_val, max_val], mode='lines', line=dict(color='red', dash='dash'), name='y=x' ) ) fig.add_annotation( text=f'R² = {r2:.4f}', xref='paper', yref='paper', x=0.05, y=0.95, showarrow=False, font=dict(size=14) ) fig.update_layout( title_text="总分预测结果对比", xaxis_title="实际值", yaxis_title="预测值", template='plotly_white', width=800, height=600 ) return fig def save_models(self, area_name, output_dir='models'): if not os.path.exists(output_dir): os.makedirs(output_dir) area_dir = os.path.join(output_dir, area_name) if not os.path.exists(area_dir): os.makedirs(area_dir) for project in self.project_names: model_data = { 'model': self.project_models[project], 'scaler': self.feature_scalers[project], 'poly': self.project_polys[project] } model_path = os.path.join(area_dir, f'{project}_model.pkl') joblib.dump(model_data, model_path) final_model_data = { 'model': self.final_model, 'scaler': self.score_scaler, 'poly': self.score_poly } final_model_path = os.path.join(area_dir, 'final_model.pkl') joblib.dump(final_model_data, final_model_path) def process_feature_importance(analyzer, area): importance_dict = {'area_name': area} for project in analyzer.project_names: importance_df = analyzer.feature_importance[project] original_features = importance_df[ ~importance_df['feature'].str.contains(' ') ].copy() for _, row in original_features.iterrows(): importance_dict[row['feature']] = row['coefficient'] return importance_dict def run_prediction(train_date, test_date, period_months): period_dir = f"{period_months}个月" for base_dir in ['models', 'predictions', 'feature_importance', 'plots']: period_path = os.path.join(base_dir, period_dir) if not os.path.exists(period_path): os.makedirs(period_path) try: predictions, feature_importance = main(train_date, test_date, period_dir) # print(f"\n{period_months}个月预测完成!") # 创建一个包含 predictions 和 feature_importance 的字典 result = { "predictions": json.loads(predictions.to_json(orient='records')), "feature_importance": json.loads(feature_importance.to_json(orient='records')) } # 使用 json.dumps 将字典转换为 JSON 格式的字符串 result_json = json.dumps(result, ensure_ascii=False, indent=4) print(result_json) # if predictions is not None: # print(f"共生成 {len(predictions)} 条预测记录") # if feature_importance is not None: # print(f"特征重要性分析包含 {len(feature_importance)} 条记录") except Exception as e: print(f"{period_months}个月预测执行出错: {str(e)}") def main(train_date='2023-01-01', test_date='2023-06-01', period_dir='6个月'): """ 主函数,用于执行数据分析和预测流程。 参数: - train_date (str): 训练数据的截止日期,默认为'2023-01-01'。 - test_date (str): 测试数据的起始日期,默认为'2023-06-01'。 - period_dir (str): 保存模型、预测结果和特征重要性等的目录名称,默认为'6个月'。 返回: - all_predictions_df (pd.DataFrame): 所有地区的预测结果数据框,如果未生成则为None。 - importance_df (pd.DataFrame): 所有地区的特征重要性数据框,如果未生成则为None。 """ # 读取数据集 # df = pd.read_csv('data.csv') df = pd.read_csv('H:/develop/dama/java/buliangfanying/target/classes/python/data.csv') # 获取所有不同的地区名称 areas = df['area_name'].unique() # 初始化存储所有预测结果和特征重要性的列表 all_predictions = [] all_feature_importance = [] # 识别特征列,即以特定前缀开头的列 feature_columns = [col for col in df.columns if any(col.startswith(prefix) for prefix in ['xxzs_', 'zfjc_', 'tsjb_', 'xzcf_', 'cckh_', 'cjjc_', 'zhxzb_'])] # 遍历每个地区 for area in areas: # 打印地区分割线 # print(f"\n{'=' * 50}") # print(f"处理地区: {area}") # print(f"{'=' * 50}") # 筛选特定地区的数据 area_df = df[df['area_name'] == area].copy() # 初始化 ComprehensiveScoreAnalyzer 实例 analyzer = ComprehensiveScoreAnalyzer() # 预处理数据 features, project_scores, total_score, dates = analyzer.preprocess_data(area_df) # 创建训练集和测试集的掩码 train_mask = dates < pd.to_datetime(train_date) test_mask = dates >= pd.to_datetime(test_date) # 检查训练集和测试集的数据量是否充足 if sum(train_mask) < 2 or sum(test_mask) < 1: # print(f"警告: {area} 数据量不足,跳过此地区") continue try: # 训练项目模型 analyzer.fit_project_models(features, project_scores, train_mask) # 训练最终模型 analyzer.fit_final_model(project_scores, total_score, train_mask) # 保存模型 model_dir = os.path.join('models', period_dir) analyzer.save_models(area, output_dir=model_dir) # 处理并存储特征重要性 importance_dict = process_feature_importance(analyzer, area) all_feature_importance.append(importance_dict) # 进行预测 project_pred, total_pred = analyzer.predict(features, project_scores, test_mask) # 准备测试数据以保存预测结果 test_data = area_df[test_mask].copy() for project in analyzer.project_names: test_data[project] = project_pred[project].values test_data['all_score'] = total_pred all_predictions.append(test_data) # 创建并保存可视化图表 plots_dir = os.path.join('plots', period_dir) if not os.path.exists(plots_dir): os.makedirs(plots_dir) project_fig = analyzer.plot_project_comparisons(features, project_scores, train_mask) project_fig.write_image(os.path.join(plots_dir, f"{area}_project_comparisons.png")) total_fig = analyzer.plot_total_score_comparison(project_scores, total_score, train_mask) total_fig.write_image(os.path.join(plots_dir, f"{area}_total_score_comparison.png")) except Exception as e: # 错误处理 # print(f"处理 {area} 时发生错误: {str(e)}") continue # 合并所有地区的预测结果 if all_predictions: all_predictions_df = pd.concat(all_predictions, ignore_index=True) all_predictions_df = all_predictions_df.sort_values(['area_name', 'year', 'month']) predictions_dir = os.path.join('predictions', period_dir) all_predictions_df.to_csv(os.path.join(predictions_dir, 'all_areas_predictions.csv'), index=False) # 合并所有地区的特征重要性 if all_feature_importance: importance_df = pd.DataFrame(all_feature_importance) for col in feature_columns: if col not in importance_df.columns: importance_df[col] = 0.0 cols_order = ['area_name'] + [col for col in feature_columns if col in importance_df.columns] importance_df = importance_df[cols_order] importance_dir = os.path.join('feature_importance', period_dir) importance_df.to_csv(os.path.join(importance_dir, 'all_areas_feature_importance.csv'), index=False) # 打印保存路径信息 # print(f"\n所有结果已保存在 {period_dir} 目录下:") # if all_predictions: # print(f"1. 预测结果:predictions/{period_dir}/all_areas_predictions.csv") # if all_feature_importance: # print(f"2. 特征重要性:feature_importance/{period_dir}/all_areas_feature_importance.csv") # print(f"3. 模型保存在 models/{period_dir} 目录下") # print(f"4. 可视化结果保存在 plots/{period_dir} 目录下") # 返回预测结果和特征重要性的数据框 return all_predictions_df if all_predictions else None, \ importance_df if all_feature_importance else None def load_model(area_name, project=None, base_dir='models'): area_dir = os.path.join(base_dir, area_name) if not os.path.exists(area_dir): raise FileNotFoundError(f"未找到地区 {area_name} 的模型目录") if project is None: model_path = os.path.join(area_dir, 'final_model.pkl') else: model_path = os.path.join(area_dir, f'{project}_model.pkl') if not os.path.exists(model_path): raise FileNotFoundError(f"未找到模型文件: {model_path}") return joblib.load(model_path) def predict_scores(area_name, features_df, base_dir='models'): project_predictions = {} project_scores = pd.DataFrame() area_dir = os.path.join(base_dir, area_name) model_files = [f for f in os.listdir(area_dir) if f.endswith('_model.pkl')] for model_file in model_files: if model_file == 'final_model.pkl': continue project = model_file.replace('_model.pkl', '') model_data = load_model(area_name, project) project_prefix = project + '_' project_features = features_df[[col for col in features_df.columns if col.startswith(project_prefix)]] X = model_data['scaler'].transform(project_features) X_poly = model_data['poly'].transform(X) project_predictions[project] = model_data['model'].predict(X_poly) project_scores[project] = project_predictions[project] final_model_data = load_model(area_name) X_scores = final_model_data['scaler'].transform(project_scores) X_scores_poly = final_model_data['poly'].transform(X_scores) total_score_prediction = final_model_data['model'].predict(X_scores_poly) return project_predictions, total_score_prediction if __name__ == "__main__": # file_path = sys.argv[1] file_path = 'H:/develop/dama/java/buliangfanying/PredictedScores.txt' with open(file_path, 'r', encoding='utf-8') as file: json_str = file.read() # 解析 JSON 字符串 data_dict = json.loads(json_str) base_dirs = ['models', 'predictions', 'feature_importance', 'plots'] for dir_name in base_dirs: if not os.path.exists(dir_name): os.makedirs(dir_name) prediction_configs = [ { 'train_date': data_dict['train_date'], # 使用6月份以前的数据 'test_date': data_dict['test_date'], # 预测6月份 'period_months': data_dict['period_months'] } # { # 'train_date': '2023-06-01', # 使用6月份以前的数据 # 'test_date': '2023-06-01', # 预测6月份 # 'period_months': 1 # } # { # 'train_date': '2023-04-01', # 使用4月份以前的数据 # 'test_date': '2023-06-01', # 预测6月份 # 'period_months': 3 # }, # { # 'train_date': '2023-01-01', # 使用1月份以前的数据 # 'test_date': '2023-06-01', # 预测6月份 # 'period_months': 6 # } ] for config in prediction_configs: run_prediction( config['train_date'], config['test_date'], config['period_months'] )