# response/target/classes/python/PredictedScores_old.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import joblib
import json
import sys


# Reconfigure the standard output stream to use UTF-8 so that the non-ASCII
# text this script prints (JSON dumped with ensure_ascii=False, error
# messages) is encoded as UTF-8.
sys.stdout.reconfigure(encoding='utf-8')

class ComprehensiveScoreAnalyzer:
    """Two-stage Ridge regression pipeline for project scores and the total score.

    Stage one fits, for every project, a Ridge model on the standardised,
    degree-2 polynomial expansion of that project's sub-feature columns.
    Stage two fits a Ridge model that maps the standardised, degree-2
    expanded project scores to the ``all_score`` column.
    """

    def __init__(self):
        # Fitted per-project Ridge estimators, keyed by project name.
        self.project_models = {}
        # Fitted Ridge estimator for the total score (set by fit_final_model).
        self.final_model = None
        # Per-project StandardScaler fitted on the training sub-features.
        self.feature_scalers = {}
        # Scaler applied to the project-score matrix before the final model.
        self.score_scaler = StandardScaler()
        # Per-project PolynomialFeatures transformers.
        self.project_polys = {}
        # PolynomialFeatures transformer used by the final model.
        self.score_poly = None
        # Projects to model; preprocess_data narrows this to usable projects.
        self.project_names = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
        # Per-project DataFrame of absolute coefficients, largest first.
        self.feature_importance = {}

        # Column-name prefix that identifies each project's sub-features.
        self.project_features = {
            'xxzs': 'xxzs_',
            'zfjc': 'zfjc_',
            'tsjb': 'tsjb_',
            'xzcf': 'xzcf_',
            'cckh': 'cckh_',
            'cjjc': 'cjjc_',
            'zhxzb': 'zhxzb_'
        }

    def preprocess_data(self, df):
        """Split *df* into per-project features, project scores, total score and dates.

        A ``date`` column built from ``year`` and ``month`` is added to *df*
        in place. ``self.project_names`` is narrowed to the projects that
        actually have sub-feature columns, so that the later fit / predict /
        plot / save steps never look up a project without features.

        Args:
            df: DataFrame holding ``year``, ``month``, ``all_score``, one
                column per project and the prefixed sub-feature columns.

        Returns:
            Tuple ``(features, project_scores, total_score, dates)`` where
            ``features`` maps project name to its sub-feature DataFrame and
            ``project_scores`` holds the project scores with a small random
            jitter added.
        """
        df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str))

        features = {}
        for project, prefix in self.project_features.items():
            project_cols = [col for col in df.columns if col.startswith(prefix) and col != project]
            if project_cols:
                features[project] = df[project_cols]

        valid_projects = [
            project for project in self.project_names
            if project in features and not features[project].empty
        ]
        # Without this, downstream loops over self.project_names would hit a
        # KeyError for any project that has no sub-feature columns.
        self.project_names = valid_projects

        project_scores = pd.DataFrame()
        for project in valid_projects:
            project_scores[project] = df[project]

        # Gaussian jitter (sigma = 0.01) is added to every project score.
        # NOTE(review): presumably this keeps constant score columns from
        # having zero variance; it also makes results non-reproducible
        # unless the NumPy global seed is fixed by the caller.
        noise = np.random.normal(0, 0.01, project_scores.shape)
        project_scores += noise
        total_score = df['all_score']

        return features, project_scores, total_score, df['date']

    def optimize_model(self, X, y, project_name=None):
        """Select the Ridge ``alpha`` by time-series cross-validation.

        Args:
            X: Training design matrix.
            y: Training target.
            project_name: Unused; kept for interface compatibility.

        Returns:
            The Ridge estimator refitted on all of *X* with the best alpha.
            When fewer than three rows are available no time-series split is
            possible, and a Ridge model with the default ``alpha=1.0`` is
            fitted directly instead.
        """
        param_grid = {
            'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            'fit_intercept': [True],
            'solver': ['auto']
        }
        # TimeSeriesSplit needs n_samples > n_splits and n_splits >= 2, so cap
        # the fold count for short histories instead of raising.
        n_samples = np.shape(X)[0]
        n_splits = min(5, n_samples - 1)
        if n_splits < 2:
            return Ridge(alpha=1.0, fit_intercept=True, solver='auto').fit(X, y)

        grid_search = GridSearchCV(
            Ridge(),
            param_grid,
            cv=TimeSeriesSplit(n_splits=n_splits),
            scoring='neg_mean_squared_error',
            n_jobs=1,
            verbose=0
        )
        grid_search.fit(X, y)
        return grid_search.best_estimator_

    def fit_project_models(self, features, project_scores, train_mask):
        """Fit one scaler + polynomial + Ridge chain per project.

        Args:
            features: Mapping of project name to its sub-feature DataFrame.
            project_scores: DataFrame with one score column per project.
            train_mask: Boolean mask selecting the training rows.

        Side effects:
            Fills ``feature_scalers``, ``project_polys``, ``project_models``
            and ``feature_importance`` for every name in ``project_names``.
        """
        for project in self.project_names:
            project_features = features[project]
            scaler = StandardScaler()
            poly = PolynomialFeatures(degree=2, include_bias=False)
            self.feature_scalers[project] = scaler
            self.project_polys[project] = poly

            X_train = scaler.fit_transform(project_features[train_mask])
            X_train_poly = poly.fit_transform(X_train)
            y_train = project_scores[project][train_mask]

            model = self.optimize_model(X_train_poly, y_train, project)
            self.project_models[project] = model

            # Absolute coefficient of every expanded term, largest first.
            self.feature_importance[project] = pd.DataFrame({
                'feature': poly.get_feature_names_out(project_features.columns),
                'coefficient': np.abs(model.coef_)
            }).sort_values('coefficient', ascending=False)

    def fit_final_model(self, project_scores, total_score, train_mask):
        """Fit the model mapping project scores to the total score.

        Args:
            project_scores: DataFrame with one score column per project.
            total_score: Series with the overall score.
            train_mask: Boolean mask selecting the training rows.
        """
        X_scores = self.score_scaler.fit_transform(project_scores[train_mask])
        self.score_poly = PolynomialFeatures(degree=2, include_bias=False)
        X_scores_poly = self.score_poly.fit_transform(X_scores)
        self.final_model = self.optimize_model(X_scores_poly, total_score[train_mask])

    def predict(self, features, project_scores, test_mask):
        """Predict project scores and the total score for the test rows.

        Args:
            features: Mapping of project name to its sub-feature DataFrame.
            project_scores: DataFrame with one score column per project.
            test_mask: Boolean mask selecting the rows to predict.

        Returns:
            Tuple ``(project_predictions, total_score_pred)``: a DataFrame
            with one predicted column per project and an array with the
            predicted total score.
        """
        project_predictions = pd.DataFrame()
        for project in self.project_names:
            X_test = self.feature_scalers[project].transform(features[project][test_mask])
            X_test_poly = self.project_polys[project].transform(X_test)
            project_predictions[project] = self.project_models[project].predict(X_test_poly)

        # NOTE(review): the total score is predicted from the *observed*
        # project scores of the test rows, not from project_predictions.
        # Confirm this is intended; the module-level predict_scores() feeds
        # the predicted project scores instead.
        X_scores_test = self.score_scaler.transform(project_scores[test_mask])
        X_scores_test_poly = self.score_poly.transform(X_scores_test)
        total_score_pred = self.final_model.predict(X_scores_test_poly)
        return project_predictions, total_score_pred

    def plot_project_comparisons(self, features, project_scores, train_mask):
        """Build a grid of actual-vs-predicted scatter plots, one per project.

        Each subplot shows the training rows, the ``y = x`` reference line
        and the training R² in its top-left corner.

        Args:
            features: Mapping of project name to its sub-feature DataFrame.
            project_scores: DataFrame with one score column per project.
            train_mask: Boolean mask selecting the rows to plot.

        Returns:
            The plotly Figure.
        """
        n_projects = len(self.project_names)
        n_cols = 3
        n_rows = (n_projects + n_cols - 1) // n_cols
        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=self.project_names,
            vertical_spacing=0.22,
            horizontal_spacing=0.1
        )
        for i, project in enumerate(self.project_names):
            row, col = divmod(i, n_cols)
            row += 1
            col += 1
            y_true = project_scores[project][train_mask]
            X_scaled = self.feature_scalers[project].transform(features[project][train_mask])
            y_pred = self.project_models[project].predict(
                self.project_polys[project].transform(X_scaled)
            )
            r2 = r2_score(y_true, y_pred)
            fig.add_trace(
                go.Scatter(
                    x=y_true,
                    y=y_pred,
                    mode='markers',
                    name=project,
                    marker=dict(
                        size=8,
                        color='rgba(8, 81, 156, 0.6)',
                        line=dict(
                            color='rgba(8, 81, 156, 0.9)',
                            width=1
                        )
                    ),
                    showlegend=False
                ),
                row=row, col=col
            )

            min_val = min(y_true.min(), y_pred.min())
            max_val = max(y_true.max(), y_pred.max())
            fig.add_trace(
                go.Scatter(
                    x=[min_val, max_val],
                    y=[min_val, max_val],
                    mode='lines',
                    line=dict(color='red', dash='dash'),
                    showlegend=False
                ),
                row=row, col=col
            )
            # Position the label as a fraction of the subplot's own axis
            # domain. A plain axis reference would interpret 0.05 / 0.95 as
            # data values, which normally fall outside the plotted range.
            # The first subplot's axes are named 'x' / 'y', later ones
            # 'x2', 'x3', ...
            axis_id = '' if i == 0 else str(i + 1)
            fig.add_annotation(
                text=f'R² = {r2:.4f}',
                xref=f'x{axis_id} domain',
                yref=f'y{axis_id} domain',
                x=0.05,
                y=0.95,
                showarrow=False,
                font=dict(size=10),
                xanchor='left',
                yanchor='top'
            )
            pad = 0.1 * (max_val - min_val)
            fig.update_xaxes(range=[min_val - pad, max_val + pad], title_text="实际值", row=row, col=col)
            fig.update_yaxes(range=[min_val - pad, max_val + pad], title_text="预测值", row=row, col=col)
        fig.update_layout(
            height=300 * n_rows,
            width=1000,
            title_text="各项目得分预测结果对比",
            showlegend=False,
            template='plotly_white',
        )
        return fig

    def plot_total_score_comparison(self, project_scores, total_score, train_mask):
        """Build the actual-vs-predicted scatter plot for the total score.

        Args:
            project_scores: DataFrame with one score column per project.
            total_score: Series with the overall score.
            train_mask: Boolean mask selecting the rows to plot.

        Returns:
            The plotly Figure, annotated with the R² of the selected rows.
        """
        y_true = total_score[train_mask]
        X_scaled = self.score_scaler.transform(project_scores[train_mask])
        y_pred = self.final_model.predict(self.score_poly.transform(X_scaled))
        r2 = r2_score(y_true, y_pred)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=y_true,
                y=y_pred,
                mode='markers',
                name='预测值',
                marker=dict(
                    size=10,
                    color='rgba(255, 0, 0, 0.6)',
                    line=dict(
                        color='rgba(255, 0, 0, 0.9)',
                        width=1
                    )
                )
            )
        )
        min_val = min(y_true.min(), y_pred.min())
        max_val = max(y_true.max(), y_pred.max())
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='y=x'
            )
        )
        fig.add_annotation(
            text=f'R² = {r2:.4f}',
            xref='paper',
            yref='paper',
            x=0.05,
            y=0.95,
            showarrow=False,
            font=dict(size=14)
        )
        fig.update_layout(
            title_text="总分预测结果对比",
            xaxis_title="实际值",
            yaxis_title="预测值",
            template='plotly_white',
            width=800,
            height=600
        )
        return fig

    def save_models(self, area_name, output_dir='models'):
        """Persist every fitted model bundle of one area with joblib.

        Writes ``<output_dir>/<area_name>/<project>_model.pkl`` for each
        project and ``final_model.pkl`` for the total-score model. Every
        file holds a dict with the keys ``model``, ``scaler`` and ``poly``.

        Args:
            area_name: Name of the area; used as sub-directory name.
            output_dir: Root directory of the saved models.
        """
        area_dir = os.path.join(output_dir, area_name)
        # Creates output_dir as well when it is missing.
        os.makedirs(area_dir, exist_ok=True)
        for project in self.project_names:
            model_data = {
                'model': self.project_models[project],
                'scaler': self.feature_scalers[project],
                'poly': self.project_polys[project]
            }
            joblib.dump(model_data, os.path.join(area_dir, f'{project}_model.pkl'))
        final_model_data = {
            'model': self.final_model,
            'scaler': self.score_scaler,
            'poly': self.score_poly
        }
        joblib.dump(final_model_data, os.path.join(area_dir, 'final_model.pkl'))


def process_feature_importance(analyzer, area):
    """Collect the coefficients of the first-order features of one area.

    The per-project importance tables contain every term of the degree-2
    polynomial expansion. Only the linear terms are kept: interaction terms
    (named ``"a b"``) and squared terms (named ``"a^2"``) are dropped.

    Args:
        analyzer: Object exposing ``project_names`` and a
            ``feature_importance`` mapping of project name to a DataFrame
            with ``feature`` and ``coefficient`` columns.
        area: Area name stored under the ``area_name`` key.

    Returns:
        Dict with ``area_name`` plus one ``feature -> coefficient`` entry per
        linear term. A source column whose own name contains a space or
        ``^`` is indistinguishable from a derived term and is dropped too.
    """
    importance_dict = {'area_name': area}
    for project in analyzer.project_names:
        importance_df = analyzer.feature_importance[project]
        names = importance_df['feature']
        # ' ' marks an interaction term, '^' a power term.
        is_derived = (
            names.str.contains(' ', regex=False)
            | names.str.contains('^', regex=False)
        )
        linear_terms = importance_df[~is_derived]
        importance_dict.update(zip(linear_terms['feature'], linear_terms['coefficient']))

    return importance_dict


def _frame_to_records(frame):
    """Return *frame* as a JSON-compatible list of row dicts ([] for None)."""
    if frame is None:
        return []
    return json.loads(frame.to_json(orient='records'))


def run_prediction(train_date, test_date, period_months):
    """Run one train/predict cycle and print the result as JSON on stdout.

    Creates the per-period output directories, calls ``main`` and prints a
    JSON object with the keys ``predictions`` and ``feature_importance``.
    When ``main`` produced no predictions or no feature importance, the
    corresponding key holds an empty list instead of making the whole run
    fail on a ``None`` result.

    Args:
        train_date: Passed to ``main`` as ``train_date``.
        test_date: Passed to ``main`` as ``test_date``.
        period_months: Number used to build the per-period directory name.
    """
    period_dir = f"{period_months}个月"
    for base_dir in ('models', 'predictions', 'feature_importance', 'plots'):
        os.makedirs(os.path.join(base_dir, period_dir), exist_ok=True)
    try:
        predictions, feature_importance = main(train_date, test_date, period_dir)
        result = {
            "predictions": _frame_to_records(predictions),
            "feature_importance": _frame_to_records(feature_importance)
        }
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        print(json.dumps(result, ensure_ascii=False, indent=4))
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print(f"{period_months}个月预测执行出错: {str(e)}")
def main(train_date='2023-01-01', test_date='2023-06-01', period_dir='6个月', data_path=None):
    """Train, evaluate and persist the score models for every area.

    For each distinct ``area_name`` in the data set the function fits the
    project models and the final model on rows dated before *train_date*,
    saves the models, predicts the rows dated on or after *test_date* and
    writes the comparison plots. Areas with fewer than two training rows or
    no test row are skipped. An area whose processing raises is skipped as
    well and the error is reported on stderr (stdout is left untouched).

    Args:
        train_date: Rows strictly before this date form the training set.
        test_date: Rows on or after this date form the test set.
        period_dir: Sub-directory name used under ``models``, ``plots``,
            ``predictions`` and ``feature_importance``.
        data_path: CSV file to read. Defaults to the original hard-coded
            location when omitted.

    Returns:
        Tuple ``(all_predictions_df, importance_df)``; each element is
        ``None`` when no area produced the corresponding result.
    """
    if data_path is None:
        data_path = 'H:/develop/dama/java/buliangfanying/target/classes/python/data.csv'
    df = pd.read_csv(data_path)

    areas = df['area_name'].unique()
    all_predictions = []
    all_feature_importance = []
    # Sub-feature columns are recognised by their project prefix.
    prefixes = ('xxzs_', 'zfjc_', 'tsjb_', 'xzcf_', 'cckh_', 'cjjc_', 'zhxzb_')
    feature_columns = [col for col in df.columns if col.startswith(prefixes)]

    model_dir = os.path.join('models', period_dir)
    plots_dir = os.path.join('plots', period_dir)
    train_cutoff = pd.to_datetime(train_date)
    test_start = pd.to_datetime(test_date)

    for area in areas:
        # Work on a copy: preprocess_data adds a 'date' column in place.
        area_df = df[df['area_name'] == area].copy()
        analyzer = ComprehensiveScoreAnalyzer()
        features, project_scores, total_score, dates = analyzer.preprocess_data(area_df)

        train_mask = dates < train_cutoff
        test_mask = dates >= test_start
        if train_mask.sum() < 2 or test_mask.sum() < 1:
            continue

        try:
            analyzer.fit_project_models(features, project_scores, train_mask)
            analyzer.fit_final_model(project_scores, total_score, train_mask)
            analyzer.save_models(area, output_dir=model_dir)

            all_feature_importance.append(process_feature_importance(analyzer, area))

            project_pred, total_pred = analyzer.predict(features, project_scores, test_mask)
            # Overwrite the observed scores of the test rows with predictions.
            test_data = area_df[test_mask].copy()
            for project in analyzer.project_names:
                test_data[project] = project_pred[project].values
            test_data['all_score'] = total_pred
            all_predictions.append(test_data)

            os.makedirs(plots_dir, exist_ok=True)
            project_fig = analyzer.plot_project_comparisons(features, project_scores, train_mask)
            project_fig.write_image(os.path.join(plots_dir, f"{area}_project_comparisons.png"))
            total_fig = analyzer.plot_total_score_comparison(project_scores, total_score, train_mask)
            total_fig.write_image(os.path.join(plots_dir, f"{area}_total_score_comparison.png"))
        except Exception as exc:
            # Best effort per area: keep going, but leave a trace on stderr so
            # a dropped area is not invisible. stdout is reserved for output.
            print(f"Skipping area {area!r}: {type(exc).__name__}: {exc}", file=sys.stderr)
            continue

    all_predictions_df = None
    if all_predictions:
        all_predictions_df = pd.concat(all_predictions, ignore_index=True)
        all_predictions_df = all_predictions_df.sort_values(['area_name', 'year', 'month'])
        predictions_dir = os.path.join('predictions', period_dir)
        # The directory may not exist when main() is called directly.
        os.makedirs(predictions_dir, exist_ok=True)
        all_predictions_df.to_csv(os.path.join(predictions_dir, 'all_areas_predictions.csv'), index=False)

    importance_df = None
    if all_feature_importance:
        importance_df = pd.DataFrame(all_feature_importance)
        # Feature columns no area reported get an explicit 0.0 column.
        for col in feature_columns:
            if col not in importance_df.columns:
                importance_df[col] = 0.0
        cols_order = ['area_name'] + [col for col in feature_columns if col in importance_df.columns]
        importance_df = importance_df[cols_order]
        importance_dir = os.path.join('feature_importance', period_dir)
        os.makedirs(importance_dir, exist_ok=True)
        importance_df.to_csv(os.path.join(importance_dir, 'all_areas_feature_importance.csv'), index=False)

    return all_predictions_df, importance_df


def load_model(area_name, project=None, base_dir='models'):
    """Load a saved model bundle for one area.

    Args:
        area_name: Sub-directory of *base_dir* holding the area's models.
        project: Project whose ``<project>_model.pkl`` is loaded; when
            ``None`` the area's ``final_model.pkl`` is loaded instead.
        base_dir: Root directory of the saved models.

    Returns:
        Whatever object joblib restores from the selected file.

    Raises:
        FileNotFoundError: If the area directory or the model file is missing.
    """
    area_folder = os.path.join(base_dir, area_name)
    if not os.path.exists(area_folder):
        raise FileNotFoundError(f"未找到地区 {area_name} 的模型目录")

    file_name = 'final_model.pkl' if project is None else f'{project}_model.pkl'
    model_path = os.path.join(area_folder, file_name)
    if os.path.exists(model_path):
        return joblib.load(model_path)
    raise FileNotFoundError(f"未找到模型文件: {model_path}")


def predict_scores(area_name, features_df, base_dir='models'):
    """Predict project scores and the total score from saved models.

    Every ``<project>_model.pkl`` found in the area's model directory is
    applied to the columns of *features_df* that start with ``<project>_``.
    The predicted project scores are then fed to the area's final model.

    Args:
        area_name: Sub-directory of *base_dir* holding the area's models.
        features_df: DataFrame with the prefixed sub-feature columns.
        base_dir: Root directory of the saved models; used both for listing
            and for loading the model files.

    Returns:
        Tuple ``(project_predictions, total_score_prediction)``: a dict of
        project name to predicted score array, and the array of predicted
        total scores.
    """
    suffix = '_model.pkl'
    area_dir = os.path.join(base_dir, area_name)

    project_predictions = {}
    # Sorted so the result does not depend on the file system's listing order.
    for model_file in sorted(os.listdir(area_dir)):
        if not model_file.endswith(suffix) or model_file == 'final_model.pkl':
            continue
        project = model_file[:-len(suffix)]
        # base_dir must be forwarded, otherwise load_model falls back to its
        # own default root and may read a different model tree.
        model_data = load_model(area_name, project, base_dir)
        project_prefix = project + '_'
        project_features = features_df[[col for col in features_df.columns if col.startswith(project_prefix)]]
        X = model_data['scaler'].transform(project_features)
        X_poly = model_data['poly'].transform(X)
        project_predictions[project] = model_data['model'].predict(X_poly)

    project_scores = pd.DataFrame(project_predictions)

    final_model_data = load_model(area_name, base_dir=base_dir)
    # The final scaler expects the project columns in the order it was fitted
    # with, which is generally not the directory listing order. Scalers fitted
    # on a DataFrame record that order in feature_names_in_.
    trained_order = getattr(final_model_data['scaler'], 'feature_names_in_', None)
    if trained_order is not None:
        project_scores = project_scores[list(trained_order)]

    X_scores = final_model_data['scaler'].transform(project_scores)
    X_scores_poly = final_model_data['poly'].transform(X_scores)
    total_score_prediction = final_model_data['model'].predict(X_scores_poly)
    return project_predictions, total_score_prediction


if __name__ == "__main__":
    # The run is configured by a JSON file holding the keys 'train_date',
    # 'test_date' and 'period_months'. Its path may be given as the first
    # command-line argument; without one the original fixed location is used.
    default_config_path = 'H:/develop/dama/java/buliangfanying/PredictedScores.txt'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_config_path
    with open(file_path, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)

    # Make sure the output roots exist before any run writes into them.
    for dir_name in ('models', 'predictions', 'feature_importance', 'plots'):
        os.makedirs(dir_name, exist_ok=True)

    prediction_configs = [
        {
            'train_date': data_dict['train_date'],  # train on rows before this date
            'test_date': data_dict['test_date'],  # predict rows from this date on
            'period_months': data_dict['period_months']
        }
    ]
    for config in prediction_configs:
        run_prediction(
            config['train_date'],
            config['test_date'],
            config['period_months']
        )