response/target/classes/python/PredictedScores.py

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta


class RidgePredictor:
    def __init__(self, base_model_path='models'):
        """
        初始化预测器
        base_model_path: 模型存储的基础路径
        """
        self.base_model_path = base_model_path
        self.scaler = StandardScaler()
        os.makedirs(base_model_path, exist_ok=True)

    def get_project_features(self, df, project):
        """获取项目对应的特征列名"""
        try:
            features = [col for col in df.columns if col.startswith(f'{project}_')]
            if not features:
                raise ValueError(f"No features found for project {project}")
            return features
        except Exception as e:
            print(f"Error in get_project_features: {str(e)}")
            raise

    def print_ridge_formula(self, model, feature_names, scaler):
        """打印岭回归公式"""
        try:
            coefficients = model.coef_
            intercept = model.intercept_

            formula = "y = "
            for i, (coef, name) in enumerate(zip(coefficients, feature_names)):
                if i > 0:
                    formula += " + " if coef >= 0 else " - "
                else:
                    formula += "-" if coef < 0 else ""
                formula += f"{abs(coef):.4f} * {name}"

            formula += f" + {intercept:.4f}"

            # 添加标准化说明
            formula += "注：特征已进行标准化处理，使用以下参数："
            for i, name in enumerate(feature_names):
                formula += f"{name}: mean={scaler.mean_[i]:.4f}, scale={scaler.scale_[i]:.4f}"
            return formula
        except Exception as e:
            print(f"Error in print_ridge_formula: {str(e)}")
            raise

        def get_feature_average(self, df, area, project, year, month, n_months=3):
            """获取前n个月的特征平均值"""
            try:
                feature_cols = self.get_project_features(df, project)

                # 计算前n个月的日期范围
                dates = []
                curr_year, curr_month = year, month
                for _ in range(n_months):
                    curr_month -= 1
                    if curr_month == 0:
                        curr_month = 12
                        curr_year -= 1
                    dates.append((curr_year, curr_month))

                # 获取特征值
                features_list = []
                for year, month in dates:
                    mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
                    if sum(mask) > 0:
                        features_list.append(df[mask][feature_cols].values[0])

                if not features_list:
                    raise ValueError(f"No feature data found for {area} in specified months")

                return np.mean(features_list, axis=0)
            except Exception as e:
                print(f"Error in get_feature_average: {str(e)}")
                raise

        def train_and_evaluate(self, df, area, project, target_year, target_month):
            """训练模型并评估性能"""
            try:
                feature_cols = self.get_project_features(df, project)

                # 准备训练数据
                dates = []
                curr_year, curr_month = target_year, target_month
                for _ in range(12):  # 使用前12个月的数据
                    curr_month -= 1
                    if curr_month == 0:
                        curr_month = 12
                        curr_year -= 1
                    dates.append((curr_year, curr_month))

                train_data = []
                for year, month in dates:
                    mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
                    if sum(mask) > 0:
                        train_data.append(df[mask])

                if not train_data:
                    raise ValueError(f"No training data found for {area}")

                train_df = pd.concat(train_data)

                X = train_df[feature_cols].values
                y = train_df[project].values

                # 数据标准化
                X_scaled = self.scaler.fit_transform(X)

                # 网格搜索最佳参数
                param_grid = {'alpha': np.logspace(-3, 3, 7)}
                ridge = Ridge()
                grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
                grid_search.fit(X_scaled, y)

                # 获取最佳模型
                best_model = grid_search.best_estimator_

                # 计算评估指标
                y_pred = best_model.predict(X_scaled)
                r2 = r2_score(y, y_pred)
                mse = mean_squared_error(y, y_pred)

                # 保存模型和标准化器
                model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
                os.makedirs(model_dir, exist_ok=True)

                model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
                scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")

                joblib.dump(best_model, model_path)
                joblib.dump(self.scaler, scaler_path)

                # 生成岭回归公式
                formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)

                return best_model, self.scaler, r2, mse, formula

            except Exception as e:
                print(f"Error in train_and_evaluate: {str(e)}")
                raise

    def get_feature_average(self, df, area, project, year, month, n_months=3):
        """获取前n个月的特征平均值"""
        try:
            feature_cols = self.get_project_features(df, project)

            # 计算前n个月的日期范围
            dates = []
            curr_year, curr_month = year, month
            for _ in range(n_months):
                curr_month -= 1
                if curr_month == 0:
                    curr_month = 12
                    curr_year -= 1
                dates.append((curr_year, curr_month))

            # 获取特征值
            features_list = []
            for year, month in dates:
                mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
                if sum(mask) > 0:
                    features_list.append(df[mask][feature_cols].values[0])

            if not features_list:
                raise ValueError(f"No feature data found for {area} in specified months")

            return np.mean(features_list, axis=0)
        except Exception as e:
            print(f"Error in get_feature_average: {str(e)}")
            raise

    def train_and_evaluate(self, df, area, project, target_year, target_month):
        """训练模型并评估性能"""
        try:
            feature_cols = self.get_project_features(df, project)

            # 准备训练数据
            dates = []
            curr_year, curr_month = target_year, target_month
            for _ in range(12):  # 使用前12个月的数据
                curr_month -= 1
                if curr_month == 0:
                    curr_month = 12
                    curr_year -= 1
                dates.append((curr_year, curr_month))

            train_data = []
            for year, month in dates:
                mask = (df['area_name'] == area) & (df['year'] == year) & (df['month'] == month)
                if sum(mask) > 0:
                    train_data.append(df[mask])

            if not train_data:
                raise ValueError(f"No training data found for {area}")

            train_df = pd.concat(train_data)

            X = train_df[feature_cols].values
            y = train_df[project].values

            # 数据标准化
            X_scaled = self.scaler.fit_transform(X)

            # 网格搜索最佳参数
            param_grid = {'alpha': np.logspace(-3, 3, 7)}
            ridge = Ridge()
            grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_scaled, y)
            print(f"区域: {area}, 项目: {project} - 当前模型的最佳参数: {grid_search.best_params_}")

            # 获取最佳模型
            best_model = grid_search.best_estimator_

            # 计算评估指标
            y_pred = best_model.predict(X_scaled)
            r2 = r2_score(y, y_pred)
            mse = mean_squared_error(y, y_pred)

            # 保存模型和标准化器
            model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
            os.makedirs(model_dir, exist_ok=True)

            model_path = os.path.join(model_dir, f"model_{target_year}_{target_month}.pkl")
            scaler_path = os.path.join(model_dir, f"scaler_{target_year}_{target_month}.pkl")

            joblib.dump(best_model, model_path)
            joblib.dump(self.scaler, scaler_path)

            # 生成岭回归公式
            formula = self.print_ridge_formula(best_model, feature_cols, self.scaler)

            return best_model, self.scaler, r2, mse, formula

        except Exception as e:
            print(f"Error in train_and_evaluate: {str(e)}")
            raise

    def predict_with_history(self, df, area, project, start_year=2023, start_month=1, end_month=6):
        """使用历史数据进行预测（2023.1-6）"""
        try:
            results = []
            feature_importance = []
            model_metrics = []

            for month in range(start_month, end_month + 1):
                print(f"Predicting {area} - {project} for {start_year}/{month}")

                # 训练模型并评估
                model, scaler, r2, mse, formula = self.train_and_evaluate(
                    df, area, project, start_year, month
                )

                # 获取预测用的特征（前3个月平均）
                test_features = self.get_feature_average(
                    df, area, project, start_year, month
                )

                # 标准化特征并预测
                test_features_scaled = scaler.transform(test_features.reshape(1, -1))
                prediction = model.predict(test_features_scaled)[0]

                # 获取特征重要性
                feature_cols = self.get_project_features(df, project)
                importance = abs(model.coef_)

                # 修改特征重要性的存储格式
                importance_dict = {
                    'area': area,
                    'year': start_year,
                    'month': month,
                }
                # 将特征重要性存储为 project_feature: value 的格式
                for feat, imp in zip(feature_cols, importance):
                    importance_dict[f'{feat}'] = imp

                # 查找是否已存在相同area/year/month的记录
                existing_idx = next((
                    i for i, x in enumerate(feature_importance)
                    if x['area'] == area and x['year'] == start_year and x['month'] == month
                ), None)

                if existing_idx is not None:
                    # 如果存在，更新现有记录
                    feature_importance[existing_idx].update(importance_dict)
                else:
                    # 如果不存在，添加新记录
                    feature_importance.append(importance_dict)

                results.append({
                    'area': area,
                    'project': project,
                    'year': start_year,
                    'month': month,
                    'prediction': prediction,
                    'actual': df[(df['area_name'] == area) &
                                 (df['year'] == start_year) &
                                 (df['month'] == month)][project].values[0]
                })

                model_metrics.append({
                    'area': area,
                    'project': project,
                    'year': start_year,
                    'month': month,
                    'r2': r2,
                    'mse': mse,
                    'formula': formula
                })

            return results, feature_importance, model_metrics

        except Exception as e:
            print(f"Error in predict_with_history: {str(e)}")
            raise
    def predict_future(self, df, area, project, start_month=7, end_month=9, year=2023):
        """预测未来月份（2023.7-9）"""
        try:
            results = []

            # 加载6月的模型
            model_dir = os.path.join(self.base_model_path, f"{area}_{project}")
            model = joblib.load(os.path.join(model_dir, f"model_{year}_6.pkl"))
            scaler = joblib.load(os.path.join(model_dir, f"scaler_{year}_6.pkl"))

            for month in range(start_month, end_month + 1):
                print(f"Predicting future {area} - {project} for {year}/{month}")

                if month == 7:
                    # 使用4,5,6月的特征平均值
                    test_features = np.mean([
                        self.get_feature_average(df, area, project, year, m)
                        for m in [4, 5, 6]
                    ], axis=0)
                else:
                    # 使用前三个预测结果的特征
                    test_features = np.mean([
                        self.get_feature_average(df, area, project, year, m)
                        for m in range(month - 3, month)
                    ], axis=0)

                test_features_scaled = scaler.transform(test_features.reshape(1, -1))
                prediction = model.predict(test_features_scaled)[0]

                results.append({
                    'area': area,
                    'project': project,
                    'year': year,
                    'month': month,
                    'prediction': prediction
                })

            return results

        except Exception as e:
            print(f"Error in predict_future: {str(e)}")
            raise

    def format_all_predictions(all_results, all_future_results):
        """
        将所有预测结果格式化为指定格式
        """
        # 创建一个字典来存储所有结果
        formatted_data = {}

        # 合并所有结果
        all_predictions = all_results + all_future_results

        # 处理每个预测结果
        for result in all_predictions:
            key = (result['area'], result['year'], result['month'])

            if key not in formatted_data:
                formatted_data[key] = {
                    'area_name': result['area'],
                    'year': result['year'],
                    'month': result['month'],
                    'xxzs': 0,
                    'zfjc': 0,
                    'tsjb': 0,
                    'xzcf': 0,
                    'cckh': 0,
                    'cjjc': 0,
                    'zhxzb': 0
                }

            # 将预测值转换为百分比并保存
            project = result['project']
            formatted_data[key][project] = round(result['prediction'] * 100, 8)

        # 转换为列表并排序
        formatted_list = list(formatted_data.values())
        formatted_list.sort(key=lambda x: (x['area_name'], x['year'], x['month']))

        return formatted_list

    def update_predictions(original_data_path, new_data_path):
        """
        使用新数据更新预测
        original_data_path: 原始数据文件路径
        new_data_path: 新数据文件路径
        """
        try:
            # 读取原始数据和新数据
            original_df = pd.read_excel(original_data_path)
            new_df = pd.read_excel(new_data_path)

            # 确保新数据格式与原始数据一致
            required_cols = set(original_df.columns)
            if not set(new_df.columns).issuperset(required_cols):
                raise ValueError("New data missing required columns")

            # 合并数据并去重（保留最新）
            updated_df = pd.concat([original_df, new_df], ignore_index=True)
            updated_df = updated_df.drop_duplicates(
                subset=['area_name', 'year', 'month'],
                keep='last'
            ).reset_index(drop=True)

            # 初始化预测器
            predictor = RidgePredictor()

            # 存储所有结果
            all_results = []
            all_importance = []
            all_metrics = []
            all_future_results = []

            # 获取所有地区和项目
            areas = updated_df['area_name'].unique()
            projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']

            # 获取最新的时间点
            latest_date = pd.to_datetime(
                updated_df['year'].astype(str) + '-' +
                updated_df['month'].astype(str) + '-01'
            ).max()

            # 对每个地区进行预测
            for area in areas:
                print(f"Updating predictions for {area}")
                current_area_importance = None

                for project in projects:
                    print(f"Processing project: {project}")

                    # 使用更新后的数据重新训练并预测
                    results, importance, metrics = predictor.predict_with_history(
                        updated_df,
                        area,
                        project,
                        start_year=latest_date.year,
                        start_month=latest_date.month,
                        end_month=latest_date.month
                    )
                    all_results.extend(results)

                    # 更新特征重要性
                    if current_area_importance is None:
                        current_area_importance = importance
                    else:
                        current_area_importance = [
                            dict(current, **new)
                            for current, new in zip(current_area_importance, importance)
                        ]

                    all_metrics.extend(metrics)

                    # 预测未来3个月
                    future_results = predictor.predict_future(
                        updated_df,
                        area,
                        project,
                        start_month=latest_date.month + 1,
                        end_month=latest_date.month + 3,
                        year=latest_date.year
                    )
                    all_future_results.extend(future_results)

                if current_area_importance:
                    all_importance.extend(current_area_importance)

            # 保存更新后的结果
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

            # 保存更新后的完整数据集
            updated_df.to_excel(f'updated_dataset_{timestamp}.xlsx', index=False)

            # 保存新的预测结果
            pd.DataFrame(all_results).to_excel(
                f'updated_predictions_{timestamp}.xlsx', index=False
            )

            # 保存新的特征重要性
            importance_df = pd.DataFrame(all_importance)
            cols = ['area', 'year', 'month']
            feature_cols = [col for col in importance_df.columns if col not in cols]
            importance_df = importance_df[cols + sorted(feature_cols)]
            importance_df.to_excel(
                f'updated_feature_importance_{timestamp}.xlsx', index=False
            )

            # 保存新的模型评估指标
            pd.DataFrame(all_metrics).to_excel(
                f'updated_model_metrics_{timestamp}.xlsx', index=False
            )

            # 保存新的未来预测结果
            pd.DataFrame(all_future_results).to_excel(
                f'updated_future_predictions_{timestamp}.xlsx', index=False
            )

            print("Update and prediction completed successfully!")
            return updated_df, all_results, all_future_results

        except Exception as e:
            print(f"Error in update_predictions: {str(e)}")
            raise

def main():
    try:
        # 读取数据
#         df = pd.read_excel('test.xlsx')
        df = pd.read_csv('H:\公司-文件\公司-维域-文件\buliang-main\buliang\src\main\resources\python\data.csv')

        # 初始化预测器
        predictor = RidgePredictor()

        # 存储所有结果
        all_results = []
        all_importance = []
        all_metrics = []
        all_future_results = []

        # 获取所有地区和项目
        areas = df['area_name'].unique()
        projects = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']

        # 对每个地区进行预测
        for area in areas:
            print(f"Processing{area}")
            # 初始化当前地区的特征重要性列表
            current_area_importance = None

            # 对每个项目进行预测
            for project in projects:
                print(f"Processing project: {project}")

                # 预测2023.1-6
                results, importance, metrics = predictor.predict_with_history(
                    df, area, project
                )
                all_results.extend(results)

                # 更新当前地区的特征重要性
                if current_area_importance is None:
                    current_area_importance = importance
                else:
                    # 合并特征重要性
                    current_area_importance = [
                        dict(current, **new)
                        for current, new in zip(current_area_importance, importance)
                    ]

                all_metrics.extend(metrics)

                # 预测2023.7-9
                future_results = predictor.predict_future(df, area, project)
                all_future_results.extend(future_results)

            # 将该地区的所有特征重要性添加到总结果中
            if current_area_importance:
                all_importance.extend(current_area_importance)

            # 保存结果
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # 转换预测结果格式
        def format_predictions(results_list):
            formatted_data = {}
            for result in results_list:
                key = (result['area'], result['year'], result['month'])
                if key not in formatted_data:
                    formatted_data[key] = {
                        'area_name': result['area'],
                        'year': result['year'],
                        'month': result['month'],
                        'xxzs': 0,
                        'zfjc': 0,
                        'tsjb': 0,
                        'xzcf': 0,
                        'cckh': 0,
                        'cjjc': 0,
                        'zhxzb': 0
                    }
                formatted_data[key][result['project']] = round(result['prediction'] , 8)
            return list(formatted_data.values())

        # 合并并保存所有预测结果
        all_predictions = format_predictions(all_results + all_future_results)
        predictions_df = pd.DataFrame(all_predictions)
        predictions_df = predictions_df[['area_name', 'year', 'month',
                                      'xxzs', 'zfjc', 'tsjb', 'xzcf',
                                      'cckh', 'cjjc', 'zhxzb']]
        predictions_df.to_excel(f'all_predictions_{timestamp}.xlsx', index=False)


        # 保存1-6月预测结果
        pd.DataFrame(all_results).to_excel(
            f'predictions_1_6_{timestamp}.xlsx', index=False
        )

        # 保存特征重要性
        importance_df = pd.DataFrame(all_importance)
        # 确保列的顺序
        cols = ['area', 'year', 'month']
        feature_cols = [col for col in importance_df.columns if col not in cols]
        importance_df = importance_df[cols + sorted(feature_cols)]
        importance_df.to_excel(
            f'feature_importance_{timestamp}.xlsx', index=False
        )

        # 保存模型评估指标
        pd.DataFrame(all_metrics).to_excel(
            f'model_metrics_{timestamp}.xlsx', index=False
        )

        # 保存7-9月预测结果
        pd.DataFrame(all_future_results).to_excel(
            f'predictions_7_9_{timestamp}.xlsx', index=False
        )

        print("Prediction completed successfully!")

        # 当有新数据时，可以运行更新预测
        # update_predictions('test.xlsx', 'new_data.xlsx')
    except Exception as e:
        print(f"Error in main function: {str(e)}")
        raise


if __name__ == "__main__":
    main()