response/target/classes/python/PredictedScores_old.py
2025-07-17 10:41:24 +08:00

508 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import joblib
import json
import sys
# Reconfigure the standard output stream to use UTF-8 encoding
sys.stdout.reconfigure(encoding='utf-8')
class ComprehensiveScoreAnalyzer:
    """Two-stage Ridge pipeline: sub-features -> project scores -> total score.

    Stage one fits, for every project in ``project_names``, a Ridge model on
    the standardised, degree-2 polynomial expansion of that project's
    ``<project>_*`` feature columns.  Stage two fits one Ridge model that maps
    the standardised, degree-2 expanded project scores to ``all_score``.
    """

    def __init__(self):
        self.project_models = {}
        self.final_model = None
        self.feature_scalers = {}
        self.score_scaler = StandardScaler()
        self.project_polys = {}
        self.score_poly = None
        self.project_names = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
        self.feature_importance = {}
        # Column-name prefix that identifies each project's sub-features.
        self.project_features = {name: f'{name}_' for name in self.project_names}

    def preprocess_data(self, df):
        """Split ``df`` into per-project features, project scores and total score.

        Side effects:
            * adds a ``date`` column (built from ``year`` and ``month``) to ``df``;
            * narrows ``self.project_names`` to the projects that actually have
              sub-feature columns, so the fit/predict/plot/save loops never
              look up a project that is missing from ``features``.

        Args:
            df: DataFrame with ``year``, ``month``, ``all_score``, one score
                column per project and ``<project>_*`` feature columns.

        Returns:
            Tuple ``(features, project_scores, total_score, dates)`` where
            ``features`` maps project name -> feature DataFrame,
            ``project_scores`` holds one jittered score column per usable
            project, ``total_score`` is ``df['all_score']`` and ``dates`` is
            ``df['date']``.
        """
        df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str))

        features = {}
        for project, prefix in self.project_features.items():
            project_cols = [col for col in df.columns
                            if col.startswith(prefix) and col != project]
            if project_cols:
                features[project] = df[project_cols]

        usable = [project for project in self.project_names
                  if project in features and not features[project].empty]
        # Keep the instance in sync with the data; otherwise a project without
        # sub-features triggers a KeyError later in fit_project_models.
        self.project_names = usable

        project_scores = df[usable].copy()
        if usable:
            # Small Gaussian jitter on the targets.
            # NOTE(review): presumably added so constant score columns do not
            # break scaling/fitting; it makes results non-reproducible unless
            # the caller seeds numpy's global RNG.
            noise = np.random.normal(0, 0.01, project_scores.shape)
            project_scores = project_scores + noise

        total_score = df['all_score']
        return features, project_scores, total_score, df['date']

    def optimize_model(self, X, y, project_name=None, max_splits=5):
        """Grid-search the Ridge ``alpha`` with time-ordered cross-validation.

        Args:
            X: 2-D training matrix (rows in chronological order).
            y: Training targets aligned with ``X``.
            project_name: Label of the model being tuned; currently unused.
            max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.
                The actual number is reduced for short series, because
                ``TimeSeriesSplit`` needs more samples than splits.

        Returns:
            The best ``Ridge`` estimator, refit on all of ``X``/``y``.

        Raises:
            ValueError: If fewer than 3 samples are available, since at least
                two splits are required.
        """
        n_samples = len(X)
        n_splits = min(max_splits, n_samples - 1)
        if n_splits < 2:
            raise ValueError(
                f"Need at least 3 training samples for time-series "
                f"cross-validation, got {n_samples}"
            )

        param_grid = {
            'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            'fit_intercept': [True],
            'solver': ['auto'],
        }
        grid_search = GridSearchCV(
            Ridge(),
            param_grid,
            cv=TimeSeriesSplit(n_splits=n_splits),
            scoring='neg_mean_squared_error',
            n_jobs=1,
            verbose=0,
        )
        grid_search.fit(X, y)
        return grid_search.best_estimator_

    def _expand_project_features(self, project, features, mask):
        """Return the scaled, polynomial-expanded features of ``project`` for ``mask`` rows."""
        scaled = self.feature_scalers[project].transform(features[project][mask])
        return self.project_polys[project].transform(scaled)

    @staticmethod
    def _value_bounds(y_true, y_pred):
        """Return ``(lowest, highest)`` value found across both sequences."""
        lowest = min(min(y_true), min(y_pred))
        highest = max(max(y_true), max(y_pred))
        return lowest, highest

    def fit_project_models(self, features, project_scores, train_mask):
        """Fit one scaler, polynomial expansion and Ridge model per project.

        Also stores, per project, a DataFrame of absolute coefficients sorted
        in descending order in ``self.feature_importance``.

        Args:
            features: Mapping project name -> feature DataFrame.
            project_scores: DataFrame with one score column per project.
            train_mask: Boolean mask selecting the training rows.
        """
        for project in self.project_names:
            project_features = features[project]
            scaler = StandardScaler()
            poly = PolynomialFeatures(degree=2, include_bias=False)
            self.feature_scalers[project] = scaler
            self.project_polys[project] = poly

            X_train_poly = poly.fit_transform(
                scaler.fit_transform(project_features[train_mask])
            )
            y_train = project_scores[project][train_mask]
            model = self.optimize_model(X_train_poly, y_train, project)
            self.project_models[project] = model

            self.feature_importance[project] = pd.DataFrame({
                'feature': poly.get_feature_names_out(project_features.columns),
                'coefficient': np.abs(model.coef_),
            }).sort_values('coefficient', ascending=False)

    def fit_final_model(self, project_scores, total_score, train_mask):
        """Fit the stage-two model mapping project scores to the total score.

        Args:
            project_scores: DataFrame with one score column per project.
            total_score: Series of total scores aligned with ``project_scores``.
            train_mask: Boolean mask selecting the training rows.
        """
        X_scores = self.score_scaler.fit_transform(project_scores[train_mask])
        self.score_poly = PolynomialFeatures(degree=2, include_bias=False)
        X_scores_poly = self.score_poly.fit_transform(X_scores)
        self.final_model = self.optimize_model(X_scores_poly, total_score[train_mask])

    def predict(self, features, project_scores, test_mask):
        """Predict project scores and the total score for the ``test_mask`` rows.

        NOTE(review): the total score is predicted from the ``project_scores``
        passed in, not from the stage-one predictions computed here, whereas
        ``predict_scores`` chains the two stages. Confirm this is intended.

        Args:
            features: Mapping project name -> feature DataFrame.
            project_scores: DataFrame with one score column per project.
            test_mask: Boolean mask selecting the rows to predict.

        Returns:
            Tuple ``(project_predictions, total_score_pred)``: a DataFrame with
            one predicted column per project and an array of total scores.
        """
        project_predictions = pd.DataFrame()
        for project in self.project_names:
            X_test_poly = self._expand_project_features(project, features, test_mask)
            project_predictions[project] = self.project_models[project].predict(X_test_poly)

        X_scores_test = self.score_scaler.transform(project_scores[test_mask])
        X_scores_test_poly = self.score_poly.transform(X_scores_test)
        total_score_pred = self.final_model.predict(X_scores_test_poly)
        return project_predictions, total_score_pred

    def plot_project_comparisons(self, features, project_scores, train_mask):
        """Build a grid of actual-vs-predicted scatter plots, one per project.

        Each subplot shows the ``train_mask`` rows, a dashed ``y = x``
        reference line and the R² of the fit.

        Returns:
            The plotly figure.
        """
        n_cols = 3
        n_rows = (len(self.project_names) + n_cols - 1) // n_cols
        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=self.project_names,
            vertical_spacing=0.22,
            horizontal_spacing=0.1,
        )

        for i, project in enumerate(self.project_names):
            row = i // n_cols + 1
            col = i % n_cols + 1

            y_true = project_scores[project][train_mask]
            y_pred = self.project_models[project].predict(
                self._expand_project_features(project, features, train_mask)
            )
            r2 = r2_score(y_true, y_pred)

            fig.add_trace(
                go.Scatter(
                    x=y_true,
                    y=y_pred,
                    mode='markers',
                    name=project,
                    marker=dict(
                        size=8,
                        color='rgba(8, 81, 156, 0.6)',
                        line=dict(color='rgba(8, 81, 156, 0.9)', width=1),
                    ),
                    showlegend=False,
                ),
                row=row, col=col,
            )

            min_val, max_val = self._value_bounds(y_true, y_pred)
            fig.add_trace(
                go.Scatter(
                    x=[min_val, max_val],
                    y=[min_val, max_val],
                    mode='lines',
                    line=dict(color='red', dash='dash'),
                    showlegend=False,
                ),
                row=row, col=col,
            )

            # NOTE(review): with an axis id as xref/yref the position is in
            # data coordinates, so (0.05, 0.95) may fall outside the axis
            # ranges set below - confirm whether a domain-relative position
            # was intended.
            fig.add_annotation(
                text=f'R² = {r2:.4f}',
                xref=f'x{i + 1}',
                yref=f'y{i + 1}',
                x=0.05,
                y=0.95,
                showarrow=False,
                font=dict(size=10),
                xanchor='left',
                yanchor='top',
            )

            pad = 0.1 * (max_val - min_val)
            axis_range = [min_val - pad, max_val + pad]
            fig.update_xaxes(range=axis_range, title_text="实际值", row=row, col=col)
            fig.update_yaxes(range=axis_range, title_text="预测值", row=row, col=col)

        fig.update_layout(
            height=300 * n_rows,
            width=1000,
            title_text="各项目得分预测结果对比",
            showlegend=False,
            template='plotly_white',
        )
        return fig

    def plot_total_score_comparison(self, project_scores, total_score, train_mask):
        """Build an actual-vs-predicted scatter plot for the total score.

        Uses the ``train_mask`` rows and annotates the figure with R².

        Returns:
            The plotly figure.
        """
        y_true = total_score[train_mask]
        X_scores = self.score_scaler.transform(project_scores[train_mask])
        y_pred = self.final_model.predict(self.score_poly.transform(X_scores))
        r2 = r2_score(y_true, y_pred)

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=y_true,
                y=y_pred,
                mode='markers',
                name='预测值',
                marker=dict(
                    size=10,
                    color='rgba(255, 0, 0, 0.6)',
                    line=dict(color='rgba(255, 0, 0, 0.9)', width=1),
                ),
            )
        )

        min_val, max_val = self._value_bounds(y_true, y_pred)
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='y=x',
            )
        )
        fig.add_annotation(
            text=f'R² = {r2:.4f}',
            xref='paper',
            yref='paper',
            x=0.05,
            y=0.95,
            showarrow=False,
            font=dict(size=14),
        )
        fig.update_layout(
            title_text="总分预测结果对比",
            xaxis_title="实际值",
            yaxis_title="预测值",
            template='plotly_white',
            width=800,
            height=600,
        )
        return fig

    def save_models(self, area_name, output_dir='models'):
        """Persist every fitted model bundle under ``output_dir/area_name``.

        Writes one ``<project>_model.pkl`` per project plus ``final_model.pkl``;
        each file holds a dict with the keys ``model``, ``scaler`` and ``poly``.

        Args:
            area_name: Sub-directory name for this area.
            output_dir: Root directory; created if missing.
        """
        area_dir = os.path.join(output_dir, area_name)
        os.makedirs(area_dir, exist_ok=True)

        for project in self.project_names:
            model_data = {
                'model': self.project_models[project],
                'scaler': self.feature_scalers[project],
                'poly': self.project_polys[project],
            }
            joblib.dump(model_data, os.path.join(area_dir, f'{project}_model.pkl'))

        final_model_data = {
            'model': self.final_model,
            'scaler': self.score_scaler,
            'poly': self.score_poly,
        }
        joblib.dump(final_model_data, os.path.join(area_dir, 'final_model.pkl'))
def process_feature_importance(analyzer, area):
    """Collect the importance of each original (degree-1) feature for one area.

    The per-project importance tables also contain the polynomial terms
    produced by the degree-2 expansion.  Interaction terms are named
    ``"a b"`` and power terms ``"a^2"``, so both a space and a caret mark a
    derived term; only names containing neither are kept.
    NOTE(review): this assumes no raw feature column name contains a space
    or ``^`` - confirm against the data set.

    Args:
        analyzer: Object exposing ``project_names`` and a
            ``feature_importance`` mapping of project name -> DataFrame with
            ``feature`` and ``coefficient`` columns.
        area: Area name stored under the ``area_name`` key.

    Returns:
        Dict with ``area_name`` plus one ``feature -> coefficient`` entry per
        original feature.
    """
    importance_dict = {'area_name': area}
    for project in analyzer.project_names:
        importance_df = analyzer.feature_importance[project]
        names = importance_df['feature'].astype(str)
        is_original = ~names.str.contains(r'[ ^]', regex=True)
        original_features = importance_df[is_original]
        importance_dict.update(zip(
            original_features['feature'].tolist(),
            original_features['coefficient'].tolist(),
        ))
    return importance_dict
def run_prediction(train_date, test_date, period_months):
    """Run ``main`` for one forecasting window and print the result as JSON.

    Creates the per-period output directories, calls ``main`` and prints a
    JSON document with the keys ``predictions`` and ``feature_importance``
    (each a list of records) to standard output.  ``main`` returns ``None``
    for a frame when no area produced results; that is emitted as an empty
    list instead of failing on ``None.to_json``.

    Args:
        train_date: Training cut-off date passed through to ``main``.
        test_date: Start date of the prediction window passed to ``main``.
        period_months: Window length; used to name the ``"<n>个月"`` directory.
    """
    period_dir = f"{period_months}个月"
    for base_dir in ('models', 'predictions', 'feature_importance', 'plots'):
        os.makedirs(os.path.join(base_dir, period_dir), exist_ok=True)

    def to_records(frame):
        """Convert a DataFrame (or ``None``) into JSON-compatible records."""
        if frame is None:
            return []
        return json.loads(frame.to_json(orient='records'))

    try:
        predictions, feature_importance = main(train_date, test_date, period_dir)
        result = {
            "predictions": to_records(predictions),
            "feature_importance": to_records(feature_importance),
        }
        print(json.dumps(result, ensure_ascii=False, indent=4))
    except Exception as e:
        # Top-level boundary: report the failure in place of the JSON payload.
        print(f"{period_months}个月预测执行出错: {str(e)}")
def main(train_date='2023-01-01', test_date='2023-06-01', period_dir='6个月',
         data_path='H:/develop/dama/java/buliangfanying/target/classes/python/data.csv'):
    """Train, persist and evaluate the score models for every area.

    For each distinct ``area_name`` in the data set the function fits the
    per-project and total-score models on rows dated before ``train_date``,
    saves them, predicts the rows dated on or after ``test_date`` and writes
    comparison plots.  Areas with fewer than 2 training rows or no test rows
    are skipped.  An area whose processing raises is reported on standard
    error and skipped, so one bad area does not abort the run and standard
    output stays free for the caller's own output.

    Args:
        train_date: Exclusive upper bound of the training period.
        test_date: Inclusive lower bound of the prediction period.
        period_dir: Sub-directory name used under ``models``, ``predictions``,
            ``feature_importance`` and ``plots``.
        data_path: CSV file to read; defaults to the original fixed location.

    Returns:
        Tuple ``(all_predictions_df, importance_df)``; either element is
        ``None`` when no area produced the corresponding result.
    """
    df = pd.read_csv(data_path)
    areas = df['area_name'].unique()

    all_predictions = []
    all_feature_importance = []

    # Feature columns are recognised by their project prefix.
    prefixes = ('xxzs_', 'zfjc_', 'tsjb_', 'xzcf_', 'cckh_', 'cjjc_', 'zhxzb_')
    feature_columns = [col for col in df.columns if col.startswith(prefixes)]

    train_cutoff = pd.to_datetime(train_date)
    test_start = pd.to_datetime(test_date)
    model_dir = os.path.join('models', period_dir)
    plots_dir = os.path.join('plots', period_dir)

    for area in areas:
        area_df = df[df['area_name'] == area].copy()
        analyzer = ComprehensiveScoreAnalyzer()
        features, project_scores, total_score, dates = analyzer.preprocess_data(area_df)

        train_mask = dates < train_cutoff
        test_mask = dates >= test_start
        if train_mask.sum() < 2 or test_mask.sum() < 1:
            # Not enough data to train or nothing to predict for this area.
            continue

        try:
            analyzer.fit_project_models(features, project_scores, train_mask)
            analyzer.fit_final_model(project_scores, total_score, train_mask)
            analyzer.save_models(area, output_dir=model_dir)

            all_feature_importance.append(process_feature_importance(analyzer, area))

            project_pred, total_pred = analyzer.predict(features, project_scores, test_mask)

            # Overwrite the observed scores of the test rows with predictions.
            test_data = area_df[test_mask].copy()
            for project in analyzer.project_names:
                test_data[project] = project_pred[project].values
            test_data['all_score'] = total_pred
            all_predictions.append(test_data)

            os.makedirs(plots_dir, exist_ok=True)
            project_fig = analyzer.plot_project_comparisons(features, project_scores, train_mask)
            project_fig.write_image(os.path.join(plots_dir, f"{area}_project_comparisons.png"))
            total_fig = analyzer.plot_total_score_comparison(project_scores, total_score, train_mask)
            total_fig.write_image(os.path.join(plots_dir, f"{area}_total_score_comparison.png"))
        except Exception as exc:
            # Best effort per area: keep going, but leave a trace on stderr
            # instead of discarding the failure.
            print(f"处理 {area} 时发生错误: {exc}", file=sys.stderr)
            continue

    all_predictions_df = None
    if all_predictions:
        all_predictions_df = pd.concat(all_predictions, ignore_index=True)
        all_predictions_df = all_predictions_df.sort_values(['area_name', 'year', 'month'])
        predictions_dir = os.path.join('predictions', period_dir)
        os.makedirs(predictions_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(predictions_dir, 'all_areas_predictions.csv'), index=False)

    importance_df = None
    if all_feature_importance:
        importance_df = pd.DataFrame(all_feature_importance)
        # Features that no area reported get an explicit zero column.
        for col in feature_columns:
            if col not in importance_df.columns:
                importance_df[col] = 0.0
        importance_df = importance_df[['area_name'] + feature_columns]
        importance_dir = os.path.join('feature_importance', period_dir)
        os.makedirs(importance_dir, exist_ok=True)
        importance_df.to_csv(
            os.path.join(importance_dir, 'all_areas_feature_importance.csv'), index=False)

    return all_predictions_df, importance_df
def load_model(area_name, project=None, base_dir='models'):
    """Load a persisted model bundle for an area.

    Args:
        area_name: Name of the area sub-directory under ``base_dir``.
        project: Project whose ``<project>_model.pkl`` should be loaded;
            ``None`` selects ``final_model.pkl``.
        base_dir: Root directory that contains the area directories.

    Returns:
        Whatever object ``joblib.load`` reads from the selected file.

    Raises:
        FileNotFoundError: If the area directory or the model file is missing.
    """
    directory = os.path.join(base_dir, area_name)
    if not os.path.exists(directory):
        raise FileNotFoundError(f"未找到地区 {area_name} 的模型目录")

    file_name = 'final_model.pkl' if project is None else f'{project}_model.pkl'
    model_path = os.path.join(directory, file_name)
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"未找到模型文件: {model_path}")

    # NOTE(review): joblib files are pickle-based; only load trusted files.
    return joblib.load(model_path)
def predict_scores(area_name, features_df, base_dir='models'):
    """Predict project scores and the total score from saved models.

    Every ``<project>_model.pkl`` found in ``base_dir/area_name`` is applied
    to the ``<project>_*`` columns of ``features_df``; the resulting project
    predictions are then fed to ``final_model.pkl``.

    Args:
        area_name: Area whose models should be used.
        features_df: DataFrame holding the prefixed feature columns.
        base_dir: Root directory of the saved models.  It is forwarded to
            ``load_model`` so listing and loading use the same directory.

    Returns:
        Tuple ``(project_predictions, total_score_prediction)``: a dict of
        project name -> predicted array, and the predicted total scores.
    """
    def align(frame, fitted):
        """Reorder ``frame`` columns to the order ``fitted`` was trained with.

        Only applied when the transformer recorded its input names and they
        are exactly the columns of ``frame``; otherwise ``frame`` is returned
        unchanged and the transformer performs its own validation.
        """
        expected = getattr(fitted, 'feature_names_in_', None)
        if expected is None:
            return frame
        expected = list(expected)
        if len(expected) == len(frame.columns) and set(expected) == set(frame.columns):
            return frame[expected]
        return frame

    suffix = '_model.pkl'
    area_dir = os.path.join(base_dir, area_name)
    # os.listdir order is arbitrary; sort for a deterministic run.
    model_files = sorted(
        name for name in os.listdir(area_dir)
        if name.endswith(suffix) and name != 'final_model.pkl'
    )

    project_predictions = {}
    for model_file in model_files:
        project = model_file[:-len(suffix)]
        model_data = load_model(area_name, project, base_dir)
        project_prefix = project + '_'
        project_features = features_df[
            [col for col in features_df.columns if col.startswith(project_prefix)]
        ]
        project_features = align(project_features, model_data['scaler'])
        X = model_data['scaler'].transform(project_features)
        X_poly = model_data['poly'].transform(X)
        project_predictions[project] = model_data['model'].predict(X_poly)

    final_model_data = load_model(area_name, None, base_dir)
    # The final scaler was fitted on project columns in training order, which
    # need not match the file-listing order used above.
    project_scores = align(pd.DataFrame(project_predictions), final_model_data['scaler'])
    X_scores = final_model_data['scaler'].transform(project_scores)
    X_scores_poly = final_model_data['poly'].transform(X_scores)
    total_score_prediction = final_model_data['model'].predict(X_scores_poly)
    return project_predictions, total_score_prediction
if __name__ == "__main__":
    # The run configuration is a JSON file with the keys ``train_date``,
    # ``test_date`` and ``period_months``.  Its path may be given as the first
    # command-line argument; without one the original fixed location is used.
    default_config_path = 'H:/develop/dama/java/buliangfanying/PredictedScores.txt'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_config_path
    with open(file_path, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)

    for dir_name in ('models', 'predictions', 'feature_importance', 'plots'):
        os.makedirs(dir_name, exist_ok=True)

    # One entry per forecasting window; add more dicts to run several windows
    # (e.g. train before 2023-04-01, predict from 2023-06-01, period_months=3).
    prediction_configs = [
        {
            'train_date': data_dict['train_date'],
            'test_date': data_dict['test_date'],
            'period_months': data_dict['period_months'],
        },
    ]
    for config in prediction_configs:
        run_prediction(
            config['train_date'],
            config['test_date'],
            config['period_months'],
        )