508 lines
21 KiB
Python
508 lines
21 KiB
Python
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.preprocessing import StandardScaler
|
||
from sklearn.preprocessing import PolynomialFeatures
|
||
from sklearn.linear_model import Ridge
|
||
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
|
||
from sklearn.metrics import r2_score
|
||
import plotly.graph_objects as go
|
||
from plotly.subplots import make_subplots
|
||
import os
|
||
import joblib
|
||
import json
|
||
import sys
|
||
|
||
|
||
# Reconfigure the standard output stream to use UTF-8 encoding
|
||
sys.stdout.reconfigure(encoding='utf-8')
|
||
|
||
class ComprehensiveScoreAnalyzer:
    """Two-stage polynomial Ridge pipeline for project scores and a total score.

    Stage one fits, for every project, a model that maps the project's
    sub-feature columns (scaled, then expanded to degree-2 polynomial terms)
    to the project score.  Stage two fits a model of the same shape that maps
    the project scores to the total score (``all_score``).
    """

    def __init__(self):
        self.project_models = {}    # project name -> fitted Ridge estimator
        self.final_model = None     # Ridge estimator for the total score
        self.feature_scalers = {}   # project name -> StandardScaler of its features
        self.score_scaler = StandardScaler()
        self.project_polys = {}     # project name -> PolynomialFeatures of its features
        self.score_poly = None      # PolynomialFeatures over the project scores
        self.project_names = ['xxzs', 'zfjc', 'tsjb', 'xzcf', 'cckh', 'cjjc', 'zhxzb']
        self.feature_importance = {}  # project name -> DataFrame(feature, coefficient)

        # Column-name prefix that identifies the sub-features of each project.
        self.project_features = {
            'xxzs': 'xxzs_',
            'zfjc': 'zfjc_',
            'tsjb': 'tsjb_',
            'xzcf': 'xzcf_',
            'cckh': 'cckh_',
            'cjjc': 'cjjc_',
            'zhxzb': 'zhxzb_'
        }

    def preprocess_data(self, df):
        """Split *df* into per-project feature frames, project scores and total score.

        A ``date`` column built from ``year`` and ``month`` is added to *df*
        in place.  ``self.project_names`` is narrowed to the projects that
        actually have sub-feature columns, so the later fitting and prediction
        steps never look up a project that has no features.

        Args:
            df: Frame with ``year``, ``month``, ``all_score``, one column per
                project name and the prefixed sub-feature columns.

        Returns:
            Tuple ``(features, project_scores, total_score, dates)`` where
            ``features`` maps project name to its sub-feature frame.
        """
        df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str))

        features = {}
        for project, prefix in self.project_features.items():
            project_cols = [col for col in df.columns
                            if col.startswith(prefix) and col != project]
            if project_cols:
                features[project] = df[project_cols]

        usable_projects = [
            project for project in self.project_names
            if project in features and not features[project].empty
        ]
        project_scores = pd.DataFrame()
        for project in usable_projects:
            project_scores[project] = df[project]
        # Keep the project list consistent with the data that is really
        # available; otherwise features[project] raises KeyError downstream.
        self.project_names = usable_projects

        # Small Gaussian jitter on the scores.
        # NOTE(review): presumably this keeps constant score columns from
        # having zero variance; it is unseeded, so runs are not reproducible.
        noise = np.random.normal(0, 0.01, project_scores.shape)
        project_scores += noise

        total_score = df['all_score']
        return features, project_scores, total_score, df['date']

    def optimize_model(self, X, y, project_name=None, max_splits=5):
        """Select a Ridge ``alpha`` by time-series cross-validation and fit it.

        Args:
            X: Training design matrix.
            y: Training targets.
            project_name: Label of the model being tuned (informational only).
            max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

        Returns:
            The fitted Ridge estimator (best one found by the grid search).
        """
        param_grid = {
            'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            'fit_intercept': [True],
            'solver': ['auto']
        }

        # TimeSeriesSplit needs at least 2 splits and more samples than
        # splits, so shrink the fold count for short training histories.
        n_splits = min(max_splits, len(X) - 1)
        if n_splits < 2:
            # Too few rows to cross-validate: fit a single default-alpha Ridge.
            return Ridge(alpha=1.0, fit_intercept=True, solver='auto').fit(X, y)

        grid_search = GridSearchCV(
            Ridge(),
            param_grid,
            cv=TimeSeriesSplit(n_splits=n_splits),
            scoring='neg_mean_squared_error',
            n_jobs=1,
            verbose=0
        )
        grid_search.fit(X, y)
        return grid_search.best_estimator_

    def fit_project_models(self, features, project_scores, train_mask):
        """Fit one scaler + polynomial expansion + Ridge model per project.

        Also records, per project, the absolute Ridge coefficients of every
        polynomial term in ``self.feature_importance`` (sorted descending).
        """
        for project in self.project_names:
            project_features = features[project]
            scaler = StandardScaler()
            poly = PolynomialFeatures(degree=2, include_bias=False)
            self.feature_scalers[project] = scaler
            self.project_polys[project] = poly

            X_train = scaler.fit_transform(project_features[train_mask])
            X_train_poly = poly.fit_transform(X_train)
            y_train = project_scores[project][train_mask]

            model = self.optimize_model(X_train_poly, y_train, project)
            self.project_models[project] = model

            importance = pd.DataFrame({
                'feature': poly.get_feature_names_out(project_features.columns),
                'coefficient': abs(model.coef_)
            }).sort_values('coefficient', ascending=False)
            self.feature_importance[project] = importance

    def fit_final_model(self, project_scores, total_score, train_mask):
        """Fit the model that maps project scores to the total score."""
        X_scores = self.score_scaler.fit_transform(project_scores[train_mask])
        self.score_poly = PolynomialFeatures(degree=2, include_bias=False)
        X_scores_poly = self.score_poly.fit_transform(X_scores)
        self.final_model = self.optimize_model(X_scores_poly, total_score[train_mask])

    def predict(self, features, project_scores, test_mask):
        """Predict project scores and the total score for the rows in *test_mask*.

        NOTE(review): the total score is predicted from the *given*
        ``project_scores`` rows, not from the project predictions computed
        here - confirm that this is intended.

        Returns:
            Tuple ``(project_predictions, total_score_pred)``; the first item
            is a DataFrame with one column per project.
        """
        project_predictions = pd.DataFrame()
        for project in self.project_names:
            X_test = self.feature_scalers[project].transform(features[project][test_mask])
            X_test_poly = self.project_polys[project].transform(X_test)
            project_predictions[project] = self.project_models[project].predict(X_test_poly)

        X_scores_test = self.score_scaler.transform(project_scores[test_mask])
        X_scores_test_poly = self.score_poly.transform(X_scores_test)
        total_score_pred = self.final_model.predict(X_scores_test_poly)
        return project_predictions, total_score_pred

    def plot_project_comparisons(self, features, project_scores, train_mask):
        """Build a grid of actual-vs-predicted scatter plots, one per project.

        Each subplot shows the training rows, a dashed ``y = x`` reference
        line and the R^2 of the project model on those rows.

        Returns:
            The plotly figure.
        """
        n_projects = len(self.project_names)
        n_cols = 3
        n_rows = (n_projects + n_cols - 1) // n_cols
        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=self.project_names,
            vertical_spacing=0.22,
            horizontal_spacing=0.1
        )

        for i, project in enumerate(self.project_names):
            row = i // n_cols + 1
            col = i % n_cols + 1

            y_true = project_scores[project][train_mask]
            X_scaled = self.feature_scalers[project].transform(features[project][train_mask])
            y_pred = self.project_models[project].predict(
                self.project_polys[project].transform(X_scaled)
            )
            r2 = r2_score(y_true, y_pred)

            fig.add_trace(
                go.Scatter(
                    x=y_true,
                    y=y_pred,
                    mode='markers',
                    name=project,
                    marker=dict(
                        size=8,
                        color='rgba(8, 81, 156, 0.6)',
                        line=dict(color='rgba(8, 81, 156, 0.9)', width=1)
                    ),
                    showlegend=False
                ),
                row=row, col=col
            )

            min_val = min(min(y_true), min(y_pred))
            max_val = max(max(y_true), max(y_pred))
            fig.add_trace(
                go.Scatter(
                    x=[min_val, max_val],
                    y=[min_val, max_val],
                    mode='lines',
                    line=dict(color='red', dash='dash'),
                    showlegend=False
                ),
                row=row, col=col
            )

            # Position the label relative to the subplot's axis domain (0..1)
            # so it sits in the top-left corner whatever the score range is;
            # row/col makes plotly pick the matching axes.
            fig.add_annotation(
                text=f'R² = {r2:.4f}',
                xref='x domain',
                yref='y domain',
                x=0.05,
                y=0.95,
                showarrow=False,
                font=dict(size=10),
                xanchor='left',
                yanchor='top',
                row=row, col=col
            )

            pad = 0.1 * (max_val - min_val)
            axis_range = [min_val - pad, max_val + pad]
            fig.update_xaxes(range=axis_range, title_text="实际值", row=row, col=col)
            fig.update_yaxes(range=axis_range, title_text="预测值", row=row, col=col)

        fig.update_layout(
            height=300 * n_rows,
            width=1000,
            title_text="各项目得分预测结果对比",
            showlegend=False,
            template='plotly_white',
        )
        return fig

    def plot_total_score_comparison(self, project_scores, total_score, train_mask):
        """Build the actual-vs-predicted scatter plot for the total score.

        Returns:
            The plotly figure, annotated with the R^2 on the training rows.
        """
        y_true = total_score[train_mask]
        X_scaled = self.score_scaler.transform(project_scores[train_mask])
        y_pred = self.final_model.predict(self.score_poly.transform(X_scaled))
        r2 = r2_score(y_true, y_pred)

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=y_true,
                y=y_pred,
                mode='markers',
                name='预测值',
                marker=dict(
                    size=10,
                    color='rgba(255, 0, 0, 0.6)',
                    line=dict(color='rgba(255, 0, 0, 0.9)', width=1)
                )
            )
        )

        min_val = min(min(y_true), min(y_pred))
        max_val = max(max(y_true), max(y_pred))
        fig.add_trace(
            go.Scatter(
                x=[min_val, max_val],
                y=[min_val, max_val],
                mode='lines',
                line=dict(color='red', dash='dash'),
                name='y=x'
            )
        )
        fig.add_annotation(
            text=f'R² = {r2:.4f}',
            xref='paper',
            yref='paper',
            x=0.05,
            y=0.95,
            showarrow=False,
            font=dict(size=14)
        )
        fig.update_layout(
            title_text="总分预测结果对比",
            xaxis_title="实际值",
            yaxis_title="预测值",
            template='plotly_white',
            width=800,
            height=600
        )
        return fig

    def save_models(self, area_name, output_dir='models'):
        """Persist every fitted model under ``<output_dir>/<area_name>/``.

        Writes one ``<project>_model.pkl`` per project plus ``final_model.pkl``;
        each file holds a dict with the keys ``model``, ``scaler`` and ``poly``.
        """
        area_dir = os.path.join(output_dir, area_name)
        # Creates output_dir as well when it does not exist yet.
        os.makedirs(area_dir, exist_ok=True)

        for project in self.project_names:
            model_data = {
                'model': self.project_models[project],
                'scaler': self.feature_scalers[project],
                'poly': self.project_polys[project]
            }
            joblib.dump(model_data, os.path.join(area_dir, f'{project}_model.pkl'))

        final_model_data = {
            'model': self.final_model,
            'scaler': self.score_scaler,
            'poly': self.score_poly
        }
        joblib.dump(final_model_data, os.path.join(area_dir, 'final_model.pkl'))
|
||
|
||
|
||
def process_feature_importance(analyzer, area):
    """Collect the importance of the original (first-order) features of one area.

    The per-project importance tables contain degree-2 polynomial terms:
    interaction terms are named ``"a b"`` and squared terms ``"a^2"``.  Both
    kinds are filtered out so that only the original feature columns remain.

    Args:
        analyzer: Object exposing ``project_names`` and ``feature_importance``
            (project name -> DataFrame with ``feature`` and ``coefficient``).
        area: Area name stored under the ``area_name`` key of the result.

    Returns:
        Dict with ``area_name`` plus one ``feature -> coefficient`` entry per
        original feature.
    """
    importance_dict = {'area_name': area}
    for project in analyzer.project_names:
        importance_df = analyzer.feature_importance[project]
        # A space marks an interaction term, '^' marks a power term.
        is_derived = importance_df['feature'].str.contains(r'[ ^]', regex=True)
        original_features = importance_df[~is_derived]
        importance_dict.update(
            zip(original_features['feature'].tolist(),
                original_features['coefficient'].tolist())
        )
    return importance_dict
|
||
|
||
|
||
def run_prediction(train_date, test_date, period_months):
    """Run one prediction period and print its result as JSON on stdout.

    Creates the ``<period_months>个月`` sub-directory under each output root,
    calls ``main`` and prints a JSON object with the keys ``predictions`` and
    ``feature_importance`` (each a list of records).

    Args:
        train_date: Passed to ``main`` as the training cut-off date.
        test_date: Passed to ``main`` as the first test date.
        period_months: Period length, used to name the output sub-directory.
    """
    period_dir = f"{period_months}个月"
    for base_dir in ['models', 'predictions', 'feature_importance', 'plots']:
        os.makedirs(os.path.join(base_dir, period_dir), exist_ok=True)

    def to_records(frame):
        # main() returns None when no area produced a result; report that as
        # an empty list instead of failing on None.to_json.
        if frame is None:
            return []
        return json.loads(frame.to_json(orient='records'))

    try:
        predictions, feature_importance = main(train_date, test_date, period_dir)
        result = {
            "predictions": to_records(predictions),
            "feature_importance": to_records(feature_importance)
        }
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        result_json = json.dumps(result, ensure_ascii=False, indent=4)
        print(result_json)
    except Exception as e:
        print(f"{period_months}个月预测执行出错: {str(e)}")
|
||
def main(train_date='2023-01-01', test_date='2023-06-01', period_dir='6个月',
         data_path='H:/develop/dama/java/buliangfanying/target/classes/python/data.csv'):
    """Train, save and evaluate the score models for every area in the data set.

    For each distinct ``area_name`` the rows dated before *train_date* are
    used for training and the rows dated on or after *test_date* are
    predicted.  Models, plots, predictions and feature importances are
    written below ``models/``, ``plots/``, ``predictions/`` and
    ``feature_importance/`` in the *period_dir* sub-directory.

    Args:
        train_date (str): Cut-off date of the training data (exclusive).
        test_date (str): First date of the test data (inclusive).
        period_dir (str): Sub-directory name used under every output root.
        data_path (str): CSV file to read; defaults to the path that used to
            be hard-coded here.

    Returns:
        Tuple ``(all_predictions_df, importance_df)``; either element is
        ``None`` when no area produced the corresponding result.
    """
    df = pd.read_csv(data_path)
    areas = df['area_name'].unique()

    all_predictions = []
    all_feature_importance = []

    # Feature columns are the ones carrying a project prefix.
    feature_prefixes = ('xxzs_', 'zfjc_', 'tsjb_', 'xzcf_', 'cckh_', 'cjjc_', 'zhxzb_')
    feature_columns = [col for col in df.columns if col.startswith(feature_prefixes)]

    train_cutoff = pd.to_datetime(train_date)
    test_start = pd.to_datetime(test_date)
    model_dir = os.path.join('models', period_dir)
    plots_dir = os.path.join('plots', period_dir)

    for area in areas:
        area_df = df[df['area_name'] == area].copy()
        analyzer = ComprehensiveScoreAnalyzer()
        features, project_scores, total_score, dates = analyzer.preprocess_data(area_df)

        train_mask = dates < train_cutoff
        test_mask = dates >= test_start

        # Skip areas without enough rows on either side of the split.
        if train_mask.sum() < 2 or test_mask.sum() < 1:
            continue

        try:
            analyzer.fit_project_models(features, project_scores, train_mask)
            analyzer.fit_final_model(project_scores, total_score, train_mask)
            analyzer.save_models(area, output_dir=model_dir)

            importance_dict = process_feature_importance(analyzer, area)
            all_feature_importance.append(importance_dict)

            project_pred, total_pred = analyzer.predict(features, project_scores, test_mask)

            # Overwrite the score columns of the test rows with the predictions.
            test_data = area_df[test_mask].copy()
            for project in analyzer.project_names:
                test_data[project] = project_pred[project].values
            test_data['all_score'] = total_pred
            all_predictions.append(test_data)

            os.makedirs(plots_dir, exist_ok=True)
            project_fig = analyzer.plot_project_comparisons(features, project_scores, train_mask)
            project_fig.write_image(os.path.join(plots_dir, f"{area}_project_comparisons.png"))
            total_fig = analyzer.plot_total_score_comparison(project_scores, total_score, train_mask)
            total_fig.write_image(os.path.join(plots_dir, f"{area}_total_score_comparison.png"))
        except Exception as e:
            # Best effort per area: keep going, but report the failure on
            # stderr so that stdout stays reserved for the JSON result.
            print(f"处理 {area} 时发生错误: {e}", file=sys.stderr)
            continue

    all_predictions_df = None
    if all_predictions:
        all_predictions_df = pd.concat(all_predictions, ignore_index=True)
        all_predictions_df = all_predictions_df.sort_values(['area_name', 'year', 'month'])
        predictions_dir = os.path.join('predictions', period_dir)
        # The directory may not exist when main() is called directly.
        os.makedirs(predictions_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(predictions_dir, 'all_areas_predictions.csv'), index=False)

    importance_df = None
    if all_feature_importance:
        importance_df = pd.DataFrame(all_feature_importance)
        # Features missing for every processed area get an importance of 0.
        for col in feature_columns:
            if col not in importance_df.columns:
                importance_df[col] = 0.0
        importance_df = importance_df[['area_name'] + feature_columns]
        importance_dir = os.path.join('feature_importance', period_dir)
        os.makedirs(importance_dir, exist_ok=True)
        importance_df.to_csv(
            os.path.join(importance_dir, 'all_areas_feature_importance.csv'), index=False)

    return all_predictions_df, importance_df
|
||
|
||
|
||
def load_model(area_name, project=None, base_dir='models'):
    """Load a saved model bundle for *area_name*.

    Args:
        area_name: Name of the area sub-directory below *base_dir*.
        project: Project whose ``<project>_model.pkl`` is loaded; ``None``
            selects ``final_model.pkl``.
        base_dir: Root directory that holds the per-area model directories.

    Returns:
        Whatever ``joblib.load`` returns for the selected file.

    Raises:
        FileNotFoundError: If the area directory or the model file is missing.
    """
    area_dir = os.path.join(base_dir, area_name)
    if not os.path.exists(area_dir):
        raise FileNotFoundError(f"未找到地区 {area_name} 的模型目录")

    file_name = 'final_model.pkl' if project is None else f'{project}_model.pkl'
    model_path = os.path.join(area_dir, file_name)
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"未找到模型文件: {model_path}")

    # NOTE: joblib.load unpickles the file - only load trusted model files.
    return joblib.load(model_path)
|
||
|
||
|
||
def predict_scores(area_name, features_df, base_dir='models'):
    """Predict project scores and the total score from saved models.

    Every ``<project>_model.pkl`` found in ``<base_dir>/<area_name>`` is
    applied to the columns of *features_df* that start with ``<project>_``;
    the resulting project scores are then fed to the final model.

    Args:
        area_name: Area whose saved models are used.
        features_df: Frame holding the prefixed sub-feature columns.
        base_dir: Root directory that holds the per-area model directories.

    Returns:
        Tuple ``(project_predictions, total_score_prediction)`` where the
        first item maps project name to its predicted scores.
    """
    suffix = '_model.pkl'
    area_dir = os.path.join(base_dir, area_name)

    project_predictions = {}
    for model_file in os.listdir(area_dir):
        if not model_file.endswith(suffix) or model_file == 'final_model.pkl':
            continue
        project = model_file.replace(suffix, '')
        # Forward base_dir so the models are loaded from the directory that
        # was just listed, not from the default 'models' root.
        model_data = load_model(area_name, project, base_dir=base_dir)

        project_prefix = project + '_'
        project_features = features_df[
            [col for col in features_df.columns if col.startswith(project_prefix)]
        ]
        X = model_data['scaler'].transform(project_features)
        X_poly = model_data['poly'].transform(X)
        project_predictions[project] = model_data['model'].predict(X_poly)

    project_scores = pd.DataFrame(project_predictions)

    final_model_data = load_model(area_name, base_dir=base_dir)
    # os.listdir gives no ordering guarantee, but the final scaler expects
    # the project columns in the order it was fitted with.
    trained_order = getattr(final_model_data['scaler'], 'feature_names_in_', None)
    if trained_order is not None:
        project_scores = project_scores[list(trained_order)]

    X_scores = final_model_data['scaler'].transform(project_scores)
    X_scores_poly = final_model_data['poly'].transform(X_scores)
    total_score_prediction = final_model_data['model'].predict(X_scores_poly)
    return project_predictions, total_score_prediction
|
||
|
||
|
||
if __name__ == "__main__":
    # Path of the JSON request file: taken from the first command-line
    # argument when given, otherwise the previously hard-coded default.
    default_request_path = 'H:/develop/dama/java/buliangfanying/PredictedScores.txt'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_request_path

    with open(file_path, 'r', encoding='utf-8') as file:
        json_str = file.read()

    # Parse the JSON request; it must provide train_date, test_date and
    # period_months.
    data_dict = json.loads(json_str)

    for dir_name in ['models', 'predictions', 'feature_importance', 'plots']:
        os.makedirs(dir_name, exist_ok=True)

    # One entry per prediction run; further periods can be appended here as
    # dicts with the same three keys.
    prediction_configs = [
        {
            'train_date': data_dict['train_date'],    # training uses rows before this date
            'test_date': data_dict['test_date'],      # rows from this date on are predicted
            'period_months': data_dict['period_months']
        }
    ]

    for config in prediction_configs:
        run_prediction(
            config['train_date'],
            config['test_date'],
            config['period_months']
        )
|