126 lines
4.2 KiB
Python
126 lines
4.2 KiB
Python
import pandas as pd
|
|
from sklearn.linear_model import LassoCV, Lasso
|
|
from sklearn.preprocessing import StandardScaler
|
|
import json
|
|
import sys
|
|
|
|
# Reconfigure the standard output stream to use UTF-8 encoding, so the
# JSON printed at the end of the script (emitted with ensure_ascii=False)
# is written as UTF-8 regardless of the platform's default encoding.
|
|
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
|
|
def load_data(file_path):
    """Load a JSON file into a DataFrame and rename its question columns.

    The file is read as UTF-8 JSON and handed to ``pd.DataFrame``. The
    columns at 0-based positions 6..187 are renamed to ``q7`` .. ``q188``;
    all other columns keep their original labels. When the frame has fewer
    than 188 columns, only the positions that actually exist are renamed.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        The loaded DataFrame with the renamed columns.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)

    data = pd.DataFrame(data_dict)

    first_pos = 6
    new_names = [f'q{i}' for i in range(7, 189)]

    # Build a fresh label list and assign it, instead of writing into
    # ``data.columns.values``: an Index is meant to be immutable, and the
    # old slice (6:189) was one element wider than the list of new names.
    columns = list(data.columns)
    count = max(0, min(len(columns) - first_pos, len(new_names)))
    columns[first_pos:first_pos + count] = new_names[:count]
    data.columns = columns

    return data
|
|
|
|
|
|
def prepare_data_year(data):
    """Build one modelling frame per look-back offset (1 to 12 months).

    For each ``month`` in 1..12 the feature rows are those of year 2022
    plus, for ``month <= 5``, the 2023 rows whose month is at most
    ``6 - month``; for ``month >= 6`` only the 2022 rows whose month is at
    most ``18 - month`` are used. In other words the window starts in 2022
    and ends ``month`` months before June 2023. Within the window, columns
    ``q10`` .. ``q187`` are averaged per ``area_name``.

    The target is ``all_score`` taken from the June 2023 rows (rows with a
    missing score are dropped) and is inner-joined to the features on
    ``area_name``. Feature columns that hold a single distinct value are
    removed; ``area_name`` and ``all_score`` are always kept so callers can
    rely on them being present.

    Args:
        data: DataFrame with ``year``, ``month``, ``area_name``,
            ``all_score`` and ``q10`` .. ``q187`` columns. ``year`` and
            ``month`` must be convertible to int. The frame is not modified.

    Returns:
        A list of 12 DataFrames; element ``i`` belongs to ``month == i + 1``.
    """
    # Work on integer views of the date columns without writing them back
    # into the caller's frame.
    year = data['year'].astype(int)
    month_of_row = data['month'].astype(int)

    feature_names = [f'q{i}' for i in range(10, 188)]
    key_columns = ['area_name', 'all_score']

    # The target does not depend on the loop variable, so compute it once.
    target_rows = data[(year == 2023) & (month_of_row == 6)]
    all_score = target_rows.dropna(subset=['all_score'])[key_columns]

    in_2022 = year == 2022
    in_2023 = year == 2023

    results = []
    for month in range(1, 13):
        if month <= 5:
            # All of 2022 plus the first (6 - month) months of 2023.
            window = in_2022 | (in_2023 & (month_of_row <= 6 - month))
        else:
            # Only 2022, truncated to its first (18 - month) months.
            window = in_2022 & (month_of_row <= 12 + 6 - month)

        selected_data = (
            data[window]
            .groupby('area_name')[feature_names]
            .mean()
            .reset_index()
        )

        model_data = pd.merge(selected_data, all_score, on='area_name')

        # Drop constant feature columns, but never the identifier or the
        # target: a constant ``all_score`` would otherwise vanish here and
        # break the caller's column lookups.
        keep = (model_data.nunique() > 1) | model_data.columns.isin(key_columns)
        model_data = model_data.loc[:, keep]

        results.append(model_data)

    return results
|
|
|
|
|
|
def train_lasso(X, y, month):
    """Fit a cross-validated LASSO regression on standardized features.

    The features are standardized with ``StandardScaler``, a ``LassoCV``
    with 16 folds selects the regularization strength, and a plain
    ``Lasso`` is then fitted with that strength on the same standardized
    data. Predictions are computed on the training data itself.

    Args:
        X: Feature matrix.
        y: Target values.
        month: Not used by the current body (it was only referenced by a
            model-saving step that is disabled).

    Returns:
        A tuple ``(lasso_model, predictions, best_lambda, lasso_cv)``: the
        fitted ``Lasso``, its in-sample predictions, the alpha chosen by
        cross-validation, and the fitted ``LassoCV`` object.
    """
    # Standardize the features; the scaler is fitted on the same data it
    # transforms.
    standardizer = StandardScaler()
    features = standardizer.fit(X).transform(X)

    # Pick the regularization strength by 16-fold cross-validation.
    lasso_cv = LassoCV(cv=16, random_state=42)
    lasso_cv.fit(features, y)
    best_lambda = lasso_cv.alpha_

    # Refit a plain Lasso at the selected strength and predict in-sample.
    lasso_model = Lasso(alpha=best_lambda)
    lasso_model.fit(features, y)
    predictions = lasso_model.predict(features)

    return lasso_model, predictions, best_lambda, lasso_cv
|
|
|
|
|
|
if __name__ == "__main__":
    # 1. Load the data from the JSON file named by the first CLI argument.
    #    Exit with a usage message instead of an IndexError traceback when
    #    the argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <data.json>")
    file_path = sys.argv[1]
    data = load_data(file_path)

    # 2. Build the twelve per-month modelling frames.
    model_data_year = prepare_data_year(data)

    yuce_year = []
    for index, model_data in enumerate(model_data_year):
        # 3. Features are every column except the identifier and the target.
        X = model_data.drop(columns=['area_name', 'all_score']).values
        y = model_data['all_score'].values

        # 4. Train the model; only the in-sample predictions are used here.
        lasso_model, lasso_predictions, best_lambda, lasso_cv = train_lasso(X, y, index + 1)

        result_df = pd.DataFrame({
            'area_name': model_data['area_name'],
            'predicted_score': lasso_predictions
        })
        # Store this month's predictions as a list of record dicts.
        yuce_year.append(result_df.to_dict(orient='records'))

    # Emit all twelve result lists as a single JSON document on stdout.
    print(json.dumps(yuce_year, ensure_ascii=False, indent=4))
|