import json
import sys

import pandas as pd
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler

# Reconfigure the standard output stream to emit UTF-8 (the script prints
# non-ASCII JSON because it uses ensure_ascii=False).
sys.stdout.reconfigure(encoding='utf-8')

# Columns at 0-based positions 6 .. 187 are renamed to q7 .. q188.
_FIRST_RENAMED_POS = 6
_LAST_Q_NUMBER = 188


def load_data(file_path):
    """Load a JSON file into a DataFrame and rename its positional columns.

    Args:
        file_path: Path of a UTF-8 encoded JSON file whose content can be
            passed directly to ``pd.DataFrame``.

    Returns:
        A DataFrame in which the column at 0-based position ``p`` (for
        ``6 <= p <= 187``) is named ``f'q{p + 1}'``. Columns outside that
        range keep their original names.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = pd.DataFrame(json.load(file))

    # Build a fresh label list instead of writing into ``columns.values``:
    # the original assigned 182 names into the slice [6:189] (183 slots),
    # which raises for frames with 189+ columns, and in-place mutation of an
    # Index's backing array is not supported by pandas.
    # NOTE(review): for frames wider than 188 columns the extra columns are
    # left untouched -- confirm that q188 is the intended last name.
    columns = list(data.columns)
    for pos in range(_FIRST_RENAMED_POS, min(len(columns), _LAST_Q_NUMBER)):
        columns[pos] = f'q{pos + 1}'
    data.columns = columns
    return data


def prepare_data_year(data):
    """Build the 12 modelling tables, one per look-back step.

    For ``step`` in 1..12 the feature window is:

    * ``step <= 5``: all rows of 2022 plus rows of 2023 with
      ``month <= 6 - step``;
    * ``step >= 6``: rows of 2022 with ``month <= 12 + 6 - step``.

    Within the window, columns ``q10`` .. ``q187`` are averaged per
    ``area_name``. Every table is joined (inner merge on ``area_name``) with
    the same target: the non-null ``all_score`` rows of June 2023.

    Args:
        data: DataFrame with ``year``, ``month``, ``area_name``,
            ``all_score`` and ``q10`` .. ``q187`` columns. It is not
            modified.

    Returns:
        A list of 12 DataFrames. Feature columns whose values are all equal
        are dropped; ``area_name`` and ``all_score`` are always kept.
    """
    # Work on converted copies so the caller's frame is left untouched.
    year = data['year'].astype(int)
    month_of_year = data['month'].astype(int)

    # Must stay in sync with the column names produced when loading.
    feature_cols = [f'q{i}' for i in range(10, 188)]

    # The target does not depend on the step, so compute it once.
    target = (
        data.loc[(year == 2023) & (month_of_year == 6)]
        .dropna(subset=['all_score'])[['area_name', 'all_score']]
    )

    in_2022 = year == 2022
    in_2023 = year == 2023

    results = []
    for step in range(1, 13):
        if step <= 5:
            window = in_2022 | (in_2023 & (month_of_year <= 6 - step))
        else:
            window = in_2022 & (month_of_year <= 12 + 6 - step)

        features = (
            data.loc[window]
            .groupby('area_name')[feature_cols]
            .mean()
            .reset_index()
        )
        model_data = pd.merge(features, target, on='area_name')

        # Drop constant columns, but never the key/target columns: callers
        # index the result by 'area_name' and 'all_score'.
        keep = model_data.nunique() > 1
        keep[['area_name', 'all_score']] = True
        results.append(model_data.loc[:, keep])

    return results


def train_lasso(X, y, month, cv=16):
    """Fit a LASSO regression with a cross-validated regularisation strength.

    Features are standardised with ``StandardScaler`` before fitting. The
    scaler is not returned, so the returned model expects inputs that were
    standardised the same way.

    Args:
        X: 2-D array of features, one row per sample.
        y: 1-D array of targets, same length as ``X``.
        month: Unused; kept so existing callers keep working.
        cv: Number of cross-validation folds. An integer larger than the
            number of samples is reduced to the number of samples, because
            ``LassoCV`` rejects more folds than samples.

    Returns:
        A tuple ``(lasso_model, predictions, best_lambda, lasso_cv)`` where
        ``lasso_model`` is a ``Lasso`` refitted on all samples with the
        selected alpha, ``predictions`` are its in-sample predictions,
        ``best_lambda`` is the selected alpha and ``lasso_cv`` is the fitted
        ``LassoCV`` object.

    Raises:
        ValueError: If fewer than two samples are supplied.
    """
    n_samples = len(y)
    if n_samples < 2:
        raise ValueError(
            f'train_lasso needs at least 2 samples, got {n_samples}'
        )
    n_folds = min(cv, n_samples) if isinstance(cv, int) else cv

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    lasso_cv = LassoCV(cv=n_folds, random_state=42).fit(X_scaled, y)
    best_lambda = lasso_cv.alpha_

    lasso_model = Lasso(alpha=best_lambda).fit(X_scaled, y)
    predictions = lasso_model.predict(X_scaled)
    return lasso_model, predictions, best_lambda, lasso_cv


def main():
    """Read the JSON path from argv, fit the 12 models, print predictions."""
    if len(sys.argv) < 2:
        sys.exit('usage: python <script> <data.json>')

    # 1. Load the data.
    data = load_data(sys.argv[1])

    # 2. Build one modelling table per look-back step.
    model_data_year = prepare_data_year(data)

    yuce_year = []
    for index, model_data in enumerate(model_data_year):
        # 3. Split into features and target; 'area_name' is carried along
        #    for the output only.
        feature_names = (
            model_data.drop(columns=['area_name', 'all_score'])
            .columns.tolist()
        )
        X = model_data[feature_names].values
        y = model_data['all_score'].values

        # 4. Train the model and collect its in-sample predictions.
        _, lasso_predictions, _, _ = train_lasso(X, y, index + 1)

        result_df = pd.DataFrame({
            'area_name': model_data['area_name'],
            'predicted_score': lasso_predictions
        })
        yuce_year.append(result_df.to_dict(orient='records'))

    # Emit all 12 result sets as a single JSON document.
    print(json.dumps(yuce_year, ensure_ascii=False, indent=4))


if __name__ == "__main__":
    main()