"""Build a correlation network between per-area averaged variables.

Usage: python <script> <input.json>

The input JSON is loaded into a pandas DataFrame. Variable columns are averaged
per ``area_name``, pairwise correlations between those averages are computed,
and correlations whose absolute value exceeds a threshold are printed to
stdout as a JSON array of ``{source, target, weight, symbol}`` records.
"""

import json
import sys

import numpy as np
import pandas as pd

# Column that identifies the area each input row belongs to.
GROUP_COLUMN = 'area_name'

# Positional slice of the variable columns that are averaged per area
# (0-based, end-exclusive: the 7th through the 171st column).
# NOTE(review): the original comment said "columns 7-172"; confirm whether the
# slice end should be 172 rather than 171.
FEATURE_COLUMNS = slice(6, 171)

# Only correlations with an absolute value strictly above this are kept.
CORRELATION_THRESHOLD = 0.3

# Number of best-connected variables kept in the network.
MAX_VARIABLES = 30


def build_edges(data, threshold=CORRELATION_THRESHOLD, max_variables=MAX_VARIABLES):
    """Return the correlation-network edge list for *data*.

    Args:
        data: DataFrame holding a ``area_name`` column and the variable
            columns at the positions given by ``FEATURE_COLUMNS``. The
            caller's frame is not modified.
        threshold: correlations with ``abs(r) <= threshold`` are discarded.
        max_variables: how many variables (those with the fewest discarded
            correlations) are kept.

    Returns:
        DataFrame with columns ``source``, ``target`` (variable names, ordered
        so that ``source <= target``), ``weight`` (absolute correlation) and
        ``symbol`` (True when the original correlation was negative). The
        diagonal of the correlation matrix also passes the threshold, so a
        variable with a defined self-correlation yields a record whose
        ``source`` equals its ``target``. The frame is empty when no
        correlation passes the threshold.
    """
    data = data.copy()
    # Clean the column names: strip spaces and special characters.
    data.columns = data.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

    # Average every variable column within each area.
    feature_names = data.columns[FEATURE_COLUMNS]
    area_means = data.groupby(GROUP_COLUMN).agg({col: 'mean' for col in feature_names})

    # Drop constant columns (NaN counts as a value, as with len(col.unique())).
    is_constant = area_means.nunique(dropna=False) == 1
    area_means = area_means.loc[:, ~is_constant]

    # Correlation matrix with everything at or below the threshold set to NaN.
    correlation = area_means.corr()
    strong = correlation.where(correlation.abs() > threshold)

    # Keep the variables with the fewest NaN entries. A stable sort makes ties
    # resolve by column order instead of depending on the sort implementation.
    na_counts = strong.isna().sum(axis=0)
    selected_vars = na_counts.sort_values(kind='stable').index[:max_variables]
    selected = strong.loc[selected_vars, selected_vars]

    # Every remaining non-NaN cell is an edge (the threshold was applied above).
    values = selected.to_numpy(dtype=float)
    row_idx, col_idx = np.nonzero(~np.isnan(values))
    signed_weights = values[row_idx, col_idx]

    # Order each pair so that (a, b) and (b, a) collapse into one record below.
    # A plain comprehension also copes with an empty edge list, where the
    # row-wise DataFrame.apply(..., result_type='expand') assignment raised.
    names = np.asarray(selected_vars, dtype=object)
    pairs = [sorted(pair) for pair in zip(names[row_idx], names[col_idx])]

    edges_df = pd.DataFrame({
        'source': [low for low, _ in pairs],
        'target': [high for _, high in pairs],
        # Weights are stored as positive numbers ...
        'weight': np.abs(signed_weights),
        # ... and the sign of the original correlation is kept separately.
        'symbol': signed_weights < 0,
    })
    return edges_df.drop_duplicates()


def main(argv=None):
    """Read the JSON file named by ``argv[1]`` and print the edge list as JSON.

    Args:
        argv: argument vector; defaults to ``sys.argv``.

    Raises:
        SystemExit: when no input path is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: python <script> <input.json>')

    # Force UTF-8 on stdout so non-ASCII names survive; skipped when stdout
    # has been replaced by an object without reconfigure().
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    with open(argv[1], 'r', encoding='utf-8') as file:
        data = pd.DataFrame(json.load(file))

    edges_df = build_edges(data)

    # Round-trip through to_json so numpy/pandas scalars become plain JSON.
    print(json.dumps(json.loads(edges_df.to_json(orient='records')), ensure_ascii=False, indent=4))


if __name__ == '__main__':
    main()