# Build a district-level correlation edge network from a JSON records file.
import json
import sys

import numpy as np
import pandas as pd

# Positional slice of columns aggregated per district.
# NOTE(review): the original comment says "columns 7-172" but [6:171] stops
# at column 171 — confirm whether the upper bound should be 172.
AGG_SLICE = slice(6, 171)

# Correlations with |r| at or below this threshold are treated as absent.
CORR_THRESHOLD = 0.3

# Keep at most this many variables (those with the fewest absent correlations).
MAX_VARS = 30


def compute_edges(data: pd.DataFrame) -> pd.DataFrame:
    """Build a correlation edge list from per-record district data.

    Groups rows by ``area_name``, averages the columns selected by
    ``AGG_SLICE``, drops constant columns, correlates the per-district
    means, and keeps only correlations with ``|r| > CORR_THRESHOLD``
    among the ``MAX_VARS`` best-connected variables.

    Parameters
    ----------
    data : pd.DataFrame
        Input records; must contain an ``area_name`` column. The input
        frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``source``, ``target``, ``weight`` (absolute correlation)
        and ``symbol`` (True when the original correlation was negative),
        with each undirected pair canonicalized (source <= target) and
        deduplicated.
    """
    # Work on a copy so the caller's frame is not mutated.
    data = data.copy()

    # Clean column names: keep only ASCII letters, digits and underscores.
    data.columns = data.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

    # Per-district means of the positionally selected columns.
    agg_cols = data.columns[AGG_SLICE]
    selected = (data.groupby('area_name')
                    .agg({col: 'mean' for col in agg_cols})
                    .reset_index())

    # Drop constant columns: zero variance makes their correlation undefined.
    is_constant = selected.apply(lambda col: col.nunique(dropna=False) == 1)
    selected = selected.loc[:, ~is_constant]

    # Correlation matrix over the numeric columns (column 0 is 'area_name').
    corr = selected.iloc[:, 1:].corr()

    # Blank out weak correlations so they count as "missing" below.
    corr[corr.abs() <= CORR_THRESHOLD] = np.nan

    # Rank variables by how few of their correlations were blanked out and
    # keep the best-connected MAX_VARS of them.
    na_counts = corr.isna().sum(axis=0)
    keep = na_counts.sort_values().index[:MAX_VARS]
    sub = corr.loc[keep, keep]

    # Every remaining non-NaN cell is an edge. Comparisons with NaN are
    # False, so a single abs() > threshold test excludes both weak and
    # missing cells — no separate isnan mask is needed.
    # NOTE(review): the diagonal (r == 1.0) passes the test, so the output
    # contains self-loops — confirm downstream consumers expect them.
    row_idx, col_idx = np.where(sub.abs().to_numpy() > CORR_THRESHOLD)
    raw_weights = sub.to_numpy()[row_idx, col_idx]

    # Canonicalize each undirected pair so source <= target; the mirrored
    # duplicates from the symmetric matrix then collapse in drop_duplicates.
    pairs = [sorted(pair) for pair in zip(keep[row_idx], keep[col_idx])]
    edges = pd.DataFrame({
        'source': [pair[0] for pair in pairs],
        'target': [pair[1] for pair in pairs],
        # Store the magnitude; the sign is carried separately in 'symbol'.
        'weight': np.abs(raw_weights),
        # True marks an originally negative correlation.
        'symbol': raw_weights < 0,
    })
    return edges.drop_duplicates()


def main() -> None:
    """Read JSON records from argv[1] and print the edge list as JSON."""
    # Re-encode stdout as UTF-8 so non-ASCII district names print correctly.
    sys.stdout.reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        sys.exit('usage: python script.py <input.json>')

    with open(sys.argv[1], 'r', encoding='utf-8') as fh:
        data = pd.DataFrame(json.load(fh))

    edges = compute_edges(data)

    # Round-trip through to_json so pandas handles numpy scalar types,
    # then re-serialize human-readably with non-ASCII left intact.
    print(json.dumps(json.loads(edges.to_json(orient='records')),
                     ensure_ascii=False, indent=4))


if __name__ == '__main__':
    main()