response/target/classes/python/RelationshipNetwork.py

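# Usage: python RelationshipNetwork.py <input.json>
# Reads a JSON array of per-district records, builds a thresholded
# correlation network over the numeric feature columns, and prints the
# edge list (source, target, weight, symbol) as JSON to stdout.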
import json
import numpy as np
import pandas as pd
import sys
# Reconfigure stdout to UTF-8 so non-ASCII output prints correctly
sys.stdout.reconfigure(encoding='utf-8')
file_path = sys.argv[1]
with open(file_path, 'r', encoding='utf-8') as file:
    json_str = file.read()
# Parse the JSON string
data_dict = json.loads(json_str)
# Convert to a pandas DataFrame
data = pd.DataFrame(data_dict)
# Clean column names: remove everything except letters, digits, and underscores
data.columns = data.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
# For each district, average the selected feature columns
# (data.columns[6:171], i.e. columns 7 through 171; assumes they are numeric)
selected_data = data.groupby('area_name').agg({col: 'mean' for col in data.columns[6:171]}).reset_index()
# Identify and drop constant columns (zero variance makes correlations undefined)
constant_columns = selected_data.apply(lambda col: len(col.unique()) == 1)
selected_data = selected_data.loc[:, ~constant_columns]
# Compute the correlation matrix
correlation_matrix = selected_data.iloc[:, 1:].corr()
# Mask out entries with absolute correlation at or below 0.3
correlation_matrix[abs(correlation_matrix) <= 0.3] = np.nan
# Count NaNs per variable; fewer NaNs means more strong correlations
na_counts = correlation_matrix.isna().sum(axis=0)
# Sort by NaN count and keep the 30 variables with the fewest NaNs
selected_vars = na_counts.sort_values().index[:30]
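# (If fewer than 30 variables remain, the slice simply returns all of them.)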
# Extract the correlation sub-matrix for the selected variables
selected_correlation_matrix = correlation_matrix.loc[selected_vars, selected_vars]
# Locate entries with absolute correlation above 0.3 (weak pairs are already NaN)
edges = np.where((abs(selected_correlation_matrix) > 0.3) & (~np.isnan(selected_correlation_matrix)))
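# np.where returns (row_indices, col_indices); since the matrix is symmetric,
# every strong pair appears twice, as (i, j) and (j, i), and the diagonal
# self-correlations (always 1.0) also pass the threshold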
edge_weights = selected_correlation_matrix.values[edges]
# Collect the edges into a DataFrame
edges_df = pd.DataFrame({
    'source': selected_vars[edges[0]],
    'target': selected_vars[edges[1]],
    'weight': edge_weights
})
# Replace negative weights with their absolute values
edges_df['weight'] = abs(edges_df['weight'])
# Add a column recording whether the original correlation was negative
edges_df['symbol'] = np.where(selected_correlation_matrix.values[edges] < 0, True, False)
# Sort each node pair so that 'source' is always <= 'target'
edges_df[['source', 'target']] = edges_df.apply(
    lambda x: sorted([x['source'], x['target']]), axis=1, result_type='expand')
# Drop duplicate rows (each undirected edge is now listed once)
edges_df = edges_df.drop_duplicates()
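# Note: diagonal self-correlations (weight 1.0) survive deduplication; if
# self-loops are unwanted, they could be filtered out here, e.g.
# edges_df = edges_df[edges_df['source'] != edges_df['target']]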
# Convert the DataFrame to JSON and print it
print(json.dumps(json.loads(edges_df.to_json(orient='records')), ensure_ascii=False, indent=4))