feat: add network analysis #352
Open · BBO-Junbo wants to merge 2 commits into ZJUEarthData:main from BBO-Junbo:main
Changes from all commits (2 commits)
Binary file not shown.
Empty file.
86 changes: 86 additions & 0 deletions
geochemistrypi/data_mining/model/func/algo_network/_common.py
@@ -0,0 +1,86 @@
from itertools import combinations

import numpy as np
import pandas as pd


# Pair each dataframe with every other dataframe in a list of dataframes
def pair_dataframes(dataframes):
    pairs = []
    for pair in combinations(enumerate(dataframes), 2):
        idx1, df1 = pair[0]
        idx2, df2 = pair[1]
        pairs.append((df1, df2, idx1, idx2))
    return pairs


# Convert indices and distances into triplets of mineral IDs and their distance
def convert_to_triplets(indices, distances, mineral_a_ids, mineral_b_ids):
    triplets = []

    for i, (neighbors, distances_row) in enumerate(zip(indices, distances)):
        for neighbor, distance in zip(neighbors, distances_row):
            triplet = (mineral_a_ids[i], mineral_b_ids[neighbor], distance)
            triplets.append(triplet)

    return triplets


# Clean the dataframe of triplets by sorting, removing duplicates, and handling nulls or zeros in distance
def triplets_df_clean(triplets_df):
    triplets_df[["Node1", "Node2"]] = np.sort(triplets_df[["Node1", "Node2"]], axis=1)
    triplets_df = triplets_df.drop_duplicates(subset=["Node1", "Node2"])
    triplets_df["Distance"] = triplets_df["Distance"].apply(lambda x: 0.001 if pd.isnull(x) or x == 0 else x)
    triplets_df = triplets_df.dropna(subset=["Distance"])
    triplets_df = triplets_df.sort_values(by="Node1")
    return triplets_df


# Construct an adjacency matrix from graph data
def construct_adj_matrix(graph_data):
    nodes = np.unique(graph_data[["Node1", "Node2"]].values)
    num_nodes = len(nodes)
    adj_matrix = np.zeros((num_nodes, num_nodes))
    node_index_mapping = {node: idx for idx, node in enumerate(nodes)}
    for index, row in graph_data.iterrows():
        node1, node2, distance = int(row["Node1"]), int(row["Node2"]), row["Distance"]
        adj_matrix[node_index_mapping[node1], node_index_mapping[node2]] = distance
        adj_matrix[node_index_mapping[node2], node_index_mapping[node1]] = distance
    mapping_df = pd.DataFrame(list(node_index_mapping.items()), columns=["Original_Node", "Mapped_Node"])
    return adj_matrix, mapping_df


# Update community IDs based on a mapping and calculate unique and repeated counts
def accurate_statistic_algo(communities, ids, group_ids):
    result_df = communities.copy()
    flat_ids = np.array(ids).flatten()
    flat_group_ids = np.array(group_ids).flatten()

    for i, row in communities.iloc[1:].iterrows():
        for j, val in row.items():
            if val in flat_ids:
                idx = flat_ids.tolist().index(val)
                if idx < len(flat_group_ids):
                    result_df.at[i, j] = flat_group_ids[idx]
    repeated_counts = []
    unique_counts = []
    for index, row in result_df.iterrows():
        row_values = row[1:].dropna()
        seen_values = set()
        repeated_set = set()
        for num in row_values:
            if num in seen_values:
                repeated_set.add(num)
            else:
                seen_values.add(num)

        repeated_count = len(repeated_set)
        unique_count = len(seen_values) - repeated_count

        repeated_counts.append(repeated_count)
        unique_counts.append(unique_count)

    result_df["Repeated_Counts"] = repeated_counts
    result_df["Unique_Counts"] = unique_counts

    return result_df
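
Illustrative usage (not part of the diff): a minimal sketch wiring these helpers together on two toy mineral dataframes. The column names are hypothetical, and the neighbor indices and distances are hand-written stand-ins for the output of top_k_nearest_neighbors.

import numpy as np
import pandas as pd

from geochemistrypi.data_mining.model.func.algo_network._common import construct_adj_matrix, convert_to_triplets, pair_dataframes, triplets_df_clean

# Two toy mineral dataframes (column names are hypothetical)
df_a = pd.DataFrame({"SiO2": [1.0, 2.0], "MgO": [0.5, 0.7]})
df_b = pd.DataFrame({"SiO2": [1.1, 2.2], "MgO": [0.6, 0.9]})

pairs = pair_dataframes([df_a, df_b])  # [(df_a, df_b, 0, 1)]

# Hand-written stand-ins for nearest-neighbor output: row i of df_a
# is matched to row indices[i] of df_b at the given distance
indices = np.array([[0], [1]])
distances = np.array([[0.14], [0.28]])

# Global node ids: df_a rows are 0-1, df_b rows are 2-3
triplets = convert_to_triplets(indices, distances, [0, 1], [2, 3])
triplets_df = triplets_df_clean(pd.DataFrame(triplets, columns=["Node1", "Node2", "Distance"]))
adj_matrix, mapping_df = construct_adj_matrix(triplets_df)
print(adj_matrix.shape)  # (4, 4): one row/column per unique node id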
73 changes: 73 additions & 0 deletions
geochemistrypi/data_mining/model/func/algo_network/_community.py
@@ -0,0 +1,73 @@
import communities.algorithms
import pandas as pd


# Function to apply the Bron-Kerbosch algorithm to find maximal cliques in an adjacency matrix
def bron_kerbosch_algo(adj_matrix, mapping_df):
    # Use the Bron-Kerbosch algorithm from the communities package, with pivoting enabled
    communities_list = communities.algorithms.bron_kerbosch(adj_matrix, pivot=True)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    node_mapping_df = mapping_df
    mapping_dict = dict(zip(node_mapping_df["Mapped_Node"], node_mapping_df["Original_Node"]))

    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # Initialize community data with column names
    community_data = [["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]

    # Create a dataframe from the community data
    bk_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_bk_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in bk_df.iterrows():
        mapped_row = []
        for column in bk_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_bk_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_bk_df


# Function to apply the Louvain method to find communities in an adjacency matrix
def louvain_method_algo(adj_matrix, mapping_df):
    # Use the Louvain method from the communities package
    communities_list, _ = communities.algorithms.louvain_method(adj_matrix)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    node_mapping_df = mapping_df
    mapping_dict = dict(zip(node_mapping_df["Mapped_Node"], node_mapping_df["Original_Node"]))
    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # Initialize community data with column names
    community_data = [["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]
    # Create a dataframe from the community data
    louvain_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_louvain_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in louvain_df.iterrows():
        mapped_row = []

        for column in louvain_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_louvain_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_louvain_df
40 changes: 40 additions & 0 deletions
geochemistrypi/data_mining/model/func/algo_network/_distance.py
@@ -0,0 +1,40 @@
import numpy as np
import scipy.spatial.distance


# Calculate the Mahalanobis distance between two single points x and y
def mahalanobis_distance_singal(x, y, inv_cov):
    # Compute the difference between the two points
    x_minus_y = x - y
    # Calculate the Mahalanobis distance using the inverse covariance matrix
    return np.sqrt(np.dot(np.dot(x_minus_y, inv_cov), x_minus_y.T))


# Calculate the Mahalanobis distance between two sets of points
def mahalanobis_distance_calculator(mineral_a, mineral_b):
    # Calculate the pseudo-inverse covariance matrix of the first set of points
    inv_cov = np.linalg.pinv(np.cov(mineral_a, rowvar=False))
    # Use scipy's cdist function to calculate the distance between each pair of points
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, lambda u, v: mahalanobis_distance_singal(u, v, inv_cov))
    return distance


# Calculate the Euclidean distance between two sets of points
def euclidean_distance_calcular(mineral_a, mineral_b):
    # Specify the metric as Euclidean
    metric = "euclidean"
    # Use scipy's cdist function to calculate the distance between each pair of points
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, metric)
    return distance


# Compute the distance between two sets of points using either the Mahalanobis or the Euclidean metric
# (k is accepted for interface symmetry but is not used here)
def compute_distance_between_2(mineral_a, mineral_b, k=1, metric="euclidean"):
    # Select the distance calculation method based on the specified metric
    if metric == "mahalanobis":
        return mahalanobis_distance_calculator(mineral_a, mineral_b)
    elif metric == "euclidean":
        return euclidean_distance_calcular(mineral_a, mineral_b)
    else:
        # Raise an error if an unsupported metric is specified
        raise ValueError("Unsupported distance metric. Supported metrics are 'mahalanobis' and 'euclidean'.")
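
Illustrative usage (not part of the diff): a minimal sketch of compute_distance_between_2 on two small point sets. Note that the Mahalanobis variant estimates its covariance from mineral_a alone, so it needs more samples than dimensions to be stable; this toy input sticks to the Euclidean metric.

import numpy as np

from geochemistrypi.data_mining.model.func.algo_network._distance import compute_distance_between_2

mineral_a = np.array([[0.0, 0.0], [1.0, 1.0]])
mineral_b = np.array([[0.0, 1.0], [2.0, 2.0]])

# One distance per (row of mineral_a, row of mineral_b) pair
print(compute_distance_between_2(mineral_a, mineral_b, metric="euclidean"))
# [[1.         2.82842712]
#  [1.         1.41421356]]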
13 changes: 13 additions & 0 deletions
geochemistrypi/data_mining/model/func/algo_network/_nearest.py
@@ -0,0 +1,13 @@
import numpy as np


# Function to find the top k nearest neighbors for each point
def top_k_nearest_neighbors(distance, k=5):
    # Sort the distances and get indices of the nearest neighbors
    nearest_neighbors_indices = np.argsort(distance, axis=1)[:, :k]

    # Retrieve the distances of the nearest neighbors using the sorted indices
    nearest_neighbors_distances = np.take_along_axis(distance, nearest_neighbors_indices, axis=1)

    # Return the indices and distances of the nearest neighbors
    return nearest_neighbors_indices, nearest_neighbors_distances
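
Illustrative usage (not part of the diff): a minimal sketch picking the 2 nearest neighbors per row from a hand-written distance matrix.

import numpy as np

from geochemistrypi.data_mining.model.func.algo_network._nearest import top_k_nearest_neighbors

distance = np.array([
    [0.9, 0.1, 0.5],
    [0.3, 0.8, 0.2],
])
idx, dist = top_k_nearest_neighbors(distance, k=2)
print(idx)   # [[1 2]
             #  [2 0]]
print(dist)  # [[0.1 0.5]
             #  [0.2 0.3]]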
136 changes: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
import os

import pandas as pd

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..utils.base import save_data
from ._base import WorkflowBase
from .func.algo_network._common import accurate_statistic_algo, construct_adj_matrix, convert_to_triplets, pair_dataframes, triplets_df_clean
from .func.algo_network._community import bron_kerbosch_algo, louvain_method_algo
from .func.algo_network._distance import euclidean_distance_calcular, mahalanobis_distance_calculator
from .func.algo_network._nearest import top_k_nearest_neighbors


class NetworkAnalysisWorkflowBase(WorkflowBase):
    def __init__(self) -> None:
        # Initialize default values
        self.distance_calculator = "EU"  # Default distance calculator
        self.community_detection_algo = "BK"  # Default community detection algorithm
        self.minerals = []  # List to store dataframes of minerals
        self.ids = []  # List to store ids
        self.labels = []  # List to store labels
        self.k = 1  # Number of nearest neighbors
        self.distances = pd.DataFrame()  # DataFrame to store distances
        self.communities_sample = pd.DataFrame()  # DataFrame to store sample communities
        self.communities_group = pd.DataFrame()  # DataFrame to store grouped communities

    def fit(self) -> None:
        # Merge features and labels
        merged_df = pd.concat([self.X, self.y], axis=1)
        split_dfs = []
        # Group by mineral type and split
        for mineral_type, group in merged_df.groupby("mineral_type"):
            split_dfs.append(group)
        extracted_dfs = []
        # Extract the last column (the label) from each group
        for mineral_type, df in enumerate(split_dfs):
            last_column = df.iloc[:, -1:]
            extracted_dfs.append(last_column)
        # Remove the last column from the original dataframes
        for mineral_type, df in enumerate(split_dfs):
            split_dfs[mineral_type] = df.iloc[:, :-1]
        self.minerals = split_dfs
        self.labels = extracted_dfs

    @classmethod
    def manual_hyper_parameters(cls) -> None:
        """Manual hyper-parameters specification."""
        return dict()

    def generate_ids(self):
        # Generate unique ids for each mineral
        offset = 0
        for df in self.minerals:
            self.ids.append(list(range(offset, offset + len(df))))
            offset += len(df)

    def compute_distance(self):
        # Compute distances between minerals
        all_triplets = []
        pair_combinations = pair_dataframes(self.minerals)
        # Select distance calculator
        if self.distance_calculator == "EU":
            distance_func = euclidean_distance_calcular
        elif self.distance_calculator == "MA":
            distance_func = mahalanobis_distance_calculator
        else:
            raise ValueError("Unsupported distance calculator. Supported options are 'EU' and 'MA'.")
        # Compute distances for each pair
        for pair in pair_combinations:
            mineral1, mineral2, index1, index2 = pair
            a_to_b_indices, a_to_b_distances = top_k_nearest_neighbors(distance_func(mineral1, mineral2), self.k)
            all_triplets += convert_to_triplets(a_to_b_indices, a_to_b_distances, self.ids[index1], self.ids[index2])
        self.distances = triplets_df_clean(pd.DataFrame(all_triplets, columns=["Node1", "Node2", "Distance"]))
        # Save distances to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.distances, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)

    def accuracy_statistic(self):
        # Map community members back to their group labels
        self.communities_group = accurate_statistic_algo(self.communities_sample, self.ids, self.labels)
        # Save accuracy statistics to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_group, f"{self.naming} Result Accuracy Statistic", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)


class bron_kerbosch(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Bron_kerbosch"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "BK"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Bron-Kerbosch algorithm
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = bron_kerbosch_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class louvain_method(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Louvain_method"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "LU"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Louvain method
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = louvain_method_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class BronKerboschWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = bron_kerbosch(X, y)
        instance.community_detection()


class LouvainMethodWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = louvain_method(X, y)
        instance.community_detection()
Review comment: Rename the file as "Data_NetworkAnalysis.xlsx"