Dev/yongkang #428

Open
wants to merge 2 commits into base: main
13 changes: 13 additions & 0 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -408,6 +408,7 @@ def _data_requirement_print():
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
inference_data_dropped = inference_data
if drop_missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
@@ -425,6 +426,11 @@ def _data_requirement_print():
save_data(data_selected_dropped, drop_name_column, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
for column_name in data_selected.columns:
# Drop the rows with missing values
inference_data_dropped = inference_data_dropped.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
inference_data_dropped = inference_data_dropped.reset_index(drop=True)
elif drop_missing_value_strategy_num == 2:
is_null_value(data_selected)
show_data_columns(data_selected.columns)
@@ -441,6 +447,11 @@ def _data_requirement_print():
data_selected_dropped_name = data_selected_dropped_name.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped_name = data_selected_dropped_name.reset_index(drop=True)
for column_name in drop_data_selected.columns:
# Drop the rows with missing values
inference_data_dropped = inference_data_dropped.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
inference_data_dropped = inference_data_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
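
For reference, here is a minimal standalone sketch of the per-column dropping that the hunks above mirror onto the inference data. The DataFrames and their values are hypothetical; only the dropna(subset=...) and reset_index(drop=True) pattern comes from the diff.

import pandas as pd

# Hypothetical data; only the dropping pattern below comes from the diff.
data_selected = pd.DataFrame({"SiO2": [45.2, None, 51.7], "MgO": [8.1, 7.9, None]})
inference_data = pd.DataFrame({"SiO2": [44.0, None, 50.3], "MgO": [9.0, 6.5, 7.2]})

inference_data_dropped = inference_data
for column_name in data_selected.columns:
    # Drop inference rows that are missing a value in this selected column.
    inference_data_dropped = inference_data_dropped.dropna(subset=[column_name])
    # Reset the index so positional access stays consistent downstream.
    inference_data_dropped = inference_data_dropped.reset_index(drop=True)

print(inference_data_dropped)  # only the inference rows complete in every selected column remain
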
@@ -468,6 +479,7 @@ def _data_requirement_print():
save_data(data_selected, name_column_select, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
data_selected = data_selected_dropped if drop_rows_with_missing_value_flag else data_selected
inference_data = inference_data_dropped if drop_rows_with_missing_value_flag else inference_data
process_name_column = data_selected_dropped_name.iloc[:, 0] if drop_rows_with_missing_value_flag else name_column_select
# If the selected data set contains missing values and the user wants to deal with the missing values but chooses not to drop the rows with missing values,
# then use imputation techniques to deal with the missing values.
@@ -479,6 +491,7 @@ def _data_requirement_print():
imputation_config, data_selected_imputed_np = imputer(data_selected, IMPUTING_STRATEGY[strategy_num - 1])
data_selected_imputed = np2pd(data_selected_imputed_np, data_selected.columns)
del data_selected_imputed_np
inference_data[data_selected.columns] = data_selected_imputed
clear_output()
print("[bold green]-*-*- Hypothesis Testing on Imputation Method -*-*-[/bold green]")
print("Null Hypothesis: The distributions of the data set before and after imputing remain the same.")
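
The last hunk above also writes the imputed values back into inference_data. As a rough, hypothetical illustration of keeping the inference data in step with the training-set imputation: the project's own imputer() helper and IMPUTING_STRATEGY constants are not reproduced here, scikit-learn's SimpleImputer stands in for them, and the fit-then-transform pattern shown is a more general variant of the single assignment in the hunk.

import pandas as pd
from sklearn.impute import SimpleImputer

# Hypothetical data; SimpleImputer stands in for the project's imputer() helper.
data_selected = pd.DataFrame({"SiO2": [45.2, None, 51.7], "MgO": [8.1, 7.9, None]})
inference_data = pd.DataFrame({"SiO2": [None, 50.3], "MgO": [9.0, None]})

imputer = SimpleImputer(strategy="mean").fit(data_selected)
data_selected_imputed = pd.DataFrame(imputer.transform(data_selected), columns=data_selected.columns)
# Reusing the fitted imputer keeps the fill values consistent across both sets.
inference_data[data_selected.columns] = imputer.transform(inference_data)
print(inference_data)
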
3 changes: 3 additions & 0 deletions geochemistrypi/data_mining/data/inference.py
@@ -81,6 +81,9 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
The transform pipeline configuration and the transform pipeline object.
"""
print("Build the transform pipeline according to the previous operations.")
if X_train.isnull().any().any():
print("Warning: X_train contains NaN values. Skipping feature selection step.")
feature_selection_config = {}
# Aggregate transformer configuration.
transformer_config = {}
transformer_config.update(imputation_config)
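
The guard added above checks the training frame for any remaining NaN values and empties the feature-selection configuration before the transform pipeline is assembled. A minimal standalone illustration of the same check follows; X_train and feature_selection_config are hypothetical stand-ins, and build_transform_pipeline itself is not reproduced.

import pandas as pd

# Hypothetical training frame and config dict.
X_train = pd.DataFrame({"SiO2": [45.2, None], "MgO": [8.1, 7.9]})
feature_selection_config = {"feature_selection": "generic config"}

if X_train.isnull().any().any():
    # Feature selection cannot run on data that still contains missing values.
    print("Warning: X_train contains NaN values. Skipping feature selection step.")
    feature_selection_config = {}
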