8 changes: 8 additions & 0 deletions .github/workflows/run_all_frameworks.yml
@@ -102,6 +102,7 @@ jobs:
- name: Create venv
run: python -m venv venv
- uses: actions/cache@v3
if: failure()
id: cache
with:
path: /home/runner/work/automlbenchmark/automlbenchmark/venv
@@ -151,6 +152,7 @@ jobs:
- name: Create venv
run: python -m venv venv
- uses: actions/cache@v3
if: failure()
id: cache
with:
path: /home/runner/work/automlbenchmark/automlbenchmark/venv
@@ -164,6 +166,12 @@ jobs:
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install "coverage[toml]"
- name: Check Things
run: |
ls -lah venv
ls -lah venv/bin
ls -lah /home/runner/work/automlbenchmark/automlbenchmark/venv/bin
echo $(pwd)
- name: Run ${{ matrix.framework }} on ${{ matrix.task }}
run: |
source venv/bin/activate
40 changes: 21 additions & 19 deletions amlb/utils/serialization.py
@@ -1,10 +1,12 @@
import logging

import math
import os
import pickle
import re
from typing import Optional


from .core import Namespace as ns, json_dump, json_load
from .process import profile

@@ -33,11 +35,10 @@ def _import_data_libraries():
# the serializer to use when there's no specific serializer available.
# mainly intended to serialize simple data structures like lists.
# allowed=['pickle', 'json']
fallback_serializer="json",
# whether numpy may use pickle to serialize ndarrays
numpy_allow_pickle=True,
# OPTION REMOVED: Only JSON is allowed. Pickle is evil.
# fallback_serializer="json",
# format used to serialize pandas dataframes/series between processes.
# allowed=['pickle', 'parquet', 'hdf', 'json']
# allowed=['parquet', 'json']
pandas_serializer="parquet",
# the compression format used when serializing pandas dataframes/series.
# allowed=[None, 'infer', 'bz2', 'gzip']
@@ -163,8 +164,14 @@ def serialize_data(data, path, config: Optional[ns] = None):
root, ext = os.path.splitext(path)
np, pd, sp = _import_data_libraries()
if np and isinstance(data, np.ndarray):
path = f"{root}.npy"
np.save(path, data, allow_pickle=config.numpy_allow_pickle)
if data.dtype == "object":
# Numpy cannot save object arrays without pickle
path = f"{root}.json"
data = data.squeeze().tolist()
json_dump(data, path, style="compact")
else:
path = f"{root}.npy"
np.save(path, data, allow_pickle=False)
elif sp and isinstance(data, sp.spmatrix):
# use a custom extension to recognize sparse matrices from the file name.
# .npz is automatically appended if missing, and can also potentially be used for numpy arrays.
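
Note: for reference, a minimal sketch of the round trip the new object-array branch above implements, using the stdlib json module in place of the repo's json_dump/json_load helpers (an assumption for illustration; paths are hypothetical):

    import json
    import numpy as np

    data = np.array(["a", None, 3], dtype=object)
    # np.save would need allow_pickle=True for dtype=object, which is now
    # disallowed, so the array is flattened to a list and written as JSON.
    with open("/tmp/data.json", "w") as f:
        json.dump(data.squeeze().tolist(), f)
    with open("/tmp/data.json") as f:
        restored = np.array(json.load(f), dtype=object)
    # caveat: squeeze() means a shape like (n, 1) round-trips as (n,)
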
@@ -177,9 +184,7 @@ def serialize_data(data, path, config: Optional[ns] = None):
# for example, 'true' and 'false' are converted automatically to booleans, even for column names…
data.rename(str, axis="columns", inplace=True)
ser = config.pandas_serializer
if ser == "pickle":
data.to_pickle(path, compression=config.pandas_compression)
elif ser == "parquet":
if ser == "parquet":
if isinstance(data, pd.Series):
data = pd.DataFrame({__series__: data})
# parquet serialization doesn't support sparse dataframes
@@ -189,18 +194,15 @@ def serialize_data(data, path, config: Optional[ns] = None):
json_dump(dtypes, f"{path}.dtypes", style="compact")
data = unsparsify(data)
data.to_parquet(path, compression=config.pandas_parquet_compression)
elif ser == "hdf":
data.to_hdf(path, os.path.basename(path), mode="w", format="table")
elif ser == "json":
data.to_json(path, compression=config.pandas_compression)
else: # fallback serializer
if config.fallback_serializer == "json":
path = f"{root}.json"
json_dump(data, path, style="compact")
else:
path = f"{root}.pkl"
with open(path, "wb") as f:
pickle.dump(data, f)
raise ValueError(
f"Invalid pandas serializer {ser!r}: must be 'parquet' or 'json'."
)
else: # fallback serializer
path = f"{root}.json"
json_dump(data, path, style="compact")
return path
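
Note: a sketch of why a Series needs wrapping on the parquet path (column name and file path are illustrative; the module uses its own __series__ constant, and a parquet engine such as pyarrow is assumed to be installed):

    import pandas as pd

    s = pd.Series([0.1, 0.2], name="prediction")
    # Series has no to_parquet, so it is wrapped in a one-column DataFrame
    # and unwrapped again at deserialization time.
    pd.DataFrame({"__series__": s}).to_parquet("/tmp/series.pq")
    restored = pd.read_parquet("/tmp/series.pq")["__series__"]
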


@@ -212,7 +214,7 @@ def deserialize_data(path, config: Optional[ns] = None):
if ext == ".npy":
if np is None:
raise SerializationError(f"Numpy is required to deserialize {path}.")
return np.load(path, allow_pickle=config.numpy_allow_pickle)
return np.load(path)
elif ext == ".npz":
_, ext2 = os.path.splitext(base)
if ext2 == ".spy":
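
Note: with the allow_pickle flag dropped, np.load falls back to its safe default (allow_pickle=False); a sketch of the behavior change, assuming only non-object arrays now reach this path:

    import numpy as np

    np.save("/tmp/a.npy", np.arange(4.0))
    arr = np.load("/tmp/a.npy")  # fine: numeric arrays never need pickle
    # An object-dtype .npy written with allow_pickle=True would now raise
    # ValueError here instead of silently unpickling arbitrary objects.
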
1 change: 0 additions & 1 deletion examples/custom/extensions/GradientBoosting/exec.py
@@ -32,7 +32,6 @@ def run(dataset: Dataset, config: TaskConfig):

save_predictions(
dataset=dataset,
output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
1 change: 0 additions & 1 deletion examples/custom/extensions/Stacking/exec.py
@@ -133,7 +133,6 @@ def run(dataset, config):
probabilities = estimator.predict_proba(X_test) if is_classification else None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
2 changes: 1 addition & 1 deletion frameworks/AutoGluon/exec.py
@@ -100,6 +100,7 @@ def run(dataset, config):
eval_metric=perf_metric.name,
path=models_dir,
problem_type=problem_type,
verbosity=4,
).fit(train_data=train_path, time_limit=time_limit, **training_params)

log.info(f"Finished fit in {training.duration}s.")
@@ -183,7 +184,6 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
shutil.rmtree(predictor.path, ignore_errors=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
probabilities_labels=prob_labels,
1 change: 0 additions & 1 deletion frameworks/AutoGluon/exec_ts.py
@@ -95,7 +95,6 @@ def run(dataset, config):
get_reusable_executor().shutdown(wait=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions_only,
truth=truth_only,
target_is_encoded=False,
15 changes: 15 additions & 0 deletions frameworks/AutoGluon/setup.sh
@@ -61,3 +61,18 @@ fi
echo "Finished setup, testing autogluon install..."

PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"

echo "Installing AMLB dependencies into AutoGluon venv"
REQ_FILE="${HERE}/../shared/requirements.txt"

for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
pkg=$(echo "$line" | sed -E 's/[=><~!].*$//')
# For a line like "numpy==1.12.0", pkg is "numpy" and $line is the full specifier.

if ! PY -c "import $pkg" &> /dev/null; then
echo "$pkg not found. Installing from requirements.txt..."
PIP install --no-cache-dir "$line"
else
echo "$pkg is already installed by the framework; using that version."
fi
done
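
Note: the sed call strips everything from the first version operator onward to get a probe-able package name; a rough Python equivalent of that parsing, for illustration only (like the shell loop, it does not handle extras such as package[extra], or distributions whose import name differs, e.g. scikit-learn importing as sklearn):

    import re

    def requirement_name(line: str) -> str:
        # "numpy==1.12.0" -> "numpy"; "pandas>=2,<3" -> "pandas"
        return re.sub(r"[=><~!].*$", "", line).strip()

    assert requirement_name("numpy==1.12.0") == "numpy"
    assert requirement_name("scikit-learn>=1.0") == "scikit-learn"
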
1 change: 0 additions & 1 deletion frameworks/FEDOT/exec.py
@@ -53,7 +53,6 @@ def run(dataset, config):
save_artifacts(fedot, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/FEDOT/exec_ts.py
@@ -104,7 +104,6 @@ def run(dataset, config):

save_artifacts(fedot, config)
return result(
output_file=config.output_predictions_file,
predictions=all_series_predictions,
truth=truth_only,
target_is_encoded=False,
1 change: 0 additions & 1 deletion frameworks/GAMA/exec.py
@@ -123,7 +123,6 @@ def infer(data: Union[str, pd.DataFrame]):
probabilities = gama_automl.predict_proba(X_test)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=y_test,
1 change: 0 additions & 1 deletion frameworks/H2OAutoML/exec.py
@@ -203,7 +203,6 @@ def infer(path: str):
save_artifacts(aml, dataset=dataset, config=config)

return result(
output_file=config.output_predictions_file,
predictions=preds.predictions,
truth=preds.truth,
probabilities=preds.probabilities,
1 change: 0 additions & 1 deletion frameworks/MLPlan/exec.py
@@ -125,7 +125,6 @@ def run(dataset, config):
target_encoded = False

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=truth,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/NaiveAutoML/exec.py
@@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=dataset.test.y,
1 change: 0 additions & 1 deletion frameworks/RandomForest/exec.py
@@ -132,7 +132,6 @@ def infer(data):
log.info("Finished inference time measurements.")

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
2 changes: 0 additions & 2 deletions frameworks/SapientML/exec.py
@@ -79,7 +79,6 @@ def run(dataset, config):
)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
@@ -88,7 +87,6 @@ def run(dataset, config):
)
else:
return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
training_duration=training.duration,
1 change: 0 additions & 1 deletion frameworks/TPOT/exec.py
@@ -131,7 +131,6 @@ def infer(data):
save_artifacts(tpot, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/TunedRandomForest/exec.py
@@ -286,7 +286,6 @@ def infer(data):
log.info("Finished inference time measurements.")

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/autosklearn/exec.py
@@ -207,7 +207,6 @@ def sample_one_test_row(seed: int):
save_artifacts(auto_sklearn, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y if use_pandas else dataset.test.y_enc,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/flaml/exec.py
@@ -91,7 +91,6 @@ def infer(data: Union[str, pd.DataFrame]):
log.info(f"Finished predict in {predict.duration}s.")

return result(
output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
1 change: 0 additions & 1 deletion frameworks/hyperoptsklearn/exec.py
@@ -117,7 +117,6 @@ def default():
probabilities = None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/lightautoml/exec.py
@@ -99,7 +99,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)

return result(
output_file=config.output_predictions_file,
probabilities_labels=probabilities_labels,
probabilities=probabilities,
predictions=predictions,
1 change: 0 additions & 1 deletion frameworks/mljarsupervised/exec.py
@@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
shutil.rmtree(results_path, ignore_errors=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
1 change: 0 additions & 1 deletion frameworks/oboe/exec.py
@@ -123,7 +123,6 @@ def aml_models():
probabilities = None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
4 changes: 3 additions & 1 deletion frameworks/shared/callee.py
@@ -27,7 +27,6 @@ class FrameworkError(Exception):


def result(
output_file=None,
predictions=None,
truth=None,
probabilities=None,
@@ -94,6 +93,7 @@ def load_data(name, path, **_):
path = os.path.join(config.result_dir, ".".join([name, "data"]))
res[name] = serialize_data(arr, path, config=ser_config)
except BaseException as e:
log.error("Integration script failed with uncaught exception:")
log.exception(e)
res = dict(error_message=str(e), models_count=0)
finally:
@@ -107,6 +107,8 @@
)
json_dump(inference_measurements, inference_file, style="compact")
res["others"]["inference_times"] = str(inference_file)

res.setdefault("output_file", config.output_predictions_file)
json_dump(res, config.result_file, style="compact")
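
Note: this setdefault is why every exec.py above drops the output_file argument; the path is now filled in centrally rather than passed by each integration. A minimal self-contained sketch of the new contract (values hypothetical, result() simplified):

    def result(predictions=None, truth=None, **others):
        # simplified stand-in for the real result() in frameworks/shared/callee.py
        return dict(predictions=predictions, truth=truth, **others)

    res = result(predictions=[0, 1], truth=[0, 0])          # no output_file here
    res.setdefault("output_file", "/tmp/predictions.csv")   # filled centrally instead
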


8 changes: 0 additions & 8 deletions frameworks/shared/setup.sh
@@ -44,13 +44,5 @@ PIP() {
$pip_exec "$@"
}

#if [[ -x "$(command -v $PY_VENV/bin/activate)" ]]; then
# $PY_ROOT/activate
#fi

#echo "PY=$(command -v PY)"
#echo "PIP=$(command -v PIP)"
echo "PY=$py_exec"
echo "PIP=$pip_exec"

PIP install --no-cache-dir -r $SHARED_DIR/requirements.txt