#!/usr/bin/env python
# coding: utf-8

# # XGBoost Performance Comparison

# In this example we will train an XGBoost model and run predictions to showcase the increased performance delivered by Intel's optimizations for XGBoost. Intel-optimized XGBoost is shipped as part of the Intel® oneAPI AI Analytics Toolkit.
#
# This example is a Jupyter Notebook version of an XGBoost example from this Medium blog post, using the popular Higgs dataset:
# https://medium.com/intel-analytics-software/new-optimizations-for-cpu-in-xgboost-1-1-81144ea21115

# In this example, we will use a dataset of particle features and functions of those features **to distinguish between a signal process which produces Higgs bosons (1) and a background process which does not (0)**. The Higgs boson is an elementary particle in the Standard Model, produced by the quantum excitation of the Higgs field and named after physicist Peter Higgs.

# ## Example Environment Creation

# This example is executed in two Anaconda environments.
#
# The first environment is the latest [Intel® oneAPI AI Analytics Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html) base environment, which includes data analytics and machine learning workflows and Intel optimizations for XGBoost. See [here](https://software.intel.com/content/www/us/en/develop/articles/installing-ai-kit-with-conda.html) for more installation information.

# The second environment is a clone of the first, created with the following commands:

# `conda create --name xgb0.81 --clone aikit-base-env-name`

# `conda activate xgb0.81`

# `conda remove xgboost`

# `pip install xgboost==0.81`

# To register the second environment as a Jupyter Notebook kernel, use the command:

# `python -m ipykernel install --user --name=xgb0.81`

# Run this performance demo in both environments, **saving the visualization cell until after both runs, as instructed later in this demo.**
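
# Before timing anything, it helps to confirm which XGBoost build the active kernel is using. The following cell is a minimal sanity check (not part of the original demo):

# In[ ]:


# Print the XGBoost version active in this kernel: expect the AI Kit build
# in the first environment and 0.81 in the second.
import xgboost
print(xgboost.__version__)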

# ## Importing and Organizing Data

# Let's start by **importing** all the necessary packages.
#

# In[1]:


import numpy as np
import os
import matplotlib.pyplot as plt
import requests
import pandas as pd
import sys
import xgboost as xgb
import time


# Now let's **load** the Higgs dataset and **organize** it as necessary to work with our model. You can opt to remove this cell and add your own data as you see fit.

# In[2]:


def load_higgs(nrows_train, nrows_test, dtype=np.float32):
    # Download the dataset once and cache it locally
    if not os.path.isfile("./HIGGS.csv.gz"):
        print("Downloading data set...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
        myfile = requests.get(url)
        with open('./HIGGS.csv.gz', 'wb') as f:
            f.write(myfile.content)
    print("Reading data set...")
    data = pd.read_csv("./HIGGS.csv.gz", delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test)
    print("Pre-processing data set...")
    # Move the label (column 0) behind the feature columns
    data = data[list(data.columns[1:]) + list(data.columns[0:1])]
    n_features = data.shape[1] - 1
    train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features])
    train_label = np.ascontiguousarray(data.values[:nrows_train, n_features])
    test_data = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test, :n_features])
    test_label = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test, n_features])
    n_classes = len(np.unique(train_label))
    print(sys.getsizeof(train_data))  # size of the training array in bytes
    return train_data, train_label, test_data, test_label, n_classes, n_features


# We will train the model and predict using 10,000 rows of the Higgs dataset for training and another 10,000 rows for testing.

# In[3]:


train_data, train_label, test_data, test_label, n_classes, n_features = load_higgs(10000, 10000)
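
# As a quick sanity check (an optional sketch, not part of the original demo), you can confirm the shapes of the returned arrays; the Higgs dataset has 28 feature columns and 2 classes.

# In[ ]:


# Expect (10000, 28) train data, (10000, 28) test data, 2 classes, 28 features
print(train_data.shape, test_data.shape, n_classes, n_features)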


# ## Training the Model

# Next we **fit and train the model** on the training dataset, which consists of particle features and functions of those features that help discern between a signal process that produces Higgs bosons and a background process that does not.

# In[4]:


# Set XGBoost parameters
xgb_params = {
    'verbosity': 0,
    'alpha': 0.9,
    'max_bin': 256,
    'scale_pos_weight': 2,
    'learning_rate': 0.1,
    'subsample': 1,
    'reg_lambda': 1,
    'min_child_weight': 0,
    'max_depth': 8,
    'max_leaves': 2**8,
    'objective': 'binary:logistic',
    'predictor': 'cpu_predictor',
    'tree_method': 'hist',
    'n_estimators': 1000
}

# Train the model and time the fit
t0 = time.time()  # begin timer
model_xgb = xgb.XGBClassifier(**xgb_params)
model_xgb.fit(train_data, train_label)
t1 = time.time()  # end timer


# ## Making A Prediction

# Now let's **make a prediction** with Intel-optimized XGBoost for increased performance, determining whether the particles in the test set come from a signal process which produces Higgs bosons or from a background process which does not.

# In[5]:


# Predict labels for the test data
result_predict_xgb_test = model_xgb.predict(test_data)
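
# If you also want the model's confidence rather than hard 0/1 labels, the scikit-learn-style classifier exposes class probabilities as well. This is an optional sketch, not part of the timed comparison:

# In[ ]:


# Probability that each test event comes from the signal process (class 1)
signal_proba = model_xgb.predict_proba(test_data)[:, 1]
print(signal_proba[:5])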


# ## Accuracy

# Now let's **check the accuracy** of our model by comparing its predictions to the correct classification of the particles.

# In[6]:


# Check model accuracy: fraction of test predictions matching the true labels
acc = np.mean(test_label == result_predict_xgb_test)
print(acc)
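
# Equivalently (an optional sketch, not in the original demo), scikit-learn's accuracy_score reports the same fraction of correct predictions:

# In[ ]:


from sklearn.metrics import accuracy_score

print(accuracy_score(test_label, result_predict_xgb_test))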


# ## Calculate Training Time

# In[7]:


xgb_total = t1 - t0
print("Training time (s):", xgb_total)

# ### Visualization

# **Directions:**
#
# If ./perf_numbers.csv was already created by a previous comparison of the two environment runs (aikit-base and XGBoost 0.81), remove it first (a helper sketch follows below).
#
# Run the following cell in both environments to generate the dataframe used for the visualization (run the demo in one environment, then switch to the other environment and run it again).
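
# A minimal helper (not part of the original demo) for clearing a stale results file is sketched below. Keep it commented out between the two environment runs, or it will erase the first run's timing:

# In[ ]:


# Uncomment only when starting a fresh two-environment comparison:
# if os.path.isfile("./perf_numbers.csv"):
#     os.remove("./perf_numbers.csv")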

# In[8]:


filename = "./perf_numbers.csv"

xgb_ver = xgb.__version__

if not os.path.isfile(filename):
    # First run: create the results file with this environment's timing
    df = pd.DataFrame([[xgb_ver, xgb_total]], columns=["XGBoost Version", "Time in Sec"])
    df.to_csv(filename, index=False)
else:
    # Second run: append this environment's timing to the existing file
    df = pd.read_csv(filename)
    if not df.shape[0] == 2:
        df2 = pd.DataFrame([[xgb_ver, xgb_total]], columns=["XGBoost Version", "Time in Sec"])
        df = pd.concat([df, df2], ignore_index=True)  # DataFrame.append is deprecated
        df.to_csv(filename, index=False)  # persist both timings



# **Only run the following cells after running the demo in both environments.** This will generate the performance visualization.

# In[9]:


if os.path.isfile(filename) and df.shape[0] == 2:
    df.plot(x='XGBoost Version', y='Time in Sec', kind='bar', width=0.5)
    plt.xlabel('XGBoost Version'); plt.ylabel('Time in Sec'); plt.title('XGBoost Performance Comparison')
    plt.show()


# In[10]:


df


# In[11]:


print("[CODE_SAMPLE_COMPLETED_SUCCESFULLY]")


# In[ ]:

