 import mlflow.azureml
 import seaborn as sns
 import argparse
+from lightgbm import LGBMClassifier

 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.preprocessing import StandardScaler, LabelEncoder, MaxAbsScaler
 from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve

-def split_dataset(X_raw, Y):
-    A = X_raw[['UniqueCarrier']]
-    X = X_raw.drop(labels=['UniqueCarrier'],axis = 1)
-    X = pd.get_dummies(X)
+import collections
+import shutil

+# brute force delete local model directory if it exists
+shutil.rmtree('model', ignore_errors=True)
+
+
+
+def split_dataset(X, Y):

     le = LabelEncoder()
     Y = le.fit_transform(Y)
-
-    X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(X_raw,
+    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                          Y,
-                                                         A,
                                                          test_size = 0.2,
-                                                         random_state=123,
-                                                         stratify=Y)
+                                                         random_state=123)

-    # Work around indexing bug
-    X_train = X_train.reset_index(drop=True)
-    A_train = A_train.reset_index(drop=True)
-    X_test = X_test.reset_index(drop=True)
-    A_test = A_test.reset_index(drop=True)

-    return X_train, X_test, Y_train, Y_test, A_train, A_test
+    return X_train, X_test, Y_train, Y_test

 def prepareDataset(df):
     Y = df['ArrDelay15'].values
     synth_df = df.drop(columns=['ArrDelay15'])
+    print(collections.Counter(Y))
     return synth_df, Y

 def analyze_model(clf, X_test, Y_test, preds):
-    with mlflow.start_run() as run:
     accuracy = accuracy_score(Y_test, preds)
-    print(f'Accuracy', np.float(accuracy))
-    mlflow.log_metric(f'Accuracy', np.float(accuracy))
+    print(f'Accuracy', float(accuracy))
+    mlflow.log_metric(f'Accuracy', float(accuracy))

     precision = precision_score(Y_test, preds, average="macro")
-    print(f'Precision', np.float(precision))
-    mlflow.log_metric(f'Precision', np.float(precision))
+    print(f'Precision', float(precision))
+    mlflow.log_metric(f'Precision', float(precision))

     recall = recall_score(Y_test, preds, average="macro")
-    print(f'Recall', np.float(recall))
-    mlflow.log_metric(f'Recall', np.float(recall))
+    print(f'Recall', float(recall))
+    mlflow.log_metric(f'Recall', float(recall))

     f1score = f1_score(Y_test, preds, average="macro")
-    print(f'F1 Score', np.float(f1score))
-    mlflow.log_metric(f'F1 Score', np.float(f1score))
-
-    mlflow.sklearn.log_model(clf, artifact_path="outputs", registered_model_name="fd_model_mlflow_proj")
+    print(f'F1 Score', float(f1score))
+    mlflow.log_metric(f'F1 Score', float(f1score))

+    mlflow.lightgbm.log_model(clf, artifact_path="outputs", registered_model_name="fd_model_mlflow_proj")
+    mlflow.lightgbm.save_model(clf, path="model")
+
     class_names = clf.classes_
     fig, ax = plt.subplots()
     tick_marks = np.arange(len(class_names))
@@ -91,25 +89,31 @@ def analyze_model(clf, X_test, Y_test, preds):

 parser = argparse.ArgumentParser()

-parser.add_argument("--data", type=str, help="input data path")
+parser.add_argument("--data", type=str, help="input data path", default=".")

 args = parser.parse_args()
 print(args.data)

 data = pd.read_csv(args.data+'/flightdelayweather_ds_clean.csv')

-mlflow.sklearn.autolog()
+# mlflow.sklearn.autolog()

-synth_df, Y = prepareDataset(data)
+X, y = prepareDataset(data)

 #Split dataset
-X_train, X_test, Y_train, Y_test, A_train, A_test = split_dataset(synth_df, Y)
+X_train, X_test, y_train, y_test = split_dataset(X, y)
+print(X_train.dtypes)
+print(y_train)

 # Setup scikit-learn pipeline
-numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
+
+clf = LGBMClassifier(learning_rate=0.24945760279230222, max_bin=511,
+                     min_child_samples=29, n_estimators=80, num_leaves=21,
+                     reg_alpha=0.0020334241010261135, reg_lambda=0.04344763354508823, metric='auc', is_unbalance='true')
+

-clf = Pipeline(steps=[('classifier', LogisticRegression(solver='liblinear', fit_intercept=True))])
+# Analyze the model

-model = clf.fit(X_train, Y_train)
-preds = clf.predict(X_test)
-analyze_model(clf, X_test, Y_test, preds)
+model = clf.fit(X_train, y_train)
+preds = model.predict(X_test)
+analyze_model(clf, X_test, y_test, preds)