
Commit 759fb39

Update train.py
1 parent 38a61de commit 759fb39

File tree

1 file changed: +40, -36 lines


train.py

Lines changed: 40 additions & 36 deletions
@@ -5,62 +5,60 @@
 import mlflow.azureml
 import seaborn as sns
 import argparse
+from lightgbm import LGBMClassifier
 
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.preprocessing import StandardScaler, LabelEncoder, MaxAbsScaler
 from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
 
-def split_dataset(X_raw, Y):
-    A = X_raw[['UniqueCarrier']]
-    X = X_raw.drop(labels=['UniqueCarrier'],axis = 1)
-    X = pd.get_dummies(X)
+import collections
+import shutil
 
+# brute force delete local model directory if it exists
+shutil.rmtree('model', ignore_errors=True)
+
+
+
+def split_dataset(X, Y):
 
     le = LabelEncoder()
     Y = le.fit_transform(Y)
-
-    X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(X_raw,
+    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                         Y,
-                                                        A,
                                                         test_size = 0.2,
-                                                        random_state=123,
-                                                        stratify=Y)
+                                                        random_state=123)
 
-    # Work around indexing bug
-    X_train = X_train.reset_index(drop=True)
-    A_train = A_train.reset_index(drop=True)
-    X_test = X_test.reset_index(drop=True)
-    A_test = A_test.reset_index(drop=True)
 
-    return X_train, X_test, Y_train, Y_test, A_train, A_test
+    return X_train, X_test, Y_train, Y_test
 
 def prepareDataset(df):
     Y = df['ArrDelay15'].values
     synth_df = df.drop(columns=['ArrDelay15'])
+    print(collections.Counter(Y))
     return synth_df, Y
 
 def analyze_model(clf, X_test, Y_test, preds):
-    with mlflow.start_run() as run:
     accuracy = accuracy_score(Y_test, preds)
-    print(f'Accuracy', np.float(accuracy))
-    mlflow.log_metric(f'Accuracy', np.float(accuracy))
+    print(f'Accuracy', float(accuracy))
+    mlflow.log_metric(f'Accuracy', float(accuracy))
 
     precision = precision_score(Y_test, preds, average="macro")
-    print(f'Precision', np.float(precision))
-    mlflow.log_metric(f'Precision', np.float(precision))
+    print(f'Precision', float(precision))
+    mlflow.log_metric(f'Precision', float(precision))
 
     recall = recall_score(Y_test, preds, average="macro")
-    print(f'Recall', np.float(recall))
-    mlflow.log_metric(f'Recall', np.float(recall))
+    print(f'Recall', float(recall))
+    mlflow.log_metric(f'Recall', float(recall))
 
     f1score = f1_score(Y_test, preds, average="macro")
-    print(f'F1 Score', np.float(f1score))
-    mlflow.log_metric(f'F1 Score', np.float(f1score))
-
-    mlflow.sklearn.log_model(clf, artifact_path="outputs", registered_model_name="fd_model_mlflow_proj")
+    print(f'F1 Score', float(f1score))
+    mlflow.log_metric(f'F1 Score', float(f1score))
 
+    mlflow.lightgbm.log_model(clf, artifact_path="outputs", registered_model_name="fd_model_mlflow_proj")
+    mlflow.lightgbm.save_model(clf, path="model")
+
     class_names = clf.classes_
     fig, ax = plt.subplots()
     tick_marks = np.arange(len(class_names))
@@ -91,25 +89,31 @@ def analyze_model(clf, X_test, Y_test, preds):
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument("--data", type=str, help="input data path")
+parser.add_argument("--data", type=str, help="input data path", default=".")
 
 args = parser.parse_args()
 print(args.data)
 
 data = pd.read_csv(args.data+'/flightdelayweather_ds_clean.csv')
 
-mlflow.sklearn.autolog()
+# mlflow.sklearn.autolog()
 
-synth_df, Y = prepareDataset(data)
+X, y = prepareDataset(data)
 
 #Split dataset
-X_train, X_test, Y_train, Y_test, A_train, A_test = split_dataset(synth_df, Y)
+X_train, X_test, y_train, y_test = split_dataset(X, y)
+print(X_train.dtypes)
+print(y_train)
 
 # Setup scikit-learn pipeline
-numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
+
+clf = LGBMClassifier(learning_rate=0.24945760279230222, max_bin=511,
+                     min_child_samples=29, n_estimators=80, num_leaves=21,
+                     reg_alpha=0.0020334241010261135, reg_lambda=0.04344763354508823, metric='auc', is_unbalance='true')
+
 
-clf = Pipeline(steps=[('classifier', LogisticRegression(solver='liblinear', fit_intercept=True))])
+# Analyze the model
 
-model = clf.fit(X_train, Y_train)
-preds = clf.predict(X_test)
-analyze_model(clf, X_test, Y_test, preds)
+model = clf.fit(X_train, y_train)
+preds = model.predict(X_test)
+analyze_model(clf, X_test, y_test, preds)
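
For reference, the flow this commit moves to (an LGBMClassifier fit directly on the split data, with metrics and the model logged through MLflow) can be exercised end to end with the minimal sketch below. It is an illustration only, not the repository's script: it swaps in a synthetic dataset from sklearn.datasets.make_classification in place of flightdelayweather_ds_clean.csv, logs to whatever MLflow tracking backend is configured locally, and omits the registered model name; the hyperparameters are the ones introduced by this commit.

# Minimal sketch of the updated training flow (synthetic data stands in for the
# real flight-delay CSV; assumes lightgbm and mlflow are installed).
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Imbalanced synthetic stand-in for the flight-delay features and the ArrDelay15 label.
X, y = make_classification(n_samples=2000, n_features=20, weights=[0.8, 0.2], random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Hyperparameters taken from the commit; is_unbalance compensates for the skewed label.
clf = LGBMClassifier(learning_rate=0.24945760279230222, max_bin=511,
                     min_child_samples=29, n_estimators=80, num_leaves=21,
                     reg_alpha=0.0020334241010261135, reg_lambda=0.04344763354508823,
                     metric='auc', is_unbalance='true')

with mlflow.start_run():
    model = clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    mlflow.log_metric('Accuracy', float(accuracy_score(y_test, preds)))
    mlflow.log_metric('F1 Score', float(f1_score(y_test, preds, average="macro")))
    # Recent MLflow versions accept scikit-learn-style LightGBM estimators here,
    # not only Booster objects.
    mlflow.lightgbm.log_model(model, artifact_path="outputs")

A model written locally via mlflow.lightgbm.save_model(clf, path="model"), as the new analyze_model does, can later be reloaded for scoring with mlflow.lightgbm.load_model("model").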
