Init and have all packages required
This commit is contained in:
commit
782aba19ba
53 changed files with 21896 additions and 0 deletions
115
solution-v2/diabetes_training/diabetes_training.py
Normal file
115
solution-v2/diabetes_training/diabetes_training.py
Normal file
|
@ -0,0 +1,115 @@
|
|||
# Import libraries
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import mlflow
|
||||
import mlflow.sklearn
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import roc_auc_score, roc_curve
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
|
||||
def main():
    """Train a decision-tree diabetes classifier and track the run with MLflow.

    Command-line arguments:
        --data: path of the CSV file to train on (required).
        --registered_model_name: name used to register the model; it is also
            used as the MLflow artifact path and as the parent directory of
            the locally saved copy (required).

    The script reads the CSV, trains a ``DecisionTreeClassifier`` on a 70/30
    train/test split, logs sample/feature counts, accuracy and AUC as MLflow
    metrics, logs a ROC plot (``ROC.png``) as an artifact, registers the model
    and finally saves it to ``<registered_model_name>/trained_model``.
    """
    # Both arguments are mandatory: without them the script would only fail
    # later (reading the CSV, or building the save path after training), so
    # let argparse reject the call up front with a clear usage message.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="path to input data",
    )
    parser.add_argument(
        "--registered_model_name",
        type=str,
        required=True,
        help="model name",
    )
    args = parser.parse_args()
    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    # Columns used as model inputs; "Diabetic" is the label column.
    feature_columns = [
        "Pregnancies",
        "PlasmaGlucose",
        "DiastolicBloodPressure",
        "TricepsThickness",
        "SerumInsulin",
        "BMI",
        "DiabetesPedigree",
        "Age",
    ]

    # The context manager ends the run even when a step below raises, so a
    # failed job does not leave the MLflow run open.
    with mlflow.start_run():
        # Enable autologging
        mlflow.sklearn.autolog()

        # Load the diabetes data (passed as an input dataset)
        print("input data:", args.data)
        diabetes = pd.read_csv(args.data)

        mlflow.log_metric("num_samples", diabetes.shape[0])
        # NOTE(review): this counts every column except one, which may differ
        # from len(feature_columns) if the CSV carries extra columns - confirm.
        mlflow.log_metric("num_features", diabetes.shape[1] - 1)

        # Separate features and labels
        X = diabetes[feature_columns].values
        y = diabetes["Diabetic"].values

        # Split data into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.30, random_state=0
        )

        # Train a decision tree model
        print("Training a decision tree model")
        model = DecisionTreeClassifier().fit(X_train, y_train)

        # Calculate accuracy
        y_hat = model.predict(X_test)
        accuracy = np.average(y_hat == y_test)
        print("Accuracy:", accuracy)
        mlflow.log_metric("Accuracy", float(accuracy))

        # Calculate AUC from the predicted probability of the positive class
        y_scores = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_scores[:, 1])
        print("AUC: " + str(auc))
        mlflow.log_metric("AUC", float(auc))

        # Plot ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_scores[:, 1])
        fig = plt.figure(figsize=(6, 4))
        # Plot the diagonal 50% line
        plt.plot([0, 1], [0, 1], "k--")
        # Plot the FPR and TPR achieved by our model
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        fig.savefig("ROC.png")
        mlflow.log_artifact("ROC.png")
        # Close instead of plt.show(): showing would block an unattended run
        # on interactive backends, and the figure is already saved and logged.
        plt.close(fig)

        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=model,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=model,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
|
||||
|
||||
|
||||
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Add a link
Reference in a new issue