# 33-AzureML-2/solution-v2/diabetes_training/diabetes_training.py

# Import libraries
import argparse
import os

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# Columns of the input CSV that are fed to the classifier as features.
_FEATURE_COLUMNS = [
    "Pregnancies",
    "PlasmaGlucose",
    "DiastolicBloodPressure",
    "TricepsThickness",
    "SerumInsulin",
    "BMI",
    "DiabetesPedigree",
    "Age",
]
# Column of the input CSV that holds the label to predict.
_LABEL_COLUMN = "Diabetic"


def _parse_args(argv=None):
    """Parse the command-line arguments of the training script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with ``data`` and
        ``registered_model_name`` attributes.
    """
    parser = argparse.ArgumentParser()
    # Both arguments are required: without them the script would only fail
    # later with an unrelated error (reading ``None`` as a CSV path or
    # joining ``None`` into the output path).
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="path to input data",
    )
    parser.add_argument(
        "--registered_model_name",
        type=str,
        required=True,
        help="model name",
    )
    return parser.parse_args(argv)


def _log_roc_curve(y_true, positive_scores):
    """Plot the ROC curve, save it as ``ROC.png`` and log it as an artifact.

    Args:
        y_true: True labels of the test set.
        positive_scores: Predicted scores for the positive class.
    """
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    fig = plt.figure(figsize=(6, 4))
    try:
        # Plot the diagonal 50% line
        plt.plot([0, 1], [0, 1], "k--")
        # Plot the FPR and TPR achieved by our model
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        fig.savefig("ROC.png")
        mlflow.log_artifact("ROC.png")
    finally:
        # Release the figure instead of calling plt.show(): the plot is
        # already persisted as an artifact, and showing it would block on
        # interactive backends (or warn on non-interactive ones).
        plt.close(fig)


def main(argv=None):
    """Train a decision tree on the diabetes data and log it with MLflow.

    Reads the CSV given by ``--data``, logs dataset size metrics, trains a
    ``DecisionTreeClassifier`` on a 70/30 train/test split, logs accuracy,
    AUC and the ROC curve, then logs/registers the model under
    ``--registered_model_name`` and saves a copy to
    ``<registered_model_name>/trained_model``.

    Args:
        argv: Optional argument list; ``None`` (the default) parses
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If a required command-line argument is missing.
    """
    # Get script arguments (before starting a run, so a usage error does not
    # leave an empty run behind).
    args = _parse_args(argv)
    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    # Start Logging. The context manager ends the run even when a step below
    # raises, instead of relying on a trailing mlflow.end_run() call.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        # load the diabetes data (passed as an input dataset)
        print("input data:", args.data)
        diabetes = pd.read_csv(args.data)

        mlflow.log_metric("num_samples", diabetes.shape[0])
        # Report the number of features actually used for training rather
        # than "all columns minus one", which over-counts when the CSV
        # carries extra non-feature columns.
        mlflow.log_metric("num_features", len(_FEATURE_COLUMNS))

        # Separate features and labels
        X = diabetes[_FEATURE_COLUMNS].values
        y = diabetes[_LABEL_COLUMN].values

        # Split data into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.30, random_state=0
        )

        # Train a decision tree model
        print("Training a decision tree model")
        model = DecisionTreeClassifier().fit(X_train, y_train)

        # calculate accuracy
        y_hat = model.predict(X_test)
        accuracy = np.average(y_hat == y_test)
        print("Accuracy:", accuracy)
        mlflow.log_metric("Accuracy", float(accuracy))

        # calculate AUC from the positive-class scores (second column)
        positive_scores = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, positive_scores)
        print("AUC: " + str(auc))
        mlflow.log_metric("AUC", float(auc))

        # plot ROC curve
        _log_roc_curve(y_test, positive_scores)

        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=model,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=model,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )


# Run the training only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()