# Import libraries
import argparse
import os

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Columns of the input CSV that are fed to the model as features.
FEATURE_COLUMNS = [
    "Pregnancies",
    "PlasmaGlucose",
    "DiastolicBloodPressure",
    "TricepsThickness",
    "SerumInsulin",
    "BMI",
    "DiabetesPedigree",
    "Age",
]

# Column of the input CSV that holds the label to predict.
LABEL_COLUMN = "Diabetic"


def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the script.

    Both arguments are required: without ``--data`` there is nothing to
    read, and without ``--registered_model_name`` the model can neither be
    registered nor saved, so failing at parse time beats failing after the
    model has already been trained.

    Returns:
        Namespace with the attributes ``data`` and ``registered_model_name``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="path to input data",
    )
    parser.add_argument(
        "--registered_model_name",
        type=str,
        required=True,
        help="model name",
    )
    return parser.parse_args()


def _log_roc_curve(y_true, y_score) -> None:
    """Plot the ROC curve, save it as ``ROC.png`` and log it to MLflow.

    Args:
        y_true: True labels of the test set.
        y_score: Scores passed to ``roc_curve`` for the same samples.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, y_score)

    fig = plt.figure(figsize=(6, 4))
    # Plot the diagonal 50% line
    plt.plot([0, 1], [0, 1], "k--")
    # Plot the FPR and TPR achieved by our model
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")

    # The image is written to the current working directory and then
    # uploaded as an artifact of the active run.
    fig.savefig("ROC.png")
    mlflow.log_artifact("ROC.png")

    # Kept from the original script. With an interactive backend this blocks
    # until the window is closed.
    plt.show()
    # Release the figure once it has been saved and shown.
    plt.close(fig)


def main() -> None:
    """Main function of the script.

    Reads the CSV given by ``--data``, trains a ``DecisionTreeClassifier`` on
    a 70/30 train/test split, logs sample/feature counts, accuracy, AUC and
    the ROC curve to MLflow, then logs the model under
    ``--registered_model_name`` and also saves it to
    ``<registered_model_name>/trained_model``.
    """
    # Input and output arguments
    args = _parse_args()
    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

    # Start logging. Using the run as a context manager guarantees that the
    # run is ended even when loading, training or registration raises.
    with mlflow.start_run():
        # enable autologging
        mlflow.sklearn.autolog()

        # load the diabetes data (passed as an input dataset)
        print("input data:", args.data)
        diabetes = pd.read_csv(args.data)

        mlflow.log_metric("num_samples", diabetes.shape[0])
        # NOTE(review): this counts every CSV column except one, which may
        # differ from len(FEATURE_COLUMNS) if the file carries extra columns
        # (e.g. an ID). Left unchanged to keep the logged metric stable.
        mlflow.log_metric("num_features", diabetes.shape[1] - 1)

        # Separate features and labels
        X = diabetes[FEATURE_COLUMNS].values
        y = diabetes[LABEL_COLUMN].values

        # Split data into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.30, random_state=0
        )

        # Train a decision tree model
        print("Training a decision tree model")
        model = DecisionTreeClassifier().fit(X_train, y_train)

        # calculate accuracy
        y_hat = model.predict(X_test)
        accuracy = np.average(y_hat == y_test)
        print("Accuracy:", accuracy)
        mlflow.log_metric("Accuracy", float(accuracy))

        # calculate AUC from the second column of the predicted probabilities
        # (presumably the positive, i.e. diabetic, class; verify against data)
        positive_scores = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, positive_scores)
        print("AUC: " + str(auc))
        mlflow.log_metric("AUC", float(auc))

        # plot ROC curve
        _log_roc_curve(y_test, positive_scores)

        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.sklearn.log_model(
            sk_model=model,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.sklearn.save_model(
            sk_model=model,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )
    # Leaving the ``with`` block stops logging (ends the MLflow run).


if __name__ == "__main__":
    main()