Init and have all packages required

2024-09-04 10:15:43 +02:00 · 2024-09-04 10:15:43 +02:00 · 782aba19ba
commit 782aba19ba
53 changed files with 21896 additions and 0 deletions
--- a/solution-v2/diabetes_training/diabetes_training.py
+++ b/solution-v2/diabetes_training/diabetes_training.py
@ -0,0 +1,115 @@
+# Import libraries
+import argparse
+import os
+
+import matplotlib.pyplot as plt
+import mlflow
+import mlflow.sklearn
+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+
+def main():
+    """Main function of the script."""
+
+    # Input and output arguments
+    # Get script arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data",
+        type=str,
+        help="path to input data",
+    )
+    parser.add_argument("--registered_model_name", type=str, help="model name")
+    args = parser.parse_args()
+    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))
+
+    # Start Logging
+    mlflow.start_run()
+
+    # enable autologging
+    mlflow.sklearn.autolog()
+
+    # load the diabetes data (passed as an input dataset)
+    print("input data:", args.data)
+
+    diabetes = pd.read_csv(args.data)
+
+    mlflow.log_metric("num_samples", diabetes.shape[0])
+    mlflow.log_metric("num_features", diabetes.shape[1] - 1)
+
+    # Separate features and labels
+    X, y = (
+        diabetes[
+            [
+                "Pregnancies",
+                "PlasmaGlucose",
+                "DiastolicBloodPressure",
+                "TricepsThickness",
+                "SerumInsulin",
+                "BMI",
+                "DiabetesPedigree",
+                "Age",
+            ]
+        ].values,
+        diabetes["Diabetic"].values,
+    )
+
+    # Split data into training set and test set
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.30, random_state=0
+    )
+
+    # Train a decision tree model
+    print("Training a decision tree model")
+    model = DecisionTreeClassifier().fit(X_train, y_train)
+
+    # calculate accuracy
+    y_hat = model.predict(X_test)
+    accuracy = np.average(y_hat == y_test)
+    print("Accuracy:", accuracy)
+    mlflow.log_metric("Accuracy", float(accuracy))
+
+    # calculate AUC
+    y_scores = model.predict_proba(X_test)
+    auc = roc_auc_score(y_test, y_scores[:, 1])
+    print("AUC: " + str(auc))
+    mlflow.log_metric("AUC", float(auc))
+
+    # plot ROC curve
+    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
+    fig = plt.figure(figsize=(6, 4))
+    # Plot the diagonal 50% line
+    plt.plot([0, 1], [0, 1], "k--")
+    # Plot the FPR and TPR achieved by our model
+    plt.plot(fpr, tpr)
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("ROC Curve")
+    fig.savefig("ROC.png")
+    mlflow.log_artifact("ROC.png")
+    plt.show()
+
+    # Registering the model to the workspace
+    print("Registering the model via MLFlow")
+    mlflow.sklearn.log_model(
+        sk_model=model,
+        registered_model_name=args.registered_model_name,
+        artifact_path=args.registered_model_name,
+    )
+
+    # Saving the model to a file
+    mlflow.sklearn.save_model(
+        sk_model=model,
+        path=os.path.join(args.registered_model_name, "trained_model"),
+    )
+
+    # Stop Logging
+    mlflow.end_run()
+
+
+# Run the training pipeline only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()