Shamil's Notepad

Hyperparameter optimization using Optuna

# Hyperparameter optimization for Random Forest Classifier using the Optuna lib
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

optuna.logging.set_verbosity(optuna.logging.WARNING)

# One seed for data generation, the split and every forest, so the snippet is
# self-contained (SEED was previously used here without being defined).
SEED = 42

X, y = make_classification(
    n_samples=250,
    n_features=10,
    n_informative=5,
    n_redundant=3,
    random_state=SEED,
    shuffle=True,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SEED
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


def objective(trial):
    """Return the mean 5-fold CV score of a forest built from sampled params.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the cross-validation scores on ``X_train`` / ``y_train``
        (the classifier's default score, i.e. accuracy). Higher is better,
        so the study must be created with ``direction="maximize"``.
    """
    params = {
        # Number of trees in random forest
        "n_estimators": trial.suggest_int(
            name="n_estimators", low=100, high=500, step=100
        ),
        # Number of features to consider at every split.
        # 'auto' was only an alias of 'sqrt' for classifiers and is rejected
        # by scikit-learn >= 1.3, so two distinct, valid options are offered.
        "max_features": trial.suggest_categorical(
            name="max_features", choices=["sqrt", "log2"]
        ),
        # Maximum number of levels in tree
        "max_depth": trial.suggest_int(
            name="max_depth", low=10, high=110, step=20
        ),
        # Minimum number of samples required to split a node
        "min_samples_split": trial.suggest_int(
            name="min_samples_split", low=2, high=10, step=2
        ),
        # Minimum number of samples required at each leaf node
        "min_samples_leaf": trial.suggest_int(
            name="min_samples_leaf", low=1, high=4, step=1
        ),
    }

    model = RandomForestClassifier(random_state=SEED, **params)

    cv_score = cross_val_score(model, X_train, y_train, n_jobs=4, cv=5)
    mean_cv_accuracy = cv_score.mean()
    return mean_cv_accuracy


# Optuna minimizes by default; the objective is an accuracy, so maximize it.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

# Train a new model using the best parameters
best_model = RandomForestClassifier(random_state=SEED, **study.best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary"
)

print("test_accuracy:", test_acc)
print("test_precision:", test_precision)
print("test_recall:", test_recall)
print("test_f1_score:", test_f1)

MLflow autolog

MLFow with autologging and custom metrics from sklearn.metrics import precision_recall_fscore_support from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report from sklearn.datasets import make_classification from sklearn.metrics import accuracy_score import mlflow mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.set_experiment("experiment-001") # ------------------------------------------------ # X, y = make_classification( n_samples=250, n_features=10, n_informative=5, n_redundant=3, random_state=42, shuffle=True ) print(X.shape, y.shape) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42 ) print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) # ------------------------------------------------ # %%time with mlflow.start_run(): mlflow.sklearn.autolog(exclusive=False) n_estimators = 50 max_depth = 5 mlflow.log_param("max_depth", max_depth) mlflow.log_param("n_estimators", n_estimators) model = RandomForestClassifier( random_state=42, max_depth=max_depth, n_estimators=n_estimators ) model.fit(X_train, y_train) y_pred = model.predict(X_test) # y_proba = model.predict_proba(X_test) mlflow.log_dict( { "y_test": [int(x) for x in y_test], "y_pred": [int(x) for x in y_pred] }, "ytest-ypred.json" ) test_acc = accuracy_score(y_test, y_pred) test_precision, test_recall, test_f1, _ = precision_recall_fscore_support( y_test, y_pred, average='binary' ) mlflow.log_metric("test_accuracy", test_acc) mlflow.log_metric("test_precision", test_precision) mlflow.log_metric("test_recall", test_recall) mlflow.log_metric("test_f1_score", test_f1) print("test_accuracy:", test_acc) print("test_precision:", test_precision) print("test_recall:", test_recall) print("test_f1_score:", test_f1) mlflow.sklearn.autolog(disable=True)

Randomized Search CV

# Hyperparameter tuning for Random Forest Classifier using the RandomizedSearchCV class
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

SEED = 42

# Number of trees in random forest
n_estimators = list(range(100, 501, 100))
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so two distinct, valid options are offered instead.
max_features = ["sqrt", "log2"]
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

# Random search of parameters, using 3-fold cross validation: sample 10
# parameter combinations and evaluate them on 4 parallel workers.
# The estimator is seeded as well, so repeated searches give the same scores.
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=SEED),
    param_distributions=random_grid,
    scoring="average_precision",
    random_state=SEED,
    n_iter=10,
    verbose=2,
    n_jobs=4,
    cv=3,
)

# Fit the random search model.
# X_train / y_train are expected to exist already (e.g. from a
# train_test_split as in the sections above).
rf_random.fit(X_train, y_train)

Model training duration

# Model training evaluation using Matplotlib / seaborn scatter plot, colors on condition and custom color palette.
# Imports are spelled out so the snippet runs on its own.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# Random demo data: a = user-defined runtime, b = actual runtime (seconds).
a = pd.Series(np.random.randint(60, 180, 25))
b = pd.Series(np.random.randint(55, 160, 25))

# Smallest and largest value over both series; used as the end points of the
# diagonal reference line (actual == user-defined).
x_min = min(a.min(), b.min())
y_max = max(a.max(), b.max())

# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.scatterplot(x=a, y=b, ax=ax1)
ax1.plot([x_min, y_max], [x_min, y_max], ":", color="grey")
ax1.set_title("Model training runtime (Experiment #2)", size=16)
ax1.set_xlabel("User-defined runtime (sec.)", size=14)
ax1.set_ylabel("Actual runtime (sec.)", size=14)

# cond is 1 when the actual runtime did not exceed the user-defined one.
data = pd.DataFrame({"a": a, "diff": (b - a), "cond": ((b - a) <= 0) * 1})

sns.scatterplot(
    x="a",
    y="diff",
    data=data,
    ax=ax2,
    hue="cond",
    palette={0: "tab:orange", 1: "tab:green"},
    legend=False,
)
# axhline's xmin/xmax are axes fractions in [0, 1], not data or index values;
# the defaults already span the full width of the axes.
ax2.axhline(y=0, linestyle=":", color="grey")
ax2.set_title("Runtime difference in seconds (lower is better)", size=16)
ax2.set_ylabel("Runtime difference (sec.)", size=14)
ax2.set_xlabel("User-defined runtime (sec.)", size=14)

plt.show()

# Output:

MLflow artifacts

How to save MLflow runs and artifacts into an external folder

Install the mlflow lib: pip install mlflow
Create a folder for the artifacts: mkdir mlflow-artifacts
Create a folder for the SQLite database(s): mkdir mlflow-dbs
Start the MLflow UI in a terminal:

mlflow ui \
    --backend-store-uri sqlite:///mlflow-dbs/db-20220822.sqlite \
    --default-artifact-root mlflow-artifacts/

The MLflow UI will be served on http://127.0.0.1:5000
Create a project folder: mkdir your-project
Switch to the project directory: cd your-project
Connect to the UI and run experiments:

# experiment.py
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("experiment-001")

with mlflow.start_run():
    mlflow.log_param("num_layers", 5)
    mlflow.log_metric("accuracy", 0.75)

Run the script: python experiment.py
Autologging contents to an active fluent run, which may be user-created: ...