Resize an image and bounding boxes

Resize an image together with its bounding boxes using Albumentations.

Reference: https://sheldonsebastian94.medium.com/resizing-image-and-bounding-boxes-for-object-detection-7b9d9463125a

import albumentations
from PIL import Image
import numpy as np

sample_img = Image.open("data/img1.jpg")
sample_arr = np.asarray(sample_img)


def resize_image(img_arr, bboxes, h, w):
    """
    :param img_arr: original image as a numpy array
    :param bboxes: bboxes as numpy array where each row is 'x_min', 'y_min', 'x_max', 'y_max', "class_id"
    :param h: resized height dimension of image
    :param w: resized width dimension of image
    :return: dictionary containing {image: transformed, bboxes: ['x_min', 'y_min', 'x_max', 'y_max', "class_id"]}
    """
    # create resize transform pipeline
    transform = albumentations.Compose(
        [albumentations.Resize(height=h, width=w, always_apply=True)],
        bbox_params=albumentations.BboxParams(format='pascal_voc'))

    transformed = transform(image=img_arr, bboxes=bboxes)

    return transformed


# bboxes_og: numpy array of [x_min, y_min, x_max, y_max, class_id] rows, defined elsewhere
transformed_dict = resize_image(sample_arr, bboxes_og, 224, 224)

# contains the image as array
transformed_arr = transformed_dict["image"]

# contains the resized bounding boxes
transformed_info = np.array(list(map(list, transformed_dict["bboxes"]))).astype(float)
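The snippet assumes a `bboxes_og` array defined elsewhere. As a rough sketch, here is one way it could look and how the resized boxes could be drawn back onto the resized image for a visual check; the box coordinates, class ids, and output path are made up for illustration.

import numpy as np
from PIL import Image, ImageDraw

# Hypothetical boxes in pascal_voc format: [x_min, y_min, x_max, y_max, class_id]
bboxes_og = np.array([
    [ 34,  50, 210, 320, 0],
    [120,  80, 400, 300, 1],
])

transformed_dict = resize_image(sample_arr, bboxes_og, 224, 224)

# Draw the rescaled boxes on the resized image to verify the transform visually
resized_img = Image.fromarray(transformed_dict["image"])
draw = ImageDraw.Draw(resized_img)
for x_min, y_min, x_max, y_max, class_id in transformed_dict["bboxes"]:
    draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
resized_img.save("data/img1_resized_with_boxes.jpg")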

November 17, 2022 · 1 min

Confusion matrix for cross-validation

Confusion matrix for cross-validation. Per-fold results are saved as MLflow metrics and artifacts and retrieved later for evaluation.

import mlflow
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold

SEED = 42

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("cross-validation-average")

# Data preparation
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
print(X.shape, y.shape)

# Model training
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    with mlflow.start_run():
        mlflow.sklearn.autolog(exclusive=False)

        model = RandomForestClassifier(random_state=SEED)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mlflow.log_dict({"y_test": [int(x) for x in y_test],
                         "y_pred": [int(x) for x in y_pred]},
                        "ytest-ypred.json")

        test_acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("test_accuracy", test_acc)
        print("test_accuracy:", test_acc)

        test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary'
        )
        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall", test_recall)
        mlflow.log_metric("test_f1_score", test_f1)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        tn, fp, fn, tp = confusion_matrix(
            y_test, y_pred, normalize="true"
        ).ravel()
        mlflow.log_metric("tn_normalized", tn)
        mlflow.log_metric("fp_normalized", fp)
        mlflow.log_metric("fn_normalized", fn)
        mlflow.log_metric("tp_normalized", tp)

        mlflow.sklearn.autolog(disable=True)

# Results evaluation
# The hard-coded experiment id must match the "cross-validation-average" experiment created above
runs = mlflow.search_runs(experiment_ids=["2"])

columns = [
    'metrics.tn_normalized',
    'metrics.fp_normalized',
    'metrics.fn_normalized',
    'metrics.tp_normalized'
]

mean_confusion_matrix = runs[columns].mean()
print(mean_confusion_matrix)

Output: ...
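To eyeball the averaged result, a small sketch that rebuilds the 2x2 matrix from the averaged normalized cells and plots it with scikit-learn's ConfusionMatrixDisplay; the display labels assume the breast-cancer dataset's target order (0 = malignant, 1 = benign).

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Rebuild the averaged normalized confusion matrix in the order ravel() produced it:
# [[tn, fp], [fn, tp]]
mean_cm = np.array([
    [mean_confusion_matrix['metrics.tn_normalized'], mean_confusion_matrix['metrics.fp_normalized']],
    [mean_confusion_matrix['metrics.fn_normalized'], mean_confusion_matrix['metrics.tp_normalized']],
])

disp = ConfusionMatrixDisplay(confusion_matrix=mean_cm, display_labels=["malignant", "benign"])
disp.plot(values_format=".2f")
plt.title("Mean normalized confusion matrix (5-fold CV)")
plt.show()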

November 2, 2022 · 1 min

Hyperparameter optimization using Optuna

Hyperparameter optimization for a Random Forest classifier using the Optuna library.

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

SEED = 42

optuna.logging.set_verbosity(optuna.logging.WARNING)

X, y = make_classification(
    n_samples=250, n_features=10, n_informative=5,
    n_redundant=3, random_state=42, shuffle=True
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


def objective(trial):
    # Number of trees in random forest
    n_estimators = trial.suggest_int(name="n_estimators", low=100, high=500, step=100)
    # Number of features to consider at every split
    max_features = trial.suggest_categorical(name="max_features", choices=['auto', 'sqrt'])
    # Maximum number of levels in tree
    max_depth = trial.suggest_int(name="max_depth", low=10, high=110, step=20)
    # Minimum number of samples required to split a node
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)

    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }

    model = RandomForestClassifier(random_state=SEED, **params)
    cv_score = cross_val_score(model, X_train, y_train, n_jobs=4, cv=5)
    mean_cv_accuracy = cv_score.mean()
    return mean_cv_accuracy


# The objective returns a mean accuracy, so the study must maximize it
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

# Train a new model using the best parameters
best_model = RandomForestClassifier(random_state=SEED, **study.best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary'
)

print("test_accuracy:", test_acc)
print("test_precision:", test_precision)
print("test_recall:", test_recall)
print("test_f1_score:", test_f1)
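Once the study has run, its results can be inspected directly on the study object; a brief sketch (the `params_*` column names follow Optuna's `trials_dataframe()` convention):

# Inspect the finished study: best score, best parameters, and the full trial history
print("best mean CV accuracy:", study.best_value)
print("best params:", study.best_params)

# trials_dataframe() returns one row per trial with its value and suggested parameters
trials_df = study.trials_dataframe()
print(trials_df[["number", "value", "params_n_estimators", "params_max_depth"]])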

October 31, 2022 · 2 min

MLflow autolog

MLflow with autologging and custom metrics.

from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("experiment-001")

# ------------------------------------------------ #

X, y = make_classification(
    n_samples=250, n_features=10, n_informative=5,
    n_redundant=3, random_state=42, shuffle=True
)
print(X.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ------------------------------------------------ #

%%time
with mlflow.start_run():
    mlflow.sklearn.autolog(exclusive=False)

    n_estimators = 50
    max_depth = 5
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("n_estimators", n_estimators)

    model = RandomForestClassifier(
        random_state=42, max_depth=max_depth, n_estimators=n_estimators
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # y_proba = model.predict_proba(X_test)

    mlflow.log_dict(
        {
            "y_test": [int(x) for x in y_test],
            "y_pred": [int(x) for x in y_pred]
        },
        "ytest-ypred.json"
    )

    test_acc = accuracy_score(y_test, y_pred)
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )

    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_f1_score", test_f1)

    print("test_accuracy:", test_acc)
    print("test_precision:", test_precision)
    print("test_recall:", test_recall)
    print("test_f1_score:", test_f1)

    mlflow.sklearn.autolog(disable=True)
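After the run finishes, the custom metrics and the autologged model can be pulled back from the tracking server. A sketch, assuming autolog's default "model" artifact path and the same X_test as above:

import mlflow
import mlflow.sklearn

# Look up the experiment by name instead of hard-coding its id
experiment = mlflow.get_experiment_by_name("experiment-001")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Most recent run and its custom metrics
latest = runs.sort_values("start_time", ascending=False).iloc[0]
print(latest["run_id"], latest["metrics.test_accuracy"], latest["metrics.test_f1_score"])

# Reload the autologged model (autolog stores it under the "model" artifact path)
reloaded = mlflow.sklearn.load_model(f"runs:/{latest['run_id']}/model")
print(reloaded.predict(X_test[:5]))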

October 31, 2022 · 1 min

Randomized Search CV

Hyperparameter tuning for a Random Forest classifier using scikit-learn's RandomizedSearchCV class.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

SEED = 42

# Number of trees in random forest
n_estimators = [int(x) for x in range(100, 505, 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# Random search over the grid using 3-fold cross-validation:
# sample 10 parameter combinations (n_iter=10) and run 4 jobs in parallel
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=random_grid,
    scoring="average_precision",
    random_state=SEED,
    n_iter=10,
    verbose=2,
    n_jobs=4,
    cv=3,
)

# Fit the random search model (X_train and y_train are defined elsewhere)
rf_random.fit(X_train, y_train)
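Once the search has finished, the best configuration and the refit estimator are available on the fitted object. A short sketch, assuming an X_test/y_test split that matches X_train/y_train:

from sklearn.metrics import classification_report

# Best cross-validated score and the parameters that produced it
print("best CV average precision:", rf_random.best_score_)
print("best params:", rf_random.best_params_)

# best_estimator_ is already refit on the full training data (refit=True by default)
y_pred = rf_random.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))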

October 31, 2022 · 1 min