Copy-Paste Augmentation for YOLO

Simple copy-paste augmentation for the YOLO object detection task.

from PIL import Image
import random


def paste_image(source_path, target_path, position=None):
    """
    Pastes a small source image onto a larger target image.

    Parameters:
    - source_path: Path to the small source image.
    - target_path: Path to the larger target image.
    - position: Optional; a tuple (x, y) giving the top-left corner where the
      source image will be pasted. If not provided, a random position is chosen.

    Returns:
    - new_image: The target image with the source image pasted onto it.
    - bbox: A tuple (left, upper, right, lower) in pixel coordinates.
    - bbox_yolo: A tuple (x_center, y_center, width, height) describing the
      pasted region in normalized YOLO format.
    """
    # Open the source and target images
    source_image = Image.open(source_path)
    target_image = Image.open(target_path)

    # Get the dimensions of the source and target images
    source_width, source_height = source_image.size
    target_width, target_height = target_image.size

    # Choose a random position if not provided
    if position is None:
        max_x = target_width - source_width
        max_y = target_height - source_height
        if max_x < 0 or max_y < 0:
            raise ValueError("The source image is larger than the target image.")
        position = (random.randint(0, max_x), random.randint(0, max_y))
    else:
        # Ensure the specified position is within bounds
        if (position[0] < 0 or position[1] < 0
                or position[0] + source_width > target_width
                or position[1] + source_height > target_height):
            raise ValueError("The specified position is out of the target image bounds.")

    # Paste the source image onto the target image
    target_image.paste(source_image, position)

    # Calculate the bounding box of the pasted image in pixel coordinates
    left = position[0]
    upper = position[1]
    right = position[0] + source_width
    lower = position[1] + source_height
    bbox = (left, upper, right, lower)

    # Convert the bounding box to YOLO format (normalized center x/y, width, height)
    x_center = (left + right) / 2 / target_width
    y_center = (upper + lower) / 2 / target_height
    width = source_width / target_width
    height = source_height / target_height
    bbox_yolo = (x_center, y_center, width, height)

    return target_image, bbox, bbox_yolo


# Example usage
if __name__ == "__main__":
    source_path = "path/to/small_image.png"  # Replace with the path to your small source image
    target_path = "path/to/large_image.png"  # Replace with the path to your large target image

    # Example with a specified position
    new_image, bbox, bbox_yolo = paste_image(source_path, target_path, position=(50, 50))
    new_image.save("path/to/new_image_specified_position.png")  # Replace with the desired output path
    print("Bounding box of the pasted image (specified position) in YOLO format:", bbox_yolo)

    # Example with a random position
    new_image, bbox, bbox_yolo = paste_image(source_path, target_path)
    new_image.save("path/to/new_image_random_position.png")  # Replace with the desired output path
    print("Bounding box of the pasted image (random position) in YOLO format:", bbox_yolo)
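
To train on the augmented image, the returned bbox_yolo tuple still has to be written to a YOLO label file (one "class x_center y_center width height" line per object). Below is a minimal sketch; the write_yolo_label helper, the class id, and the file paths are illustrative assumptions, not part of the snippet above.

def write_yolo_label(label_path, bbox_yolo, class_id=0):
    # Hypothetical helper: appends one YOLO-format label line for the pasted object.
    # The class id and label path are assumptions chosen for illustration.
    x_center, y_center, width, height = bbox_yolo
    with open(label_path, "a") as f:
        f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")


# Example usage (paths are placeholders):
# new_image, bbox, bbox_yolo = paste_image("small.png", "large.png")
# new_image.save("augmented/img_0001.png")
# write_yolo_label("augmented/img_0001.txt", bbox_yolo, class_id=0)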

April 17, 2025 · 2 min

Add leading zeros to PySpark DataFrame

PySpark DataFrame: add a leading zero if the value is a single digit between 0 and 9.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_extract, length, lpad

# Create a SparkSession
spark = SparkSession.builder.appName("TupleToDataFrame").getOrCreate()

# List of tuples
arr = [("a", "0a1x"), ("x", "3"), ("cc", "11"), ("h", "5")]

# Create a DataFrame
df = spark.createDataFrame(arr, ["column1", "column2"])

# Add a leading zero if the value is a single digit between 0 and 9,
# replace non-digit values with "0000", and leave other values as is
df = df.withColumn(
    "column2",
    when(regexp_extract(col("column2"), "^\\d+$", 0) == "", "0000")
    .when(
        (regexp_extract(col("column2"), "^\\d+$", 0) != "") & (length(col("column2")) == 1),
        lpad(col("column2"), 2, "0"),
    )
    .otherwise(col("column2")),
)

# Show the updated DataFrame
df.show()
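
With the sample data, "0a1x" is not fully numeric and becomes "0000", the single digits "3" and "5" are padded to "03" and "05", and "11" is left unchanged, so df.show() should print roughly:

+-------+-------+
|column1|column2|
+-------+-------+
|      a|   0000|
|      x|     03|
|     cc|     11|
|      h|     05|
+-------+-------+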

December 2, 2024 · 1 min

Read deleted S3 objects in AWS Glue

Read files marked as deleted in a versioned AWS S3 bucket into a DynamicFrame using boto3 and AWS Glue.

import boto3
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.dynamicframe import DynamicFrame

# Initialize Spark and Glue contexts once, so they can be reused both
# inside the function and when writing the result back to S3
sc = SparkContext()
glueContext = GlueContext(sc)


def read_deleted_files_to_dynamicframe(bucket_name, prefix=''):
    # Initialize S3 client
    s3 = boto3.client('s3')

    # List object versions, including delete markers
    paginator = s3.get_paginator('list_object_versions')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    # Collect keys of deleted files (objects whose latest version is a delete marker)
    deleted_files = []
    for page in page_iterator:
        if 'DeleteMarkers' in page:
            for delete_marker in page['DeleteMarkers']:
                if delete_marker['IsLatest']:
                    deleted_files.append(delete_marker['Key'])

    # Read deleted files into a DynamicFrame
    if deleted_files:
        dyf = glueContext.create_dynamic_frame.from_options(
            connection_type="s3",
            connection_options={
                "paths": [f"s3://{bucket_name}/{key}" for key in deleted_files],
                "recurse": True,
                "useS3ListImplementation": True,
                "readVersion": "LATEST_PREVIOUS"  # Read the version before the delete marker
            },
            format="csv",  # Adjust this based on your file format
            format_options={
                "withHeader": True,
                "separator": ","
            }
        )
        return dyf
    else:
        print("No deleted files found.")
        return None


# Usage
bucket_name = 'your-bucket-name'
prefix = 'your-folder-prefix/'  # Optional

dyf = read_deleted_files_to_dynamicframe(bucket_name, prefix)

if dyf:
    # Print the schema of the DynamicFrame
    dyf.printSchema()

    # Convert to DataFrame for further processing if needed
    df = dyf.toDF()
    # Perform your manipulations here

    # Convert back to DynamicFrame if necessary
    result_dyf = DynamicFrame.fromDF(df, glueContext, "result_dyf")

    # Write the result back to S3
    glueContext.write_dynamic_frame.from_options(
        frame=result_dyf,
        connection_type="s3",
        connection_options={"path": "s3://output-bucket/output-path/"},
        format="parquet"
    )
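
If the goal is to make the deleted objects readable again rather than reading around their delete markers, an alternative is to remove the latest delete markers with boto3, which makes the previous object versions current again. A minimal sketch, assuming versioning is enabled and the caller has s3:DeleteObjectVersion permission; the undelete_objects helper is illustrative, not part of the snippet above.

import boto3


def undelete_objects(bucket_name, prefix=''):
    """Remove latest delete markers so the previous object versions become current again."""
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_object_versions')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for marker in page.get('DeleteMarkers', []):
            if marker['IsLatest']:
                # Deleting the delete marker itself "undeletes" the object
                s3.delete_object(Bucket=bucket_name, Key=marker['Key'], VersionId=marker['VersionId'])


# undelete_objects('your-bucket-name', 'your-folder-prefix/')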

September 30, 2024 · 2 min

Read S3 objects based on date in AWS Glue

Read files from an AWS S3 bucket based on their creation date using AWS Glue.

import boto3
from datetime import datetime
from awsglue.context import GlueContext
from pyspark.context import SparkContext

# Initialize Spark and Glue contexts once so they can be reused below
sc = SparkContext()
glueContext = GlueContext(sc)


# First, use boto3 to list and filter objects
def list_s3_files_after_date(bucket_name, prefix='', start_date=None):
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    filtered_files = []
    for page in page_iterator:
        if 'Contents' in page:
            for obj in page['Contents']:
                # S3 doesn't expose a separate creation date, so we use LastModified
                if start_date is None or obj['LastModified'].replace(tzinfo=None) > start_date:
                    filtered_files.append(obj['Key'])

    return filtered_files


# Then, use AWS Glue to create a DynamicFrame from the filtered files
def read_filtered_files_to_dynamicframe(bucket_name, filtered_files):
    if filtered_files:
        dyf = glueContext.create_dynamic_frame.from_options(
            connection_type="s3",
            connection_options={
                "paths": [f"s3://{bucket_name}/{key}" for key in filtered_files],
                "recurse": True,
                "useS3ListImplementation": True
            },
            format="csv",  # Adjust this based on your file format
            format_options={
                "withHeader": True,
                "separator": ","
            }
        )
        return dyf
    else:
        print("No files found matching the criteria.")
        return None


# Usage
bucket_name = 'your-bucket-name'
start_date = datetime(2024, 1, 1)  # Files created after January 1, 2024

filtered_files = list_s3_files_after_date(bucket_name, start_date=start_date)
dyf = read_filtered_files_to_dynamicframe(bucket_name, filtered_files)

if dyf:
    # Print the schema of the DynamicFrame
    dyf.printSchema()

    # Convert to DataFrame for further processing if needed
    df = dyf.toDF()
    # Perform your manipulations here

    # Write the result back to S3 if needed
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": "s3://output-bucket/output-path/"},
        format="parquet"
    )
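
Since boto3 returns LastModified as a timezone-aware UTC datetime, an alternative to stripping tzinfo is to make the cutoff timezone-aware as well. A minimal sketch of that variation; the is_modified_after helper is illustrative, not part of the snippet above.

from datetime import datetime, timezone

# Use an aware UTC datetime so it compares cleanly with obj['LastModified']
start_date = datetime(2024, 1, 1, tzinfo=timezone.utc)


def is_modified_after(obj, start_date):
    # obj is one entry from the 'Contents' list returned by list_objects_v2
    return start_date is None or obj['LastModified'] > start_date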

September 30, 2024 · 2 min

Keras Transfer Learning

Transfer learning with VGG16 and Keras for an image classification task.

# Credits:
# Author: Gabriel Cassimiro
# Blog post: https://towardsdatascience.com/transfer-learning-with-vgg16-and-keras-50ea161580b4
# GitHub Repo: https://github.com/gabrielcassimiro17/object-detection

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

## Loading images and labels
(train_ds, train_labels), (test_ds, test_labels) = tfds.load(
    "tf_flowers",
    split=["train[:70%]", "train[70%:]"],  ## Train/test split
    batch_size=-1,
    as_supervised=True,  # Include labels
)

## Resizing images
train_ds = tf.image.resize(train_ds, (150, 150))
test_ds = tf.image.resize(test_ds, (150, 150))

## Transforming labels to correct format
train_labels = to_categorical(train_labels, num_classes=5)
test_labels = to_categorical(test_labels, num_classes=5)

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

## Loading VGG16 model
base_model = VGG16(weights="imagenet", include_top=False, input_shape=train_ds[0].shape)
base_model.trainable = False  ## Not trainable weights

## Preprocessing input
train_ds = preprocess_input(train_ds)
test_ds = preprocess_input(test_ds)

flatten_layer = layers.Flatten()
dense_layer_1 = layers.Dense(50, activation='relu')
dense_layer_2 = layers.Dense(20, activation='relu')
prediction_layer = layers.Dense(5, activation='softmax')

model = models.Sequential([
    base_model,
    flatten_layer,
    dense_layer_1,
    dense_layer_2,
    prediction_layer
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

es = EarlyStopping(monitor='val_accuracy', mode='max', patience=5, restore_best_weights=True)

model.fit(train_ds, train_labels, epochs=50, validation_split=0.2, batch_size=32, callbacks=[es])

model.evaluate(test_ds, test_labels)
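
A common follow-up step, not shown in the snippet above, is to fine-tune the top of the frozen VGG16 base once the new classification head has converged. A minimal sketch, assuming the model, data, and early-stopping callback from above; the choice of 'block5_conv1' as the unfreezing point and the learning rate are assumptions, and a low rate is used to avoid destroying the pretrained weights.

from tensorflow.keras.optimizers import Adam

# Unfreeze only the last convolutional block of VGG16 (layers from 'block5_conv1' onward)
base_model.trainable = True
set_trainable = False
for layer in base_model.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True
    layer.trainable = set_trainable

# Re-compile with a much lower learning rate before continuing training
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(train_ds, train_labels, epochs=10, validation_split=0.2, batch_size=32, callbacks=[es])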

April 9, 2023 · 1 min