Welcome to my Notepad

This is my personal Notepad. Here, I document the solutions and tools I find useful, and the lessons I learn along the way. The Notepad covers a variety of topics, including Data Engineering, Machine Learning, Linux tools, Time Series Analysis, and Computer Vision tasks.

Install draw.io Desktop AppImage on Linux Mint

Install draw.io Desktop on Linux Mint (should work on Debian as well):

1. Download the AppImage file from the GitHub releases page, e.g. Release 27.0.9 · jgraph/drawio-desktop
2. Make the AppImage file executable:

```bash
chmod u+x drawio.AppImage
```

3. Move the AppImage file to `~/.local/bin/`:

```bash
mv drawio.AppImage ~/.local/bin/drawio.AppImage
```

4. Create and edit the desktop entry file:

```bash
nano ~/.local/share/applications/appimagekit-drawio.desktop
```

5. Add the following content to the desktop entry file and save the changes. Note that `Exec` must point to the file moved in step 3:

```ini
[Desktop Entry]
Name=draw.io
Comment=Diagramming Application
Exec=/home/YOUR_USERNAME/.local/bin/drawio.AppImage
Icon=com.jgraph.drawio.desktop
Terminal=false
Type=Application
Categories=Graphics;
```

6. Update the desktop database:

```bash
xdg-desktop-menu install ~/.local/share/applications/appimagekit-drawio.desktop
```

draw.io should now appear in the Linux Mint start menu under "Graphics".
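If the entry does not show up, a quick sanity check (assuming the desktop-file-utils package is installed) is to validate the desktop entry file for syntax errors:

```bash
desktop-file-validate ~/.local/share/applications/appimagekit-drawio.desktop
```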

June 6, 2025 · 1 min

Copy-Paste Augmentation for YOLO

Simple copy-paste augmentation for a YOLO object detection task:

```python
from PIL import Image
import random


def paste_image(source_path, target_path, position=None):
    """
    Pastes a small source image onto a larger target image.

    Parameters:
    - source_path: Path to the small source image.
    - target_path: Path to the larger target image.
    - position: Optional; A tuple (x, y) specifying the top-left corner where
      the source image will be pasted. If not provided, a random position
      will be chosen.

    Returns:
    - new_image: The newly created image with the source image pasted onto it.
    - bbox: A tuple (left, upper, right, lower) in pixel coordinates.
    - bbox_yolo: A tuple (x_center, y_center, width, height) representing the
      bounding box of the pasted image in YOLO format.
    """
    # Open the source and target images
    source_image = Image.open(source_path)
    target_image = Image.open(target_path)

    # Get the dimensions of the source and target images
    source_width, source_height = source_image.size
    target_width, target_height = target_image.size

    # Choose a random position if not provided
    if position is None:
        max_x = target_width - source_width
        max_y = target_height - source_height
        if max_x < 0 or max_y < 0:
            raise ValueError("The source image is larger than the target image.")
        position = (random.randint(0, max_x), random.randint(0, max_y))
    else:
        # Ensure the specified position is within bounds
        if (position[0] < 0 or position[1] < 0
                or position[0] + source_width > target_width
                or position[1] + source_height > target_height):
            raise ValueError("The specified position is out of the target image bounds.")

    # Paste the source image onto the target image
    target_image.paste(source_image, position)

    # Calculate the bounding box of the pasted image
    left = position[0]
    upper = position[1]
    right = position[0] + source_width
    lower = position[1] + source_height
    bbox = (left, upper, right, lower)

    # Convert the bounding box to YOLO format (normalized center x/y, width, height)
    x_center = (left + right) / 2 / target_width
    y_center = (upper + lower) / 2 / target_height
    width = source_width / target_width
    height = source_height / target_height
    bbox_yolo = (x_center, y_center, width, height)

    return target_image, bbox, bbox_yolo


# Example usage
if __name__ == "__main__":
    source_path = "path/to/small_image.png"  # Replace with the path to your small source image
    target_path = "path/to/large_image.png"  # Replace with the path to your large target image

    # Example with a specified position
    new_image, bbox, bbox_yolo = paste_image(source_path, target_path, position=(50, 50))
    new_image.save("path/to/new_image_specified_position.png")  # Replace with the desired save path
    print("Bounding box of the pasted image (specified position) in YOLO format:", bbox_yolo)

    # Example with a random position
    new_image, bbox, bbox_yolo = paste_image(source_path, target_path)
    new_image.save("path/to/new_image_random_position.png")  # Replace with the desired save path
    print("Bounding box of the pasted image (random position) in YOLO format:", bbox_yolo)
```
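To turn the result into a training sample, the returned `bbox_yolo` tuple still needs to be written to a YOLO label file. A minimal sketch, assuming a single class with index 0 (both the helper name and the class index are illustrative, not part of the snippet above):

```python
def write_yolo_label(label_path, bbox_yolo, class_id=0):
    """Append one YOLO-format annotation line: class x_center y_center width height."""
    x_center, y_center, width, height = bbox_yolo
    with open(label_path, "a") as f:
        f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")
```

If the source image has a transparent background (RGBA), passing it as the third argument to `paste()`, i.e. `target_image.paste(source_image, position, source_image)`, uses its alpha channel as a mask so only the object itself is pasted.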

April 17, 2025 · 2 min

Add leading zeros to PySpark DataFrame

PySpark DataFrame - add a leading zero if the value is a single digit between 0 and 9:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_extract, length, lpad

# Create a SparkSession
spark = SparkSession.builder.appName("TupleToDataFrame").getOrCreate()

# List of tuples
arr = [("a", "0a1x"), ("x", "3"), ("cc", "11"), ("h", "5")]

# Create a DataFrame
df = spark.createDataFrame(arr, ["column1", "column2"])

# Add a leading zero if the value is a single digit between 0 and 9,
# replace non-digit values with "0000", and leave other values as is
df = df.withColumn(
    "column2",
    when(regexp_extract(col("column2"), "^\\d+$", 0) == "", "0000")
    .when(
        (regexp_extract(col("column2"), "^\\d+$", 0) != "") & (length(col("column2")) == 1),
        lpad(col("column2"), 2, "0"),
    )
    .otherwise(col("column2")),
)

# Show the updated DataFrame
df.show()
```
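For the sample data above, non-digit values become "0000", single digits get zero-padded, and everything else is left untouched, so `df.show()` prints:

```
+-------+-------+
|column1|column2|
+-------+-------+
|      a|   0000|
|      x|     03|
|     cc|     11|
|      h|     05|
+-------+-------+
```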

December 2, 2024 · 1 min

Read deleted S3 objects in AWS Glue

Read files marked as deleted from an AWS S3 bucket into a DynamicFrame using boto3 and AWS Glue (this requires versioning to be enabled on the bucket, since delete markers only exist on versioned buckets):

```python
import boto3
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.dynamicframe import DynamicFrame

# Initialize the Glue context once, so it is also available for the write step below
sc = SparkContext()
glueContext = GlueContext(sc)


def read_deleted_files_to_dynamicframe(bucket_name, prefix=''):
    # Initialize S3 client
    s3 = boto3.client('s3')

    # List object versions including delete markers
    paginator = s3.get_paginator('list_object_versions')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    # Collect keys of deleted files (objects whose latest version is a delete marker)
    deleted_files = []
    for page in page_iterator:
        if 'DeleteMarkers' in page:
            for delete_marker in page['DeleteMarkers']:
                if delete_marker['IsLatest']:
                    deleted_files.append(delete_marker['Key'])

    # Read deleted files into a DynamicFrame
    if deleted_files:
        dyf = glueContext.create_dynamic_frame.from_options(
            connection_type="s3",
            connection_options={
                "paths": [f"s3://{bucket_name}/{key}" for key in deleted_files],
                "recurse": True,
                "useS3ListImplementation": True,
                "readVersion": "LATEST_PREVIOUS"  # Read the version before the delete marker
            },
            format="csv",  # Adjust this based on your file format
            format_options={
                "withHeader": True,
                "separator": ","
            }
        )
        return dyf
    else:
        print("No deleted files found.")
        return None


# Usage
bucket_name = 'your-bucket-name'
prefix = 'your-folder-prefix/'  # Optional
dyf = read_deleted_files_to_dynamicframe(bucket_name, prefix)

if dyf:
    # Print the schema of the DynamicFrame
    dyf.printSchema()

    # Convert to DataFrame for further processing if needed
    df = dyf.toDF()
    # Perform your manipulations here

    # Convert back to DynamicFrame if necessary
    result_dyf = DynamicFrame.fromDF(df, glueContext, "result_dyf")

    # Write the result back to S3
    glueContext.write_dynamic_frame.from_options(
        frame=result_dyf,
        connection_type="s3",
        connection_options={"path": "s3://output-bucket/output-path/"},
        format="parquet"
    )
```
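A related trick, sketched here as an illustration (the helper name is mine, not part of the snippet above): on a versioned bucket, deleting the delete marker itself restores the object, which can be simpler than reading past versions. The marker's `VersionId` is available on the `delete_marker` dict returned by `list_object_versions`:

```python
import boto3


def undelete_object(bucket_name, key, marker_version_id):
    """Restore a soft-deleted S3 object by removing its delete marker."""
    s3 = boto3.client('s3')
    s3.delete_object(Bucket=bucket_name, Key=key, VersionId=marker_version_id)
```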

September 30, 2024 · 2 min

Read S3 objects based on date in AWS Glue

Read files from an AWS S3 bucket based on their creation date using AWS Glue:

```python
import boto3
from datetime import datetime
from awsglue.context import GlueContext
from pyspark.context import SparkContext

# Initialize the Glue context once, so it is also available for the write step below
sc = SparkContext()
glueContext = GlueContext(sc)


# First, use boto3 to list and filter objects
def list_s3_files_after_date(bucket_name, prefix='', start_date=None):
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    filtered_files = []
    for page in page_iterator:
        if 'Contents' in page:
            for obj in page['Contents']:
                # S3 doesn't have a separate creation date, so we use LastModified
                if start_date is None or obj['LastModified'].replace(tzinfo=None) > start_date:
                    filtered_files.append(obj['Key'])

    return filtered_files


# Then, use AWS Glue to create a DynamicFrame from the filtered files
def read_filtered_files_to_dynamicframe(bucket_name, filtered_files):
    if filtered_files:
        dyf = glueContext.create_dynamic_frame.from_options(
            connection_type="s3",
            connection_options={
                "paths": [f"s3://{bucket_name}/{key}" for key in filtered_files],
                "recurse": True,
                "useS3ListImplementation": True
            },
            format="csv",  # Adjust this based on your file format
            format_options={
                "withHeader": True,
                "separator": ","
            }
        )
        return dyf
    else:
        print("No files found matching the criteria.")
        return None


# Usage
bucket_name = 'your-bucket-name'
start_date = datetime(2024, 1, 1)  # Files created after January 1, 2024
filtered_files = list_s3_files_after_date(bucket_name, start_date=start_date)
dyf = read_filtered_files_to_dynamicframe(bucket_name, filtered_files)

if dyf:
    # Print the schema of the DynamicFrame
    dyf.printSchema()

    # Convert to DataFrame for further processing if needed
    df = dyf.toDF()
    # Perform your manipulations here

    # Write the result back to S3 if needed
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": "s3://output-bucket/output-path/"},
        format="parquet"
    )
```
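The `.replace(tzinfo=None)` works because `list_objects_v2` returns `LastModified` as a timezone-aware UTC datetime while the cutoff above is naive. An alternative sketch that keeps the comparison timezone-aware, assuming the cutoff is meant in UTC:

```python
from datetime import datetime, timezone

# Timezone-aware cutoff; obj['LastModified'] is already tz-aware (UTC),
# so the comparison works without stripping tzinfo
start_date = datetime(2024, 1, 1, tzinfo=timezone.utc)
```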

September 30, 2024 · 2 min