Wednesday, October 18, 2023

PySpark code to estimate the size of a DataFrame in bytes

import sys

from pyspark.sql import SparkSession
# Initialize a Spark session
spark = SparkSession.builder.appName("DataFrameSize").getOrCreate()

try:
    # Create a small sample PySpark DataFrame
    data = [(1, "John"), (2, "Alice"), (3, "Bob")]
    columns = ["id", "name"]
    df = spark.createDataFrame(data, columns)

    # Estimate the size of the DataFrame in bytes.
    #
    # Each row is flattened into its individual field values, every non-null
    # value is measured with sys.getsizeof, and the per-value sizes are summed.
    #
    # NOTE: this is only a rough estimate. sys.getsizeof is shallow (it does
    # not follow references into contained objects) and it measures the
    # Python objects produced by df.rdd, not Spark's own storage of the data.
    # Null fields are counted as 0 bytes.
    size_in_bytes = (
        df.rdd
        .flatMap(lambda row: row)
        .map(lambda value: sys.getsizeof(value) if value is not None else 0)
        .sum()
    )
    print(f"Size of the DataFrame: {size_in_bytes} bytes")
finally:
    # Stop the Spark session even if the job above raised, so the session
    # is not left running after a failure.
    spark.stop()