from pyspark.sql import SparkSession
import sys

# Estimate the in-memory size of a small PySpark DataFrame by summing the
# shallow Python object size of every cell on the executors.

# Initialize a Spark session.
spark = SparkSession.builder.appName("DataFrameSize").getOrCreate()

try:
    # Build a tiny example DataFrame of (id, name) rows.
    data = [(1, "John"), (2, "Alice"), (3, "Bob")]
    columns = ["id", "name"]
    df = spark.createDataFrame(data, columns)

    # Approximate the DataFrame's size in bytes.
    # NOTE: sys.getsizeof is shallow — it reports each Python object's own
    # footprint (including interpreter overhead) and does not follow
    # references, nor does it reflect Spark's internal Tungsten storage.
    # Treat the result as a rough indicator, not the true memory footprint.
    size_in_bytes = (
        df.rdd.flatMap(lambda row: row)
        .map(lambda cell: sys.getsizeof(cell) if cell is not None else 0)
        .sum()
    )
    print(f"Size of the DataFrame: {size_in_bytes} bytes")
finally:
    # Always release the Spark session (JVM gateway, executors), even if the
    # job above raises — the original script leaked it on any failure.
    spark.stop()