@mmafrar
Created April 30, 2025 10:10
Big Data Processing - Case Study 4 (Spark)
from pyspark.sql import SparkSession
# Initialize a Spark session
spark = SparkSession.builder.appName("sql_dataframe").getOrCreate()
# Read the CSV file into a DataFrame, with header and schema inference options enabled
productSales = spark.read.option("header", "true")\
    .option("inferSchema", "true").csv("file:///Users/mmafrar/DataRetrieval/Salesstore.csv")
# Print the inferred schema of the DataFrame
print("Here is our inferred schema:")
productSales.printSchema()
# Display the 'Customer_Name' column
print("Let's display the name column:")
productSales.select("Customer_Name").show()
# Keep only the rows whose Product_Category is 'Office Supplies'
# Uncomment the line below to use DataFrame API for filtering
# productSales.filter(productSales.Product_Category == 'Office Supplies').show()
# Create a temporary view for SQL queries
productSales.createOrReplaceTempView("salesStore")
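# The view is session-scoped: it is visible only to this SparkSession and is
# dropped when the session stops.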
# Use Spark SQL to filter 'Office Supplies' category
officeSupplies = spark.sql("SELECT * FROM salesStore WHERE Product_Category = 'Office Supplies'")
print(type(officeSupplies))
officeSupplies.show()
# Group by 'Region' and sum the 'Sales' column
print("Group by Region")
productSales.groupBy("Region").sum("Sales").show()
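# A sketch of the same aggregation using agg() with an explicit alias and a
# descending sort; it relies only on pyspark.sql.functions and the columns
# already referenced above.
# from pyspark.sql import functions as F
# productSales.groupBy("Region")\
#     .agg(F.sum("Sales").alias("Total_Sales"))\
#     .orderBy(F.desc("Total_Sales"))\
#     .show()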
# Stop the Spark session
spark.stop()