Big Data Processing - Case Study 4 (Spark)
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("sql_dataframe").getOrCreate()

# Read the CSV file into a DataFrame, with header and schema inference enabled
productSales = spark.read.option("header", "true") \
    .option("inferSchema", "true").csv("file:///Users/mmafrar/DataRetrieval/Salesstore.csv")

# Print the inferred schema of the DataFrame
print("Here is our inferred schema:")
productSales.printSchema()

# Display the 'Customer_Name' column
print("Let's display the name column:")
productSales.select("Customer_Name").show()

# Keep only rows in the 'Office Supplies' product category
# Uncomment the line below to filter with the DataFrame API instead of SQL
# productSales.filter(productSales.Product_Category == 'Office Supplies').show()

# Register a temporary view so the DataFrame can be queried with SQL
productSales.createOrReplaceTempView("salesStore")

# Use Spark SQL to select the 'Office Supplies' category
officeSupplies = spark.sql("SELECT * FROM salesStore WHERE Product_Category = 'Office Supplies'")
print(type(officeSupplies))
officeSupplies.show()

# Group by 'Region' and sum the 'Sales' column
print("Group by Region")
productSales.groupBy("Region").sum("Sales").show()

# Stop the Spark session
spark.stop()
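
As the commented-out filter line hints, the SQL query and the aggregation can also be written purely with the DataFrame API. A minimal sketch, assuming the same productSales DataFrame as above (the Total_Sales alias is illustrative, not part of the original script):

from pyspark.sql import functions as F

# DataFrame API equivalent of the SQL filter above
productSales.filter(productSales.Product_Category == 'Office Supplies').show()

# DataFrame API aggregation with an explicit alias for the summed column
productSales.groupBy("Region").agg(F.sum("Sales").alias("Total_Sales")).show()

Using agg with an alias avoids the auto-generated "sum(Sales)" column name that groupBy(...).sum(...) produces, which is handy if the result feeds into further queries.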