Skip to content

Instantly share code, notes, and snippets.

@larkintuckerllc
Last active April 23, 2025 00:32
Show Gist options
  • Save larkintuckerllc/dc9e74215257a65d60897eb71ac28542 to your computer and use it in GitHub Desktop.
Save larkintuckerllc/dc9e74215257a65d60897eb71ac28542 to your computer and use it in GitHub Desktop.
fare-prediction.py
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import rand,when,lit
## @params: [JOB_NAME]
# Glue ETL job: read the raw fare table from the Glue Data Catalog, derive the
# trip duration in minutes, randomly tag each row as train/validation/test,
# and write the selected columns to S3 as header-less CSV, partitioned by tag.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)

# Initialise the job BEFORE doing any work and commit it at the end; calling
# init only after the write (and never committing) leaves the Glue job
# lifecycle incomplete, so bookmark state is never tracked or persisted.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

dyf = glueContext.create_dynamic_frame.from_catalog(
    database='fare-prediction',
    table_name='raw',
)
df = dyf.toDF()

# Convert the trip duration from seconds to minutes.
df = df.withColumn("trip_minutes", df["trip_seconds"] / 60)

# Random split: the first draw sends ~60% of rows to "train"; an independent
# second draw splits the remaining rows evenly, giving roughly 60/20/20.
# The draws are unseeded, so the assignment differs from run to run.
df = df.withColumn(
    "partition",
    when(rand() < 0.6, lit("train"))
    .when(rand() < 0.5, lit("validation"))
    .otherwise(lit("test")),
)

# Keep only the columns that are written out, plus the partition key.
df = df.select("fare", "trip_miles", "trip_minutes", "partition")

dyf = DynamicFrame.fromDF(df, glueContext, "convert")
glueContext.write_dynamic_frame_from_options(
    dyf,
    connection_type="s3",
    connection_options={
        "path": "s3://fare-prediction/model",
        "partitionKeys": ["partition"],
    },
    format="csv",
    # The CSV output is written without a header row.
    format_options={"writeHeader": False},
)

# Signal successful completion to Glue.
job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment