Created
April 18, 2025 01:44
-
-
Save larkintuckerllc/204ee42ce96257178b33a5183b8bbe67 to your computer and use it in GitHub Desktop.
fare-prediction.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"kernelspec": { | |
"name": "glue_pyspark", | |
"display_name": "Glue PySpark", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "Python_Glue_Session", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "python", | |
"version": 3 | |
}, | |
"pygments_lexer": "python3", | |
"file_extension": ".py" | |
} | |
}, | |
"nbformat_minor": 4, | |
"nbformat": 4, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": "# magic", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "%idle_timeout 2880\n%glue_version 5.0\n%worker_type G.1X\n%number_of_workers 5", | |
"metadata": { | |
"trusted": true, | |
"tags": [] | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "Welcome to the Glue Interactive Sessions Kernel\nFor more information on available magic commands, please type %help in any new cell.\n\nPlease view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html\nInstalled kernel version: 1.0.8 \nCurrent idle_timeout is None minutes.\nidle_timeout has been set to 2880 minutes.\nSetting Glue version to: 5.0\nPrevious worker type: None\nSetting new worker type to: G.1X\nPrevious number of workers: None\nSetting new number of workers to: 5\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# imports", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "from pyspark.context import SparkContext\nfrom awsglue.context import GlueContext\nfrom awsglue.dynamicframe import DynamicFrame\nfrom pyspark.sql.functions import rand,when,lit", | |
"metadata": { | |
"trusted": true, | |
"tags": [] | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "Trying to create a Glue session for the kernel.\nSession Type: glueetl\nWorker Type: G.1X\nNumber of Workers: 5\nIdle Timeout: 2880\nSession ID: 26926b67-0a5a-45e8-b6cc-bc372ca40a18\nApplying the following default arguments:\n--glue_kernel_version 1.0.8\n--enable-glue-datacatalog true\nWaiting for session 26926b67-0a5a-45e8-b6cc-bc372ca40a18 to get into ready status...\nSession 26926b67-0a5a-45e8-b6cc-bc372ca40a18 has been created.\n\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# initialize", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "sc = SparkContext.getOrCreate()\nglueContext = GlueContext(sc)", | |
"metadata": { | |
"trusted": true, | |
"editable": true | |
}, | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# fetch", | |
"metadata": { | |
"editable": true, | |
"trusted": true | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "dyf = glueContext.create_dynamic_frame.from_catalog(database='fare-prediction', table_name='raw')\ndyf.printSchema()", | |
"metadata": { | |
"trusted": true, | |
"editable": true | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "root\n|-- trip_start_timestamp: string\n|-- trip_end_timestamp: string\n|-- trip_start_hour: double\n|-- trip_seconds: long\n|-- trip_miles: double\n|-- trip_speed: double\n|-- dropoff_community_area: long\n|-- fare: double\n|-- tips: double\n|-- tip_rate: double\n|-- tolls: choice\n| |-- long\n| |-- string\n|-- extras: double\n|-- trip_total: double\n|-- payment_type: string\n|-- company: string\n|-- dropoff_census_tract: long\n|-- pickup_census_tract: long\n|-- pickup_community_area: long\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": "df = dyf.toDF()\ndf.show()", | |
"metadata": { | |
"trusted": true, | |
"editable": true | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "+--------------------+--------------------+---------------+------------+----------+----------+----------------------+-----+----+--------+---------+------+----------+------------+--------------------+--------------------+-------------------+---------------------+\n|trip_start_timestamp| trip_end_timestamp|trip_start_hour|trip_seconds|trip_miles|trip_speed|dropoff_community_area| fare|tips|tip_rate| tolls|extras|trip_total|payment_type| company|dropoff_census_tract|pickup_census_tract|pickup_community_area|\n+--------------------+--------------------+---------------+------------+----------+----------+----------------------+-----+----+--------+---------+------+----------+------------+--------------------+--------------------+-------------------+---------------------+\n|05/17/2022 7:15:0...|05/17/2022 7:45:0...| 7.25| 2341| 2.57| 4.0| 17|31.99| 2.0| 6.3|{0, NULL}| 0.0| 33.99| Mobile| Flash Cab| NULL| NULL| NULL|\n|05/17/2022 5:15:0...|05/17/2022 5:30:0...| 17.25| 1074| 1.18| 4.0| 8| 9.75| 3.0| 27.9|{0, NULL}| 1.0| 14.25| Credit Card| Flash Cab| 17031081202| NULL| NULL|\n|05/17/2022 5:15:0...|05/17/2022 5:30:0...| 17.25| 1173| 1.29| 4.0| 8|10.25| 0.0| 0.0|{0, NULL}| 0.0| 10.25| Cash| Sun Taxi| 17031081201| 17031320100| 32|\n|05/17/2022 6:00:0...|05/17/2022 7:00:0...| 18.0| 3360| 3.7| 4.0| 24|23.75| 0.0| 0.0|{0, NULL}| 1.0| 24.75| Cash|Choice Taxi Assoc...| 17031241400| 17031320100| 32|\n|05/17/2022 5:00:0...|05/17/2022 5:30:0...| 17.0| 1044| 1.15| 4.0| 8| 10.0| 0.0| 0.0|{0, NULL}| 0.0| 10.0| Cash| Flash Cab| 17031081800| 17031320100| 32|\n|05/17/2022 5:30:0...|05/17/2022 6:00:0...| 17.5| 1251| 1.38| 4.0| 28| 11.0| 3.0| 27.3|{0, NULL}| 0.0| 14.5| Credit Card| Flash Cab| 17031280100| 17031081500| 8|\n|05/17/2022 5:00:0...|05/17/2022 5:30:0...| 17.0| 1813| 2.0| 4.0| 28| 14.5| 0.0| 0.0|{0, NULL}| 0.0| 14.5| Cash| Sun Taxi| 17031281900| 17031081201| 8|\n|05/17/2022 5:30:0...|05/17/2022 6:00:0...| 17.5| 1857| 2.05| 4.0| 28|14.75|4.19| 25.8|{0, NULL}| 1.5| 20.94| 
Credit Card| Sun Taxi| 17031280100| 17031081300| 8|\n|05/17/2022 7:00:0...|05/17/2022 7:00:0...| 19.0| 787| 0.87| 4.0| 8| 8.0| 0.0| 0.0|{0, NULL}| 1.5| 9.5| Cash| City Service| 17031081201| 17031081201| 8|\n|05/16/2022 6:30:0...|05/16/2022 6:45:0...| 18.5| 1399| 1.55| 4.0| 6|17.88|4.11| 23.0|{0, NULL}| 0.0| 21.99| Mobile| Flash Cab| NULL| NULL| 32|\n|05/16/2022 10:30:...|05/16/2022 11:45:...| 10.5| 3934| 4.37| 4.0| 13|30.25| 0.0| 0.0|{0, NULL}| 0.0| 30.25| Prcard| Flash Cab| NULL| NULL| 5|\n|05/17/2022 6:45:0...|05/17/2022 6:45:0...| 18.75| 540| 0.6| 4.0| 28| 6.5| 0.0| 0.0|{0, NULL}| 1.0| 7.5| Cash|Taxi Affiliation ...| 17031281900| 17031839100| 32|\n|05/17/2022 5:45:0...|05/17/2022 6:00:0...| 17.75| 720| 0.8| 4.0| 8| 7.5| 2.0| 26.7|{0, NULL}| 0.0| 9.5| Credit Card| Top Cab Affiliation| 17031081800| 17031081201| 8|\n|05/17/2022 4:45:0...|05/17/2022 5:00:0...| 16.75| 720| 0.8| 4.0| 8| 7.5| 0.0| 0.0|{0, NULL}| 1.0| 8.5| Cash| Globe Taxi| 17031081500| 17031839100| 32|\n|05/16/2022 2:15:0...|05/16/2022 2:30:0...| 14.25| 540| 0.6| 4.0| 7| 6.75| 0.0| 0.0|{0, NULL}| 0.0| 6.75| Unknown|Taxi Affiliation ...| NULL| NULL| 7|\n|05/17/2022 5:00:0...|05/17/2022 5:30:0...| 17.0| 1207| 1.35| 4.0| 8|10.75| 3.0| 27.9|{0, NULL}| 0.0| 14.25| Credit Card| City Service| 17031081500| 17031839100| 32|\n|05/16/2022 11:45:...|05/16/2022 12:00:...| 11.75| 1295| 1.45| 4.0| 8| 8.25| 0.0| 0.0|{0, NULL}| 0.0| 8.25| Cash|Blue Ribbon Taxi ...| 17031081403| 17031081800| 8|\n|05/17/2022 9:45:0...|05/17/2022 10:30:...| 9.75| 2646| 2.97| 4.0| 8| 20.5| 0.0| 0.0|{0, NULL}| 0.0| 20.5| Cash| Flash Cab| 17031081000| 17031081600| 8|\n|05/17/2022 11:00:...|05/17/2022 11:15:...| 11.0| 1593| 1.79| 4.0| 32| 9.0| 0.0| 0.0|{0, NULL}| 0.0| 9.0| Cash| City Service| 17031839100| 17031081401| 8|\n|05/17/2022 5:00:0...|05/17/2022 5:15:0...| 17.0| 702| 0.79| 4.1| 28| 7.5| 3.0| 40.0|{0, NULL}| 0.0| 11.0| Credit Card| City Service| 17031280100| 17031839100| 
32|\n+--------------------+--------------------+---------------+------------+----------+----------+----------------------+-----+----+--------+---------+------+----------+------------+--------------------+--------------------+-------------------+---------------------+\nonly showing top 20 rows\n\n/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py:147: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# trip_minutes", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "df = df.withColumn(\"trip_minutes\", df[\"trip_seconds\"] / 60)", | |
"metadata": { | |
"trusted": true, | |
"tags": [] | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
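"source": "# partition\n\nRandomly assign each row to `train` (~60%), `validation` (~20%) or `test` (~20%): rows that miss the first 0.6 draw are split evenly by a second, independent draw.", | |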
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "df = df.withColumn(\"partition\", when(rand() < 0.6, lit(\"train\")).otherwise(when(rand() < 0.5, lit(\"validation\")).otherwise(lit(\"test\")))) ", | |
"metadata": { | |
"trusted": true, | |
"tags": [] | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# select columns", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "df = df.select(\"fare\", \"trip_miles\", \"trip_minutes\", \"partition\")\ndf.show()", | |
"metadata": { | |
"trusted": true, | |
"tags": [] | |
}, | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "+-----+----------+------------------+----------+\n| fare|trip_miles| trip_minutes| partition|\n+-----+----------+------------------+----------+\n|31.99| 2.57|39.016666666666666| train|\n| 9.75| 1.18| 17.9| test|\n|10.25| 1.29| 19.55| train|\n|23.75| 3.7| 56.0| train|\n| 10.0| 1.15| 17.4|validation|\n| 11.0| 1.38| 20.85|validation|\n| 14.5| 2.0|30.216666666666665| train|\n|14.75| 2.05| 30.95| train|\n| 8.0| 0.87|13.116666666666667|validation|\n|17.88| 1.55|23.316666666666666|validation|\n|30.25| 4.37| 65.56666666666666| train|\n| 6.5| 0.6| 9.0| train|\n| 7.5| 0.8| 12.0| train|\n| 7.5| 0.8| 12.0| train|\n| 6.75| 0.6| 9.0| train|\n|10.75| 1.35|20.116666666666667| train|\n| 8.25| 1.45|21.583333333333332| test|\n| 20.5| 2.97| 44.1| train|\n| 9.0| 1.79| 26.55| train|\n| 7.5| 0.79| 11.7|validation|\n+-----+----------+------------------+----------+\nonly showing top 20 rows\n", | |
"output_type": "stream" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": "# write", | |
"metadata": { | |
"editable": true, | |
"trusted": true | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": "dyf = DynamicFrame.fromDF(df, glueContext, \"convert\")\nglueContext.write_dynamic_frame_from_options(\n dyf,\n connection_type=\"s3\",\n connection_options = {\"path\": \"s3://fare-prediction/model\", \"partitionKeys\": [\"partition\"]},\n format=\"csv\",\n format_options={\"writeHeader\": False}\n)", | |
"metadata": { | |
"trusted": true, | |
"editable": true | |
}, | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"text": "<awsglue.dynamicframe.DynamicFrame object at 0x7faa25673090>\n", | |
"output_type": "stream" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment