Created
January 24, 2017 16:18
-
-
Save thedeetch/5622dc5d783f93a3bd2db62b11f5f591 to your computer and use it in GitHub Desktop.
Hive Sessionization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- find session boundaries | |
-- look at previous row timestamp, if more than 20 minutes ago, give that row a value of 1, else 0 | |
with session_start as ( | |
SELECT | |
CASE | |
WHEN timestamp - LAG(timestamp, 1, 0) OVER (PARTITION BY user_id, ORDER BY timestamp) > 1200000 -- 20 minutes | |
THEN 1 | |
ELSE 0 | |
END as session_boundary, | |
* | |
FROM web_logs | |
) | |
-- now forward fill session number | |
-- do a running total of the values calculated above, each session will have an incrementing session_number | |
with session_number as ( | |
SELECT | |
SUM(session_boundary) OVER (PARTITION BY user_id, ORDER BY timestamp) as session_number, | |
* | |
FROM session_start | |
) | |
-- to take it a step further (optional) | |
-- assign a unique session id computed as a hash of first timestamp + user id | |
with session_id as ( | |
SELECT | |
SHA1( | |
CONCAT( | |
FIRST_VALUE(timestamp) | |
OVER (PARTITION BY user_id, session_number ORDER BY timestamp), | |
affiliate_id)) | |
as session_id, | |
* | |
FROM session_number | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment