Last active
May 3, 2025 01:06
-
-
Save gnilrets/48886b4c8945dde1da13547c2373df73 to your computer and use it in GitHub Desktop.
dbt Snapshot Join
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{% macro trange_join(left_model, row_key_left, right_model, row_key_right, merge_key, series_key, from_at, to_at) %} | |
-- Assumptions: | |
-- Within a series key, there can be no overlapping ranges. | |
-- Overlaping ranges must be resolved prior to using this macro. | |
-- The row key represents a set of attributes from a model. It is typically a hash of those | |
-- attributes. The user is expected to join the results of this macro back to the | |
-- original data to get the full attribute set. | |
-- Canonical Example: | |
-- | merge_key | series_key | row_key | from_at | to_at | | |
-- | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | 2020-01-01 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | 2020-02-05 | 2999-12-31 | | |
__trange__left as ( | |
select | |
{{ merge_key }} as merge_key, | |
{{ series_key }} as series_key, | |
{{ row_key_left }} as row_key_left, | |
null::varchar as row_key_right, | |
{{ from_at }} as from_at, | |
{{ to_at }} as to_at | |
from | |
{{ left_model }} | |
), | |
-- Canonical Example: | |
-- | merge_key | series_key | row_key | from_at | to_at | | |
-- | - | - | - | - | - | | |
-- | 1 | S1 | S2-1 | 2020-02-03 | 2020-02-07 | | |
-- | 1 | S1 | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__right as ( | |
select | |
{{ merge_key }} as merge_key, | |
{{ series_key }} as series_key, | |
null::varchar as row_key_left, | |
{{ row_key_right }} as row_key_right, | |
{{ from_at }} as from_at, | |
{{ to_at }} as to_at | |
from | |
{{ right_model }} | |
), | |
-- Combine left and right models into a unified version | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-05 | 2999-12-31 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-03 | 2020-02-07 | | |
-- | 1 | S1 | null | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__as_one as ( | |
select merge_key, series_key, row_key_left, row_key_right, from_at, to_at from __trange__left | |
union all | |
select merge_key, series_key, row_key_left, row_key_right, from_at, to_at from __trange__right | |
), | |
-- For each merge key, collect unique time boundary points from and to dates | |
-- | merge_key | ts_at | | |
-- | - | - | | |
-- | 1 | 2020-01-01 | | |
-- | 1 | 2020-02-03 | | |
-- | 1 | 2020-02-05 | | |
-- | 1 | 2020-02-07 | | |
-- | 1 | 2999-12-31 | | |
__trange__boundary_points as ( | |
select distinct | |
merge_key, | |
ts_at | |
from | |
__trange__as_one | |
unpivot (ts_at for boundary in (from_at, to_at)) | |
group by all | |
), | |
-- For each time point within a merge key, construct a range from the current point ot the next. | |
-- It may seem like this just undoes the work from the __trange__boundary_points CTE, but the | |
-- ranges constructed in this step will be the overlapping ranges, rather than the left/right | |
-- independent ones. | |
-- | merge_key | from_at | to_at | | |
-- | - | - | - | | |
-- | 1 | 2020-01-01 | 2020-02-03 | | |
-- | 1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | 2020-02-07 | 2999-12-31 | | |
__trange__boundary_ranges as ( | |
select | |
*, | |
ts_at as from_at, | |
lead(ts_at) over (partition by merge_key order by ts_at) as to_at | |
from | |
__trange__boundary_points | |
qualify | |
to_at is not null | |
), | |
-- Split up the orginal time ranges by boundaries identified in the other data. | |
-- e.g., if the left range includes the start or end of the right range, the left range | |
-- is split into a smaller segment. | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
-- | 1 | S1 | S1-1 | null | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-07 | 2999-12-31 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | null | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__merge as ( | |
select | |
as_one.merge_key, | |
as_one.series_key, | |
as_one.row_key_left, | |
as_one.row_key_right, | |
boundary_ranges.from_at, | |
boundary_ranges.to_at, | |
from | |
__trange__as_one as as_one | |
inner join | |
__trange__boundary_ranges as boundary_ranges | |
on | |
as_one.merge_key = boundary_ranges.merge_key | |
and ( | |
as_one.from_at <= boundary_ranges.from_at | |
and as_one.to_at >= boundary_ranges.to_at | |
) | |
), | |
-- For records with the same series key and date ranges, one record comes from the left | |
-- data (with a null right row key) and the other comes from the right data (with a null left row key). | |
-- Fill in the null values and drop the duplicates. | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
-- | 1 | S1 | S1-1 | S2-1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | S2-1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | S1-2 | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__fill_values as ( | |
select distinct | |
merge_key, | |
series_key, | |
max(row_key_left) over (partition by merge_key, series_key, from_at, to_at) as row_key_left, | |
max(row_key_right) over (partition by merge_key, series_key, from_at, to_at) as row_key_right, | |
from_at, | |
to_at | |
from | |
__trange__merge | |
), | |
-- Rename fields to match user expectations. | |
trange__final as ( | |
select | |
merge_key as {{ merge_key }}, | |
{%- if merge_key != series_key %} | |
series_key as {{ series_key }}, | |
{%- endif %} | |
row_key_left as {% if row_key_left == row_key_right %}{{ row_key_left }}_left{% else %}{{ row_key_left }}{% endif %}, | |
row_key_right as {% if row_key_left == row_key_right %}{{ row_key_right }}_right{% else %}{{ row_key_right }}{% endif %}, | |
from_at as {{ from_at }}, | |
to_at as {{ to_at }} | |
from | |
__trange__fill_values | |
) | |
{% endmacro %} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- Yaml file (sorry, forgot how to create multi-file gists if that's a thing... | |
version: 2 | |
macros: | |
- name: trange_join | |
description: | | |
This macro allows the user to perform a temporal range join between two models. | |
For example, supose we have the two datasets below. The merge key defines how the | |
two models are to be joined together. The series key represents a time series of data | |
represented in ranges of time where the various attributes (represented as a row key) are | |
constant. Time series keys that are shared between the two datasets will be interleaved. | |
Given left_model: | |
``` | |
| merge_key | series_key | row_key | from_at | to_at | | |
| - | - | - | - | - | | |
| 1 | S1 | S1-1 | 2020-01-01 | 2020-02-05 | | |
| 1 | S1 | S1-2 | 2020-02-05 | 2999-12-31 | | |
``` | |
Given right_model: | |
``` | |
| merge_key | series_key | row_key | from_at | to_at | | |
| - | - | - | - | - | | |
| 1 | S1 | S2-1 | 2020-02-03 | 2020-02-07 | | |
| 1 | S1 | S2-2 | 2020-02-07 | 2999-12-31 | | |
``` | |
Resultant temporal range join: | |
``` | |
| merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
| - | - | - | - | - | - | | |
| 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
| 1 | S1 | S1-1 | S2-1 | 2020-02-03 | 2020-02-05 | | |
| 1 | S1 | S1-2 | S2-1 | 2020-02-05 | 2020-02-07 | | |
| 1 | S1 | S1-2 | S2-2 | 2020-02-07 | 2999-12-31 | | |
``` | |
If there are multiple series within a merge key, all of those series will be broken up | |
into the same ranges, even if there is no change in attributes between one range and another | |
within the same series. This is useful when you want to define ranges of times where | |
records within a group (merge key) have constant attributes. | |
Requirements and assumptions: | |
* Models must be CTEs. | |
* The "left" and "right" designations are arbitrary. In fact, they _may_ be the same model. | |
* The `merge_key` and the `series_key` _may_ be the same column. | |
* The `row_key` _must_ uniquely define a record in each input model. | |
* For the left and right input models, within a merge key, each series _must_ only | |
contain non-overlapping ranges. Different series within a merge _may_ have have | |
overlapping ranges. | |
* The final result is a CTE named `trange__final`. Select from this CTE. The user | |
needs to join the `row_key_left` and `row_key_right` back to the original two models | |
in order to get the full attributes they desire. | |
docs: | |
show: true | |
arguments: | |
- name: left_model | |
type: string | |
description: The name of one model participating in the range join. | |
- name: row_key_left | |
type: string | |
description: Name of the column from the left model that uniquely identifies a row. | |
- name: right_model | |
type: string | |
description: The name of another model participating in the range join. | |
- name: row_key_right | |
type: string | |
description: Name of the column from the right model that uniquely identifies a row. | |
- name: merge_key | |
type: string | |
description: | | |
Name of the column containing the merge key. Both left and right models must contain | |
this column. | |
- name: series_key | |
type: string | |
description: | | |
Name of the column containing the series key. Both left and right models must contain | |
this column. | |
- name: from_at | |
type: string | |
description: | | |
Name of the column containing the range start timestamp. | |
Both left and right models must contain this column. | |
- name: to_at | |
type: string | |
description: | | |
Name of the column containing the range end timestamp (exclusive). | |
Both left and right models must contain this column. | |
unit_tests: | |
- name: test_trange_join_canonical | |
description: Simple range join example showing how overlapping ranges each get their own row. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-02-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-02-05', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-02-03', to_at: '2020-02-07'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-02-07', to_at: '2999-12-31'} | |
expect: | |
rows: | |
# First range is where the right model has no data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-02-03'} | |
# Then right model has some data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-02-03', to_at: '2020-02-05'} | |
# Then the left model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-02-05', to_at: '2020-02-07'} | |
# Then the right model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-02-07', to_at: '2999-12-31'} | |
- name: test_trange_join_timestamps | |
description: Simple range join example with full timestamp precision | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31 23:59:59'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01 02:00:00', to_at: '2020-01-05 10:00:00'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-05 10:00:00', to_at: '2999-12-31 23:59:59'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03 06:00:00', to_at: '2020-01-07 14:00:00'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-07 14:00:00', to_at: '2999-12-31 23:59:59'} | |
expect: | |
rows: | |
# First range is where the right model has no data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01 02:00:00', to_at: '2020-01-03 06:00:00'} | |
# Then right model has some data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03 06:00:00', to_at: '2020-01-05 10:00:00'} | |
# Then the left model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-01-05 10:00:00', to_at: '2020-01-07 14:00:00'} | |
# Then the right model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-07 14:00:00', to_at: '2999-12-31 23:59:59'} | |
- name: test_trange_join_series_gap | |
description: Range join example showing how series gaps are retained. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-05'} | |
# 1 day gap between previous record | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-06', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-07'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-07', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-03'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: null , row_key_right: 'S2-1', from_at: '2020-01-05', to_at: '2020-01-06'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-01-06', to_at: '2020-01-07'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-07', to_at: '2999-12-31'} | |
- name: test_trange_join_shared_boundary | |
description: | | |
Simple range join example showing that we get expected result even when boundaries are shared | |
between the two models. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-03'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
- name: test_trange_join_merge_multi | |
description: | | |
Range join example showing how multiple series are merged. That is, there can be multiple | |
distinct timeseries sharing the same merge key. When these datsets are merged, | |
we end up with those series all having the same ranges (even if nothing in the series changed, | |
but something in another series with the same merge key changed). | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'S2', row_key: 'S2-1', from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S2', row_key: 'S2-2', from_at: '2020-01-04', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'SX', row_key: 'SX-1', from_at: '2020-01-01', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: null , from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-2', row_key_right: null , from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-2', row_key_right: null , from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-10', to_at: '2999-12-31'} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks :) could you also expand on what the series key is in this new revision? still a bit confused after reading the yml