-
-
Save gnilrets/48886b4c8945dde1da13547c2373df73 to your computer and use it in GitHub Desktop.
{% macro trange_join(left_model, row_key_left, right_model, row_key_right, merge_key, series_key, from_at, to_at) %} | |
-- Assumptions: | |
-- Within a series key, there can be no overlapping ranges. | |
-- Overlaping ranges must be resolved prior to using this macro. | |
-- The row key represents a set of attributes from a model. It is typically a hash of those | |
-- attributes. The user is expected to join the results of this macro back to the | |
-- original data to get the full attribute set. | |
-- Canonical Example: | |
-- | merge_key | series_key | row_key | from_at | to_at | | |
-- | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | 2020-01-01 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | 2020-02-05 | 2999-12-31 | | |
__trange__left as ( | |
select | |
{{ merge_key }} as merge_key, | |
{{ series_key }} as series_key, | |
{{ row_key_left }} as row_key_left, | |
null::varchar as row_key_right, | |
{{ from_at }} as from_at, | |
{{ to_at }} as to_at | |
from | |
{{ left_model }} | |
), | |
-- Canonical Example: | |
-- | merge_key | series_key | row_key | from_at | to_at | | |
-- | - | - | - | - | - | | |
-- | 1 | S1 | S2-1 | 2020-02-03 | 2020-02-07 | | |
-- | 1 | S1 | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__right as ( | |
select | |
{{ merge_key }} as merge_key, | |
{{ series_key }} as series_key, | |
null::varchar as row_key_left, | |
{{ row_key_right }} as row_key_right, | |
{{ from_at }} as from_at, | |
{{ to_at }} as to_at | |
from | |
{{ right_model }} | |
), | |
-- Combine left and right models into a unified version | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-05 | 2999-12-31 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-03 | 2020-02-07 | | |
-- | 1 | S1 | null | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__as_one as ( | |
select merge_key, series_key, row_key_left, row_key_right, from_at, to_at from __trange__left | |
union all | |
select merge_key, series_key, row_key_left, row_key_right, from_at, to_at from __trange__right | |
), | |
-- For each merge key, collect unique time boundary points from and to dates | |
-- | merge_key | ts_at | | |
-- | - | - | | |
-- | 1 | 2020-01-01 | | |
-- | 1 | 2020-02-03 | | |
-- | 1 | 2020-02-05 | | |
-- | 1 | 2020-02-07 | | |
-- | 1 | 2999-12-31 | | |
__trange__boundary_points as ( | |
select distinct | |
merge_key, | |
ts_at | |
from | |
__trange__as_one | |
unpivot (ts_at for boundary in (from_at, to_at)) | |
group by all | |
), | |
-- For each time point within a merge key, construct a range from the current point ot the next. | |
-- It may seem like this just undoes the work from the __trange__boundary_points CTE, but the | |
-- ranges constructed in this step will be the overlapping ranges, rather than the left/right | |
-- independent ones. | |
-- | merge_key | from_at | to_at | | |
-- | - | - | - | | |
-- | 1 | 2020-01-01 | 2020-02-03 | | |
-- | 1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | 2020-02-07 | 2999-12-31 | | |
__trange__boundary_ranges as ( | |
select | |
*, | |
ts_at as from_at, | |
lead(ts_at) over (partition by merge_key order by ts_at) as to_at | |
from | |
__trange__boundary_points | |
qualify | |
to_at is not null | |
), | |
-- Split up the orginal time ranges by boundaries identified in the other data. | |
-- e.g., if the left range includes the start or end of the right range, the left range | |
-- is split into a smaller segment. | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
-- | 1 | S1 | S1-1 | null | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | S1-2 | null | 2020-02-07 | 2999-12-31 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | null | S2-1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | null | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__merge as ( | |
select | |
as_one.merge_key, | |
as_one.series_key, | |
as_one.row_key_left, | |
as_one.row_key_right, | |
boundary_ranges.from_at, | |
boundary_ranges.to_at, | |
from | |
__trange__as_one as as_one | |
inner join | |
__trange__boundary_ranges as boundary_ranges | |
on | |
as_one.merge_key = boundary_ranges.merge_key | |
and ( | |
as_one.from_at <= boundary_ranges.from_at | |
and as_one.to_at >= boundary_ranges.to_at | |
) | |
), | |
-- For records with the same series key and date ranges, one record comes from the left | |
-- data (with a null right row key) and the other comes from the right data (with a null left row key). | |
-- Fill in the null values and drop the duplicates. | |
-- | merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
-- | - | - | - | - | - | - | | |
-- | 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
-- | 1 | S1 | S1-1 | S2-1 | 2020-02-03 | 2020-02-05 | | |
-- | 1 | S1 | S1-2 | S2-1 | 2020-02-05 | 2020-02-07 | | |
-- | 1 | S1 | S1-2 | S2-2 | 2020-02-07 | 2999-12-31 | | |
__trange__fill_values as ( | |
select distinct | |
merge_key, | |
series_key, | |
max(row_key_left) over (partition by merge_key, series_key, from_at, to_at) as row_key_left, | |
max(row_key_right) over (partition by merge_key, series_key, from_at, to_at) as row_key_right, | |
from_at, | |
to_at | |
from | |
__trange__merge | |
), | |
-- Rename fields to match user expectations. | |
trange__final as ( | |
select | |
merge_key as {{ merge_key }}, | |
{%- if merge_key != series_key %} | |
series_key as {{ series_key }}, | |
{%- endif %} | |
row_key_left as {% if row_key_left == row_key_right %}{{ row_key_left }}_left{% else %}{{ row_key_left }}{% endif %}, | |
row_key_right as {% if row_key_left == row_key_right %}{{ row_key_right }}_right{% else %}{{ row_key_right }}{% endif %}, | |
from_at as {{ from_at }}, | |
to_at as {{ to_at }} | |
from | |
__trange__fill_values | |
) | |
{% endmacro %} |
--- Yaml file (sorry, forgot how to create multi-file gists if that's a thing... | |
version: 2 | |
macros: | |
- name: trange_join | |
description: | | |
This macro allows the user to perform a temporal range join between two models. | |
For example, supose we have the two datasets below. The merge key defines how the | |
two models are to be joined together. The series key represents a time series of data | |
represented in ranges of time where the various attributes (represented as a row key) are | |
constant. Time series keys that are shared between the two datasets will be interleaved. | |
Given left_model: | |
``` | |
| merge_key | series_key | row_key | from_at | to_at | | |
| - | - | - | - | - | | |
| 1 | S1 | S1-1 | 2020-01-01 | 2020-02-05 | | |
| 1 | S1 | S1-2 | 2020-02-05 | 2999-12-31 | | |
``` | |
Given right_model: | |
``` | |
| merge_key | series_key | row_key | from_at | to_at | | |
| - | - | - | - | - | | |
| 1 | S1 | S2-1 | 2020-02-03 | 2020-02-07 | | |
| 1 | S1 | S2-2 | 2020-02-07 | 2999-12-31 | | |
``` | |
Resultant temporal range join: | |
``` | |
| merge_key | series_key | row_key_left | row_key_right | from_at | to_at | | |
| - | - | - | - | - | - | | |
| 1 | S1 | S1-1 | null | 2020-01-01 | 2020-02-03 | | |
| 1 | S1 | S1-1 | S2-1 | 2020-02-03 | 2020-02-05 | | |
| 1 | S1 | S1-2 | S2-1 | 2020-02-05 | 2020-02-07 | | |
| 1 | S1 | S1-2 | S2-2 | 2020-02-07 | 2999-12-31 | | |
``` | |
If there are multiple series within a merge key, all of those series will be broken up | |
into the same ranges, even if there is no change in attributes between one range and another | |
within the same series. This is useful when you want to define ranges of times where | |
records within a group (merge key) have constant attributes. | |
Requirements and assumptions: | |
* Models must be CTEs. | |
* The "left" and "right" designations are arbitrary. In fact, they _may_ be the same model. | |
* The `merge_key` and the `series_key` _may_ be the same column. | |
* The `row_key` _must_ uniquely define a record in each input model. | |
* For the left and right input models, within a merge key, each series _must_ only | |
contain non-overlapping ranges. Different series within a merge _may_ have have | |
overlapping ranges. | |
* The final result is a CTE named `trange__final`. Select from this CTE. The user | |
needs to join the `row_key_left` and `row_key_right` back to the original two models | |
in order to get the full attributes they desire. | |
docs: | |
show: true | |
arguments: | |
- name: left_model | |
type: string | |
description: The name of one model participating in the range join. | |
- name: row_key_left | |
type: string | |
description: Name of the column from the left model that uniquely identifies a row. | |
- name: right_model | |
type: string | |
description: The name of another model participating in the range join. | |
- name: row_key_right | |
type: string | |
description: Name of the column from the right model that uniquely identifies a row. | |
- name: merge_key | |
type: string | |
description: | | |
Name of the column containing the merge key. Both left and right models must contain | |
this column. | |
- name: series_key | |
type: string | |
description: | | |
Name of the column containing the series key. Both left and right models must contain | |
this column. | |
- name: from_at | |
type: string | |
description: | | |
Name of the column containing the range start timestamp. | |
Both left and right models must contain this column. | |
- name: to_at | |
type: string | |
description: | | |
Name of the column containing the range end timestamp (exclusive). | |
Both left and right models must contain this column. | |
unit_tests: | |
- name: test_trange_join_canonical | |
description: Simple range join example showing how overlapping ranges each get their own row. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-02-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-02-05', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-02-03', to_at: '2020-02-07'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-02-07', to_at: '2999-12-31'} | |
expect: | |
rows: | |
# First range is where the right model has no data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-02-03'} | |
# Then right model has some data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-02-03', to_at: '2020-02-05'} | |
# Then the left model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-02-05', to_at: '2020-02-07'} | |
# Then the right model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-02-07', to_at: '2999-12-31'} | |
- name: test_trange_join_timestamps | |
description: Simple range join example with full timestamp precision | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31 23:59:59'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01 02:00:00', to_at: '2020-01-05 10:00:00'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-05 10:00:00', to_at: '2999-12-31 23:59:59'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03 06:00:00', to_at: '2020-01-07 14:00:00'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-07 14:00:00', to_at: '2999-12-31 23:59:59'} | |
expect: | |
rows: | |
# First range is where the right model has no data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01 02:00:00', to_at: '2020-01-03 06:00:00'} | |
# Then right model has some data | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03 06:00:00', to_at: '2020-01-05 10:00:00'} | |
# Then the left model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-01-05 10:00:00', to_at: '2020-01-07 14:00:00'} | |
# Then the right model changes | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-07 14:00:00', to_at: '2999-12-31 23:59:59'} | |
- name: test_trange_join_series_gap | |
description: Range join example showing how series gaps are retained. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-05'} | |
# 1 day gap between previous record | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-06', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-07'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-07', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-03'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: null , row_key_right: 'S2-1', from_at: '2020-01-05', to_at: '2020-01-06'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-1', from_at: '2020-01-06', to_at: '2020-01-07'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-07', to_at: '2999-12-31'} | |
- name: test_trange_join_shared_boundary | |
description: | | |
Simple range join example showing that we get expected result even when boundaries are shared | |
between the two models. | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S2-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-03'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: 'S2-1', from_at: '2020-01-03', to_at: '2020-01-05'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: 'S2-2', from_at: '2020-01-05', to_at: '2999-12-31'} | |
- name: test_trange_join_merge_multi | |
description: | | |
Range join example showing how multiple series are merged. That is, there can be multiple | |
distinct timeseries sharing the same merge key. When these datsets are merged, | |
we end up with those series all having the same ranges (even if nothing in the series changed, | |
but something in another series with the same merge key changed). | |
model: unit__trange_join | |
overrides: | |
vars: | |
OPEN_END_DATE: "'2999-12-31'::timestamp" | |
given: | |
- input: ref('unit__trange_join_input_left') | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-1', from_at: '2020-01-01', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S1', row_key: 'S1-2', from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'S2', row_key: 'S2-1', from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S2', row_key: 'S2-2', from_at: '2020-01-04', to_at: '2999-12-31'} | |
- input: ref('unit__trange_join_input_right') | |
rows: | |
- {merge_key: '1', series_key: 'SX', row_key: 'SX-1', from_at: '2020-01-01', to_at: '2999-12-31'} | |
expect: | |
rows: | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-1', row_key_right: null , from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S1', row_key_left: 'S1-2', row_key_right: null , from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-1', row_key_right: null , from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-2', row_key_right: null , from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'S2', row_key_left: 'S2-2', row_key_right: null , from_at: '2020-01-10', to_at: '2999-12-31'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-01', to_at: '2020-01-04'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-04', to_at: '2020-01-10'} | |
- {merge_key: '1', series_key: 'SX', row_key_left: null , row_key_right: 'SX-1', from_at: '2020-01-10', to_at: '2999-12-31'} |
@gnilrets - really great work. Thanks!
This is great. We were trying to solve this same problem
Awesome work !! I integrated this into my project and it works perfectly !
Am I right to say that this macro only works when the FKs are on the LEFT model ? For instance if you have your dimensions organized in a snowflake structure. You'd need to use the macro several times in a nested manner, right ?
Thank you for this great macro! What does trange
stand for and how do you pronounce it? "tee-range" or "trange"?
Thanks @panasenco ! It stands for "temporal range" and I just pronounce it "tee-range". Also you reminded me that I forgot to update this after I discovered a serious performance bug with this macro that can happen in certain situations. I've updated this gist.
Hey, thanks for keeping this awesome macro updated. Would I be correct in saying that since the macro doesn't account for more than 2 source models anymore if we wanted to apply it to more than 2 we could call this macro multiple times? E.g. call it once on 2 source models, then again on the resultant of that and a third source model and so forth?
Thanks again
thanks :) could you also expand on what the series key is in this new revision? still a bit confused after reading the yml
@dongchris - I haven't tried, but I don't see why it couldn't work.