Last active
September 6, 2023 16:54
-
-
Save danthegoodman1/20dceb93dbffdfe8ff3b8b0d4a1f409d to your computer and use it in GitHub Desktop.
ClickHouse net dollar retention
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with tbl as ( | |
-- fake table | |
select c1 as usr, c2 as price, c3 as d from VALUES ( | |
('a', 10, toDate('2023-08-09')) | |
, ('a', 10, toDate('2023-08-10')) | |
, ('a', 8, toDate('2023-08-11')) | |
, ('a', 11, toDate('2023-08-12')) | |
, ('a', 12, toDate('2023-08-13')) | |
, ('b', 10, toDate('2023-08-09')) | |
, ('b', 10, toDate('2023-08-10')) | |
, ('b', 9, toDate('2023-08-11')) | |
, ('b', 11, toDate('2023-08-12')) | |
, ('b', 12, toDate('2023-08-13')) | |
, ('c', 11, toDate('2023-08-11')) | |
, ('c', 11, toDate('2023-08-12')) | |
, ('c', 15, toDate('2023-08-13')) | |
, ('c', 15, toDate('2023-08-15')) -- a non-contig day | |
) | |
) | |
, mins as ( | |
-- get the first date and price we saw them at (aggregate down if needed) | |
select usr, min(d) as first_d | |
from tbl | |
group by usr | |
order by usr asc | |
) | |
, buckets as ( | |
-- aggregate the days based on the delta to the first date | |
select usr, sum(price) as price, mins.first_d, dateDiff('day', mins.first_d, d) as delta | |
from tbl | |
left join mins on mins.usr = tbl.usr | |
group by usr, delta, mins.first_d | |
) | |
, first_deltas as ( | |
select usr, argMin(price, delta) as first_delta_price | |
from buckets | |
group by usr | |
) | |
-- select * from first_deltas; | |
, per_user as ( | |
-- get each user's ndr | |
select buckets.usr as usr | |
, mins.first_d as start | |
, delta | |
, price/first_deltas.first_delta_price as ndr -- could do multiple quantiles here, etc. | |
from buckets | |
left join mins on mins.usr = buckets.usr | |
left join first_deltas on first_deltas.usr = buckets.usr | |
order by buckets.usr asc, start asc, delta asc with fill | |
) | |
-- select * from per_user; | |
, aggs as ( | |
-- get the average ndr for each start date | |
-- could use this as final select to have tabular format | |
select | |
start -- could do toStartOfInterval here since that only changes the shown start, but not the deltas | |
, delta | |
-- , uniq(if(ndr = 0, null, usr)) as users -- don't count the user if they didn't do anything this delta | |
, uniq(usr) as users -- count them regardless of whether the earned anything | |
, round(avg(ndr), 2) as ndr_a | |
from per_user | |
group by start, delta | |
order by start asc, delta asc | |
) | |
-- select * from aggs; | |
-- group up in arrays for each start date | |
select start, groupArray(ndr_a) as ndr_a, groupArray(users) as users_a, groupArray(delta) as delta_a | |
from aggs | |
group by start | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note that
ndr_a
andusers
drop together (e.g. if a user does not do anything in an interval than the avg goes down) because a user still impacts the average for that cohort (start date). The reason to drop the user number is because that is the users that had something in that delta, not the number of users that were considered when calculating the avg.