Skip to content

Instantly share code, notes, and snippets.

@mjpitz
Created December 18, 2024 13:48
Show Gist options
  • Save mjpitz/975acf6bf2629e85d2975978d918f90f to your computer and use it in GitHub Desktop.
Save mjpitz/975acf6bf2629e85d2975978d918f90f to your computer and use it in GitHub Desktop.
Dataset builder example using Go, GORM, and ClickHouse.
// Copyright (C) 2024 C8 Labs
// SPDX-License-Identifier: AGPL-3.0-or-later
// This file illustrates the notion of an "index builder" or "dataset builder" using Go, GORM, and ClickHouse. While
// working at Indeed, we leveraged some fairly robust data pipelines that were built on top of our analytics engine.
// One of those parts were this idea of an index builder which allowed us to derive higher level metrics from data
// stored in our data warehouse.
//
// For most projects, I wind up using Go and GORM for quick and easy schema management. This offered a convenient
// way to define and iterate on schema over time and build migrations in as part of the index building job.
//
// When writing this example, I wanted to illustrate how someone with a non-technical background, who knows SQL can
// their own builder with very little effort (namely... defining schema, and the query used to populate the table).
//
package datasets
import (
"time"
"github.com/urfave/cli/v2"
)
// CustomerLTV - tracks lifetime value over time.
// It's computed using the following formula:
//
// (revenue / user_count / (1 - churn_rate)) - acquisition_cost
//
// `company_spend` - how much the company spends on ads and marketing campaigns
//
// date ... spend_millicent
// 2024-01-01 ... 10000000
// 2024-01-02 ... 10000000
// ...
// 2024-12-16 ... 10000000
//
// `customer_spend` - the amount customers spent on the platform.
// (e.g. monthly subscription)
//
// date customer_id spend_millicent
// 2024-01-01 ... 10000000
// 2024-01-01 ... 10000000
// 2024-02-01 ... 10000000
// ...
// 2024-12-01 ... 10000000
//
// .
type CustomerLTV struct {
Date time.Time `gorm:"date"`
UserCount int64 `gorm:"user_count"`
RevenueMillicent int64 `gorm:"revenue_millicent"`
AcquisitionCostMillicent int64 `gorm:"acquisition_cost_millicent"`
ChurnRateBasisPoints int32 `gorm:"churn_rate_basis_points"`
LifeTimeValueMillicent int64 `gorm:"life_time_value_millicent"`
}
//goland:noinspection ALL
const customerLTV = `
INSERT INTO customer_ltv (*)
SELECT
"date",
customers.user_count AS user_count,
customers.revenue_millicent AS revenue_millicent,
company.acquisition_cost_millicent AS acquisition_cost_millicent,
74 as churn_rate_basis_points,
(customers.revenue_millicent / customers.user_count / (1 - 0.074)) - company.acquisition_cost_millicent as life_time_value_millicent
FROM (
SELECT
"date",
count(distinct(customer_id)) AS user_count,
sum(spend_millicent) AS revenue_millicent
FROM customer_spend
WHERE "date" > ?
GROUP BY "date"
ORDER BY "date" ASC
) AS customers,
(
SELECT
"date",
sum(spend_millicent) AS acquisition_cost_millicent
FROM company_spend
WHERE "date" > ?
GROUP BY "date"
ORDER BY "date" ASC
) AS company
USING ("date")
GROUP BY "date"
ORDER BY "date" ASC
`
//===
// Lots of boilerplate below this line
//===
type CLTVConfig struct {
Config
ChurnBasisPoints int `json:"churn_basis_points" usage:"the churn rate for customers, in basis points" default:"74"`
StartTime string `json:"start_time" usage:"the start time of the query"`
}
var (
cltvConfig = &CLTVConfig{}
cltv = &cli.Command{
Name: "cltv",
Usage: "Demonstrates index builders by compute customer lifetime value",
Action: func(ctx *cli.Context) error {
cfg := cltvConfig
db, err := cfg.Config.Open()
if err != nil {
return nil
}
err = db.AutoMigrate(CustomerLTV{})
if err != nil {
return nil
}
return db.Raw(
customerLTV,
cfg.StartTime,
cfg.StartTime,
).Error
},
}
)
// Copyright (C) 2024 C8 Labs
// SPDX-License-Identifier: AGPL-3.0-or-later
package datasets
import (
"fmt"
"gorm.io/driver/clickhouse"
"gorm.io/gorm"
"gorm.io/gorm/logger"
)
type Config struct {
Driver string `json:"driver" usage:"Specify the driver used to connect to the database" default:"clickhouse"`
ConnectionString string `json:"connection_string" usage:"The connection string to the persistent store" required:"true"`
TableOptions string `json:"table_options" usage:"Options for the clickhouse table" default:"ENGINE=Distributed(cluster, default, hits)"`
TableClusterOptions string `json:"table_cluster_options" usage:"Options for the clickhouse table on the cluster" default:"on cluster default"`
}
func (cfg Config) Open() (*gorm.DB, error) {
if cfg.ConnectionString == "" {
return nil, fmt.Errorf("no connection string provided")
}
var dialect gorm.Dialector
switch cfg.Driver {
case "clickhouse":
dialect = clickhouse.Open(cfg.ConnectionString)
default:
return nil, fmt.Errorf("driver not yet supported: %s", cfg.Driver)
}
db, err := gorm.Open(dialect, &gorm.Config{
Logger: logger.Default.LogMode(logger.Silent),
})
if err != nil {
return nil, err
}
db.Set("gorm:table_options", cfg.TableOptions)
db.Set("gorm:table_cluster_options", cfg.TableClusterOptions)
return db, nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment