Created
December 18, 2024 13:48
-
-
Save mjpitz/975acf6bf2629e85d2975978d918f90f to your computer and use it in GitHub Desktop.
Dataset builder example using Go, GORM, and ClickHouse.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (C) 2024 C8 Labs | |
// SPDX-License-Identifier: AGPL-3.0-or-later | |
// This file illustrates the notion of an "index builder" or "dataset builder" using Go, GORM, and ClickHouse. While | |
// working at Indeed, we leveraged some fairly robust data pipelines that were built on top of our analytics engine. | |
// One of those parts were this idea of an index builder which allowed us to derive higher level metrics from data | |
// stored in our data warehouse. | |
// | |
// For most projects, I wind up using Go and GORM for quick and easy schema management. This offered a convenient | |
// way to define and iterate on schema over time and build migrations in as part of the index building job. | |
// | |
// When writing this example, I wanted to illustrate how someone with a non-technical background, who knows SQL can | |
// their own builder with very little effort (namely... defining schema, and the query used to populate the table). | |
// | |
package datasets | |
import ( | |
"time" | |
"github.com/urfave/cli/v2" | |
) | |
// CustomerLTV - tracks lifetime value over time. | |
// It's computed using the following formula: | |
// | |
// (revenue / user_count / (1 - churn_rate)) - acquisition_cost | |
// | |
// `company_spend` - how much the company spends on ads and marketing campaigns | |
// | |
// date ... spend_millicent | |
// 2024-01-01 ... 10000000 | |
// 2024-01-02 ... 10000000 | |
// ... | |
// 2024-12-16 ... 10000000 | |
// | |
// `customer_spend` - the amount customers spent on the platform. | |
// (e.g. monthly subscription) | |
// | |
// date customer_id spend_millicent | |
// 2024-01-01 ... 10000000 | |
// 2024-01-01 ... 10000000 | |
// 2024-02-01 ... 10000000 | |
// ... | |
// 2024-12-01 ... 10000000 | |
// | |
// . | |
type CustomerLTV struct { | |
Date time.Time `gorm:"date"` | |
UserCount int64 `gorm:"user_count"` | |
RevenueMillicent int64 `gorm:"revenue_millicent"` | |
AcquisitionCostMillicent int64 `gorm:"acquisition_cost_millicent"` | |
ChurnRateBasisPoints int32 `gorm:"churn_rate_basis_points"` | |
LifeTimeValueMillicent int64 `gorm:"life_time_value_millicent"` | |
} | |
//goland:noinspection ALL | |
const customerLTV = ` | |
INSERT INTO customer_ltv (*) | |
SELECT | |
"date", | |
customers.user_count AS user_count, | |
customers.revenue_millicent AS revenue_millicent, | |
company.acquisition_cost_millicent AS acquisition_cost_millicent, | |
74 as churn_rate_basis_points, | |
(customers.revenue_millicent / customers.user_count / (1 - 0.074)) - company.acquisition_cost_millicent as life_time_value_millicent | |
FROM ( | |
SELECT | |
"date", | |
count(distinct(customer_id)) AS user_count, | |
sum(spend_millicent) AS revenue_millicent | |
FROM customer_spend | |
WHERE "date" > ? | |
GROUP BY "date" | |
ORDER BY "date" ASC | |
) AS customers, | |
( | |
SELECT | |
"date", | |
sum(spend_millicent) AS acquisition_cost_millicent | |
FROM company_spend | |
WHERE "date" > ? | |
GROUP BY "date" | |
ORDER BY "date" ASC | |
) AS company | |
USING ("date") | |
GROUP BY "date" | |
ORDER BY "date" ASC | |
` | |
//=== | |
// Lots of boilerplate below this line | |
//=== | |
type CLTVConfig struct { | |
Config | |
ChurnBasisPoints int `json:"churn_basis_points" usage:"the churn rate for customers, in basis points" default:"74"` | |
StartTime string `json:"start_time" usage:"the start time of the query"` | |
} | |
var ( | |
cltvConfig = &CLTVConfig{} | |
cltv = &cli.Command{ | |
Name: "cltv", | |
Usage: "Demonstrates index builders by compute customer lifetime value", | |
Action: func(ctx *cli.Context) error { | |
cfg := cltvConfig | |
db, err := cfg.Config.Open() | |
if err != nil { | |
return nil | |
} | |
err = db.AutoMigrate(CustomerLTV{}) | |
if err != nil { | |
return nil | |
} | |
return db.Raw( | |
customerLTV, | |
cfg.StartTime, | |
cfg.StartTime, | |
).Error | |
}, | |
} | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (C) 2024 C8 Labs | |
// SPDX-License-Identifier: AGPL-3.0-or-later | |
package datasets | |
import ( | |
"fmt" | |
"gorm.io/driver/clickhouse" | |
"gorm.io/gorm" | |
"gorm.io/gorm/logger" | |
) | |
type Config struct { | |
Driver string `json:"driver" usage:"Specify the driver used to connect to the database" default:"clickhouse"` | |
ConnectionString string `json:"connection_string" usage:"The connection string to the persistent store" required:"true"` | |
TableOptions string `json:"table_options" usage:"Options for the clickhouse table" default:"ENGINE=Distributed(cluster, default, hits)"` | |
TableClusterOptions string `json:"table_cluster_options" usage:"Options for the clickhouse table on the cluster" default:"on cluster default"` | |
} | |
func (cfg Config) Open() (*gorm.DB, error) { | |
if cfg.ConnectionString == "" { | |
return nil, fmt.Errorf("no connection string provided") | |
} | |
var dialect gorm.Dialector | |
switch cfg.Driver { | |
case "clickhouse": | |
dialect = clickhouse.Open(cfg.ConnectionString) | |
default: | |
return nil, fmt.Errorf("driver not yet supported: %s", cfg.Driver) | |
} | |
db, err := gorm.Open(dialect, &gorm.Config{ | |
Logger: logger.Default.LogMode(logger.Silent), | |
}) | |
if err != nil { | |
return nil, err | |
} | |
db.Set("gorm:table_options", cfg.TableOptions) | |
db.Set("gorm:table_cluster_options", cfg.TableClusterOptions) | |
return db, nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment