Last active
March 4, 2024 11:37
-
-
Save sayle-doit/b2e651645385b858bb6c44502c4ad1df to your computer and use it in GitHub Desktop.
BigQuery storage billing model recommender on a per table basis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This query will look at a single project (by default in US multi-region) and | |
* calculate the logical and physical billing prices for each table in all datasets | |
* contained inside of it then provide a recommendation on if it is better to put it | |
* into a dataset that uses the logical (uncompressed) or physical (compressed) | |
* storage billing models. | |
* | |
* Physical (also called compressed) Storage went GA on July 5, 2023. It is set at | |
* the dataset level, so if two or more tables are better suited for different | |
* billing models in the same dataset it may be best to split them into separate | |
* datasets to get the best cost benefit. | |
* | |
* Note that there are not any performance penalties for using physical storage as | |
* the underlying data is always compressed and operations operate on that compressed | |
* data already. This is strictly a billing change. | |
* | |
* One thing to note is that there isn't a 100% effective way to determine if an | |
* existing table is logical or physical storage (it only works on newly created | |
* tables, but this is a known issue the BQ team is working on). Once this has | |
* been fixed this query will be updated to show the current model. | |
* | |
* It also includes inside of the storage CTE lots of extra values that can be used | |
* for other calculations that are being left in here to assist you as the customer | |
* make the best decision or to see additional information about your tables/datasets. | |
* | |
* Note it targets the US multi-region by default. If needing to change the region | |
* then change `region-us` below to whichever region the data exists in. Also uncomment | |
* the DECLARE values below for the EU region or if you are using a non-multi-region | |
* then refer here: https://cloud.google.com/bigquery/pricing#storage | |
* for the correct pricing and update accordingly. | |
*/ | |
-- These values are for the US multi-region | |
-- Comment these out and uncomment below if using the EU multi-region | |
DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; | |
DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; | |
DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.04; | |
DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.02; | |
-- These values are for the EU multi-region | |
-- Uncomment these and comment out the above if using the EU multi-region | |
/* | |
DECLARE active_logical_price_per_gb NUMERIC DEFAULT 0.02; | |
DECLARE long_term_logical_price_per_gb NUMERIC DEFAULT 0.01; | |
DECLARE active_physical_price_per_gb NUMERIC DEFAULT 0.044; | |
DECLARE long_term_physical_price_per_gb NUMERIC DEFAULT 0.022; | |
*/ | |
WITH storage AS | |
( | |
SELECT DISTINCT | |
tb.table_name, | |
tb.table_schema AS dataset, | |
total_rows, | |
total_partitions, | |
-- Uncompressed bytes | |
total_logical_bytes AS total_uncompressed_bytes, | |
total_logical_bytes/POW(1024, 3) AS total_logical_gibytes, | |
total_logical_bytes/POW(1024, 4) AS total_logical_tibytes, | |
active_logical_bytes AS active_uncompressed_bytes, | |
active_logical_bytes/POW(1024, 3) AS active_uncompressed_gibytes, | |
active_logical_bytes/POW(1024, 4) AS active_uncompressed_tibytes, | |
long_term_logical_bytes AS long_term_uncompressed_bytes, | |
long_term_logical_bytes/POW(1024, 3) AS long_term_uncompressed_gibytes, | |
long_term_logical_bytes/POW(1024, 4) AS long_term_uncompressed_tibytes, | |
-- Compressed bytes | |
total_physical_bytes AS total_compressed_bytes, | |
total_physical_bytes/POW(1024, 3) AS total_compressed_gibytes, | |
total_physical_bytes/POW(1024, 4) AS total_compressed_tibytes, | |
-- Note that active physical bytes includes time travel so need to remove that | |
active_physical_bytes-time_travel_physical_bytes AS active_compressed_bytes, | |
(active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3) AS active_compressed_gibytes, | |
(active_physical_bytes-time_travel_physical_bytes)/POW(1024, 4) AS active_compressed_tibytes, | |
long_term_physical_bytes AS long_term_compressed_bytes, | |
long_term_physical_bytes/POW(1024, 3) AS long_term_compressed_gibytes, | |
long_term_physical_bytes/POW(1024, 4) AS long_term_compressed_tibytes, | |
time_travel_physical_bytes AS time_travel_compressed_bytes, | |
time_travel_physical_bytes/POW(1024, 3) AS time_travel_compressed_gibytes, | |
time_travel_physical_bytes/POW(1024, 4) AS time_travel_compressed_tibytes, | |
fail_safe_physical_bytes AS fail_safe_physical_bytes, | |
fail_safe_physical_bytes/POW(1024, 3) AS fail_safe_compressed_gibytes, | |
fail_safe_physical_bytes/POW(1024, 4) AS fail_safe_compressed_tibytes, | |
-- Compression ratios | |
SAFE_DIVIDE(total_logical_bytes, total_physical_bytes) AS total_compression_ratio, -- Defined as uncompressed size/compressed size | |
SAFE_DIVIDE(long_term_logical_bytes, long_term_physical_bytes) AS long_term_compression_ratio, | |
SAFE_DIVIDE(active_logical_bytes, active_physical_bytes) AS active_compression_ratio, | |
-- Pricing | |
((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) + | |
((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS total_uncompressed_price, | |
((active_logical_bytes/POW(1024, 3))*active_logical_price_per_gb) AS active_uncompressed_price, | |
((long_term_logical_bytes/POW(1024, 3))*long_term_logical_price_per_gb) AS long_term_uncompressed_price, | |
(((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) + | |
((long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb) AS total_compressed_price, | |
(((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*active_physical_price_per_gb) AS active_compressed_price, | |
(long_term_physical_bytes/POW(1024, 3))*long_term_physical_price_per_gb AS long_term_compressed_price, | |
(time_travel_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS time_travel_compressed_price, | |
(fail_safe_physical_bytes/POW(1024, 3))*active_physical_price_per_gb AS fail_safe_compressed_price | |
FROM | |
`region-us`.INFORMATION_SCHEMA.TABLE_STORAGE AS tb | |
-- Need to join on TABLES for existing tables to remove any temporary or job result tables | |
-- Note due to this information being in the TABLE_STORAGE view this means it cannot be | |
-- performed across an entire organization without checking the TABLES view in each project. | |
JOIN `region-us`.INFORMATION_SCHEMA.TABLES AS t | |
ON t.table_catalog = tb.project_id | |
AND t.table_name = tb.table_name | |
WHERE | |
tb.deleted = false | |
), | |
calculations AS | |
( | |
SELECT | |
table_name, | |
dataset, | |
SUM(active_uncompressed_price) AS active_uncompressed_price, | |
SUM(active_compressed_price) AS active_compressed_price, | |
SUM(long_term_uncompressed_price) AS long_term_uncompressed_price, | |
SUM(long_term_compressed_price) AS long_term_compressed_price, | |
SUM(time_travel_compressed_price) AS time_travel_compressed_price, | |
SUM(fail_safe_compressed_price) AS fail_safe_compressed_price | |
FROM | |
storage | |
GROUP by | |
table_name, dataset | |
), | |
final_data AS | |
( | |
SELECT | |
dataset, | |
table_name, | |
-- Price differences, note that >0 means physical storage is cheaper before adding in time travel and failsafe | |
active_uncompressed_price-active_compressed_price AS active_price_difference, | |
long_term_uncompressed_price-long_term_compressed_price AS long_term_price_difference, | |
-- Time travel and fail safe storage reductions | |
(time_travel_compressed_price+fail_safe_compressed_price) AS additional_costs_for_physical_storage, | |
-- Totals for each model | |
active_uncompressed_price+long_term_uncompressed_price AS logical_storage_price, | |
(active_compressed_price+long_term_compressed_price)+ | |
(time_travel_compressed_price+fail_safe_compressed_price) AS physical_storage_price, | |
-- Difference in values (logical - active) | |
(active_uncompressed_price+long_term_uncompressed_price) | |
- | |
( | |
(active_compressed_price+long_term_compressed_price)+ | |
(time_travel_compressed_price+fail_safe_compressed_price) | |
) AS difference | |
FROM | |
calculations | |
) | |
SELECT | |
dataset, | |
table_name, | |
-- Price differences, note that >0 means physical storage is cheaper before adding in time travel and failsafe | |
CONCAT('$ ',FORMAT("%'.2f", active_price_difference)) AS active_price_difference, | |
CONCAT('$ ',FORMAT("%'.2f", long_term_price_difference)) AS long_term_price_difference, | |
-- Time travel and fail safe storage reductions | |
CONCAT('$ ',FORMAT("%'.2f", additional_costs_for_physical_storage)) AS additional_costs_for_physical_storage, | |
-- Totals for each model | |
CONCAT('$ ',FORMAT("%'.2f", logical_storage_price)) AS logical_storage_price, | |
CONCAT('$ ',FORMAT("%'.2f", physical_storage_price)) AS physical_storage_price, | |
-- Difference between logical storage and physical storage (logical - active) | |
-- Note that a negative value means logica/uncompressed is cheaper | |
CONCAT('$ ',FORMAT("%'.2f", difference)) AS difference_in_price_if_physical_is_chosen, | |
-- Recommendation | |
IF(logical_storage_price < physical_storage_price, | |
'Logical storage', 'Physical storage') AS recommendation, | |
-- If you wish to get the raw values that are not formatted uncomment the below line | |
--final_data.* EXCEPT(dataset) | |
FROM | |
final_data | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment