Last active
November 8, 2020 14:40
-
-
Save johnnyaug/f0cd7805b54228d237af0f33d0f37942 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Athena queries to share inventory stats with the Treeverse team. | |
-- The result of these queries will help us a great deal in engineering lakeFS, without exposing any data or object names. | |
-- Thank you for your effort and help! | |
-- It is assumed that you have a table named inventory_tbl representing the inventory. | |
-- A CREATE statement for such a table is available here: https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-inventory.html#storage-inventory-athena-query | |
-- It is also assumed that Object-Level versioning is enabled on the bucket, and on the inventory (i.e. the inventory contains is_latest as a field). | |
-- If you don't have Object-Level versioning, please contact us and we will provide an alternative set of queries. | |
-- Replace the dates with the dates with the ones you intend to share with us (NOTE: they appear in three queries). | |
-- Replace inventory_tbl with the name of the inventory table, if it is different. | |
-- Run these queries on Athena, and share with us the result of the final SELECT query. | |
CREATE table tmp_lakefs_sample AS ( | |
SELECT dt, key, row_number() over(PARTITION BY dt ORDER BY key) AS block_num | |
FROM inventory_tbl TABLESAMPLE BERNOULLI(0.02) | |
WHERE dt between '2020-10-20-00-00' and '2020-10-26-00-00' AND is_latest | |
); | |
CREATE table tmp_lakefs_blocks WITH (partitioned_by = ARRAY['dt']) AS ( | |
SELECT t1.key AS min_key, t2.key as max_key, t1.block_num, t1.dt | |
FROM tmp_lakefs_sample t1 JOIN tmp_lakefs_sample t2 ON t1.dt = t2.dt AND t1.block_num + 1 = t2.block_num | |
GROUP BY t1.dt, t1.key, t2.key, t1.block_num | |
ORDER BY t1.block_num, 2, 3 | |
); | |
CREATE table tmp_lakefs_inventory_times AS ( | |
SELECT dt, MAX(last_modified_date) AS inventory_time | |
FROM inventory_tbl s | |
WHERE dt between '2020-10-20-00-00' and '2020-10-26-00-00' | |
GROUP BY 1 | |
); | |
CREATE table tmp_lakefs_recently_modified | |
WITH (partitioned_by = ARRAY['dt']) | |
AS ( | |
SELECT key,s.dt | |
FROM inventory_tbl s JOIN tmp_lakefs_inventory_times it | |
ON it.dt = s.dt | |
WHERE s.last_modified_date > it.inventory_time - interval '1' day AND is_latest | |
); | |
-- Send us the results of the following query: | |
SELECT dt,block_num,EXISTS( | |
SELECT key FROM tmp_lakefs_recently_modified s WHERE b.dt = s.dt AND s.key >= b.min_key AND s.key < b.max_key | |
) AS recently_modified | |
FROM tmp_lakefs_blocks b; | |
-- Please also send the results of the following query: | |
SELECT *, ROW_NUMBER() OVER(PARTITION BY dt ORDER BY changes DESC) AS rnk | |
FROM ( | |
SELECT s.dt, last_modified_date, COUNT(*) AS changes | |
FROM inventory_tbl s JOIN tmp_lakefs_inventory_times it ON it.dt = s.dt | |
WHERE s.last_modified_date > it.inventory_time - interval '1' day | |
AND s.dt BETWEEN '2020-10-20-00-00' AND '2020-10-26-00-00' | |
GROUP BY 1,2 | |
); | |
-- Drop all tables | |
DROP TABLE tmp_lakefs_sample; | |
DROP TABLE tmp_lakefs_blocks; | |
DROP TABLE tmp_lakefs_recently_modified; | |
DROP TABLE tmp_lakefs_inventory_times; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment