Created
May 7, 2024 22:26
-
-
Save wesseljt/c6e12a047f16a6a09e5e25cbcb0298b6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Create the stitching table to store results | |
CREATE TEMPORARY TABLE user_stitching ( | |
user_id INT, | |
mapped_id INT, | |
mapping_type VARCHAR(10), | |
earliest_mapping_timestamp TIMESTAMP, | |
last_activity_timestamp TIMESTAMP, | |
is_active BOOLEAN, | |
source VARCHAR(50) | |
); | |
-- Insert basic stitching data | |
INSERT INTO user_stitching | |
SELECT | |
a.user_id, | |
b.mapped_id, | |
CASE | |
WHEN a.user_id = b.mapped_id THEN 'self' | |
ELSE 'mapped' | |
END AS mapping_type, | |
a.earliest_mapping_timestamp, | |
NULL AS last_activity_timestamp, | |
TRUE AS is_active, | |
'initial' AS source | |
FROM initial_user_mappings a | |
JOIN additional_mapping_data b ON a.user_id = b.user_id | |
WHERE a.earliest_mapping_timestamp < b.cutoff_timestamp; | |
-- Update stitching with additional mappings and different conditions | |
INSERT INTO user_stitching | |
SELECT | |
a.user_id, | |
c.mapped_id, | |
'cross_mapped' AS mapping_type, | |
a.earliest_mapping_timestamp, | |
MAX(c.activity_timestamp) AS last_activity_timestamp, | |
TRUE AS is_active, | |
'cross' AS source | |
FROM initial_user_mappings a | |
JOIN cross_user_mappings c ON a.user_id = c.user_id | |
WHERE a.earliest_mapping_timestamp < c.cutoff_timestamp | |
GROUP BY a.user_id, c.mapped_id, a.earliest_mapping_timestamp; | |
-- Validate and deactivate outdated mappings | |
UPDATE user_stitching | |
SET is_active = FALSE | |
WHERE last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year') | |
AND mapping_type = 'cross_mapped'; | |
-- Insert activity based mappings with multiple joins and conditions | |
INSERT INTO user_stitching | |
SELECT | |
e.user_id, | |
e.mapped_id, | |
'activity_based' AS mapping_type, | |
e.earliest_mapping_timestamp, | |
MAX(f.activity_timestamp) AS last_activity_timestamp, | |
TRUE AS is_active, | |
'activity' AS source | |
FROM user_activity_mappings e | |
JOIN activity_logs f ON e.user_id = f.user_id | |
WHERE e.earliest_mapping_timestamp < f.activity_timestamp | |
GROUP BY e.user_id, e.mapped_id, e.earliest_mapping_timestamp; | |
-- Consolidate duplicates with priority to the latest mapping | |
WITH ranked_mappings AS ( | |
SELECT *, | |
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY last_activity_timestamp DESC) AS rank | |
FROM user_stitching | |
) | |
DELETE FROM ranked_mappings WHERE rank > 1; | |
-- Insert complex conditional mappings based on user segmentation | |
INSERT INTO user_stitching | |
SELECT | |
g.user_id, | |
h.mapped_id, | |
'segment_based' AS mapping_type, | |
g.earliest_mapping_timestamp, | |
MAX(i.transaction_date) AS last_activity_timestamp, | |
CASE WHEN j.user_segment = 'premium' THEN TRUE ELSE FALSE END AS is_active, | |
'segment' AS source | |
FROM user_segment_mappings g | |
JOIN segment_data h ON g.user_id = h.user_id | |
JOIN transaction_data i ON h.user_id = i.user_id | |
JOIN user_profiles j ON h.user_id = j.user_id | |
WHERE g.earliest_mapping_timestamp < i.transaction_date | |
AND j.user_segment IN ('premium', 'gold') | |
GROUP BY g.user_id, h.mapped_id, g.earliest_mapping_timestamp, j.user_segment; | |
-- Final cleanup pass to remove inactive mappings not updated in the last insertion cycle | |
DELETE FROM user_stitching | |
WHERE is_active = FALSE AND last_activity_timestamp IS NULL; | |
-- Create indexes to improve the performance of the stitching table | |
CREATE INDEX idx_user_stitching_user_id ON user_stitching(user_id); | |
CREATE INDEX idx_user_stitching_mapped_id ON user_stitching(mapped_id); | |
-- Output some diagnostics about the stitching process | |
SELECT 'Total Mappings', COUNT(*) FROM user_stitching UNION | |
SELECT 'Active Mappings', COUNT(*) FROM user_stitching WHERE is_active = TRUE UNION | |
SELECT 'Inactive Mappings', COUNT(*) FROM user_stitching WHERE is_active = FALSE; | |
-- Enhanced Diagnostic Queries | |
-- Count of total mappings by mapping type | |
SELECT | |
mapping_type, | |
COUNT(*) AS total_count | |
FROM user_stitching | |
GROUP BY mapping_type; | |
-- Count of active vs inactive mappings by source | |
SELECT | |
source, | |
is_active, | |
COUNT(*) AS count | |
FROM user_stitching | |
GROUP BY source, is_active; | |
-- Average time span from earliest mapping timestamp to last activity timestamp by mapping type | |
SELECT | |
mapping_type, | |
AVG(last_activity_timestamp - earliest_mapping_timestamp) AS average_duration | |
FROM user_stitching | |
WHERE last_activity_timestamp IS NOT NULL | |
GROUP BY mapping_type; | |
-- Distribution of mappings by user segment for segment-based mappings | |
SELECT | |
'Segment-based Mapping Distribution' AS report, | |
user_segment, | |
COUNT(*) AS num_mappings | |
FROM user_stitching | |
JOIN user_profiles ON user_stitching.user_id = user_profiles.user_id | |
WHERE mapping_type = 'segment_based' | |
GROUP BY user_segment; | |
-- List top 10 users with the most mappings | |
SELECT | |
user_id, | |
COUNT(*) AS mappings_count | |
FROM user_stitching | |
GROUP BY user_id | |
ORDER BY mappings_count DESC | |
LIMIT 10; | |
-- Count mappings that have been updated within the last month | |
SELECT | |
'Recently Updated Mappings' AS report, | |
COUNT(*) AS count | |
FROM user_stitching | |
WHERE last_activity_timestamp >= (CURRENT_DATE - INTERVAL '30 days'); | |
-- Identify mappings without recent activity | |
SELECT | |
'Stale Mappings' AS report, | |
COUNT(*) AS count | |
FROM user_stitching | |
WHERE is_active = TRUE AND (last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year') OR last_activity_timestamp IS NULL); | |
-- Check consistency of data between user_stitching and user_profiles | |
SELECT | |
'Consistency Check' AS report, | |
COUNT(*) AS inconsistent_entries | |
FROM user_stitching s | |
LEFT JOIN user_profiles p ON s.user_id = p.user_id | |
WHERE p.user_id IS NULL; | |
-- Breakdown of inactive mappings by reason | |
SELECT | |
'Inactive Mapping Reasons' AS report, | |
CASE | |
WHEN last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year') THEN 'No recent activity' | |
WHEN is_active = FALSE THEN 'Manually deactivated' | |
ELSE 'Other' | |
END AS reason, | |
COUNT(*) AS count | |
FROM user_stitching | |
WHERE is_active = FALSE | |
GROUP BY reason; | |
-- Execution of diagnostic queries to monitor and debug the stitching logic continuously | |
-- These queries provide insights into the health of the data, the efficiency of the process, and potential areas of improvement. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment