-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathminimal_variant_query_guide.sql
More file actions
157 lines (127 loc) · 5.46 KB
/
minimal_variant_query_guide.sql
File metadata and controls
157 lines (127 loc) · 5.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
-- =====================================================
-- MINIMAL VARIANT QUERY GUIDE
-- How to query data stored in Variant(JSON) column
-- =====================================================
-- Table structure:
-- CREATE TABLE bluesky_data (data Variant(JSON))
-- =====================================================
-- BASIC EXTRACTION PATTERNS
-- =====================================================
-- 1. Extract top-level string fields
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind
FROM bluesky_minimal_1m.bluesky_data;
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'did') as did
FROM bluesky_minimal_1m.bluesky_data;
-- 2. Extract nested string fields
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'collection') as collection
FROM bluesky_minimal_1m.bluesky_data;
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'operation') as operation
FROM bluesky_minimal_1m.bluesky_data;
-- 3. Extract numeric fields
SELECT JSONExtractUInt(toString(variantElement(data, 'JSON')), 'time_us') as time_us
FROM bluesky_minimal_1m.bluesky_data;
-- 4. Extract and convert to different types
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind,
JSONExtractUInt(toString(variantElement(data, 'JSON')), 'time_us') as time_us,
JSONExtractString(toString(variantElement(data, 'JSON')), 'did') as did
FROM bluesky_minimal_1m.bluesky_data;
-- =====================================================
-- FILTERING AND WHERE CLAUSES
-- =====================================================
-- Filter by string equality
SELECT count()
FROM bluesky_minimal_1m.bluesky_data
WHERE JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') = 'commit';
-- Filter by nested field
SELECT count()
FROM bluesky_minimal_1m.bluesky_data
WHERE JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'operation') = 'create';
-- Filter by numeric comparison
SELECT count()
FROM bluesky_minimal_1m.bluesky_data
WHERE JSONExtractUInt(toString(variantElement(data, 'JSON')), 'time_us') > 1700000000000000;
-- Filter with multiple conditions
SELECT count()
FROM bluesky_minimal_1m.bluesky_data
WHERE JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') = 'commit'
AND JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'collection') = 'app.bsky.feed.post';
-- =====================================================
-- AGGREGATIONS AND GROUP BY
-- =====================================================
-- Count by field
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind,
count()
FROM bluesky_minimal_1m.bluesky_data
GROUP BY kind
ORDER BY count() DESC;
-- Count by nested field
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'collection') as collection,
count()
FROM bluesky_minimal_1m.bluesky_data
WHERE collection != ''
GROUP BY collection
ORDER BY count() DESC;
-- Group by multiple fields
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'operation') as op,
JSONExtractString(toString(variantElement(data, 'JSON')), 'commit', 'collection') as coll,
count()
FROM bluesky_minimal_1m.bluesky_data
WHERE op != '' AND coll != ''
GROUP BY op, coll
ORDER BY count() DESC;
-- =====================================================
-- ADVANCED PATTERNS
-- =====================================================
-- Extract arrays (if they exist)
SELECT JSONExtractArrayRaw(toString(variantElement(data, 'JSON')), 'some_array_field')
FROM bluesky_minimal_1m.bluesky_data;
-- Check if field exists
SELECT count()
FROM bluesky_minimal_1m.bluesky_data
WHERE JSONHas(toString(variantElement(data, 'JSON')), 'commit');
-- Extract with default value
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'kind'),
JSONExtractString(toString(variantElement(data, 'JSON')), 'nonexistent_field') as missing_field
FROM bluesky_minimal_1m.bluesky_data
LIMIT 3;
-- =====================================================
-- PERFORMANCE CONSIDERATIONS
-- =====================================================
-- Bad: Don't extract the same field multiple times
-- SELECT
-- JSONExtractString(toString(variantElement(data, 'JSON')), 'kind'),
-- JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') -- Duplicate!
-- Good: Extract once and reuse
SELECT
kind,
kind as kind_copy
FROM (
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind
FROM bluesky_minimal_1m.bluesky_data
);
-- =====================================================
-- COMPARISON WITH JSON BASELINE
-- =====================================================
-- Minimal Variant approach (complex):
SELECT JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind
FROM bluesky_minimal_1m.bluesky_data;
-- JSON Baseline approach (simple):
-- SELECT toString(data.kind) as kind
-- FROM bluesky_1m.bluesky;
-- =====================================================
-- MEMORY OPTIMIZATION (if needed)
-- =====================================================
-- Add these settings for large queries to prevent memory issues:
-- SETTINGS max_threads = 1, max_memory_usage = 4000000000
SELECT
JSONExtractString(toString(variantElement(data, 'JSON')), 'kind') as kind,
count()
FROM bluesky_minimal_1m.bluesky_data
GROUP BY kind
ORDER BY count() DESC
SETTINGS max_threads = 1, max_memory_usage = 4000000000;