Skip to content

Commit 3781fe5

Browse files
joeviggianojviggiannick-mccordtunetheweb
authored
CDN 2025 SQL (#4300)
* Pushing 2024 queries into 2025
* Updated queries to pull from latest crawl and dataset.
* Fixed sanList array parsing for tls time.
* Adding client hint, early hint, and image format changes (#4295)
* Removed old early hints queries.
* SQLFluff fix
* Manual formatting
* More linting

---------

Co-authored-by: Joe Viggiano <jviggian@amazon.com>
Co-authored-by: Nick McCord <102171522+nick-mccord@users.noreply.github.com>
Co-authored-by: Barry Pollard <barrypollard@google.com>
1 parent 9f20839 commit 3781fe5

23 files changed

+1576
-0
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
#standardSQL
# cdn_usage_by_site_rank.sql : Distribution of HTML pages served by CDN vs Origin by rank
#
# For each client and each cumulative rank bucket (top 1k, 10k, ... 100M), reports
# how many main-document requests were classified as CDN vs ORIGIN and the share
# of each within the bucket.

WITH main_document_requests AS (
  -- One row per main-document request, labelled by whether the request summary
  -- names a CDN provider. A missing or empty provider is treated as ORIGIN.
  SELECT
    client,
    -- NOTE(review): still to be validated whether this should be resp.rank or pages.rank
    resp.rank,
    CASE
      WHEN COALESCE(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(resp.summary, '$._cdn_provider'), r'^([^,]*).*'), '') = '' THEN 'ORIGIN'
      ELSE 'CDN'
    END AS cdn
  FROM
    `httparchive.crawl.requests` AS resp
  INNER JOIN
    `httparchive.crawl.pages`
  USING (page, client, date)
  WHERE
    is_main_document AND
    date = '2025-07-01'
),

rank_bucket_counts AS (
  -- Buckets are cumulative: a request is counted in every bucket whose
  -- threshold is at least its rank.
  SELECT
    client,
    nested_rank,
    cdn,
    COUNT(0) AS num_requests
  FROM
    main_document_requests
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS nested_rank
  WHERE
    rank <= nested_rank
  GROUP BY
    client,
    nested_rank,
    cdn
)

SELECT
  client,
  nested_rank,
  cdn,
  num_requests,
  -- Total across both cdn labels for this client and rank bucket
  SUM(num_requests) OVER (PARTITION BY client, nested_rank) AS total,
  num_requests / SUM(num_requests) OVER (PARTITION BY client, nested_rank) AS pct_requests
FROM
  rank_bucket_counts
ORDER BY
  client,
  nested_rank,
  cdn
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
#standardSQL
-- Baseline Client Hints adoption metrics for context
-- Measures both server-side (Accept-CH) and client-side (Sec-CH-*) adoption
--
-- Output: one row per client with request/page totals, the number and percentage
-- of requests/pages whose response carries a non-empty Accept-CH header, the
-- number and percentage of requests/pages that send any Sec-CH-* request header,
-- and the number of distinct Sec-CH-* header names seen.
DECLARE d DATE DEFAULT '2025-07-01';

WITH
-- One row per request, flagged for each kind of Client Hints usage.
-- EXISTS over the header arrays (instead of joining to UNNEST) keeps exactly one
-- row per request, so a request carrying several matching headers is counted
-- once and every numerator uses the same unit as the COUNT(0) denominator.
request_flags AS (
  SELECT
    client,
    page,
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE
        LOWER(h.name) = 'accept-ch' AND
        h.value IS NOT NULL AND
        h.value != ''
    ) AS has_accept_ch,
    EXISTS(
      SELECT 1
      FROM UNNEST(request_headers) AS h
      WHERE
        LOWER(h.name) LIKE 'sec-ch-%'
    ) AS sends_hints
  FROM
    `httparchive.crawl.requests`
  WHERE
    date = d
),

-- Totals and adoption counts per client, computed in a single pass
request_stats AS (
  SELECT
    client,
    COUNT(0) AS total_requests,
    COUNT(DISTINCT page) AS total_pages,
    COUNTIF(has_accept_ch) AS requests_with_accept_ch,
    COUNT(DISTINCT IF(has_accept_ch, page, NULL)) AS pages_with_accept_ch,
    COUNTIF(sends_hints) AS requests_with_hints,
    COUNT(DISTINCT IF(sends_hints, page, NULL)) AS pages_sending_hints
  FROM
    request_flags
  GROUP BY
    client
),

-- Count distinct hint types being sent.
-- This needs one row per header, so it unnests the request headers.
hint_diversity AS (
  SELECT
    r.client,
    COUNT(DISTINCT LOWER(h.name)) AS distinct_hint_types
  FROM
    `httparchive.crawl.requests` AS r
  CROSS JOIN
    UNNEST(r.request_headers) AS h
  WHERE
    r.date = d AND
    LOWER(h.name) LIKE 'sec-ch-%'
  GROUP BY
    r.client
)

SELECT
  s.client AS `Client`,
  s.total_requests AS `Total_Requests`,
  s.total_pages AS `Total_Pages`,

  -- Server-side adoption (Accept-CH)
  s.requests_with_accept_ch AS `Requests_with_Accept-CH`,
  s.pages_with_accept_ch AS `Pages_with_Accept-CH`,
  ROUND(s.requests_with_accept_ch / s.total_requests * 100, 2) AS `Pct_Requests_with_Accept-CH`,
  ROUND(s.pages_with_accept_ch / s.total_pages * 100, 2) AS `Pct_Pages_with_Accept-CH`,

  -- Client-side adoption (Sec-CH-*)
  s.requests_with_hints AS `Requests_Sending_Hints`,
  s.pages_sending_hints AS `Pages_Sending_Hints`,
  ROUND(s.requests_with_hints / s.total_requests * 100, 2) AS `Pct_Requests_Sending_Hints`,
  ROUND(s.pages_sending_hints / s.total_pages * 100, 2) AS `Pct_Pages_Sending_Hints`,

  -- Diversity (0 when a client sent no Sec-CH-* headers at all)
  IFNULL(hd.distinct_hint_types, 0) AS `Distinct_Hint_Types_in_Use`

FROM
  request_stats AS s
LEFT JOIN
  hint_diversity AS hd
ON
  s.client = hd.client
ORDER BY
  s.client;
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
#standardSQL
-- Analyze Client Hints usage by specific CDN provider
--
-- Output: for each CDN provider and client with at least 1000 requests, the
-- total request count and the number/percentage of requests whose response
-- carries a non-empty Accept-CH header. Limited to the 50 largest rows.
DECLARE d DATE DEFAULT '2025-07-01';

WITH
-- Extract CDN provider and check for Accept-CH header.
-- The provider is the first comma-separated entry of summary._cdn_provider;
-- a missing or empty value is labelled 'Origin'.
-- EXISTS keeps one row per request, so a response with several Accept-CH
-- headers is counted once rather than once per header.
cdn_requests AS (
  SELECT
    client,
    IFNULL(
      NULLIF(
        REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'),
        ''
      ),
      'Origin'
    ) AS cdn_provider,
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE
        LOWER(h.name) = 'accept-ch' AND
        h.value IS NOT NULL AND
        h.value != ''
    ) AS has_accept_ch
  FROM
    `httparchive.crawl.requests`
  WHERE
    date = d
),

-- Count total requests and Accept-CH requests per CDN in a single pass
cdn_stats AS (
  SELECT
    client,
    cdn_provider,
    COUNT(0) AS total_requests,
    COUNTIF(has_accept_ch) AS requests_with_accept_ch
  FROM
    cdn_requests
  GROUP BY
    client,
    cdn_provider
)

SELECT
  s.cdn_provider AS `CDN_Provider`,
  s.client AS `Client`,
  s.total_requests AS `Total_Requests`,
  s.requests_with_accept_ch AS `Requests_with_Accept-CH`,
  ROUND(s.requests_with_accept_ch / s.total_requests * 100, 2) AS `Pct_with_Accept-CH`
FROM
  cdn_stats AS s
WHERE
  s.total_requests >= 1000 -- Filter to CDNs with meaningful sample size
ORDER BY
  s.total_requests DESC,
  s.cdn_provider,
  s.client
LIMIT 50;
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
#standardSQL
-- Request counts for each known Sec-CH-* client hint request header, split into
-- mobile and desktop columns and labelled with a category and entropy level.
-- Hints in the dictionary that never appear are reported with zero counts.
DECLARE d DATE DEFAULT '2025-07-01';

WITH
-- Occurrences of each Sec-CH-* request header per client (mobile/desktop)
hint_counts AS (
  SELECT
    LOWER(h.name) AS client_hint,
    r.client,
    COUNT(0) AS requests
  FROM
    `httparchive.crawl.requests` AS r
  CROSS JOIN
    UNNEST(r.request_headers) AS h
  WHERE
    LOWER(h.name) LIKE 'sec-ch-%' AND
    r.date = d
  GROUP BY
    client_hint,
    r.client
),

-- Collapse to one row per hint with a column per client
hint_rollup AS (
  SELECT
    client_hint,
    SUM(IF(client = 'mobile', requests, 0)) AS mobile_requests,
    SUM(IF(client = 'desktop', requests, 0)) AS desktop_requests
  FROM
    hint_counts
  GROUP BY
    client_hint
),

-- Lookup of reported hints: category and entropy labels, plus `ord`, which
-- fixes the output row order
dict AS (
  SELECT *
  FROM UNNEST([
    STRUCT('sec-ch-ua' AS client_hint, 'User-Agent' AS category, 'Low' AS entropy, 1 AS ord),
    ('sec-ch-ua-mobile', 'User-Agent', 'Low', 2),
    ('sec-ch-ua-platform', 'User-Agent', 'Low', 3),
    ('sec-ch-ua-platform-version', 'User-Agent', 'High', 4),
    ('sec-ch-ua-model', 'Device', 'High', 5),
    ('sec-ch-ua-full-version-list', 'User-Agent', 'High', 6),
    ('sec-ch-ua-arch', 'Device', 'High', 7),
    ('sec-ch-ua-bitness', 'Device', 'High', 8),
    ('sec-ch-ua-wow64', 'Device', 'High', 9),
    ('sec-ch-ua-full-version', 'User-Agent', 'High', 10),
    ('sec-ch-prefers-color-scheme', 'User Preference', 'Low', 11),
    ('sec-ch-viewport-width', 'Viewport', 'High', 12),
    ('sec-ch-dpr', 'Device', 'High', 13),
    ('sec-ch-device-memory', 'Device', 'High', 14),
    ('sec-ch-ua-form-factors', 'Device', 'High', 15),
    ('sec-ch-viewport-height', 'Viewport', 'High', 16),
    ('sec-ch-prefers-reduced-motion', 'User Preference', 'Low', 17),
    ('sec-ch-prefers-reduced-transparency', 'User Preference', 'Low', 18),
    ('sec-ch-width', 'Viewport', 'High', 19)
  ])
)

-- Table aliases deliberately avoid `d`, which is the script's date variable
SELECT
  hint.client_hint AS `Client_Hint`,
  hint.category AS `Category`,
  hint.entropy AS `Entropy`,
  COALESCE(rollup_row.mobile_requests, 0) AS `Mobile_Requests`,
  COALESCE(rollup_row.desktop_requests, 0) AS `Desktop_Requests`,
  COALESCE(rollup_row.mobile_requests, 0) + COALESCE(rollup_row.desktop_requests, 0) AS `Total_Requests`
FROM
  dict AS hint
LEFT JOIN
  hint_rollup AS rollup_row
ON
  hint.client_hint = rollup_row.client_hint
ORDER BY
  hint.ord;
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
#standardSQL
-- Compare Client Hints adoption between CDN and origin requests
--
-- Output: one row per client and source type (CDN / Origin) with the total
-- request count and the number/percentage of requests whose response carries a
-- non-empty Accept-CH header.
DECLARE d DATE DEFAULT '2025-07-01';

WITH
-- Classify requests as CDN or origin and flag Accept-CH responses.
-- A missing or empty summary._cdn_provider is treated as 'Origin'.
-- EXISTS keeps one row per request, so a response with several Accept-CH
-- headers is counted once rather than once per header.
requests_classified AS (
  SELECT
    client,
    CASE
      WHEN IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), '') = ''
        THEN 'Origin'
      ELSE 'CDN'
    END AS source_type,
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE
        LOWER(h.name) = 'accept-ch' AND
        h.value IS NOT NULL AND
        h.value != ''
    ) AS has_accept_ch
  FROM
    `httparchive.crawl.requests`
  WHERE
    date = d
),

-- Count total requests and requests with Accept-CH (server requesting hints)
-- in a single pass
source_stats AS (
  SELECT
    client,
    source_type,
    COUNT(0) AS total_requests,
    COUNTIF(has_accept_ch) AS requests_with_accept_ch
  FROM
    requests_classified
  GROUP BY
    client,
    source_type
)

SELECT
  s.client AS `Client`,
  s.source_type AS `Source`,
  s.total_requests AS `Total_Requests`,
  s.requests_with_accept_ch AS `Requests_with_Accept-CH`,
  ROUND(s.requests_with_accept_ch / s.total_requests * 100, 2) AS `Pct_with_Accept-CH`
FROM
  source_stats AS s
ORDER BY
  s.client,
  s.source_type;

0 commit comments

Comments
 (0)