Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions sql/2025/cdn/alt_svc_protocol_analysis.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#standardSQL
# Alt-Svc Protocol Analysis - Deep dive into Alternative Service headers
# Analyzes what protocols are advertised vs actually used
#
# Output: one row per (client, cdn, is_main_document, current_protocol) that has at
# least 100 responses carrying an Alt-Svc header.
#
# Alt-Svc field value (RFC 7838) is either the literal `clear`, or a comma-separated
# list of alternatives of the form
#   protocol-id="host:port"; ma=<seconds>; persist=1
# The patterns below are therefore anchored to the start of an alternative (start of
# string, or right after a comma / whitespace) so that text inside the quoted authority
# (e.g. h2="h3.example.net:443") is not mistaken for a protocol-id.

WITH alt_svc_analysis AS (
  SELECT
    client,
    is_main_document,

    -- CDN detection: first entry of the comma-separated provider list,
    -- falling back to 'ORIGIN' when the field is missing or empty
    IFNULL(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Current protocol used: prefer _protocol, then _tls_next_proto (ignoring 'unknown'),
    -- then 'HTTP/' + response.httpVersion; NULL when none of them is available
    UPPER(IFNULL(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      IFNULL(
        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
      )
    )) AS current_protocol,

    -- Extract Alt-Svc header value (an arbitrary one if the header is repeated)
    (
      SELECT h.value
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'alt-svc'
      LIMIT 1 -- noqa: AM09
    ) AS alt_svc_header

  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    -- Only responses that carry an Alt-Svc header are in scope
    EXISTS (
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'alt-svc'
    )
),

protocol_extraction AS (
  SELECT
    client,
    cdn,
    is_main_document,
    current_protocol,
    alt_svc_header,

    -- Any HTTP/3 variant: final `h3` as well as suffixed ids such as h3-29 or h3-Q050
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|[\s,])h3(?:-[^=\s,;"]+)?=') AS advertises_h3,
    -- IETF draft versions only (h3-<number>); a subset of advertises_h3
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|[\s,])h3-\d+=') AS advertises_h3_draft,
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|[\s,])h2=') AS advertises_h2,
    -- '/' is not a token character, so a conforming protocol-id is http%2F1.1;
    -- the literal-slash spelling is accepted too
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'(?:^|[\s,])http(?:/|%2f)1\.1=') AS advertises_h1,

    -- Max-age (seconds) of the first alternative that declares one;
    -- SAFE_CAST yields NULL for values that overflow INT64
    SAFE_CAST(
      REGEXP_EXTRACT(LOWER(alt_svc_header), r'(?:^|[;\s])ma="?(\d+)') AS INT64
    ) AS max_age_seconds,

    -- `clear` is only meaningful as the entire field value
    REGEXP_CONTAINS(LOWER(alt_svc_header), r'^\s*clear\s*$') AS has_clear_directive

  FROM alt_svc_analysis
)

SELECT
  client,
  cdn,
  is_main_document,
  current_protocol,
  COUNT(0) AS total_requests,

  -- Protocol advertising analysis
  COUNTIF(advertises_h3) AS advertises_h3_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(0)) * 100, 2) AS advertises_h3_pct,

  COUNTIF(advertises_h3_draft) AS advertises_h3_draft_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(0)) * 100, 2) AS advertises_h3_draft_pct,

  COUNTIF(advertises_h2) AS advertises_h2_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(0)) * 100, 2) AS advertises_h2_pct,

  COUNTIF(advertises_h1) AS advertises_h1_count,
  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(0)) * 100, 2) AS advertises_h1_pct,

  -- Protocol mismatch analysis
  -- (rows whose current_protocol is NULL are counted in neither column)
  COUNTIF(advertises_h3 AND current_protocol NOT LIKE '%H3%' AND current_protocol != 'HTTP/3') AS h3_advertised_not_used,
  COUNTIF(current_protocol LIKE '%H3%' OR current_protocol = 'HTTP/3') AS currently_using_h3,

  -- Max-age statistics (NULL max-age values are ignored by APPROX_QUANTILES)
  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(50)] AS median_max_age_seconds,
  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(90)] AS p90_max_age_seconds,

  -- Clear directive usage
  COUNTIF(has_clear_directive) AS clear_directive_count,
  ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(0)) * 100, 2) AS clear_directive_pct,

  -- Sample Alt-Svc headers for analysis
  ARRAY_AGG(alt_svc_header IGNORE NULLS LIMIT 5) AS sample_alt_svc_headers

FROM protocol_extraction
GROUP BY
  client,
  cdn,
  is_main_document,
  current_protocol
HAVING
  total_requests >= 100 -- Minimum threshold for statistical relevance
ORDER BY
  client DESC,
  total_requests DESC,
  advertises_h3_pct DESC
118 changes: 118 additions & 0 deletions sql/2025/cdn/brotli_compression_adoption.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#standardSQL
# brotli_compression_adoption.sql: Brotli and modern compression adoption by CDN
# Analyzes compression algorithm usage patterns and efficiency across CDN providers
#
# Rationale: Brotli compression offers 20-30% better compression than gzip for text resources.
# As it becomes more widely supported, we want to track which CDNs are leading adoption
# and how much bandwidth savings are being achieved. This is critical for performance
# and sustainability metrics.
#
# Output: one row per (client, cdn, content_type) with at least 100 requests.
# Scope: main documents plus any request whose URL path ends in a text-like extension
# (js, mjs, css, html, htm, json, svg, xml, txt).

WITH compression_analysis AS (
  SELECT
    client,

    -- CDN detection: first entry of the comma-separated provider list,
    -- falling back to 'ORIGIN' when the field is missing or empty
    IFNULL(
      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
      'ORIGIN'
    ) AS cdn,

    -- Page and resource info
    page,
    url,
    is_main_document,

    -- Content type classification by URL extension. Only extensions admitted by the
    -- WHERE clause below can occur here; .xml and .txt fall through to 'Other'.
    CASE
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs)($|\?)') THEN 'JavaScript'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.css($|\?)') THEN 'CSS'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(html|htm)($|\?)') OR is_main_document THEN 'HTML'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(json)($|\?)') THEN 'JSON'
      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(svg)($|\?)') THEN 'SVG'
      ELSE 'Other'
    END AS content_type,

    -- Compression detection from Content-Encoding header
    -- (an arbitrary one if the header is repeated)
    (
      SELECT LOWER(h.value)
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'content-encoding'
      LIMIT 1 -- noqa: AM09
    ) AS content_encoding,

    -- Vary header check (indicates dynamic compression support)
    EXISTS(
      SELECT 1
      FROM UNNEST(response_headers) AS h
      WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%'
    ) AS supports_dynamic_compression,

    -- Response size metrics
    -- HAR response.bodySize: size of the body as received (i.e. after content encoding)
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.bodySize') AS INT64) AS response_body_size,
    -- HAR response.content.size: length of the decoded (uncompressed) content.
    -- NOTE(review): confirm the crawl payload populates this field. Not yet consumed
    -- by the final SELECT.
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.content.size') AS INT64) AS uncompressed_size,

    -- Transfer size (actual bytes transferred); not yet consumed by the final SELECT
    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response._transferSize') AS INT64) AS transfer_size
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    -- Focus on compressible content types. Main documents are admitted explicitly:
    -- their URLs rarely end in .html, and the classification above treats them as HTML.
    (
      is_main_document OR
      REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)')
    )
)

SELECT
  client,
  cdn,
  content_type,

  -- Volume metrics
  COUNT(DISTINCT page) AS total_pages,
  COUNT(0) AS total_requests,

  -- Compression type distribution
  COUNTIF(content_encoding = 'br') AS brotli_requests,
  COUNTIF(content_encoding = 'gzip') AS gzip_requests,
  COUNTIF(content_encoding = 'deflate') AS deflate_requests,
  COUNTIF(content_encoding IS NULL OR content_encoding = '') AS uncompressed_requests,
  COUNTIF(content_encoding NOT IN ('br', 'gzip', 'deflate', '') AND content_encoding IS NOT NULL) AS other_compression,

  -- Compression percentages
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'br'), COUNT(0)) * 100, 2) AS brotli_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'gzip'), COUNT(0)) * 100, 2) AS gzip_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'deflate'), COUNT(0)) * 100, 2) AS deflate_pct,
  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding IS NULL OR content_encoding = ''), COUNT(0)) * 100, 2) AS uncompressed_pct,

  -- Dynamic compression support
  COUNTIF(supports_dynamic_compression) AS dynamic_compression_count,
  ROUND(SAFE_DIVIDE(COUNTIF(supports_dynamic_compression), COUNT(0)) * 100, 2) AS dynamic_compression_pct,

  -- Size metrics (in KB); AVG ignores rows excluded by the CASE (NULL)
  ROUND(AVG(response_body_size) / 1024, 2) AS avg_response_size_kb,
  ROUND(AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END) / 1024, 2) AS avg_brotli_size_kb,
  ROUND(AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / 1024, 2) AS avg_gzip_size_kb,
  ROUND(AVG(CASE WHEN content_encoding IS NULL OR content_encoding = '' THEN response_body_size END) / 1024, 2) AS avg_uncompressed_size_kb,

  -- Compression efficiency comparison: relative difference between the average gzip
  -- and average brotli body sizes. The two averages are taken over different sets of
  -- responses, so this is a population-level indicator, not a per-resource saving.
  ROUND(
    SAFE_DIVIDE(
      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) -
      AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END),
      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END)
    ) * 100, 2
  ) AS brotli_vs_gzip_savings_pct,

  -- Total data transfer metrics (GB of response body bytes)
  ROUND(SUM(response_body_size) / (1024 * 1024 * 1024), 2) AS total_gb_transferred,
  ROUND(SUM(CASE WHEN content_encoding = 'br' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_brotli,
  ROUND(SUM(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_gzip
FROM compression_analysis
GROUP BY
  client,
  cdn,
  content_type
HAVING
  total_requests >= 100 -- Minimum threshold for statistical relevance
ORDER BY
  client DESC,
  brotli_pct DESC,
  total_requests DESC
Loading