diff --git a/sql/2025/cdn/alt_svc_protocol_analysis.sql b/sql/2025/cdn/alt_svc_protocol_analysis.sql
new file mode 100644
index 00000000000..e6c12928600
--- /dev/null
+++ b/sql/2025/cdn/alt_svc_protocol_analysis.sql
@@ -0,0 +1,115 @@
+#standardSQL
+# Alt-Svc Protocol Analysis - Deep dive into Alternative Service headers
+# Analyzes what protocols are advertised vs actually used
+
+WITH alt_svc_analysis AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+
+    -- CDN detection
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Current protocol used
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS current_protocol,
+
+    -- Extract Alt-Svc header value
+    (
+      SELECT h.value
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+      LIMIT 1 -- noqa: AM09
+    ) AS alt_svc_header,
+
+    -- Check if Alt-Svc exists (always TRUE here: the WHERE clause below keeps only rows carrying the header)
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+    ) AS has_alt_svc
+
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01' AND
+    EXISTS (
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+    )
+),
+
+protocol_extraction AS (
+  SELECT
+    *,
+    -- Extract HTTP/3 variants from Alt-Svc
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h3[^=]*=') AS advertises_h3,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h3-\d+[^=]*=') AS advertises_h3_draft,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h2[^=]*=') AS advertises_h2,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'http/1\.1[^=]*=') AS advertises_h1,
+
+    -- Extract max-age values
+    SAFE_CAST(REGEXP_EXTRACT(alt_svc_header, r'ma=(\d+)') AS INT64) AS max_age_seconds,
+
+    -- Check for clear directive
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'clear') AS has_clear_directive
+
+  FROM alt_svc_analysis
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  current_protocol,
+  COUNT(0) AS total_requests,
+
+  -- Protocol advertising analysis
+  COUNTIF(advertises_h3) AS advertises_h3_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(0)) * 100, 2) AS advertises_h3_pct,
+
+  COUNTIF(advertises_h3_draft) AS advertises_h3_draft_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(0)) * 100, 2) AS advertises_h3_draft_pct,
+
+  COUNTIF(advertises_h2) AS advertises_h2_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(0)) * 100, 2) AS advertises_h2_pct,
+
+  COUNTIF(advertises_h1) AS advertises_h1_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(0)) * 100, 2) AS advertises_h1_pct,
+
+  -- Protocol mismatch analysis
+  COUNTIF(advertises_h3 AND current_protocol NOT LIKE '%H3%' AND current_protocol != 'HTTP/3') AS h3_advertised_not_used,
+  COUNTIF(current_protocol LIKE '%H3%' OR current_protocol = 'HTTP/3') AS currently_using_h3,
+
+  -- Max-age statistics
+  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(50)] AS median_max_age_seconds,
+  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(90)] AS p90_max_age_seconds,
+
+  -- Clear directive usage
+  COUNTIF(has_clear_directive) AS clear_directive_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(0)) * 100, 2) AS clear_directive_pct,
+
+  -- Sample Alt-Svc headers for analysis
+  ARRAY_AGG(alt_svc_header IGNORE NULLS LIMIT 5) AS sample_alt_svc_headers
+
+FROM protocol_extraction
+GROUP BY
+  client,
+  cdn,
+  is_main_document,
+  current_protocol
+HAVING
+  total_requests >= 100
+ORDER BY
+  client DESC,
+  total_requests DESC,
+  advertises_h3_pct DESC
diff --git a/sql/2025/cdn/brotli_compression_adoption.sql b/sql/2025/cdn/brotli_compression_adoption.sql
new file mode 100644
index 00000000000..c7f502603cb
--- /dev/null
+++ b/sql/2025/cdn/brotli_compression_adoption.sql
@@ -0,0 +1,118 @@
+#standardSQL
+# brotli_compression_adoption.sql: Brotli and modern compression adoption by CDN
+# Analyzes compression algorithm usage patterns and efficiency across CDN providers
+#
+# Rationale: Brotli compression offers 20-30% better compression than gzip for text resources.
+# As it becomes more widely supported, we want to track which CDNs are leading adoption
+# and how much bandwidth savings are being achieved. This is critical for performance
+# and sustainability metrics.
+
+WITH compression_analysis AS (
+  SELECT
+    client,
+
+    -- CDN detection
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Page and resource info
+    page,
+    url,
+    is_main_document,
+
+    -- Content type classification
+    CASE
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs)($|\?)') THEN 'JavaScript'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.css($|\?)') THEN 'CSS'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(html|htm)($|\?)') OR is_main_document THEN 'HTML'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(json)($|\?)') THEN 'JSON'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(svg)($|\?)') THEN 'SVG'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(woff2?|ttf|otf|eot)($|\?)') THEN 'Fonts'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(jpg|jpeg|png|gif|webp|avif)($|\?)') THEN 'Images'
+      ELSE 'Other'
+    END AS content_type,
+
+    -- Compression detection from Content-Encoding header
+    (
+      SELECT LOWER(h.value)
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'content-encoding'
+      LIMIT 1 -- noqa: AM09
+    ) AS content_encoding,
+
+    -- Vary header check (indicates dynamic compression support)
+    EXISTS(
+      SELECT 1 FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%'
+    ) AS supports_dynamic_compression,
+
+    -- Response size metrics
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.bodySize') AS INT64) AS response_body_size,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.bodySize') AS INT64) AS uncompressed_size, -- NOTE(review): same path as response_body_size, so not an uncompressed size; unused below
+
+    -- Transfer size (actual bytes transferred); NOTE(review): not referenced by the outer query
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response._transferSize') AS INT64) AS transfer_size
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01' AND
+    -- Focus on compressible content types (font/image URLs never pass this filter, so those CASE labels stay empty)
+    REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)')
+)
+
+SELECT
+  client,
+  cdn,
+  content_type,
+
+  -- Volume metrics
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(0) AS total_requests,
+
+  -- Compression type distribution
+  COUNTIF(content_encoding = 'br') AS brotli_requests,
+  COUNTIF(content_encoding = 'gzip') AS gzip_requests,
+  COUNTIF(content_encoding = 'deflate') AS deflate_requests,
+  COUNTIF(content_encoding IS NULL OR content_encoding = '') AS uncompressed_requests,
+  COUNTIF(content_encoding NOT IN ('br', 'gzip', 'deflate', '') AND content_encoding IS NOT NULL) AS other_compression,
+
+  -- Compression percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'br'), COUNT(0)) * 100, 2) AS brotli_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'gzip'), COUNT(0)) * 100, 2) AS gzip_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'deflate'), COUNT(0)) * 100, 2) AS deflate_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding IS NULL OR content_encoding = ''), COUNT(0)) * 100, 2) AS uncompressed_pct,
+
+  -- Dynamic compression support
+  COUNTIF(supports_dynamic_compression) AS dynamic_compression_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(supports_dynamic_compression), COUNT(0)) * 100, 2) AS dynamic_compression_pct,
+
+  -- Size metrics (in KB)
+  ROUND(AVG(response_body_size) / 1024, 2) AS avg_response_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END) / 1024, 2) AS avg_brotli_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / 1024, 2) AS avg_gzip_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding IS NULL OR content_encoding = '' THEN response_body_size END) / 1024, 2) AS avg_uncompressed_size_kb,
+
+  -- Compression efficiency comparison
+  ROUND(
+    SAFE_DIVIDE(
+      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) -
+      AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END),
+      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END)
+    ) * 100, 2
+  ) AS brotli_vs_gzip_savings_pct,
+
+  -- Total data transfer metrics
+  ROUND(SUM(response_body_size) / (1024 * 1024 * 1024), 2) AS total_gb_transferred,
+  ROUND(SUM(CASE WHEN content_encoding = 'br' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_brotli,
+  ROUND(SUM(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_gzip
+FROM compression_analysis
+GROUP BY
+  client,
+  cdn,
+  content_type
+HAVING
+  total_requests >= 100 -- Minimum threshold for statistical relevance
+ORDER BY
+  client DESC,
+  brotli_pct DESC,
+  total_requests DESC
diff --git a/sql/2025/cdn/cache_control_analysis.sql b/sql/2025/cdn/cache_control_analysis.sql
new file mode 100644
index 00000000000..cbd663a9fa4
--- /dev/null
+++ b/sql/2025/cdn/cache_control_analysis.sql
@@ -0,0 +1,165 @@
+#standardSQL
+# Cache-Control and CDN caching behavior analysis
+# Analyzes cache directives and actual caching behavior
+
+WITH cache_analysis AS (
+  SELECT
+    client,
+    is_main_document,
+
+    -- CDN detection
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Content type
+    JSON_EXTRACT_SCALAR(summary, '$._contentType') AS content_type,
+
+    -- Response status
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.status') AS INT64) AS status_code,
+
+    -- Extract cache-control header value
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'cache-control'
+      LIMIT 1 -- noqa: AM09
+    ) AS cache_control,
+
+    -- Extract other cache-related headers
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'expires'
+      LIMIT 1 -- noqa: AM09
+    ) AS expires_header,
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'etag'
+      LIMIT 1 -- noqa: AM09
+    ) AS etag,
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'last-modified'
+      LIMIT 1 -- noqa: AM09
+    ) AS last_modified,
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'vary'
+      LIMIT 1 -- noqa: AM09
+    ) AS vary_header,
+
+    -- CDN-specific headers
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'x-cache'
+      LIMIT 1 -- noqa: AM09
+    ) AS x_cache,
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'cf-cache-status'
+      LIMIT 1 -- noqa: AM09
+    ) AS cf_cache_status,
+    (
+      SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'x-served-by'
+      LIMIT 1 -- noqa: AM09
+    ) AS x_served_by
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total_requests,
+
+  -- Cache-Control directives analysis
+  COUNTIF(cache_control IS NOT NULL) AS has_cache_control,
+  ROUND(SAFE_DIVIDE(COUNTIF(cache_control IS NOT NULL), COUNT(0)) * 100, 2) AS cache_control_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')) AS is_public,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')), COUNT(0)) * 100, 2) AS public_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')) AS is_private,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')), COUNT(0)) * 100, 2) AS private_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')) AS is_no_cache,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')), COUNT(0)) * 100, 2) AS no_cache_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')) AS is_no_store,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')), COUNT(0)) * 100, 2) AS no_store_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')) AS is_max_age_zero,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')), COUNT(0)) * 100, 2) AS max_age_zero_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')) AS is_immutable,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')), COUNT(0)) * 100, 2) AS immutable_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')) AS has_s_maxage,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')), COUNT(0)) * 100, 2) AS s_maxage_pct,
+
+  -- Modern cache directives
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')) AS has_stale_while_revalidate,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')), COUNT(0)) * 100, 2) AS stale_while_revalidate_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')) AS has_stale_if_error,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')), COUNT(0)) * 100, 2) AS stale_if_error_pct,
+
+  -- Max-age value analysis (header lower-cased first, matching the case-insensitive directive checks above)
+  APPROX_QUANTILES(
+    SAFE_CAST(REGEXP_EXTRACT(LOWER(cache_control), r'max-age=(\d+)') AS INT64), 100
+  )[OFFSET(50)] AS median_max_age_seconds,
+
+  APPROX_QUANTILES(
+    SAFE_CAST(REGEXP_EXTRACT(LOWER(cache_control), r's-maxage=(\d+)') AS INT64), 100
+  )[OFFSET(50)] AS median_s_maxage_seconds,
+
+  -- Cache hit/miss analysis from CDN headers
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hits,
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'miss')) AS cache_misses,
+  ROUND(SAFE_DIVIDE(
+    COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')),
+    COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit|miss'))
+  ) * 100, 2) AS cache_hit_rate_pct,
+
+  -- Cloudflare-specific cache status
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'hit')) AS cf_cache_hits,
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'miss')) AS cf_cache_misses,
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'dynamic')) AS cf_cache_dynamic,
+
+  -- Validation headers
+  COUNTIF(etag IS NOT NULL) AS has_etag,
+  ROUND(SAFE_DIVIDE(COUNTIF(etag IS NOT NULL), COUNT(0)) * 100, 2) AS etag_pct,
+
+  COUNTIF(last_modified IS NOT NULL) AS has_last_modified,
+  ROUND(SAFE_DIVIDE(COUNTIF(last_modified IS NOT NULL), COUNT(0)) * 100, 2) AS last_modified_pct,
+
+  -- CDN cache hit indicators (cache_hit_count repeats cache_hits; the rate here is relative to all x-cache responses)
+  COUNTIF(x_cache IS NOT NULL) AS has_x_cache,
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hit_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')), COUNTIF(x_cache IS NOT NULL)) * 100, 2) AS cache_hit_rate,
+
+  -- Cloudflare specific (exact, case-sensitive match, unlike the substring counts above)
+  COUNTIF(cf_cache_status = 'HIT') AS cf_hits,
+  COUNTIF(cf_cache_status = 'MISS') AS cf_misses,
+  ROUND(SAFE_DIVIDE(COUNTIF(cf_cache_status = 'HIT'), COUNTIF(cf_cache_status IS NOT NULL)) * 100, 2) AS cf_hit_rate
+FROM cache_analysis
+WHERE status_code = 200 -- Focus on successful responses
+GROUP BY
+  client,
+  cdn,
+  is_main_document
+HAVING
+  total_requests >= 1000
+ORDER BY
+  client DESC,
+  total_requests DESC
diff --git a/sql/2025/cdn/cdn_security_features_adoption.sql b/sql/2025/cdn/cdn_security_features_adoption.sql
new file mode 100644
index 00000000000..e539be1a202
--- /dev/null
+++ b/sql/2025/cdn/cdn_security_features_adoption.sql
@@ -0,0 +1,78 @@
+#standardSQL
+# cdn_security_features_adoption.sql: CDN Security Features Adoption
+#
+# Rationale: Security is a crucial aspect of content delivery. This query analyzes
+# the adoption of various security features across CDNs, including HSTS, CSP, XSS
+# protection, and other security headers. This helps understand which CDNs are
+# leading in security implementations.
+#
+# Expected insights:
+# - Which CDNs implement the most comprehensive security headers
+# - Adoption rates of specific security features (HSTS, CSP, etc.)
+# - Comparison between CDN and origin server security implementations
+# - Trends in security header adoption for 2025
+
+SELECT
+  IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+  COUNT(DISTINCT page) AS total_pages,
+
+  -- Overall security header presence
+  COUNT(DISTINCT CASE WHEN (
+    REGEXP_CONTAINS(
+      TO_JSON_STRING(response_headers),
+      r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"'
+    )
+  ) THEN page
+  END) AS pages_with_any_security_headers,
+
+  ROUND(
+    (COUNT(DISTINCT CASE WHEN (
+      REGEXP_CONTAINS(
+        TO_JSON_STRING(response_headers),
+        r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"'
+      )
+    ) THEN page
+    END) * 100.0 / COUNT(DISTINCT page)), 2
+  ) AS security_headers_adoption_rate_pct,
+
+  -- Specific security headers breakdown
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"strict-transport-security"')
+    THEN page
+  END) AS pages_with_hsts,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"content-security-policy"')
+    THEN page
+  END) AS pages_with_csp,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-frame-options"')
+    THEN page
+  END) AS pages_with_xframe,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-content-type-options"')
+    THEN page
+  END) AS pages_with_xcontent,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"referrer-policy"')
+    THEN page
+  END) AS pages_with_referrer_policy,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"permissions-policy"')
+    THEN page
+  END) AS pages_with_permissions_policy,
+
+  -- Calculate percentages for each header
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"strict-transport-security"')
+    THEN page
+  END) * 100.0 / COUNT(DISTINCT page)), 2) AS hsts_pct,
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"content-security-policy"')
+    THEN page
+  END) * 100.0 / COUNT(DISTINCT page)), 2) AS csp_pct,
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-frame-options"')
+    THEN page
+  END) * 100.0 / COUNT(DISTINCT page)), 2) AS xframe_pct
+FROM `httparchive.crawl.requests`
+WHERE date = '2025-07-01' AND
+  client = 'mobile' AND -- single client, as in the sibling queries without a client column, so COUNT(DISTINCT page) does not merge desktop and mobile
+  is_main_document = true
+GROUP BY cdn_provider
+HAVING
+  total_pages > 100 -- Filter out CDNs with very small sample sizes
+ORDER BY
+  security_headers_adoption_rate_pct DESC,
+  total_pages DESC
+LIMIT 100
diff --git a/sql/2025/cdn/h3_adoption_by_cdn_provider.sql b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql
new file mode 100644
index 00000000000..e38b8a263bf
--- /dev/null
+++ b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql
@@ -0,0 +1,59 @@
+#standardSQL
+# h3_adoption_by_cdn_provider.sql: HTTP/3 adoption rates by CDN provider
+# Simplified version without unnecessary isSecure field to avoid casting issues
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total,
+
+  -- Protocol counts
+  COUNTIF(protocol = 'HTTP/0.9') AS http09,
+  COUNTIF(protocol = 'HTTP/1.0') AS http10,
+  COUNTIF(protocol = 'HTTP/1.1') AS http11,
+  COUNTIF(protocol = 'HTTP/2') AS http2,
+  COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3,
+  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other,
+
+  -- Protocol percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/2'), COUNT(0)) * 100, 2) AS http2_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct,
+
+  -- HTTP/2+ adoption (includes HTTP/3)
+  COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct
+FROM (
+  SELECT
+    client,
+    is_main_document,
+
+    # Protocol detection - using the same logic as distribution_of_http_versions.sql
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS protocol,
+
+    # CDN detection - extract first CDN provider if multiple are listed
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+WHERE cdn != 'ORIGIN' AND -- Only CDN providers, not origin servers
+  protocol IS NOT NULL -- Must have a detected protocol
+GROUP BY
+  client,
+  cdn,
+  is_main_document
+HAVING
+  total >= 1000 -- Only include CDNs with meaningful traffic volume
+ORDER BY
+  client DESC,
+  total DESC
diff --git a/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql
new file mode 100644
index 00000000000..cebdf6f56f2
--- /dev/null
+++ b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql
@@ -0,0 +1,58 @@
+#standardSQL
+# h3_adoption_by_cdn_vs_origin.sql: HTTP/3 adoption rates for CDN-served vs origin-served requests
+# Simplified version without unnecessary isSecure field to avoid casting issues
+
+SELECT
+  client,
+  IF(cdn = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn_type,
+  is_main_document,
+  COUNT(0) AS total,
+
+  -- Protocol counts
+  COUNTIF(protocol = 'HTTP/0.9') AS http09,
+  COUNTIF(protocol = 'HTTP/1.0') AS http10,
+  COUNTIF(protocol = 'HTTP/1.1') AS http11,
+  COUNTIF(protocol = 'HTTP/2') AS http2,
+  COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3,
+  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other,
+
+  -- Protocol percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/2'), COUNT(0)) * 100, 2) AS http2_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct,
+
+  -- HTTP/2+ adoption (includes HTTP/3)
+  COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct
+FROM (
+  SELECT
+    client,
+    is_main_document,
+
+    # Protocol detection - using the same logic as distribution_of_http_versions.sql
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS protocol,
+
+    # CDN detection - extract first CDN provider if multiple are listed
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+WHERE protocol IS NOT NULL -- Must have a detected protocol
+GROUP BY
+  client,
+  cdn_type,
+  is_main_document
+HAVING
+  total >= 1000 -- Only include groups with meaningful traffic volume
+ORDER BY
+  client DESC,
+  total DESC
diff --git a/sql/2025/cdn/http3_performance_impact.sql b/sql/2025/cdn/http3_performance_impact.sql
new file mode 100644
index 00000000000..b29eec3f2ca
--- /dev/null
+++ b/sql/2025/cdn/http3_performance_impact.sql
@@ -0,0 +1,98 @@
+#standardSQL
+# http3_performance_impact.sql
+# Comprehensive analysis of HTTP/3 performance impact vs HTTP/2
+# Measures real-world performance differences between protocols
+
+WITH protocol_performance AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+
+    -- Protocol normalization (case-insensitive, in line with the UPPER() normalisation of the h3_adoption queries)
+    CASE
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) IN ('H3', 'HTTP/3', 'QUIC') THEN 'H3'
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) = 'HTTP/2' THEN 'HTTP/2'
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) IN ('HTTP/1.1', 'HTTP/1.0') THEN 'HTTP/1.x'
+      ELSE JSON_EXTRACT_SCALAR(payload, '$._protocol')
+    END AS protocol,
+
+    -- Performance metrics (HAR uses -1 for a phase that did not apply; map it to NULL so AVG skips it)
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.wait') AS FLOAT64) AS ttfb,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.receive') AS FLOAT64) AS download_time,
+    NULLIF(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.ssl') AS FLOAT64), -1.0) AS ssl_time,
+    NULLIF(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.connect') AS FLOAT64), -1.0) AS connect_time,
+    NULLIF(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.dns') AS FLOAT64), -1.0) AS dns_time,
+
+    -- Total request time
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.time') AS FLOAT64) AS total_time,
+
+    -- Resource information
+    SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INT64) AS response_size,
+    JSON_EXTRACT_SCALAR(summary, '$.type') AS resource_type,
+    JSON_EXTRACT_SCALAR(summary, '$.format') AS format,
+
+    -- Connection reuse
+    JSON_EXTRACT_SCALAR(payload, '$._socket') AS socket_id,
+
+    -- Priority information
+    JSON_EXTRACT_SCALAR(payload, '$._priority') AS priority
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01' AND
+    client = 'mobile'
+)
+
+SELECT
+  cdn_provider,
+  protocol,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(0) AS total_requests,
+
+  -- Performance metrics by protocol
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(25)] AS p25_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(50)] AS median_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(75)] AS p75_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(90)] AS p90_ttfb,
+
+  AVG(ttfb) AS avg_ttfb,
+  STDDEV(ttfb) AS stddev_ttfb,
+
+  -- Download performance
+  APPROX_QUANTILES(download_time, 100)[OFFSET(50)] AS median_download_time,
+  AVG(download_time) AS avg_download_time,
+
+  -- Connection establishment (averaged over requests that actually performed the phase)
+  AVG(connect_time) AS avg_connect_time,
+  AVG(ssl_time) AS avg_ssl_time,
+  AVG(dns_time) AS avg_dns_time,
+
+  -- Total time metrics
+  APPROX_QUANTILES(total_time, 100)[OFFSET(50)] AS median_total_time,
+  AVG(total_time) AS avg_total_time,
+
+  -- Resource size analysis
+  AVG(response_size) AS avg_response_size,
+  SUM(response_size) AS total_bytes_transferred,
+
+  -- Efficiency metrics (bytes per millisecond)
+  SAFE_DIVIDE(AVG(response_size), AVG(total_time)) AS avg_throughput,
+
+  -- Connection reuse (unique sockets vs requests); keyed by page so equal socket ids on different pages stay distinct
+  COUNT(DISTINCT CONCAT(page, '#', socket_id)) AS unique_connections,
+  SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT CONCAT(page, '#', socket_id))) AS avg_requests_per_connection,
+
+  -- Resource type breakdown (NOTE(review): confirm summary `$.type` really emits these capitalised labels)
+  COUNTIF(resource_type = 'Document') AS document_requests,
+  COUNTIF(resource_type = 'Script') AS script_requests,
+  COUNTIF(resource_type = 'Stylesheet') AS style_requests,
+  COUNTIF(resource_type = 'Image') AS image_requests
+FROM protocol_performance
+WHERE protocol IN ('H3', 'HTTP/2', 'HTTP/1.x') AND
+  ttfb IS NOT NULL AND
+  ttfb > 0 AND
+  ttfb < 10000 -- Filter outliers
+GROUP BY cdn_provider, protocol
+HAVING total_requests > 1000
+ORDER BY cdn_provider, protocol
diff --git a/sql/2025/cdn/security_headers_by_cdn.sql b/sql/2025/cdn/security_headers_by_cdn.sql
new file mode 100644
index 00000000000..3cf82e7f28f
--- /dev/null
+++ b/sql/2025/cdn/security_headers_by_cdn.sql
@@ -0,0 +1,193 @@
+#standardSQL
+# Security headers adoption by CDN vs Origin
+# Analyzes adoption of key security headers across CDN providers
+
+WITH security_headers AS (
+  SELECT
+    client,
+    url,
+    is_main_document,
+
+    -- CDN detection
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Extract specific security headers
+    -- Note: response_headers is an array, so we need to check each element
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'strict-transport-security'
+    ) AS has_hsts,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'content-security-policy'
+    ) AS has_csp,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-frame-options'
+    ) AS has_xfo,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-content-type-options'
+    ) AS has_xcto,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-xss-protection'
+    ) AS has_xxp,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'referrer-policy'
+    ) AS has_referrer_policy,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'permissions-policy'
+    ) AS has_permissions_policy,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'feature-policy'
+    ) AS has_feature_policy,
+
+    -- Cache-Control directives
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'cache-control' AND
+        LOWER(h.value) LIKE '%no-store%'
+    ) AS has_cache_no_store,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'cache-control' AND
+        LOWER(h.value) LIKE '%immutable%'
+    ) AS has_cache_immutable,
+
+    -- CORS headers
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-origin'
+    ) AS has_cors_origin,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-methods'
+    ) AS has_cors_methods,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-headers'
+    ) AS has_cors_headers,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-credentials'
+    ) AS has_cors_credentials,
+
+    -- Extract CORS origin values for analysis
+    (
+      SELECT h.value
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-origin'
+      LIMIT 1 -- noqa: AM09
+    ) AS cors_origin_value
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total_requests,
+
+  -- Security headers adoption
+  COUNTIF(has_hsts) AS hsts_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_hsts), COUNT(0)) * 100, 2) AS hsts_pct,
+
+  COUNTIF(has_csp) AS csp_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_csp), COUNT(0)) * 100, 2) AS csp_pct,
+
+  COUNTIF(has_xfo) AS xfo_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xfo), COUNT(0)) * 100, 2) AS xfo_pct,
+
+  COUNTIF(has_xcto) AS xcto_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xcto), COUNT(0)) * 100, 2) AS xcto_pct,
+
+  COUNTIF(has_xxp) AS xxp_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xxp), COUNT(0)) * 100, 2) AS xxp_pct,
+
+  COUNTIF(has_referrer_policy) AS referrer_policy_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_referrer_policy), COUNT(0)) * 100, 2) AS referrer_policy_pct,
+
+  COUNTIF(has_permissions_policy OR has_feature_policy) AS permissions_policy_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_permissions_policy OR has_feature_policy), COUNT(0)) * 100, 2) AS permissions_policy_pct,
+
+  -- Cache control adoption
+  COUNTIF(has_cache_no_store) AS cache_no_store_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cache_no_store), COUNT(0)) * 100, 2) AS cache_no_store_pct,
+
+  COUNTIF(has_cache_immutable) AS cache_immutable_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cache_immutable), COUNT(0)) * 100, 2) AS cache_immutable_pct,
+
+  -- CORS headers adoption
+  COUNTIF(has_cors_origin) AS cors_origin_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_origin), COUNT(0)) * 100, 2) AS cors_origin_pct,
+
+  COUNTIF(has_cors_methods) AS cors_methods_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_methods), COUNT(0)) * 100, 2) AS cors_methods_pct,
+
+  COUNTIF(has_cors_headers) AS cors_headers_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_headers), COUNT(0)) * 100, 2) AS cors_headers_pct,
+
+  COUNTIF(has_cors_credentials) AS cors_credentials_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_credentials), COUNT(0)) * 100, 2) AS cors_credentials_pct,
+
+  -- CORS origin patterns (sample values)
+  ARRAY_AGG(cors_origin_value IGNORE NULLS LIMIT 5) AS sample_cors_origins,
+
+  -- Wildcard CORS usage
+  COUNTIF(cors_origin_value = '*') AS cors_wildcard_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(cors_origin_value = '*'), COUNTIF(has_cors_origin)) * 100, 2) AS cors_wildcard_pct,
+
+  -- Security score (how many headers out of 7 main ones)
+  ROUND(AVG(
+    CAST(has_hsts AS INT64) +
+    CAST(has_csp AS INT64) +
+    CAST(has_xfo AS INT64) +
+    CAST(has_xcto AS INT64) +
+    CAST(has_xxp AS INT64) +
+    CAST(has_referrer_policy AS INT64) +
+    CAST(has_permissions_policy OR has_feature_policy AS INT64)
+  ), 2) AS avg_security_headers
+FROM security_headers
+GROUP BY
+  client,
+  cdn,
+  is_main_document
+HAVING
+  total_requests >= 1000
+ORDER BY
+  client DESC,
+  avg_security_headers DESC,
+  total_requests DESC
diff --git a/sql/2025/cdn/server_timing_adoption.sql b/sql/2025/cdn/server_timing_adoption.sql
new file mode 100644
index 00000000000..32bd389cf37
--- /dev/null
+++ b/sql/2025/cdn/server_timing_adoption.sql
@@ -0,0 +1,105 @@
+#standardSQL
+# server_timing_adoption.sql
+# Deep dive into Server-Timing header usage across CDNs
+# Server-Timing provides transparency into server-side performance metrics
+
+WITH server_timing_analysis AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+    JSON_EXTRACT_SCALAR(payload, '$._protocol') AS protocol,
+
+    -- Check for Server-Timing header
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'server-timing'
+    ) AS has_server_timing,
+
+    -- Extract Server-Timing value for analysis
+    (
+      SELECT header.value
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'server-timing'
+      LIMIT 1 -- noqa: AM09
+    ) AS server_timing_value,
+
+    -- Other performance headers
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'x-cache'
+    ) AS has_x_cache,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'x-cdn'
+    ) AS has_x_cdn,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'cf-ray' -- Cloudflare specific
+    ) AS has_cf_ray,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS header
+      WHERE LOWER(header.name) = 'x-amz-cf-id' -- CloudFront specific
+    ) AS has_amz_cf_id,
+
+    -- Performance metrics
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.wait') AS FLOAT64) AS ttfb,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.ssl') AS FLOAT64) AS ssl_time,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.connect') AS FLOAT64) AS connect_time,
+
+    -- Resource info (response_size cast to INT64, as in http3_performance_impact.sql)
+    JSON_EXTRACT_SCALAR(summary, '$.type') AS resource_type,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INT64) AS response_size
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01' AND
+    client = 'mobile'
+)
+
+SELECT
+  cdn_provider,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(0) AS total_requests,
+
+  -- Server-Timing adoption
+  COUNTIF(has_server_timing) AS requests_with_server_timing,
+  SAFE_DIVIDE(COUNTIF(has_server_timing) * 100.0, COUNT(0)) AS pct_server_timing,
+
+  -- Other transparency headers
+  COUNTIF(has_x_cache) AS requests_with_x_cache,
+  COUNTIF(has_x_cdn) AS requests_with_x_cdn,
+  SAFE_DIVIDE(COUNTIF(has_x_cache) * 100.0, COUNT(0)) AS pct_x_cache,
+
+  -- CDN-specific headers
+  COUNTIF(has_cf_ray) AS cloudflare_ray_headers,
+  COUNTIF(has_amz_cf_id) AS cloudfront_id_headers,
+
+  -- Performance correlation
+  AVG(CASE WHEN has_server_timing THEN ttfb END) AS avg_ttfb_with_timing,
+  AVG(CASE WHEN NOT has_server_timing THEN ttfb END) AS avg_ttfb_without_timing,
+
+  APPROX_QUANTILES(CASE WHEN has_server_timing THEN ttfb END, 100)[OFFSET(50)] AS median_ttfb_with_timing,
+  APPROX_QUANTILES(CASE WHEN NOT has_server_timing THEN ttfb END, 100)[OFFSET(50)] AS median_ttfb_without_timing,
+
+  -- Sample Server-Timing values
+  ARRAY_AGG(DISTINCT server_timing_value IGNORE NULLS LIMIT 5) AS sample_timing_values,
+
+  -- By resource type (NOTE(review): confirm summary `$.type` really emits these capitalised labels)
+  COUNTIF(resource_type = 'Document' AND has_server_timing) AS documents_with_timing,
+  COUNTIF(resource_type = 'Script' AND has_server_timing) AS scripts_with_timing,
+  COUNTIF(resource_type = 'Stylesheet' AND has_server_timing) AS styles_with_timing
+FROM server_timing_analysis
+WHERE cdn_provider IS NOT NULL -- always true: cdn_provider falls back to 'ORIGIN'
+GROUP BY cdn_provider
+HAVING total_requests > 1000
+ORDER BY pct_server_timing DESC
+LIMIT 100