From 9dc6c1a9930d44a44a874fa0723cc321eea13211 Mon Sep 17 00:00:00 2001
From: Joe Viggiano <17817811+joeviggiano@users.noreply.github.com>
Date: Sat, 10 Jan 2026 20:30:05 -0500
Subject: [PATCH 1/3] Added new queries for analysis

---
 sql/2025/cdn/alt_svc_protocol_analysis.sql    | 115 +++++++++++
 sql/2025/cdn/brotli_compression_adoption.sql  | 116 +++++++++++
 sql/2025/cdn/cache_control_analysis.sql       | 147 ++++++++++++++
 .../cdn/cdn_security_features_adoption.sql    |  61 ++++++
 sql/2025/cdn/h3_adoption_by_cdn_provider.sql  |  57 ++++++
 sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql |  56 +++++
 sql/2025/cdn/http3_performance_impact.sql     |  98 +++++++++
 sql/2025/cdn/security_headers_by_cdn.sql      | 191 ++++++++++++++++++
 sql/2025/cdn/server_timing_adoption.sql       | 105 ++++++++++
 9 files changed, 946 insertions(+)
 create mode 100644 sql/2025/cdn/alt_svc_protocol_analysis.sql
 create mode 100644 sql/2025/cdn/brotli_compression_adoption.sql
 create mode 100644 sql/2025/cdn/cache_control_analysis.sql
 create mode 100644 sql/2025/cdn/cdn_security_features_adoption.sql
 create mode 100644 sql/2025/cdn/h3_adoption_by_cdn_provider.sql
 create mode 100644 sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql
 create mode 100644 sql/2025/cdn/http3_performance_impact.sql
 create mode 100644 sql/2025/cdn/security_headers_by_cdn.sql
 create mode 100644 sql/2025/cdn/server_timing_adoption.sql

diff --git a/sql/2025/cdn/alt_svc_protocol_analysis.sql b/sql/2025/cdn/alt_svc_protocol_analysis.sql
new file mode 100644
index 00000000000..a1aa46088fa
--- /dev/null
+++ b/sql/2025/cdn/alt_svc_protocol_analysis.sql
@@ -0,0 +1,115 @@
+#standardSQL
+# Alt-Svc protocol analysis: which protocols responses advertise via Alt-Svc vs the protocol actually used
+# Only responses that carry an Alt-Svc header are analysed, so every percentage is relative to that subset
+
+WITH alt_svc_analysis AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+
+    -- CDN detection: first provider listed in _cdn_provider, 'ORIGIN' when none is reported
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Protocol used for this request (upper-cased): _protocol, else _tls_next_proto, else HTTP/<httpVersion>
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS current_protocol,
+
+    -- Value of one Alt-Svc header of the response (LIMIT 1 picks a single one if it is repeated)
+    (
+      SELECT h.value
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+      LIMIT 1
+    ) AS alt_svc_header,
+
+    -- Check if Alt-Svc exists
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+    ) AS has_alt_svc -- always TRUE here: the WHERE clause below keeps only rows with Alt-Svc
+
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+    AND EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'alt-svc'
+    )
+),
+
+protocol_extraction AS (
+  SELECT
+    *,
+    -- Protocol ids advertised in Alt-Svc (advertises_h3 also matches draft ids such as h3-29)
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h3[^=]*=') AS advertises_h3,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h3-\d+[^=]*=') AS advertises_h3_draft,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'h2[^=]*=') AS advertises_h2,
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'http/1\.1[^=]*=') AS advertises_h1,
+
+    -- First ma=<seconds> parameter found in the header
+    SAFE_CAST(REGEXP_EXTRACT(alt_svc_header, r'ma=(\d+)') AS INT64) AS max_age_seconds,
+
+    -- "clear" is the whole header value, so anchor it: a bare substring test also matches hosts like clear.example
+    REGEXP_CONTAINS(LOWER(alt_svc_header), r'^\s*clear\s*$') AS has_clear_directive
+
+  FROM alt_svc_analysis
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  current_protocol,
+  COUNT(*) AS total_requests,
+
+  -- Protocol advertising analysis
+  COUNTIF(advertises_h3) AS advertises_h3_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(*)) * 100, 2) AS advertises_h3_pct,
+
+  COUNTIF(advertises_h3_draft) AS advertises_h3_draft_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(*)) * 100, 2) AS advertises_h3_draft_pct,
+
+  COUNTIF(advertises_h2) AS advertises_h2_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(*)) * 100, 2) AS advertises_h2_pct,
+
+  COUNTIF(advertises_h1) AS advertises_h1_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(*)) * 100, 2) AS advertises_h1_pct,
+
+  -- Protocol mismatch analysis (rows with a NULL current_protocol count in neither column)
+  COUNTIF(advertises_h3 AND current_protocol NOT LIKE '%H3%' AND current_protocol != 'HTTP/3') AS h3_advertised_not_used,
+  COUNTIF(current_protocol LIKE '%H3%' OR current_protocol = 'HTTP/3') AS currently_using_h3,
+
+  -- Max-age statistics (NULL max_age_seconds values are ignored by APPROX_QUANTILES)
+  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(50)] AS median_max_age_seconds,
+  APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(90)] AS p90_max_age_seconds,
+
+  -- Clear directive usage
+  COUNTIF(has_clear_directive) AS clear_directive_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(*)) * 100, 2) AS clear_directive_pct,
+
+  -- Up to five sample Alt-Svc header values per group, for manual inspection
+  ARRAY_AGG(alt_svc_header IGNORE NULLS LIMIT 5) AS sample_alt_svc_headers
+
+FROM protocol_extraction
+GROUP BY
+  client,
+  cdn,
+  is_main_document,
+  current_protocol
+HAVING
+  total_requests >= 100
+ORDER BY
+  client DESC,
+  total_requests DESC,
+  advertises_h3_pct DESC
diff --git a/sql/2025/cdn/brotli_compression_adoption.sql b/sql/2025/cdn/brotli_compression_adoption.sql
new file mode 100644
index 00000000000..5fd843b2f71
--- /dev/null
+++ b/sql/2025/cdn/brotli_compression_adoption.sql
@@ -0,0 +1,116 @@
+#standardSQL
+# brotli_compression_adoption.sql: Brotli and modern compression adoption by CDN
+# Analyzes compression algorithm usage patterns and efficiency across CDN providers
+#
+# Rationale: Brotli compression offers 20-30% better compression than gzip for text resources.
+# As it becomes more widely supported, we want to track which CDNs are leading adoption
+# and how much bandwidth savings are being achieved. This is critical for performance
+# and sustainability metrics.
+
+WITH compression_analysis AS (
+  SELECT
+    client,
+
+    -- CDN detection: first provider listed in _cdn_provider, 'ORIGIN' when none is reported
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Page and resource info
+    page,
+    url,
+    is_main_document,
+
+    -- Content type classification by URL extension; every branch is reachable under the WHERE filter below
+    CASE
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs)($|\?)') THEN 'JavaScript'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.css($|\?)') THEN 'CSS'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(html|htm)($|\?)') OR is_main_document THEN 'HTML'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(json)($|\?)') THEN 'JSON'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.(svg)($|\?)') THEN 'SVG'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.xml($|\?)') THEN 'XML'
+      WHEN REGEXP_CONTAINS(LOWER(url), r'\.txt($|\?)') THEN 'Text'
+      ELSE 'Other'
+    END AS content_type,
+
+    -- Lower-cased value of one Content-Encoding header (NULL when the header is absent)
+    (
+      SELECT LOWER(h.value)
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'content-encoding'
+      LIMIT 1
+    ) AS content_encoding,
+
+    -- Vary header check (indicates dynamic compression support)
+    EXISTS(
+      SELECT 1 FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%'
+    ) AS supports_dynamic_compression,
+
+    -- Response size metrics: HAR bodySize is the body as received, content.size is the decoded length
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.bodySize') AS INT64) AS response_body_size,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.content.size') AS INT64) AS uncompressed_size,
+
+    -- Transfer size (actual bytes transferred)
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response._transferSize') AS INT64) AS transfer_size
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+    -- Focus on compressible content; main documents are kept so the HTML bucket is not limited to .html/.htm URLs
+    AND (is_main_document OR REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)'))
+)
+
+SELECT
+  client,
+  cdn,
+  content_type,
+
+  -- Volume metrics
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(0) AS total_requests,
+
+  -- Compression type distribution
+  COUNTIF(content_encoding = 'br') AS brotli_requests,
+  COUNTIF(content_encoding = 'gzip') AS gzip_requests,
+  COUNTIF(content_encoding = 'deflate') AS deflate_requests,
+  COUNTIF(content_encoding IS NULL OR content_encoding = '') AS uncompressed_requests,
+  COUNTIF(content_encoding NOT IN ('br', 'gzip', 'deflate', '') AND content_encoding IS NOT NULL) AS other_compression,
+
+  -- Compression percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'br'), COUNT(0)) * 100, 2) AS brotli_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'gzip'), COUNT(0)) * 100, 2) AS gzip_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding = 'deflate'), COUNT(0)) * 100, 2) AS deflate_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(content_encoding IS NULL OR content_encoding = ''), COUNT(0)) * 100, 2) AS uncompressed_pct,
+
+  -- Dynamic compression support
+  COUNTIF(supports_dynamic_compression) AS dynamic_compression_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(supports_dynamic_compression), COUNT(0)) * 100, 2) AS dynamic_compression_pct,
+
+  -- Size metrics (in KB)
+  ROUND(AVG(response_body_size) / 1024, 2) AS avg_response_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END) / 1024, 2) AS avg_brotli_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / 1024, 2) AS avg_gzip_size_kb,
+  ROUND(AVG(CASE WHEN content_encoding IS NULL OR content_encoding = '' THEN response_body_size END) / 1024, 2) AS avg_uncompressed_size_kb,
+
+  -- Relative difference of the average brotli vs gzip body size (different resources, so indicative only)
+  ROUND(
+    SAFE_DIVIDE(
+      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) -
+      AVG(CASE WHEN content_encoding = 'br' THEN response_body_size END),
+      AVG(CASE WHEN content_encoding = 'gzip' THEN response_body_size END)
+    ) * 100, 2
+  ) AS brotli_vs_gzip_savings_pct,
+
+  -- Total data transfer metrics
+  ROUND(SUM(response_body_size) / (1024 * 1024 * 1024), 2) AS total_gb_transferred,
+  ROUND(SUM(CASE WHEN content_encoding = 'br' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_brotli,
+  ROUND(SUM(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_gzip
+FROM compression_analysis
+GROUP BY client,
+  cdn,
+  content_type
+HAVING
+  total_requests >= 100 -- Minimum threshold for statistical relevance
+ORDER BY client DESC,
+  brotli_pct DESC,
+  total_requests DESC
\ No newline at end of file
diff --git a/sql/2025/cdn/cache_control_analysis.sql b/sql/2025/cdn/cache_control_analysis.sql
new file mode 100644
index 00000000000..fd36ce2cea3
--- /dev/null
+++ b/sql/2025/cdn/cache_control_analysis.sql
@@ -0,0 +1,147 @@
+#standardSQL
+# Cache-Control and CDN caching behavior analysis
+# Analyzes cache directives and the cache status reported by CDN response headers (status 200 responses only)
+
+WITH cache_analysis AS (
+  SELECT
+    client,
+    is_main_document,
+
+    -- CDN detection: first provider listed in _cdn_provider, 'ORIGIN' when none is reported
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Content type
+    JSON_EXTRACT_SCALAR(summary, '$._contentType') AS content_type,
+
+    -- Response status
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.status') AS INT64) AS status_code,
+
+    -- Extract cache-control header value
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'cache-control'
+      LIMIT 1) AS cache_control,
+
+    -- Extract other cache-related headers
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'expires'
+      LIMIT 1) AS expires_header,
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'etag'
+      LIMIT 1) AS etag,
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'last-modified'
+      LIMIT 1) AS last_modified,
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'vary'
+      LIMIT 1) AS vary_header,
+
+    -- CDN-specific headers
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'x-cache'
+      LIMIT 1) AS x_cache,
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'cf-cache-status'
+      LIMIT 1) AS cf_cache_status,
+    (SELECT value
+      FROM UNNEST(response_headers)
+      WHERE LOWER(name) = 'x-served-by'
+      LIMIT 1) AS x_served_by
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total_requests,
+
+  -- Cache-Control directives analysis (substring matches on the lower-cased header; *_pct is relative to all requests)
+  COUNTIF(cache_control IS NOT NULL) AS has_cache_control,
+  ROUND(SAFE_DIVIDE(COUNTIF(cache_control IS NOT NULL), COUNT(0)) * 100, 2) AS cache_control_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')) AS is_public,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')), COUNT(0)) * 100, 2) AS public_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')) AS is_private,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')), COUNT(0)) * 100, 2) AS private_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')) AS is_no_cache,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')), COUNT(0)) * 100, 2) AS no_cache_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')) AS is_no_store,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')), COUNT(0)) * 100, 2) AS no_store_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')) AS is_max_age_zero,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')), COUNT(0)) * 100, 2) AS max_age_zero_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')) AS is_immutable,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')), COUNT(0)) * 100, 2) AS immutable_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')) AS has_s_maxage,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')), COUNT(0)) * 100, 2) AS s_maxage_pct,
+
+  -- Modern cache directives
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')) AS has_stale_while_revalidate,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')), COUNT(0)) * 100, 2) AS stale_while_revalidate_pct,
+
+  COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')) AS has_stale_if_error,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')), COUNT(0)) * 100, 2) AS stale_if_error_pct,
+
+  -- Max-age value analysis (header lower-cased first so mixed-case directives are included, as in the counts above)
+  APPROX_QUANTILES(
+    SAFE_CAST(REGEXP_EXTRACT(LOWER(cache_control), r'max-age=(\d+)') AS INT64), 100
+  )[OFFSET(50)] AS median_max_age_seconds,
+
+  APPROX_QUANTILES(
+    SAFE_CAST(REGEXP_EXTRACT(LOWER(cache_control), r's-maxage=(\d+)') AS INT64), 100
+  )[OFFSET(50)] AS median_s_maxage_seconds,
+
+  -- Cache hit/miss analysis from the X-Cache header (hit rate = hits / responses mentioning hit or miss)
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hits,
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'miss')) AS cache_misses,
+  ROUND(SAFE_DIVIDE(
+    COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')),
+    COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit|miss'))
+  ) * 100, 2) AS cache_hit_rate_pct,
+
+  -- Cloudflare-specific cache status (substring match on cf-cache-status)
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'hit')) AS cf_cache_hits,
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'miss')) AS cf_cache_misses,
+  COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'dynamic')) AS cf_cache_dynamic,
+
+  -- Validation headers
+  COUNTIF(etag IS NOT NULL) AS has_etag,
+  ROUND(SAFE_DIVIDE(COUNTIF(etag IS NOT NULL), COUNT(0)) * 100, 2) AS etag_pct,
+
+  COUNTIF(last_modified IS NOT NULL) AS has_last_modified,
+  ROUND(SAFE_DIVIDE(COUNTIF(last_modified IS NOT NULL), COUNT(0)) * 100, 2) AS last_modified_pct,
+
+  -- X-Cache hit share over all responses carrying X-Cache (cache_hit_rate_pct above divides by hit+miss only)
+  COUNTIF(x_cache IS NOT NULL) AS has_x_cache,
+  COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hit_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')), COUNTIF(x_cache IS NOT NULL)) * 100, 2) AS cache_hit_rate,
+
+  -- Cloudflare exact HIT/MISS statuses, compared case-insensitively like the counts above
+  COUNTIF(UPPER(cf_cache_status) = 'HIT') AS cf_hits,
+  COUNTIF(UPPER(cf_cache_status) = 'MISS') AS cf_misses,
+  ROUND(SAFE_DIVIDE(COUNTIF(UPPER(cf_cache_status) = 'HIT'), COUNTIF(cf_cache_status IS NOT NULL)) * 100, 2) AS cf_hit_rate
+FROM cache_analysis
+WHERE status_code = 200 -- Focus on successful responses
+GROUP BY client,
+  cdn,
+  is_main_document
+HAVING
+  total_requests >= 1000
+ORDER BY client DESC,
+  total_requests DESC
diff --git a/sql/2025/cdn/cdn_security_features_adoption.sql b/sql/2025/cdn/cdn_security_features_adoption.sql
new file mode 100644
index 00000000000..b8640080a93
--- /dev/null
+++ b/sql/2025/cdn/cdn_security_features_adoption.sql
@@ -0,0 +1,61 @@
+#standardSQL
+# cdn_security_features_adoption.sql: CDN Security Features Adoption
+#
+# Rationale: Security is a crucial aspect of content delivery. This query analyzes
+# the adoption of various security features across CDNs, including HSTS, CSP, XSS
+# protection, and other security headers. This helps understand which CDNs are
+# leading in security implementations.
+#
+# Expected insights:
+# - Which CDNs implement the most comprehensive security headers
+# - Adoption rates of specific security features (HSTS, CSP, etc.)
+# - Comparison between CDN and origin server security implementations
+# - Trends in security header adoption for 2025
+
+SELECT
+  client,
+  IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+  COUNT(DISTINCT page) AS total_pages,
+
+  -- Overall security header presence (patterns are anchored on "name": so header values cannot match)
+  COUNT(DISTINCT CASE WHEN (
+    REGEXP_CONTAINS(TO_JSON_STRING(response_headers),
+      r'(?i)"name":"(strict-transport-security|content-security-policy|x-xss-protection|x-frame-options|x-content-type-options|referrer-policy|permissions-policy)"')
+  ) THEN page END) AS pages_with_any_security_headers,
+
+  ROUND(
+    (COUNT(DISTINCT CASE WHEN (
+      REGEXP_CONTAINS(TO_JSON_STRING(response_headers),
+        r'(?i)"name":"(strict-transport-security|content-security-policy|x-xss-protection|x-frame-options|x-content-type-options|referrer-policy|permissions-policy)"')
+    ) THEN page END) * 100.0 / COUNT(DISTINCT page)), 2
+  ) AS security_headers_adoption_rate_pct,
+
+  -- Specific security headers breakdown
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"strict-transport-security"')
+    THEN page END) AS pages_with_hsts,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"content-security-policy"')
+    THEN page END) AS pages_with_csp,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"x-frame-options"')
+    THEN page END) AS pages_with_xframe,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"x-content-type-options"')
+    THEN page END) AS pages_with_xcontent,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"referrer-policy"')
+    THEN page END) AS pages_with_referrer_policy,
+  COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"permissions-policy"')
+    THEN page END) AS pages_with_permissions_policy,
+
+  -- Calculate percentages for each header
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"strict-transport-security"')
+    THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS hsts_pct,
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"content-security-policy"')
+    THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS csp_pct,
+  ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"name":"x-frame-options"')
+    THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS xframe_pct
+FROM `httparchive.crawl.requests`
+WHERE date = '2025-07-01' AND is_main_document
+GROUP BY client, cdn_provider -- per client, so desktop and mobile crawls are not merged into one page count
+HAVING
+  total_pages > 100 -- Filter out CDNs with very small sample sizes
+ORDER BY client DESC, security_headers_adoption_rate_pct DESC,
+  total_pages DESC
+LIMIT 100
\ No newline at end of file
diff --git a/sql/2025/cdn/h3_adoption_by_cdn_provider.sql b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql
new file mode 100644
index 00000000000..1b368a51f8d
--- /dev/null
+++ b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql
@@ -0,0 +1,57 @@
+#standardSQL
+# h3_adoption_by_cdn_provider.sql: HTTP/3 adoption rates by CDN provider
+# Simplified version without unnecessary isSecure field to avoid casting issues
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total,
+
+  -- Protocol counts ('H2' is the upper-cased ALPN id that the _tls_next_proto fallback can yield)
+  COUNTIF(protocol = 'HTTP/0.9') AS http09,
+  COUNTIF(protocol = 'HTTP/1.0') AS http10,
+  COUNTIF(protocol = 'HTTP/1.1') AS http11,
+  COUNTIF(protocol IN ('HTTP/2', 'H2')) AS http2,
+  COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3,
+  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other,
+
+  -- Protocol percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H2')), COUNT(0)) * 100, 2) AS http2_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct,
+
+  -- HTTP/2+ adoption (includes HTTP/3)
+  COUNTIF(protocol IN ('HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct
+FROM (
+  SELECT
+    client,
+    is_main_document,
+
+    # Protocol detection - using the same logic as distribution_of_http_versions.sql
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS protocol,
+
+    # CDN detection - extract first CDN provider if multiple are listed
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+WHERE cdn != 'ORIGIN' AND -- Only CDN providers, not origin servers
+  protocol IS NOT NULL -- Must have a detected protocol
+GROUP BY client,
+  cdn,
+  is_main_document
+HAVING
+  total >= 1000 -- Only include CDNs with meaningful traffic volume
+ORDER BY client DESC,
+  total DESC
\ No newline at end of file
diff --git a/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql
new file mode 100644
index 00000000000..85683f22fb8
--- /dev/null
+++ b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql
@@ -0,0 +1,56 @@
+#standardSQL
+# h3_adoption_by_cdn_vs_origin.sql: HTTP/3 adoption rates for CDN-served vs origin-served requests
+# Simplified version without unnecessary isSecure field to avoid casting issues
+
+SELECT
+  client,
+  IF(cdn = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn_type,
+  is_main_document,
+  COUNT(0) AS total,
+
+  -- Protocol counts ('H2' is the upper-cased ALPN id that the _tls_next_proto fallback can yield)
+  COUNTIF(protocol = 'HTTP/0.9') AS http09,
+  COUNTIF(protocol = 'HTTP/1.0') AS http10,
+  COUNTIF(protocol = 'HTTP/1.1') AS http11,
+  COUNTIF(protocol IN ('HTTP/2', 'H2')) AS http2,
+  COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3,
+  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other,
+
+  -- Protocol percentages
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H2')), COUNT(0)) * 100, 2) AS http2_pct,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct,
+
+  -- HTTP/2+ adoption (includes HTTP/3)
+  COUNTIF(protocol IN ('HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus,
+  ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct
+FROM (
+  SELECT
+    client,
+    is_main_document,
+
+    # Protocol detection - using the same logic as distribution_of_http_versions.sql
+    UPPER(IFNULL(
+      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
+      IFNULL(
+        NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
+        NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
+      )
+    )) AS protocol,
+
+    # CDN detection - extract first CDN provider if multiple are listed
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+WHERE protocol IS NOT NULL -- Must have a detected protocol
+GROUP BY client,
+  cdn_type,
+  is_main_document
+HAVING
+  total >= 1000 -- Only include groups with meaningful traffic volume
+ORDER BY client DESC,
+  total DESC
\ No newline at end of file
diff --git a/sql/2025/cdn/http3_performance_impact.sql b/sql/2025/cdn/http3_performance_impact.sql
new file mode 100644
index 00000000000..2f9f5979526
--- /dev/null
+++ b/sql/2025/cdn/http3_performance_impact.sql
@@ -0,0 +1,98 @@
+#standardSQL
+# 33_http3_performance_impact.sql
+# Comprehensive analysis of HTTP/3 performance impact vs HTTP/2
+# Measures
real-world performance differences
+
+WITH protocol_performance AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+
+    -- Protocol normalization: case-insensitive, same H3 draft tokens as the h3_adoption_* queries
+    CASE
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) IN ('H3', 'HTTP/3', 'QUIC', 'H3-29', 'H3-Q050') THEN 'H3'
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) IN ('HTTP/2', 'H2') THEN 'HTTP/2'
+      WHEN UPPER(JSON_EXTRACT_SCALAR(payload, '$._protocol')) IN ('HTTP/1.1', 'HTTP/1.0') THEN 'HTTP/1.x'
+      ELSE JSON_EXTRACT_SCALAR(payload, '$._protocol')
+    END AS protocol,
+
+    -- Performance metrics from the HAR timings object, in milliseconds (-1 means "not applicable")
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.wait') AS FLOAT64) AS ttfb,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.receive') AS FLOAT64) AS download_time,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.ssl') AS FLOAT64) AS ssl_time,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.connect') AS FLOAT64) AS connect_time,
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.dns') AS FLOAT64) AS dns_time,
+
+    -- Total request time
+    SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.time') AS FLOAT64) AS total_time,
+
+    -- Resource information
+    SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INT64) AS response_size,
+    JSON_EXTRACT_SCALAR(summary, '$.type') AS resource_type,
+    JSON_EXTRACT_SCALAR(summary, '$.format') AS format,
+
+    -- Connection reuse: socket id (presumably only unique within one page load, so it is scoped by page below)
+    JSON_EXTRACT_SCALAR(payload, '$._socket') AS socket_id,
+
+    -- Priority information
+    JSON_EXTRACT_SCALAR(payload, '$._priority') AS priority
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+    AND client = 'mobile'
+)
+
+SELECT
+  cdn_provider,
+  protocol,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(*) AS total_requests,
+
+  -- Performance metrics by protocol
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(25)] AS p25_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(50)] AS median_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(75)] AS p75_ttfb,
+  APPROX_QUANTILES(ttfb, 100)[OFFSET(90)] AS p90_ttfb,
+
+  AVG(ttfb) AS avg_ttfb,
+  STDDEV(ttfb) AS stddev_ttfb,
+
+  -- Download performance
+  APPROX_QUANTILES(download_time, 100)[OFFSET(50)] AS median_download_time,
+  AVG(download_time) AS avg_download_time,
+
+  -- Connection establishment (negative values mark phases that did not apply and are excluded)
+  AVG(IF(connect_time >= 0, connect_time, NULL)) AS avg_connect_time,
+  AVG(IF(ssl_time >= 0, ssl_time, NULL)) AS avg_ssl_time,
+  AVG(IF(dns_time >= 0, dns_time, NULL)) AS avg_dns_time,
+
+  -- Total time metrics
+  APPROX_QUANTILES(total_time, 100)[OFFSET(50)] AS median_total_time,
+  AVG(total_time) AS avg_total_time,
+
+  -- Resource size analysis
+  AVG(response_size) AS avg_response_size,
+  SUM(response_size) AS total_bytes_transferred,
+
+  -- Efficiency metrics (bytes per millisecond)
+  SAFE_DIVIDE(AVG(response_size), AVG(total_time)) AS avg_throughput,
+
+  -- Connection reuse (unique page+socket pairs vs requests; rows without a socket id are not counted as connections)
+  COUNT(DISTINCT CONCAT(page, '#', socket_id)) AS unique_connections,
+  SAFE_DIVIDE(COUNT(*), COUNT(DISTINCT CONCAT(page, '#', socket_id))) AS avg_requests_per_connection,
+
+  -- Resource type breakdown. NOTE(review): confirm the casing of summary.type values ('Script' vs 'script')
+  COUNTIF(resource_type = 'Document') AS document_requests,
+  COUNTIF(resource_type = 'Script') AS script_requests,
+  COUNTIF(resource_type = 'Stylesheet') AS style_requests,
+  COUNTIF(resource_type = 'Image') AS image_requests
+FROM protocol_performance
+WHERE protocol IN ('H3', 'HTTP/2', 'HTTP/1.x')
+  AND ttfb IS NOT NULL
+  AND ttfb > 0
+  AND ttfb < 10000 -- Filter outliers
+GROUP BY cdn_provider, protocol
+HAVING total_requests > 1000
+ORDER BY cdn_provider, protocol
\ No newline at end of file
diff --git a/sql/2025/cdn/security_headers_by_cdn.sql b/sql/2025/cdn/security_headers_by_cdn.sql
new file mode 100644
index 00000000000..a508e814f75
--- /dev/null
+++ b/sql/2025/cdn/security_headers_by_cdn.sql
@@ -0,0 +1,191 @@
+#standardSQL
+# Security headers adoption by CDN vs Origin
+# Analyzes adoption of key security headers across CDN providers
+
+WITH security_headers AS (
+  SELECT
+    client,
+    url,
+    is_main_document,
+
+    -- CDN detection: first provider listed in _cdn_provider, 'ORIGIN' when none is reported
+    IFNULL(
+      NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''),
+      'ORIGIN'
+    ) AS cdn,
+
+    -- Extract specific security headers
+    -- Note: response_headers is an array, so we need to check each element
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'strict-transport-security'
+    ) AS has_hsts,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'content-security-policy'
+    ) AS has_csp,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-frame-options'
+    ) AS has_xfo,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-content-type-options'
+    ) AS has_xcto,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'x-xss-protection'
+    ) AS has_xxp,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'referrer-policy'
+    ) AS has_referrer_policy,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'permissions-policy'
+    ) AS has_permissions_policy,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'feature-policy'
+    ) AS has_feature_policy,
+
+    -- Cache-Control directives
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'cache-control'
+        AND LOWER(h.value) LIKE '%no-store%'
+    ) AS has_cache_no_store,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'cache-control'
+        AND LOWER(h.value) LIKE '%immutable%'
+    ) AS has_cache_immutable,
+
+    -- CORS headers
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-origin'
+    ) AS has_cors_origin,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-methods'
+    ) AS has_cors_methods,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-headers'
+    ) AS has_cors_headers,
+
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-credentials'
+    ) AS has_cors_credentials,
+
+    -- Extract CORS origin values for analysis
+    (
+      SELECT h.value
+      FROM UNNEST(response_headers) AS h
+      WHERE LOWER(h.name) = 'access-control-allow-origin'
+      LIMIT 1
+    ) AS cors_origin_value
+  FROM `httparchive.crawl.requests`
+  WHERE date = '2025-07-01'
+)
+
+SELECT
+  client,
+  cdn,
+  is_main_document,
+  COUNT(0) AS total_requests,
+
+  -- Security headers adoption (counts and share of all requests in the group)
+  COUNTIF(has_hsts) AS hsts_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_hsts), COUNT(0)) * 100, 2) AS hsts_pct,
+
+  COUNTIF(has_csp) AS csp_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_csp), COUNT(0)) * 100, 2) AS csp_pct,
+
+  COUNTIF(has_xfo) AS xfo_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xfo), COUNT(0)) * 100, 2) AS xfo_pct,
+
+  COUNTIF(has_xcto) AS xcto_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xcto), COUNT(0)) * 100, 2) AS xcto_pct,
+
+  COUNTIF(has_xxp) AS xxp_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_xxp), COUNT(0)) * 100, 2) AS xxp_pct,
+
+  COUNTIF(has_referrer_policy) AS referrer_policy_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_referrer_policy), COUNT(0)) * 100, 2) AS referrer_policy_pct,
+
+  COUNTIF(has_permissions_policy OR has_feature_policy) AS permissions_policy_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_permissions_policy OR has_feature_policy), COUNT(0)) * 100, 2) AS permissions_policy_pct,
+
+  -- Cache control adoption
+  COUNTIF(has_cache_no_store) AS cache_no_store_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cache_no_store), COUNT(0)) * 100, 2) AS cache_no_store_pct,
+
+  COUNTIF(has_cache_immutable) AS cache_immutable_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cache_immutable), COUNT(0)) * 100, 2) AS cache_immutable_pct,
+
+  -- CORS headers adoption
+  COUNTIF(has_cors_origin) AS cors_origin_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_origin), COUNT(0)) * 100, 2) AS cors_origin_pct,
+
+  COUNTIF(has_cors_methods) AS cors_methods_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_methods), COUNT(0)) * 100, 2) AS cors_methods_pct,
+
+  COUNTIF(has_cors_headers) AS cors_headers_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_headers), COUNT(0)) * 100, 2) AS cors_headers_pct,
+
+  COUNTIF(has_cors_credentials) AS cors_credentials_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(has_cors_credentials), COUNT(0)) * 100, 2) AS cors_credentials_pct,
+
+  -- CORS origin patterns (sample values)
+  ARRAY_AGG(cors_origin_value IGNORE NULLS LIMIT 5) AS sample_cors_origins,
+
+  -- Wildcard CORS usage (percentage is relative to requests that send Access-Control-Allow-Origin)
+  COUNTIF(cors_origin_value = '*') AS cors_wildcard_count,
+  ROUND(SAFE_DIVIDE(COUNTIF(cors_origin_value = '*'), COUNTIF(has_cors_origin)) * 100, 2) AS cors_wildcard_pct,
+
+  -- Security score (how many headers out of 7 main ones)
+  ROUND(AVG(
+    CAST(has_hsts AS INT64) +
+    CAST(has_csp AS INT64) +
+    CAST(has_xfo AS INT64) +
+    CAST(has_xcto AS INT64) +
+    CAST(has_xxp AS INT64) +
+    CAST(has_referrer_policy AS INT64) +
+    CAST(has_permissions_policy OR has_feature_policy AS INT64)
+  ), 2) AS avg_security_headers
+FROM security_headers
+GROUP BY client,
+  cdn,
+  is_main_document
+HAVING
+  total_requests >= 1000
+ORDER BY client DESC,
+  avg_security_headers DESC,
+  total_requests DESC
diff --git a/sql/2025/cdn/server_timing_adoption.sql b/sql/2025/cdn/server_timing_adoption.sql
new file mode 100644
index 00000000000..3eb46397611
--- /dev/null
+++ b/sql/2025/cdn/server_timing_adoption.sql
@@ -0,0 +1,105 @@
+#standardSQL
+# 32_server_timing_adoption.sql
+# Deep dive into Server-Timing header usage across CDNs
+# Server-Timing provides transparency into server-side performance metrics
+
+WITH server_timing_analysis AS (
+  SELECT
+    client,
+    page,
+    url,
+    is_main_document,
+    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn_provider,
+    JSON_EXTRACT_SCALAR(payload, '$._protocol') AS protocol,
+
+    -- Check for Server-Timing header
+    EXISTS(
+      SELECT 1
+      FROM UNNEST(response_headers) AS
header +WHERE LOWER(header.name) = 'server-timing' + ) AS has_server_timing, + + -- Extract Server-Timing value for analysis + ( + SELECT header.value +FROM UNNEST(response_headers) AS header +WHERE LOWER(header.name) = 'server-timing' +LIMIT 1 + ) AS server_timing_value, + + -- Other performance headers + EXISTS( + SELECT 1 +FROM UNNEST(response_headers) AS header +WHERE LOWER(header.name) = 'x-cache' + ) AS has_x_cache, + + EXISTS( + SELECT 1 +FROM UNNEST(response_headers) AS header +WHERE LOWER(header.name) = 'x-cdn' + ) AS has_x_cdn, + + EXISTS( + SELECT 1 +FROM UNNEST(response_headers) AS header +WHERE LOWER(header.name) = 'cf-ray' -- Cloudflare specific + ) AS has_cf_ray, + + EXISTS( + SELECT 1 +FROM UNNEST(response_headers) AS header +WHERE LOWER(header.name) = 'x-amz-cf-id' -- CloudFront specific + ) AS has_amz_cf_id, + + -- Performance metrics + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.wait') AS FLOAT64) AS ttfb, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.ssl') AS FLOAT64) AS ssl_time, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.timings.connect') AS FLOAT64) AS connect_time, + + -- Resource info + JSON_EXTRACT_SCALAR(summary, '$.type') AS resource_type, + JSON_EXTRACT_SCALAR(summary, '$.respSize') AS response_size +FROM `httparchive.crawl.requests` +WHERE date = '2025-07-01' + AND client = 'mobile' +) + +SELECT + cdn_provider, + COUNT(DISTINCT page) AS total_pages, + COUNT(*) AS total_requests, + + -- Server-Timing adoption + COUNTIF(has_server_timing) AS requests_with_server_timing, + SAFE_DIVIDE(COUNTIF(has_server_timing) * 100.0, COUNT(*)) AS pct_server_timing, + + -- Other transparency headers + COUNTIF(has_x_cache) AS requests_with_x_cache, + COUNTIF(has_x_cdn) AS requests_with_x_cdn, + SAFE_DIVIDE(COUNTIF(has_x_cache) * 100.0, COUNT(*)) AS pct_x_cache, + + -- CDN-specific headers + COUNTIF(has_cf_ray) AS cloudflare_ray_headers, + COUNTIF(has_amz_cf_id) AS cloudfront_id_headers, + + -- Performance correlation + AVG(CASE WHEN 
has_server_timing THEN ttfb END) AS avg_ttfb_with_timing, + AVG(CASE WHEN NOT has_server_timing THEN ttfb END) AS avg_ttfb_without_timing, + + APPROX_QUANTILES(CASE WHEN has_server_timing THEN ttfb END, 100)[OFFSET(50)] AS median_ttfb_with_timing, + APPROX_QUANTILES(CASE WHEN NOT has_server_timing THEN ttfb END, 100)[OFFSET(50)] AS median_ttfb_without_timing, + + -- Sample Server-Timing values + ARRAY_AGG(DISTINCT server_timing_value IGNORE NULLS LIMIT 5) AS sample_timing_values, + + -- By resource type + COUNTIF(resource_type = 'Document' AND has_server_timing) AS documents_with_timing, + COUNTIF(resource_type = 'Script' AND has_server_timing) AS scripts_with_timing, + COUNTIF(resource_type = 'Stylesheet' AND has_server_timing) AS styles_with_timing +FROM server_timing_analysis +WHERE cdn_provider IS NOT NULL +GROUP BY cdn_provider +HAVING total_requests > 1000 +ORDER BY pct_server_timing DESC +LIMIT 100 \ No newline at end of file From 98f52e2841e90a9a42fc389cd18e6064a4eb1d62 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Sun, 11 Jan 2026 16:09:27 +0000 Subject: [PATCH 2/3] Linting fixes --- sql/2025/cdn/alt_svc_protocol_analysis.sql | 56 +++---- sql/2025/cdn/brotli_compression_adoption.sql | 22 +-- sql/2025/cdn/cache_control_analysis.sql | 138 ++++++++++------- .../cdn/cdn_security_features_adoption.sql | 50 ++++-- sql/2025/cdn/h3_adoption_by_cdn_provider.sql | 22 +-- sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql | 22 +-- sql/2025/cdn/http3_performance_impact.sql | 20 +-- sql/2025/cdn/security_headers_by_cdn.sql | 146 +++++++++--------- sql/2025/cdn/server_timing_adoption.sql | 40 ++--- 9 files changed, 279 insertions(+), 237 deletions(-) diff --git a/sql/2025/cdn/alt_svc_protocol_analysis.sql b/sql/2025/cdn/alt_svc_protocol_analysis.sql index a1aa46088fa..a19914da36b 100644 --- a/sql/2025/cdn/alt_svc_protocol_analysis.sql +++ b/sql/2025/cdn/alt_svc_protocol_analysis.sql @@ -8,13 +8,13 @@ WITH alt_svc_analysis AS ( page, url, is_main_document, - + -- CDN 
detection IFNULL( NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN' ) AS cdn, - + -- Current protocol used UPPER(IFNULL( JSON_EXTRACT_SCALAR(payload, '$._protocol'), @@ -23,7 +23,7 @@ WITH alt_svc_analysis AS ( NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/') ) )) AS current_protocol, - + -- Extract Alt-Svc header value ( SELECT h.value @@ -31,17 +31,17 @@ WITH alt_svc_analysis AS ( WHERE LOWER(h.name) = 'alt-svc' LIMIT 1 ) AS alt_svc_header, - + -- Check if Alt-Svc exists EXISTS( SELECT 1 FROM UNNEST(response_headers) AS h WHERE LOWER(h.name) = 'alt-svc' ) AS has_alt_svc - + FROM `httparchive.crawl.requests` - WHERE date = '2025-07-01' - AND EXISTS( + WHERE date = '2025-07-01' AND + EXISTS ( SELECT 1 FROM UNNEST(response_headers) AS h WHERE LOWER(h.name) = 'alt-svc' @@ -56,13 +56,13 @@ protocol_extraction AS ( REGEXP_CONTAINS(LOWER(alt_svc_header), r'h3-\d+[^=]*=') AS advertises_h3_draft, REGEXP_CONTAINS(LOWER(alt_svc_header), r'h2[^=]*=') AS advertises_h2, REGEXP_CONTAINS(LOWER(alt_svc_header), r'http/1\.1[^=]*=') AS advertises_h1, - + -- Extract max-age values SAFE_CAST(REGEXP_EXTRACT(alt_svc_header, r'ma=(\d+)') AS INT64) AS max_age_seconds, - + -- Check for clear directive REGEXP_CONTAINS(LOWER(alt_svc_header), r'clear') AS has_clear_directive - + FROM alt_svc_analysis ) @@ -71,45 +71,45 @@ SELECT cdn, is_main_document, current_protocol, - COUNT(*) AS total_requests, - + COUNT(0) AS total_requests, + -- Protocol advertising analysis COUNTIF(advertises_h3) AS advertises_h3_count, - ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(*)) * 100, 2) AS advertises_h3_pct, - + ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3), COUNT(0)) * 100, 2) AS advertises_h3_pct, + COUNTIF(advertises_h3_draft) AS advertises_h3_draft_count, - ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(*)) * 100, 2) AS advertises_h3_draft_pct, - + ROUND(SAFE_DIVIDE(COUNTIF(advertises_h3_draft), COUNT(0)) * 
100, 2) AS advertises_h3_draft_pct, + COUNTIF(advertises_h2) AS advertises_h2_count, - ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(*)) * 100, 2) AS advertises_h2_pct, - + ROUND(SAFE_DIVIDE(COUNTIF(advertises_h2), COUNT(0)) * 100, 2) AS advertises_h2_pct, + COUNTIF(advertises_h1) AS advertises_h1_count, - ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(*)) * 100, 2) AS advertises_h1_pct, - + ROUND(SAFE_DIVIDE(COUNTIF(advertises_h1), COUNT(0)) * 100, 2) AS advertises_h1_pct, + -- Protocol mismatch analysis COUNTIF(advertises_h3 AND current_protocol NOT LIKE '%H3%' AND current_protocol != 'HTTP/3') AS h3_advertised_not_used, COUNTIF(current_protocol LIKE '%H3%' OR current_protocol = 'HTTP/3') AS currently_using_h3, - + -- Max-age statistics APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(50)] AS median_max_age_seconds, APPROX_QUANTILES(max_age_seconds, 100)[OFFSET(90)] AS p90_max_age_seconds, - + -- Clear directive usage COUNTIF(has_clear_directive) AS clear_directive_count, - ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(*)) * 100, 2) AS clear_directive_pct, - + ROUND(SAFE_DIVIDE(COUNTIF(has_clear_directive), COUNT(0)) * 100, 2) AS clear_directive_pct, + -- Sample Alt-Svc headers for analysis ARRAY_AGG(alt_svc_header IGNORE NULLS LIMIT 5) AS sample_alt_svc_headers - + FROM protocol_extraction -GROUP BY +GROUP BY client, cdn, is_main_document, current_protocol -HAVING +HAVING total_requests >= 100 -ORDER BY +ORDER BY client DESC, total_requests DESC, advertises_h3_pct DESC diff --git a/sql/2025/cdn/brotli_compression_adoption.sql b/sql/2025/cdn/brotli_compression_adoption.sql index 5fd843b2f71..9f0188221f7 100644 --- a/sql/2025/cdn/brotli_compression_adoption.sql +++ b/sql/2025/cdn/brotli_compression_adoption.sql @@ -37,15 +37,15 @@ WITH compression_analysis AS ( -- Compression detection from Content-Encoding header ( SELECT LOWER(h.value) -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'content-encoding' -LIMIT 1 + FROM UNNEST(response_headers) 
AS h + WHERE LOWER(h.name) = 'content-encoding' + LIMIT 1 ) AS content_encoding, -- Vary header check (indicates dynamic compression support) EXISTS( SELECT 1 FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%' + WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%' ) AS supports_dynamic_compression, -- Response size metrics @@ -54,10 +54,10 @@ WHERE LOWER(h.name) = 'vary' AND LOWER(h.value) LIKE '%accept-encoding%' -- Transfer size (actual bytes transferred) SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response._transferSize') AS INT64) AS transfer_size -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' AND -- Focus on compressible content types - AND REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)') + REGEXP_CONTAINS(LOWER(url), r'\.(js|mjs|css|html|htm|json|svg|xml|txt)($|\?)') ) SELECT @@ -106,11 +106,13 @@ SELECT ROUND(SUM(CASE WHEN content_encoding = 'br' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_brotli, ROUND(SUM(CASE WHEN content_encoding = 'gzip' THEN response_body_size END) / (1024 * 1024 * 1024), 2) AS total_gb_gzip FROM compression_analysis -GROUP BY client, +GROUP BY + client, cdn, content_type HAVING total_requests >= 100 -- Minimum threshold for statistical relevance -ORDER BY client DESC, +ORDER BY + client DESC, brotli_pct DESC, - total_requests DESC \ No newline at end of file + total_requests DESC diff --git a/sql/2025/cdn/cache_control_analysis.sql b/sql/2025/cdn/cache_control_analysis.sql index fd36ce2cea3..6b2bfc3acc6 100644 --- a/sql/2025/cdn/cache_control_analysis.sql +++ b/sql/2025/cdn/cache_control_analysis.sql @@ -6,58 +6,74 @@ WITH cache_analysis AS ( SELECT client, is_main_document, - + -- CDN detection IFNULL( NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN' ) AS cdn, - + -- Content 
type JSON_EXTRACT_SCALAR(summary, '$._contentType') AS content_type, - + -- Response status SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.status') AS INT64) AS status_code, - + -- Extract cache-control header value - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'cache-control' -LIMIT 1) AS cache_control, - + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'cache-control' + LIMIT 1 + ) AS cache_control, + -- Extract other cache-related headers - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'expires' -LIMIT 1) AS expires_header, - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'etag' -LIMIT 1) AS etag, - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'last-modified' -LIMIT 1) AS last_modified, - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'vary' -LIMIT 1) AS vary_header, - + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'expires' + LIMIT 1 + ) AS expires_header, + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'etag' + LIMIT 1 + ) AS etag, + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'last-modified' + LIMIT 1 + ) AS last_modified, + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'vary' + LIMIT 1 + ) AS vary_header, + -- CDN-specific headers - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'x-cache' -LIMIT 1) AS x_cache, - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'cf-cache-status' -LIMIT 1) AS cf_cache_status, - (SELECT value -FROM UNNEST(response_headers) -WHERE LOWER(name) = 'x-served-by' -LIMIT 1) AS x_served_by -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'x-cache' + LIMIT 1 + ) AS x_cache, + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'cf-cache-status' + LIMIT 1 + ) AS 
cf_cache_status, + ( + SELECT value + FROM UNNEST(response_headers) + WHERE LOWER(name) = 'x-served-by' + LIMIT 1 + ) AS x_served_by + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' ) SELECT @@ -65,48 +81,48 @@ SELECT cdn, is_main_document, COUNT(0) AS total_requests, - + -- Cache-Control directives analysis COUNTIF(cache_control IS NOT NULL) AS has_cache_control, ROUND(SAFE_DIVIDE(COUNTIF(cache_control IS NOT NULL), COUNT(0)) * 100, 2) AS cache_control_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')) AS is_public, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'public')), COUNT(0)) * 100, 2) AS public_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')) AS is_private, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'private')), COUNT(0)) * 100, 2) AS private_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')) AS is_no_cache, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-cache')), COUNT(0)) * 100, 2) AS no_cache_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')) AS is_no_store, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'no-store')), COUNT(0)) * 100, 2) AS no_store_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')) AS is_max_age_zero, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'max-age=0')), COUNT(0)) * 100, 2) AS max_age_zero_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')) AS is_immutable, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'immutable')), COUNT(0)) * 100, 2) AS immutable_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')) AS has_s_maxage, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r's-maxage')), COUNT(0)) * 100, 2) AS s_maxage_pct, - + -- Modern cache directives COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')) AS has_stale_while_revalidate, 
ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-while-revalidate')), COUNT(0)) * 100, 2) AS stale_while_revalidate_pct, - + COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')) AS has_stale_if_error, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(cache_control), r'stale-if-error')), COUNT(0)) * 100, 2) AS stale_if_error_pct, - + -- Max-age value analysis APPROX_QUANTILES( SAFE_CAST(REGEXP_EXTRACT(cache_control, r'max-age=(\d+)') AS INT64), 100 )[OFFSET(50)] AS median_max_age_seconds, - + APPROX_QUANTILES( SAFE_CAST(REGEXP_EXTRACT(cache_control, r's-maxage=(\d+)') AS INT64), 100 )[OFFSET(50)] AS median_s_maxage_seconds, - + -- Cache hit/miss analysis from CDN headers COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hits, COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'miss')) AS cache_misses, @@ -114,34 +130,36 @@ SELECT COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')), COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit|miss')) ) * 100, 2) AS cache_hit_rate_pct, - + -- Cloudflare-specific cache status COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'hit')) AS cf_cache_hits, COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'miss')) AS cf_cache_misses, COUNTIF(REGEXP_CONTAINS(LOWER(cf_cache_status), r'dynamic')) AS cf_cache_dynamic, - + -- Validation headers COUNTIF(etag IS NOT NULL) AS has_etag, ROUND(SAFE_DIVIDE(COUNTIF(etag IS NOT NULL), COUNT(0)) * 100, 2) AS etag_pct, - + COUNTIF(last_modified IS NOT NULL) AS has_last_modified, ROUND(SAFE_DIVIDE(COUNTIF(last_modified IS NOT NULL), COUNT(0)) * 100, 2) AS last_modified_pct, - + -- CDN cache hit indicators COUNTIF(x_cache IS NOT NULL) AS has_x_cache, COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')) AS cache_hit_count, ROUND(SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(LOWER(x_cache), r'hit')), COUNTIF(x_cache IS NOT NULL)) * 100, 2) AS cache_hit_rate, - + -- Cloudflare specific COUNTIF(cf_cache_status = 'HIT') AS cf_hits, COUNTIF(cf_cache_status = 'MISS') AS cf_misses, 
ROUND(SAFE_DIVIDE(COUNTIF(cf_cache_status = 'HIT'), COUNTIF(cf_cache_status IS NOT NULL)) * 100, 2) AS cf_hit_rate FROM cache_analysis WHERE status_code = 200 -- Focus on successful responses -GROUP BY client, +GROUP BY + client, cdn, is_main_document HAVING total_requests >= 1000 -ORDER BY client DESC, +ORDER BY + client DESC, total_requests DESC diff --git a/sql/2025/cdn/cdn_security_features_adoption.sql b/sql/2025/cdn/cdn_security_features_adoption.sql index b8640080a93..e539be1a202 100644 --- a/sql/2025/cdn/cdn_security_features_adoption.sql +++ b/sql/2025/cdn/cdn_security_features_adoption.sql @@ -18,44 +18,60 @@ SELECT -- Overall security header presence COUNT(DISTINCT CASE WHEN ( - REGEXP_CONTAINS(TO_JSON_STRING(response_headers), - r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"') - ) THEN page END) AS pages_with_any_security_headers, + REGEXP_CONTAINS( + TO_JSON_STRING(response_headers), + r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"' + ) + ) THEN page + END) AS pages_with_any_security_headers, ROUND( (COUNT(DISTINCT CASE WHEN ( - REGEXP_CONTAINS(TO_JSON_STRING(response_headers), - r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"') - ) THEN page END) * 100.0 / COUNT(DISTINCT page)), 2 + REGEXP_CONTAINS( + TO_JSON_STRING(response_headers), + r'(?i)"strict-transport-security"|"content-security-policy"|"x-xss-protection"|"x-frame-options"|"x-content-type-options"|"referrer-policy"|"permissions-policy"' + ) + ) THEN page + END) * 100.0 / COUNT(DISTINCT page)), 2 ) AS security_headers_adoption_rate_pct, -- Specific security headers breakdown COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), 
r'(?i)"strict-transport-security"') - THEN page END) AS pages_with_hsts, + THEN page + END) AS pages_with_hsts, COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"content-security-policy"') - THEN page END) AS pages_with_csp, + THEN page + END) AS pages_with_csp, COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-frame-options"') - THEN page END) AS pages_with_xframe, + THEN page + END) AS pages_with_xframe, COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-content-type-options"') - THEN page END) AS pages_with_xcontent, + THEN page + END) AS pages_with_xcontent, COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"referrer-policy"') - THEN page END) AS pages_with_referrer_policy, + THEN page + END) AS pages_with_referrer_policy, COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"permissions-policy"') - THEN page END) AS pages_with_permissions_policy, + THEN page + END) AS pages_with_permissions_policy, -- Calculate percentages for each header ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"strict-transport-security"') - THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS hsts_pct, + THEN page + END) * 100.0 / COUNT(DISTINCT page)), 2) AS hsts_pct, ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"content-security-policy"') - THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS csp_pct, + THEN page + END) * 100.0 / COUNT(DISTINCT page)), 2) AS csp_pct, ROUND((COUNT(DISTINCT CASE WHEN REGEXP_CONTAINS(TO_JSON_STRING(response_headers), r'(?i)"x-frame-options"') - THEN page END) * 100.0 / COUNT(DISTINCT page)), 2) AS xframe_pct + THEN page + END) * 100.0 / COUNT(DISTINCT page)), 2) AS xframe_pct FROM `httparchive.crawl.requests` WHERE date = '2025-07-01' AND is_main_document = true GROUP BY cdn_provider HAVING total_pages > 100 -- 
Filter out CDNs with very small sample sizes -ORDER BY security_headers_adoption_rate_pct DESC, +ORDER BY + security_headers_adoption_rate_pct DESC, total_pages DESC -LIMIT 100 \ No newline at end of file +LIMIT 100 diff --git a/sql/2025/cdn/h3_adoption_by_cdn_provider.sql b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql index 1b368a51f8d..e38b8a263bf 100644 --- a/sql/2025/cdn/h3_adoption_by_cdn_provider.sql +++ b/sql/2025/cdn/h3_adoption_by_cdn_provider.sql @@ -7,7 +7,7 @@ SELECT cdn, is_main_document, COUNT(0) AS total, - + -- Protocol counts COUNTIF(protocol = 'HTTP/0.9') AS http09, COUNTIF(protocol = 'HTTP/1.0') AS http10, @@ -15,12 +15,12 @@ SELECT COUNTIF(protocol = 'HTTP/2') AS http2, COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3, COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other, - + -- Protocol percentages ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct, ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/2'), COUNT(0)) * 100, 2) AS http2_pct, ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct, - + -- HTTP/2+ adoption (includes HTTP/3) COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus, ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct @@ -28,7 +28,7 @@ FROM ( SELECT client, is_main_document, - + # Protocol detection - using the same logic as distribution_of_http_versions.sql UPPER(IFNULL( JSON_EXTRACT_SCALAR(payload, '$._protocol'), @@ -37,21 +37,23 @@ FROM ( NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/') ) )) AS protocol, - + # CDN detection - extract first CDN provider if multiple are listed IFNULL( NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN' ) AS cdn -FROM 
`httparchive.crawl.requests` -WHERE date = '2025-07-01' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' ) WHERE cdn != 'ORIGIN' AND -- Only CDN providers, not origin servers protocol IS NOT NULL -- Must have a detected protocol -GROUP BY client, +GROUP BY + client, cdn, is_main_document HAVING total >= 1000 -- Only include CDNs with meaningful traffic volume -ORDER BY client DESC, - total DESC \ No newline at end of file +ORDER BY + client DESC, + total DESC diff --git a/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql index 85683f22fb8..cebdf6f56f2 100644 --- a/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql +++ b/sql/2025/cdn/h3_adoption_by_cdn_vs_origin.sql @@ -7,7 +7,7 @@ SELECT IF(cdn = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn_type, is_main_document, COUNT(0) AS total, - + -- Protocol counts COUNTIF(protocol = 'HTTP/0.9') AS http09, COUNTIF(protocol = 'HTTP/1.0') AS http10, @@ -15,12 +15,12 @@ SELECT COUNTIF(protocol = 'HTTP/2') AS http2, COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http3, COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http_other, - + -- Protocol percentages ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/1.1'), COUNT(0)) * 100, 2) AS http11_pct, ROUND(SAFE_DIVIDE(COUNTIF(protocol = 'HTTP/2'), COUNT(0)) * 100, 2) AS http2_pct, ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http3_pct, - + -- HTTP/2+ adoption (includes HTTP/3) COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')) AS http2plus, ROUND(SAFE_DIVIDE(COUNTIF(protocol IN ('HTTP/2', 'H3-29', 'H3-Q050', 'H3', 'HTTP/3')), COUNT(0)) * 100, 2) AS http2plus_pct @@ -28,7 +28,7 @@ FROM ( SELECT client, is_main_document, - + # Protocol detection - using the same logic as distribution_of_http_versions.sql UPPER(IFNULL( JSON_EXTRACT_SCALAR(payload, '$._protocol'), @@ -37,20 +37,22 @@ FROM ( 
NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/') ) )) AS protocol, - + # CDN detection - extract first CDN provider if multiple are listed IFNULL( NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN' ) AS cdn -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' ) WHERE protocol IS NOT NULL -- Must have a detected protocol -GROUP BY client, +GROUP BY + client, cdn_type, is_main_document HAVING total >= 1000 -- Only include CDNs with meaningful traffic volume -ORDER BY client DESC, - total DESC \ No newline at end of file +ORDER BY + client DESC, + total DESC diff --git a/sql/2025/cdn/http3_performance_impact.sql b/sql/2025/cdn/http3_performance_impact.sql index 2f9f5979526..b29eec3f2ca 100644 --- a/sql/2025/cdn/http3_performance_impact.sql +++ b/sql/2025/cdn/http3_performance_impact.sql @@ -39,16 +39,16 @@ WITH protocol_performance AS ( -- Priority information JSON_EXTRACT_SCALAR(payload, '$._priority') AS priority -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' - AND client = 'mobile' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' AND + client = 'mobile' ) SELECT cdn_provider, protocol, COUNT(DISTINCT page) AS total_pages, - COUNT(*) AS total_requests, + COUNT(0) AS total_requests, -- Performance metrics by protocol APPROX_QUANTILES(ttfb, 100)[OFFSET(25)] AS p25_ttfb, @@ -81,7 +81,7 @@ SELECT -- Connection reuse (unique sockets vs requests) COUNT(DISTINCT socket_id) AS unique_connections, - SAFE_DIVIDE(COUNT(*), COUNT(DISTINCT socket_id)) AS avg_requests_per_connection, + SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT socket_id)) AS avg_requests_per_connection, -- Resource type breakdown COUNTIF(resource_type = 'Document') AS document_requests, @@ -89,10 +89,10 @@ SELECT COUNTIF(resource_type = 'Stylesheet') AS style_requests, COUNTIF(resource_type = 'Image') AS image_requests FROM 
protocol_performance -WHERE protocol IN ('H3', 'HTTP/2', 'HTTP/1.x') - AND ttfb IS NOT NULL - AND ttfb > 0 - AND ttfb < 10000 -- Filter outliers +WHERE protocol IN ('H3', 'HTTP/2', 'HTTP/1.x') AND + ttfb IS NOT NULL AND + ttfb > 0 AND + ttfb < 10000 -- Filter outliers GROUP BY cdn_provider, protocol HAVING total_requests > 1000 -ORDER BY cdn_provider, protocol \ No newline at end of file +ORDER BY cdn_provider, protocol diff --git a/sql/2025/cdn/security_headers_by_cdn.sql b/sql/2025/cdn/security_headers_by_cdn.sql index a508e814f75..fb914c3c91b 100644 --- a/sql/2025/cdn/security_headers_by_cdn.sql +++ b/sql/2025/cdn/security_headers_by_cdn.sql @@ -7,103 +7,103 @@ WITH security_headers AS ( client, url, is_main_document, - + -- CDN detection IFNULL( NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN' ) AS cdn, - + -- Extract specific security headers -- Note: response_headers is an array, so we need to check each element EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'strict-transport-security' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'strict-transport-security' ) AS has_hsts, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'content-security-policy' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'content-security-policy' ) AS has_csp, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'x-frame-options' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'x-frame-options' ) AS has_xfo, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'x-content-type-options' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'x-content-type-options' ) AS has_xcto, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'x-xss-protection' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'x-xss-protection' ) AS has_xxp, - + 
EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'referrer-policy' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'referrer-policy' ) AS has_referrer_policy, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'permissions-policy' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'permissions-policy' ) AS has_permissions_policy, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'feature-policy' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'feature-policy' ) AS has_feature_policy, - + -- Cache-Control directives EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'cache-control' - AND LOWER(h.value) LIKE '%no-store%' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'cache-control' AND + LOWER(h.value) LIKE '%no-store%' ) AS has_cache_no_store, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'cache-control' - AND LOWER(h.value) LIKE '%immutable%' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'cache-control' AND + LOWER(h.value) LIKE '%immutable%' ) AS has_cache_immutable, - + -- CORS headers EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'access-control-allow-origin' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'access-control-allow-origin' ) AS has_cors_origin, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'access-control-allow-methods' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'access-control-allow-methods' ) AS has_cors_methods, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'access-control-allow-headers' + FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'access-control-allow-headers' ) AS has_cors_headers, - + EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS h -WHERE LOWER(h.name) = 'access-control-allow-credentials' 
+ FROM UNNEST(response_headers) AS h + WHERE LOWER(h.name) = 'access-control-allow-credentials' ) AS has_cors_credentials, - + -- Extract CORS origin values for analysis ( SELECT h.value @@ -111,8 +111,8 @@ WHERE LOWER(h.name) = 'access-control-allow-credentials' WHERE LOWER(h.name) = 'access-control-allow-origin' LIMIT 1 ) AS cors_origin_value -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' ) SELECT @@ -120,72 +120,74 @@ SELECT cdn, is_main_document, COUNT(0) AS total_requests, - + -- Security headers adoption COUNTIF(has_hsts) AS hsts_count, ROUND(SAFE_DIVIDE(COUNTIF(has_hsts), COUNT(0)) * 100, 2) AS hsts_pct, - + COUNTIF(has_csp) AS csp_count, ROUND(SAFE_DIVIDE(COUNTIF(has_csp), COUNT(0)) * 100, 2) AS csp_pct, - + COUNTIF(has_xfo) AS xfo_count, ROUND(SAFE_DIVIDE(COUNTIF(has_xfo), COUNT(0)) * 100, 2) AS xfo_pct, - + COUNTIF(has_xcto) AS xcto_count, ROUND(SAFE_DIVIDE(COUNTIF(has_xcto), COUNT(0)) * 100, 2) AS xcto_pct, - + COUNTIF(has_xxp) AS xxp_count, ROUND(SAFE_DIVIDE(COUNTIF(has_xxp), COUNT(0)) * 100, 2) AS xxp_pct, - + COUNTIF(has_referrer_policy) AS referrer_policy_count, ROUND(SAFE_DIVIDE(COUNTIF(has_referrer_policy), COUNT(0)) * 100, 2) AS referrer_policy_pct, - + COUNTIF(has_permissions_policy OR has_feature_policy) AS permissions_policy_count, ROUND(SAFE_DIVIDE(COUNTIF(has_permissions_policy OR has_feature_policy), COUNT(0)) * 100, 2) AS permissions_policy_pct, - + -- Cache control adoption COUNTIF(has_cache_no_store) AS cache_no_store_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cache_no_store), COUNT(0)) * 100, 2) AS cache_no_store_pct, - + COUNTIF(has_cache_immutable) AS cache_immutable_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cache_immutable), COUNT(0)) * 100, 2) AS cache_immutable_pct, - + -- CORS headers adoption COUNTIF(has_cors_origin) AS cors_origin_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cors_origin), COUNT(0)) * 100, 2) AS cors_origin_pct, - + COUNTIF(has_cors_methods) AS 
cors_methods_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cors_methods), COUNT(0)) * 100, 2) AS cors_methods_pct, - + COUNTIF(has_cors_headers) AS cors_headers_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cors_headers), COUNT(0)) * 100, 2) AS cors_headers_pct, - + COUNTIF(has_cors_credentials) AS cors_credentials_count, ROUND(SAFE_DIVIDE(COUNTIF(has_cors_credentials), COUNT(0)) * 100, 2) AS cors_credentials_pct, - + -- CORS origin patterns (sample values) ARRAY_AGG(cors_origin_value IGNORE NULLS LIMIT 5) AS sample_cors_origins, - + -- Wildcard CORS usage COUNTIF(cors_origin_value = '*') AS cors_wildcard_count, ROUND(SAFE_DIVIDE(COUNTIF(cors_origin_value = '*'), COUNTIF(has_cors_origin)) * 100, 2) AS cors_wildcard_pct, - + -- Security score (how many headers out of 7 main ones) ROUND(AVG( - CAST(has_hsts AS INT64) + - CAST(has_csp AS INT64) + - CAST(has_xfo AS INT64) + - CAST(has_xcto AS INT64) + - CAST(has_xxp AS INT64) + - CAST(has_referrer_policy AS INT64) + + CAST(has_hsts AS INT64) + + CAST(has_csp AS INT64) + + CAST(has_xfo AS INT64) + + CAST(has_xcto AS INT64) + + CAST(has_xxp AS INT64) + + CAST(has_referrer_policy AS INT64) + CAST(has_permissions_policy OR has_feature_policy AS INT64) ), 2) AS avg_security_headers FROM security_headers -GROUP BY client, +GROUP BY + client, cdn, is_main_document HAVING total_requests >= 1000 -ORDER BY client DESC, +ORDER BY + client DESC, avg_security_headers DESC, total_requests DESC diff --git a/sql/2025/cdn/server_timing_adoption.sql b/sql/2025/cdn/server_timing_adoption.sql index 3eb46397611..136a4d2270c 100644 --- a/sql/2025/cdn/server_timing_adoption.sql +++ b/sql/2025/cdn/server_timing_adoption.sql @@ -15,41 +15,41 @@ WITH server_timing_analysis AS ( -- Check for Server-Timing header EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'server-timing' + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'server-timing' ) AS has_server_timing, -- Extract Server-Timing value for 
analysis ( SELECT header.value -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'server-timing' -LIMIT 1 + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'server-timing' + LIMIT 1 ) AS server_timing_value, -- Other performance headers EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'x-cache' + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'x-cache' ) AS has_x_cache, EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'x-cdn' + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'x-cdn' ) AS has_x_cdn, EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'cf-ray' -- Cloudflare specific + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'cf-ray' -- Cloudflare specific ) AS has_cf_ray, EXISTS( SELECT 1 -FROM UNNEST(response_headers) AS header -WHERE LOWER(header.name) = 'x-amz-cf-id' -- CloudFront specific + FROM UNNEST(response_headers) AS header + WHERE LOWER(header.name) = 'x-amz-cf-id' -- CloudFront specific ) AS has_amz_cf_id, -- Performance metrics @@ -60,24 +60,24 @@ WHERE LOWER(header.name) = 'x-amz-cf-id' -- CloudFront specific -- Resource info JSON_EXTRACT_SCALAR(summary, '$.type') AS resource_type, JSON_EXTRACT_SCALAR(summary, '$.respSize') AS response_size -FROM `httparchive.crawl.requests` -WHERE date = '2025-07-01' - AND client = 'mobile' + FROM `httparchive.crawl.requests` + WHERE date = '2025-07-01' AND + client = 'mobile' ) SELECT cdn_provider, COUNT(DISTINCT page) AS total_pages, - COUNT(*) AS total_requests, + COUNT(0) AS total_requests, -- Server-Timing adoption COUNTIF(has_server_timing) AS requests_with_server_timing, - SAFE_DIVIDE(COUNTIF(has_server_timing) * 100.0, COUNT(*)) AS pct_server_timing, + SAFE_DIVIDE(COUNTIF(has_server_timing) * 100.0, COUNT(0)) AS pct_server_timing, -- Other transparency headers COUNTIF(has_x_cache) AS 
requests_with_x_cache, COUNTIF(has_x_cdn) AS requests_with_x_cdn, - SAFE_DIVIDE(COUNTIF(has_x_cache) * 100.0, COUNT(*)) AS pct_x_cache, + SAFE_DIVIDE(COUNTIF(has_x_cache) * 100.0, COUNT(0)) AS pct_x_cache, -- CDN-specific headers COUNTIF(has_cf_ray) AS cloudflare_ray_headers, @@ -102,4 +102,4 @@ WHERE cdn_provider IS NOT NULL GROUP BY cdn_provider HAVING total_requests > 1000 ORDER BY pct_server_timing DESC -LIMIT 100 \ No newline at end of file +LIMIT 100 From 2eae0f2f2123a43de01bdd6df2fc09e80ce9cf38 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Wed, 14 Jan 2026 00:12:30 +0000 Subject: [PATCH 3/3] Linting --- sql/2025/cdn/alt_svc_protocol_analysis.sql | 2 +- sql/2025/cdn/brotli_compression_adoption.sql | 2 +- sql/2025/cdn/cache_control_analysis.sql | 16 ++++++++-------- sql/2025/cdn/security_headers_by_cdn.sql | 2 +- sql/2025/cdn/server_timing_adoption.sql | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sql/2025/cdn/alt_svc_protocol_analysis.sql b/sql/2025/cdn/alt_svc_protocol_analysis.sql index a19914da36b..e6c12928600 100644 --- a/sql/2025/cdn/alt_svc_protocol_analysis.sql +++ b/sql/2025/cdn/alt_svc_protocol_analysis.sql @@ -29,7 +29,7 @@ WITH alt_svc_analysis AS ( SELECT h.value FROM UNNEST(response_headers) AS h WHERE LOWER(h.name) = 'alt-svc' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS alt_svc_header, -- Check if Alt-Svc exists diff --git a/sql/2025/cdn/brotli_compression_adoption.sql b/sql/2025/cdn/brotli_compression_adoption.sql index 9f0188221f7..c7f502603cb 100644 --- a/sql/2025/cdn/brotli_compression_adoption.sql +++ b/sql/2025/cdn/brotli_compression_adoption.sql @@ -39,7 +39,7 @@ WITH compression_analysis AS ( SELECT LOWER(h.value) FROM UNNEST(response_headers) AS h WHERE LOWER(h.name) = 'content-encoding' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS content_encoding, -- Vary header check (indicates dynamic compression support) diff --git a/sql/2025/cdn/cache_control_analysis.sql b/sql/2025/cdn/cache_control_analysis.sql index 
6b2bfc3acc6..cbd663a9fa4 100644 --- a/sql/2025/cdn/cache_control_analysis.sql +++ b/sql/2025/cdn/cache_control_analysis.sql @@ -24,7 +24,7 @@ WITH cache_analysis AS ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'cache-control' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS cache_control, -- Extract other cache-related headers @@ -32,25 +32,25 @@ WITH cache_analysis AS ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'expires' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS expires_header, ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'etag' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS etag, ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'last-modified' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS last_modified, ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'vary' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS vary_header, -- CDN-specific headers @@ -58,19 +58,19 @@ WITH cache_analysis AS ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'x-cache' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS x_cache, ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'cf-cache-status' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS cf_cache_status, ( SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'x-served-by' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS x_served_by FROM `httparchive.crawl.requests` WHERE date = '2025-07-01' diff --git a/sql/2025/cdn/security_headers_by_cdn.sql b/sql/2025/cdn/security_headers_by_cdn.sql index fb914c3c91b..3cf82e7f28f 100644 --- a/sql/2025/cdn/security_headers_by_cdn.sql +++ b/sql/2025/cdn/security_headers_by_cdn.sql @@ -109,7 +109,7 @@ WITH security_headers AS ( SELECT h.value FROM UNNEST(response_headers) AS h WHERE LOWER(h.name) = 'access-control-allow-origin' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS cors_origin_value FROM `httparchive.crawl.requests` WHERE date = '2025-07-01' diff --git a/sql/2025/cdn/server_timing_adoption.sql b/sql/2025/cdn/server_timing_adoption.sql 
index 136a4d2270c..32bd389cf37 100644 --- a/sql/2025/cdn/server_timing_adoption.sql +++ b/sql/2025/cdn/server_timing_adoption.sql @@ -24,7 +24,7 @@ WITH server_timing_analysis AS ( SELECT header.value FROM UNNEST(response_headers) AS header WHERE LOWER(header.name) = 'server-timing' - LIMIT 1 + LIMIT 1 -- noqa: AM09 ) AS server_timing_value, -- Other performance headers