From c2c225631aeb09cdf91d053133427106959728d9 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 11:09:50 -0400 Subject: [PATCH 001/150] Create a11y_frontend_technology.sql Adding updated sql. --- .../a11y_frontend_technology.sql | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 sql/2025/accessibility/a11y_frontend_technology.sql diff --git a/sql/2025/accessibility/a11y_frontend_technology.sql b/sql/2025/accessibility/a11y_frontend_technology.sql new file mode 100644 index 00000000000..4e87e845838 --- /dev/null +++ b/sql/2025/accessibility/a11y_frontend_technology.sql @@ -0,0 +1,61 @@ +-- This query analyzes Lighthouse category scores (performance, accessibility, +-- best-practices, SEO) for all pages (root and secondary; no is_root_page filter is applied) in the HTTP Archive crawl on 2025-07-01. +-- It extracts scores from the JSON `lighthouse` field, associates each page +-- with detected frontend frameworks or JS libraries (via the `technologies` +-- array), and calculates the average scores per framework and client (desktop/mobile). +-- The results include: +-- • avg_performance_score +-- • avg_accessibility_score +-- • avg_best_practices_score +-- • avg_seo_score +-- • total_pages (distinct pages per framework/client) +-- Ordered by the frameworks with the most pages. 
+ +WITH score_data AS ( + SELECT + client, + page, + SAFE_CAST(JSON_VALUE(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + SAFE_CAST(JSON_VALUE(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + SAFE_CAST(JSON_VALUE(lighthouse, '$.categories."best-practices".score') AS FLOAT64) AS best_practices_score, + SAFE_CAST(JSON_VALUE(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score, + t.technology AS framework + FROM + `httparchive.crawl.pages` + CROSS JOIN + UNNEST(technologies) AS t + WHERE + date = '2025-07-01' + AND lighthouse IS NOT NULL + AND JSON_TYPE(lighthouse) = 'object' + AND ( + 'Web frameworks' IN UNNEST(t.categories) OR + 'JavaScript libraries' IN UNNEST(t.categories) OR + 'Frontend frameworks' IN UNNEST(t.categories) OR + 'JavaScript frameworks' IN UNNEST(t.categories) + ) + AND t.technology IS NOT NULL +) +SELECT + client, + framework, + AVG(performance_score) AS avg_performance_score, + AVG(accessibility_score) AS avg_accessibility_score, + AVG(best_practices_score) AS avg_best_practices_score, + AVG(seo_score) AS avg_seo_score, + COUNT(DISTINCT page) AS total_pages +FROM ( + SELECT + client, + page, + framework, + AVG(performance_score) AS performance_score, + AVG(accessibility_score) AS accessibility_score, + AVG(best_practices_score) AS best_practices_score, + AVG(seo_score) AS seo_score + FROM score_data + WHERE performance_score IS NOT NULL + GROUP BY client, page, framework +) +GROUP BY client, framework +ORDER BY total_pages DESC; From de81dac6b740e279f0d45089b1eefbdcd257fe1a Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 11:36:52 -0400 Subject: [PATCH 002/150] Create a11y_overall_tech_usage_by_domain_rank.sql --- ...a11y_overall_tech_usage_by_domain_rank.sql | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql diff --git 
a/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql new file mode 100644 index 00000000000..a194513b4bd --- /dev/null +++ b/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql @@ -0,0 +1,92 @@ +#standardSQL +# Purpose +# Estimate the share of pages using Accessibility-category technologies (e.g., overlays) +# by client and domain-rank buckets for the 2025-07-01 HTTP Archive crawl. +# This preserves the original behavior: +# • Unit: pages (COUNT DISTINCT page), not sites +# • Grouping: client, is_root_page, rank_grouping (top N thresholds) +# • Technology filter: category = 'Accessibility' +# +# Sampling (for cheap test runs) +# To cut cost without biasing the percentage, this script uses a deterministic hash +# sampler applied IDENTICALLY in both numerator and denominator: +# MOD(ABS(FARM_FINGERPRINT(page)), cfg.modulus) = cfg.remainder +# Set cfg.enable_sample = TRUE and pick a modulus to control sample size +# (e.g., 100_000 keeps ~1 in 100,000 pages = 0.001%). For full accuracy, set enable_sample = FALSE. +# +# If you prefer TABLESAMPLE, you must apply it consistently to EVERY reference of +# `httparchive.crawl.pages` (both subqueries). Otherwise the percentage is biased. +# +# Notes on comparability +# • Logic and outputs match your original query; only a consistent sampler was added. +# • Remove or disable sampling for production numbers. + +WITH cfg AS ( + SELECT AS STRUCT + FALSE AS enable_sample, -- set to TRUE for sample + 100000 AS modulus, + 0 AS remainder +) + +# Main SELECT statement to aggregate results by client and rank grouping. 
+SELECT + client, + is_root_page, + rank_grouping, # Grouping of domains by rank threshold + total_in_rank, # Total number of pages within the rank grouping + COUNT(DISTINCT page) AS sites_with_a11y_tech, # Pages with Accessibility tech + COUNT(DISTINCT page) / total_in_rank AS pct_sites_with_a11y_tech # Share within bucket +FROM + ( + # Subquery: pages with Accessibility technology (numerator) + SELECT DISTINCT + p.client, + p.is_root_page, + p.page, + rank_grouping, + category + FROM + `httparchive.crawl.pages` AS p, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping, + UNNEST(p.technologies) AS tech, + UNNEST(categories) AS category, + cfg + WHERE + p.date = '2025-07-01' + AND category = 'Accessibility' + AND p.rank <= rank_grouping + AND (NOT cfg.enable_sample OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.modulus) = cfg.remainder) + # If you insist on TABLESAMPLE instead of hash sampling, replace the line above with: + # -- and also add TABLESAMPLE to the denominator subquery below: + # -- FROM `httparchive.crawl.pages` TABLESAMPLE SYSTEM (0.01 PERCENT) AS p, ... + ) +JOIN + ( + # Subquery: total pages in each rank grouping (denominator) + SELECT + p.client, + rank_grouping, + COUNT(0) AS total_in_rank + FROM + `httparchive.crawl.pages` AS p, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping, + cfg + WHERE + p.date = '2025-07-01' + AND p.rank <= rank_grouping + AND (NOT cfg.enable_sample OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.modulus) = cfg.remainder) + # If using TABLESAMPLE, apply the SAME TABLESAMPLE here too. 
+ GROUP BY + p.client, + rank_grouping + ) +USING (client, rank_grouping) +GROUP BY + client, + is_root_page, + rank_grouping, + total_in_rank +ORDER BY + client, + is_root_page, + rank_grouping; From 897ccb75ccef1a9b82400dbecd22d793a79a9a97 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 11:44:07 -0400 Subject: [PATCH 003/150] Create a11y_technology_usage.sql upgrading crawl --- .../accessibility/a11y_technology_usage.sql | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 sql/2025/accessibility/a11y_technology_usage.sql diff --git a/sql/2025/accessibility/a11y_technology_usage.sql b/sql/2025/accessibility/a11y_technology_usage.sql new file mode 100644 index 00000000000..95acb4cd5f2 --- /dev/null +++ b/sql/2025/accessibility/a11y_technology_usage.sql @@ -0,0 +1,43 @@ +#standardSQL +# Purpose +# Measure the prevalence of Accessibility-related technologies (e.g., overlays) +# detected by Wappalyzer in the HTTP Archive crawl. +# The query calculates, by client (desktop/mobile) and root-page flag: +# • total_sites = number of unique pages (deduped by page URL) +# • sites_with_a11y_tech = number of those pages that use at least one +# technology categorized as "Accessibility" +# • pct_sites_with_a11y_tech = share of pages with Accessibility tech +# +# Sampling +# • `TABLESAMPLE SYSTEM (0.01 PERCENT)` is included for cheap testing. +# This dramatically reduces cost by scanning ~0.01% of rows. +# • IMPORTANT: Results will be noisy and not comparable across runs. +# • For accurate, publishable numbers, remove the TABLESAMPLE clause. +# +# Notes +# • The unit here is page (URL), not site/host. COUNT(DISTINCT page) ensures +# each unique URL is counted once. +# • `is_root_page` is grouped so you can compare root vs non-root behavior. +# • If you want “site-level” adoption (combining multiple pages under one host), +# extract the host from `page` (e.g., REGEXP_EXTRACT) and count distinct hosts instead. 
+# • Percentages are computed as ratio of DISTINCT page counts. +# • This query applies no sampling. The comma joins with UNNEST drop pages whose technologies (or categories) arrays are empty, so total_sites counts only pages with at least one categorized technology. + +SELECT + client, -- desktop or mobile + is_root_page, + COUNT(DISTINCT page) AS total_sites, -- unique pages for this client/root flag + COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) AS sites_with_a11y_tech, + COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) / COUNT(DISTINCT page) AS pct_sites_with_a11y_tech +FROM + `httparchive.crawl.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS category +WHERE + date = '2025-07-01' +GROUP BY + client, + is_root_page +ORDER BY + client, + is_root_page; From 0abbb8a6324486a700ef01a345e70669689aa3ef Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 11:47:45 -0400 Subject: [PATCH 004/150] Update a11y_technology_usage.sql removing sampling --- sql/2025/accessibility/a11y_technology_usage.sql | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sql/2025/accessibility/a11y_technology_usage.sql b/sql/2025/accessibility/a11y_technology_usage.sql index 95acb4cd5f2..b652b8df1a1 100644 --- a/sql/2025/accessibility/a11y_technology_usage.sql +++ b/sql/2025/accessibility/a11y_technology_usage.sql @@ -8,12 +8,6 @@ # technology categorized as "Accessibility" # • pct_sites_with_a11y_tech = share of pages with Accessibility tech # -# Sampling -# • `TABLESAMPLE SYSTEM (0.01 PERCENT)` is included for cheap testing. -# This dramatically reduces cost by scanning ~0.01% of rows. -# • IMPORTANT: Results will be noisy and not comparable across runs. -# • For accurate, publishable numbers, remove the TABLESAMPLE clause. -# # Notes # • The unit here is page (URL), not site/host. COUNT(DISTINCT page) ensures # each unique URL is counted once. 
From 0d449c5c774d48f6cdbc747cd50c4b6a06c9dcc5 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 11:55:57 -0400 Subject: [PATCH 005/150] Create a11y_technology_usage_by_domain_rank.sql --- .../a11y_technology_usage_by_domain_rank.sql | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql diff --git a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql new file mode 100644 index 00000000000..950e2c76798 --- /dev/null +++ b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql @@ -0,0 +1,84 @@ +#standardSQL +# Purpose +# Report usage of Accessibility-category technologies by domain-rank buckets, +# split by client and root-page flag. For each rank bucket, compute: +# • total_in_rank = distinct pages in the bucket +# • sites_with_app = distinct pages using each Accessibility technology +# • pct_sites_with_app = sites_with_app / total_in_rank +# +# How to use the toggle the deterministic hash sampler +# • For cheap test runs: set cfg.enable_sample = TRUE and pick a modulus (e.g. 10000 ≈ ~0.01%). +# • For full production: set cfg.enable_sample = FALSE (sampler bypassed). 
+ +WITH cfg AS ( + SELECT + FALSE AS enable_sample, -- set TRUE for full table (no sampling) + 10000 AS modulus, -- larger = smaller sample (e.g., 10000 ≈ 0.01%) + 0 AS remainder -- choose any remainder in [0, modulus-1] for a different slice +), + +ranked_sites AS ( + -- Base set of pages with rank bucket + SELECT + p.client, + p.is_root_page, + p.page, + p.rank, + p.technologies, + CASE + WHEN p.rank <= 1000 THEN 1000 + WHEN p.rank <= 10000 THEN 10000 + WHEN p.rank <= 100000 THEN 100000 + WHEN p.rank <= 1000000 THEN 1000000 + WHEN p.rank <= 10000000 THEN 10000000 + WHEN p.rank <= 100000000 THEN 100000000 + ELSE NULL + END AS rank_grouping + FROM `httparchive.crawl.pages` AS p, cfg + WHERE + p.date = '2025-07-01' + AND p.rank IS NOT NULL + AND p.rank <= 100000000 + AND ( + NOT cfg.enable_sample + OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.modulus) = cfg.remainder + ) +), + +rank_totals AS ( + -- Denominator: total distinct pages per client/root/rank bucket + SELECT + client, + is_root_page, + rank_grouping, + COUNT(DISTINCT page) AS total_in_rank + FROM ranked_sites + WHERE rank_grouping IS NOT NULL + GROUP BY client, is_root_page, rank_grouping +) + +SELECT + r.client, + r.is_root_page, + r.rank_grouping, + rt.total_in_rank, -- denominator + tech.technology AS app, -- each Accessibility technology + COUNT(DISTINCT r.page) AS sites_with_app, -- pages using that app + SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app +FROM ranked_sites AS r +CROSS JOIN UNNEST(r.technologies) AS tech +JOIN rank_totals AS rt + ON rt.client = r.client + AND rt.is_root_page = r.is_root_page + AND rt.rank_grouping = r.rank_grouping +WHERE + r.rank_grouping IS NOT NULL + AND EXISTS ( + SELECT 1 + FROM UNNEST(tech.categories) AS c + WHERE c = 'Accessibility' + ) +GROUP BY + r.client, r.is_root_page, r.rank_grouping, rt.total_in_rank, tech.technology +ORDER BY + tech.technology, r.rank_grouping, r.client, r.is_root_page; From 
326969433bade021fa6e4298a8ee4452d4d29bbd Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 12:04:25 -0400 Subject: [PATCH 006/150] Update a11y_technology_usage_by_domain_rank.sql Updating to scan whole domain --- .../accessibility/a11y_technology_usage_by_domain_rank.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql index 950e2c76798..79d88f788d1 100644 --- a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql +++ b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql @@ -12,7 +12,7 @@ WITH cfg AS ( SELECT - FALSE AS enable_sample, -- set TRUE for full table (no sampling) + TRUE AS enable_sample, -- set TRUE for full table (no sampling) 10000 AS modulus, -- larger = smaller sample (e.g., 10000 ≈ 0.01%) 0 AS remainder -- choose any remainder in [0, modulus-1] for a different slice ), @@ -64,7 +64,8 @@ SELECT rt.total_in_rank, -- denominator tech.technology AS app, -- each Accessibility technology COUNT(DISTINCT r.page) AS sites_with_app, -- pages using that app - SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app + -- SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app + ROUND(100 * SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank), 1) AS pct_sites_with_app FROM ranked_sites AS r CROSS JOIN UNNEST(r.technologies) AS tech JOIN rank_totals AS rt From 7ab00177f1b3c45c2e9574191f86daf0ee68a58c Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 12:13:41 -0400 Subject: [PATCH 007/150] Update a11y_technology_usage_by_domain_rank.sql - update --- .../a11y_technology_usage_by_domain_rank.sql | 98 +++++++++++-------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql index 
79d88f788d1..3563676dd4d 100644 --- a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql +++ b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql @@ -1,30 +1,35 @@ #standardSQL # Purpose -# Report usage of Accessibility-category technologies by domain-rank buckets, -# split by client and root-page flag. For each rank bucket, compute: -# • total_in_rank = distinct pages in the bucket -# • sites_with_app = distinct pages using each Accessibility technology -# • pct_sites_with_app = sites_with_app / total_in_rank +# Report prevalence of ALL Accessibility-category technologies by domain-rank bucket, +# split by client and root-page flag, for a single crawl date. +# For each (client, is_root_page, rank_grouping, app) compute: +# • total_in_rank = distinct pages in the bucket (denominator) +# • sites_with_app = distinct pages using that technology +# • pct_sites_with_app = 100 * sites_with_app / total_in_rank (percent points) # -# How to use the toggle the deterministic hash sampler -# • For cheap test runs: set cfg.enable_sample = TRUE and pick a modulus (e.g. 10000 ≈ ~0.01%). -# • For full production: set cfg.enable_sample = FALSE (sampler bypassed). +# Design +# • Shared base CTE (ranked_pages) feeds both numerator and denominator. +# • LEFT JOIN keeps buckets even when a vendor has zero hits. +# • Toggleable deterministic sampler for cheap tests; disable for production. +# • Unit is page (distinct URL). For site-level, replace DISTINCT page with +# DISTINCT REGEXP_EXTRACT(page, r'^https?://([^/]+)') in BOTH numerator and denominator. +# +# How to use +# • Set cfg.enable_sample = FALSE for YoY comparability (full partition). +# • Keep the date, bucket CASE thresholds, and filters identical across years. 
WITH cfg AS ( SELECT - TRUE AS enable_sample, -- set TRUE for full table (no sampling) - 10000 AS modulus, -- larger = smaller sample (e.g., 10000 ≈ 0.01%) - 0 AS remainder -- choose any remainder in [0, modulus-1] for a different slice + FALSE AS enable_sample, -- set TRUE for cheap tests; FALSE for full run + 10000 AS modulus, -- if sampling: ~0.01% when TRUE + 0 AS remainder ), -ranked_sites AS ( - -- Base set of pages with rank bucket +ranked_pages AS ( SELECT p.client, p.is_root_page, p.page, - p.rank, - p.technologies, CASE WHEN p.rank <= 1000 THEN 1000 WHEN p.rank <= 10000 THEN 10000 @@ -33,7 +38,8 @@ ranked_sites AS ( WHEN p.rank <= 10000000 THEN 10000000 WHEN p.rank <= 100000000 THEN 100000000 ELSE NULL - END AS rank_grouping + END AS rank_grouping, + p.technologies FROM `httparchive.crawl.pages` AS p, cfg WHERE p.date = '2025-07-01' @@ -46,40 +52,48 @@ ranked_sites AS ( ), rank_totals AS ( - -- Denominator: total distinct pages per client/root/rank bucket SELECT client, is_root_page, rank_grouping, COUNT(DISTINCT page) AS total_in_rank - FROM ranked_sites + FROM ranked_pages WHERE rank_grouping IS NOT NULL GROUP BY client, is_root_page, rank_grouping +), + +vendor_hits AS ( + -- Count pages per Accessibility vendor + SELECT + rp.client, + rp.is_root_page, + rp.rank_grouping, + tech.technology AS app, + COUNT(DISTINCT rp.page) AS pages_with_vendor + FROM ranked_pages rp + CROSS JOIN UNNEST(rp.technologies) AS tech + WHERE + rp.rank_grouping IS NOT NULL + AND EXISTS ( + SELECT 1 + FROM UNNEST(tech.categories) AS c + WHERE c = 'Accessibility' + ) + GROUP BY rp.client, rp.is_root_page, rp.rank_grouping, app ) SELECT - r.client, - r.is_root_page, - r.rank_grouping, - rt.total_in_rank, -- denominator - tech.technology AS app, -- each Accessibility technology - COUNT(DISTINCT r.page) AS sites_with_app, -- pages using that app - -- SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app - ROUND(100 * SAFE_DIVIDE(COUNT(DISTINCT r.page), 
rt.total_in_rank), 1) AS pct_sites_with_app -FROM ranked_sites AS r -CROSS JOIN UNNEST(r.technologies) AS tech -JOIN rank_totals AS rt - ON rt.client = r.client - AND rt.is_root_page = r.is_root_page - AND rt.rank_grouping = r.rank_grouping -WHERE - r.rank_grouping IS NOT NULL - AND EXISTS ( - SELECT 1 - FROM UNNEST(tech.categories) AS c - WHERE c = 'Accessibility' - ) -GROUP BY - r.client, r.is_root_page, r.rank_grouping, rt.total_in_rank, tech.technology + rt.client, + rt.is_root_page, + rt.rank_grouping, + rt.total_in_rank, + vh.app, + IFNULL(vh.pages_with_vendor, 0) AS sites_with_app, + ROUND(100 * SAFE_DIVIDE(IFNULL(vh.pages_with_vendor, 0), rt.total_in_rank), 1) AS pct_sites_with_app +FROM rank_totals rt +LEFT JOIN vendor_hits vh + ON vh.client = rt.client + AND vh.is_root_page = rt.is_root_page + AND vh.rank_grouping = rt.rank_grouping ORDER BY - tech.technology, r.rank_grouping, r.client, r.is_root_page; + vh.app, rt.rank_grouping, rt.client, rt.is_root_page; From f962c3bcb3cbe80fe58d36e327cc8800ec2e2f8b Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 12:39:33 -0400 Subject: [PATCH 008/150] Create alt_ending_in_image_extension.sql --- .../alt_ending_in_image_extension.sql | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 sql/2025/accessibility/alt_ending_in_image_extension.sql diff --git a/sql/2025/accessibility/alt_ending_in_image_extension.sql b/sql/2025/accessibility/alt_ending_in_image_extension.sql new file mode 100644 index 00000000000..936b6febce7 --- /dev/null +++ b/sql/2025/accessibility/alt_ending_in_image_extension.sql @@ -0,0 +1,129 @@ +-- UDF must come before any WITH/SELECT +CREATE TEMPORARY FUNCTION getUsedExtensions(payload STRING) +RETURNS ARRAY<STRUCT<extension STRING, total INT64>> +LANGUAGE js AS r''' +try { + if (!payload) return []; + const a11y = JSON.parse(payload); + const fe = a11y && a11y.file_extension_alts && a11y.file_extension_alts.file_extensions; + if (!fe || typeof fe !== 'object') return []; + return 
Object.entries(fe).map(([extension, total]) => ({ + extension, + total: Number(total) || 0 + })); +} catch (e) { return []; } +'''; + +#standardSQL +# Purpose +# Quantify how often alt text ends with an image file extension +# (e.g., “.jpg”, “.png”) for a single crawl date, split by client and is_root_page. +# +# Outputs (per client, is_root_page, extension) +# • sites_with_non_empty_alt – sites (distinct pages) with ≥1 non-empty alt +# • sites_with_file_extension_alt – sites with ≥1 alt ending in a file extension +# • total_non_empty_alts – total non-empty alts across those sites +# • total_alts_with_file_extensions – total alts that end in a file extension +# • extension – specific file extension (e.g., "jpg") +# • total_sites_using – sites with ≥1 alt ending in this extension +# • pct_sites_with_file_extension_alt – share of applicable sites with any file-extension alt +# • pct_alts_with_file_extension – share of all non-empty alts that end with any extension +# • pct_applicable_sites_using – share of applicable sites using this specific extension +# • total_occurrences – total occurrences of this extension in alt text +# • pct_total_occurrences – share of all file-extension alt occurrences that are this extension +# +-- Set the crawl date once here +WITH + params AS (SELECT DATE '2025-07-01' AS crawl_date), + + -- Cost-saving toggle (remove later for full run) + cfg AS ( + SELECT + FALSE AS enable_sample, -- set to FALSE (or delete cfg + predicates) for full run + 100000 AS modulus, -- larger modulus => smaller sample (~0.01% here) + 0 AS remainder + ), + + -- Sampled base: filter BEFORE any JSON parsing/UNNEST; select only needed subfields + base_pages AS ( + SELECT + p.client, + p.is_root_page, + p.page, + p.custom_metrics.a11y AS a11y, -- JSON + p.custom_metrics.markup AS markup -- JSON + FROM `httparchive.crawl.pages` AS p + JOIN params ON p.date = params.crawl_date + CROSS JOIN cfg + WHERE + NOT cfg.enable_sample + OR MOD(ABS(FARM_FINGERPRINT(p.page)), 
cfg.modulus) = cfg.remainder + ), + + -- Denominators and "any extension" numerators, from the SAME sampled base + site_rollup AS ( + SELECT + bp.client, + bp.is_root_page, + COUNTIF(total_non_empty_alt > 0) AS sites_with_non_empty_alt, + COUNTIF(total_with_file_extension > 0) AS sites_with_file_extension_alt, + SUM(total_non_empty_alt) AS total_non_empty_alts, + SUM(total_with_file_extension) AS total_alts_with_file_extensions + FROM ( + SELECT + bp.client, + bp.is_root_page, + SAFE_CAST(JSON_VALUE(bp.markup, '$.images.img.alt.present') AS INT64) AS total_non_empty_alt, + SAFE_CAST(JSON_VALUE(bp.a11y, '$.file_extension_alts.total_with_file_extension') AS INT64) AS total_with_file_extension + FROM base_pages AS bp + ) bp + GROUP BY bp.client, bp.is_root_page + ) + +SELECT + bp.client, + bp.is_root_page, + sr.sites_with_non_empty_alt, + sr.sites_with_file_extension_alt, + sr.total_alts_with_file_extensions, + + -- Of sites with a non-empty alt, what % have any alt with a file extension + SAFE_DIVIDE(sr.sites_with_file_extension_alt, sr.sites_with_non_empty_alt) AS pct_sites_with_file_extension_alt, + + -- Given a random non-empty alt, how often does it end in a file extension + SAFE_DIVIDE(sr.total_alts_with_file_extensions, sr.total_non_empty_alts) AS pct_alts_with_file_extension, + + ext.extension AS extension, + + -- Sites using this specific extension at least once (still from the sampled base) + COUNT(DISTINCT bp.page) AS total_sites_using, + + -- Of applicable sites, what % use this specific extension at least once + SAFE_DIVIDE(COUNT(DISTINCT bp.page), sr.sites_with_non_empty_alt) AS pct_applicable_sites_using, + + -- Total occurrences of this specific extension in alts + SUM(ext.total) AS total_occurrences, + + -- Given a random file-extension alt, how often is it this extension + SAFE_DIVIDE(SUM(ext.total), sr.total_alts_with_file_extensions) AS pct_total_occurrences + +FROM base_pages AS bp +CROSS JOIN UNNEST( + getUsedExtensions(TO_JSON_STRING(bp.a11y)) 
+) AS ext +LEFT JOIN site_rollup sr + ON sr.client = bp.client AND sr.is_root_page = bp.is_root_page + +GROUP BY + bp.client, + bp.is_root_page, + sr.sites_with_non_empty_alt, + sr.sites_with_file_extension_alt, + sr.total_alts_with_file_extensions, + sr.total_non_empty_alts, + ext.extension + +ORDER BY + bp.client, + bp.is_root_page, + total_occurrences DESC; From 365f90bfcb3f9e29349ff109e7cf6cf6bf5ead7a Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 12:49:36 -0400 Subject: [PATCH 009/150] Update alt_ending_in_image_extension.sql - update --- .../alt_ending_in_image_extension.sql | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/sql/2025/accessibility/alt_ending_in_image_extension.sql b/sql/2025/accessibility/alt_ending_in_image_extension.sql index 936b6febce7..ab15dada4be 100644 --- a/sql/2025/accessibility/alt_ending_in_image_extension.sql +++ b/sql/2025/accessibility/alt_ending_in_image_extension.sql @@ -22,36 +22,37 @@ try { # Outputs (per client, is_root_page, extension) # • sites_with_non_empty_alt – sites (distinct pages) with ≥1 non-empty alt # • sites_with_file_extension_alt – sites with ≥1 alt ending in a file extension -# • total_non_empty_alts – total non-empty alts across those sites # • total_alts_with_file_extensions – total alts that end in a file extension +# • pct_sites_with_file_extension_alt – string percentage of sites with file-extension alt +# • pct_alts_with_file_extension – string percentage of alts ending with any extension # • extension – specific file extension (e.g., "jpg") # • total_sites_using – sites with ≥1 alt ending in this extension -# • pct_sites_with_file_extension_alt – share of applicable sites with any file-extension alt -# • pct_alts_with_file_extension – share of all non-empty alts that end with any extension -# • pct_applicable_sites_using – share of applicable sites using this specific extension +# • pct_applicable_sites_using – string percentage of sites with 
non-empty alts using this extension # • total_occurrences – total occurrences of this extension in alt text -# • pct_total_occurrences – share of all file-extension alt occurrences that are this extension +# • pct_total_occurrences – string percentage of extension occurrences among all file-extension alts +# +# Notes +# • Percentages are returned as formatted strings (e.g. "6.7%") to match 2024 output style. +# • Underlying math uses SAFE_DIVIDE to avoid divide-by-zero. +# • Cost-saving sampler included; remove cfg CTE and predicates or set enable_sample=FALSE for full run. # --- Set the crawl date once here WITH params AS (SELECT DATE '2025-07-01' AS crawl_date), - -- Cost-saving toggle (remove later for full run) cfg AS ( SELECT - FALSE AS enable_sample, -- set to FALSE (or delete cfg + predicates) for full run - 100000 AS modulus, -- larger modulus => smaller sample (~0.01% here) + TRUE AS enable_sample, -- set FALSE (or delete cfg + predicates) for full run + 10000 AS modulus, -- larger modulus => smaller sample (~0.01% here) 0 AS remainder ), - -- Sampled base: filter BEFORE any JSON parsing/UNNEST; select only needed subfields base_pages AS ( SELECT p.client, p.is_root_page, p.page, - p.custom_metrics.a11y AS a11y, -- JSON - p.custom_metrics.markup AS markup -- JSON + p.custom_metrics.a11y AS a11y, + p.custom_metrics.markup AS markup FROM `httparchive.crawl.pages` AS p JOIN params ON p.date = params.crawl_date CROSS JOIN cfg @@ -60,7 +61,6 @@ WITH OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.modulus) = cfg.remainder ), - -- Denominators and "any extension" numerators, from the SAME sampled base site_rollup AS ( SELECT bp.client, @@ -87,30 +87,26 @@ SELECT sr.sites_with_file_extension_alt, sr.total_alts_with_file_extensions, - -- Of sites with a non-empty alt, what % have any alt with a file extension - SAFE_DIVIDE(sr.sites_with_file_extension_alt, sr.sites_with_non_empty_alt) AS pct_sites_with_file_extension_alt, - - -- Given a random non-empty alt, how often 
does it end in a file extension - SAFE_DIVIDE(sr.total_alts_with_file_extensions, sr.total_non_empty_alts) AS pct_alts_with_file_extension, + -- Percentages as strings + FORMAT('%.1f%%', 100 * SAFE_DIVIDE(sr.sites_with_file_extension_alt, sr.sites_with_non_empty_alt)) + AS pct_sites_with_file_extension_alt, + FORMAT('%.1f%%', 100 * SAFE_DIVIDE(sr.total_alts_with_file_extensions, sr.total_non_empty_alts)) + AS pct_alts_with_file_extension, - ext.extension AS extension, + ext.extension, - -- Sites using this specific extension at least once (still from the sampled base) COUNT(DISTINCT bp.page) AS total_sites_using, - -- Of applicable sites, what % use this specific extension at least once - SAFE_DIVIDE(COUNT(DISTINCT bp.page), sr.sites_with_non_empty_alt) AS pct_applicable_sites_using, + FORMAT('%.1f%%', 100 * SAFE_DIVIDE(COUNT(DISTINCT bp.page), sr.sites_with_non_empty_alt)) + AS pct_applicable_sites_using, - -- Total occurrences of this specific extension in alts SUM(ext.total) AS total_occurrences, - -- Given a random file-extension alt, how often is it this extension - SAFE_DIVIDE(SUM(ext.total), sr.total_alts_with_file_extensions) AS pct_total_occurrences + FORMAT('%.1f%%', 100 * SAFE_DIVIDE(SUM(ext.total), sr.total_alts_with_file_extensions)) + AS pct_total_occurrences FROM base_pages AS bp -CROSS JOIN UNNEST( - getUsedExtensions(TO_JSON_STRING(bp.a11y)) -) AS ext +CROSS JOIN UNNEST(getUsedExtensions(TO_JSON_STRING(bp.a11y))) AS ext LEFT JOIN site_rollup sr ON sr.client = bp.client AND sr.is_root_page = bp.is_root_page From 30f50c65093de20d95d1cc3a3b7e1752b0eb4521 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 12:50:29 -0400 Subject: [PATCH 010/150] Update alt_ending_in_image_extension.sql - FULL RUN --- sql/2025/accessibility/alt_ending_in_image_extension.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2025/accessibility/alt_ending_in_image_extension.sql b/sql/2025/accessibility/alt_ending_in_image_extension.sql 
index ab15dada4be..aa44abc531e 100644 --- a/sql/2025/accessibility/alt_ending_in_image_extension.sql +++ b/sql/2025/accessibility/alt_ending_in_image_extension.sql @@ -41,7 +41,7 @@ WITH cfg AS ( SELECT - TRUE AS enable_sample, -- set FALSE (or delete cfg + predicates) for full run + FALSE AS enable_sample, -- set FALSE (or delete cfg + predicates) for full run 10000 AS modulus, -- larger modulus => smaller sample (~0.01% here) 0 AS remainder ), From 7c5f3a1c5a5956ce5a82fa59ad431f953de61600 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 13:04:44 -0400 Subject: [PATCH 011/150] Create anchors_with_role_button.sql --- .../anchors_with_role_button.sql | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 sql/2025/accessibility/anchors_with_role_button.sql diff --git a/sql/2025/accessibility/anchors_with_role_button.sql b/sql/2025/accessibility/anchors_with_role_button.sql new file mode 100644 index 00000000000..4bf566e15a6 --- /dev/null +++ b/sql/2025/accessibility/anchors_with_role_button.sql @@ -0,0 +1,57 @@ +#standardSQL +# Anchors with role='button' +# What this does +# - Reads the 2025-07-01 crawl from `httparchive.crawl.pages` +# - Uses custom_metrics (not the legacy `payload`) to get: +# - total_anchors_with_role_button: $.a11y.total_anchors_with_role_button +# - total_anchors: from $.markup.elements.a.count (fallbacks included) +# - Aggregates by client and is_root_page: +# - sites_with_anchors = COUNTIF(total_anchors > 0) +# - sites_with_anchor_role_button = COUNTIF(total_anchors_with_role_button > 0) +# - pct_sites_with_anchor_role_button = SAFE_DIVIDE(sites_with_anchor_role_button, sites_with_anchors) +# - Sampling: hash-samples by URL to reduce cost. Flip `enable_sample` to FALSE for full run. + +WITH cfg AS ( + SELECT + FALSE AS enable_sample, -- set to FALSE for full data + 1000 AS sample_mod, -- 1 in 1000 (~0.1%) + 0 AS sample_keep -- keep bucket where MOD(...) 
= this value +), + +page_rows AS ( + SELECT + p.client, + p.is_root_page, + -- a11y metric: anchors having role="button" + SAFE_CAST(JSON_VALUE(p.custom_metrics.a11y, '$.total_anchors_with_role_button') AS INT64) + AS total_anchors_with_role_button, + + -- total anchors; try common paths and fall back to 0 + COALESCE( + SAFE_CAST(JSON_VALUE(p.custom_metrics.markup, '$.elements.a.count') AS INT64), + SAFE_CAST(JSON_VALUE(p.custom_metrics.markup, '$.elements.a') AS INT64), + 0 + ) AS total_anchors + FROM + `httparchive.crawl.pages` AS p + CROSS JOIN cfg + WHERE + -- push partition filter INSIDE to avoid scanning all dates + p.date = '2025-07-01' + -- low-cost deterministic sampling by URL + AND (NOT cfg.enable_sample + OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.sample_mod) = cfg.sample_keep) +) + +SELECT + client, + is_root_page, + COUNTIF(total_anchors > 0) AS sites_with_anchors, + COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button, + SAFE_DIVIDE( + COUNTIF(total_anchors_with_role_button > 0), + COUNTIF(total_anchors > 0) + ) AS pct_sites_with_anchor_role_button +FROM page_rows +GROUP BY client, is_root_page +ORDER BY client, is_root_page; From addec2bd2e603b439d176687ab991905a14e697c Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 13:39:13 -0400 Subject: [PATCH 012/150] Update anchors_with_role_button.sql - update to avoid blanks 0 and null values were returned incorrectly. 
--- .../anchors_with_role_button.sql | 125 +++++++++++------- 1 file changed, 75 insertions(+), 50 deletions(-) diff --git a/sql/2025/accessibility/anchors_with_role_button.sql b/sql/2025/accessibility/anchors_with_role_button.sql index 4bf566e15a6..9f38e2bde4a 100644 --- a/sql/2025/accessibility/anchors_with_role_button.sql +++ b/sql/2025/accessibility/anchors_with_role_button.sql @@ -1,57 +1,82 @@ -#standardSQL -# Anchors with role='button' -# What this does -# - Reads the 2025-07-01 crawl from `httparchive.crawl.pages` -# - Uses custom_metrics (not the legacy `payload`) to get: -# - total_anchors_with_role_button: $.a11y.total_anchors_with_role_button -# - total_anchors: from $.markup.elements.a.count (fallbacks included) -# - Aggregates by client and is_root_page: -# - sites_with_anchors = COUNTIF(total_anchors > 0) -# - sites_with_anchor_role_button = COUNTIF(total_anchors_with_role_button > 0) -# - pct_sites_with_anchor_role_button = SAFE_DIVIDE(sites_with_anchor_role_button, sites_with_anchors) -# - Sampling: hash-samples by URL to reduce cost. Flip `enable_sample` to FALSE for full run. +-- HTTP Archive Almanac: with role="button" +-- +-- Purpose: +-- This query measures how many websites use (anchor) elements with +-- role="button" attributes, relative to the total number of anchor tags. +-- +-- Dataset: +-- `httparchive.crawl.pages` on date = '2025-07-01' +-- +-- Key fields used: +-- - custom_metrics.a11y.total_anchors_with_role_button: +-- Count of anchor elements that declare role="button". +-- - custom_metrics.element_count.a: +-- Total count of elements per page. +-- - URL extraction: +-- Multiple fallbacks are used to extract the canonical URL for de-duping: +-- 1. custom_metrics.performance.lcp_resource.documentURL +-- 2. custom_metrics.canonicals.url (via JSON) +-- 3. payload.url +-- 4. payload._url +-- +-- Aggregation logic: +-- - We restrict to is_root_page = TRUE so each site is counted once. +-- - Hosts are extracted with NET.HOST(url_str). 
+-- - sites_with_any_a: +-- Distinct hosts with at least one . +-- - sites_with_a_role_button: +-- Distinct hosts with at least one . +-- - pct_sites_with_a_role_button: +-- Fraction of sites_with_a_role_button / sites_with_any_a. +-- +-- Safety: +-- - SAFE_CAST used to convert JSON strings into INT64, preventing query errors. +-- - SAFE_DIVIDE avoids division-by-zero. +-- +-- Output: +-- client | sites_with_any_a | sites_with_a_role_button | pct_sites_with_a_role_button -WITH cfg AS ( +WITH base AS ( SELECT - FALSE AS enable_sample, -- set to FALSE for full data - 1000 AS sample_mod, -- 1 in 1000 (~0.1%) - 0 AS sample_keep -- keep bucket where MOD(...) = this value -), - -page_rows AS ( - SELECT - p.client, - p.is_root_page, - -- a11y metric: anchors having role="button" - SAFE_CAST(JSON_VALUE(p.custom_metrics.a11y, '$.total_anchors_with_role_button') AS INT64) - AS total_anchors_with_role_button, - - -- total anchors; try common paths and fall back to 0 + client, + is_root_page, COALESCE( - SAFE_CAST(JSON_VALUE(p.custom_metrics.markup, '$.elements.a.count') AS INT64), - SAFE_CAST(JSON_VALUE(p.custom_metrics.markup, '$.elements.a') AS INT64), - 0 - ) AS total_anchors - FROM - `httparchive.crawl.pages` AS p - CROSS JOIN cfg - WHERE - -- push partition filter INSIDE to avoid scanning all dates - p.date = '2025-07-01' - -- low-cost deterministic sampling by URL - AND (NOT cfg.enable_sample - OR MOD(ABS(FARM_FINGERPRINT(p.page)), cfg.sample_mod) = cfg.sample_keep) + JSON_VALUE(custom_metrics.performance, '$.lcp_resource.documentURL'), + JSON_VALUE(TO_JSON_STRING(custom_metrics), '$.canonicals.url'), + JSON_VALUE(payload, '$.url'), + JSON_VALUE(payload, '$._url') + ) AS url_str, + SAFE_CAST(JSON_VALUE(custom_metrics.a11y, '$.total_anchors_with_role_button') AS INT64) + AS anchors_role_button, + SAFE_CAST(JSON_VALUE(TO_JSON_STRING(custom_metrics), '$.element_count.a') AS INT64) + AS total_a_elements + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' ) - SELECT 
client, - is_root_page, - COUNTIF(total_anchors > 0) AS sites_with_anchors, - COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button, + COUNT(DISTINCT IF( + total_a_elements > 0 + AND url_str IS NOT NULL + AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), + NET.HOST(url_str), NULL)) AS sites_with_any_a, + COUNT(DISTINCT IF( + anchors_role_button > 0 + AND url_str IS NOT NULL + AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), + NET.HOST(url_str), NULL)) AS sites_with_a_role_button, SAFE_DIVIDE( - COUNTIF(total_anchors_with_role_button > 0), - COUNTIF(total_anchors > 0) - ) AS pct_sites_with_anchor_role_button -FROM page_rows -GROUP BY client, is_root_page -ORDER BY client, is_root_page; + COUNT(DISTINCT IF( + anchors_role_button > 0 + AND url_str IS NOT NULL + AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), + NET.HOST(url_str), NULL)), + COUNT(DISTINCT IF( + total_a_elements > 0 + AND url_str IS NOT NULL + AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), + NET.HOST(url_str), NULL)) + ) AS pct_sites_with_a_role_button +FROM base +WHERE is_root_page +GROUP BY client; From d4f0bfff09e1b37496d1552f7d1922d001d82a48 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 13:43:12 -0400 Subject: [PATCH 013/150] pulling apart root and non-root values --- .../anchors_with_role_button.sql | 106 ++++++++---------- 1 file changed, 45 insertions(+), 61 deletions(-) diff --git a/sql/2025/accessibility/anchors_with_role_button.sql b/sql/2025/accessibility/anchors_with_role_button.sql index 9f38e2bde4a..a29115b24fb 100644 --- a/sql/2025/accessibility/anchors_with_role_button.sql +++ b/sql/2025/accessibility/anchors_with_role_button.sql @@ -1,82 +1,66 @@ --- HTTP Archive Almanac: with role="button" --- --- Purpose: --- This query measures how many websites use (anchor) elements with --- role="button" attributes, relative to the total number of 
anchor tags. --- --- Dataset: --- `httparchive.crawl.pages` on date = '2025-07-01' --- --- Key fields used: --- - custom_metrics.a11y.total_anchors_with_role_button: --- Count of anchor elements that declare role="button". --- - custom_metrics.element_count.a: --- Total count of elements per page. --- - URL extraction: --- Multiple fallbacks are used to extract the canonical URL for de-duping: --- 1. custom_metrics.performance.lcp_resource.documentURL --- 2. custom_metrics.canonicals.url (via JSON) --- 3. payload.url --- 4. payload._url --- --- Aggregation logic: --- - We restrict to is_root_page = TRUE so each site is counted once. --- - Hosts are extracted with NET.HOST(url_str). --- - sites_with_any_a: --- Distinct hosts with at least one . --- - sites_with_a_role_button: --- Distinct hosts with at least one . --- - pct_sites_with_a_role_button: --- Fraction of sites_with_a_role_button / sites_with_any_a. --- +-- HTTP Archive Almanac 2025 (match 2024 structure) +-- Metric: share of SITES that have at least one +-- Grouping: client, is_root_page (do NOT merge root/non-root) +-- Method: +-- 1) Determine a canonical-ish URL string per row (page) from a few fields. +-- 2) Keep only http(s) URLs and extract NET.HOST() as the site key. +-- 3) From custom_metrics: +-- - total_anchors_with_role_button: a11y.total_anchors_with_role_button +-- - total_a_elements: element_count.a +-- 4) For each (client, is_root_page, host) decide: +-- - has_any_a := max(total_a_elements > 0) +-- - has_anchor_role_button := max(total_anchors_with_role_button > 0) +-- 5) Count DISTINCT hosts per group that satisfy each condition. +-- Sampling: +-- - TABLESAMPLE SYSTEM (.1 PERCENT) for approximate results. -- Safety: --- - SAFE_CAST used to convert JSON strings into INT64, preventing query errors. --- - SAFE_DIVIDE avoids division-by-zero. 
--- --- Output: --- client | sites_with_any_a | sites_with_a_role_button | pct_sites_with_a_role_button +-- - SAFE_CAST / JSON_VALUE with TO_JSON_STRING() where needed. -WITH base AS ( +-- standardSQL +WITH per_page AS ( SELECT client, is_root_page, + -- canonical-ish URL with fallbacks COALESCE( JSON_VALUE(custom_metrics.performance, '$.lcp_resource.documentURL'), JSON_VALUE(TO_JSON_STRING(custom_metrics), '$.canonicals.url'), JSON_VALUE(payload, '$.url'), JSON_VALUE(payload, '$._url') ) AS url_str, + + -- metrics from custom_metrics SAFE_CAST(JSON_VALUE(custom_metrics.a11y, '$.total_anchors_with_role_button') AS INT64) AS anchors_role_button, SAFE_CAST(JSON_VALUE(TO_JSON_STRING(custom_metrics), '$.element_count.a') AS INT64) AS total_a_elements FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' +), +per_site AS ( + -- One row per (client, is_root_page, host) with boolean flags + SELECT + client, + is_root_page, + NET.HOST(url_str) AS host, + -- A site “has any a” if ANY sampled page on that site has > 0 + LOGICAL_OR(total_a_elements > 0) AS has_any_a, + -- A site “has anchor role button” if ANY sampled page has > 0 + LOGICAL_OR(anchors_role_button > 0) AS has_anchor_role_button + FROM per_page + WHERE url_str IS NOT NULL + AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')) + GROUP BY client, is_root_page, host ) SELECT client, - COUNT(DISTINCT IF( - total_a_elements > 0 - AND url_str IS NOT NULL - AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), - NET.HOST(url_str), NULL)) AS sites_with_any_a, - COUNT(DISTINCT IF( - anchors_role_button > 0 - AND url_str IS NOT NULL - AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), - NET.HOST(url_str), NULL)) AS sites_with_a_role_button, + is_root_page, + COUNTIF(has_any_a) AS sites_with_anchors, + COUNTIF(has_anchor_role_button) AS sites_with_anchor_role_button, SAFE_DIVIDE( - COUNT(DISTINCT IF( - anchors_role_button > 0 - AND url_str IS NOT NULL - 
AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), - NET.HOST(url_str), NULL)), - COUNT(DISTINCT IF( - total_a_elements > 0 - AND url_str IS NOT NULL - AND (STARTS_WITH(url_str, 'http://') OR STARTS_WITH(url_str, 'https://')), - NET.HOST(url_str), NULL)) - ) AS pct_sites_with_a_role_button -FROM base -WHERE is_root_page -GROUP BY client; + COUNTIF(has_anchor_role_button), + COUNTIF(has_any_a) + ) AS pct_sites_with_anchor_role_button +FROM per_site +GROUP BY client, is_root_page +ORDER BY client, is_root_page DESC; From 4765addc48078cb8a6ff12fe740e28d8280056bb Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 13:49:15 -0400 Subject: [PATCH 014/150] dividing up the is_root_page and not --- sql/2025/accessibility/anchors_with_role_button.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/sql/2025/accessibility/anchors_with_role_button.sql b/sql/2025/accessibility/anchors_with_role_button.sql index a29115b24fb..619cd130c5a 100644 --- a/sql/2025/accessibility/anchors_with_role_button.sql +++ b/sql/2025/accessibility/anchors_with_role_button.sql @@ -1,4 +1,3 @@ --- HTTP Archive Almanac 2025 (match 2024 structure) -- Metric: share of SITES that have at least one -- Grouping: client, is_root_page (do NOT merge root/non-root) -- Method: @@ -11,8 +10,6 @@ -- - has_any_a := max(total_a_elements > 0) -- - has_anchor_role_button := max(total_anchors_with_role_button > 0) -- 5) Count DISTINCT hosts per group that satisfy each condition. --- Sampling: --- - TABLESAMPLE SYSTEM (.1 PERCENT) for approximate results. -- Safety: -- - SAFE_CAST / JSON_VALUE with TO_JSON_STRING() where needed. 
From a7abec92656a603f37f09ceb6b13f1a97da115e3 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 1 Sep 2025 16:00:47 -0400 Subject: [PATCH 015/150] Create audio_track_usage.sql - update --- sql/2025/accessibility/audio_track_usage.sql | 86 ++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 sql/2025/accessibility/audio_track_usage.sql diff --git a/sql/2025/accessibility/audio_track_usage.sql b/sql/2025/accessibility/audio_track_usage.sql new file mode 100644 index 00000000000..e528af7a11b --- /dev/null +++ b/sql/2025/accessibility/audio_track_usage.sql @@ -0,0 +1,86 @@ +-- standardSQL +-- Web Almanac — Audio elements with usage (2025) +-- +-- What this query does +-- For each (client, is_root_page), at the SITE (hostname) level: +-- • Count sites that have any