HTTPArchive
diff --git a/‎.github/workflows/compress-images.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/compress-images.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/predeploy.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/predeploy.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎sql/.sqlfluff‎
Lines changed: 1 addition & 1 deletion b/‎sql/.sqlfluff‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sql/2024/media/video_adoption.sql‎
Lines changed: 1 addition & 1 deletion b/‎sql/2024/media/video_adoption.sql‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sql/2025/accessibility/.sqlfluffignore‎
Lines changed: 2 additions & 0 deletions b/‎sql/2025/accessibility/.sqlfluffignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sql/2025/accessibility/README copy.md‎
Lines changed: 20 additions & 0 deletions b/‎sql/2025/accessibility/README copy.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎sql/2025/accessibility/a11y_frontend_technology.sql‎
Lines changed: 84 additions & 0 deletions b/‎sql/2025/accessibility/a11y_frontend_technology.sql‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql‎
Lines changed: 93 additions & 0 deletions b/‎sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎sql/2025/accessibility/a11y_technology_usage.sql‎
Lines changed: 49 additions & 0 deletions b/‎sql/2025/accessibility/a11y_technology_usage.sql‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql‎
Lines changed: 95 additions & 0 deletions b/‎sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql‎
Lines changed: 95 additions & 0 deletions
@@ -59,7 +59,7 @@ jobs:
           github.event_name != 'pull_request' &&
           steps.calibre.outputs.markdown != ''
         id: cpr
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v8
         with:
           title: Auto Compress Images
           branch-suffix: timestamp
 
@@ -67,7 +67,7 @@ jobs:
         npm run ebooks
     - name: Upload PDF artifact
       if: ${{ github.event.inputs.ebooks == 'true' }}
-      uses: actions/upload-artifact@v5
+      uses: actions/upload-artifact@v6
       with:
         name: pdfs
         path: ./src/static/pdfs/*.pdf
@@ -79,7 +79,7 @@ jobs:
         npm run timestamps
     - name: Create Pull Request
       id: cpr
-      uses: peter-evans/create-pull-request@v7
+      uses: peter-evans/create-pull-request@v8
       with:
         title: Pre-deploy Updates
         branch-suffix: timestamp
 
@@ -28,7 +28,7 @@ exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,AM08,CP02,CP03,CV02,CV12,LT05,LT09
 # ST08 - Sometimes clearer to include brackets for complex COUNT(DISTINCT) cases
 # ST11 - Doesn't consider wildcards in SELECT. Issue: https://github.com/sqlfluff/sqlfluff/issues/6511
 
-large_file_skip_byte_limit = 40000
+large_file_skip_byte_limit = 1120000
 # CPU processes to use while linting.
 # If positive, just implies number of processes.
 # If negative or zero, implies number_of_cpus - specified_number.
 
@@ -1,7 +1,7 @@
 #standardSQL
 # How many pages use <video>?
 # video_adoption.sql
-# â\x9d• Updated in 2024
+# ❕ Updated in 2024
 
 SELECT
   client,
 
@@ -0,0 +1,2 @@
+lighthouse_score_by_government.sql
+lighthouse_score_by_government_with_urls.sql
@@ -0,0 +1,20 @@
+# 2025 Accessibility queries
+
+<!--
+  This directory contains all of the 2025 Accessibility chapter queries.
+
+  Each query should have a corresponding `metric_name.sql` file.
+  Note that readers are linked to this directory, so try to make the SQL file names descriptive for easy browsing.
+
+  Analysts: if helpful, you can use this README to give additional info about the queries.
+-->
+
+## Resources
+
+- [📄 Planning doc][~google-doc]
+- [📊 Results sheet][~google-sheets]
+- [📝 Markdown file][~chapter-markdown]
+
+[~google-doc]: https://docs.google.com/document/d/1anCSQk9g_YDfZP6GtjqdC-vCfnCNZAUEQwjSr8AzqTw/edit
+[~google-sheets]: https://docs.google.com/spreadsheets/d/1btB1r9QpdgTyToPhn7glcGAdMFs7eq4UcQSVIHBqiYQ/edit#gid=1778117656
+[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2025/accessibility.md
@@ -0,0 +1,84 @@
+#standardSQL
+-- Web Almanac — Lighthouse category scores by framework (2025-07-01)
+-- Google Sheet: a11y_frontend_technology
+--
+-- Purpose
+--   • Extract Lighthouse category scores (performance, accessibility,
+--     best-practices, SEO) from JSON in the crawl dataset.
+--   • Associate each crawled page with detected frontend frameworks or JS libraries.
+--   • Limit to root pages only for consistency.
+--   • De-duplicate multiple {page, framework} rows caused by UNNEST, by averaging
+--     scores per page before computing framework-level averages.
+--
+-- Method
+--   1. Extract scores with JSON_EXTRACT_SCALAR, cast to FLOAT64.
+--   2. Filter to categories: Web frameworks, JavaScript libraries,
+--      Frontend frameworks, JavaScript frameworks.
+--   3. Aggregate in two steps:
+--        a. Per {client, page, framework}, average scores to remove duplicates.
+--        b. Global averages per {client, framework}.
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   framework                — detected framework or JS library
+--   avg_performance_score    — average Lighthouse performance score (0–1)
+--   avg_accessibility_score  — average Lighthouse accessibility score (0–1)
+--   avg_best_practices_score — average Lighthouse best-practices score (0–1)
+--   avg_seo_score            — average Lighthouse SEO score (0–1)
+--   total_pages              — distinct page count per {client, framework}
+--
+-- Notes
+--   • Scores remain in 0–1 float scale (not percentages).
+--   • `is_root_page = TRUE` ensures only root URLs are included.
+--   • Optional: enable TABLESAMPLE for faster smoke testing.
+WITH score_data AS (
+  -- One row per {page, detected technology}: the page-level Lighthouse category
+  -- scores are repeated on every technology row produced by the UNNEST.
+  SELECT
+    client,
+    page,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score,
+    detected.technology AS framework
+  FROM
+    `httparchive.crawl.pages`
+  -- Optional cheap smoke test: append TABLESAMPLE SYSTEM (0.1 PERCENT) to the table reference above.
+  CROSS JOIN
+    UNNEST(technologies) AS detected
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page AND
+    lighthouse IS NOT NULL AND
+    detected.technology IS NOT NULL AND
+    -- Keep the technology when at least one of its categories is a framework/library category.
+    EXISTS (
+      SELECT 1
+      FROM UNNEST(detected.categories) AS tech_category
+      WHERE tech_category IN (
+        'Web frameworks',
+        'JavaScript libraries',
+        'Frontend frameworks',
+        'JavaScript frameworks'
+      )
+    )
+),
+
+page_framework_scores AS (
+  -- Collapse to a single row per {client, page, framework}. Every input row for
+  -- a page carries the same scores, so AVG only de-duplicates here.
+  SELECT
+    client,
+    page,
+    framework,
+    AVG(performance_score) AS performance_score,
+    AVG(accessibility_score) AS accessibility_score,
+    AVG(best_practices_score) AS best_practices_score,
+    AVG(seo_score) AS seo_score
+  FROM
+    score_data
+  GROUP BY
+    client,
+    page,
+    framework
+)
+
+-- Framework-level averages and page counts per client.
+SELECT
+  client,
+  framework,
+  AVG(performance_score) AS avg_performance_score,
+  AVG(accessibility_score) AS avg_accessibility_score,
+  AVG(best_practices_score) AS avg_best_practices_score,
+  AVG(seo_score) AS avg_seo_score,
+  COUNT(DISTINCT page) AS total_pages
+FROM
+  page_framework_scores
+GROUP BY
+  client,
+  framework
+ORDER BY
+  total_pages DESC;
@@ -0,0 +1,93 @@
+#standardSQL
+-- Accessibility Technology (A11y) Usage by Domain Rank (2025-07-01)
+-- Google Sheet: a11y_overall_tech_usage_by_domain_rank
+--
+-- Purpose
+--   • Quantify adoption of accessibility-related technologies (e.g., overlays)
+--     across websites, segmented by domain rank tiers.
+--   • Provide both absolute counts of pages using A11y tech and their share
+--     within each rank grouping.
+--
+-- Dataset
+--   • Source: `httparchive.crawl.pages`
+--   • Crawl date: 2025-07-01
+--   • Technologies: extracted from `technologies` and `categories` arrays.
+--   • Rank groupings: [1K, 10K, 100K, 1M, 10M, 100M].
+--
+-- Method
+--   1. a11y_pages:
+--        – Expand rank thresholds with UNNEST.
+--        – Select distinct {client, is_root_page, page, rank_grouping}
+--          where `category = 'Accessibility'`.
+--   2. rank_totals:
+--        – Count all pages per {client, is_root_page, rank_grouping} as
+--          denominators.
+--   3. Join a11y_pages with rank_totals on {client, is_root_page, rank_grouping}.
+--   4. Aggregate results to compute distinct page counts and fractions.
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   is_root_page             — TRUE if page is a root URL
+--   rank_grouping            — maximum rank threshold (e.g., 1000, 10000, …)
+--   total_in_rank            — number of pages in the {client, is_root_page, rank_grouping} group
+--   sites_with_a11y_tech     — count of distinct pages using A11y technology
+--   pct_sites_with_a11y_tech — fraction (0–1) of pages in the group using A11y tech
+--
+-- Notes
+--   • The denominator is split by `is_root_page` so that it covers the same
+--     population as the numerator; a root-page row is divided by root pages only.
+--   • Thresholds are cumulative: `rank <= rank_grouping` places a page in every
+--     grouping whose threshold is at or above its rank.
+--   • `is_root_page` is preserved to allow filtering on root vs non-root pages.
+WITH a11y_pages AS (
+  -- Pages with at least one technology in the 'Accessibility' category, one row
+  -- per rank grouping the page falls into.
+  SELECT DISTINCT
+    client,
+    is_root_page,
+    page,
+    rank_grouping
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping,
+    UNNEST(technologies) AS tech,
+    UNNEST(tech.categories) AS category
+  WHERE
+    date = '2025-07-01' AND
+    category = 'Accessibility' AND
+    rank <= rank_grouping
+),
+
+rank_totals AS (
+  -- Denominator: all crawled pages per {client, is_root_page, rank_grouping}.
+  SELECT
+    client,
+    is_root_page,
+    rank_grouping,
+    COUNT(0) AS total_in_rank
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+  WHERE
+    date = '2025-07-01' AND
+    rank <= rank_grouping
+  GROUP BY
+    client,
+    is_root_page,
+    rank_grouping
+)
+
+SELECT
+  client,
+  is_root_page,
+  rank_grouping,
+  total_in_rank,
+  COUNT(DISTINCT page) AS sites_with_a11y_tech,
+  SAFE_DIVIDE(COUNT(DISTINCT page), total_in_rank) AS pct_sites_with_a11y_tech
+FROM
+  a11y_pages
+JOIN
+  rank_totals
+USING (client, is_root_page, rank_grouping)
+GROUP BY
+  client,
+  is_root_page,
+  rank_grouping,
+  total_in_rank
+ORDER BY
+  client,
+  is_root_page,
+  rank_grouping
@@ -0,0 +1,49 @@
+#standardSQL
+-- Accessibility Technology (A11y) Usage by Client (2025-07-01)
+-- Google Sheets: a11y_technology_usage
+--
+-- Purpose
+--   • Measure the adoption of accessibility-related technologies (e.g., overlays)
+--     across websites, segmented by client type (desktop vs mobile).
+--   • Provide absolute counts of pages with A11y tech and their share
+--     relative to all pages.
+--
+-- Dataset
+--   • Source: `httparchive.crawl.pages`
+--   • Crawl date: 2025-07-01
+--   • Technologies: inspected via `UNNEST(technologies)` and `UNNEST(tech.categories)`.
+--
+-- Method
+--   1. Per page, flag whether any detected technology has the category
+--      'Accessibility' (EXISTS over the unnested arrays).
+--   2. Count distinct pages per {client, is_root_page}.
+--   3. Count distinct flagged pages per {client, is_root_page}.
+--   4. Compute the fraction as (# pages with A11y tech / total pages).
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   is_root_page             — TRUE if page is a root URL
+--   total_sites              — number of distinct pages per {client, is_root_page}
+--   sites_with_a11y_tech     — number of distinct pages with Accessibility technology
+--   pct_sites_with_a11y_tech — fraction of pages using A11y tech (0–1 float)
+--
+-- Notes
+--   • The flag is computed in a subquery rather than by joining the unnested
+--     arrays, so pages with no detected technologies still count towards
+--     `total_sites` instead of being dropped by the join.
+--   • Percentages are per client (desktop/mobile) and root-page grouping.
+SELECT
+  client,  -- "desktop" or "mobile"
+  is_root_page,
+  COUNT(DISTINCT page) AS total_sites,  -- All pages in the group, with or without detected technologies
+  COUNT(DISTINCT IF(has_a11y_tech, page, NULL)) AS sites_with_a11y_tech,  -- Pages using accessibility technology
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(has_a11y_tech, page, NULL)),
+    COUNT(DISTINCT page)
+  ) AS pct_sites_with_a11y_tech  -- Fraction of pages using accessibility technology
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    page,
+    -- TRUE when at least one detected technology lists the 'Accessibility' category.
+    EXISTS (
+      SELECT 1
+      FROM
+        UNNEST(technologies) AS tech,
+        UNNEST(tech.categories) AS category
+      WHERE
+        category = 'Accessibility'
+    ) AS has_a11y_tech
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'  -- Crawl date under analysis
+)
+GROUP BY
+  client,
+  is_root_page
+ORDER BY
+  client,
+  is_root_page;
@@ -0,0 +1,95 @@
+#standardSQL
+# Purpose
+#   Measure adoption of specific Accessibility-related technologies (apps/overlays)
+#   across domain rank buckets in the 2025-07-01 HTTP Archive crawl.
+#   Google Sheet: a11y_technology_usage_by_domain_rank
+#
+# Output columns
+#   • client             = desktop or mobile
+#   • is_root_page       = TRUE if page is the root of the site
+#   • rank_grouping      = domain rank bucket (1k, 10k, …, 100M)
+#   • total_in_rank      = total number of unique pages in the rank bucket
+#   • app                = specific Accessibility technology detected (Wappalyzer name)
+#   • sites_with_app     = number of unique pages using that technology
+#   • pct_sites_with_app = share of pages in the rank bucket using that technology
+#
+# Method
+#   1. Assign each page to a rank_grouping based on its domain rank.
+#   2. Compute totals per client / root flag / rank grouping (denominator).
+#   3. Expand technologies and categories, keeping only category = 'Accessibility'.
+#   4. Count distinct pages per technology and divide by the rank total.
+#
+# Notes
+#   • Unit of analysis = page URL, not host/site.
+#   • Percentages are returned as numeric fractions (0–1). Use FORMAT() if a
+#     human-readable percent string is needed.
+#   • Rank groupings are aligned with prior reporting thresholds (1k → 100M).
+WITH ranked_sites AS (
+  -- Tag every crawled page with the smallest rank threshold that contains it;
+  -- rank_grouping is NULL when rank is NULL or above the largest threshold.
+  SELECT
+    client,
+    is_root_page,
+    page,
+    technologies,
+    (
+      SELECT MIN(threshold)
+      FROM UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS threshold
+      WHERE rank <= threshold
+    ) AS rank_grouping
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'
+),
+
+rank_totals AS (
+  -- Denominator: distinct pages per {client, is_root_page, rank_grouping}.
+  SELECT
+    client,
+    is_root_page,
+    rank_grouping,
+    COUNT(DISTINCT page) AS total_in_rank
+  FROM
+    ranked_sites
+  GROUP BY
+    client,
+    is_root_page,
+    rank_grouping
+)
+
+SELECT
+  sites.client,
+  sites.is_root_page,
+  sites.rank_grouping,
+  totals.total_in_rank,
+  tech.technology AS app,
+  COUNT(DISTINCT sites.page) AS sites_with_app,
+  SAFE_DIVIDE(COUNT(DISTINCT sites.page), totals.total_in_rank) AS pct_sites_with_app
+FROM
+  ranked_sites AS sites
+INNER JOIN
+  rank_totals AS totals
+ON
+  sites.client = totals.client AND
+  sites.is_root_page = totals.is_root_page AND
+  sites.rank_grouping = totals.rank_grouping
+CROSS JOIN
+  UNNEST(sites.technologies) AS tech
+CROSS JOIN
+  UNNEST(tech.categories) AS category
+WHERE
+  -- Only technologies that list the 'Accessibility' category are counted.
+  category = 'Accessibility'
+GROUP BY
+  sites.client,
+  sites.is_root_page,
+  sites.rank_grouping,
+  totals.total_in_rank,
+  tech.technology
+ORDER BY
+  tech.technology,
+  sites.rank_grouping,
+  sites.client,
+  sites.is_root_page;
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+lighthouse_score_by_government.sql`
	`2`	`+lighthouse_score_by_government_with_urls.sql`