|
#standardSQL
-- Web Almanac — Lighthouse category scores by framework (2025-07-01)
-- Google Sheet: a11y_frontend_technology
--
-- Purpose
--   • Extract Lighthouse category scores (performance, accessibility,
--     best-practices, SEO) from JSON in the crawl dataset.
--   • Associate each crawled page with detected frontend frameworks or JS libraries.
--   • Limit to root pages only for consistency.
--   • De-duplicate multiple {page, framework} rows caused by UNNEST, by averaging
--     scores per page before computing framework-level averages.
--
-- Method
--   1. Extract scores with JSON_EXTRACT_SCALAR, cast to FLOAT64.
--   2. Keep technologies tagged with at least one of the categories:
--      Web frameworks, JavaScript libraries, Frontend frameworks,
--      JavaScript frameworks.
--   3. Aggregate in two steps:
--      a. Per {client, page, framework}, average scores to remove duplicates.
--      b. Global averages per {client, framework}.
--
-- Output columns
--   client                   — "desktop" | "mobile"
--   framework                — detected framework or JS library
--   avg_performance_score    — average Lighthouse performance score (0–1)
--   avg_accessibility_score  — average Lighthouse accessibility score (0–1)
--   avg_best_practices_score — average Lighthouse best-practices score (0–1)
--   avg_seo_score            — average Lighthouse SEO score (0–1)
--   total_pages              — distinct page count per {client, framework}
--
-- Notes
--   • Scores remain in 0–1 float scale (not percentages).
--   • `is_root_page = TRUE` ensures only root URLs are included.
--   • A page whose score is NULL is ignored by AVG for that score but is still
--     counted in total_pages.
--   • Rows are ordered by total_pages (descending); ties are broken by
--     framework, then client, so the output order is reproducible.
--   • Optional: enable TABLESAMPLE for faster smoke testing.
WITH score_data AS (
  -- One row per {page record, detected technology}; the page-level Lighthouse
  -- scores are repeated on every technology row produced by the UNNEST.
  SELECT
    client,
    page,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score,
    t.technology AS framework
  FROM
    `httparchive.crawl.pages`
    -- TABLESAMPLE SYSTEM (0.1 PERCENT) -- optional: cheap smoke test (must directly follow the table name)
  CROSS JOIN
    UNNEST(technologies) AS t
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    lighthouse IS NOT NULL AND
    t.technology IS NOT NULL AND
    -- Keep the technology if any of its categories is one we report on.
    EXISTS (
      SELECT 1
      FROM
        UNNEST(t.categories) AS category
      WHERE
        category IN (
          'Web frameworks',
          'JavaScript libraries',
          'Frontend frameworks',
          'JavaScript frameworks'
        )
    )
),

page_scores AS (
  -- Collapse to a single row per {client, page, framework}. Rows unnested from
  -- the same page record carry identical scores, so AVG here only removes
  -- duplicates (taking any one value would work equally well).
  SELECT
    client,
    page,
    framework,
    AVG(performance_score) AS performance_score,
    AVG(accessibility_score) AS accessibility_score,
    AVG(best_practices_score) AS best_practices_score,
    AVG(seo_score) AS seo_score
  FROM
    score_data
  GROUP BY
    client,
    page,
    framework
)

-- Framework-level averages; every page contributes exactly once per framework.
SELECT
  client,
  framework,
  AVG(performance_score) AS avg_performance_score,
  AVG(accessibility_score) AS avg_accessibility_score,
  AVG(best_practices_score) AS avg_best_practices_score,
  AVG(seo_score) AS avg_seo_score,
  COUNT(DISTINCT page) AS total_pages
FROM
  page_scores
GROUP BY
  client,
  framework
ORDER BY
  total_pages DESC,
  framework,
  client;
0 commit comments