From 20917b12bc19edd8d0c1d292c81e7ca73281c40d Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Sun, 19 Oct 2025 17:42:08 +0200 Subject: [PATCH 01/13] Add SQL query to analyze CSS gradient usage This SQL query calculates the percentage of sites using CSS gradients over specific years, grouped by client and rank. --- sql/2025/generative-ai/gradient-use.sql | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 sql/2025/generative-ai/gradient-use.sql diff --git a/sql/2025/generative-ai/gradient-use.sql b/sql/2025/generative-ai/gradient-use.sql new file mode 100644 index 00000000000..5da25613b9b --- /dev/null +++ b/sql/2025/generative-ai/gradient-use.sql @@ -0,0 +1,30 @@ +#standardSQL +-- % of sites whose CSS variables include a gradient( ... ) per year, client, rank +-- Goal: use this as an indicator of vibe-coded websites, much as "delve into" is used as an indicator for papers +SELECT + EXTRACT(YEAR FROM date) AS year, + client, + rank, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL)) AS pages_with_gradient, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL)), + COUNT(DISTINCT page) + ) AS pct_with_gradient +FROM `httparchive.crawl.pages` +WHERE + is_root_page + AND date IN ( + DATE '2019-07-01', + DATE '2020-08-01', + DATE '2021-07-01', + DATE '2022-07-01', + DATE '2024-06-01', + DATE '2025-07-01' + ) +GROUP BY year, client, rank +ORDER BY year, client, rank; From d7cdccf2fca8c35e674357636948a78bfaa0fb83 Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:59:37 +0200 Subject: [PATCH 02/13] Add SQL query for robots.txt site analysis This SQL query calculates the percentage of sites with a valid robots.txt file, including various directives, based on data
from the HTTP Archive crawl. --- .../generative-ai/sites-with-robot-txt.sql | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 sql/2025/generative-ai/sites-with-robot-txt.sql diff --git a/sql/2025/generative-ai/sites-with-robot-txt.sql b/sql/2025/generative-ai/sites-with-robot-txt.sql new file mode 100644 index 00000000000..58b46e33e0d --- /dev/null +++ b/sql/2025/generative-ai/sites-with-robot-txt.sql @@ -0,0 +1,32 @@ +#standardSQL +-- % of sites whose robots.txt returns 200 and includes at least one directive; the percentage is therefore smaller (75% instead of the 95% reported elsewhere) +WITH roots AS ( + SELECT + client, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + AND is_root_page +) +SELECT + client, + COUNT(DISTINCT root_page) AS sites, + COUNT(DISTINCT IF(status = 200 + AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL)) AS sites_with_robots_txt, + SAFE_DIVIDE( + COUNT(DISTINCT IF(status = 200 + AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL)), + COUNT(DISTINCT root_page) + ) AS
pct_sites_with_robots_txt +FROM roots +GROUP BY client +ORDER BY client; From 0daad6a887c50c6834a5779f3231c6be6177797a Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:16:53 +0200 Subject: [PATCH 03/13] Add SQL script for user-agent analysis in robots.txt This SQL script calculates the percentage of sites mentioning a specific user-agent in their robots.txt file, categorized by rank bucket. Inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/ --- sql/2025/generative-ai/user-agent-named.sql | 110 ++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 sql/2025/generative-ai/user-agent-named.sql diff --git a/sql/2025/generative-ai/user-agent-named.sql b/sql/2025/generative-ai/user-agent-named.sql new file mode 100644 index 00000000000..9a76924b6cd --- /dev/null +++ b/sql/2025/generative-ai/user-agent-named.sql @@ -0,0 +1,110 @@ +#standardSQL +# Percent of sites with a given user-agent mentioned in robots.txt, by rank bucket +# Returns percentages among (a) all sites and (b) only sites with robots.txt status=200. + +CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING) +RETURNS STRING +LANGUAGE js AS r""" + try { + const obj = JSON.parse(byua_json || '{}'); + const key = String(agent || '').toLowerCase(); + const rec = obj[key]; + return rec ? 
JSON.stringify(rec) : null; + } catch (e) { return null; } +"""; + +-- Base rows: one per site +WITH base AS ( + SELECT + client, + rank, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, + JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + AND is_root_page +), + +-- Extract all UA keys present on each site (keys of by_useragent object) +ua_keys AS ( + SELECT + b.client, + b.rank, + b.root_page, + LOWER(agent) AS agent + FROM base AS b, + UNNEST( + REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{') + ) AS agent +), + +-- Per-site per-agent presence (any directive count > 0) +ua_presence AS ( + SELECT + k.client, + k.rank, + k.root_page, + k.agent, + -- Look up this agent's counts without reparsing the whole robots again + getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj + FROM ua_keys k + JOIN base b + USING (client, rank, root_page) +), + +ua_presence_scored AS ( + SELECT + client, + rank, + root_page, + agent, + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum + FROM ua_presence +), + +-- Totals per rank bucket +totals_all AS ( + SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites + FROM base + GROUP BY client, rank +), +totals_200 AS ( + SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites_200 + FROM base + WHERE status = 200 + GROUP BY client, rank +), + +-- Numerators per agent +numerators AS ( + SELECT + p.client, + p.rank, + p.agent, + COUNT(DISTINCT p.root_page) AS sites_with_agent, + COUNT(DISTINCT IF(b.status = 200, p.root_page, NULL)) AS sites_with_agent_among_200 + FROM 
ua_presence_scored p + JOIN base b USING (client, rank, root_page) + WHERE p.rules_sum > 0 + GROUP BY p.client, p.rank, p.agent +) + +SELECT + n.client, + n.rank, + n.agent, + t.total_sites, + t2.total_sites_200, + n.sites_with_agent, + n.sites_with_agent_among_200, + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 +FROM numerators n +JOIN totals_all t USING (client, rank) +JOIN totals_200 t2 USING (client, rank) +ORDER BY rank, client, pct_of_all_sites DESC; From 49ad032ccbd0a64a587b2bfebb5ed966f6825830 Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:39:13 +0200 Subject: [PATCH 04/13] Create user-agent-years.sql inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/ --- sql/2025/generative-ai/user-agent-years.sql | 98 +++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 sql/2025/generative-ai/user-agent-years.sql diff --git a/sql/2025/generative-ai/user-agent-years.sql b/sql/2025/generative-ai/user-agent-years.sql new file mode 100644 index 00000000000..63b13237428 --- /dev/null +++ b/sql/2025/generative-ai/user-agent-years.sql @@ -0,0 +1,98 @@ +#standardSQL +-- Percent of sites mentioning each user-agent in robots.txt, over fixed Almanac snapshots + +CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING) +RETURNS STRING +LANGUAGE js AS r""" + try { + const obj = JSON.parse(byua_json || '{}'); + const rec = obj[String(agent || '').toLowerCase()]; + return rec ? 
JSON.stringify(rec) : null; + } catch (e) { return null; } +"""; + +-- Base: one row per site snapshot (only fields we need) +WITH base AS ( + SELECT + date, + client, + rank, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, + JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua + FROM `httparchive.crawl.pages` + WHERE date IN ('2019-07-01','2020-08-01','2021-07-01','2022-06-01','2024-06-01','2025-07-01') + AND client = 'mobile' + AND is_root_page +), + +-- Extract UA keys present on each site +ua_keys AS ( + SELECT + b.date, b.client, b.rank, b.root_page, + LOWER(agent) AS agent + FROM base AS b, + UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent +), + +-- Look up that agent’s counts on that site +ua_presence AS ( + SELECT + k.date, k.client, k.rank, k.root_page, k.agent, + getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj, + b.status + FROM ua_keys k + JOIN base b USING (date, client, rank, root_page) +), + +-- Sum rule counts and keep only sites where the agent actually appears +ua_scored AS ( + SELECT + date, client, rank, root_page, agent, status, + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum + FROM ua_presence +), + +-- Denominators per (date, rank) +totals_all AS ( + SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites + FROM base + GROUP BY date, client, rank +), +totals_200 AS ( + SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200 + FROM base + WHERE status = 200 + GROUP BY date, client, rank +), + +-- Numerators per (date, rank, agent) +numerators AS ( + SELECT + date, client, rank, agent, + 
COUNT(DISTINCT IF(rules_sum > 0, root_page, NULL)) AS sites_with_agent, + COUNT(DISTINCT IF(status = 200 AND rules_sum > 0, root_page, NULL)) AS sites_with_agent_among_200 + FROM ua_scored + GROUP BY date, client, rank, agent +) + +SELECT + n.date, + n.client, + n.rank, + n.agent, + t.total_sites, + t2.total_sites_200, + n.sites_with_agent, + n.sites_with_agent_among_200, + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 +FROM numerators n +JOIN totals_all t USING (date, client, rank) +JOIN totals_200 t2 USING (date, client, rank) +WHERE n.sites_with_agent >= 100 +ORDER BY n.date, n.rank, pct_of_all_sites DESC; From 3259399431236b157f329683894482e2baa9f042 Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:58:58 +0100 Subject: [PATCH 05/13] Create gradient-by-client.sql --- sql/2025/generative-ai/gradient-by-client.sql | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 sql/2025/generative-ai/gradient-by-client.sql diff --git a/sql/2025/generative-ai/gradient-by-client.sql b/sql/2025/generative-ai/gradient-by-client.sql new file mode 100644 index 00000000000..8551a1c412d --- /dev/null +++ b/sql/2025/generative-ai/gradient-by-client.sql @@ -0,0 +1,31 @@ +#standardSQL +-- Gradient adoption in CSS variables over time (by year + client, no rank) +-- Detects any "gradient(" inside custom_metrics.css_variables +SELECT + EXTRACT(YEAR FROM date) AS year, + client, + COUNT(DISTINCT page) AS total_sites, + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL + )) AS sites_using_gradient, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL + )), + COUNT(DISTINCT page) + ) AS pct_sites_using_gradient +FROM `httparchive.crawl.pages` +WHERE 
+ is_root_page + AND date IN ( + DATE '2019-07-01', + DATE '2020-08-01', + DATE '2021-07-01', + DATE '2022-07-01', -- CSS parsed-data exception (July) + DATE '2024-06-01', + DATE '2025-07-01' + ) +GROUP BY year, client +ORDER BY year, client; From 6d97a0b0758697e1335bbeaed73f59dd5223bc2a Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Fri, 12 Dec 2025 12:11:55 +0100 Subject: [PATCH 06/13] Add SQL query for gradient adoption analysis This SQL query analyzes the adoption of CSS gradients by clients over specific years, grouping results by rank and calculating the percentage of sites using gradients. --- .../generative-ai/gradient-client-rank.sql | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 sql/2025/generative-ai/gradient-client-rank.sql diff --git a/sql/2025/generative-ai/gradient-client-rank.sql b/sql/2025/generative-ai/gradient-client-rank.sql new file mode 100644 index 00000000000..c87c1fe2631 --- /dev/null +++ b/sql/2025/generative-ai/gradient-client-rank.sql @@ -0,0 +1,52 @@ +#standardSQL +-- Adoption of CSS gradients in custom_metrics.css_variables +-- Grouped by: year, client, rank bucket + +WITH ranks AS ( + SELECT 1000 AS rank_grouping UNION ALL + SELECT 10000 UNION ALL + SELECT 100000 UNION ALL + SELECT 1000000 UNION ALL + SELECT 10000000 UNION ALL + SELECT 100000000 +) + +SELECT + EXTRACT(YEAR FROM date) AS year, + client, + r.rank_grouping, + COUNT(DISTINCT page) AS total_sites, + COUNT(DISTINCT IF( + REGEXP_CONTAINS( + TO_JSON_STRING(custom_metrics.css_variables), + r'(?i)gradient\(' + ), + page, + NULL + )) AS sites_using_gradient, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + REGEXP_CONTAINS( + TO_JSON_STRING(custom_metrics.css_variables), + r'(?i)gradient\(' + ), + page, + NULL + )), + COUNT(DISTINCT page) + ) AS pct_sites_using_gradient +FROM `httparchive.crawl.pages` +CROSS JOIN ranks r +WHERE + is_root_page + AND rank <= r.rank_grouping + AND date IN ( + DATE '2019-07-01', + 
DATE '2020-08-01', + DATE '2021-07-01', + DATE '2022-07-01', -- CSS metrics exception + DATE '2024-06-01', + DATE '2025-07-01' + ) +GROUP BY year, client, r.rank_grouping +ORDER BY year, client, r.rank_grouping; From 79ebcc2695ffa6d29377e02b1d1907a01188af1f Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Sun, 14 Dec 2025 18:58:07 +0100 Subject: [PATCH 07/13] Update and rename gradient-by-client.sql to gradient.sql --- sql/2025/generative-ai/gradient-by-client.sql | 31 ----- sql/2025/generative-ai/gradient.sql | 130 ++++++++++++++++++ 2 files changed, 130 insertions(+), 31 deletions(-) delete mode 100644 sql/2025/generative-ai/gradient-by-client.sql create mode 100644 sql/2025/generative-ai/gradient.sql diff --git a/sql/2025/generative-ai/gradient-by-client.sql b/sql/2025/generative-ai/gradient-by-client.sql deleted file mode 100644 index 8551a1c412d..00000000000 --- a/sql/2025/generative-ai/gradient-by-client.sql +++ /dev/null @@ -1,31 +0,0 @@ -#standardSQL --- Gradient adoption in CSS variables over time (by year + client, no rank) --- Detects any "gradient(" inside custom_metrics.css_variables -SELECT - EXTRACT(YEAR FROM date) AS year, - client, - COUNT(DISTINCT page) AS total_sites, - COUNT(DISTINCT IF( - REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), - page, NULL - )) AS sites_using_gradient, - SAFE_DIVIDE( - COUNT(DISTINCT IF( - REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), - page, NULL - )), - COUNT(DISTINCT page) - ) AS pct_sites_using_gradient -FROM `httparchive.crawl.pages` -WHERE - is_root_page - AND date IN ( - DATE '2019-07-01', - DATE '2020-08-01', - DATE '2021-07-01', - DATE '2022-07-01', -- CSS parsed-data exception (July) - DATE '2024-06-01', - DATE '2025-07-01' - ) -GROUP BY year, client -ORDER BY year, client; diff --git a/sql/2025/generative-ai/gradient.sql b/sql/2025/generative-ai/gradient.sql new file mode 100644 index 
00000000000..b333363b4b0 --- /dev/null +++ b/sql/2025/generative-ai/gradient.sql @@ -0,0 +1,130 @@ +#standardSQL +WITH raw_data AS ( + SELECT + date, + page, + -- 1. TECHNOLOGY FLAGS + -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string + ( + custom_metrics.css_variables IS NOT NULL + AND TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null') + ) AS uses_css_vars, + + -- Tailwind: Check the array for the technology + EXISTS( + SELECT 1 FROM UNNEST(technologies) AS t WHERE t.technology = 'Tailwind CSS' + ) AS uses_tailwind, + + -- Content String for Regex + LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str + FROM + `httparchive.crawl.pages` + WHERE + client = 'mobile' + AND is_root_page + -- NO RANK FILTER (Analyze the entire long-tail of the web) + + -- Quarterly Dates + AND date IN UNNEST([ + DATE '2020-10-01', + DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01', + DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01', + DATE '2023-01-01', DATE '2023-04-01', DATE '2023-07-01', DATE '2023-10-01', + DATE '2024-01-01', DATE '2024-04-01', DATE '2024-07-01', DATE '2024-10-01', + DATE '2025-01-01', DATE '2025-04-01', DATE '2025-07-01', DATE '2025-10-01' + ]) +), + +-- Pre-calculate heuristics +flags AS ( + SELECT + date, + page, + uses_css_vars, + uses_tailwind, + + -- HEURISTIC BOOLEANS (Only true if uses_css_vars is also true) + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'\binter(variable)?\b')) AS has_inter, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'roboto')) AS has_roboto, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'system-ui')) AS has_system_ui, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'linear-gradient\(|radial-gradient\(')) AS has_gradient, + (uses_css_vars AND
REGEXP_CONTAINS(vars_str, r'"(2px|4px|6px|8px|12px|16px|0\.25rem|0\.5rem|0\.75rem|1rem|9999px)"')) AS has_radius, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'rgba\(|box-shadow')) AS has_shadow + FROM + raw_data +) + +SELECT + FORMAT_DATE('%Y-Q%Q', date) AS year_quarter, + + -- 1. CONTEXT (Denominators) + COUNT(DISTINCT page) AS total_sites, + COUNT(DISTINCT IF(uses_css_vars, page, NULL)) AS sites_using_vars, + COUNT(DISTINCT IF(uses_tailwind, page, NULL)) AS sites_using_tailwind, + + ------------------------------------------------------------------------- + -- 2. "AI PURPLE" SPECTRUM (Indigo/Violet/Purple 500) + ------------------------------------------------------------------------- + COUNT(DISTINCT IF(has_ai_purples, page, NULL)) AS cnt_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples, page, NULL)), COUNT(DISTINCT page)) AS pct_all_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_ai_purples, + + ------------------------------------------------------------------------- + -- 3. SPECIFIC INDIGO 500 (#6366f1 Only) + ------------------------------------------------------------------------- + COUNT(DISTINCT IF(has_indigo_500, page, NULL)) AS cnt_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500, page, NULL)), COUNT(DISTINCT page)) AS pct_all_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500 AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_indigo, + + ------------------------------------------------------------------------- + -- 4. 
FONTS + ------------------------------------------------------------------------- + -- Inter + COUNT(DISTINCT IF(has_inter, page, NULL)) AS cnt_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT page)) AS pct_all_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_inter, + + -- Roboto + COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto, page, NULL)), COUNT(DISTINCT page)) AS pct_all_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_roboto, + + -- System UI + COUNT(DISTINCT IF(has_system_ui, page, NULL)) AS cnt_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui, page, NULL)), COUNT(DISTINCT page)) AS pct_all_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_system, + + ------------------------------------------------------------------------- + -- 5. 
UI ELEMENTS + ------------------------------------------------------------------------- + -- Gradients + COUNT(DISTINCT IF(has_gradient, page, NULL)) AS cnt_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient, page, NULL)), COUNT(DISTINCT page)) AS pct_all_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_gradient, + + -- Radius + COUNT(DISTINCT IF(has_radius, page, NULL)) AS cnt_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius, page, NULL)), COUNT(DISTINCT page)) AS pct_all_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_radius, + + -- Shadows + COUNT(DISTINCT IF(has_shadow, page, NULL)) AS cnt_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow, page, NULL)), COUNT(DISTINCT page)) AS pct_all_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_shadow + +FROM + flags +GROUP BY + year_quarter +ORDER BY + year_quarter; From 12c22cab653889b89e7aa8073ae6958a0c2f1d53 Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Sun, 14 Dec 2025 20:02:00 +0100 Subject: [PATCH 08/13] Add SQL query for .ai domain ranking analysis --- sql/2025/generative-ai/ai_tld.sql | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 sql/2025/generative-ai/ai_tld.sql diff --git a/sql/2025/generative-ai/ai_tld.sql b/sql/2025/generative-ai/ai_tld.sql new file mode 100644 
index 00000000000..74c5e7257bd --- /dev/null +++ b/sql/2025/generative-ai/ai_tld.sql @@ -0,0 +1,45 @@ +#standardSQL +# .ai domains per exclusive rank bucket, 2022 vs 2025 + +WITH base AS ( + SELECT + date, + client, + rank, + NET.HOST(page) AS host + FROM + `httparchive.crawl.pages` + WHERE + is_root_page + AND client IN ('desktop', 'mobile') + AND date IN ('2022-06-01', '2025-07-01') + AND rank <= 10000000 + AND ENDS_WITH(NET.HOST(page), '.ai') +), + +bucketed AS ( + SELECT + date, + client, + CASE + WHEN rank <= 1000 THEN 1000 + WHEN rank <= 10000 THEN 10000 + WHEN rank <= 100000 THEN 100000 + WHEN rank <= 1000000 THEN 1000000 + WHEN rank <= 10000000 THEN 10000000 + END AS rank_bucket, + host + FROM base +) + +SELECT + date, + client, + rank_bucket, + COUNT(DISTINCT host) AS ai_domains +FROM + bucketed +GROUP BY + date, client, rank_bucket +ORDER BY + date, client, rank_bucket; From e299642fac537e245d4c7a25b925ba45b38f59be Mon Sep 17 00:00:00 2001 From: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Date: Sun, 14 Dec 2025 20:02:31 +0100 Subject: [PATCH 09/13] Add SQL script for analyzing web page platforms --- sql/2025/generative-ai/vibecodetools.sql | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 sql/2025/generative-ai/vibecodetools.sql diff --git a/sql/2025/generative-ai/vibecodetools.sql b/sql/2025/generative-ai/vibecodetools.sql new file mode 100644 index 00000000000..f5e6d3ee240 --- /dev/null +++ b/sql/2025/generative-ai/vibecodetools.sql @@ -0,0 +1,68 @@ +#standardSQL +WITH base AS ( + SELECT + date, + page, + NET.HOST(page) AS host + FROM + `httparchive.crawl.pages` + WHERE + client = 'desktop' + AND is_root_page + AND date IN ( + '2020-01-01','2020-04-01','2020-07-01','2020-10-01', + '2021-01-01','2021-04-01','2021-07-01','2021-10-01', + '2022-01-01','2022-04-01','2022-07-01','2022-10-01', + '2023-01-01','2023-04-01','2023-07-01','2023-10-01', + '2024-01-01','2024-04-01','2024-07-01','2024-10-01', + 
'2025-01-01','2025-04-01','2025-07-01','2025-10-01' + ) +), + +classified AS ( + SELECT + date, + page, + CASE + WHEN ENDS_WITH(CONCAT('.', host), '.vercel.app') THEN 'vercel' + WHEN ENDS_WITH(CONCAT('.', host), '.pages.dev') THEN 'cloudflare_pages' + WHEN ENDS_WITH(CONCAT('.', host), '.workers.dev') THEN 'cloudflare_workers' + WHEN ENDS_WITH(CONCAT('.', host), '.lovable.app') OR ENDS_WITH(CONCAT('.', host), '.lovable.dev') THEN 'lovable' + WHEN ENDS_WITH(CONCAT('.', host), '.bolt.new') OR ENDS_WITH(CONCAT('.', host), '.stackblitz.io') THEN 'bolt' + WHEN ENDS_WITH(CONCAT('.', host), '.v0.dev') THEN 'v0' + WHEN ENDS_WITH(CONCAT('.', host), '.replit.app') THEN 'replit' + ELSE NULL + END AS platform + FROM + base +), + +totals AS ( + SELECT + date, + COUNT(0) AS total_pages + FROM + base + GROUP BY + date +) + +SELECT + c.date, + c.platform, + COUNT(0) AS pages, + SAFE_DIVIDE(COUNT(0), t.total_pages) AS pct_pages +FROM + classified c +JOIN + totals t +USING (date) +WHERE + c.platform IS NOT NULL +GROUP BY + c.date, + c.platform, + t.total_pages +ORDER BY + c.date, + pct_pages DESC; From 5c6f0018817e40bf344171574c263f48511926e8 Mon Sep 17 00:00:00 2001 From: Christian Liebel Date: Mon, 29 Dec 2025 11:46:29 +0100 Subject: [PATCH 10/13] Fix linter errors in SQL scripts --- sql/2025/generative-ai/ai_tld.sql | 10 +++--- .../generative-ai/gradient-client-rank.sql | 21 +++++++----- sql/2025/generative-ai/gradient-use.sql | 10 +++--- sql/2025/generative-ai/gradient.sql | 20 +++++------ .../generative-ai/sites-with-robot-txt.sql | 33 +++++++++++-------- sql/2025/generative-ai/user-agent-named.sql | 29 ++++++++-------- sql/2025/generative-ai/user-agent-years.sql | 21 ++++++------ sql/2025/generative-ai/vibecodetools.sql | 18 +++++----- 8 files changed, 88 insertions(+), 74 deletions(-) diff --git a/sql/2025/generative-ai/ai_tld.sql b/sql/2025/generative-ai/ai_tld.sql index 74c5e7257bd..6ac62d434c3 100644 --- a/sql/2025/generative-ai/ai_tld.sql +++ b/sql/2025/generative-ai/ai_tld.sql @@ -10,11 +10,11 @@ WITH base AS ( FROM `httparchive.crawl.pages` WHERE - is_root_page - AND client IN ('desktop',
'mobile') - AND date IN ('2022-06-01', '2025-07-01') - AND rank <= 10000000 - AND ENDS_WITH(NET.HOST(page), '.ai') + is_root_page AND + client IN ('desktop', 'mobile') AND + date IN ('2022-06-01', '2025-07-01') AND + rank <= 10000000 AND + ENDS_WITH(NET.HOST(page), '.ai') ), bucketed AS ( diff --git a/sql/2025/generative-ai/gradient-client-rank.sql b/sql/2025/generative-ai/gradient-client-rank.sql index c87c1fe2631..0ec51acd3ae 100644 --- a/sql/2025/generative-ai/gradient-client-rank.sql +++ b/sql/2025/generative-ai/gradient-client-rank.sql @@ -3,11 +3,16 @@ -- Grouped by: year, client, rank bucket WITH ranks AS ( - SELECT 1000 AS rank_grouping UNION ALL - SELECT 10000 UNION ALL - SELECT 100000 UNION ALL - SELECT 1000000 UNION ALL - SELECT 10000000 UNION ALL + SELECT 1000 AS rank_grouping + UNION ALL + SELECT 10000 + UNION ALL + SELECT 100000 + UNION ALL + SELECT 1000000 + UNION ALL + SELECT 10000000 + UNION ALL SELECT 100000000 ) @@ -38,9 +43,9 @@ SELECT FROM `httparchive.crawl.pages` CROSS JOIN ranks r WHERE - is_root_page - AND rank <= r.rank_grouping - AND date IN ( + is_root_page AND + rank <= r.rank_grouping AND + date IN ( DATE '2019-07-01', DATE '2020-08-01', DATE '2021-07-01', diff --git a/sql/2025/generative-ai/gradient-use.sql b/sql/2025/generative-ai/gradient-use.sql index 5da25613b9b..e77ab21c558 100644 --- a/sql/2025/generative-ai/gradient-use.sql +++ b/sql/2025/generative-ai/gradient-use.sql @@ -8,17 +8,19 @@ SELECT COUNT(DISTINCT page) AS total_pages, COUNT(DISTINCT IF( REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), - page, NULL)) AS pages_with_gradient, + page, NULL + )) AS pages_with_gradient, SAFE_DIVIDE( COUNT(DISTINCT IF( REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), - page, NULL)), + page, NULL + )), COUNT(DISTINCT page) ) AS pct_with_gradient FROM `httparchive.crawl.pages` WHERE - is_root_page - AND date IN ( + is_root_page AND + date IN ( DATE '2019-07-01', DATE 
'2020-08-01', DATE '2021-07-01', diff --git a/sql/2025/generative-ai/gradient.sql b/sql/2025/generative-ai/gradient.sql index b333363b4b0..39031304f94 100644 --- a/sql/2025/generative-ai/gradient.sql +++ b/sql/2025/generative-ai/gradient.sql @@ -4,12 +4,12 @@ WITH raw_data AS ( date, page, -- 1. TECHNOLOGY FLAGS - -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string + -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string ( - custom_metrics.css_variables IS NOT NULL - AND TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null') + custom_metrics.css_variables IS NOT NULL AND + TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null') ) AS uses_css_vars, - + -- Tailwind: Check the array for the technology EXISTS( SELECT 1 FROM UNNEST(technologies) AS t WHERE t.technology = 'Tailwind CSS' @@ -20,12 +20,12 @@ WITH raw_data AS ( FROM `httparchive.crawl.pages` WHERE - client = 'mobile' - AND is_root_page + client = 'mobile' AND + is_root_page AND -- NO RANK FILTER (Analyze the entire long-tail of the web) - + -- Quarterly Dates - AND date IN UNNEST([ + date IN UNNEST([ DATE '2020-10-01', DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01', DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01', @@ -42,7 +42,7 @@ flags AS ( page, uses_css_vars, uses_tailwind, - + -- HEURISTIC BOOLEANS (Only true if uses_css_vars is also true) (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500, (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples, @@ -88,7 +88,7 @@ SELECT IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT page)) AS pct_all_inter, IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_inter, IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, 
page, NULL))) AS pct_tw_inter, - + -- Roboto COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto, IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto, page, NULL)), COUNT(DISTINCT page)) AS pct_all_roboto, diff --git a/sql/2025/generative-ai/sites-with-robot-txt.sql b/sql/2025/generative-ai/sites-with-robot-txt.sql index 58b46e33e0d..b3da2454592 100644 --- a/sql/2025/generative-ai/sites-with-robot-txt.sql +++ b/sql/2025/generative-ai/sites-with-robot-txt.sql @@ -5,26 +5,31 @@ WITH roots AS ( client, root_page, SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt, - COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt, + 
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' - AND is_root_page + WHERE date = '2025-07-01' AND + is_root_page ) + SELECT client, COUNT(DISTINCT root_page) AS sites, - COUNT(DISTINCT IF(status = 200 - AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, - root_page, NULL)) AS sites_with_robots_txt, + COUNT(DISTINCT IF( + status = 200 AND + (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL + )) AS sites_with_robots_txt, SAFE_DIVIDE( - COUNT(DISTINCT IF(status = 200 - AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, - root_page, NULL)), + COUNT(DISTINCT IF( + status = 200 AND + (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL + )), COUNT(DISTINCT root_page) ) AS pct_sites_with_robots_txt FROM roots diff --git a/sql/2025/generative-ai/user-agent-named.sql b/sql/2025/generative-ai/user-agent-named.sql index 9a76924b6cd..b63794af536 100644 --- a/sql/2025/generative-ai/user-agent-named.sql +++ b/sql/2025/generative-ai/user-agent-named.sql @@ -22,8 +22,8 @@ WITH base AS ( SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' - AND is_root_page + WHERE date = '2025-07-01' AND + is_root_page ), -- Extract all UA keys present on each site (keys of by_useragent object) @@ -34,9 +34,9 @@ ua_keys AS ( b.root_page, LOWER(agent) AS agent FROM base AS b, - UNNEST( - REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{') - ) AS agent + UNNEST( + REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{') + ) AS agent ), -- Per-site per-agent presence (any directive count > 0) @@ -50,7 +50,7 @@ 
ua_presence AS ( getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj FROM ua_keys k JOIN base b - USING (client, rank, root_page) + USING (client, rank, root_page) ), ua_presence_scored AS ( @@ -59,11 +59,11 @@ ua_presence_scored AS ( rank, root_page, agent, - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum FROM ua_presence ), @@ -73,6 +73,7 @@ totals_all AS ( FROM base GROUP BY client, rank ), + totals_200 AS ( SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites_200 FROM base @@ -102,9 +103,9 @@ SELECT t2.total_sites_200, n.sites_with_agent, n.sites_with_agent_among_200, - SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, - SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 FROM numerators n -JOIN totals_all t USING (client, rank) +JOIN totals_all t USING (client, rank) JOIN totals_200 t2 USING (client, rank) ORDER BY rank, client, pct_of_all_sites DESC; diff --git a/sql/2025/generative-ai/user-agent-years.sql b/sql/2025/generative-ai/user-agent-years.sql index 63b13237428..70c154736a9 100644 --- a/sql/2025/generative-ai/user-agent-years.sql +++ 
b/sql/2025/generative-ai/user-agent-years.sql @@ -21,9 +21,9 @@ WITH base AS ( SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua FROM `httparchive.crawl.pages` - WHERE date IN ('2019-07-01','2020-08-01','2021-07-01','2022-06-01','2024-06-01','2025-07-01') - AND client = 'mobile' - AND is_root_page + WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND + client = 'mobile' AND + is_root_page ), -- Extract UA keys present on each site @@ -32,7 +32,7 @@ ua_keys AS ( b.date, b.client, b.rank, b.root_page, LOWER(agent) AS agent FROM base AS b, - UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent + UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent ), -- Look up that agent’s counts on that site @@ -49,11 +49,11 @@ ua_presence AS ( ua_scored AS ( SELECT date, client, rank, root_page, agent, status, - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum FROM ua_presence ), @@ -63,6 +63,7 @@ totals_all AS ( FROM base GROUP BY date, client, rank ), + totals_200 AS ( SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200 FROM base @@ -89,10 +90,10 @@ SELECT t2.total_sites_200, n.sites_with_agent, n.sites_with_agent_among_200, - 
SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 FROM numerators n -JOIN totals_all t USING (date, client, rank) +JOIN totals_all t USING (date, client, rank) JOIN totals_200 t2 USING (date, client, rank) WHERE n.sites_with_agent >= 100 ORDER BY n.date, n.rank, pct_of_all_sites DESC; diff --git a/sql/2025/generative-ai/vibecodetools.sql b/sql/2025/generative-ai/vibecodetools.sql index f5e6d3ee240..73e239be5e2 100644 --- a/sql/2025/generative-ai/vibecodetools.sql +++ b/sql/2025/generative-ai/vibecodetools.sql @@ -7,15 +7,15 @@ WITH base AS ( FROM `httparchive.crawl.pages` WHERE - client = 'desktop' - AND is_root_page - AND date IN ( - '2020-01-01','2020-04-01','2020-07-01','2020-10-01', - '2021-01-01','2021-04-01','2021-07-01','2021-10-01', - '2022-01-01','2022-04-01','2022-07-01','2022-10-01', - '2023-01-01','2023-04-01','2023-07-01','2023-10-01', - '2024-01-01','2024-04-01','2024-07-01','2024-10-01', - '2025-01-01','2025-04-01','2025-07-01','2025-10-01' + client = 'desktop' AND + is_root_page AND + date IN ( + '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01', + '2021-01-01', '2021-04-01', '2021-07-01', '2021-10-01', + '2022-01-01', '2022-04-01', '2022-07-01', '2022-10-01', + '2023-01-01', '2023-04-01', '2023-07-01', '2023-10-01', + '2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01', + '2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01' ) ), From 265c477fc2ceaa34dd5b95338c400ddaf5f7abcb Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Fri, 9 Jan 2026 11:33:09 +0000 Subject: [PATCH 11/13] Apply suggestions from code review --- sql/2025/generative-ai/ai_tld.sql | 2 +- .../generative-ai/gradient-client-rank.sql | 35 +++++++------- sql/2025/generative-ai/gradient-use.sql | 13 +++-- sql/2025/generative-ai/gradient.sql | 4 +- .../generative-ai/sites-with-robot-txt.sql | 15 ++++-- 
sql/2025/generative-ai/user-agent-named.sql | 43 +++++++++-------- sql/2025/generative-ai/user-agent-years.sql | 47 ++++++++++++------- sql/2025/generative-ai/vibecodetools.sql | 14 +++--- 8 files changed, 98 insertions(+), 75 deletions(-) diff --git a/sql/2025/generative-ai/ai_tld.sql b/sql/2025/generative-ai/ai_tld.sql index 6ac62d434c3..4c64b63b5c0 100644 --- a/sql/2025/generative-ai/ai_tld.sql +++ b/sql/2025/generative-ai/ai_tld.sql @@ -13,7 +13,7 @@ WITH base AS ( is_root_page AND client IN ('desktop', 'mobile') AND date IN ('2022-06-01', '2025-07-01') AND - rank <= 10000000 AND + rank <= 10000000 AND -- later years go beyond 10,000,000 but let's keep to this limit for consistency ENDS_WITH(NET.HOST(page), '.ai') ), diff --git a/sql/2025/generative-ai/gradient-client-rank.sql b/sql/2025/generative-ai/gradient-client-rank.sql index 0ec51acd3ae..7129513478e 100644 --- a/sql/2025/generative-ai/gradient-client-rank.sql +++ b/sql/2025/generative-ai/gradient-client-rank.sql @@ -2,24 +2,14 @@ -- Adoption of CSS gradients in custom_metrics.css_variables -- Grouped by: year, client, rank bucket -WITH ranks AS ( - SELECT 1000 AS rank_grouping - UNION ALL - SELECT 10000 - UNION ALL - SELECT 100000 - UNION ALL - SELECT 1000000 - UNION ALL - SELECT 10000000 - UNION ALL - SELECT 100000000 -) +#standardSQL +-- Adoption of CSS gradients in custom_metrics.css_variables +-- Grouped by: year, client, rank bucket SELECT EXTRACT(YEAR FROM date) AS year, client, - r.rank_grouping, + rank_grouping, COUNT(DISTINCT page) AS total_sites, COUNT(DISTINCT IF( REGEXP_CONTAINS( @@ -40,11 +30,12 @@ SELECT )), COUNT(DISTINCT page) ) AS pct_sites_using_gradient -FROM `httparchive.crawl.pages` -CROSS JOIN ranks r +FROM + `httparchive.crawl.pages`, + UNNEST ([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping WHERE is_root_page AND - rank <= r.rank_grouping AND + rank <= rank_grouping AND date IN ( DATE '2019-07-01', DATE '2020-08-01', @@ -53,5 +44,11 @@ WHERE DATE '2024-06-01', DATE 
'2025-07-01' ) -GROUP BY year, client, r.rank_grouping -ORDER BY year, client, r.rank_grouping; +GROUP BY + year, + client, + rank_grouping +ORDER BY + year, + client, + rank_grouping; diff --git a/sql/2025/generative-ai/gradient-use.sql b/sql/2025/generative-ai/gradient-use.sql index e77ab21c558..3033ad62599 100644 --- a/sql/2025/generative-ai/gradient-use.sql +++ b/sql/2025/generative-ai/gradient-use.sql @@ -17,7 +17,8 @@ SELECT )), COUNT(DISTINCT page) ) AS pct_with_gradient -FROM `httparchive.crawl.pages` +FROM + `httparchive.crawl.pages` WHERE is_root_page AND date IN ( @@ -28,5 +29,11 @@ WHERE DATE '2024-06-01', DATE '2025-07-01' ) -GROUP BY year, client, rank -ORDER BY year, client, rank; +GROUP BY + year, + client, + rank +ORDER BY + year, + client, + rank; diff --git a/sql/2025/generative-ai/gradient.sql b/sql/2025/generative-ai/gradient.sql index 39031304f94..d01a03e5763 100644 --- a/sql/2025/generative-ai/gradient.sql +++ b/sql/2025/generative-ai/gradient.sql @@ -11,9 +11,7 @@ WITH raw_data AS ( ) AS uses_css_vars, -- Tailwind: Check the array for the technology - EXISTS( - SELECT 1 FROM UNNEST(technologies) AS t WHERE t.technology = 'Tailwind CSS' - ) AS uses_tailwind, + 'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind, -- Content String for Regex LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str diff --git a/sql/2025/generative-ai/sites-with-robot-txt.sql b/sql/2025/generative-ai/sites-with-robot-txt.sql index b3da2454592..2d74dff6a24 100644 --- a/sql/2025/generative-ai/sites-with-robot-txt.sql +++ b/sql/2025/generative-ai/sites-with-robot-txt.sql @@ -11,8 +11,10 @@ WITH roots AS ( COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt, COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt, COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS 
INT64), 0) AS ua_cnt - FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' AND + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND is_root_page ) @@ -32,6 +34,9 @@ SELECT )), COUNT(DISTINCT root_page) ) AS pct_sites_with_robots_txt -FROM roots -GROUP BY client -ORDER BY client; +FROM + roots +GROUP BY + client +ORDER BY + client; diff --git a/sql/2025/generative-ai/user-agent-named.sql b/sql/2025/generative-ai/user-agent-named.sql index b63794af536..e7570c279da 100644 --- a/sql/2025/generative-ai/user-agent-named.sql +++ b/sql/2025/generative-ai/user-agent-named.sql @@ -2,14 +2,13 @@ # Percent of sites with a given user-agent mentioned in robots.txt, by rank bucket # Returns percentages among (a) all sites and (b) only sites with robots.txt status=200. -CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING) -RETURNS STRING +CREATE TEMP FUNCTION getByAgent(byua_json JSON, agent STRING) +RETURNS JSON LANGUAGE js AS r""" try { - const obj = JSON.parse(byua_json || '{}'); const key = String(agent || '').toLowerCase(); - const rec = obj[key]; - return rec ? 
JSON.stringify(rec) : null; + const rec = byua_json[key]; + return rec; } catch (e) { return null; } """; @@ -19,10 +18,12 @@ WITH base AS ( client, rank, root_page, - SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, - JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua - FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' AND + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.status) AS INT64) AS status, + custom_metrics.robots_txt.record_counts.by_useragent AS byua + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND is_root_page ), @@ -33,7 +34,8 @@ ua_keys AS ( b.rank, b.root_page, LOWER(agent) AS agent - FROM base AS b, + FROM + base AS b, UNNEST( REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{') ) AS agent @@ -47,9 +49,11 @@ ua_presence AS ( k.root_page, k.agent, -- Look up this agent's counts without reparsing the whole robots again - getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj - FROM ua_keys k - JOIN base b + getByAgent(b.byua, k.agent) AS agent_obj + FROM + ua_keys k + JOIN + base b USING (client, rank, root_page) ), @@ -59,12 +63,13 @@ ua_presence_scored AS ( rank, root_page, agent, - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum - FROM ua_presence + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.allow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.disallow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.crawl_delay) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.noindex) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.other) AS INT64), 0) AS rules_sum + FROM + ua_presence ), -- Totals per rank bucket 
diff --git a/sql/2025/generative-ai/user-agent-years.sql b/sql/2025/generative-ai/user-agent-years.sql index 70c154736a9..06ad92b5f9a 100644 --- a/sql/2025/generative-ai/user-agent-years.sql +++ b/sql/2025/generative-ai/user-agent-years.sql @@ -1,13 +1,12 @@ #standardSQL -- Percent of sites mentioning each user-agent in robots.txt, over fixed Almanac snapshots -CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING) -RETURNS STRING +CREATE TEMP FUNCTION getByAgent(byua_json JSON, agent STRING) +RETURNS JSON LANGUAGE js AS r""" try { - const obj = JSON.parse(byua_json || '{}'); - const rec = obj[String(agent || '').toLowerCase()]; - return rec ? JSON.stringify(rec) : null; + const rec = byua_json[String(agent || '').toLowerCase()]; + return rec; } catch (e) { return null; } """; @@ -18,10 +17,12 @@ WITH base AS ( client, rank, root_page, - SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status, - JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua - FROM `httparchive.crawl.pages` - WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.status) AS INT64) AS status, + custom_metrics.robots_txt.record_counts.by_useragent AS byua + FROM + `httparchive.crawl.pages` + WHERE + date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND client = 'mobile' AND is_root_page ), @@ -38,8 +39,12 @@ ua_keys AS ( -- Look up that agent’s counts on that site ua_presence AS ( SELECT - k.date, k.client, k.rank, k.root_page, k.agent, - getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj, + k.date, + k.client, + k.rank, + k.root_page, + k.agent, + getByAgent(b.byua, k.agent) AS agent_obj, b.status FROM ua_keys k JOIN base b USING (date, client, rank, root_page) @@ -48,13 +53,19 @@ ua_presence AS ( -- Sum rule counts and keep only sites where the agent actually appears ua_scored AS ( SELECT - 
date, client, rank, root_page, agent, status, - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) + - COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum - FROM ua_presence + date, + client, + rank, + root_page, + agent, + status, + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.allow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.disallow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.crawl_delay) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.noindex) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.other) AS INT64), 0) AS rules_sum + FROM + ua_presence ), -- Denominators per (date, rank) diff --git a/sql/2025/generative-ai/vibecodetools.sql b/sql/2025/generative-ai/vibecodetools.sql index 73e239be5e2..65cf77443da 100644 --- a/sql/2025/generative-ai/vibecodetools.sql +++ b/sql/2025/generative-ai/vibecodetools.sql @@ -24,13 +24,13 @@ classified AS ( date, page, CASE - WHEN ENDS_WITH(host, 'vercel.app') THEN 'vercel' - WHEN ENDS_WITH(host, 'pages.dev') THEN 'cloudflare_pages' - WHEN ENDS_WITH(host, 'workers.dev') THEN 'cloudflare_workers' - WHEN ENDS_WITH(host, 'lovable.app') OR ENDS_WITH(host, 'lovable.dev') THEN 'lovable' - WHEN ENDS_WITH(host, 'bolt.new') OR ENDS_WITH(host, 'stackblitz.io') THEN 'bolt' - WHEN ENDS_WITH(host, 'v0.dev') THEN 'v0' - WHEN ENDS_WITH(host, 'replit.app') THEN 'replit' + WHEN ENDS_WITH(host, '.vercel.app') THEN 'vercel' + WHEN ENDS_WITH(host, '.pages.dev') THEN 'cloudflare_pages' + WHEN ENDS_WITH(host, '.workers.dev') THEN 'cloudflare_workers' + WHEN ENDS_WITH(host, '.lovable.app') OR ENDS_WITH(host, 'lovable.dev') THEN 'lovable' + WHEN ENDS_WITH(host, '.bolt.new') OR ENDS_WITH(host, 'stackblitz.io') THEN 'bolt' + WHEN ENDS_WITH(host, 
'.v0.dev') THEN 'v0' + WHEN ENDS_WITH(host, '.replit.app') THEN 'replit' ELSE NULL END AS platform FROM From 03388bd35544244e773d13f3fd4d267a0bec00e3 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Fri, 9 Jan 2026 11:37:12 +0000 Subject: [PATCH 12/13] Update sql/2025/generative-ai/gradient-client-rank.sql --- sql/2025/generative-ai/gradient-client-rank.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2025/generative-ai/gradient-client-rank.sql b/sql/2025/generative-ai/gradient-client-rank.sql index 7129513478e..7d57130b004 100644 --- a/sql/2025/generative-ai/gradient-client-rank.sql +++ b/sql/2025/generative-ai/gradient-client-rank.sql @@ -32,7 +32,7 @@ SELECT ) AS pct_sites_using_gradient FROM `httparchive.crawl.pages`, - UNNEST ([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping WHERE is_root_page AND rank <= rank_grouping AND From 75e6599f59a2684e567a645e9a7a219182786b80 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Fri, 9 Jan 2026 11:52:27 +0000 Subject: [PATCH 13/13] more formatting --- sql/2025/generative-ai/ai_tld.sql | 11 +++- sql/2025/generative-ai/user-agent-named.sql | 58 ++++++++++++----- sql/2025/generative-ai/user-agent-years.sql | 69 ++++++++++++++++----- 3 files changed, 103 insertions(+), 35 deletions(-) diff --git a/sql/2025/generative-ai/ai_tld.sql b/sql/2025/generative-ai/ai_tld.sql index 4c64b63b5c0..1e8bc361f2e 100644 --- a/sql/2025/generative-ai/ai_tld.sql +++ b/sql/2025/generative-ai/ai_tld.sql @@ -29,7 +29,8 @@ bucketed AS ( WHEN rank <= 10000000 THEN 10000000 END AS rank_bucket, host - FROM base + FROM + base ) SELECT @@ -40,6 +41,10 @@ SELECT FROM bucketed GROUP BY - date, client, rank_bucket + date, + client, + rank_bucket ORDER BY - date, client, rank_bucket; + date, + client, + rank_bucket; diff --git a/sql/2025/generative-ai/user-agent-named.sql b/sql/2025/generative-ai/user-agent-named.sql index e7570c279da..51f7309dc2e 100644 
--- a/sql/2025/generative-ai/user-agent-named.sql +++ b/sql/2025/generative-ai/user-agent-named.sql @@ -74,16 +74,29 @@ ua_presence_scored AS ( -- Totals per rank bucket totals_all AS ( - SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites - FROM base - GROUP BY client, rank + SELECT + client, + rank, + COUNT(DISTINCT root_page) AS total_sites + FROM + base + GROUP BY + client, + rank ), totals_200 AS ( - SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites_200 - FROM base - WHERE status = 200 - GROUP BY client, rank + SELECT + client, + rank, + COUNT(DISTINCT root_page) AS total_sites_200 + FROM + base + WHERE + status = 200 + GROUP BY + client, + rank ), -- Numerators per agent @@ -94,10 +107,17 @@ numerators AS ( p.agent, COUNT(DISTINCT p.root_page) AS sites_with_agent, COUNT(DISTINCT IF(b.status = 200, p.root_page, NULL)) AS sites_with_agent_among_200 - FROM ua_presence_scored p - JOIN base b USING (client, rank, root_page) - WHERE p.rules_sum > 0 - GROUP BY p.client, p.rank, p.agent + FROM + ua_presence_scored p + JOIN + base b + USING (client, rank, root_page) + WHERE + p.rules_sum > 0 + GROUP BY + p.client, + p.rank, + p.agent ) SELECT @@ -110,7 +130,15 @@ SELECT n.sites_with_agent_among_200, SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 -FROM numerators n -JOIN totals_all t USING (client, rank) -JOIN totals_200 t2 USING (client, rank) -ORDER BY rank, client, pct_of_all_sites DESC; +FROM + numerators n +JOIN + totals_all t +USING (client, rank) +JOIN + totals_200 t2 +USING (client, rank) +ORDER BY + rank, + client, + pct_of_all_sites DESC; diff --git a/sql/2025/generative-ai/user-agent-years.sql b/sql/2025/generative-ai/user-agent-years.sql index 06ad92b5f9a..d8d318ae296 100644 --- a/sql/2025/generative-ai/user-agent-years.sql +++ b/sql/2025/generative-ai/user-agent-years.sql @@ -32,7 +32,8 @@ ua_keys AS ( SELECT b.date, b.client, 
b.rank, b.root_page, LOWER(agent) AS agent - FROM base AS b, + FROM + base AS b, UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent ), @@ -46,8 +47,11 @@ ua_presence AS ( k.agent, getByAgent(b.byua, k.agent) AS agent_obj, b.status - FROM ua_keys k - JOIN base b USING (date, client, rank, root_page) + FROM + ua_keys k + JOIN + base b + USING (date, client, rank, root_page) ), -- Sum rule counts and keep only sites where the agent actually appears @@ -70,16 +74,33 @@ ua_scored AS ( -- Denominators per (date, rank) totals_all AS ( - SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites - FROM base - GROUP BY date, client, rank + SELECT + date, + client, + rank, + COUNT(DISTINCT root_page) AS total_sites + FROM + base + GROUP BY + date, + client, + rank ), totals_200 AS ( - SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200 - FROM base - WHERE status = 200 - GROUP BY date, client, rank + SELECT + date, + client, + rank, + COUNT(DISTINCT root_page) AS total_sites_200 + FROM + base + WHERE + status = 200 + GROUP BY + date, + client, + rank ), -- Numerators per (date, rank, agent) @@ -88,8 +109,13 @@ numerators AS ( date, client, rank, agent, COUNT(DISTINCT IF(rules_sum > 0, root_page, NULL)) AS sites_with_agent, COUNT(DISTINCT IF(status = 200 AND rules_sum > 0, root_page, NULL)) AS sites_with_agent_among_200 - FROM ua_scored - GROUP BY date, client, rank, agent + FROM + ua_scored + GROUP BY + date, + client, + rank, + agent ) SELECT @@ -103,8 +129,17 @@ SELECT n.sites_with_agent_among_200, SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 -FROM numerators n -JOIN totals_all t USING (date, client, rank) -JOIN totals_200 t2 USING (date, client, rank) -WHERE n.sites_with_agent >= 100 -ORDER BY n.date, n.rank, pct_of_all_sites DESC; +FROM + numerators n +JOIN + totals_all t +USING (date, client, rank) +JOIN + 
totals_200 t2 +USING (date, client, rank) +WHERE + n.sites_with_agent >= 100 +ORDER BY + n.date, + n.rank, + pct_of_all_sites DESC;