Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions sql/2025/generative-ai/ai_tld.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#standardSQL
# Number of distinct .ai hosts in each exclusive rank bucket, comparing the 2022 and 2025 crawls.
# "Exclusive" means a host is counted only in the smallest bucket that contains its rank.

WITH ai_pages AS (
  SELECT
    date,
    client,
    NET.HOST(page) AS host,
    -- First matching branch wins, so each page lands in exactly one bucket
    CASE
      WHEN rank <= 1000 THEN 1000
      WHEN rank <= 10000 THEN 10000
      WHEN rank <= 100000 THEN 100000
      WHEN rank <= 1000000 THEN 1000000
      WHEN rank <= 10000000 THEN 10000000
    END AS rank_bucket
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN ('2022-06-01', '2025-07-01') AND
    client IN ('desktop', 'mobile') AND
    is_root_page AND
    rank <= 10000000 AND -- later years go beyond 10,000,000 but let's keep to this limit for consistency
    ENDS_WITH(NET.HOST(page), '.ai')
)

SELECT
  date,
  client,
  rank_bucket,
  COUNT(DISTINCT host) AS ai_domains
FROM
  ai_pages
GROUP BY
  date,
  client,
  rank_bucket
ORDER BY
  date,
  client,
  rank_bucket;
54 changes: 54 additions & 0 deletions sql/2025/generative-ai/gradient-client-rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#standardSQL
-- Adoption of CSS gradients in custom_metrics.css_variables
-- Grouped by: year, client, rank bucket
--
-- Rank buckets are cumulative: a page is counted in every rank_grouping that is
-- greater than or equal to its rank (see the `rank <= rank_grouping` filter).
-- A page "uses a gradient" when the JSON-serialized css_variables metric contains
-- the text `gradient(` (case-insensitive).

WITH pages AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank,
    page,
    -- Computed once here so the count and the percentage below cannot drift apart
    REGEXP_CONTAINS(
      TO_JSON_STRING(custom_metrics.css_variables),
      r'(?i)gradient\('
    ) AS has_gradient
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01', -- CSS metrics exception
      DATE '2024-06-01',
      DATE '2025-07-01'
    )
)

SELECT
  year,
  client,
  rank_grouping,
  COUNT(DISTINCT page) AS total_sites,
  COUNT(DISTINCT IF(has_gradient, page, NULL)) AS sites_using_gradient,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_sites_using_gradient
FROM
  pages
CROSS JOIN
  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
WHERE
  rank <= rank_grouping
GROUP BY
  year,
  client,
  rank_grouping
ORDER BY
  year,
  client,
  rank_grouping;
39 changes: 39 additions & 0 deletions sql/2025/generative-ai/gradient-use.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#standardSQL
-- % of sites whose CSS variables include a gradient( ... ), per year, client and rank.
-- Goal: use gradient adoption as an indicator of vibe-coded websites, much like
-- "delve into" is used as an indicator for AI-written papers.
WITH tagged AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank,
    page,
    -- TRUE when the serialized css_variables metric mentions gradient( in any letter case
    REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\(') AS has_gradient
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01',
      DATE '2024-06-01',
      DATE '2025-07-01'
    ) AND
    is_root_page
)

SELECT
  year,
  client,
  rank,
  COUNT(DISTINCT page) AS total_pages,
  COUNT(DISTINCT IF(has_gradient, page, NULL)) AS pages_with_gradient,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_with_gradient
FROM
  tagged
GROUP BY
  year,
  client,
  rank
ORDER BY
  year,
  client,
  rank;
128 changes: 128 additions & 0 deletions sql/2025/generative-ai/gradient.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#standardSQL
-- Quarterly prevalence of design heuristics found in CSS custom properties
-- (custom_metrics.css_variables) on mobile root pages, across all ranks.
--
-- For every heuristic the output contains:
--   cnt_*      number of distinct pages matching the heuristic
--   pct_all_*  share of all pages
--   pct_vars_* share of pages that use CSS variables
--   pct_tw_*   share of Tailwind CSS pages that also match the heuristic
-- Ratios use SAFE_DIVIDE, so a quarter with a zero denominator yields NULL
-- rather than NaN/Inf.
WITH raw_data AS (
  SELECT
    date,
    page,
    -- 1. TECHNOLOGY FLAGS
    -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string
    (
      custom_metrics.css_variables IS NOT NULL AND
      TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null')
    ) AS uses_css_vars,

    -- Tailwind: Check the array for the technology
    'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind,

    -- Content String for Regex (lower-cased so the patterns below can be case-sensitive)
    LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str
  FROM
    `httparchive.crawl.pages`
  WHERE
    client = 'mobile' AND
    is_root_page AND
    -- NO RANK FILTER (Analyze the entire long-tail of the web)

    -- Quarterly Dates
    date IN UNNEST([
      DATE '2020-10-01',
      DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01',
      DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01',
      DATE '2023-01-01', DATE '2023-04-01', DATE '2023-07-01', DATE '2023-10-01',
      DATE '2024-01-01', DATE '2024-04-01', DATE '2024-07-01', DATE '2024-10-01',
      DATE '2025-01-01', DATE '2025-04-01', DATE '2025-07-01', DATE '2025-10-01'
    ])
),

-- Pre-calculate heuristics
flags AS (
  SELECT
    date,
    page,
    uses_css_vars,
    uses_tailwind,

    -- HEURISTIC BOOLEANS (Only true if uses_css_vars is also true)
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples,
    -- "inter" must stand alone: a bare substring match would also hit
    -- pointer, interval, internal, winter, etc. in variable names and values
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'(^|[^a-z0-9])inter([^a-z0-9]|$)')) AS has_inter,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'roboto')) AS has_roboto,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'system-ui')) AS has_system_ui,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'linear-gradient\(|radial-gradient\(')) AS has_gradient,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(2px|4px|6px|8px|12px|16px|0\.25rem|0\.5rem|0\.75rem|1rem|9999px)"')) AS has_radius,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'rgba\(|box-shadow')) AS has_shadow
  FROM
    raw_data
),

-- Distinct-page counts per quarter; every ratio in the final SELECT is derived from these
counts AS (
  SELECT
    FORMAT_DATE('%Y-Q%Q', date) AS year_quarter,

    -- Denominators
    COUNT(DISTINCT page) AS total_sites,
    COUNT(DISTINCT IF(uses_css_vars, page, NULL)) AS sites_using_vars,
    COUNT(DISTINCT IF(uses_tailwind, page, NULL)) AS sites_using_tailwind,

    -- Numerators: overall, and restricted to Tailwind pages (cnt_tw_*)
    COUNT(DISTINCT IF(has_ai_purples, page, NULL)) AS cnt_ai_purples,
    COUNT(DISTINCT IF(has_ai_purples AND uses_tailwind, page, NULL)) AS cnt_tw_ai_purples,
    COUNT(DISTINCT IF(has_indigo_500, page, NULL)) AS cnt_indigo,
    COUNT(DISTINCT IF(has_indigo_500 AND uses_tailwind, page, NULL)) AS cnt_tw_indigo,
    COUNT(DISTINCT IF(has_inter, page, NULL)) AS cnt_inter,
    COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)) AS cnt_tw_inter,
    COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto,
    COUNT(DISTINCT IF(has_roboto AND uses_tailwind, page, NULL)) AS cnt_tw_roboto,
    COUNT(DISTINCT IF(has_system_ui, page, NULL)) AS cnt_system,
    COUNT(DISTINCT IF(has_system_ui AND uses_tailwind, page, NULL)) AS cnt_tw_system,
    COUNT(DISTINCT IF(has_gradient, page, NULL)) AS cnt_gradient,
    COUNT(DISTINCT IF(has_gradient AND uses_tailwind, page, NULL)) AS cnt_tw_gradient,
    COUNT(DISTINCT IF(has_radius, page, NULL)) AS cnt_radius,
    COUNT(DISTINCT IF(has_radius AND uses_tailwind, page, NULL)) AS cnt_tw_radius,
    COUNT(DISTINCT IF(has_shadow, page, NULL)) AS cnt_shadow,
    COUNT(DISTINCT IF(has_shadow AND uses_tailwind, page, NULL)) AS cnt_tw_shadow
  FROM
    flags
  GROUP BY
    year_quarter
)

SELECT
  year_quarter,

  -- 1. CONTEXT (Denominators)
  total_sites,
  sites_using_vars,
  sites_using_tailwind,

  -------------------------------------------------------------------------
  -- 2. "AI PURPLE" SPECTRUM (Indigo/Violet/Purple 500)
  -------------------------------------------------------------------------
  cnt_ai_purples,
  SAFE_DIVIDE(cnt_ai_purples, total_sites) AS pct_all_ai_purples,
  SAFE_DIVIDE(cnt_ai_purples, sites_using_vars) AS pct_vars_ai_purples,
  SAFE_DIVIDE(cnt_tw_ai_purples, sites_using_tailwind) AS pct_tw_ai_purples,

  -------------------------------------------------------------------------
  -- 3. SPECIFIC INDIGO 500 (#6366f1 Only)
  -------------------------------------------------------------------------
  cnt_indigo,
  SAFE_DIVIDE(cnt_indigo, total_sites) AS pct_all_indigo,
  SAFE_DIVIDE(cnt_indigo, sites_using_vars) AS pct_vars_indigo,
  SAFE_DIVIDE(cnt_tw_indigo, sites_using_tailwind) AS pct_tw_indigo,

  -------------------------------------------------------------------------
  -- 4. FONTS
  -------------------------------------------------------------------------
  -- Inter
  cnt_inter,
  SAFE_DIVIDE(cnt_inter, total_sites) AS pct_all_inter,
  SAFE_DIVIDE(cnt_inter, sites_using_vars) AS pct_vars_inter,
  SAFE_DIVIDE(cnt_tw_inter, sites_using_tailwind) AS pct_tw_inter,

  -- Roboto
  cnt_roboto,
  SAFE_DIVIDE(cnt_roboto, total_sites) AS pct_all_roboto,
  SAFE_DIVIDE(cnt_roboto, sites_using_vars) AS pct_vars_roboto,
  SAFE_DIVIDE(cnt_tw_roboto, sites_using_tailwind) AS pct_tw_roboto,

  -- System UI
  cnt_system,
  SAFE_DIVIDE(cnt_system, total_sites) AS pct_all_system,
  SAFE_DIVIDE(cnt_system, sites_using_vars) AS pct_vars_system,
  SAFE_DIVIDE(cnt_tw_system, sites_using_tailwind) AS pct_tw_system,

  -------------------------------------------------------------------------
  -- 5. UI ELEMENTS
  -------------------------------------------------------------------------
  -- Gradients
  cnt_gradient,
  SAFE_DIVIDE(cnt_gradient, total_sites) AS pct_all_gradient,
  SAFE_DIVIDE(cnt_gradient, sites_using_vars) AS pct_vars_gradient,
  SAFE_DIVIDE(cnt_tw_gradient, sites_using_tailwind) AS pct_tw_gradient,

  -- Radius
  cnt_radius,
  SAFE_DIVIDE(cnt_radius, total_sites) AS pct_all_radius,
  SAFE_DIVIDE(cnt_radius, sites_using_vars) AS pct_vars_radius,
  SAFE_DIVIDE(cnt_tw_radius, sites_using_tailwind) AS pct_tw_radius,

  -- Shadows
  cnt_shadow,
  SAFE_DIVIDE(cnt_shadow, total_sites) AS pct_all_shadow,
  SAFE_DIVIDE(cnt_shadow, sites_using_vars) AS pct_vars_shadow,
  SAFE_DIVIDE(cnt_tw_shadow, sites_using_tailwind) AS pct_tw_shadow

FROM
  counts
ORDER BY
  year_quarter;
42 changes: 42 additions & 0 deletions sql/2025/generative-ai/sites-with-robot-txt.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#standardSQL
-- % of sites whose robots.txt returns 200 and includes at least one directive;
-- therefore the percentage is smaller (75% instead of the 95% reported elsewhere)
WITH directive_counts AS (
  SELECT
    client,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    -- A missing or non-numeric count is treated as 0 so the sum below is never NULL
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt,
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt,
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt,
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt,
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt,
    IFNULL(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    date = '2025-07-01'
),

-- One flag per page: robots.txt responded with 200 and holds at least one counted directive
flagged AS (
  SELECT
    client,
    root_page,
    (
      status = 200 AND
      (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0
    ) AS has_robots_txt
  FROM
    directive_counts
)

SELECT
  client,
  COUNT(DISTINCT root_page) AS sites,
  COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)) AS sites_with_robots_txt,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)),
    COUNT(DISTINCT root_page)
  ) AS pct_sites_with_robots_txt
FROM
  flagged
GROUP BY
  client
ORDER BY
  client;
Loading