Skip to content

Commit 6d85fd5

Browse files
committed
Merge branch 'main' into generative-ai
2 parents 9657936 + f2f949c commit 6d85fd5

8 files changed

Lines changed: 670 additions & 0 deletions

File tree

sql/2025/generative-ai/ai_tld.sql

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#standardSQL
# Distinct .ai hosts per exclusive rank bucket, 2022 vs 2025.
# Every page lands in exactly one bucket: the CASE returns the first
# (smallest) threshold that the page's rank fits under.

WITH ai_pages AS (
  SELECT
    date,
    client,
    NET.HOST(page) AS host,
    CASE
      WHEN rank <= 1000 THEN 1000
      WHEN rank <= 10000 THEN 10000
      WHEN rank <= 100000 THEN 100000
      WHEN rank <= 1000000 THEN 1000000
      WHEN rank <= 10000000 THEN 10000000
    END AS rank_bucket
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN ('2022-06-01', '2025-07-01') AND
    client IN ('desktop', 'mobile') AND
    is_root_page AND
    rank <= 10000000 AND -- later years go beyond 10,000,000 but let's keep to this limit for consistency
    ENDS_WITH(NET.HOST(page), '.ai') -- keep only hosts under the .ai TLD
)

SELECT
  date,
  client,
  rank_bucket,
  COUNT(DISTINCT host) AS ai_domains
FROM
  ai_pages
GROUP BY
  date,
  client,
  rank_bucket
ORDER BY
  date,
  client,
  rank_bucket;
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#standardSQL
-- Adoption of CSS gradients in custom_metrics.css_variables
-- Grouped by: year, client, rank bucket
--
-- Rank buckets are cumulative: a page is counted in every rank_grouping whose
-- threshold is >= its rank (cross join with the threshold list, filtered by
-- rank <= rank_grouping).
--
-- Output columns:
--   total_sites               distinct root pages in the bucket
--   sites_using_gradient      distinct root pages whose serialized css_variables
--                             contain "gradient(" (case-insensitive)
--   pct_sites_using_gradient  ratio of the two; NULL when the bucket is empty

WITH pages AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank,
    page,
    -- Evaluate the gradient heuristic once per row instead of once per aggregate
    REGEXP_CONTAINS(
      TO_JSON_STRING(custom_metrics.css_variables),
      r'(?i)gradient\('
    ) AS uses_gradient
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01', -- CSS metrics exception
      DATE '2024-06-01',
      DATE '2025-07-01'
    )
)

SELECT
  year,
  client,
  rank_grouping,
  COUNT(DISTINCT page) AS total_sites,
  COUNT(DISTINCT IF(uses_gradient, page, NULL)) AS sites_using_gradient,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(uses_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_sites_using_gradient
FROM
  pages
CROSS JOIN
  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
WHERE
  rank <= rank_grouping
GROUP BY
  year,
  client,
  rank_grouping
ORDER BY
  year,
  client,
  rank_grouping;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#standardSQL
-- % of sites whose CSS variables include a gradient( ... ) per year, client, rank
-- Goal: use it as an indicator of vibe-coded websites, the way "delve into" is used as an indicator for papers
WITH gradient_flags AS (
  SELECT
    EXTRACT(YEAR FROM date) AS year,
    client,
    rank,
    page,
    -- Case-insensitive search for "gradient(" in the serialized css_variables metric
    REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\(') AS has_gradient
  FROM
    `httparchive.crawl.pages`
  WHERE
    date IN (
      DATE '2019-07-01',
      DATE '2020-08-01',
      DATE '2021-07-01',
      DATE '2022-07-01',
      DATE '2024-06-01',
      DATE '2025-07-01'
    ) AND
    is_root_page
)

SELECT
  year,
  client,
  rank,
  COUNT(DISTINCT page) AS total_pages,
  COUNT(DISTINCT IF(has_gradient, page, NULL)) AS pages_with_gradient,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_gradient, page, NULL)),
    COUNT(DISTINCT page)
  ) AS pct_with_gradient
FROM
  gradient_flags
GROUP BY
  year,
  client,
  rank
ORDER BY
  year,
  client,
  rank;
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#standardSQL
-- Quarterly prevalence of several design heuristics found in the css_variables
-- custom metric (mobile root pages only). For each heuristic X the output has:
--   cnt_X       distinct pages matching X
--   pct_all_X   cnt_X / all pages
--   pct_vars_X  cnt_X / pages that use CSS variables
--   pct_tw_X    pages matching X that also use Tailwind / pages that use Tailwind
-- Ratios use IEEE_DIVIDE, so a 0/0 ratio yields NaN rather than NULL.
WITH pages AS (
  SELECT
    date,
    page,

    -- CSS variables in use: the metric is non-NULL and does not serialize to
    -- one of the "empty" forms '{}', '{"summary":{}}' or 'null'
    (
      custom_metrics.css_variables IS NOT NULL AND
      TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null')
    ) AS uses_css_vars,

    -- Tailwind: membership test against the detected technologies
    'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind,

    -- Lower-cased serialization that the regex heuristics search
    LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    client = 'mobile' AND
    -- No rank filter on purpose: the whole long tail of the web is analyzed.
    -- First crawl of each quarter:
    date IN UNNEST([
      DATE '2020-10-01',
      DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01',
      DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01',
      DATE '2023-01-01', DATE '2023-04-01', DATE '2023-07-01', DATE '2023-10-01',
      DATE '2024-01-01', DATE '2024-04-01', DATE '2024-07-01', DATE '2024-10-01',
      DATE '2025-01-01', DATE '2025-04-01', DATE '2025-07-01', DATE '2025-10-01'
    ])
),

-- One boolean per heuristic; each is forced to FALSE unless uses_css_vars holds
heuristics AS (
  SELECT
    date,
    page,
    uses_css_vars,
    uses_tailwind,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples,
    -- NOTE(review): the bare substring 'inter' also matches e.g. "pointer"; confirm this over-match is acceptable
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'inter')) AS has_inter,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'roboto')) AS has_roboto,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'system-ui')) AS has_system_ui,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'linear-gradient\(|radial-gradient\(')) AS has_gradient,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(2px|4px|6px|8px|12px|16px|0\.25rem|0\.5rem|0\.75rem|1rem|9999px)"')) AS has_radius,
    (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'rgba\(|box-shadow')) AS has_shadow
  FROM
    pages
),

-- Every distinct-page count is computed exactly once per quarter here;
-- the final SELECT only divides. tw_* are the "heuristic AND Tailwind" numerators.
counts AS (
  SELECT
    FORMAT_DATE('%Y-Q%Q', date) AS year_quarter,

    -- Denominators
    COUNT(DISTINCT page) AS total_sites,
    COUNT(DISTINCT IF(uses_css_vars, page, NULL)) AS sites_using_vars,
    COUNT(DISTINCT IF(uses_tailwind, page, NULL)) AS sites_using_tailwind,

    -- Numerators
    COUNT(DISTINCT IF(has_ai_purples, page, NULL)) AS cnt_ai_purples,
    COUNT(DISTINCT IF(has_ai_purples AND uses_tailwind, page, NULL)) AS tw_ai_purples,
    COUNT(DISTINCT IF(has_indigo_500, page, NULL)) AS cnt_indigo,
    COUNT(DISTINCT IF(has_indigo_500 AND uses_tailwind, page, NULL)) AS tw_indigo,
    COUNT(DISTINCT IF(has_inter, page, NULL)) AS cnt_inter,
    COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)) AS tw_inter,
    COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto,
    COUNT(DISTINCT IF(has_roboto AND uses_tailwind, page, NULL)) AS tw_roboto,
    COUNT(DISTINCT IF(has_system_ui, page, NULL)) AS cnt_system,
    COUNT(DISTINCT IF(has_system_ui AND uses_tailwind, page, NULL)) AS tw_system,
    COUNT(DISTINCT IF(has_gradient, page, NULL)) AS cnt_gradient,
    COUNT(DISTINCT IF(has_gradient AND uses_tailwind, page, NULL)) AS tw_gradient,
    COUNT(DISTINCT IF(has_radius, page, NULL)) AS cnt_radius,
    COUNT(DISTINCT IF(has_radius AND uses_tailwind, page, NULL)) AS tw_radius,
    COUNT(DISTINCT IF(has_shadow, page, NULL)) AS cnt_shadow,
    COUNT(DISTINCT IF(has_shadow AND uses_tailwind, page, NULL)) AS tw_shadow
  FROM
    heuristics
  GROUP BY
    year_quarter
)

SELECT
  year_quarter,

  -- 1. Context (denominators)
  total_sites,
  sites_using_vars,
  sites_using_tailwind,

  -- 2. "AI purple" spectrum (indigo / violet / purple 500)
  cnt_ai_purples,
  IEEE_DIVIDE(cnt_ai_purples, total_sites) AS pct_all_ai_purples,
  IEEE_DIVIDE(cnt_ai_purples, sites_using_vars) AS pct_vars_ai_purples,
  IEEE_DIVIDE(tw_ai_purples, sites_using_tailwind) AS pct_tw_ai_purples,

  -- 3. Indigo 500 only (#6366f1)
  cnt_indigo,
  IEEE_DIVIDE(cnt_indigo, total_sites) AS pct_all_indigo,
  IEEE_DIVIDE(cnt_indigo, sites_using_vars) AS pct_vars_indigo,
  IEEE_DIVIDE(tw_indigo, sites_using_tailwind) AS pct_tw_indigo,

  -- 4. Fonts: Inter
  cnt_inter,
  IEEE_DIVIDE(cnt_inter, total_sites) AS pct_all_inter,
  IEEE_DIVIDE(cnt_inter, sites_using_vars) AS pct_vars_inter,
  IEEE_DIVIDE(tw_inter, sites_using_tailwind) AS pct_tw_inter,

  -- Fonts: Roboto
  cnt_roboto,
  IEEE_DIVIDE(cnt_roboto, total_sites) AS pct_all_roboto,
  IEEE_DIVIDE(cnt_roboto, sites_using_vars) AS pct_vars_roboto,
  IEEE_DIVIDE(tw_roboto, sites_using_tailwind) AS pct_tw_roboto,

  -- Fonts: system-ui
  cnt_system,
  IEEE_DIVIDE(cnt_system, total_sites) AS pct_all_system,
  IEEE_DIVIDE(cnt_system, sites_using_vars) AS pct_vars_system,
  IEEE_DIVIDE(tw_system, sites_using_tailwind) AS pct_tw_system,

  -- 5. UI elements: gradients
  cnt_gradient,
  IEEE_DIVIDE(cnt_gradient, total_sites) AS pct_all_gradient,
  IEEE_DIVIDE(cnt_gradient, sites_using_vars) AS pct_vars_gradient,
  IEEE_DIVIDE(tw_gradient, sites_using_tailwind) AS pct_tw_gradient,

  -- UI elements: radius
  cnt_radius,
  IEEE_DIVIDE(cnt_radius, total_sites) AS pct_all_radius,
  IEEE_DIVIDE(cnt_radius, sites_using_vars) AS pct_vars_radius,
  IEEE_DIVIDE(tw_radius, sites_using_tailwind) AS pct_tw_radius,

  -- UI elements: shadows
  cnt_shadow,
  IEEE_DIVIDE(cnt_shadow, total_sites) AS pct_all_shadow,
  IEEE_DIVIDE(cnt_shadow, sites_using_vars) AS pct_vars_shadow,
  IEEE_DIVIDE(tw_shadow, sites_using_tailwind) AS pct_tw_shadow
FROM
  counts
ORDER BY
  year_quarter;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#standardSQL
-- % of sites whose robots.txt returns 200 and includes any directive; therefore the percentage is smaller (75% instead of the 95% reported elsewhere)
WITH roots AS (
  SELECT
    client,
    root_page,
    -- TRUE only when the robots.txt status is 200 AND the per-type record
    -- counts sum to more than zero; absent or non-numeric counts count as 0.
    (
      SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) = 200 AND
      (
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) +
        COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0)
      ) > 0
    ) AS has_robots_txt
  FROM
    `httparchive.crawl.pages`
  WHERE
    is_root_page AND
    date = '2025-07-01'
)

SELECT
  client,
  COUNT(DISTINCT root_page) AS sites,
  COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)) AS sites_with_robots_txt,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)),
    COUNT(DISTINCT root_page)
  ) AS pct_sites_with_robots_txt
FROM
  roots
GROUP BY
  client
ORDER BY
  client;

0 commit comments

Comments
 (0)