Skip to content

Commit 49ad032

Browse files
jcmpagel authored and christianliebel committed
Create user-agent-years.sql
inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/
1 parent 0daad6a commit 49ad032

1 file changed

Lines changed: 98 additions & 0 deletions

File tree

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
#standardSQL
-- Percent of sites mentioning each user-agent in robots.txt, over fixed Almanac snapshots

-- Returns the JSON record stored under `agent` (case-insensitive key) in the
-- by_useragent map, serialized back to a string, or NULL when the key is
-- absent or the input is not valid JSON.
CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING)
RETURNS STRING
LANGUAGE js AS r"""
  try {
    const parsed = JSON.parse(byua_json || '{}');
    const key = String(agent || '').toLowerCase();
    const entry = parsed[key];
    return entry ? JSON.stringify(entry) : null;
  } catch (err) {
    // Malformed input is treated the same as a missing key.
    return null;
  }
""";
-- Base: one row per site snapshot, keeping only the fields we need.
WITH base AS (
  SELECT
    date,
    client,
    rank,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua
  FROM `httparchive.crawl.pages`
  WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01')
    AND client = 'mobile'
    AND is_root_page
),

-- One row per (site, user-agent key) found in the by_useragent map, with the
-- agent's rule-count record looked up in the same pass. byua and status are
-- carried through here directly: the original re-joined back to base on
-- (date, client, rank, root_page) just to re-fetch them, which costs an extra
-- shuffle and would multiply rows if that key were ever non-unique.
ua_presence AS (
  SELECT
    b.date,
    b.client,
    b.rank,
    b.root_page,
    LOWER(agent) AS agent,
    getByAgent(TO_JSON_STRING(b.byua), LOWER(agent)) AS agent_obj,
    b.status
  FROM base AS b,
    -- NOTE(review): this pattern misses keys containing escaped quotes;
    -- assumed acceptable for robots.txt user-agent tokens.
    UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent
),

-- Sum the per-category rule counts; a site where the agent has no rules at
-- all yields rules_sum = 0 and is excluded by the numerators below.
ua_scored AS (
  SELECT
    date, client, rank, root_page, agent, status,
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
  FROM ua_presence
),

-- Denominators per (date, client, rank).
totals_all AS (
  SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites
  FROM base
  GROUP BY date, client, rank
),
totals_200 AS (
  SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200
  FROM base
  WHERE status = 200
  GROUP BY date, client, rank
),

-- Numerators per (date, client, rank, agent).
numerators AS (
  SELECT
    date, client, rank, agent,
    COUNT(DISTINCT IF(rules_sum > 0, root_page, NULL)) AS sites_with_agent,
    COUNT(DISTINCT IF(status = 200 AND rules_sum > 0, root_page, NULL)) AS sites_with_agent_among_200
  FROM ua_scored
  GROUP BY date, client, rank, agent
)

SELECT
  n.date,
  n.client,
  n.rank,
  n.agent,
  t.total_sites,
  t2.total_sites_200,
  n.sites_with_agent,
  n.sites_with_agent_among_200,
  SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
  SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
FROM numerators AS n
INNER JOIN totals_all AS t USING (date, client, rank)
-- LEFT JOIN fix: a (date, client, rank) group with zero status-200 robots.txt
-- responses has no row in totals_200, so the original INNER JOIN silently
-- dropped every agent row for that group. With LEFT JOIN the rows survive and
-- SAFE_DIVIDE returns NULL for the 200-only percentage instead.
LEFT JOIN totals_200 AS t2 USING (date, client, rank)
WHERE n.sites_with_agent >= 100
ORDER BY n.date, n.rank, pct_of_all_sites DESC;

0 commit comments

Comments
 (0)