Skip to content

Commit 0daad6a

Browse files
jcmpagel authored and christianliebel committed
Add SQL script for user-agent analysis in robots.txt
This SQL script calculates the percentage of sites mentioning a specific user-agent in their robots.txt file, categorized by rank bucket. Inspired by https://paulcalvano.com/2025-08-21-ai-bots-and-robots-txt/
1 parent d7cdccf commit 0daad6a

1 file changed

Lines changed: 110 additions & 0 deletions

File tree

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#standardSQL
2+
# Percent of sites with a given user-agent mentioned in robots.txt, by rank bucket
3+
# Returns percentages among (a) all sites and (b) only sites with robots.txt status=200.
4+
5+
-- getByAgent: look up one user-agent's record inside a by_useragent JSON object.
--
-- Params:
--   byua_json  JSON text of an object keyed by user-agent name; NULL or empty
--              text is treated as an empty object.
--   agent      user-agent name to look up; matched case-insensitively.
-- Returns:
--   The matching record serialized as JSON text, or NULL when the agent is
--   absent, the record is falsy (null, 0, "", false), the payload is not a
--   JSON object, or the payload cannot be parsed.
CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING)
RETURNS STRING
LANGUAGE js AS r"""
  try {
    const parsed = JSON.parse(byua_json || '{}');
    if (parsed === null || typeof parsed !== 'object') {
      return null;
    }

    const wanted = String(agent || '').toLowerCase();

    // Only consider the object's own keys so that names such as "constructor"
    // or "__proto__" never resolve to inherited Object.prototype members.
    const hasOwn = (name) => Object.prototype.hasOwnProperty.call(parsed, name);

    // Fast path: the key is already stored in lower case.
    let record = hasOwn(wanted) ? parsed[wanted] : undefined;

    // Fallback: the key is stored with different casing (e.g. "GPTBot").
    if (record === undefined) {
      const sameName = Object.keys(parsed).find(
        (name) => name.toLowerCase() === wanted
      );
      if (sameName !== undefined) {
        record = parsed[sameName];
      }
    }

    return record ? JSON.stringify(record) : null;
  } catch (e) {
    // Malformed JSON is treated the same as "agent not present".
    return null;
  }
""";
15+
16+
-- Base rows: one per root page of the selected crawl, per client.
-- `status` is the robots.txt status read from custom_metrics.robots_txt ($.status);
-- `byua_json` is the compact JSON text of $.record_counts.by_useragent,
-- serialized once here so later steps do not have to re-serialize it.
WITH base AS (
  SELECT
    client,
    rank,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    TO_JSON_STRING(
      JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent')
    ) AS byua_json
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
    AND is_root_page
),

-- One row per (site, user-agent key) together with that agent's record.
-- The regex captures every key of the serialized object whose value is itself
-- an object. `status` and `byua_json` are carried along from `base`, so no
-- join back to the (large) base relation is needed.
ua_presence AS (
  SELECT
    b.client,
    b.rank,
    b.root_page,
    b.status,
    LOWER(agent_key) AS agent,
    -- getByAgent lower-cases the requested name itself.
    getByAgent(b.byua_json, agent_key) AS agent_obj
  FROM base AS b
  CROSS JOIN UNNEST(
    REGEXP_EXTRACT_ALL(b.byua_json, r'"([^"]+)":\{')
  ) AS agent_key
),

-- Total number of directives recorded for each (site, agent); a missing or
-- non-integer counter contributes 0.
ua_presence_scored AS (
  SELECT
    client,
    rank,
    root_page,
    status,
    agent,
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0)
    + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0)
    + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0)
    + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0)
    + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
  FROM ua_presence
),

-- Denominators per client and rank: all sites, and sites whose robots.txt
-- status is 200. Computing both in one pass keeps buckets that have no
-- status-200 site (total_sites_200 = 0) instead of dropping them in a join.
totals AS (
  SELECT
    client,
    rank,
    COUNT(DISTINCT root_page) AS total_sites,
    COUNT(DISTINCT IF(status = 200, root_page, NULL)) AS total_sites_200
  FROM base
  GROUP BY client, rank
),

-- Numerators per client, rank and agent: sites with at least one directive
-- recorded for the agent, overall and among status-200 sites.
numerators AS (
  SELECT
    client,
    rank,
    agent,
    COUNT(DISTINCT root_page) AS sites_with_agent,
    COUNT(DISTINCT IF(status = 200, root_page, NULL)) AS sites_with_agent_among_200
  FROM ua_presence_scored
  WHERE rules_sum > 0
  GROUP BY client, rank, agent
)

SELECT
  n.client,
  n.rank,
  n.agent,
  t.total_sites,
  t.total_sites_200,
  n.sites_with_agent,
  n.sites_with_agent_among_200,
  SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
  -- SAFE_DIVIDE yields NULL when a bucket has no status-200 sites.
  SAFE_DIVIDE(n.sites_with_agent_among_200, t.total_sites_200) AS pct_of_sites_with_200
FROM numerators AS n
INNER JOIN totals AS t USING (client, rank)
-- `agent` is a final tie-breaker so the output order is deterministic.
ORDER BY rank, client, pct_of_all_sites DESC, agent;

0 commit comments

Comments
 (0)