Skip to content

Commit d7cdccf

Browse files
jcmpagel authored and christianliebel committed
Add SQL query for robots.txt site analysis
This SQL query calculates the percentage of sites with a valid robots.txt file, including various directives, based on data from the HTTP Archive crawl.
1 parent 20917b1 commit d7cdccf

1 file changed

Lines changed: 32 additions & 0 deletions

File tree

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
#standardSQL
-- Share of sites whose robots.txt responds with HTTP 200 AND contains at least
-- one counted directive. Because files without any directive are excluded, the
-- result is lower than figures reported elsewhere (about 75% instead of 95%).

-- Root pages of the crawl, with the robots_txt custom metric and its
-- per-directive-type record counts pulled out for reuse.
WITH robots_raw AS (
  SELECT
    client,
    root_page,
    custom_metrics.robots_txt AS robots_txt,
    custom_metrics.robots_txt.record_counts.by_type AS by_type
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
    AND is_root_page
),

-- One flag per row: TRUE when the status is 200 and the directive counts sum
-- to more than zero. A missing or non-numeric count is treated as 0; a missing
-- status leaves the flag NULL, which is counted as "no robots.txt" below.
robots_flags AS (
  SELECT
    client,
    root_page,
    (
      SAFE_CAST(JSON_VALUE(robots_txt, '$.status') AS INT64) = 200
      AND (
        COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.allow') AS INT64), 0)
        + COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.disallow') AS INT64), 0)
        + COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.crawl_delay') AS INT64), 0)
        + COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.noindex') AS INT64), 0)
        + COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.sitemap') AS INT64), 0)
        + COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.user_agent') AS INT64), 0)
      ) > 0
    ) AS has_robots_txt
  FROM robots_raw
),

-- Distinct site totals per client: all sites, and sites passing the flag.
site_counts AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS sites,
    COUNT(DISTINCT IF(has_robots_txt, root_page, NULL)) AS sites_with_robots_txt
  FROM robots_flags
  GROUP BY client
)

SELECT
  client,
  sites,
  sites_with_robots_txt,
  -- Fraction in [0, 1]; SAFE_DIVIDE yields NULL instead of erroring on zero sites.
  SAFE_DIVIDE(sites_with_robots_txt, sites) AS pct_sites_with_robots_txt
FROM site_counts
ORDER BY client;

0 commit comments

Comments
 (0)