|
#standardSQL
-- Share of sites whose robots.txt responds with status 200 AND declares at
-- least one directive (allow, disallow, crawl_delay, noindex, sitemap or
-- user_agent). Because files without any of these directives are excluded,
-- the percentage is smaller (75% instead of the 95% reported elsewhere).
WITH robots AS (
  -- One row per crawled root page: the robots.txt status code plus the JSON
  -- object holding the per-directive-type record counts.
  SELECT
    client,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    custom_metrics.robots_txt.record_counts.by_type AS by_type
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
),

flagged AS (
  -- Evaluate the "usable robots.txt" condition once per page. A missing or
  -- non-numeric count is treated as 0; a NULL status yields a non-TRUE flag,
  -- so such pages are never counted as having a robots.txt.
  SELECT
    client,
    root_page,
    status = 200 AND (
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.allow') AS INT64), 0) +
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.disallow') AS INT64), 0) +
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.crawl_delay') AS INT64), 0) +
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.noindex') AS INT64), 0) +
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.sitemap') AS INT64), 0) +
      COALESCE(SAFE_CAST(JSON_VALUE(by_type, '$.user_agent') AS INT64), 0)
    ) > 0 AS has_robots_txt
  FROM robots
),

totals AS (
  -- Per-client site counts: all sites, and those with a usable robots.txt.
  SELECT
    client,
    COUNT(DISTINCT root_page) AS sites,
    COUNT(DISTINCT CASE WHEN has_robots_txt THEN root_page END) AS sites_with_robots_txt
  FROM flagged
  GROUP BY client
)

SELECT
  client,
  sites,
  sites_with_robots_txt,
  -- SAFE_DIVIDE returns NULL instead of erroring when a client has no sites.
  SAFE_DIVIDE(sites_with_robots_txt, sites) AS pct_sites_with_robots_txt
FROM totals
ORDER BY client;
0 commit comments