|
| 1 | +#standardSQL |
| 2 | +# Percent of sites with a given user-agent mentioned in robots.txt, by rank bucket |
| 3 | +# Returns percentages among (a) all sites and (b) only sites with robots.txt status=200. |
| 4 | + |
-- Returns the JSON-serialized record stored under `agent` in a by_useragent
-- object, or NULL when no usable record exists.
--
-- Args:
--   byua_json: JSON text of an object keyed by user-agent name
--              (NULL or empty is treated as an empty object).
--   agent:     user-agent name to look up (NULL is treated as '').
--
-- Returns:
--   JSON text of the matching record, or NULL when the key is absent, the
--   record is falsy, the input is not a JSON object, or parsing fails.
--
-- Matching is case-insensitive: the agent name is lower-cased before lookup,
-- so keys stored with upper-case letters (e.g. "Googlebot") would otherwise
-- never be found. An exact lower-case key wins when several keys differ only
-- by case. Only own properties are considered, so names such as "__proto__"
-- or "constructor" cannot resolve to inherited Object members.
CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING)
RETURNS STRING
LANGUAGE js AS r"""
  try {
    const obj = JSON.parse(byua_json || '{}');
    if (obj === null || typeof obj !== 'object' || Array.isArray(obj)) {
      return null;
    }

    const wanted = String(agent || '').toLowerCase();
    const hasOwn = (k) => Object.prototype.hasOwnProperty.call(obj, k);

    // Prefer the exact lower-case key; fall back to a case-insensitive scan.
    let key = hasOwn(wanted) ? wanted : undefined;
    if (key === undefined) {
      key = Object.keys(obj).find((k) => k.toLowerCase() === wanted);
    }
    if (key === undefined) {
      return null;
    }

    const rec = obj[key];
    return rec ? JSON.stringify(rec) : null;
  } catch (e) {
    // Best effort: malformed JSON yields NULL instead of failing the query.
    return null;
  }
""";
| 15 | + |
-- Root pages of the 2025-07-01 crawl (intended to be one row per site and
-- client), with the robots.txt status and the per-user-agent record counts
-- pulled out of the robots_txt custom metric.
WITH base AS (
  SELECT
    pages.client,
    pages.rank,
    pages.root_page,
    -- status becomes NULL when it is absent or not an integer
    SAFE_CAST(
      JSON_VALUE(pages.custom_metrics.robots_txt, '$.status') AS INT64
    ) AS status,
    JSON_QUERY(
      pages.custom_metrics.robots_txt,
      '$.record_counts.by_useragent'
    ) AS byua
  FROM `httparchive.crawl.pages` AS pages
  WHERE
    pages.is_root_page
    AND pages.date = '2025-07-01'
),
| 28 | + |
-- One row per (site, user-agent key) found in the site's by_useragent object.
-- Keys are pulled from the serialized JSON with a regex that captures every
-- quoted name immediately followed by `:{`, i.e. a name whose value is an
-- object. NOTE(review): this relies on TO_JSON_STRING emitting compact JSON
-- and will mis-capture keys that contain a double quote -- confirm acceptable.
-- `byua` is carried along so the next step can look up each agent's record
-- without joining back to `base`.
ua_keys AS (
  SELECT
    b.client,
    b.rank,
    b.root_page,
    LOWER(agent) AS agent,
    b.byua
  FROM base AS b
  CROSS JOIN UNNEST(
    REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')
  ) AS agent
),

-- Per-site, per-agent record: the JSON text holding that agent's directive
-- counts, or NULL when getByAgent finds no record for the agent.
-- Each getByAgent call parses the site's whole by_useragent object again.
ua_presence AS (
  SELECT
    k.client,
    k.rank,
    k.root_page,
    k.agent,
    getByAgent(TO_JSON_STRING(k.byua), k.agent) AS agent_obj
  FROM ua_keys AS k
),
| 55 | + |
-- Adds up the directive counts recorded for each (site, agent) pair.
-- A count that is missing or not an integer contributes 0, so rules_sum is 0
-- when the agent record is NULL or carries none of the fields read here.
ua_presence_scored AS (
  SELECT
    presence.client,
    presence.rank,
    presence.root_page,
    presence.agent,
    (
      COALESCE(SAFE_CAST(JSON_VALUE(presence.agent_obj, '$.allow') AS INT64), 0)
      + COALESCE(SAFE_CAST(JSON_VALUE(presence.agent_obj, '$.disallow') AS INT64), 0)
      + COALESCE(SAFE_CAST(JSON_VALUE(presence.agent_obj, '$.crawl_delay') AS INT64), 0)
      + COALESCE(SAFE_CAST(JSON_VALUE(presence.agent_obj, '$.noindex') AS INT64), 0)
      + COALESCE(SAFE_CAST(JSON_VALUE(presence.agent_obj, '$.other') AS INT64), 0)
    ) AS rules_sum
  FROM ua_presence AS presence
),
| 69 | + |
-- Denominator (a): number of distinct sites per client and rank,
-- regardless of robots.txt status.
totals_all AS (
  SELECT
    site.client,
    site.rank,
    COUNT(DISTINCT site.root_page) AS total_sites
  FROM base AS site
  GROUP BY
    site.client,
    site.rank
),
-- Denominator (b): number of distinct sites per client and rank whose
-- robots.txt status is 200. A (client, rank) pair with no such site
-- produces no row here.
totals_200 AS (
  SELECT
    site.client,
    site.rank,
    COUNT(DISTINCT site.root_page) AS total_sites_200
  FROM base AS site
  WHERE site.status = 200
  GROUP BY
    site.client,
    site.rank
),
| 82 | + |
-- Numerators: for each client, rank and agent, the number of distinct sites
-- whose record for that agent has at least one counted directive
-- (rules_sum > 0), overall and restricted to sites with robots.txt status 200.
numerators AS (
  SELECT
    scored.client,
    scored.rank,
    scored.agent,
    COUNT(DISTINCT scored.root_page) AS sites_with_agent,
    COUNT(
      DISTINCT CASE WHEN site.status = 200 THEN scored.root_page END
    ) AS sites_with_agent_among_200
  FROM ua_presence_scored AS scored
  INNER JOIN base AS site
    USING (client, rank, root_page)
  WHERE scored.rules_sum > 0
  GROUP BY
    scored.client,
    scored.rank,
    scored.agent
)
| 96 | + |
-- Final report: one row per client, rank and agent with both percentages.
-- totals_200 is LEFT JOINed so that a (client, rank) bucket without any
-- status-200 site still reports its share among all sites; in that case
-- total_sites_200 is 0 and pct_of_sites_with_200 is NULL (SAFE_DIVIDE).
-- The trailing `agent` sort key makes the order deterministic on ties.
SELECT
  n.client,
  n.rank,
  n.agent,
  t.total_sites,
  COALESCE(t2.total_sites_200, 0) AS total_sites_200,
  n.sites_with_agent,
  n.sites_with_agent_among_200,
  SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
  SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
FROM numerators AS n
INNER JOIN totals_all AS t
  USING (client, rank)
LEFT JOIN totals_200 AS t2
  USING (client, rank)
ORDER BY
  rank,
  client,
  pct_of_all_sites DESC,
  agent;
0 commit comments