#standardSQL
-- Percent of sites mentioning each user-agent in robots.txt,
-- over fixed Web Almanac snapshot dates.
-- Look up one user-agent's rule-count record inside the robots.txt custom
-- metric's `record_counts.by_useragent` object.
--   byua_json: JSON object text mapping agent name -> {allow, disallow, ...}.
--   agent:     agent name to look up (matched case-insensitively).
-- Returns the matching record re-serialized as JSON text, or NULL when the
-- agent is absent or the input is not valid JSON.
CREATE TEMP FUNCTION getByAgent(byua_json STRING, agent STRING)
RETURNS STRING
LANGUAGE js AS r"""
  try {
    const obj = JSON.parse(byua_json || '{}');
    const wanted = String(agent || '').toLowerCase();
    // Bug fix: the original lower-cased only the probe and indexed the object
    // directly, so mixed-case keys (e.g. "Googlebot") could never match and
    // those agents silently scored zero. Compare keys case-insensitively.
    for (const key of Object.keys(obj)) {
      if (key.toLowerCase() === wanted) {
        const rec = obj[key];
        return rec ? JSON.stringify(rec) : null;
      }
    }
    return null;
  } catch (e) { return null; }
""";
| 13 | + |
-- Base: one row per site snapshot, with only the fields we need.
WITH base AS (
  SELECT
    date,
    client,
    rank,
    root_page,
    SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
    JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua
  FROM `httparchive.crawl.pages`
  WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01')
    AND client = 'mobile'
    AND is_root_page
),

-- One row per (site snapshot, user-agent key), with that agent's counts.
-- Extracting the keys and resolving each agent's record in a single pass
-- replaces the original ua_keys -> ua_presence self-join of `base` on four
-- columns, which only existed to recover the byua/status fields dropped by
-- the key-extraction step.
-- NOTE(review): the key regex assumes agent names contain no escaped quotes
-- in the serialized JSON — confirm against the custom metric if that changes.
ua_presence AS (
  SELECT
    b.date,
    b.client,
    b.rank,
    b.root_page,
    LOWER(agent) AS agent,
    getByAgent(TO_JSON_STRING(b.byua), agent) AS agent_obj,
    b.status
  FROM base AS b,
    UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent
),

-- Sum the per-agent rule counts; rules_sum > 0 means the agent is actually
-- addressed by at least one rule on that site.
ua_scored AS (
  SELECT
    date, client, rank, root_page, agent, status,
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
    COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
  FROM ua_presence
),

-- Denominator per (date, client, rank): all crawled root pages.
totals_all AS (
  SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites
  FROM base
  GROUP BY date, client, rank
),

-- Denominator restricted to sites whose robots.txt returned HTTP 200.
totals_200 AS (
  SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200
  FROM base
  WHERE status = 200
  GROUP BY date, client, rank
),

-- Numerators per (date, client, rank, agent).
numerators AS (
  SELECT
    date, client, rank, agent,
    COUNT(DISTINCT IF(rules_sum > 0, root_page, NULL)) AS sites_with_agent,
    COUNT(DISTINCT IF(status = 200 AND rules_sum > 0, root_page, NULL)) AS sites_with_agent_among_200
  FROM ua_scored
  GROUP BY date, client, rank, agent
)

SELECT
  n.date,
  n.client,
  n.rank,
  n.agent,
  t.total_sites,
  t2.total_sites_200,
  n.sites_with_agent,
  n.sites_with_agent_among_200,
  SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
  SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
FROM numerators AS n
INNER JOIN totals_all AS t USING (date, client, rank)
INNER JOIN totals_200 AS t2 USING (date, client, rank)
-- Noise floor: only report agents mentioned on at least 100 sites.
WHERE n.sites_with_agent >= 100
ORDER BY n.date, n.rank, pct_of_all_sites DESC;