diff --git a/sql/2025/generative-ai/ai_tld.sql b/sql/2025/generative-ai/ai_tld.sql new file mode 100644 index 00000000000..1e8bc361f2e --- /dev/null +++ b/sql/2025/generative-ai/ai_tld.sql @@ -0,0 +1,50 @@ +#standardSQL +# .ai domains per exclusive rank bucket, 2022 vs 2025 + +WITH base AS ( + SELECT + date, + client, + rank, + NET.HOST(page) AS host + FROM + `httparchive.crawl.pages` + WHERE + is_root_page AND + client IN ('desktop', 'mobile') AND + date IN ('2022-06-01', '2025-07-01') AND + rank <= 10000000 AND -- later years go beyond 10,000,000 but let's keep to this limit for consistency + ENDS_WITH(NET.HOST(page), '.ai') +), + +bucketed AS ( + SELECT + date, + client, + CASE + WHEN rank <= 1000 THEN 1000 + WHEN rank <= 10000 THEN 10000 + WHEN rank <= 100000 THEN 100000 + WHEN rank <= 1000000 THEN 1000000 + WHEN rank <= 10000000 THEN 10000000 + END AS rank_bucket, + host + FROM + base +) + +SELECT + date, + client, + rank_bucket, + COUNT(DISTINCT host) AS ai_domains +FROM + bucketed +GROUP BY + date, + client, + rank_bucket +ORDER BY + date, + client, + rank_bucket; diff --git a/sql/2025/generative-ai/gradient-client-rank.sql b/sql/2025/generative-ai/gradient-client-rank.sql new file mode 100644 index 00000000000..7d57130b004 --- /dev/null +++ b/sql/2025/generative-ai/gradient-client-rank.sql @@ -0,0 +1,50 @@ +#standardSQL +-- Adoption of CSS gradients in custom_metrics.css_variables +-- Grouped by: year, client, rank bucket + +SELECT + EXTRACT(YEAR FROM date) AS year, + client, + rank_grouping, + COUNT(DISTINCT page) AS total_sites, + COUNT(DISTINCT IF( + REGEXP_CONTAINS( + TO_JSON_STRING(custom_metrics.css_variables), + r'(?i)gradient\(' + ), + page, + NULL + )) AS sites_using_gradient, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + REGEXP_CONTAINS( + TO_JSON_STRING(custom_metrics.css_variables), + r'(?i)gradient\(' + ), + page, + NULL + )), + 
COUNT(DISTINCT page) + ) AS pct_sites_using_gradient +FROM + `httparchive.crawl.pages`, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + is_root_page AND + rank <= rank_grouping AND + date IN ( + DATE '2019-07-01', + DATE '2020-08-01', + DATE '2021-07-01', + DATE '2022-07-01', -- CSS metrics exception + DATE '2024-06-01', + DATE '2025-07-01' + ) +GROUP BY + year, + client, + rank_grouping +ORDER BY + year, + client, + rank_grouping; diff --git a/sql/2025/generative-ai/gradient-use.sql b/sql/2025/generative-ai/gradient-use.sql new file mode 100644 index 00000000000..3033ad62599 --- /dev/null +++ b/sql/2025/generative-ai/gradient-use.sql @@ -0,0 +1,39 @@ +#standardSQL +-- % of sites whose CSS variables include a gradient( ... ) per year, client, rank +-- Goal: use this as an indicator for vibe-coded websites, much like "delve into" is used as a signal for AI-written papers +SELECT + EXTRACT(YEAR FROM date) AS year, + client, + rank, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL + )) AS pages_with_gradient, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('), + page, NULL + )), + COUNT(DISTINCT page) + ) AS pct_with_gradient +FROM + `httparchive.crawl.pages` +WHERE + is_root_page AND + date IN ( + DATE '2019-07-01', + DATE '2020-08-01', + DATE '2021-07-01', + DATE '2022-07-01', + DATE '2024-06-01', + DATE '2025-07-01' + ) +GROUP BY + year, + client, + rank +ORDER BY + year, + client, + rank; diff --git a/sql/2025/generative-ai/gradient.sql b/sql/2025/generative-ai/gradient.sql new file mode 100644 index 00000000000..d01a03e5763 --- /dev/null +++ b/sql/2025/generative-ai/gradient.sql @@ -0,0 +1,128 @@ +#standardSQL +WITH raw_data AS ( + SELECT + date, + page, + -- 1. 
TECHNOLOGY FLAGS + -- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string + ( + custom_metrics.css_variables IS NOT NULL AND + TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null') + ) AS uses_css_vars, + + -- Tailwind: Check the array for the technology + 'Tailwind CSS' IN UNNEST(technologies.technology) AS uses_tailwind, + + -- Content String for Regex + LOWER(TO_JSON_STRING(custom_metrics.css_variables)) AS vars_str + FROM + `httparchive.crawl.pages` + WHERE + client = 'mobile' AND + is_root_page AND + -- NO RANK FILTER (Analyze the entire long-tail of the web) + + -- Quarterly Dates + date IN UNNEST([ + DATE '2020-10-01', + DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01', + DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01', + DATE '2023-01-01', DATE '2023-04-01', DATE '2023-07-01', DATE '2023-10-01', + DATE '2024-01-01', DATE '2024-04-01', DATE '2024-07-01', DATE '2024-10-01', + DATE '2025-01-01', DATE '2025-04-01', DATE '2025-07-01', DATE '2025-10-01' + ]) +), + +-- Pre-calculate heuristics +flags AS ( + SELECT + date, + page, + uses_css_vars, + uses_tailwind, + + -- HEURISTIC BOOLEANS (Only true if uses_css_vars is also true) + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"#6366f1"')) AS has_indigo_500, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(#6366f1|#8b5cf6|#a855f7)"')) AS has_ai_purples, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'inter')) AS has_inter, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'roboto')) AS has_roboto, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'system-ui')) AS has_system_ui, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'linear-gradient\(|radial-gradient\(')) AS has_gradient, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'"(2px|4px|6px|8px|12px|16px|0\.25rem|0\.5rem|0\.75rem|1rem|9999px)"')) AS has_radius, + (uses_css_vars AND REGEXP_CONTAINS(vars_str, r'rgba\(|box-shadow')) AS has_shadow + FROM + raw_data 
+) + +SELECT + FORMAT_DATE('%Y-Q%Q', date) AS year_quarter, + + -- 1. CONTEXT (Denominators) + COUNT(DISTINCT page) AS total_sites, + COUNT(DISTINCT IF(uses_css_vars, page, NULL)) AS sites_using_vars, + COUNT(DISTINCT IF(uses_tailwind, page, NULL)) AS sites_using_tailwind, + + ------------------------------------------------------------------------- + -- 2. "AI PURPLE" SPECTRUM (Indigo/Violet/Purple 500) + ------------------------------------------------------------------------- + COUNT(DISTINCT IF(has_ai_purples, page, NULL)) AS cnt_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples, page, NULL)), COUNT(DISTINCT page)) AS pct_all_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_ai_purples, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_ai_purples AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_ai_purples, + + ------------------------------------------------------------------------- + -- 3. SPECIFIC INDIGO 500 (#6366f1 Only) + ------------------------------------------------------------------------- + COUNT(DISTINCT IF(has_indigo_500, page, NULL)) AS cnt_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500, page, NULL)), COUNT(DISTINCT page)) AS pct_all_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_indigo, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_indigo_500 AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_indigo, + + ------------------------------------------------------------------------- + -- 4. 
FONTS + ------------------------------------------------------------------------- + -- Inter + COUNT(DISTINCT IF(has_inter, page, NULL)) AS cnt_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT page)) AS pct_all_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_inter, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_inter AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_inter, + + -- Roboto + COUNT(DISTINCT IF(has_roboto, page, NULL)) AS cnt_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto, page, NULL)), COUNT(DISTINCT page)) AS pct_all_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_roboto, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_roboto AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_roboto, + + -- System UI + COUNT(DISTINCT IF(has_system_ui, page, NULL)) AS cnt_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui, page, NULL)), COUNT(DISTINCT page)) AS pct_all_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_system, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_system_ui AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_system, + + ------------------------------------------------------------------------- + -- 5. 
UI ELEMENTS + ------------------------------------------------------------------------- + -- Gradients + COUNT(DISTINCT IF(has_gradient, page, NULL)) AS cnt_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient, page, NULL)), COUNT(DISTINCT page)) AS pct_all_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_gradient, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_gradient AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_gradient, + + -- Radius + COUNT(DISTINCT IF(has_radius, page, NULL)) AS cnt_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius, page, NULL)), COUNT(DISTINCT page)) AS pct_all_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_radius, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_radius AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_radius, + + -- Shadows + COUNT(DISTINCT IF(has_shadow, page, NULL)) AS cnt_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow, page, NULL)), COUNT(DISTINCT page)) AS pct_all_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow, page, NULL)), COUNT(DISTINCT IF(uses_css_vars, page, NULL))) AS pct_vars_shadow, + IEEE_DIVIDE(COUNT(DISTINCT IF(has_shadow AND uses_tailwind, page, NULL)), COUNT(DISTINCT IF(uses_tailwind, page, NULL))) AS pct_tw_shadow + +FROM + flags +GROUP BY + year_quarter +ORDER BY + year_quarter; diff --git a/sql/2025/generative-ai/sites-with-robot-txt.sql b/sql/2025/generative-ai/sites-with-robot-txt.sql new file mode 100644 index 00000000000..2d74dff6a24 --- /dev/null +++ b/sql/2025/generative-ai/sites-with-robot-txt.sql @@ -0,0 +1,42 @@ +#standardSQL +-- % of sites whose robots.txt returns 200 and includes any directive, therefore the percentage is smaller (75% instead of 95% reported elsewhere) +WITH roots AS ( + SELECT + client, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, 
'$.status') AS INT64) AS status, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt, + COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page +) + +SELECT + client, + COUNT(DISTINCT root_page) AS sites, + COUNT(DISTINCT IF( + status = 200 AND + (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL + )) AS sites_with_robots_txt, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + status = 200 AND + (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0, + root_page, NULL + )), + COUNT(DISTINCT root_page) + ) AS pct_sites_with_robots_txt +FROM + roots +GROUP BY + client +ORDER BY + client; diff --git a/sql/2025/generative-ai/user-agent-named.sql b/sql/2025/generative-ai/user-agent-named.sql new file mode 100644 index 00000000000..51f7309dc2e --- /dev/null +++ b/sql/2025/generative-ai/user-agent-named.sql @@ -0,0 +1,144 @@ +#standardSQL +# Percent of sites with a given user-agent mentioned in robots.txt, by rank bucket +# Returns percentages among (a) all sites and (b) only sites with robots.txt status=200. 
+ +CREATE TEMP FUNCTION getByAgent(byua_json JSON, agent STRING) +RETURNS JSON +LANGUAGE js AS r""" + try { + const key = String(agent || '').toLowerCase(); + const rec = byua_json[key]; + return rec; + } catch (e) { return null; } +"""; + +-- Base rows: one per site +WITH base AS ( + SELECT + client, + rank, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.status) AS INT64) AS status, + custom_metrics.robots_txt.record_counts.by_useragent AS byua + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page +), + +-- Extract all UA keys present on each site (keys of by_useragent object) +ua_keys AS ( + SELECT + b.client, + b.rank, + b.root_page, + LOWER(agent) AS agent + FROM + base AS b, + UNNEST( + REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{') + ) AS agent +), + +-- Per-site per-agent presence (any directive count > 0) +ua_presence AS ( + SELECT + k.client, + k.rank, + k.root_page, + k.agent, + -- Look up this agent's counts without reparsing the whole robots again + getByAgent(b.byua, k.agent) AS agent_obj + FROM + ua_keys k + JOIN + base b + USING (client, rank, root_page) +), + +ua_presence_scored AS ( + SELECT + client, + rank, + root_page, + agent, + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.allow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.disallow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.crawl_delay) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.noindex) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.other) AS INT64), 0) AS rules_sum + FROM + ua_presence +), + +-- Totals per rank bucket +totals_all AS ( + SELECT + client, + rank, + COUNT(DISTINCT root_page) AS total_sites + FROM + base + GROUP BY + client, + rank +), + +totals_200 AS ( + SELECT + client, + rank, + COUNT(DISTINCT root_page) AS total_sites_200 + FROM + base + WHERE + status = 200 + GROUP BY + client, + rank +), + +-- Numerators per agent +numerators AS ( + SELECT + p.client, + p.rank, + 
p.agent, + COUNT(DISTINCT p.root_page) AS sites_with_agent, + COUNT(DISTINCT IF(b.status = 200, p.root_page, NULL)) AS sites_with_agent_among_200 + FROM + ua_presence_scored p + JOIN + base b + USING (client, rank, root_page) + WHERE + p.rules_sum > 0 + GROUP BY + p.client, + p.rank, + p.agent +) + +SELECT + n.client, + n.rank, + n.agent, + t.total_sites, + t2.total_sites_200, + n.sites_with_agent, + n.sites_with_agent_among_200, + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 +FROM + numerators n +JOIN + totals_all t +USING (client, rank) +JOIN + totals_200 t2 +USING (client, rank) +ORDER BY + rank, + client, + pct_of_all_sites DESC; diff --git a/sql/2025/generative-ai/user-agent-years.sql b/sql/2025/generative-ai/user-agent-years.sql new file mode 100644 index 00000000000..d8d318ae296 --- /dev/null +++ b/sql/2025/generative-ai/user-agent-years.sql @@ -0,0 +1,145 @@ +#standardSQL +-- Percent of sites mentioning each user-agent in robots.txt, over fixed Almanac snapshots + +CREATE TEMP FUNCTION getByAgent(byua_json JSON, agent STRING) +RETURNS JSON +LANGUAGE js AS r""" + try { + const rec = byua_json[String(agent || '').toLowerCase()]; + return rec; + } catch (e) { return null; } +"""; + +-- Base: one row per site snapshot (only fields we need) +WITH base AS ( + SELECT + date, + client, + rank, + root_page, + SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.status) AS INT64) AS status, + custom_metrics.robots_txt.record_counts.by_useragent AS byua + FROM + `httparchive.crawl.pages` + WHERE + date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND + client = 'mobile' AND + is_root_page +), + +-- Extract UA keys present on each site +ua_keys AS ( + SELECT + b.date, b.client, b.rank, b.root_page, + LOWER(agent) AS agent + FROM + base AS b, + UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent +), 
+ +-- Look up that agent’s counts on that site +ua_presence AS ( + SELECT + k.date, + k.client, + k.rank, + k.root_page, + k.agent, + getByAgent(b.byua, k.agent) AS agent_obj, + b.status + FROM + ua_keys k + JOIN + base b + USING (date, client, rank, root_page) +), + +-- Sum rule counts and keep only sites where the agent actually appears +ua_scored AS ( + SELECT + date, + client, + rank, + root_page, + agent, + status, + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.allow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.disallow) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.crawl_delay) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.noindex) AS INT64), 0) + + COALESCE(SAFE_CAST(JSON_VALUE(agent_obj.other) AS INT64), 0) AS rules_sum + FROM + ua_presence +), + +-- Denominators per (date, rank) +totals_all AS ( + SELECT + date, + client, + rank, + COUNT(DISTINCT root_page) AS total_sites + FROM + base + GROUP BY + date, + client, + rank +), + +totals_200 AS ( + SELECT + date, + client, + rank, + COUNT(DISTINCT root_page) AS total_sites_200 + FROM + base + WHERE + status = 200 + GROUP BY + date, + client, + rank +), + +-- Numerators per (date, rank, agent) +numerators AS ( + SELECT + date, client, rank, agent, + COUNT(DISTINCT IF(rules_sum > 0, root_page, NULL)) AS sites_with_agent, + COUNT(DISTINCT IF(status = 200 AND rules_sum > 0, root_page, NULL)) AS sites_with_agent_among_200 + FROM + ua_scored + GROUP BY + date, + client, + rank, + agent +) + +SELECT + n.date, + n.client, + n.rank, + n.agent, + t.total_sites, + t2.total_sites_200, + n.sites_with_agent, + n.sites_with_agent_among_200, + SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites, + SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200 +FROM + numerators n +JOIN + totals_all t +USING (date, client, rank) +JOIN + totals_200 t2 +USING (date, client, rank) +WHERE + n.sites_with_agent >= 100 +ORDER BY + n.date, + n.rank, + 
pct_of_all_sites DESC; diff --git a/sql/2025/generative-ai/vibecodetools.sql b/sql/2025/generative-ai/vibecodetools.sql new file mode 100644 index 00000000000..65cf77443da --- /dev/null +++ b/sql/2025/generative-ai/vibecodetools.sql @@ -0,0 +1,68 @@ +#standardSQL +WITH base AS ( + SELECT + date, + page, + NET.HOST(page) AS host + FROM + `httparchive.crawl.pages` + WHERE + client = 'desktop' AND + is_root_page AND + date IN ( + '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01', + '2021-01-01', '2021-04-01', '2021-07-01', '2021-10-01', + '2022-01-01', '2022-04-01', '2022-07-01', '2022-10-01', + '2023-01-01', '2023-04-01', '2023-07-01', '2023-10-01', + '2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01', + '2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01' + ) +), + +classified AS ( + SELECT + date, + page, + CASE + WHEN ENDS_WITH(host, '.vercel.app') THEN 'vercel' + WHEN ENDS_WITH(host, '.pages.dev') THEN 'cloudflare_pages' + WHEN ENDS_WITH(host, '.workers.dev') THEN 'cloudflare_workers' + WHEN ENDS_WITH(host, '.lovable.app') OR ENDS_WITH(host, 'lovable.dev') THEN 'lovable' + WHEN ENDS_WITH(host, '.bolt.new') OR ENDS_WITH(host, 'stackblitz.io') THEN 'bolt' + WHEN ENDS_WITH(host, '.v0.dev') THEN 'v0' + WHEN ENDS_WITH(host, '.replit.app') THEN 'replit' + ELSE NULL + END AS platform + FROM + base +), + +totals AS ( + SELECT + date, + COUNT(0) AS total_pages + FROM + base + GROUP BY + date +) + +SELECT + c.date, + c.platform, + COUNT(0) AS pages, + SAFE_DIVIDE(COUNT(0), t.total_pages) AS pct_pages +FROM + classified c +JOIN + totals t +USING (date) +WHERE + c.platform IS NOT NULL +GROUP BY + c.date, + c.platform, + t.total_pages +ORDER BY + c.date, + pct_pages DESC;