Skip to content

Commit a8e8c5e

Browse files
Fix linter errors in SQL scripts
1 parent 19dc554 commit a8e8c5e

File tree

8 files changed

+83
-69
lines changed

8 files changed

+83
-69
lines changed

sql/2025/generative-ai/ai_tld.sql

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,11 +10,11 @@ WITH base AS (
1010
FROM
1111
`httparchive.crawl.pages`
1212
WHERE
13-
is_root_page
14-
AND client IN ('desktop', 'mobile')
15-
AND date IN ('2022-06-01', '2025-07-01')
16-
AND rank <= 10000000
17-
AND ENDS_WITH(NET.HOST(page), '.ai')
13+
is_root_page AND
14+
client IN ('desktop', 'mobile') AND
15+
date IN ('2022-06-01', '2025-07-01') AND
16+
rank <= 10000000 AND
17+
ENDS_WITH(NET.HOST(page), '.ai')
1818
),
1919

2020
bucketed AS (

sql/2025/generative-ai/gradient-client-rank.sql

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,16 @@
33
-- Grouped by: year, client, rank bucket
44

55
WITH ranks AS (
6-
SELECT 1000 AS rank_grouping UNION ALL
7-
SELECT 10000 UNION ALL
8-
SELECT 100000 UNION ALL
9-
SELECT 1000000 UNION ALL
10-
SELECT 10000000 UNION ALL
6+
SELECT 1000 AS rank_grouping
7+
UNION ALL
8+
SELECT 10000
9+
UNION ALL
10+
SELECT 100000
11+
UNION ALL
12+
SELECT 1000000
13+
UNION ALL
14+
SELECT 10000000
15+
UNION ALL
1116
SELECT 100000000
1217
)
1318

@@ -38,9 +43,9 @@ SELECT
3843
FROM `httparchive.crawl.pages`
3944
CROSS JOIN ranks r
4045
WHERE
41-
is_root_page
42-
AND rank <= r.rank_grouping
43-
AND date IN (
46+
is_root_page AND
47+
rank <= r.rank_grouping AND
48+
date IN (
4449
DATE '2019-07-01',
4550
DATE '2020-08-01',
4651
DATE '2021-07-01',

sql/2025/generative-ai/gradient-use.sql

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,17 +8,19 @@ SELECT
88
COUNT(DISTINCT page) AS total_pages,
99
COUNT(DISTINCT IF(
1010
REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('),
11-
page, NULL)) AS pages_with_gradient,
11+
page, NULL
12+
)) AS pages_with_gradient,
1213
SAFE_DIVIDE(
1314
COUNT(DISTINCT IF(
1415
REGEXP_CONTAINS(TO_JSON_STRING(custom_metrics.css_variables), r'(?i)gradient\('),
15-
page, NULL)),
16+
page, NULL
17+
)),
1618
COUNT(DISTINCT page)
1719
) AS pct_with_gradient
1820
FROM `httparchive.crawl.pages`
1921
WHERE
20-
is_root_page
21-
AND date IN (
22+
is_root_page AND
23+
date IN (
2224
DATE '2019-07-01',
2325
DATE '2020-08-01',
2426
DATE '2021-07-01',

sql/2025/generative-ai/gradient.sql

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@ WITH raw_data AS (
66
-- 1. TECHNOLOGY FLAGS
77
-- CSS Variables: Exclude NULL, {}, '{"summary":{}}', and 'null' string
88
(
9-
custom_metrics.css_variables IS NOT NULL
10-
AND TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null')
9+
custom_metrics.css_variables IS NOT NULL AND
10+
TO_JSON_STRING(custom_metrics.css_variables) NOT IN ('{}', '{"summary":{}}', 'null')
1111
) AS uses_css_vars,
1212

1313
-- Tailwind: Check the array for the technology
@@ -20,12 +20,12 @@ WITH raw_data AS (
2020
FROM
2121
`httparchive.crawl.pages`
2222
WHERE
23-
client = 'mobile'
24-
AND is_root_page
23+
client = 'mobile' AND
24+
is_root_page AND
2525
-- NO RANK FILTER (Analyze the entire long-tail of the web)
2626

2727
-- Quarterly Dates
28-
AND date IN UNNEST([
28+
date IN UNNEST([
2929
DATE '2020-10-01',
3030
DATE '2021-01-01', DATE '2021-04-01', DATE '2021-07-01', DATE '2021-10-01',
3131
DATE '2022-01-01', DATE '2022-04-01', DATE '2022-07-01', DATE '2022-10-01',

sql/2025/generative-ai/sites-with-robot-txt.sql

Lines changed: 19 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -5,26 +5,31 @@ WITH roots AS (
55
client,
66
root_page,
77
SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
8-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt,
9-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt,
10-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt,
11-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt,
12-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt,
13-
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt
8+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.allow') AS INT64), 0) AS allow_cnt,
9+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.disallow') AS INT64), 0) AS disallow_cnt,
10+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.crawl_delay') AS INT64), 0) AS crawl_delay_cnt,
11+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.noindex') AS INT64), 0) AS noindex_cnt,
12+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.sitemap') AS INT64), 0) AS sitemap_cnt,
13+
COALESCE(SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt.record_counts.by_type, '$.user_agent') AS INT64), 0) AS ua_cnt
1414
FROM `httparchive.crawl.pages`
15-
WHERE date = '2025-07-01'
16-
AND is_root_page
15+
WHERE date = '2025-07-01' AND
16+
is_root_page
1717
)
18+
1819
SELECT
1920
client,
2021
COUNT(DISTINCT root_page) AS sites,
21-
COUNT(DISTINCT IF(status = 200
22-
AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0,
23-
root_page, NULL)) AS sites_with_robots_txt,
22+
COUNT(DISTINCT IF(
23+
status = 200 AND
24+
(allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0,
25+
root_page, NULL
26+
)) AS sites_with_robots_txt,
2427
SAFE_DIVIDE(
25-
COUNT(DISTINCT IF(status = 200
26-
AND (allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0,
27-
root_page, NULL)),
28+
COUNT(DISTINCT IF(
29+
status = 200 AND
30+
(allow_cnt + disallow_cnt + crawl_delay_cnt + noindex_cnt + sitemap_cnt + ua_cnt) > 0,
31+
root_page, NULL
32+
)),
2833
COUNT(DISTINCT root_page)
2934
) AS pct_sites_with_robots_txt
3035
FROM roots

sql/2025/generative-ai/user-agent-named.sql

Lines changed: 15 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@ WITH base AS (
2222
SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
2323
JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua
2424
FROM `httparchive.crawl.pages`
25-
WHERE date = '2025-07-01'
26-
AND is_root_page
25+
WHERE date = '2025-07-01' AND
26+
is_root_page
2727
),
2828

2929
-- Extract all UA keys present on each site (keys of by_useragent object)
@@ -34,9 +34,9 @@ ua_keys AS (
3434
b.root_page,
3535
LOWER(agent) AS agent
3636
FROM base AS b,
37-
UNNEST(
38-
REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')
39-
) AS agent
37+
UNNEST(
38+
REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')
39+
) AS agent
4040
),
4141

4242
-- Per-site per-agent presence (any directive count > 0)
@@ -50,7 +50,7 @@ ua_presence AS (
5050
getByAgent(TO_JSON_STRING(b.byua), k.agent) AS agent_obj
5151
FROM ua_keys k
5252
JOIN base b
53-
USING (client, rank, root_page)
53+
USING (client, rank, root_page)
5454
),
5555

5656
ua_presence_scored AS (
@@ -59,11 +59,11 @@ ua_presence_scored AS (
5959
rank,
6060
root_page,
6161
agent,
62-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
63-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
64-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) +
65-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
66-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
62+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
63+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
64+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) +
65+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
66+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
6767
FROM ua_presence
6868
),
6969

@@ -73,6 +73,7 @@ totals_all AS (
7373
FROM base
7474
GROUP BY client, rank
7575
),
76+
7677
totals_200 AS (
7778
SELECT client, rank, COUNT(DISTINCT root_page) AS total_sites_200
7879
FROM base
@@ -102,9 +103,9 @@ SELECT
102103
t2.total_sites_200,
103104
n.sites_with_agent,
104105
n.sites_with_agent_among_200,
105-
SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
106-
SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
106+
SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
107+
SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
107108
FROM numerators n
108-
JOIN totals_all t USING (client, rank)
109+
JOIN totals_all t USING (client, rank)
109110
JOIN totals_200 t2 USING (client, rank)
110111
ORDER BY rank, client, pct_of_all_sites DESC;

sql/2025/generative-ai/user-agent-years.sql

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,9 @@ WITH base AS (
2121
SAFE_CAST(JSON_VALUE(custom_metrics.robots_txt, '$.status') AS INT64) AS status,
2222
JSON_QUERY(custom_metrics.robots_txt, '$.record_counts.by_useragent') AS byua
2323
FROM `httparchive.crawl.pages`
24-
WHERE date IN ('2019-07-01','2020-08-01','2021-07-01','2022-06-01','2024-06-01','2025-07-01')
25-
AND client = 'mobile'
26-
AND is_root_page
24+
WHERE date IN ('2019-07-01', '2020-08-01', '2021-07-01', '2022-06-01', '2024-06-01', '2025-07-01') AND
25+
client = 'mobile' AND
26+
is_root_page
2727
),
2828

2929
-- Extract UA keys present on each site
@@ -32,7 +32,7 @@ ua_keys AS (
3232
b.date, b.client, b.rank, b.root_page,
3333
LOWER(agent) AS agent
3434
FROM base AS b,
35-
UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent
35+
UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(b.byua), r'"([^"]+)":\{')) AS agent
3636
),
3737

3838
-- Look up that agent’s counts on that site
@@ -49,11 +49,11 @@ ua_presence AS (
4949
ua_scored AS (
5050
SELECT
5151
date, client, rank, root_page, agent, status,
52-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
53-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
52+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.allow') AS INT64), 0) +
53+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.disallow') AS INT64), 0) +
5454
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.crawl_delay') AS INT64), 0) +
55-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
56-
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
55+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.noindex') AS INT64), 0) +
56+
COALESCE(SAFE_CAST(JSON_VALUE(agent_obj, '$.other') AS INT64), 0) AS rules_sum
5757
FROM ua_presence
5858
),
5959

@@ -63,6 +63,7 @@ totals_all AS (
6363
FROM base
6464
GROUP BY date, client, rank
6565
),
66+
6667
totals_200 AS (
6768
SELECT date, client, rank, COUNT(DISTINCT root_page) AS total_sites_200
6869
FROM base
@@ -89,10 +90,10 @@ SELECT
8990
t2.total_sites_200,
9091
n.sites_with_agent,
9192
n.sites_with_agent_among_200,
92-
SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
93+
SAFE_DIVIDE(n.sites_with_agent, t.total_sites) AS pct_of_all_sites,
9394
SAFE_DIVIDE(n.sites_with_agent_among_200, t2.total_sites_200) AS pct_of_sites_with_200
9495
FROM numerators n
95-
JOIN totals_all t USING (date, client, rank)
96+
JOIN totals_all t USING (date, client, rank)
9697
JOIN totals_200 t2 USING (date, client, rank)
9798
WHERE n.sites_with_agent >= 100
9899
ORDER BY n.date, n.rank, pct_of_all_sites DESC;

sql/2025/generative-ai/vibecodetools.sql

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -7,15 +7,15 @@ WITH base AS (
77
FROM
88
`httparchive.crawl.pages`
99
WHERE
10-
client = 'desktop'
11-
AND is_root_page
12-
AND date IN (
13-
'2020-01-01','2020-04-01','2020-07-01','2020-10-01',
14-
'2021-01-01','2021-04-01','2021-07-01','2021-10-01',
15-
'2022-01-01','2022-04-01','2022-07-01','2022-10-01',
16-
'2023-01-01','2023-04-01','2023-07-01','2023-10-01',
17-
'2024-01-01','2024-04-01','2024-07-01','2024-10-01',
18-
'2025-01-01','2025-04-01','2025-07-01','2025-10-01'
10+
client = 'desktop' AND
11+
is_root_page AND
12+
date IN (
13+
'2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01',
14+
'2021-01-01', '2021-04-01', '2021-07-01', '2021-10-01',
15+
'2022-01-01', '2022-04-01', '2022-07-01', '2022-10-01',
16+
'2023-01-01', '2023-04-01', '2023-07-01', '2023-10-01',
17+
'2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01',
18+
'2025-01-01', '2025-04-01', '2025-07-01', '2025-10-01'
1919
)
2020
),
2121

0 commit comments

Comments
 (0)