Skip to content

Commit ab8cddf

Browse files
committed
Fix not working SQL queries
1 parent 2a7bdae commit ab8cddf

9 files changed

Lines changed: 242 additions & 136 deletions

sql/2025/sustainability/favicons.sql

Lines changed: 130 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,59 @@ CREATE TEMPORARY FUNCTION GETFAVICONIMAGE(payload STRING)
44
RETURNS STRING LANGUAGE js AS '''
55
var result = 'NO_DATA';
66
try {
7-
var almanac = JSON.parse(payload);
7+
var parsed = JSON.parse(payload);
8+
9+
// If wrapped, unwrap _almanac
10+
if (parsed && typeof parsed === 'object' && parsed._almanac && typeof parsed._almanac === 'object') {
11+
parsed = parsed._almanac;
12+
}
813
9-
if (Array.isArray(almanac) || typeof almanac != 'object') return result;
14+
// Deep search for any array of link-like nodes anywhere in the object
15+
function findLinkNodes(obj) {
16+
if (!obj) return [];
17+
var stack = [obj];
18+
while (stack.length) {
19+
var current = stack.pop();
20+
if (!current) continue;
21+
if (Array.isArray(current)) {
22+
// If array of objects with rel/href, treat as nodes
23+
if (
24+
current.length && typeof current[0] === 'object' && current.some(function(it){return it && (it.href || it.rel);})
25+
) {
26+
return current;
27+
}
28+
for (var i = 0; i < current.length; i++) stack.push(current[i]);
29+
} else if (typeof current === 'object') {
30+
// Common patterns: {nodes: [...]} wrappers
31+
if (current.nodes && Array.isArray(current.nodes)) {
32+
var n = current.nodes;
33+
if (n.length && typeof n[0] === 'object' && n.some(function(it){return it && (it.href || it.rel);})){return n;}
34+
}
35+
for (var k in current) if (Object.prototype.hasOwnProperty.call(current, k)) stack.push(current[k]);
36+
}
37+
}
38+
return [];
39+
}
1040
11-
if (almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.find) {
12-
var faviconNode = almanac["link-nodes"].nodes.find(n => n.rel && n.rel.split(' ').find(r => r.trim().toLowerCase() == 'icon'));
41+
var nodes = findLinkNodes(parsed);
42+
if (!nodes || !nodes.length) return result;
43+
44+
if (nodes && nodes.find) {
45+
var faviconNode = nodes.find(function(n) {
46+
if (!n || !('rel' in n)) return false;
47+
var rels = Array.isArray(n.rel) ? n.rel : String(n.rel).split(' ');
48+
for (var j = 0; j < rels.length; j++) {
49+
var r = String(rels[j]).trim().toLowerCase();
50+
if (r === 'icon' || r === 'shortcut icon' || r === 'apple-touch-icon' || r === 'apple-touch-icon-precomposed') {
51+
return true;
52+
}
53+
}
54+
return false;
55+
});
1356
1457
if (faviconNode) {
1558
if (faviconNode.href) {
16-
var temp = faviconNode.href;
59+
var temp = String(faviconNode.href);
1760
1861
if (temp.includes('?')) {
1962
temp = temp.substring(0, temp.indexOf('?'));
@@ -43,29 +86,98 @@ try {
4386
return result;
4487
''';
4588

46-
# Main query to analyze favicon image extensions with sampling
47-
WITH favicons AS (
89+
# Main query to analyze favicon image extensions using request URL and Content-Type heuristics
90+
WITH pages AS (
4891
SELECT
4992
client,
50-
GETFAVICONIMAGE(
51-
JSON_EXTRACT_SCALAR(payload, '$._almanac')
52-
) AS image_type_extension,
53-
COUNT(0) AS freq,
54-
SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
55-
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS percentage_of_total
93+
page
5694
FROM
5795
`httparchive.crawl.pages`
5896
WHERE
59-
date = '2025-06-01'
97+
date = '2025-06-01' AND
98+
is_root_page
99+
),
100+
101+
reqs AS (
102+
SELECT
103+
client,
104+
page,
105+
url,
106+
response_headers
107+
FROM
108+
`httparchive.crawl.requests`
109+
WHERE
110+
date = '2025-06-01' AND
111+
is_root_page
112+
),
113+
114+
candidates AS (
115+
SELECT
116+
r.client,
117+
r.page,
118+
r.url,
119+
LOWER(
120+
REGEXP_EXTRACT(
121+
REGEXP_REPLACE(SPLIT(r.url, '?')[SAFE_OFFSET(0)], r'/+$', ''),
122+
r'\.([A-Za-z0-9]+)$'
123+
)
124+
) AS url_ext,
125+
(
126+
SELECT LOWER(value)
127+
FROM UNNEST(r.response_headers)
128+
WHERE LOWER(name) = 'content-type'
129+
LIMIT 1
130+
) AS content_type
131+
FROM
132+
reqs r
133+
JOIN
134+
pages p
135+
USING (client, page)
136+
WHERE
137+
REGEXP_CONTAINS(LOWER(r.url), r'favicon|apple-touch-icon|android-chrome|mstile|safari-pinned-tab')
138+
),
139+
140+
resolved AS (
141+
SELECT
142+
client,
143+
page,
144+
url,
145+
COALESCE(
146+
NULLIF(url_ext, ''),
147+
CASE
148+
WHEN content_type LIKE 'image/svg%' THEN 'svg'
149+
WHEN content_type LIKE 'image/png%' THEN 'png'
150+
WHEN content_type LIKE 'image/webp%' THEN 'webp'
151+
WHEN content_type LIKE 'image/jpeg%' OR content_type LIKE 'image/jpg%' THEN 'jpg'
152+
WHEN content_type LIKE 'image/x-icon%' OR content_type LIKE 'image/vnd.microsoft.icon%' THEN 'ico'
153+
ELSE 'unknown'
154+
END
155+
) AS image_type_extension
156+
FROM
157+
candidates
158+
),
159+
160+
rollup_data AS (
161+
SELECT
162+
client,
163+
image_type_extension,
164+
COUNT(DISTINCT page) AS pages
165+
FROM
166+
resolved
60167
GROUP BY
61168
client,
62169
image_type_extension
63170
)
64171

65172
SELECT
66-
*,
67-
percentage_of_total AS pct
173+
client,
174+
image_type_extension,
175+
pages AS count,
176+
SUM(pages) OVER (PARTITION BY client) AS total,
177+
ROUND(100 * SAFE_DIVIDE(pages, SUM(pages) OVER (PARTITION BY client)), 2) AS pct
68178
FROM
69-
favicons
179+
rollup_data
70180
ORDER BY
71-
pct DESC
181+
client ASC,
182+
count DESC,
183+
image_type_extension ASC

sql/2025/sustainability/green_third_party_requests.sql

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,23 @@
11
#standardSQL
22
# Median third-parties & green third-party requests per website by rank
33

4-
WITH requests AS (
4+
WITH third_party_date AS (
5+
SELECT MAX(date) AS date
6+
FROM `httparchive.almanac.third_parties`
7+
WHERE date <= '2025-06-01'
8+
),
9+
10+
gwf_date AS (
11+
SELECT MAX(date) AS date
12+
FROM `httparchive.almanac.green_web_foundation`
13+
WHERE date <= '2025-06-01'
14+
),
15+
16+
requests AS (
517
SELECT
618
client,
719
url,
8-
CAST(JSON_VALUE(summary, '$.pageid') AS INT64) AS page
20+
page
921
FROM
1022
`httparchive.crawl.requests`
1123
WHERE
@@ -17,20 +29,21 @@ green AS (
1729
TRUE AS is_green,
1830
NET.HOST(url) AS host
1931
FROM
20-
`httparchive.almanac.green_web_foundation`
21-
WHERE
22-
date = '2025-09-01'
32+
`httparchive.almanac.green_web_foundation` g
33+
JOIN gwf_date d
34+
ON g.date = d.date
2335
),
2436

2537
pages AS (
2638
SELECT
2739
client,
2840
rank,
29-
CAST(JSON_VALUE(summary, '$.pageid') AS INT64) AS page
41+
page
3042
FROM
3143
`httparchive.crawl.pages`
3244
WHERE
33-
date = '2025-06-01'
45+
date = '2025-06-01' AND
46+
is_root_page
3447
),
3548

3649
third_party AS (
@@ -41,9 +54,9 @@ third_party AS (
4154
`httparchive.almanac.third_parties` AS tp
4255
INNER JOIN
4356
requests AS r
44-
ON NET.HOST(r.url) = NET.HOST(tp.domain)
57+
ON NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(tp.domain)
4558
WHERE
46-
tp.date = '2025-06-01' AND
59+
tp.date = (SELECT date FROM third_party_date) AND
4760
tp.category NOT IN ('hosting')
4861
GROUP BY
4962
tp.domain
@@ -57,9 +70,9 @@ green_tp AS (
5770
`httparchive.almanac.third_parties` AS tp
5871
INNER JOIN
5972
green AS g
60-
ON NET.HOST(g.host) = NET.HOST(tp.domain)
73+
ON NET.REG_DOMAIN(g.host) = NET.REG_DOMAIN(tp.domain)
6174
WHERE
62-
tp.date = '2025-06-01' AND
75+
tp.date = (SELECT date FROM third_party_date) AND
6376
tp.category NOT IN ('hosting')
6477
GROUP BY
6578
tp.domain
@@ -70,13 +83,13 @@ base AS (
7083
r.client,
7184
r.page,
7285
p.rank,
73-
COUNT(tp.domain) AS third_parties_per_page
86+
COUNT(DISTINCT tp.domain) AS third_parties_per_page
7487
FROM
7588
requests AS r
7689
LEFT JOIN
7790
third_party AS tp
7891
ON
79-
NET.HOST(r.url) = NET.HOST(tp.domain)
92+
NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(tp.domain)
8093
INNER JOIN
8194
pages AS p
8295
ON r.client = p.client AND r.page = p.page
@@ -91,13 +104,13 @@ base_green AS (
91104
r.client,
92105
r.page,
93106
p.rank,
94-
COUNT(gtp.domain) AS green_third_parties_per_page
107+
COUNT(DISTINCT gtp.domain) AS green_third_parties_per_page
95108
FROM
96109
requests AS r
97110
LEFT JOIN
98111
green_tp AS gtp
99112
ON
100-
NET.HOST(r.url) = NET.HOST(gtp.domain)
113+
NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(gtp.domain)
101114
INNER JOIN
102115
pages AS p
103116
ON r.client = p.client AND r.page = p.page
@@ -126,7 +139,7 @@ SELECT
126139
bg.green_third_parties_per_page,
127140
b.third_parties_per_page
128141
), 1000
129-
)[OFFSET(500)] AS pct_green
142+
)[OFFSET(500)] * 100 AS pct_green
130143
FROM
131144
base AS b,
132145
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping

sql/2025/sustainability/green_web_hosting.sql

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
# standardSQL
22
# What percentage of URLs are hosted on a known green web hosting provider?
33

4-
WITH green AS (
4+
WITH gwf_date AS (
5+
SELECT MAX(date) AS date
6+
FROM `httparchive.almanac.green_web_foundation`
7+
WHERE date <= '2025-06-01'
8+
),
9+
10+
green AS (
511
SELECT
612
TRUE AS is_green,
713
NET.HOST(url) AS host
814
FROM
9-
`httparchive.almanac.green_web_foundation`
10-
WHERE
11-
date = '2025-09-01'
15+
`httparchive.almanac.green_web_foundation` g
16+
JOIN
17+
gwf_date d
18+
ON g.date = d.date
1219
),
1320

1421
pages AS (
@@ -34,7 +41,7 @@ SELECT
3441
END AS ranking,
3542
COUNTIF(is_green) AS total_green,
3643
COUNT(0) AS total_sites,
37-
SAFE_DIVIDE(COUNTIF(is_green), COUNT(0)) AS pct_green
44+
ROUND(100 * SAFE_DIVIDE(COUNTIF(is_green), COUNT(0)), 2) AS pct_green
3845
FROM (
3946
-- Left join green hosting information
4047
SELECT

0 commit comments

Comments
 (0)