Skip to content

Commit 1d3f247

Browse files
committed
fix some queries
1 parent ab8cddf commit 1d3f247

4 files changed

Lines changed: 46 additions & 161 deletions

File tree

Lines changed: 19 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,17 @@
11
#standardSQL
22
# Temporary function to extract favicon image extensions from the JSON payload
3-
CREATE TEMPORARY FUNCTION GETFAVICONIMAGE(payload STRING)
3+
CREATE TEMPORARY FUNCTION getFaviconImage(almanac JSON)
44
RETURNS STRING LANGUAGE js AS '''
55
var result = 'NO_DATA';
66
try {
7-
var parsed = JSON.parse(payload);
8-
9-
// If wrapped, unwrap _almanac
10-
if (parsed && typeof parsed === 'object' && parsed._almanac && typeof parsed._almanac === 'object') {
11-
parsed = parsed._almanac;
12-
}
13-
14-
// Deep search for any array of link-like nodes anywhere in the object
15-
function findLinkNodes(obj) {
16-
if (!obj) return [];
17-
var stack = [obj];
18-
while (stack.length) {
19-
var current = stack.pop();
20-
if (!current) continue;
21-
if (Array.isArray(current)) {
22-
// If array of objects with rel/href, treat as nodes
23-
if (
24-
current.length && typeof current[0] === 'object' && current.some(function(it){return it && (it.href || it.rel);})
25-
) {
26-
return current;
27-
}
28-
for (var i = 0; i < current.length; i++) stack.push(current[i]);
29-
} else if (typeof current === 'object') {
30-
// Common patterns: {nodes: [...]} wrappers
31-
if (current.nodes && Array.isArray(current.nodes)) {
32-
var n = current.nodes;
33-
if (n.length && typeof n[0] === 'object' && n.some(function(it){return it && (it.href || it.rel);})){return n;}
34-
}
35-
for (var k in current) if (Object.prototype.hasOwnProperty.call(current, k)) stack.push(current[k]);
36-
}
37-
}
38-
return [];
39-
}
7+
if (Array.isArray(almanac) || typeof almanac != 'object') return result;
408
41-
var nodes = findLinkNodes(parsed);
42-
if (!nodes || !nodes.length) return result;
43-
44-
if (nodes && nodes.find) {
45-
var faviconNode = nodes.find(function(n) {
46-
if (!n || !('rel' in n)) return false;
47-
var rels = Array.isArray(n.rel) ? n.rel : String(n.rel).split(' ');
48-
for (var j = 0; j < rels.length; j++) {
49-
var r = String(rels[j]).trim().toLowerCase();
50-
if (r === 'icon' || r === 'shortcut icon' || r === 'apple-touch-icon' || r === 'apple-touch-icon-precomposed') {
51-
return true;
52-
}
53-
}
54-
return false;
55-
});
9+
if (almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.find) {
10+
var faviconNode = almanac["link-nodes"].nodes.find(n => n.rel && n.rel.split(' ').find(r => r.trim().toLowerCase() == 'icon'));
5611
5712
if (faviconNode) {
5813
if (faviconNode.href) {
59-
var temp = String(faviconNode.href);
14+
var temp = faviconNode.href;
6015
6116
if (temp.includes('?')) {
6217
temp = temp.substring(0, temp.indexOf('?'));
@@ -86,98 +41,30 @@ try {
8641
return result;
8742
''';
8843

89-
# Main query to analyze favicon image extensions using requests heuristics
90-
WITH pages AS (
44+
# Main query: frequency and per-client share of each favicon image extension (no sampling; output capped by LIMIT)
45+
WITH favicons AS (
9146
SELECT
9247
client,
93-
page
48+
getFaviconImage(custom_metrics.other.almanac) AS image_type_extension,
49+
COUNT(0) AS freq,
50+
SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
51+
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS percentage_of_total
9452
FROM
9553
`httparchive.crawl.pages`
9654
WHERE
97-
date = '2025-06-01' AND
98-
is_root_page
99-
),
100-
101-
reqs AS (
102-
SELECT
103-
client,
104-
page,
105-
url,
106-
response_headers
107-
FROM
108-
`httparchive.crawl.requests`
109-
WHERE
110-
date = '2025-06-01' AND
111-
is_root_page
112-
),
113-
114-
candidates AS (
115-
SELECT
116-
r.client,
117-
r.page,
118-
r.url,
119-
LOWER(
120-
REGEXP_EXTRACT(
121-
REGEXP_REPLACE(SPLIT(r.url, '?')[SAFE_OFFSET(0)], r'/+$', ''),
122-
r'\.([A-Za-z0-9]+)$'
123-
)
124-
) AS url_ext,
125-
(
126-
SELECT LOWER(value)
127-
FROM UNNEST(r.response_headers)
128-
WHERE LOWER(name) = 'content-type'
129-
LIMIT 1
130-
) AS content_type
131-
FROM
132-
reqs r
133-
JOIN
134-
pages p
135-
USING (client, page)
136-
WHERE
137-
REGEXP_CONTAINS(LOWER(r.url), r'favicon|apple-touch-icon|android-chrome|mstile|safari-pinned-tab')
138-
),
139-
140-
resolved AS (
141-
SELECT
142-
client,
143-
page,
144-
url,
145-
COALESCE(
146-
NULLIF(url_ext, ''),
147-
CASE
148-
WHEN content_type LIKE 'image/svg%' THEN 'svg'
149-
WHEN content_type LIKE 'image/png%' THEN 'png'
150-
WHEN content_type LIKE 'image/webp%' THEN 'webp'
151-
WHEN content_type LIKE 'image/jpeg%' OR content_type LIKE 'image/jpg%' THEN 'jpg'
152-
WHEN content_type LIKE 'image/x-icon%' OR content_type LIKE 'image/vnd.microsoft.icon%' THEN 'ico'
153-
ELSE 'unknown'
154-
END
155-
) AS image_type_extension
156-
FROM
157-
candidates
158-
),
159-
160-
rollup_data AS (
161-
SELECT
162-
client,
163-
image_type_extension,
164-
COUNT(DISTINCT page) AS pages
165-
FROM
166-
resolved
55+
date = '2025-07-01'
16756
GROUP BY
16857
client,
16958
image_type_extension
17059
)
17160

17261
SELECT
173-
client,
174-
image_type_extension,
175-
pages AS count,
176-
SUM(pages) OVER (PARTITION BY client) AS total,
177-
ROUND(100 * SAFE_DIVIDE(pages, SUM(pages) OVER (PARTITION BY client)), 2) AS pct
62+
*,
63+
percentage_of_total AS pct
17864
FROM
179-
rollup_data
65+
favicons
18066
ORDER BY
181-
client ASC,
182-
count DESC,
183-
image_type_extension ASC
67+
pct DESC
68+
LIMIT
69+
1000;
70+

sql/2025/sustainability/green_third_party_requests.sql

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ third_party AS (
5454
`httparchive.almanac.third_parties` AS tp
5555
INNER JOIN
5656
requests AS r
57-
ON NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(tp.domain)
57+
ON NET.HOST(r.url) = NET.HOST(tp.domain)
5858
WHERE
5959
tp.date = (SELECT date FROM third_party_date) AND
6060
tp.category NOT IN ('hosting')
@@ -70,7 +70,7 @@ green_tp AS (
7070
`httparchive.almanac.third_parties` AS tp
7171
INNER JOIN
7272
green AS g
73-
ON NET.REG_DOMAIN(g.host) = NET.REG_DOMAIN(tp.domain)
73+
ON NET.HOST(g.host) = NET.HOST(tp.domain)
7474
WHERE
7575
tp.date = (SELECT date FROM third_party_date) AND
7676
tp.category NOT IN ('hosting')
@@ -83,13 +83,13 @@ base AS (
8383
r.client,
8484
r.page,
8585
p.rank,
86-
COUNT(DISTINCT tp.domain) AS third_parties_per_page
86+
COUNT(tp.domain) AS third_parties_per_page
8787
FROM
8888
requests AS r
8989
LEFT JOIN
9090
third_party AS tp
9191
ON
92-
NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(tp.domain)
92+
NET.HOST(r.url) = NET.HOST(tp.domain)
9393
INNER JOIN
9494
pages AS p
9595
ON r.client = p.client AND r.page = p.page
@@ -104,13 +104,13 @@ base_green AS (
104104
r.client,
105105
r.page,
106106
p.rank,
107-
COUNT(DISTINCT gtp.domain) AS green_third_parties_per_page
107+
COUNT(gtp.domain) AS green_third_parties_per_page
108108
FROM
109109
requests AS r
110110
LEFT JOIN
111111
green_tp AS gtp
112112
ON
113-
NET.REG_DOMAIN(r.url) = NET.REG_DOMAIN(gtp.domain)
113+
NET.HOST(r.url) = NET.HOST(gtp.domain)
114114
INNER JOIN
115115
pages AS p
116116
ON r.client = p.client AND r.page = p.page
@@ -139,7 +139,7 @@ SELECT
139139
bg.green_third_parties_per_page,
140140
b.third_parties_per_page
141141
), 1000
142-
)[OFFSET(500)] * 100 AS pct_green
142+
)[OFFSET(500)] AS pct_green
143143
FROM
144144
base AS b,
145145
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping

sql/2025/sustainability/responsive_images.sql

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,60 +4,58 @@
44
WITH page_data AS (
55
SELECT
66
client,
7-
-- Totals from markup custom metric
8-
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.markup), '$.images.img.srcset_total') AS INT64) AS img_srcset_total,
9-
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.markup), '$.images.source.srcset_total') AS INT64) AS source_srcset_total,
10-
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.markup), '$.images.picture.total') AS INT64) AS picture_total,
11-
12-
-- Sizes totals (may be missing; will be NULL)
13-
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.markup), '$.images.img.sizes_total') AS INT64) AS img_sizes_total,
14-
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.markup), '$.images.source.sizes_total') AS INT64) AS source_sizes_total
7+
-- Count <img>/<source> tags in the main document's HTML whose tag text contains `srcset=` (num_srcset_all) or `sizes=` (num_srcset_sizes)
8+
ARRAY_LENGTH(REGEXP_EXTRACT_ALL(COALESCE(response_body, ''), r'(?is)<(?:img|source)[^>]*srcset\s*=')) AS num_srcset_all,
9+
ARRAY_LENGTH(REGEXP_EXTRACT_ALL(COALESCE(response_body, ''), r'(?is)<(?:img|source)[^>]*sizes\s*=')) AS num_srcset_sizes,
10+
-- 1 if the HTML contains at least one <picture> tag, otherwise 0 (a presence flag, not a count)
11+
IF(REGEXP_CONTAINS(COALESCE(response_body, ''), r'(?is)<picture\b'), 1, 0) AS picture_total
1512
FROM
16-
`httparchive.crawl.pages`
13+
`httparchive.crawl.requests`
1714
WHERE
18-
date = '2025-06-01' AND is_root_page
15+
date = '2025-06-01' AND
16+
is_main_document
1917
)
2018

2119
SELECT
2220
client,
2321
round(
2422
safe_divide(
25-
countif(coalesce(img_srcset_total, 0) + coalesce(source_srcset_total, 0) > 0),
23+
countif(num_srcset_all > 0),
2624
count(0)
2725
) * 100,
2826
2
2927
) AS pages_with_srcset_pct,
3028
round(
3129
safe_divide(
32-
countif(coalesce(img_sizes_total, 0) + coalesce(source_sizes_total, 0) > 0),
30+
countif(num_srcset_sizes > 0),
3331
count(0)
3432
) * 100,
3533
2
3634
) AS pages_with_srcset_sizes_pct,
3735
round(
3836
safe_divide(
3937
(
40-
countif(coalesce(img_srcset_total, 0) + coalesce(source_srcset_total, 0) > 0) -
41-
countif(coalesce(img_sizes_total, 0) + coalesce(source_sizes_total, 0) > 0)
38+
countif(num_srcset_all > 0) -
39+
countif(num_srcset_sizes > 0)
4240
),
4341
count(0)
4442
) * 100,
4543
2
4644
) AS pages_with_srcset_wo_sizes_pct,
4745
round(
4846
safe_divide(
49-
sum(coalesce(img_sizes_total, 0) + coalesce(source_sizes_total, 0)),
50-
sum(coalesce(img_srcset_total, 0) + coalesce(source_srcset_total, 0))
47+
sum(num_srcset_sizes),
48+
nullif(sum(num_srcset_all), 0)
5149
) * 100,
5250
2
5351
) AS instances_of_srcset_sizes_pct,
5452
round(
5553
safe_divide(
5654
(
57-
sum(coalesce(img_srcset_total, 0) + coalesce(source_srcset_total, 0)) -
58-
sum(coalesce(img_sizes_total, 0) + coalesce(source_sizes_total, 0))
55+
sum(num_srcset_all) -
56+
sum(num_srcset_sizes)
5957
),
60-
sum(coalesce(img_srcset_total, 0) + coalesce(source_srcset_total, 0))
58+
nullif(sum(num_srcset_all), 0)
6159
) * 100,
6260
2
6361
) AS instances_of_srcset_wo_sizes_pct,

sql/2025/sustainability/text_compression.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@ WITH content_encoding AS (
22
SELECT
33
client,
44
LOWER(h.value) AS encoding
5-
FROM `httparchive.crawl.requests`,
6-
UNNEST(response_headers) AS h
5+
FROM `httparchive.crawl.requests` r
6+
LEFT JOIN UNNEST(r.response_headers) AS h
7+
ON LOWER(h.name) = 'content-encoding'
78
WHERE
89
date = '2025-06-01'
910
AND is_root_page
1011
AND is_main_document
11-
AND LOWER(h.name) = 'content-encoding'
1212
),
1313

1414
compression_rollup AS (

0 commit comments

Comments
 (0)