#standardSQL
# Temporary function to extract the favicon image extension from the almanac JSON payload
3- CREATE TEMPORARY FUNCTION GETFAVICONIMAGE(payload STRING )
3+ CREATE TEMPORARY FUNCTION getFaviconImage(almanac JSON )
44RETURNS STRING LANGUAGE js AS ' ' '
55var result = ' NO_DATA' ;
66try {
7- var parsed = JSON.parse(payload);
8-
9- // If wrapped, unwrap _almanac
10- if (parsed && typeof parsed === ' object' && parsed._almanac && typeof parsed._almanac === ' object' ) {
11- parsed = parsed._almanac;
12- }
13-
14- // Deep search for any array of link-like nodes anywhere in the object
15- function findLinkNodes(obj) {
16- if (!obj) return [];
17- var stack = [obj];
18- while (stack.length) {
19- var current = stack.pop();
20- if (!current) continue;
21- if (Array.isArray(current)) {
22- // If array of objects with rel/href, treat as nodes
23- if (
24- current.length && typeof current[0] === ' object' && current.some(function(it){return it && (it.href || it.rel);})
25- ) {
26- return current;
27- }
28- for (var i = 0; i < current.length; i++) stack.push(current[i]);
29- } else if (typeof current === ' object' ) {
30- // Common patterns: {nodes: [...]} wrappers
31- if (current.nodes && Array.isArray(current.nodes)) {
32- var n = current.nodes;
33- if (n.length && typeof n[0] === ' object' && n.some(function(it){return it && (it.href || it.rel);})){return n;}
34- }
35- for (var k in current) if (Object.prototype.hasOwnProperty.call(current, k)) stack.push(current[k]);
36- }
37- }
38- return [];
39- }
7+ if (Array.isArray(almanac) || typeof almanac != ' object' ) return result;
408
41- var nodes = findLinkNodes(parsed);
42- if (!nodes || !nodes.length) return result;
43-
44- if (nodes && nodes.find) {
45- var faviconNode = nodes.find(function(n) {
46- if (!n || !(' rel' in n)) return false;
47- var rels = Array.isArray(n.rel) ? n.rel : String(n.rel).split(' ' );
48- for (var j = 0; j < rels.length; j++) {
49- var r = String(rels[j]).trim().toLowerCase();
50- if (r === ' icon' || r === ' shortcut icon' || r === ' apple- touch- icon' || r === ' apple- touch- icon- precomposed' ) {
51- return true;
52- }
53- }
54- return false;
55- });
9+ if (almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.find) {
10+ var faviconNode = almanac["link-nodes"].nodes.find(n => n.rel && n.rel.split(' ' ).find(r => r.trim().toLowerCase() == ' icon' ));
5611
5712 if (faviconNode) {
5813 if (faviconNode.href) {
59- var temp = String( faviconNode.href) ;
14+ var temp = faviconNode.href;
6015
6116 if (temp.includes(' ?' )) {
6217 temp = temp.substring(0, temp.indexOf(' ?' ));
@@ -86,98 +41,30 @@ try {
8641return result;
8742' ' ' ;
8843
# Main query: for each client, the number and share of pages per favicon image
# extension as returned by getFaviconImage.
#
# Output columns:
#   client               - crawl client value taken from the pages table
#   image_type_extension - value returned by getFaviconImage for the page
#   freq                 - number of rows in the (client, extension) group
#   total                - number of rows for the client across all extensions
#   percentage_of_total  - freq / total, as a fraction
#   pct                  - same value as percentage_of_total (kept for consumers
#                          that read either name)
#
# NOTE(review): the previous revision also filtered on is_root_page; this one
# counts every row in the table for the date. Confirm that is intended.
WITH favicons AS (
  SELECT
    client,
    getFaviconImage(custom_metrics.other.almanac) AS image_type_extension,
    COUNT(0) AS freq,
    # Window over the grouped rows: sums the per-group counts within a client.
    SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS percentage_of_total
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
  GROUP BY
    client,
    image_type_extension
)

SELECT
  client,
  image_type_extension,
  freq,
  total,
  percentage_of_total,
  percentage_of_total AS pct
FROM
  favicons
ORDER BY
  pct DESC,
  # Tie-breakers keep the LIMIT cut-off deterministic when shares are equal.
  client ASC,
  image_type_extension ASC
LIMIT
  1000;
70+
0 commit comments