@@ -4,16 +4,59 @@ CREATE TEMPORARY FUNCTION GETFAVICONIMAGE(payload STRING)
44RETURNS STRING LANGUAGE js AS ' ' '
55var result = ' NO_DATA' ;
66try {
7- var almanac = JSON.parse(payload);
7+ var parsed = JSON.parse(payload);
8+
9+ // If wrapped, unwrap _almanac
10+ if (parsed && typeof parsed === ' object' && parsed._almanac && typeof parsed._almanac === ' object' ) {
11+ parsed = parsed._almanac;
12+ }
813
9- if (Array.isArray(almanac) || typeof almanac != ' object' ) return result;
14+ // Deep search for any array of link-like nodes anywhere in the object
/**
 * Searches a parsed JSON value for the first array that looks like a list of
 * link nodes and returns it.
 *
 * An array qualifies when it is non-empty, its first entry has typeof
 * "object", and at least one entry has a truthy href or rel property.
 * An object whose own "nodes" property is a qualifying array is matched
 * immediately, before any of its other properties are explored.
 *
 * The walk is iterative and uses a LIFO stack, so among siblings the entry
 * pushed last (the last array element / last own key) is explored first.
 *
 * @param {*} obj Parsed JSON value (object, array, primitive or null).
 * @returns {Array} The qualifying array itself (same reference), or an empty
 *     array when nothing qualifies or obj is falsy.
 */
function findLinkNodes(obj) {
  if (!obj) return [];

  // Null entries pass the typeof check on the first element, so the per-item
  // test must guard against them before reading href / rel.
  const hasLinkField = (item) => Boolean(item && (item.href || item.rel));
  const looksLikeLinkNodes = (list) =>
    list.length > 0 && typeof list[0] === 'object' && list.some(hasLinkField);

  const stack = [obj];
  while (stack.length > 0) {
    const current = stack.pop();
    if (!current) continue;

    if (Array.isArray(current)) {
      if (looksLikeLinkNodes(current)) return current;
      for (const item of current) stack.push(item);
    } else if (typeof current === 'object') {
      // Common wrapper shape: {nodes: [...]}.
      if (Array.isArray(current.nodes) && looksLikeLinkNodes(current.nodes)) {
        return current.nodes;
      }
      for (const key of Object.keys(current)) stack.push(current[key]);
    }
  }
  return [];
}
1040
11- if (almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.find) {
12- var faviconNode = almanac["link-nodes"].nodes.find(n => n.rel && n.rel.split(' ' ).find(r => r.trim().toLowerCase() == ' icon' ));
41+ var nodes = findLinkNodes(parsed);
42+ if (!nodes || !nodes.length) return result;
43+
44+ if (nodes && nodes.find) {
45+ var faviconNode = nodes.find(function(n) {
46+ if (!n || !(' rel' in n)) return false;
47+ var rels = Array.isArray(n.rel) ? n.rel : String(n.rel).split(' ' );
48+ for (var j = 0; j < rels.length; j++) {
49+ var r = String(rels[j]).trim().toLowerCase();
50+ if (r === ' icon' || r === ' shortcut icon' || r === ' apple- touch- icon' || r === ' apple- touch- icon- precomposed' ) {
51+ return true;
52+ }
53+ }
54+ return false;
55+ });
1356
1457 if (faviconNode) {
1558 if (faviconNode.href) {
16- var temp = faviconNode.href;
59+ var temp = String( faviconNode.href) ;
1760
1861 if (temp.includes(' ?' )) {
1962 temp = temp.substring(0, temp.indexOf(' ?' ));
@@ -43,29 +86,98 @@ try {
4386return result;
4487' ' ' ;
4588
46- # Main query to analyze favicon image extensions with sampling
47- WITH favicons AS (
89+ # Main query to analyze favicon image extensions using requests heuristics
# pages: (client, page) pairs read from httparchive.crawl.pages, restricted to
# the 2025-06-01 crawl date and to rows where is_root_page is true.
# Lines whose gutter ends in "-" are the removed previous version (the old
# `favicons` CTE that called GETFAVICONIMAGE on the page payload); lines whose
# gutter ends in "+" are the current definition.
90+ WITH pages AS (
4891 SELECT
4992 client,
50- GETFAVICONIMAGE(
51- JSON_EXTRACT_SCALAR(payload, ' $._almanac' )
52- ) AS image_type_extension,
53- COUNT (0 ) AS freq,
54- SUM (COUNT (0 )) OVER (PARTITION BY client) AS total,
55- COUNT (0 ) / SUM (COUNT (0 )) OVER (PARTITION BY client) AS percentage_of_total
93+ page
5694 FROM
5795 ` httparchive.crawl.pages`
5896 WHERE
59- date = ' 2025-06-01'
97+ date = ' 2025-06-01' AND
98+ is_root_page
99+ ),
100+
# reqs: client, page, url and response_headers for every row of
# httparchive.crawl.requests on the 2025-06-01 crawl date where is_root_page
# is true. No URL filtering happens here; that is done in `candidates`.
101+ reqs AS (
102+ SELECT
103+ client,
104+ page,
105+ url,
106+ response_headers
107+ FROM
108+ ` httparchive.crawl.requests`
109+ WHERE
110+ date = ' 2025-06-01' AND
111+ is_root_page
112+ ),
113+
# candidates: requests that look like favicon fetches, judged by URL only.
#   - Inner-joins reqs to pages on (client, page).
#   - Keeps a request when its lower-cased URL contains one of: favicon,
#     apple-touch-icon, android-chrome, mstile, safari-pinned-tab.
#   - url_ext: the part of the URL before the first "?", with trailing slashes
#     removed, from which a trailing ".<alphanumerics>" suffix is extracted
#     and lower-cased. NULL when there is no such suffix.
#   - content_type: lower-cased value of a response header whose lower-cased
#     name is "content-type". LIMIT 1 has no ORDER BY, so if the header
#     appears more than once the chosen value is not deterministic.
# Assumes response_headers is an array of (name, value) records -- TODO confirm
# against the table schema.
# NOTE(review): a "#fragment" is not stripped before the extension is
# extracted, and a matching URL with no path (host only) would report its
# last host label as the extension -- confirm this is acceptable.
114+ candidates AS (
115+ SELECT
116+ r .client ,
117+ r .page ,
118+ r .url ,
119+ LOWER (
120+ REGEXP_EXTRACT(
121+ REGEXP_REPLACE(SPLIT(r .url , ' ?' )[SAFE_OFFSET(0 )], r' /+$' , ' ' ),
122+ r' \. ([A-Za-z0-9]+)$'
123+ )
124+ ) AS url_ext,
125+ (
126+ SELECT LOWER (value)
127+ FROM UNNEST(r .response_headers )
128+ WHERE LOWER (name) = ' content-type'
129+ LIMIT 1
130+ ) AS content_type
131+ FROM
132+ reqs r
133+ JOIN
134+ pages p
135+ USING (client, page)
136+ WHERE
137+ REGEXP_CONTAINS(LOWER (r .url ), r' favicon|apple-touch-icon|android-chrome|mstile|safari-pinned-tab' )
138+ ),
139+
# resolved: assigns one image_type_extension to every candidate request.
#   - The URL-derived extension wins whenever it survives the NULLIF check.
#   - Otherwise the Content-Type prefix is mapped: image/svg -> svg,
#     image/png -> png, image/webp -> webp, image/jpeg or image/jpg -> jpg,
#     image/x-icon or image/vnd.microsoft.icon -> ico.
#   - Anything else (including a missing Content-Type) becomes 'unknown', so
#     image_type_extension is never NULL.
# NOTE(review): because the URL extension takes precedence and `candidates`
# filters on URL substrings only, non-image extensions can appear in the
# output (e.g. a script whose URL contains "favicon") -- confirm intended.
140+ resolved AS (
141+ SELECT
142+ client,
143+ page,
144+ url,
145+ COALESCE(
146+ NULLIF(url_ext, ' ' ),
147+ CASE
148+ WHEN content_type LIKE ' image/svg%' THEN ' svg'
149+ WHEN content_type LIKE ' image/png%' THEN ' png'
150+ WHEN content_type LIKE ' image/webp%' THEN ' webp'
151+ WHEN content_type LIKE ' image/jpeg%' OR content_type LIKE ' image/jpg%' THEN ' jpg'
152+ WHEN content_type LIKE ' image/x-icon%' OR content_type LIKE ' image/vnd.microsoft.icon%' THEN ' ico'
153+ ELSE ' unknown'
154+ END
155+ ) AS image_type_extension
156+ FROM
157+ candidates
158+ ),
159+
# rollup_data: for each (client, image_type_extension) pair, the number of
# distinct pages that had at least one resolved request with that extension.
# A page with requests of several extensions is counted once under each.
160+ rollup_data AS (
161+ SELECT
162+ client,
163+ image_type_extension,
164+ COUNT (DISTINCT page) AS pages
165+ FROM
166+ resolved
60167 GROUP BY
61168 client,
62169 image_type_extension
63170)
64171
# Final result: one row per (client, image_type_extension) from rollup_data.
#   count - distinct pages for that extension
#   total - sum of `pages` across all extensions of the same client
#   pct   - 100 * count / total, rounded to 2 decimals; SAFE_DIVIDE is used
#           so a zero divisor would produce NULL rather than an error
# Rows are ordered by client, then count descending, then extension name.
# Lines whose gutter ends in "-" are the removed previous version of this
# SELECT (it read from `favicons` and ordered by pct).
# NOTE(review): total adds up per-extension distinct-page counts, so a page
# that requested icons of several extensions contributes once per extension;
# pct is therefore a share of (page, extension) pairs, not of pages -- confirm
# this is the intended denominator.
65172SELECT
66- * ,
67- percentage_of_total AS pct
173+ client,
174+ image_type_extension,
175+ pages AS count,
176+ SUM (pages) OVER (PARTITION BY client) AS total,
177+ ROUND(100 * SAFE_DIVIDE(pages, SUM (pages) OVER (PARTITION BY client)), 2 ) AS pct
68178FROM
69- favicons
179+ rollup_data
70180ORDER BY
71- pct DESC
181+ client ASC ,
182+ count DESC ,
183+ image_type_extension ASC
0 commit comments