From 30641911a96c35a210cb6c4903dab0e60086bfdf Mon Sep 17 00:00:00 2001
From: Dave Smart
Date: Wed, 12 Nov 2025 08:37:25 +0000
Subject: [PATCH 1/4] corrects filename for response_media_file_type_distribution.sql

---
 ...distribution.sql => response_media_file_type_distribution.sql} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename sql/2025/page-weight/{response_media_file_type.distribution.sql => response_media_file_type_distribution.sql} (100%)

diff --git a/sql/2025/page-weight/response_media_file_type.distribution.sql b/sql/2025/page-weight/response_media_file_type_distribution.sql
similarity index 100%
rename from sql/2025/page-weight/response_media_file_type.distribution.sql
rename to sql/2025/page-weight/response_media_file_type_distribution.sql

From 43777716056d2197a23ab20b3bddd44556d7f929 Mon Sep 17 00:00:00 2001
From: Dave Smart
Date: Wed, 12 Nov 2025 08:38:08 +0000
Subject: [PATCH 2/4] adds use-efficient-cache-lifetimes.sql lighthouse metric query

---
 .../use-efficient-cache-lifetimes.sql | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 sql/2025/page-weight/use-efficient-cache-lifetimes.sql

diff --git a/sql/2025/page-weight/use-efficient-cache-lifetimes.sql b/sql/2025/page-weight/use-efficient-cache-lifetimes.sql
new file mode 100644
index 00000000000..86a758fbdc8
--- /dev/null
+++ b/sql/2025/page-weight/use-efficient-cache-lifetimes.sql
@@ -0,0 +1,18 @@
+-- looks at the distribution of wasted bytes from resources identified by the Lighthouse "Use Efficient Cache Lifetimes" audit
+-- https://developer.chrome.com/docs/performance/insights/cache
+SELECT
+  percentile,
+  client,
+  --get the wasted bytes from resources that could be cached longer, in KB
+  APPROX_QUANTILES((CAST(JSON_VALUE(lighthouse.audits['uses-long-cache-ttl'].numericValue) AS FLOAT64) / 1024), 1000)[OFFSET(percentile * 10)] AS wasted_kb
+FROM
+  `httparchive.crawl.pages`,
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+WHERE
+  date = '2025-07-01'
+GROUP BY
+  percentile,
+  client
+ORDER BY
+  client,
+  percentile

From 60247de89891ae0c7b6d84f693786b91ecf86e81 Mon Sep 17 00:00:00 2001
From: Dave Smart
Date: Wed, 12 Nov 2025 09:47:28 +0000
Subject: [PATCH 3/4] group root and internal pages

---
 sql/2025/page-weight/use-efficient-cache-lifetimes.sql | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sql/2025/page-weight/use-efficient-cache-lifetimes.sql b/sql/2025/page-weight/use-efficient-cache-lifetimes.sql
index 86a758fbdc8..d92efd96314 100644
--- a/sql/2025/page-weight/use-efficient-cache-lifetimes.sql
+++ b/sql/2025/page-weight/use-efficient-cache-lifetimes.sql
@@ -3,6 +3,7 @@
 SELECT
   percentile,
   client,
+  is_root_page,
   --get the wasted bytes from resources that could be cached longer, in KB
   APPROX_QUANTILES((CAST(JSON_VALUE(lighthouse.audits['uses-long-cache-ttl'].numericValue) AS FLOAT64) / 1024), 1000)[OFFSET(percentile * 10)] AS wasted_kb
 FROM
@@ -12,7 +13,9 @@ WHERE
   date = '2025-07-01'
 GROUP BY
   percentile,
-  client
+  client,
+  is_root_page
 ORDER BY
   client,
+  is_root_page,
   percentile

From 26372134443ebcc6dd57c914449724494ef9f408 Mon Sep 17 00:00:00 2001
From: Dave Smart
Date: Thu, 13 Nov 2025 12:44:09 +0000
Subject: [PATCH 4/4] get bytesHtmlDoc from requests table

---
 sql/2025/page-weight/bytes_per_type.sql | 64 ++++++++++++++++++-------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/sql/2025/page-weight/bytes_per_type.sql b/sql/2025/page-weight/bytes_per_type.sql
index d06962d28cf..668da00672d 100644
--- a/sql/2025/page-weight/bytes_per_type.sql
+++ b/sql/2025/page-weight/bytes_per_type.sql
@@ -1,25 +1,57 @@
+-- Because summary bytesHtmlDoc is a little wrong and always zero in the pages table (see: https://github.com/HTTPArchive/wptagent/issues/47)
+-- we need to recalculate it by joining back to the requests table.
+-- To optimize performance, we first create a CTE to pre-filter and extract the necessary data
+-- from the requests table, then join that CTE in the main query.
+-- when fixed, we can revert to just using summary.bytesHtmlDoc directly and remove the join and CTE.
+WITH HtmlDocRequests AS (
+  SELECT
+    page,
+    client,
+    date,
+    -- Extract and cast the response size once inside the CTE
+    CAST(JSON_VALUE(summary.respSize) AS INT64) AS respSize_bytes
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    -- Pre-filter the requests table for efficiency.
+    -- This MUST match the date in the main query's WHERE clause.
+    date = '2025-07-01' AND
+    type = 'html' AND
+    is_main_document
+)
+
+-- Main query now joins the CTE
 SELECT
   percentile,
-  client,
-  is_root_page,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesTotal) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS total_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesHtml) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesJS) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS js_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesCss) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS css_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesImg) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS img_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesOther) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS other_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesHtmlDoc) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_doc_kbytes,
-  APPROX_QUANTILES(CAST(JSON_VALUE(summary.bytesFont) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS font_kbytes
+  p.client,
+  p.is_root_page,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesTotal) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS total_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesHtml) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS html_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesJS) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS js_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesCss) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS css_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesImg) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS img_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesOther) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS other_kbytes,
+  -- Use the pre-calculated column from the CTE
+  APPROX_QUANTILES(r.respSize_bytes / 1024, 1000)[OFFSET(percentile * 10)] AS html_doc_kbytes,
+  APPROX_QUANTILES(CAST(JSON_VALUE(p.summary.bytesFont) AS INT64) / 1024, 1000)[OFFSET(percentile * 10)] AS font_kbytes
 FROM
-  `httparchive.crawl.pages`,
+  `httparchive.crawl.pages` p
+LEFT JOIN
+  -- Join the new CTE on the matching keys
+  HtmlDocRequests r
+ON
+  p.page = r.page AND
+  p.client = r.client AND
+  p.date = r.date,
   UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
 WHERE
-  date = '2025-07-01'
+  -- This filter on the main 'pages' table is still required
+  p.date = '2025-07-01'
 GROUP BY
   percentile,
-  client,
-  is_root_page
+  p.client,
+  p.is_root_page
 ORDER BY
-  client,
-  is_root_page,
+  p.client,
+  p.is_root_page,
   percentile