diff --git a/sql/2025/cookies/0_extract_cookies.sql b/sql/2025/cookies/0_extract_cookies.sql
new file mode 100644
index 00000000000..1e31108031f
--- /dev/null
+++ b/sql/2025/cookies/0_extract_cookies.sql
@@ -0,0 +1,74 @@
+-- Extract to the `httparchive.almanac.cookies` table the cookies that were set
+-- during the crawl on 2025-07-01. Data in this table can then be queried
+-- more efficiently in consecutive queries without having to reextract it every
+-- time
+
+
+-- Code used by @tunetheweb to create the table
+-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262
+
+-- CREATE TABLE `httparchive.almanac.cookies`
+-- (
+--   date DATE,
+--   client STRING,
+--   page STRING,
+--   root_page STRING,
+--   rank INTEGER,
+--   startedDateTime STRING,
+--   firstPartyCookie BOOL,
+--   name STRING,
+--   domain STRING,
+--   path STRING,
+--   expires STRING,
+--   size STRING,
+--   httpOnly STRING,
+--   secure STRING,
+--   session STRING,
+--   sameSite STRING,
+--   sameParty STRING,
+--   partitionKey STRING,
+--   partitionKeyOpaque STRING
+-- )
+-- PARTITION BY date
+-- CLUSTER BY
+--   client, rank, page
+-- AS
+-- ...
+
+CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
+RETURNS INT64 LANGUAGE js AS '''
+  try {
+    var timestamp = Math.round(new Date(date_string).getTime() / 1000);
+    return isNaN(timestamp) ? -1 : timestamp;
+  } catch (e) {
+    return -1;
+  }
+''';
+
+INSERT INTO `httparchive.almanac.cookies`
+SELECT
+  date,
+  client,
+  page,
+  root_page,
+  rank,
+  CAST(toTimestamp(JSON_VALUE(payload.startedDateTime)) AS STRING) AS startedDateTime,
+  -- Both sides are dot-prefixed so that an apex page host (example.com) also matches its own registrable domain
+  ENDS_WITH('.' || NET.HOST(page), '.' || NET.REG_DOMAIN(JSON_VALUE(cookie.domain))) AS firstPartyCookie,
+  JSON_VALUE(cookie.name) AS name,
+  JSON_VALUE(cookie.domain) AS domain,
+  JSON_VALUE(cookie.path) AS path,
+  JSON_VALUE(cookie.expires) AS expires,
+  JSON_VALUE(cookie.size) AS size,
+  JSON_VALUE(cookie.httpOnly) AS httpOnly,
+  JSON_VALUE(cookie.secure) AS secure,
+  JSON_VALUE(cookie.session) AS session,
+  JSON_VALUE(cookie.sameSite) AS sameSite,
+  JSON_VALUE(cookie.sameParty) AS sameParty,
+  NULLIF(TO_JSON_STRING(cookie.partitionKey), 'null') AS partitionKey,
+  NULLIF(TO_JSON_STRING(cookie.partitionKeyOpaque), 'null') AS partitionKeyOpaque
+FROM
+  `httparchive.crawl.pages`,
+  UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
+WHERE
+  date = '2025-07-01'
diff --git a/sql/2025/cookies/CHIPS_top_20_first_party_cookies.sql b/sql/2025/cookies/CHIPS_top_20_first_party_cookies.sql
new file mode 100644
index 00000000000..c79c87d35e1
--- /dev/null
+++ b/sql/2025/cookies/CHIPS_top_20_first_party_cookies.sql
@@ -0,0 +1,33 @@
+-- Extract the top 20 first party cookies seen across websites that are
+-- partitioned. Note: it is a bit weird that 1st party cookies would also be
+-- partitioned, as CHIPS is meant for a 3rd party context...
+-- Before running query: edit date and client
+
+SELECT
+  name,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'desktop' AND
+      rank <= 1000000 AND
+      firstPartyCookie = TRUE AND
+      partitionKey IS NOT NULL
+  ) AS percentWebsites
+FROM
+  `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie = TRUE AND
+  partitionKey IS NOT NULL
+GROUP BY
+  name
+ORDER BY
+  percentWebsites DESC
+LIMIT
+  20;
diff --git a/sql/2025/cookies/CHIPS_top_20_third_party_cookies.sql b/sql/2025/cookies/CHIPS_top_20_third_party_cookies.sql
new file mode 100644
index 00000000000..989f0af4827
--- /dev/null
+++ b/sql/2025/cookies/CHIPS_top_20_third_party_cookies.sql
@@ -0,0 +1,33 @@
+-- Extract the top 20 third party cookies seen across websites that are
+-- partitioned (partitionKey is set). The share is computed over the websites
+-- that have at least one partitioned third party cookie.
+-- Before running query: edit date and client
+
+SELECT
+  name,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'desktop' AND
+      rank <= 1000000 AND
+      firstPartyCookie = FALSE AND
+      partitionKey IS NOT NULL
+  ) AS percentWebsites
+FROM
+  `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie = FALSE AND
+  partitionKey IS NOT NULL
+GROUP BY
+  name
+ORDER BY
+  percentWebsites DESC
+LIMIT
+  20;
diff --git a/sql/2025/cookies/age_expire_cookies_per_type_quantiles.sql b/sql/2025/cookies/age_expire_cookies_per_type_quantiles.sql
new file mode 100644
index 00000000000..811267e4a7e
--- /dev/null
+++ b/sql/2025/cookies/age_expire_cookies_per_type_quantiles.sql
@@ -0,0 +1,28 @@
+-- Extract stats around the expire date (age rounded to closest number of days)
+-- (only positive values, non session cookies)
+-- Before running query: edit date and client
+
+WITH cookies_age AS (
+  SELECT
+    firstPartyCookie,
+    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL AND CAST(expires AS FLOAT64) > 0 AND
+    CAST(startedDateTime AS FLOAT64) > 0
+)
+
+SELECT
+  firstPartyCookie,
+  MIN(age) AS min,
+  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
+  MAX(age) AS max
+FROM cookies_age
+GROUP BY firstPartyCookie
diff --git a/sql/2025/cookies/age_expire_cookies_quantiles.sql b/sql/2025/cookies/age_expire_cookies_quantiles.sql
new file mode 100644
index 00000000000..1ef36aff984
--- /dev/null
+++ b/sql/2025/cookies/age_expire_cookies_quantiles.sql
@@ -0,0 +1,25 @@
+-- Extract stats around the expire date (age rounded to closest number of days)
+-- (only positive values, non session cookies)
+-- Before running query: edit date and client
+
+WITH cookies_age AS (
+  SELECT
+    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL AND
+    CAST(expires AS FLOAT64) >= 0
+)
+
+SELECT
+  MIN(age) AS min,
+  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
+  MAX(age) AS max
+FROM cookies_age
diff --git a/sql/2025/cookies/age_expires_cookies_cdf.sql b/sql/2025/cookies/age_expires_cookies_cdf.sql
new file mode 100644
index 00000000000..9c2a8fb050d
--- /dev/null
+++ b/sql/2025/cookies/age_expires_cookies_cdf.sql
@@ -0,0 +1,22 @@
+-- Extract the expire date (age rounded to closest number of days)
+-- (only positive values, non session cookies)
+-- Before running query: edit date and client
+
+WITH cookies_age AS (
+  SELECT
+    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL AND
+    CAST(expires AS FLOAT64) >= 0
+)
+
+SELECT
+  age,
+  COUNT(0) AS nbCookies
+FROM cookies_age
+GROUP BY age
+ORDER BY age ASC
diff --git a/sql/2025/cookies/nb_cookies_cdf.sql b/sql/2025/cookies/nb_cookies_cdf.sql
new file mode 100644
index 00000000000..a2971b90740
--- /dev/null
+++ b/sql/2025/cookies/nb_cookies_cdf.sql
@@ -0,0 +1,24 @@
+-- Extract the nb of cookies
+-- Before running query: edit date and client
+
+WITH nb_cookies_per_website AS (
+  SELECT
+    firstPartyCookie,
+    NET.HOST(page) AS pageFirstPartyHost,
+    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL
+  GROUP BY firstPartyCookie, pageFirstPartyHost
+)
+
+SELECT
+  firstPartyCookie,
+  distinctNbCookies,
+  COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
+FROM nb_cookies_per_website
+GROUP BY firstPartyCookie, distinctNbCookies
+ORDER BY firstPartyCookie, distinctNbCookies ASC
diff --git a/sql/2025/cookies/nb_cookies_per_type_quantiles.sql b/sql/2025/cookies/nb_cookies_per_type_quantiles.sql
new file mode 100644
index 00000000000..e02a08fe35f
--- /dev/null
+++ b/sql/2025/cookies/nb_cookies_per_type_quantiles.sql
@@ -0,0 +1,28 @@
+-- Extract stats about the nb of cookies
+-- Before running query: edit date and client
+
+WITH nb_cookies_per_website AS (
+  SELECT
+    firstPartyCookie,
+    NET.HOST(page) AS pageFirstPartyHost,
+    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL
+  GROUP BY firstPartyCookie, pageFirstPartyHost
+)
+
+SELECT
+  firstPartyCookie,
+  MIN(distinctNbCookies) AS min,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
+  MAX(distinctNbCookies) AS max
+FROM nb_cookies_per_website
+GROUP BY firstPartyCookie
diff --git a/sql/2025/cookies/nb_cookies_quantiles.sql b/sql/2025/cookies/nb_cookies_quantiles.sql
new file mode 100644
index 00000000000..2e7b6065d5f
--- /dev/null
+++ b/sql/2025/cookies/nb_cookies_quantiles.sql
@@ -0,0 +1,25 @@
+-- Extract stats about the nb of cookies
+-- Before running query: edit date and client
+
+WITH nb_cookies_per_website AS (
+  SELECT
+    NET.HOST(page) AS pageFirstPartyHost,
+    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
+  FROM `httparchive.almanac.cookies`
+  WHERE
+    date = '2025-07-01' AND
+    client = 'desktop' AND
+    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+    firstPartyCookie IS NOT NULL
+  GROUP BY pageFirstPartyHost
+)
+
+SELECT
+  MIN(distinctNbCookies) AS min,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
+  MAX(distinctNbCookies) AS max
+FROM nb_cookies_per_website
diff --git a/sql/2025/cookies/prevalence_attributes_per_type.sql b/sql/2025/cookies/prevalence_attributes_per_type.sql
new file mode 100644
index 00000000000..26fd500613c
--- /dev/null
+++ b/sql/2025/cookies/prevalence_attributes_per_type.sql
@@ -0,0 +1,48 @@
+-- Prevalence of cookies type and attributes per type of cookie (1st/3rd party)
+-- Before running query: edit date and client
+
+SELECT
+  firstPartyCookie,
+  SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
+  SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
+  SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
+  SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
+  SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
+  SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
+  SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
+  SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
+  SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
+  SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque,
+  SUM(IF(STARTS_WITH(name, '__Host-'), 1, 0)) / COUNT(0) AS hostPrefix,
+  SUM(IF(STARTS_WITH(name, '__Secure-'), 1, 0)) / COUNT(0) AS securePrefix
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie IS NOT NULL -- just in case
+GROUP BY
+  firstPartyCookie;
+
+SELECT
+  firstPartyCookie,
+  ROUND(SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) * 100, 2) AS httpOnly,
+  ROUND(SUM(IF(secure = 'true', 1, 0)) / COUNT(0) * 100, 2) AS secure,
+  ROUND(SUM(IF(session = 'true', 1, 0)) / COUNT(0) * 100, 2) AS session,
+  ROUND(SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) * 100, 2) AS sameParty,
+  ROUND(SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteLax,
+  ROUND(SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteNone,
+  ROUND(SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteStrict,
+  ROUND(SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) * 100, 2) AS sameSiteNull,
+  ROUND(SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) * 100, 2) AS partitionKey,
+  ROUND(SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) * 100, 2) AS partitionKeyOpaque,
+  ROUND(SUM(IF(STARTS_WITH(name, '__Host-'), 1, 0)) / COUNT(0) * 100, 2) AS hostPrefix,
+  ROUND(SUM(IF(STARTS_WITH(name, '__Secure-'), 1, 0)) / COUNT(0) * 100, 2) AS securePrefix
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie IS NOT NULL -- just in case
+GROUP BY
+  firstPartyCookie;
diff --git a/sql/2025/cookies/prevalence_type_attributes_per_rank.sql b/sql/2025/cookies/prevalence_type_attributes_per_rank.sql
new file mode 100644
index 00000000000..7e712ddf663
--- /dev/null
+++ b/sql/2025/cookies/prevalence_type_attributes_per_rank.sql
@@ -0,0 +1,55 @@
+-- Prevalence of cookies type and attributes per rank grouping
+-- Before running query: edit date and client
+
+SELECT
+  rank_grouping,
+  SUM(IF(firstPartyCookie = TRUE, 1, 0)) / COUNT(0) AS firstParty,
+  SUM(IF(firstPartyCookie = FALSE, 1, 0)) / COUNT(0) AS thirdParty,
+  SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
+  SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
+  SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
+  SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
+  SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
+  SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
+  SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
+  SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
+  SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
+  SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque
+FROM `httparchive.almanac.cookies`,
+  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping AND
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  firstPartyCookie IS NOT NULL -- just in case
+GROUP BY
+  rank_grouping
+ORDER BY
+  rank_grouping;
+
+
+SELECT
+  rank_grouping,
+  ROUND(SUM(IF(firstPartyCookie = TRUE, 1, 0)) / COUNT(0) * 100, 2) AS firstParty,
+  ROUND(SUM(IF(firstPartyCookie = FALSE, 1, 0)) / COUNT(0) * 100, 2) AS thirdParty,
+  ROUND(SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) * 100, 2) AS httpOnly,
+  ROUND(SUM(IF(secure = 'true', 1, 0)) / COUNT(0) * 100, 2) AS secure,
+  ROUND(SUM(IF(session = 'true', 1, 0)) / COUNT(0) * 100, 2) AS session,
+  ROUND(SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) * 100, 2) AS sameParty,
+  ROUND(SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteLax,
+  ROUND(SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteNone,
+  ROUND(SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) * 100, 2) AS sameSiteStrict,
+  ROUND(SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) * 100, 2) AS sameSiteNull,
+  ROUND(SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) * 100, 2) AS partitionKey,
+  ROUND(SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) * 100, 2) AS partitionKeyOpaque
+FROM `httparchive.almanac.cookies`,
+  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping AND
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  firstPartyCookie IS NOT NULL -- just in case
+GROUP BY
+  rank_grouping
+ORDER BY
+  rank_grouping;
diff --git a/sql/2025/cookies/size_cookies_cdf.sql b/sql/2025/cookies/size_cookies_cdf.sql
new file mode 100644
index 00000000000..51c68dcd0ba
--- /dev/null
+++ b/sql/2025/cookies/size_cookies_cdf.sql
@@ -0,0 +1,14 @@
+-- Extract the size of the cookies
+-- Before running query: edit date and client
+
+SELECT
+  CAST(size AS INT) AS sizeCookie,
+  COUNT(0) AS nbCookies
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie IS NOT NULL
+GROUP BY sizeCookie
+ORDER BY sizeCookie ASC
diff --git a/sql/2025/cookies/size_cookies_per_type_quantiles.sql b/sql/2025/cookies/size_cookies_per_type_quantiles.sql
new file mode 100644
index 00000000000..3b974039895
--- /dev/null
+++ b/sql/2025/cookies/size_cookies_per_type_quantiles.sql
@@ -0,0 +1,36 @@
+-- Extract stats around the size of the cookies
+-- Before running query: edit date and client
+
+SELECT
+  firstPartyCookie,
+  MIN(CAST(size AS INT)) AS min,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
+  MAX(CAST(size AS INT)) AS max
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie IS NOT NULL
+GROUP BY firstPartyCookie;
+
+SELECT
+  firstPartyCookie,
+  MIN(CAST(size AS INT)) AS min,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
+  MAX(CAST(size AS INT)) AS max
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND
+  firstPartyCookie IS NOT NULL
+GROUP BY firstPartyCookie;
diff --git a/sql/2025/cookies/size_cookies_quantiles.sql b/sql/2025/cookies/size_cookies_quantiles.sql
new file mode 100644
index 00000000000..439efc5b026
--- /dev/null
+++ b/sql/2025/cookies/size_cookies_quantiles.sql
@@ -0,0 +1,17 @@
+-- Extract stats around the size of the cookies
+-- Before running query: edit date and client
+
+SELECT
+  MIN(CAST(size AS INT)) AS min,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
+  APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
+  MAX(CAST(size AS INT)) AS max
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie IS NOT NULL
diff --git a/sql/2025/cookies/size_extract_largest.sql b/sql/2025/cookies/size_extract_largest.sql
new file mode 100644
index 00000000000..76e0a3fcd00
--- /dev/null
+++ b/sql/2025/cookies/size_extract_largest.sql
@@ -0,0 +1,54 @@
+-- Extract largest cookies being set
+-- Before running query: edit date and client
+
+SELECT
+  name,
+  domain,
+  CAST(size AS INT) AS sizeCookie
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie = TRUE
+ORDER BY sizeCookie DESC
+LIMIT 10;
+
+SELECT
+  name,
+  domain,
+  CAST(size AS INT) AS sizeCookie
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie = FALSE
+ORDER BY sizeCookie DESC
+LIMIT 10;
+
+SELECT
+  name,
+  domain,
+  CAST(size AS INT) AS sizeCookie
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND
+  firstPartyCookie = TRUE
+ORDER BY sizeCookie DESC
+LIMIT 10;
+
+SELECT
+  name,
+  domain,
+  CAST(size AS INT) AS sizeCookie
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND
+  firstPartyCookie = FALSE
+ORDER BY sizeCookie DESC
+LIMIT 10;
diff --git a/sql/2025/cookies/top_20_domains_setting_cookies.sql b/sql/2025/cookies/top_20_domains_setting_cookies.sql
new file mode 100644
index 00000000000..5ad2e637ed2
--- /dev/null
+++ b/sql/2025/cookies/top_20_domains_setting_cookies.sql
@@ -0,0 +1,48 @@
+-- Extract the top 20 registrable domains that set cookies
+-- Before running query: edit date and client
+
+SELECT
+  NET.REG_DOMAIN(domain) AS regDomain,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'desktop' AND
+      rank <= 1000000 AND
+      firstPartyCookie IS NOT NULL
+  ) AS percentWebsites
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie IS NOT NULL
+GROUP BY regDomain
+ORDER BY percentWebsites DESC
+LIMIT 20;
+
+SELECT
+  NET.REG_DOMAIN(domain) AS regDomain,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'mobile' AND
+      rank <= 1000000 AND
+      firstPartyCookie IS NOT NULL
+  ) AS percentWebsites
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND
+  firstPartyCookie IS NOT NULL
+GROUP BY regDomain
+ORDER BY percentWebsites DESC
+LIMIT 20;
diff --git a/sql/2025/cookies/top_20_first_party_cookies.sql b/sql/2025/cookies/top_20_first_party_cookies.sql
new file mode 100644
index 00000000000..dcc3a198606
--- /dev/null
+++ b/sql/2025/cookies/top_20_first_party_cookies.sql
@@ -0,0 +1,56 @@
+-- Extract the top 20 first party cookies seen across websites
+-- Before running query: edit date and client
+SELECT
+  name,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'desktop' AND
+      rank <= 1000000 AND
+      firstPartyCookie = TRUE
+  ) AS percentWebsites
+FROM
+  `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND
+  firstPartyCookie = TRUE
+GROUP BY
+  name
+ORDER BY
+  percentWebsites DESC
+LIMIT
+  20;
+
+
+SELECT
+  name,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'mobile' AND
+      rank <= 1000000 AND
+      firstPartyCookie = TRUE
+  ) AS percentWebsites
+FROM
+  `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND
+  firstPartyCookie = TRUE
+GROUP BY
+  name
+ORDER BY
+  percentWebsites DESC
+LIMIT
+  20;
diff --git a/sql/2025/cookies/top_20_third_party_cookies.sql b/sql/2025/cookies/top_20_third_party_cookies.sql
new file mode 100644
index 00000000000..bfb5c001deb
--- /dev/null
+++ b/sql/2025/cookies/top_20_third_party_cookies.sql
@@ -0,0 +1,51 @@
+-- Extract the top 20 third party cookies (name + domain) seen across websites
+-- Before running query: edit date and client
+
+SELECT
+  name,
+  domain,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'desktop' AND
+      rank <= 1000000 AND
+      firstPartyCookie = FALSE
+  ) AS percentWebsites
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'desktop' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie = FALSE
+GROUP BY name, domain
+ORDER BY percentWebsites DESC
+LIMIT 20;
+
+
+SELECT
+  name,
+  domain,
+  COUNT(DISTINCT NET.HOST(page)) / (
+    SELECT
+      (COUNT(DISTINCT NET.HOST(page)))
+    FROM
+      `httparchive.almanac.cookies`
+    WHERE
+      date = '2025-07-01' AND
+      client = 'mobile' AND
+      rank <= 1000000 AND
+      firstPartyCookie = FALSE
+  ) AS percentWebsites
+FROM `httparchive.almanac.cookies`
+WHERE
+  date = '2025-07-01' AND
+  client = 'mobile' AND
+  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
+  firstPartyCookie = FALSE
+GROUP BY name, domain
+ORDER BY percentWebsites DESC
+LIMIT 20;