Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sql/2024/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ WITH intermediate_cookie AS (
`httparchive.crawl.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
date = '2024-06-01'
date = '2025-07-01'
Comment thread
ChrisBeeti marked this conversation as resolved.
)

SELECT
Expand Down
73 changes: 73 additions & 0 deletions sql/2025/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- Extract to the `httparchive.almanac.cookies` table the cookies that were set
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
-- more efficiently in subsequent queries without having to re-extract it every
-- time.


-- Code used by @tunetheweb to create the table
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262

-- CREATE TABLE `httparchive.almanac.cookies`
-- (
-- date DATE,
-- client STRING,
-- page STRING,
-- root_page STRING,
-- rank INTEGER,
-- startedDateTime STRING,
-- firstPartyCookie BOOL,
-- name STRING,
-- domain STRING,
-- path STRING,
-- expires STRING,
-- size STRING,
-- httpOnly STRING,
-- secure STRING,
-- session STRING,
-- sameSite STRING,
-- sameParty STRING,
-- partitionKey STRING,
-- partitionKeyOpaque STRING
-- )
-- PARTITION BY date
-- CLUSTER BY
-- client, rank, page
-- AS
-- ...

-- Convert a date string into a Unix timestamp expressed in whole seconds.
-- Returns -1 when the input is NULL or cannot be parsed as a date, so that
-- callers can recognise failures by a non-positive value.
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64 LANGUAGE js AS '''
  // A SQL NULL reaches JavaScript as null, and `new Date(null)` is the Unix
  // epoch (timestamp 0) rather than an invalid date: reject it explicitly so
  // that a missing value is reported with the same -1 sentinel as a bad one.
  if (date_string === null || date_string === undefined) {
    return -1;
  }
  try {
    // getTime() is in milliseconds; round to the closest second.
    var timestamp = Math.round(new Date(date_string).getTime() / 1000);
    return isNaN(timestamp) ? -1 : timestamp;
  } catch (e) {
    return -1;
  }
''';

-- Flatten the `cookies` custom metric of the crawl into one row per cookie and
-- append the rows to `httparchive.almanac.cookies`.
-- The target columns are listed explicitly so the statement does not depend on
-- the physical column order of the table.
INSERT INTO `httparchive.almanac.cookies` (
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  firstPartyCookie,
  name,
  domain,
  path,
  expires,
  size,
  httpOnly,
  secure,
  session,
  sameSite,
  sameParty,
  partitionKey,
  partitionKeyOpaque
)
SELECT
  date,
  client,
  page,
  root_page,
  rank,
  -- Unix timestamp (seconds) of the page load, stored as a string; '-1' when
  -- the value could not be parsed by toTimestamp.
  CAST(toTimestamp(JSON_VALUE(payload.startedDateTime)) AS STRING) AS startedDateTime,
  -- A cookie is first party when its registrable domain is the registrable
  -- domain of the page. Comparing registrable domains (instead of testing that
  -- the host merely ends with the cookie's domain) avoids classifying a cookie
  -- for `example.com` as first party on a `notexample.com` page. The comparison
  -- is case-insensitive because host names are.
  LOWER(NET.REG_DOMAIN(page)) = LOWER(NET.REG_DOMAIN(JSON_VALUE(cookie.domain))) AS firstPartyCookie,
  JSON_VALUE(cookie.name) AS name,
  JSON_VALUE(cookie.domain) AS domain,
  JSON_VALUE(cookie.path) AS path,
  JSON_VALUE(cookie.expires) AS expires,
  JSON_VALUE(cookie.size) AS size,
  JSON_VALUE(cookie.httpOnly) AS httpOnly,
  JSON_VALUE(cookie.secure) AS secure,
  JSON_VALUE(cookie.session) AS session,
  JSON_VALUE(cookie.sameSite) AS sameSite,
  JSON_VALUE(cookie.sameParty) AS sameParty,
  -- Keep the raw JSON of the partition key; a JSON `null` becomes a SQL NULL.
  NULLIF(TO_JSON_STRING(cookie.partitionKey), 'null') AS partitionKey,
  NULLIF(TO_JSON_STRING(cookie.partitionKeyOpaque), 'null') AS partitionKeyOpaque
FROM
  `httparchive.crawl.pages`,
  UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
  date = '2025-07-01'
33 changes: 33 additions & 0 deletions sql/2025/cookies/CHIPS_top_20_first_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client

-- Rows of interest: partitioned first party cookies of the selected crawl,
-- reduced to the cookie name and the host of the page that set it.
WITH partitioned_cookies AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    partitionKey IS NOT NULL AND
    firstPartyCookie = TRUE AND
    rank <= 1000000 AND
    client = 'desktop' AND
    date = '2025-07-01'
)

-- For each cookie name: hosts setting it, divided by all hosts that set at
-- least one such cookie. `percentWebsites` is a fraction (not multiplied by 100).
SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_cookies
  ) AS percentWebsites
FROM
  partitioned_cookies
GROUP BY
  name
ORDER BY
  percentWebsites DESC
LIMIT
  20;
33 changes: 33 additions & 0 deletions sql/2025/cookies/CHIPS_top_20_third_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Extract the top 20 third party cookies seen across websites that are
-- partitioned. Note: CHIPS is meant for a 3rd party context, so (unlike for
-- 1st party cookies) this is where partitioned cookies are expected...
-- Before running query: edit date and client

-- Rows of interest: partitioned third party cookies of the selected crawl,
-- reduced to the cookie name and the host of the page on which it was seen.
WITH partitioned_cookies AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    partitionKey IS NOT NULL AND
    firstPartyCookie = FALSE AND
    rank <= 1000000 AND
    client = 'desktop' AND
    date = '2025-07-01'
)

-- For each cookie name: hosts where it is seen, divided by all hosts with at
-- least one such cookie. `percentWebsites` is a fraction (not multiplied by 100).
SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_cookies
  ) AS percentWebsites
FROM
  partitioned_cookies
GROUP BY
  name
ORDER BY
  percentWebsites DESC
LIMIT
  20;
28 changes: 28 additions & 0 deletions sql/2025/cookies/age_expire_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

-- Lifetime of each cookie in days: (expires - startedDateTime), both read as
-- second counts, divided by the number of seconds in a day and rounded.
WITH cookie_lifetimes AS (
  SELECT
    firstPartyCookie,
    ROUND(
      (CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600),
      0
    ) AS age_days
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    firstPartyCookie IS NOT NULL AND
    -- Only rows where both timestamps are strictly positive are kept.
    CAST(startedDateTime AS FLOAT64) > 0 AND
    CAST(expires AS FLOAT64) > 0
)

-- Distribution of the lifetime, separately for 1st and 3rd party cookies.
SELECT
  firstPartyCookie,
  MIN(age_days) AS min,
  APPROX_QUANTILES(age_days, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age_days, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age_days, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age_days, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age_days, 100)[OFFSET(99)] AS p99,
  MAX(age_days) AS max
FROM cookie_lifetimes
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2025/cookies/age_expire_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

-- Lifetime of each cookie in days: (expires - startedDateTime), both read as
-- second counts, divided by the number of seconds in a day and rounded.
WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    firstPartyCookie IS NOT NULL AND
    -- Same validity filter as age_expire_cookies_per_type_quantiles.sql, so
    -- that both queries describe the same population: strictly positive
    -- `expires`, and a usable `startedDateTime` (the extraction query stores
    -- -1 there when the date could not be parsed, which would otherwise yield
    -- a meaningless age).
    CAST(expires AS FLOAT64) > 0 AND
    CAST(startedDateTime AS FLOAT64) > 0
)

-- Distribution of the lifetime over all cookies (1st and 3rd party together).
SELECT
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookies_age
22 changes: 22 additions & 0 deletions sql/2025/cookies/age_expires_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Extract the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

-- Lifetime of each cookie in days: (expires - startedDateTime), both read as
-- second counts, divided by the number of seconds in a day and rounded.
WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    firstPartyCookie IS NOT NULL AND
    -- Same validity filter as age_expire_cookies_per_type_quantiles.sql, so
    -- that the CDF and the quantiles describe the same population: strictly
    -- positive `expires`, and a usable `startedDateTime` (the extraction query
    -- stores -1 there when the date could not be parsed, which would otherwise
    -- yield a meaningless age).
    CAST(expires AS FLOAT64) > 0 AND
    CAST(startedDateTime AS FLOAT64) > 0
)

-- Number of cookies per lifetime (in days), ordered for plotting a CDF.
SELECT
  age,
  COUNT(0) AS nbCookies
FROM cookies_age
GROUP BY age
ORDER BY age ASC
24 changes: 24 additions & 0 deletions sql/2025/cookies/nb_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Extract the nb of cookies
-- Before running query: edit date and client

-- Number of distinct cookies (distinct name + domain concatenations) seen per
-- page host, split by first/third party.
WITH cookies_per_host AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS host,
    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    firstPartyCookie IS NOT NULL AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    client = 'desktop' AND
    date = '2025-07-01'
  GROUP BY
    firstPartyCookie,
    host
)

-- Number of hosts for each (cookie type, number of cookies) pair.
SELECT
  firstPartyCookie,
  distinctNbCookies,
  COUNT(DISTINCT host) AS nbWebsites
FROM cookies_per_host
GROUP BY
  firstPartyCookie,
  distinctNbCookies
ORDER BY
  firstPartyCookie,
  distinctNbCookies ASC
28 changes: 28 additions & 0 deletions sql/2025/cookies/nb_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

-- Number of distinct cookies (distinct name + domain concatenations) seen per
-- page host, split by first/third party.
WITH cookies_per_host AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS host,
    COUNT(DISTINCT CONCAT(name, domain)) AS nb_cookies
  FROM `httparchive.almanac.cookies`
  WHERE
    firstPartyCookie IS NOT NULL AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    client = 'desktop' AND
    date = '2025-07-01'
  GROUP BY
    firstPartyCookie,
    host
)

-- Distribution of the per-host cookie count for each cookie type.
SELECT
  firstPartyCookie,
  MIN(nb_cookies) AS min,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(99)] AS p99,
  MAX(nb_cookies) AS max
FROM cookies_per_host
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2025/cookies/nb_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

-- Number of distinct cookies (distinct name + domain concatenations) seen per
-- page host, first and third party together.
WITH cookies_per_host AS (
  SELECT
    NET.HOST(page) AS host,
    COUNT(DISTINCT CONCAT(name, domain)) AS nb_cookies
  FROM `httparchive.almanac.cookies`
  WHERE
    firstPartyCookie IS NOT NULL AND
    -- Restricted to pages with rank <= 1M, as for most 2024 results; remove
    -- this condition to widen the scope.
    rank <= 1000000 AND
    client = 'desktop' AND
    date = '2025-07-01'
  GROUP BY host
)

-- Distribution of the per-host cookie count.
SELECT
  MIN(nb_cookies) AS min,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(nb_cookies, 100)[OFFSET(99)] AS p99,
  MAX(nb_cookies) AS max
FROM cookies_per_host
48 changes: 48 additions & 0 deletions sql/2025/cookies/prevalence_attributes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- Prevalence of cookies type and attributes per type of cookie (1st/3rd party)
-- Before running query: edit date and client

-- Share of cookies (as a fraction between 0 and 1) carrying each attribute,
-- computed separately for first and third party cookies on desktop.
SELECT
  firstPartyCookie,
  COUNTIF(httpOnly = 'true') / COUNT(0) AS httpOnly,
  COUNTIF(secure = 'true') / COUNT(0) AS secure,
  COUNTIF(session = 'true') / COUNT(0) AS session,
  COUNTIF(sameParty = 'true') / COUNT(0) AS sameParty,
  COUNTIF(sameSite = 'Lax') / COUNT(0) AS sameSiteLax,
  COUNTIF(sameSite = 'None') / COUNT(0) AS sameSiteNone,
  COUNTIF(sameSite = 'Strict') / COUNT(0) AS sameSiteStrict,
  COUNTIF(sameSite IS NULL) / COUNT(0) AS sameSiteNull,
  COUNTIF(partitionKey IS NOT NULL) / COUNT(0) AS partitionKey,
  COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(0) AS partitionKeyOpaque,
  -- Cookie name prefixes
  COUNTIF(STARTS_WITH(name, '__Host-')) / COUNT(0) AS hostPrefix,
  COUNTIF(STARTS_WITH(name, '__Secure-')) / COUNT(0) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
  date = '2025-07-01' AND
  client = 'desktop' AND
  -- Restricted to pages with rank <= 1M, as for most 2024 results; remove this
  -- condition to widen the scope.
  rank <= 1000000 AND
  firstPartyCookie IS NOT NULL -- just in case
GROUP BY
  firstPartyCookie;

-- Percentage of cookies (0-100, rounded to 2 decimals) carrying each attribute,
-- computed separately for first and third party cookies on mobile.
SELECT
  firstPartyCookie,
  ROUND(COUNTIF(httpOnly = 'true') / COUNT(0) * 100, 2) AS httpOnly,
  ROUND(COUNTIF(secure = 'true') / COUNT(0) * 100, 2) AS secure,
  ROUND(COUNTIF(session = 'true') / COUNT(0) * 100, 2) AS session,
  ROUND(COUNTIF(sameParty = 'true') / COUNT(0) * 100, 2) AS sameParty,
  ROUND(COUNTIF(sameSite = 'Lax') / COUNT(0) * 100, 2) AS sameSiteLax,
  ROUND(COUNTIF(sameSite = 'None') / COUNT(0) * 100, 2) AS sameSiteNone,
  ROUND(COUNTIF(sameSite = 'Strict') / COUNT(0) * 100, 2) AS sameSiteStrict,
  ROUND(COUNTIF(sameSite IS NULL) / COUNT(0) * 100, 2) AS sameSiteNull,
  ROUND(COUNTIF(partitionKey IS NOT NULL) / COUNT(0) * 100, 2) AS partitionKey,
  ROUND(COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(0) * 100, 2) AS partitionKeyOpaque,
  -- Cookie name prefixes
  ROUND(COUNTIF(STARTS_WITH(name, '__Host-')) / COUNT(0) * 100, 2) AS hostPrefix,
  ROUND(COUNTIF(STARTS_WITH(name, '__Secure-')) / COUNT(0) * 100, 2) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
  firstPartyCookie IS NOT NULL AND -- just in case
  -- Restricted to pages with rank <= 1M, as for most 2024 results; remove this
  -- condition to widen the scope.
  rank <= 1000000 AND
  client = 'mobile' AND
  date = '2025-07-01'
GROUP BY
  firstPartyCookie;
Loading
Loading