Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sql/2024/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ WITH intermediate_cookie AS (
`httparchive.crawl.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
date = '2024-06-01'
date = '2025-07-01'
Comment thread
ChrisBeeti marked this conversation as resolved.
)

SELECT
Expand Down
73 changes: 73 additions & 0 deletions sql/2025/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- Extract to the `httparchive.almanac.cookies` table the cookies that were set
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
-- more efficiently in subsequent queries without having to re-extract it every
-- time.
--
-- One row is inserted per element of the `custom_metrics.cookies` JSON array
-- of each page. The WHERE clause below filters on `date` only.
-- Before running query: edit date


-- Code used by @tunetheweb to create the table
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262

-- CREATE TABLE `httparchive.almanac.cookies`
-- (
--   date DATE,
--   client STRING,
--   page STRING,
--   root_page STRING,
--   rank INTEGER,
--   startedDateTime STRING,
--   firstPartyCookie BOOL,
--   name STRING,
--   domain STRING,
--   path STRING,
--   expires STRING,
--   size STRING,
--   httpOnly STRING,
--   secure STRING,
--   session STRING,
--   sameSite STRING,
--   sameParty STRING,
--   partitionKey STRING,
--   partitionKeyOpaque STRING
-- )
-- PARTITION BY date
-- CLUSTER BY
--   client, rank, page
-- AS
-- ...

-- Parses a date string with the JavaScript Date constructor and returns the
-- corresponding Unix timestamp in seconds (rounded). Returns -1 when the
-- string cannot be parsed or parsing throws.
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64 LANGUAGE js AS '''
  try {
    var timestamp = Math.round(new Date(date_string).getTime() / 1000);
    return isNaN(timestamp) ? -1 : timestamp;
  } catch (e) {
    return -1;
  }
''';

-- Explicit column list: values are matched to columns by name, so the insert
-- does not silently depend on the physical column order of the target table.
-- NOTE(review): column names are taken from the CREATE TABLE statement quoted
-- above -- confirm they match the live table.
INSERT INTO `httparchive.almanac.cookies` (
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  firstPartyCookie,
  name,
  domain,
  path,
  expires,
  size,
  httpOnly,
  secure,
  session,
  sameSite,
  sameParty,
  partitionKey,
  partitionKeyOpaque
)
SELECT
  date,
  client,
  page,
  root_page,
  rank,
  -- Stored as a string; '-1' means the page start time could not be parsed.
  CAST(toTimestamp(JSON_VALUE(payload.startedDateTime)) AS STRING) AS startedDateTime,
  -- A cookie is first-party when the page host is the cookie's registrable
  -- domain itself or one of its subdomains. The exact-match branch is needed
  -- for pages served from the apex domain (e.g. host `example.com`), which can
  -- never end with `.example.com`. The leading '.' in the suffix branch keeps
  -- `notexample.com` from matching `example.com`. The result stays NULL when
  -- the registrable domain cannot be determined.
  (
    NET.HOST(page) = NET.REG_DOMAIN(JSON_VALUE(cookie.domain)) OR
    ENDS_WITH(NET.HOST(page), '.' || NET.REG_DOMAIN(JSON_VALUE(cookie.domain)))
  ) AS firstPartyCookie,
  JSON_VALUE(cookie.name) AS name,
  JSON_VALUE(cookie.domain) AS domain,
  JSON_VALUE(cookie.path) AS path,
  JSON_VALUE(cookie.expires) AS expires,
  JSON_VALUE(cookie.size) AS size,
  JSON_VALUE(cookie.httpOnly) AS httpOnly,
  JSON_VALUE(cookie.secure) AS secure,
  JSON_VALUE(cookie.session) AS session,
  JSON_VALUE(cookie.sameSite) AS sameSite,
  JSON_VALUE(cookie.sameParty) AS sameParty,
  -- Partition keys are kept as serialized JSON; a JSON null becomes SQL NULL.
  NULLIF(TO_JSON_STRING(cookie.partitionKey), 'null') AS partitionKey,
  NULLIF(TO_JSON_STRING(cookie.partitionKeyOpaque), 'null') AS partitionKeyOpaque
FROM
  `httparchive.crawl.pages`,
  -- Correlated join: one output row per cookie recorded for the page.
  UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
  date = '2025-07-01'
33 changes: 33 additions & 0 deletions sql/2025/cookies/CHIPS_top_20_first_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Extract the top 20 partitioned first party cookies, ranked by the share of
-- websites on which they are seen. Note: it is a bit weird that 1st party
-- cookies would also be partitioned, as CHIPS is meant for a 3rd party
-- context...
--
-- `percentWebsites` is a fraction between 0 and 1 (it is not multiplied by
-- 100). The denominator is the number of distinct page hosts that have at
-- least one partitioned first party cookie.
-- Before running query: edit date and client

-- The filters are defined once so that the per-cookie counts and the
-- denominator are always computed over the same set of rows.
WITH partitioned_first_party AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND
    firstPartyCookie = TRUE AND
    partitionKey IS NOT NULL
)

SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_first_party
  ) AS percentWebsites
FROM
  partitioned_first_party
GROUP BY
  name
ORDER BY
  percentWebsites DESC,
  -- Tie-breaker so the top 20 cut-off is deterministic.
  name ASC
LIMIT
  20;
33 changes: 33 additions & 0 deletions sql/2025/cookies/CHIPS_top_20_third_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Extract the top 20 partitioned third party cookies, ranked by the share of
-- websites on which they are seen.
--
-- `percentWebsites` is a fraction between 0 and 1 (it is not multiplied by
-- 100). The denominator is the number of distinct page hosts that have at
-- least one partitioned third party cookie.
-- Before running query: edit date and client

-- The filters are defined once so that the per-cookie counts and the
-- denominator are always computed over the same set of rows.
WITH partitioned_third_party AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND
    firstPartyCookie = FALSE AND
    partitionKey IS NOT NULL
)

SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_third_party
  ) AS percentWebsites
FROM
  partitioned_third_party
GROUP BY
  name
ORDER BY
  percentWebsites DESC,
  -- Tie-breaker so the top 20 cut-off is deterministic.
  name ASC
LIMIT
  20;
28 changes: 28 additions & 0 deletions sql/2025/cookies/age_expire_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Quantiles of cookie age, split by first party / third party cookies.
-- Age is the difference between a cookie's `expires` value and the page's
-- `startedDateTime`, divided by the number of seconds in a day and rounded to
-- the closest whole day.
-- Only rows whose `expires` and `startedDateTime` are both strictly positive
-- are kept (per the original header: positive values, non session cookies).
-- Before running query: edit date and client

WITH cookie_times AS (
  -- Both columns are stored as strings; convert them once.
  SELECT
    firstPartyCookie,
    CAST(expires AS FLOAT64) AS expires_ts,
    CAST(startedDateTime AS FLOAT64) AS started_ts
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
),

cookies_age AS (
  SELECT
    firstPartyCookie,
    ROUND((expires_ts - started_ts) / (24 * 3600), 0) AS age
  FROM cookie_times
  WHERE
    expires_ts > 0 AND
    started_ts > 0
)

SELECT
  firstPartyCookie,
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookies_age
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2025/cookies/age_expire_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Age is (expires - startedDateTime) divided by the number of seconds in a
-- day; quantiles are computed over all cookies, first and third party alike.
-- Before running query: edit date and client

WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL AND
    -- Strictly positive, matching the header and the per-type variant of this
    -- query (age_expire_cookies_per_type_quantiles.sql).
    CAST(expires AS FLOAT64) > 0 AND
    -- The extraction query stores -1 when the page start time cannot be
    -- parsed; without this guard such rows would yield an age counted from
    -- the Unix epoch and distort the upper quantiles and the max.
    CAST(startedDateTime AS FLOAT64) > 0
)

SELECT
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookies_age
22 changes: 22 additions & 0 deletions sql/2025/cookies/age_expires_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Extract the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Returns one row per distinct age with the number of cookies of that age,
-- sorted by increasing age, so a CDF can be built from the output.
-- Before running query: edit date and client

WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL AND
    -- Strictly positive, matching the header and the per-type quantiles query
    -- (age_expire_cookies_per_type_quantiles.sql).
    CAST(expires AS FLOAT64) > 0 AND
    -- The extraction query stores -1 when the page start time cannot be
    -- parsed; without this guard such rows would yield an age counted from
    -- the Unix epoch and add a spurious tail to the distribution.
    CAST(startedDateTime AS FLOAT64) > 0
)

SELECT
  age,
  COUNT(0) AS nbCookies
FROM cookies_age
GROUP BY age
ORDER BY age ASC
24 changes: 24 additions & 0 deletions sql/2025/cookies/nb_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Extract the nb of cookies
-- For each cookie type (first/third party) and each number of distinct
-- cookies, returns how many websites (page hosts) have exactly that many
-- distinct cookies. Websites without any cookie row are not represented.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps different pairs from collapsing into the same key (without it,
    -- name 'ab' + domain 'c.com' equals name 'a' + domain 'bc.com'). ';'
    -- delimits the name=value pair and the attributes of a Set-Cookie header,
    -- so it is not expected inside either field.
    -- As before, rows with a NULL name or domain are not counted (CONCAT
    -- returns NULL for them).
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
  firstPartyCookie,
  distinctNbCookies,
  COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
FROM nb_cookies_per_website
GROUP BY firstPartyCookie, distinctNbCookies
ORDER BY firstPartyCookie, distinctNbCookies ASC
28 changes: 28 additions & 0 deletions sql/2025/cookies/nb_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats about the nb of cookies
-- Quantiles of the number of distinct cookies per website (page host), split
-- by first party / third party cookies. Websites without any cookie row of a
-- given type are not represented in that type's distribution.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps different pairs from collapsing into the same key (without it,
    -- name 'ab' + domain 'c.com' equals name 'a' + domain 'bc.com'). ';'
    -- delimits the name=value pair and the attributes of a Set-Cookie header,
    -- so it is not expected inside either field.
    -- As before, rows with a NULL name or domain are not counted (CONCAT
    -- returns NULL for them).
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
  firstPartyCookie,
  MIN(distinctNbCookies) AS min,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
  MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2025/cookies/nb_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats about the nb of cookies
-- Quantiles of the number of distinct cookies per website (page host), first
-- and third party cookies combined. Websites without any cookie row are not
-- represented in the distribution.
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
  SELECT
    NET.HOST(page) AS pageFirstPartyHost,
    -- A cookie is identified by its (name, domain) pair. The ';' separator
    -- keeps different pairs from collapsing into the same key (without it,
    -- name 'ab' + domain 'c.com' equals name 'a' + domain 'bc.com'). ';'
    -- delimits the name=value pair and the attributes of a Set-Cookie header,
    -- so it is not expected inside either field.
    -- As before, rows with a NULL name or domain are not counted (CONCAT
    -- returns NULL for them).
    COUNT(DISTINCT CONCAT(name, ';', domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL
  GROUP BY pageFirstPartyHost
)

SELECT
  MIN(distinctNbCookies) AS min,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
  MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
48 changes: 48 additions & 0 deletions sql/2025/cookies/prevalence_attributes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- Share of cookies carrying each attribute, per type of cookie (1st/3rd party).
-- Every output column is a fraction between 0 and 1: the number of cookie rows
-- matching the condition divided by the number of cookie rows of that type.
-- Before running query: edit date and client

SELECT
  firstPartyCookie,
  -- Boolean attributes are stored as the strings 'true' / 'false'.
  COUNTIF(httpOnly = 'true') / COUNT(*) AS httpOnly,
  COUNTIF(secure = 'true') / COUNT(*) AS secure,
  COUNTIF(session = 'true') / COUNT(*) AS session,
  COUNTIF(sameParty = 'true') / COUNT(*) AS sameParty,
  -- SameSite breakdown, including cookies with no SameSite value.
  COUNTIF(sameSite = 'Lax') / COUNT(*) AS sameSiteLax,
  COUNTIF(sameSite = 'None') / COUNT(*) AS sameSiteNone,
  COUNTIF(sameSite = 'Strict') / COUNT(*) AS sameSiteStrict,
  COUNTIF(sameSite IS NULL) / COUNT(*) AS sameSiteNull,
  -- Partitioned cookies.
  COUNTIF(partitionKey IS NOT NULL) / COUNT(*) AS partitionKey,
  COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(*) AS partitionKeyOpaque,
  -- Cookie name prefixes.
  COUNTIF(STARTS_WITH(name, '__Host-')) / COUNT(*) AS hostPrefix,
  COUNTIF(STARTS_WITH(name, '__Secure-')) / COUNT(*) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
  date = '2025-07-01' AND
  client = 'desktop' AND
  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
  firstPartyCookie IS NOT NULL -- just in case
GROUP BY
  firstPartyCookie;

-- Same attribute breakdown for the mobile client, expressed as percentages
-- (0-100) rounded to two decimals rather than as fractions.
SELECT
  firstPartyCookie,
  ROUND(COUNTIF(httpOnly = 'true') / COUNT(0) * 100, 2) AS httpOnly,
  ROUND(COUNTIF(secure = 'true') / COUNT(0) * 100, 2) AS secure,
  ROUND(COUNTIF(session = 'true') / COUNT(0) * 100, 2) AS session,
  ROUND(COUNTIF(sameParty = 'true') / COUNT(0) * 100, 2) AS sameParty,
  ROUND(COUNTIF(sameSite = 'Lax') / COUNT(0) * 100, 2) AS sameSiteLax,
  ROUND(COUNTIF(sameSite = 'None') / COUNT(0) * 100, 2) AS sameSiteNone,
  ROUND(COUNTIF(sameSite = 'Strict') / COUNT(0) * 100, 2) AS sameSiteStrict,
  ROUND(COUNTIF(sameSite IS NULL) / COUNT(0) * 100, 2) AS sameSiteNull,
  ROUND(COUNTIF(partitionKey IS NOT NULL) / COUNT(0) * 100, 2) AS partitionKey,
  ROUND(COUNTIF(partitionKeyOpaque IS NOT NULL) / COUNT(0) * 100, 2) AS partitionKeyOpaque,
  ROUND(COUNTIF(STARTS_WITH(name, '__Host-')) / COUNT(0) * 100, 2) AS hostPrefix,
  ROUND(COUNTIF(STARTS_WITH(name, '__Secure-')) / COUNT(0) * 100, 2) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
  date = '2025-07-01' AND
  client = 'mobile' AND
  rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
  firstPartyCookie IS NOT NULL -- just in case
GROUP BY
  firstPartyCookie;
Loading