Skip to content

Commit 49512d6

Browse files
committed
update queries
1 parent 2b293a0 commit 49512d6

19 files changed

+663
-1
lines changed

sql/2024/cookies/0_extract_cookies.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ WITH intermediate_cookie AS (
5050
`httparchive.crawl.pages`,
5151
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
5252
WHERE
53-
date = '2024-06-01'
53+
date = '2025-07-01'
5454
)
5555

5656
SELECT
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
-- Extract to the `httparchive.almanac.cookies` table the cookies that were set
2+
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
3+
-- more efficiently in consecutive queries without having to reextract it every
4+
-- time
5+
6+
7+
-- Code used by @tunetheweb to create the table
8+
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262
9+
10+
-- CREATE TABLE `httparchive.almanac.cookies`
11+
-- (
12+
-- date DATE,
13+
-- client STRING,
14+
-- page STRING,
15+
-- root_page STRING,
16+
-- rank INTEGER,
17+
-- startedDateTime STRING,
18+
-- firstPartyCookie BOOL,
19+
-- name STRING,
20+
-- domain STRING,
21+
-- path STRING,
22+
-- expires STRING,
23+
-- size STRING,
24+
-- httpOnly STRING,
25+
-- secure STRING,
26+
-- session STRING,
27+
-- sameSite STRING,
28+
-- sameParty STRING,
29+
-- partitionKey STRING,
30+
-- partitionKeyOpaque STRING
31+
-- )
32+
-- PARTITION BY date
33+
-- CLUSTER BY
34+
-- client, rank, page
35+
-- AS
36+
-- ...
37+
38+
-- Parses a date string with the JavaScript Date constructor and returns the
-- result as whole seconds since the Unix epoch (milliseconds / 1000, rounded).
-- Returns -1 when parsing yields NaN or throws.
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64
LANGUAGE js
AS '''
  var seconds;
  try {
    var millis = new Date(date_string).getTime();
    seconds = Math.round(millis / 1000);
  } catch (e) {
    return -1;
  }
  if (isNaN(seconds)) {
    return -1;
  }
  return seconds;
''';
47+
48+
-- Appends one row per cookie found in `custom_metrics.cookies` for the crawl
-- date below to `httparchive.almanac.cookies`.
-- The target columns are named explicitly so the statement does not depend on
-- the physical column order of the table: a positional INSERT would silently
-- write values into the wrong columns if the table were ever recreated with a
-- different order.
-- NOTE(review): this is a plain append; running it twice for the same date
-- looks like it would store every cookie twice -- confirm the date partition
-- is empty before running.
INSERT INTO `httparchive.almanac.cookies` (
  date,
  client,
  page,
  root_page,
  rank,
  startedDateTime,
  firstPartyCookie,
  name,
  domain,
  path,
  expires,
  size,
  httpOnly,
  secure,
  session,
  sameSite,
  sameParty,
  partitionKey,
  partitionKeyOpaque
)
SELECT
  date,
  client,
  page,
  root_page,
  rank,
  -- Crawl start as epoch seconds, stored as a string; toTimestamp yields -1
  -- when the value cannot be parsed.
  CAST(toTimestamp(JSON_VALUE(payload.startedDateTime)) AS STRING) AS startedDateTime,
  -- First party = the page host ends with the registrable domain of the
  -- cookie's domain attribute.
  -- NOTE(review): ENDS_WITH is a plain string suffix test, so a host such as
  -- `notexample.com` would also match `example.com` -- confirm whether a
  -- label-boundary check is wanted.
  ENDS_WITH(NET.HOST(page), NET.REG_DOMAIN(JSON_VALUE(cookie.domain))) AS firstPartyCookie,
  JSON_VALUE(cookie.name) AS name,
  JSON_VALUE(cookie.domain) AS domain,
  JSON_VALUE(cookie.path) AS path,
  JSON_VALUE(cookie.expires) AS expires,
  JSON_VALUE(cookie.size) AS size,
  JSON_VALUE(cookie.httpOnly) AS httpOnly,
  JSON_VALUE(cookie.secure) AS secure,
  JSON_VALUE(cookie.session) AS session,
  JSON_VALUE(cookie.sameSite) AS sameSite,
  JSON_VALUE(cookie.sameParty) AS sameParty,
  -- The partition key fields are kept as serialized JSON; a JSON null
  -- (serialized as the text 'null') is stored as SQL NULL instead.
  NULLIF(TO_JSON_STRING(cookie.partitionKey), 'null') AS partitionKey,
  NULLIF(TO_JSON_STRING(cookie.partitionKeyOpaque), 'null') AS partitionKeyOpaque
FROM
  `httparchive.crawl.pages`,
  UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
  date = '2025-07-01'
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
-- Extract the top 20 first party cookies seen across websites that are
2+
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
3+
-- partitioned, as CHIPS is meant for a 3rd party context...
4+
-- Before running query: edit date and client
5+
6+
-- Partitioned first-party cookies of the selected crawl, one row per cookie,
-- reduced to the two values the ranking needs.
WITH partitioned_cookies AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND
    firstPartyCookie = TRUE AND
    partitionKey IS NOT NULL
)

-- Share of hosts (among hosts with at least one such cookie) on which each
-- cookie name appears; the value is a 0-1 fraction despite the column name.
SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_cookies
  ) AS percentWebsites
FROM
  partitioned_cookies
GROUP BY
  name
ORDER BY
  percentWebsites DESC
LIMIT
  20;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
-- Extract the top 20 third party cookies seen across websites that are
2+
-- partitioned. Note: this is the context CHIPS is meant for, as partitioned
3+
-- cookies are designed for use in a 3rd party context.
4+
-- Before running query: edit date and client
5+
6+
-- Partitioned cookies flagged as not first party (firstPartyCookie = FALSE)
-- in the selected crawl, reduced to the two values the ranking needs.
WITH partitioned_cookies AS (
  SELECT
    name,
    NET.HOST(page) AS host
  FROM
    `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND
    firstPartyCookie = FALSE AND
    partitionKey IS NOT NULL
)

-- Share of hosts (among hosts with at least one such cookie) on which each
-- cookie name appears; the value is a 0-1 fraction despite the column name.
SELECT
  name,
  COUNT(DISTINCT host) / (
    SELECT COUNT(DISTINCT host)
    FROM partitioned_cookies
  ) AS percentWebsites
FROM
  partitioned_cookies
GROUP BY
  name
ORDER BY
  percentWebsites DESC
LIMIT
  20;
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- Extract stats around the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
-- Numeric expiry and crawl-start times (epoch seconds) for the cookies of the
-- selected crawl.
WITH cookie_times AS (
  SELECT
    firstPartyCookie,
    CAST(expires AS FLOAT64) AS expires_ts,
    CAST(startedDateTime AS FLOAT64) AS started_ts
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for the top 1M; remove to expand.
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
),

-- Age in days (86400 seconds per day), rounded to the nearest whole day;
-- rows without a positive expiry or crawl-start time are left out.
cookies_age AS (
  SELECT
    firstPartyCookie,
    ROUND((expires_ts - started_ts) / 86400, 0) AS age
  FROM cookie_times
  WHERE
    expires_ts > 0
    AND started_ts > 0
),

-- One percentile array per party, so each quantile is read from the same
-- aggregate.
age_stats AS (
  SELECT
    firstPartyCookie,
    MIN(age) AS min_age,
    APPROX_QUANTILES(age, 100) AS pct,
    MAX(age) AS max_age
  FROM cookies_age
  GROUP BY firstPartyCookie
)

SELECT
  firstPartyCookie,
  min_age AS min,
  pct[OFFSET(25)] AS p25,
  pct[OFFSET(50)] AS median,
  pct[OFFSET(75)] AS p75,
  pct[OFFSET(90)] AS p90,
  pct[OFFSET(99)] AS p99,
  max_age AS max
FROM age_stats
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Extract stats around the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
-- Age of each cookie in days: (expiry - crawl start) in seconds divided by
-- the number of seconds in a day, rounded to the nearest whole day.
WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL AND
    -- Strictly positive expiry, matching the per-party age statistics so both
    -- result sets describe the same population of cookies.
    -- NOTE(review): presumably session cookies carry a non-positive expires
    -- value and are excluded here -- confirm against the crawl data.
    CAST(expires AS FLOAT64) > 0 AND
    -- The extraction step stores startedDateTime as -1 when the crawl
    -- timestamp cannot be parsed; keeping those rows would yield ages of
    -- roughly expires / 86400 days and distort the upper quantiles and MAX.
    CAST(startedDateTime AS FLOAT64) > 0
)

-- Distribution summary of cookie age (in days) across all remaining cookies.
SELECT
  MIN(age) AS min,
  APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
  APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
  APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
  APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
  APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
  MAX(age) AS max
FROM cookies_age
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
-- Extract the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
-- Age of each cookie in days: (expiry - crawl start) in seconds divided by
-- the number of seconds in a day, rounded to the nearest whole day.
WITH cookies_age AS (
  SELECT
    ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01' AND
    client = 'desktop' AND
    rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
    firstPartyCookie IS NOT NULL AND
    -- Strictly positive expiry, matching the per-party age statistics so the
    -- histogram and the statistics describe the same population of cookies.
    -- NOTE(review): presumably session cookies carry a non-positive expires
    -- value and are excluded here -- confirm against the crawl data.
    CAST(expires AS FLOAT64) > 0 AND
    -- The extraction step stores startedDateTime as -1 when the crawl
    -- timestamp cannot be parsed; keeping those rows would add a spurious
    -- bucket at roughly expires / 86400 days.
    CAST(startedDateTime AS FLOAT64) > 0
)

-- Histogram: number of cookies per age (in days), youngest first.
SELECT
  age,
  COUNT(0) AS nbCookies
FROM cookies_age
GROUP BY age
ORDER BY age ASC
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
-- Extract the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
-- Number of distinct cookies (name + domain concatenated) per page host,
-- counted separately for first-party and non-first-party cookies.
WITH cookies_per_host AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for the top 1M; remove to expand.
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
  GROUP BY
    firstPartyCookie,
    pageFirstPartyHost
)

-- Histogram: how many hosts set exactly N distinct cookies, per party.
SELECT
  firstPartyCookie,
  distinctNbCookies,
  COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
FROM cookies_per_host
GROUP BY
  firstPartyCookie,
  distinctNbCookies
ORDER BY
  firstPartyCookie ASC,
  distinctNbCookies ASC
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- Extract stats about the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
-- Number of distinct cookies (name + domain concatenated) per page host,
-- counted separately for first-party and non-first-party cookies.
WITH cookies_per_host AS (
  SELECT
    firstPartyCookie,
    NET.HOST(page) AS pageFirstPartyHost,
    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for the top 1M; remove to expand.
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
  GROUP BY
    firstPartyCookie,
    pageFirstPartyHost
),

-- One percentile array per party, so each quantile is read from the same
-- aggregate.
count_stats AS (
  SELECT
    firstPartyCookie,
    MIN(distinctNbCookies) AS min_count,
    APPROX_QUANTILES(distinctNbCookies, 100) AS pct,
    MAX(distinctNbCookies) AS max_count
  FROM cookies_per_host
  GROUP BY firstPartyCookie
)

SELECT
  firstPartyCookie,
  min_count AS min,
  pct[OFFSET(25)] AS p25,
  pct[OFFSET(50)] AS median,
  pct[OFFSET(75)] AS p75,
  pct[OFFSET(90)] AS p90,
  pct[OFFSET(99)] AS p99,
  max_count AS max
FROM count_stats
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Extract stats about the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
-- Number of distinct cookies (name + domain concatenated) per page host,
-- first-party and non-first-party cookies counted together.
WITH cookies_per_host AS (
  SELECT
    NET.HOST(page) AS pageFirstPartyHost,
    COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
  FROM `httparchive.almanac.cookies`
  WHERE
    date = '2025-07-01'
    AND client = 'desktop'
    -- 2024 results were mainly extracted for the top 1M; remove to expand.
    AND rank <= 1000000
    AND firstPartyCookie IS NOT NULL
  GROUP BY pageFirstPartyHost
),

-- Single percentile array, so each quantile is read from the same aggregate.
count_stats AS (
  SELECT
    MIN(distinctNbCookies) AS min_count,
    APPROX_QUANTILES(distinctNbCookies, 100) AS pct,
    MAX(distinctNbCookies) AS max_count
  FROM cookies_per_host
)

SELECT
  min_count AS min,
  pct[OFFSET(25)] AS p25,
  pct[OFFSET(50)] AS median,
  pct[OFFSET(75)] AS p75,
  pct[OFFSET(90)] AS p90,
  pct[OFFSET(99)] AS p99,
  max_count AS max
FROM count_stats

0 commit comments

Comments
 (0)