Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6aebd35
dates updated
max-ostapenko Aug 1, 2025
943ae28
query updates
max-ostapenko Aug 1, 2025
451cca9
sheet exporter update
max-ostapenko Aug 2, 2025
e394bb6
ID update
max-ostapenko Aug 2, 2025
2a7db9b
formatting
max-ostapenko Aug 2, 2025
cd572c8
lint
max-ostapenko Aug 2, 2025
bb5959a
Refactor origin trial functions for improved readability and structure
max-ostapenko Sep 12, 2025
579fb49
Merge remote-tracking branch 'origin/main' into privacy-sql-2025
max-ostapenko Sep 12, 2025
55cec4f
lint
max-ostapenko Sep 12, 2025
c3a2ee7
lint
max-ostapenko Sep 12, 2025
489d07d
Merge branch 'main' into privacy-sql-2025
max-ostapenko Sep 20, 2025
7640ee0
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 19, 2025
bd7506d
make bq_to_sheets.ipynb runnable and add deps to requirements
max-ostapenko Oct 19, 2025
08aa531
Refactor privacy queries and utilities; make bq_to_sheets runnable
max-ostapenko Oct 20, 2025
c2566e6
Potential fix for code scanning alert no. 640: Unused import
max-ostapenko Oct 20, 2025
42da6ad
Remove unused json import
max-ostapenko Oct 20, 2025
c88867e
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 20, 2025
28d240b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 10, 2026
8e543ef
Add SQL scripts for tracking first-party and third-party cookies; rem…
max-ostapenko Jan 11, 2026
c195c07
lint
max-ostapenko Jan 11, 2026
5c300e4
Refactor SQL scripts for IAB TCF v2 and client hints; streamline quer…
max-ostapenko Jan 11, 2026
8768f22
Review and apply sql pivots
max-ostapenko Jan 11, 2026
951a7d2
Remove deprecated SQL scripts, and add new scripts for tracker distri…
max-ostapenko Jan 11, 2026
1c31d62
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 11, 2026
9ab94bd
updated metrics
max-ostapenko Jan 12, 2026
864fddd
formatting
max-ostapenko Jan 12, 2026
2b523a9
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
c14742b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
8c4e816
3p cookie domains
max-ostapenko Jan 13, 2026
220d0b8
switch the columns for a chart
max-ostapenko Jan 13, 2026
edc9fb1
exclude android.clients.google.com
max-ostapenko Jan 13, 2026
d280cd3
fix order by
max-ostapenko Jan 13, 2026
0d23ecb
lint
max-ostapenko Jan 13, 2026
1096577
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
6c24d45
split requirements.txt
max-ostapenko Jan 14, 2026
78acbd8
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
8d9e84a
lint
max-ostapenko Jan 14, 2026
f521a83
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
1e3a9ab
Merge branch 'privacy-sql-2025' of https://github.com/HTTPArchive/alm…
max-ostapenko Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# Pages that participate in the privacy-related origin trials
# Decodes an origin trial token into its textual payload.
# Steps: base64-decode the token (SAFE. prefix: NULL instead of an error on invalid input),
# keep the bytes from position 70 onward (SUBSTR is 1-based, so the first 69 bytes are dropped),
# then convert the remaining bytes to a STRING.
# NOTE(review): the 69 skipped bytes are presumably the token's binary header — confirm against the origin trial token format.
CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS (
SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70))
);

CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING)
RETURNS STRUCT<
CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
token STRING,
origin STRING,
feature STRING,
expiry TIMESTAMP,
is_subdomain BOOL,
is_third_party BOOL
> AS (
STRUCT(
DECODE_ORIGIN_TRIAL(token) AS token,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party
>
DETERMINISTIC AS (
(
WITH decoded_token AS (
SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
)

SELECT
STRUCT(
decoded AS token,
JSON_VALUE(decoded, '$.origin') AS origin,
JSON_VALUE(decoded, '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
)
FROM decoded_token
)
);

Expand Down
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_first_party_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common first-party cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites.
-- Output: one row per (client, cookie_name), limited to the 500 rows with the highest domain_count.

WITH pages AS (
  -- Every crawled page, annotated with the per-client number of distinct root hosts (the pct_domains denominator)
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's custom_metrics.cookies array
  SELECT
    client,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  SAFE.STRING(cookie.name) AS cookie_name
FROM cookies
-- First-party = the root page host ends with the cookie host.
-- ENDS_WITH compares literally; the previous LIKE '%' || cookie_host treated `_`, `%` and `\`
-- inside the cookie host as pattern metacharacters.
-- NOTE(review): this is a plain suffix match without a label boundary, so a page on
-- `shopexample.com` counts an `example.com` cookie as first-party — confirm this is acceptable.
WHERE ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC,
  -- final tie-breaker keeps the LIMIT cut deterministic
  cookie_name
LIMIT 500
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_third_party_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common third-party cookie hosts, by number of pages on which they set cookies.
-- Output: one row per (client, cookie_host), limited to the 500 rows with the highest page_count.

WITH pages AS (
  -- Every crawled page, annotated with the per-client number of distinct pages (the pct_pages denominator)
  SELECT
    page,
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's custom_metrics.cookies array
  SELECT
    client,
    page,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_pages
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  cookie_host,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM cookies
-- Third-party = the root page host does NOT end with the cookie host.
-- ENDS_WITH compares literally; the previous NOT LIKE '%' || cookie_host treated `_`, `%` and `\`
-- inside the cookie host as pattern metacharacters.
-- Rows with a NULL cookie_host are dropped, exactly as with the LIKE form.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_host
ORDER BY
  page_count DESC,
  client,
  -- final tie-breaker keeps the LIMIT cut deterministic
  cookie_host
LIMIT 500
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_third_party_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common third-party cookie names, by number of domains on which they appear. Goal is to identify common trackers that set cookies using many domains.
-- Output: one row per (client, cookie_name), limited to the 500 rows with the highest domain_count.

WITH pages AS (
  -- Every crawled page, annotated with the per-client number of distinct root hosts (the pct_domains denominator)
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's custom_metrics.cookies array
  SELECT
    client,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  SAFE.STRING(cookie.name) AS cookie_name
FROM cookies
-- Third-party = the root page host does NOT end with the cookie host.
-- ENDS_WITH compares literally; the previous NOT LIKE '%' || cookie_host treated `_`, `%` and `\`
-- inside the cookie host as pattern metacharacters.
-- Rows with a NULL cookie_host are dropped, exactly as with the LIKE form.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC,
  -- final tie-breaker keeps the LIMIT cut deterministic
  cookie_name
LIMIT 500
90 changes: 90 additions & 0 deletions sql/2025/privacy/most_common_bounce_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
-- Most common bounce domains: registrable domains that a root page first redirects to
-- (cross-site) and that then redirect onward again.
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md

WITH redirect_requests AS (
  -- Requests with index 1 or 2 on root pages that answered with a 3xx status
  SELECT
    client,
    url,
    index,
    response_headers,
    page
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
    -- Explicit range: ROUND(status / 100) = 3 would match 250-349 and miss 350-399.
    -- SAFE. yields NULL (row filtered out) instead of failing the query on a non-numeric status.
    SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND
    index <= 2
),

navigation_redirect AS (
  -- Find the first navigation redirect: request 1 redirecting to a different registrable domain than the page
  SELECT
    client,
    url,
    page,
    response_header.value AS navigation_redirect_location
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 1 AND
    LOWER(response_header.name) = 'location' AND
    NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
),

bounce_redirect AS (
  -- Find the second navigation redirect: request 2 carrying a Location header
  SELECT
    client,
    url,
    page,
    response_header.value AS bounce_redirect_location
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 2 AND
    LOWER(response_header.name) = 'location'
),

bounce_sequences AS (
  -- Combine the first and second navigation redirects: the URL the first redirect
  -- points to must itself be the request that redirects again
  SELECT
    nav.client,
    NET.REG_DOMAIN(nav.navigation_redirect_location) AS bounce_hostname,
    COUNT(DISTINCT nav.page) AS number_of_pages
  FROM navigation_redirect AS nav
  -- Inner join: unmatched rows were discarded by the IS NOT NULL filter anyway
  INNER JOIN bounce_redirect AS bounce
  ON
    nav.client = bounce.client AND
    nav.page = bounce.page AND
    nav.navigation_redirect_location = bounce.url
  WHERE bounce.bounce_redirect_location IS NOT NULL
  GROUP BY
    nav.client,
    bounce_hostname
),

pages_total AS (
  -- Number of distinct root pages per client (the pct_pages denominator)
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01' AND
    is_root_page
  GROUP BY client
)

-- Count the number of websites with bounce tracking per bounce hostname
SELECT
  client,
  bounce_hostname,
  number_of_pages,
  number_of_pages / total_pages AS pct_pages
FROM bounce_sequences
INNER JOIN pages_total
USING (client)
ORDER BY
  number_of_pages DESC,
  -- tie-breakers keep the LIMIT cut deterministic
  client,
  bounce_hostname
LIMIT 100
53 changes: 53 additions & 0 deletions sql/2025/privacy/most_common_client_hints.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- Pages that use Client Hints
-- Counts root pages per distinct (lower-cased) Accept-CH value, taken either from the
-- main document's `accept-ch` response header or from a <meta http-equiv="accept-ch"> tag.
-- Output: one row per (client, value), limited to the 200 rows with the highest pct_pages.

WITH main_documents AS (
  -- Main document request of every root page
  SELECT
    client,
    page,
    response_headers
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    is_main_document = TRUE
),

totals AS (
  -- Number of distinct root pages per client (the pct_pages denominator)
  SELECT
    client,
    COUNT(DISTINCT page) AS total_websites
  FROM main_documents
  GROUP BY client
),

header_hints AS (
  -- Accept-CH values delivered as a response header
  SELECT
    client,
    page,
    LOWER(response_header.value) AS value
  FROM main_documents,
    UNNEST(response_headers) AS response_header
  WHERE LOWER(response_header.name) = 'accept-ch'
),

meta_hints AS (
  -- Accept-CH values delivered through a meta http-equiv tag
  SELECT
    client,
    page,
    LOWER(SAFE.STRING(meta_node.content)) AS value
  FROM (
    SELECT
      client,
      page,
      custom_metrics.other.almanac AS metrics
    FROM `httparchive.crawl.pages`
    WHERE
      date = '2025-07-01' AND
      is_root_page = TRUE
  ),
    UNNEST(JSON_QUERY_ARRAY(metrics.`meta-nodes`.nodes)) AS meta_node
  WHERE LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch'
),

hints AS (
  -- Both sources are filtered before being combined, so no header-row x meta-row
  -- cross product is built per page (the previous FULL OUTER JOIN filtered only afterwards)
  SELECT
    client,
    page,
    value
  FROM header_hints
  UNION ALL
  SELECT
    client,
    page,
    value
  FROM meta_hints
)

SELECT
  client,
  value,
  COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages,
  COUNT(DISTINCT page) AS number_of_pages
FROM hints
INNER JOIN totals
USING (client)
GROUP BY
  client,
  value
ORDER BY
  pct_pages DESC,
  -- tie-breakers keep the LIMIT cut deterministic
  client,
  value
LIMIT 200
26 changes: 26 additions & 0 deletions sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Counts of CMPs using IAB Transparency & Consent Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata
-- CMP vendor list: https://iabeurope.eu/cmp-list/

WITH cmps AS (
  -- One row per crawled page with the CMP id found under privacy.iab_tcf_v2.data
  -- (NULL when the value is missing or cannot be converted to an integer),
  -- plus the per-client number of distinct pages (the pct_pages denominator).
  -- NOTE(review): there is no is_root_page filter here, so every page row for the date is counted — confirm this is intended.
  SELECT
    client,
    page,
    SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
)

-- Pages without a CMP id are grouped under cmpId = NULL
SELECT
  client,
  cmpId,
  COUNT(0) / ANY_VALUE(total_pages) AS pct_pages,
  COUNT(0) AS number_of_pages
FROM cmps
GROUP BY
  client,
  cmpId
ORDER BY
  pct_pages DESC,
  -- tie-breakers make the row order reproducible between runs
  client,
  cmpId
Loading
Loading