Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6aebd35
dates updated
max-ostapenko Aug 1, 2025
943ae28
query updates
max-ostapenko Aug 1, 2025
451cca9
sheet exporter update
max-ostapenko Aug 2, 2025
e394bb6
ID update
max-ostapenko Aug 2, 2025
2a7db9b
formatting
max-ostapenko Aug 2, 2025
cd572c8
lint
max-ostapenko Aug 2, 2025
bb5959a
Refactor origin trial functions for improved readability and structure
max-ostapenko Sep 12, 2025
579fb49
Merge remote-tracking branch 'origin/main' into privacy-sql-2025
max-ostapenko Sep 12, 2025
55cec4f
lint
max-ostapenko Sep 12, 2025
c3a2ee7
lint
max-ostapenko Sep 12, 2025
489d07d
Merge branch 'main' into privacy-sql-2025
max-ostapenko Sep 20, 2025
7640ee0
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 19, 2025
bd7506d
make bq_to_sheets.ipynb runnable and add deps to requirements
max-ostapenko Oct 19, 2025
08aa531
Refactor privacy queries and utilities; make bq_to_sheets runnable
max-ostapenko Oct 20, 2025
c2566e6
Potential fix for code scanning alert no. 640: Unused import
max-ostapenko Oct 20, 2025
42da6ad
Remove unused json import
max-ostapenko Oct 20, 2025
c88867e
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 20, 2025
28d240b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 10, 2026
8e543ef
Add SQL scripts for tracking first-party and third-party cookies; rem…
max-ostapenko Jan 11, 2026
c195c07
lint
max-ostapenko Jan 11, 2026
5c300e4
Refactor SQL scripts for IAB TCF v2 and client hints; streamline quer…
max-ostapenko Jan 11, 2026
8768f22
Review and apply sql pivots
max-ostapenko Jan 11, 2026
951a7d2
Remove deprecated SQL scripts, and add new scripts for tracker distri…
max-ostapenko Jan 11, 2026
1c31d62
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 11, 2026
9ab94bd
updated metrics
max-ostapenko Jan 12, 2026
864fddd
formatting
max-ostapenko Jan 12, 2026
2b523a9
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
c14742b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
8c4e816
3p cookie domains
max-ostapenko Jan 13, 2026
220d0b8
switch the columns for a chart
max-ostapenko Jan 13, 2026
edc9fb1
exclude android.clients.google.com
max-ostapenko Jan 13, 2026
d280cd3
fix order by
max-ostapenko Jan 13, 2026
0d23ecb
lint
max-ostapenko Jan 13, 2026
1096577
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
6c24d45
split requirements.txt
max-ostapenko Jan 14, 2026
78acbd8
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
8d9e84a
lint
max-ostapenko Jan 14, 2026
f521a83
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
1e3a9ab
Merge branch 'privacy-sql-2025' of https://github.com/HTTPArchive/alm…
max-ostapenko Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# Pages that participate in the privacy-related origin trials
CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS (
SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70))
);

CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING)
RETURNS STRUCT<
CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
token STRING,
origin STRING,
feature STRING,
expiry TIMESTAMP,
is_subdomain BOOL,
is_third_party BOOL
> AS (
STRUCT(
DECODE_ORIGIN_TRIAL(token) AS token,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party
>
DETERMINISTIC AS (
(
WITH decoded_token AS (
SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
)

SELECT
STRUCT(
decoded AS token,
JSON_VALUE(decoded, '$.origin') AS origin,
JSON_VALUE(decoded, '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
)
FROM decoded_token
)
);

Expand Down
78 changes: 78 additions & 0 deletions sql/2025/privacy/bounce_domains_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
-- noqa: disable=PRS
-- Top bounce domains: registrable domains that the first request of a page
-- redirects to, on pages whose second request redirects onwards again.
-- Output: one row per bounce_domain with per-client page counts
-- (desktop / mobile) and the share of all crawled pages (pages_pct_*).
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md

-- 3xx responses among the first two requests of each page, with the
-- registrable domain of their Location header
WITH redirect_requests AS (
  FROM `httparchive.crawl.requests`
  |> WHERE
    date = '2025-07-01' AND
    --rank = 1000 AND
    SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND
    index <= 2
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'location'
  |> SELECT
    client,
    url,
    index,
    NET.REG_DOMAIN(header.value) AS location_domain,
    page
),

-- Find the first navigation redirect:
-- a request on the page's own domain that redirects to a different domain
navigation_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 1 AND
    NET.REG_DOMAIN(page) = NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    page,
    location_domain AS bounce_domain
),

-- Find the second navigation redirect:
-- a request on a foreign domain that redirects to yet another domain
bounce_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 2 AND
    NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    url,
    page,
    location_domain AS bounce_redirect_location_domain
),

-- Combine the first and second navigation redirects
bounce_sequences AS (
  FROM navigation_redirect AS nav
  |> JOIN bounce_redirect AS bounce
    ON
      nav.client = bounce.client AND
      nav.page = bounce.page
  |> AGGREGATE COUNT(DISTINCT nav.page) AS pages_count
    GROUP BY nav.client, bounce_domain
),

-- Denominator: all crawled pages per client
pages_total AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT page) AS total_pages GROUP BY client
)

FROM bounce_sequences
|> JOIN pages_total USING (client)
|> EXTEND pages_count / total_pages AS pages_pct
|> DROP total_pages
|> PIVOT(
  ANY_VALUE(pages_count) AS cnt,
  ANY_VALUE(pages_pct) AS pages_pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME cnt_mobile AS mobile, cnt_desktop AS desktop
-- A domain seen on only one client has a NULL count for the other one.
-- Treat it as 0: a NULL sum sorts last under DESC and would be cut by LIMIT.
|> ORDER BY COALESCE(mobile, 0) + COALESCE(desktop, 0) DESC
|> LIMIT 100
48 changes: 48 additions & 0 deletions sql/2025/privacy/client_hints_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- noqa: disable=PRS
-- Most common Client Hints requested through Accept-CH on root pages, taken
-- from either the main document's response header or a <meta http-equiv> tag.
-- Output: one row per hint value with per-client page counts
-- (pages_count_*) and the share of root pages (desktop / mobile).
WITH totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> AGGREGATE COUNT(*) AS total_websites GROUP BY client
),

/* Get Accept-CH Headers: one row per comma-separated hint */
headers AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_root_page AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'accept-ch'
  -- LEFT JOIN keeps the page even when the header carries no value
  |> LEFT JOIN UNNEST(SPLIT(LOWER(header.value), ',')) AS header_value
  |> SELECT client, page, TRIM(header_value) AS value
),

/* Get Accept-CH Meta Tags: one row per comma-separated hint */
meta_tags AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> EXTEND LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name
  |> WHERE tag_name = 'accept-ch'
  -- LEFT JOIN keeps the page even when the tag has no content attribute
  |> LEFT JOIN UNNEST(SPLIT(LOWER(SAFE.STRING(meta_node.content)), ',')) AS tag_value
  |> SELECT client, page, TRIM(tag_value) AS value
),

-- Stack both sources instead of joining them: a join on (client, page)
-- multiplies rows and hides the meta-tag hints of every page that also sends
-- the header. Duplicates are harmless because pages are counted DISTINCT.
hints AS (
  SELECT client, page, value FROM headers
  UNION ALL
  SELECT client, page, value FROM meta_tags
)

FROM hints
|> JOIN totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT page) AS number_of_pages,
  COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages
  GROUP BY client, value
|> PIVOT(
  ANY_VALUE(number_of_pages) AS pages_count,
  ANY_VALUE(pct_pages) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_mobile AS mobile,
  pct_desktop AS desktop
-- A hint seen on only one client has a NULL count for the other one.
-- Treat it as 0: a NULL sum sorts last under DESC and would be cut by LIMIT.
|> ORDER BY COALESCE(pages_count_mobile, 0) + COALESCE(pages_count_desktop, 0) DESC
|> LIMIT 200
61 changes: 61 additions & 0 deletions sql/2025/privacy/client_hints_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- noqa: disable=PRS
-- Number and share of websites (distinct root_page) that declare Accept-CH,
-- either as a response header on a main document or as a <meta http-equiv> tag.

-- Denominator: all crawled websites per client
WITH base_totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
),

-- Websites whose main document response carries an Accept-CH header
accept_ch_headers AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS response_header
  |> WHERE LOWER(response_header.name) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Websites with a <meta http-equiv="accept-ch"> node
accept_ch_meta AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> WHERE LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Combine both sources, keeping each (client, root_page) once
all_accept_ch AS (
  FROM accept_ch_headers
  |> UNION DISTINCT (SELECT client, root_page FROM accept_ch_meta)
)

FROM all_accept_ch
|> JOIN base_totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT root_page) AS number_of_websites,
  COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
  GROUP BY client
|> PIVOT(
  ANY_VALUE(number_of_websites) AS websites_count,
  ANY_VALUE(pct_websites) AS pct
  FOR client IN ('desktop', 'mobile')
)
-- Expose the percentages under the bare client names
|> SELECT
  websites_count_desktop,
  pct_desktop AS desktop,
  websites_count_mobile,
  pct_mobile AS mobile
|> ORDER BY websites_count_mobile + websites_count_desktop DESC
27 changes: 27 additions & 0 deletions sql/2025/privacy/cookies_first_party_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/* Most common cookie names, by number of domains on which they appear.
Goal is to identify common trackers that use first-party cookies across sites.

A cookie is treated as first-party when the host of its domain equals the
website host or is a parent domain of it.
Output: one row per cookie_name with per-client domain counts
(domain_count_*) and the share of all crawled domains (desktop / mobile).
*/

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is computed before UNNEST so that websites without cookies count too
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> EXTEND
  NET.HOST(root_page) AS firstparty_domain,
  NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain,
  SAFE.STRING(cookie.name) AS cookie_name
-- Leading dots make the suffix match respect label boundaries
-- (so 'notexample.com' does not match 'example.com')
|> WHERE ENDS_WITH('.' || firstparty_domain, '.' || cookie_domain)
|> AGGREGATE
  COUNT(DISTINCT firstparty_domain) AS domain_count,
  COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains
  GROUP BY client, cookie_name
|> PIVOT (
  ANY_VALUE(domain_count) AS domain_count,
  ANY_VALUE(pct_domains) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_domains_mobile AS mobile,
  pct_domains_desktop AS desktop
-- A cookie seen on only one client has a NULL count for the other one.
-- Treat it as 0: a NULL sum sorts last under DESC and would be cut by LIMIT.
|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
|> LIMIT 1000
23 changes: 23 additions & 0 deletions sql/2025/privacy/cookies_third_party_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/* Most common third-party cookies ("cookie domain / cookie name"), by number
of first-party domains on which they appear.

A cookie is treated as third-party when the host of its domain is neither the
website host nor a parent domain of it.
Output: one row per cookie_details with per-client domain counts
(domain_count_*) and the share of all crawled domains (desktop / mobile).
*/

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is computed before UNNEST so that websites without cookies count too
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> EXTEND
  NET.HOST(root_page) AS firstparty_domain,
  NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain,
  NET.HOST(SAFE.STRING(cookie.domain)) || ' / ' || SAFE.STRING(cookie.name) AS cookie_details
-- Leading dots make the suffix match respect label boundaries;
-- cookies without a domain evaluate to NULL here and are dropped
|> WHERE NOT ENDS_WITH('.' || firstparty_domain, '.' || cookie_domain)
|> AGGREGATE
  COUNT(DISTINCT firstparty_domain) AS domain_count,
  COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains
  GROUP BY client, cookie_details
|> PIVOT (
  ANY_VALUE(domain_count) AS domain_count,
  ANY_VALUE(pct_domains) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_domains_mobile AS mobile,
  pct_domains_desktop AS desktop
-- A cookie seen on only one client has a NULL count for the other one.
-- Treat it as 0: a NULL sum sorts last under DESC and would be cut by LIMIT.
|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
|> LIMIT 1000
19 changes: 19 additions & 0 deletions sql/2025/privacy/dnt_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Usage of the NavigatorDoNotTrack Blink feature, one row per rank bucket,
-- with page counts and percentages split into desktop and mobile columns.

FROM `httparchive.blink_features.usage`
|> WHERE
  feature = 'NavigatorDoNotTrack' AND
  --rank <= 10000 AND
  date = '2025-07-01'
-- Rename the measures up front so the pivot can reuse the names as prefixes
|> SELECT DISTINCT
  client,
  rank,
  num_urls AS pages_count,
  pct_urls AS pct
|> PIVOT (
  ANY_VALUE(pages_count) AS pages_count,
  ANY_VALUE(pct) AS pct
  FOR client IN ('desktop', 'mobile')
)
-- Expose the percentages under the bare client names
|> SELECT
  rank,
  pages_count_desktop,
  pct_desktop AS desktop,
  pages_count_mobile,
  pct_mobile AS mobile
|> ORDER BY rank ASC
31 changes: 31 additions & 0 deletions sql/2025/privacy/fingerprinting_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- noqa: disable=PRS
-- Percent of websites using a fingerprinting library based on wappalyzer category
-- Output: one row per technology; `desktop` / `mobile` hold the website
-- counts and pct_desktop / pct_mobile the share of all crawled websites.

-- Denominator: all crawled websites per client
WITH base_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS websites_total
  FROM httparchive.crawl.pages
  WHERE date = '2025-07-01'
  GROUP BY client
)

FROM httparchive.crawl.pages,
  UNNEST(technologies) AS technology,
  UNNEST(technology.categories) AS category
|> WHERE
  date = '2025-07-01' AND
  category = 'Browser fingerprinting'
|> AGGREGATE
  COUNT(DISTINCT root_page) AS websites_count
  GROUP BY client, technology.technology
|> JOIN base_totals USING (client)
|> EXTEND websites_count / websites_total AS websites_pct
|> DROP websites_total
|> PIVOT(
  ANY_VALUE(websites_count) AS websites_count,
  ANY_VALUE(websites_pct) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME websites_count_mobile AS mobile, websites_count_desktop AS desktop
-- A technology detected on only one client has a NULL count for the other
-- one. Treat it as 0 so it is ranked by its real count, not sorted last.
|> ORDER BY COALESCE(mobile, 0) + COALESCE(desktop, 0) DESC
20 changes: 20 additions & 0 deletions sql/2025/privacy/iab_tcf_v2_cmps_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- Counts of CMPs using IAB Transparency & Consent Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata
-- CMP vendor list: https://iabeurope.eu/cmp-list/
-- Output: one row per cmpId with per-client page counts (pages_count_*)
-- and the share of all crawled pages (desktop / mobile).
-- Pages without a readable cmpId are grouped under a NULL cmpId.

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' --AND rank = 1000
|> EXTEND
  SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId,
  COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
|> AGGREGATE
  COUNT(0) AS number_of_pages,
  COUNT(0) / ANY_VALUE(total_pages) AS pct_pages
  GROUP BY client, cmpId
|> PIVOT (
  ANY_VALUE(number_of_pages) AS pages_count,
  ANY_VALUE(pct_pages) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
-- A CMP seen on only one client has a NULL count for the other one.
-- Treat it as 0 so it is ranked by its real count, not sorted last.
|> ORDER BY COALESCE(pages_count_mobile, 0) + COALESCE(pages_count_desktop, 0) DESC
41 changes: 41 additions & 0 deletions sql/2025/privacy/iab_tcf_v2_countries_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- noqa: disable=PRS
-- Counts of countries for publishers using IAB Transparency & Consent Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata
-- "Country code of the country that determines the legislation of
-- reference. Normally corresponds to the country code of the country
-- in which the publisher's business entity is established."
-- Output: one row per publisherCC with per-client website counts
-- (pages_count_*) and the share of all crawled websites (desktop / mobile).

-- Denominator: all crawled websites per client
WITH base_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_websites
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01' --AND rank = 1000
  GROUP BY client
),

-- Pages that expose a TCF v2 data object, with the upper-cased publisher country code
base_data AS (
  SELECT
    client,
    root_page,
    UPPER(SAFE.STRING(custom_metrics.privacy.iab_tcf_v2.data.publisherCC)) AS publisherCC
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND --rank = 1000 AND
    JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object'
)

FROM base_data
|> AGGREGATE
  COUNT(DISTINCT root_page) AS number_of_pages
  GROUP BY client, publisherCC
|> JOIN base_totals USING (client)
|> EXTEND number_of_pages / total_websites AS pct_of_pages
|> DROP total_websites
|> PIVOT(
  ANY_VALUE(number_of_pages) AS pages_count,
  ANY_VALUE(pct_of_pages) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
-- A country seen on only one client has a NULL count for the other one.
-- Treat it as 0 so it is ranked by its real count, not sorted last.
|> ORDER BY COALESCE(pages_count_mobile, 0) + COALESCE(pages_count_desktop, 0) DESC
Loading
Loading