Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6aebd35
dates updated
max-ostapenko Aug 1, 2025
943ae28
query updates
max-ostapenko Aug 1, 2025
451cca9
sheet exporter update
max-ostapenko Aug 2, 2025
e394bb6
ID update
max-ostapenko Aug 2, 2025
2a7db9b
formatting
max-ostapenko Aug 2, 2025
cd572c8
lint
max-ostapenko Aug 2, 2025
bb5959a
Refactor origin trial functions for improved readability and structure
max-ostapenko Sep 12, 2025
579fb49
Merge remote-tracking branch 'origin/main' into privacy-sql-2025
max-ostapenko Sep 12, 2025
55cec4f
lint
max-ostapenko Sep 12, 2025
c3a2ee7
lint
max-ostapenko Sep 12, 2025
489d07d
Merge branch 'main' into privacy-sql-2025
max-ostapenko Sep 20, 2025
7640ee0
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 19, 2025
bd7506d
make bq_to_sheets.ipynb runnable and add deps to requirements
max-ostapenko Oct 19, 2025
08aa531
Refactor privacy queries and utilities; make bq_to_sheets runnable
max-ostapenko Oct 20, 2025
c2566e6
Potential fix for code scanning alert no. 640: Unused import
max-ostapenko Oct 20, 2025
42da6ad
Remove unused json import
max-ostapenko Oct 20, 2025
c88867e
Merge branch 'main' into privacy-sql-2025
max-ostapenko Oct 20, 2025
28d240b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 10, 2026
8e543ef
Add SQL scripts for tracking first-party and third-party cookies; rem…
max-ostapenko Jan 11, 2026
c195c07
lint
max-ostapenko Jan 11, 2026
5c300e4
Refactor SQL scripts for IAB TCF v2 and client hints; streamline quer…
max-ostapenko Jan 11, 2026
8768f22
Review and apply sql pivots
max-ostapenko Jan 11, 2026
951a7d2
Remove deprecated SQL scripts, and add new scripts for tracker distri…
max-ostapenko Jan 11, 2026
1c31d62
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 11, 2026
9ab94bd
updated metrics
max-ostapenko Jan 12, 2026
864fddd
formatting
max-ostapenko Jan 12, 2026
2b523a9
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
c14742b
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 12, 2026
8c4e816
3p cookie domains
max-ostapenko Jan 13, 2026
220d0b8
switch the columns for a chart
max-ostapenko Jan 13, 2026
edc9fb1
exclude android.clients.google.com
max-ostapenko Jan 13, 2026
d280cd3
fix order by
max-ostapenko Jan 13, 2026
0d23ecb
lint
max-ostapenko Jan 13, 2026
1096577
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
6c24d45
split requirements.txt
max-ostapenko Jan 14, 2026
78acbd8
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
8d9e84a
lint
max-ostapenko Jan 14, 2026
f521a83
Merge branch 'main' into privacy-sql-2025
max-ostapenko Jan 14, 2026
1e3a9ab
Merge branch 'privacy-sql-2025' of https://github.com/HTTPArchive/alm…
max-ostapenko Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# Pages that participate in the privacy-related origin trials
# Decodes an origin trial token: base64-decodes `token`, keeps the bytes from
# position 70 onwards and converts them to a string.
# The SAFE variants are used, so a token that is not valid base64 does not raise an error.
# NOTE(review): offset 70 presumably skips the token's binary header (version,
# signature, payload length) so that only the JSON payload remains - confirm
# against the origin trial token format.
CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS (
SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70))
);

CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING)
RETURNS STRUCT<
CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
token STRING,
origin STRING,
feature STRING,
expiry TIMESTAMP,
is_subdomain BOOL,
is_third_party BOOL
> AS (
STRUCT(
DECODE_ORIGIN_TRIAL(token) AS token,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party
>
DETERMINISTIC AS (
(
WITH decoded_token AS (
SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
)

SELECT
STRUCT(
decoded AS token,
JSON_VALUE(decoded, '$.origin') AS origin,
JSON_VALUE(decoded, '$.feature') AS feature,
TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
)
FROM decoded_token
)
);

Expand Down
78 changes: 78 additions & 0 deletions sql/2025/privacy/bounce_domains_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
-- noqa: disable=PRS
-- Top 100 bounce domains: domains that a website's first request redirects to
-- and that immediately redirect onwards to yet another domain.
-- Output: one row per bounce_domain with per-client website counts
-- (desktop_count, mobile_count) and shares of all websites (desktop, mobile).
--
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md

-- 3xx responses among the first two requests, one row per Location header
WITH redirect_requests AS (
  FROM `httparchive.crawl.requests`
  |> WHERE
    date = '2025-07-01' AND
    --rank = 1000 AND
    SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND
    index <= 2
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'location'
  |> SELECT
    client,
    url,
    index,
    NET.REG_DOMAIN(header.value) AS location_domain,
    root_page
),

-- Find the first navigation redirect:
-- request 1 is on the website's own domain and redirects to a different domain
navigation_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 1 AND
    NET.REG_DOMAIN(root_page) = NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    root_page,
    location_domain AS bounce_domain
),

-- Find the second navigation redirect:
-- request 2 is off the website's domain and redirects to yet another domain
bounce_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 2 AND
    NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    url,
    root_page,
    location_domain AS bounce_redirect_location_domain
),

-- Combine the first and second navigation redirects
bounce_sequences AS (
  FROM navigation_redirect AS nav
  |> JOIN bounce_redirect AS bounce
    ON
      nav.client = bounce.client AND
      nav.root_page = bounce.root_page AND
      -- The second hop must happen on the domain the first hop redirected to;
      -- otherwise unrelated redirects of the same website would be paired.
      -- NOTE(review): rows are matched per root_page, not per crawled page -
      -- confirm whether page-level matching is also wanted.
      nav.bounce_domain = NET.REG_DOMAIN(bounce.url)
  |> AGGREGATE COUNT(DISTINCT nav.root_page) AS websites_count
    GROUP BY nav.client, bounce_domain
),

-- Denominator: all crawled websites per client
websites_total AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
)

FROM bounce_sequences
|> JOIN websites_total USING (client)
|> EXTEND websites_count / total_websites AS websites_pct
|> DROP total_websites
|> PIVOT(
  ANY_VALUE(websites_count) AS cnt,
  ANY_VALUE(websites_pct) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop, cnt_mobile AS mobile_count, cnt_desktop AS desktop_count
-- COALESCE keeps domains seen on a single client from sorting as NULL
|> ORDER BY COALESCE(mobile_count, 0) + COALESCE(desktop_count, 0) DESC
|> LIMIT 100
44 changes: 44 additions & 0 deletions sql/2025/privacy/client_hints_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- noqa: disable=PRS
-- Most requested Client Hints: values listed in Accept-CH, taken from the
-- main document's response header and from <meta http-equiv="accept-ch"> tags.
-- Output: one row per hint value with per-client website counts
-- (websites_count_desktop, websites_count_mobile) and shares (desktop, mobile).
WITH totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> AGGREGATE COUNT(*) AS total_websites GROUP BY client
),

/* Get Accept-CH Headers: one row per comma-separated header value */
headers AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_root_page AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'accept-ch'
  |> LEFT JOIN UNNEST(SPLIT(LOWER(header.value), ',')) AS header_value
  |> SELECT client, root_page, TRIM(header_value) AS value
),

/* Get Accept-CH Meta Tags: one row per comma-separated content value */
meta_tags AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> EXTEND LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name
  |> WHERE tag_name = 'accept-ch'
  |> LEFT JOIN UNNEST(SPLIT(LOWER(SAFE.STRING(meta_node.content)), ',')) AS tag_value
  |> SELECT client, root_page, TRIM(tag_value) AS value
),

/* Stack both sources. Joining them on (client, root_page) would pair every
   header value with every meta value of the same page, and the meta values
   would then be discarded wherever a header exists. Duplicates are harmless
   because websites are counted with COUNT(DISTINCT root_page). */
accept_ch_values AS (
  SELECT client, root_page, value FROM headers
  UNION ALL
  SELECT client, root_page, value FROM meta_tags
)

FROM accept_ch_values
|> JOIN totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT root_page) AS number_of_websites,
  COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
  GROUP BY client, value
|> PIVOT(
  ANY_VALUE(number_of_websites) AS websites_count,
  ANY_VALUE(pct_websites) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
54 changes: 54 additions & 0 deletions sql/2025/privacy/client_hints_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- noqa: disable=PRS
-- Share of websites that declare Accept-CH, either as a response header on a
-- main document or as a <meta http-equiv="accept-ch"> tag.
-- Output: a single row with per-client website counts and shares.

-- Denominator: distinct websites per client
WITH base_totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
),

-- Websites whose main document responds with an Accept-CH header
accept_ch_headers AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS response_header
  |> WHERE LOWER(response_header.name) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Websites with an accept-ch http-equiv meta tag
accept_ch_meta AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> WHERE LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Combine both sources
all_accept_ch AS (
  FROM accept_ch_headers
  |> SELECT client, root_page
  |> UNION DISTINCT (
    FROM accept_ch_meta
    |> SELECT client, root_page
  )
)

FROM all_accept_ch
|> JOIN base_totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT root_page) AS sites_with_accept_ch,
  COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS sites_share
  GROUP BY client
|> PIVOT(
  ANY_VALUE(sites_with_accept_ch) AS websites_count,
  ANY_VALUE(sites_share) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_desktop AS desktop, pct_mobile AS mobile
22 changes: 22 additions & 0 deletions sql/2025/privacy/cookie_domains_third_party_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Top 1000 third-party cookie domains, ranked by the number of first-party
-- hosts on which a cookie for that domain was observed.
-- Output: one row per cookie_domain with per-client host counts
-- (domain_count_*) and shares of all hosts (desktop, mobile).
FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is taken before the cookie join, so hosts without cookies still count
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_total
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> EXTEND NET.HOST(root_page) AS site_host
|> EXTEND NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain
-- Third-party: the cookie domain is neither the site host nor a parent domain of it
|> WHERE NOT ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_domain))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS hosts_with_cookie,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_total) AS hosts_share
  GROUP BY client, cookie_domain
|> PIVOT (
  ANY_VALUE(hosts_with_cookie) AS domain_count,
  ANY_VALUE(hosts_share) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_domains_desktop AS desktop, pct_domains_mobile AS mobile
|> ORDER BY COALESCE(domain_count_desktop, 0) + COALESCE(domain_count_mobile, 0) DESC
|> LIMIT 1000
27 changes: 27 additions & 0 deletions sql/2025/privacy/cookies_first_party_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Top 1000 first-party cookie names, ranked by the number of hosts on which
-- they appear. The aim is to spot common trackers that rely on first-party
-- cookies set across many sites.
-- Output: one row per cookie_name with per-client host counts
-- (domain_count_*) and shares of all hosts (desktop, mobile).

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is taken before the cookie join, so hosts without cookies still count
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_total
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> EXTEND NET.HOST(root_page) AS site_host
|> EXTEND NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain
|> EXTEND SAFE.STRING(cookie.name) AS cookie_name
-- First-party: the cookie domain equals the site host or is a parent domain of it
|> WHERE ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_domain))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS hosts_with_cookie,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_total) AS hosts_share
  GROUP BY client, cookie_name
|> PIVOT (
  ANY_VALUE(hosts_with_cookie) AS domain_count,
  ANY_VALUE(hosts_share) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_domains_desktop AS desktop, pct_domains_mobile AS mobile
|> ORDER BY COALESCE(domain_count_desktop, 0) + COALESCE(domain_count_mobile, 0) DESC
|> LIMIT 1000
23 changes: 23 additions & 0 deletions sql/2025/privacy/cookies_third_party_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Top 1000 third-party cookies, identified as "<cookie domain> / <cookie name>"
-- and ranked by the number of first-party hosts on which they were observed.
-- Output: one row per cookie_details with per-client host counts
-- (domain_count_*) and shares of all hosts (desktop, mobile).
FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is taken before the cookie join, so hosts without cookies still count
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_total
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> EXTEND NET.HOST(root_page) AS site_host
|> EXTEND NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain
|> EXTEND CONCAT(cookie_domain, ' / ', SAFE.STRING(cookie.name)) AS cookie_details
-- Third-party: the cookie domain is neither the site host nor a parent domain of it
|> WHERE NOT ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_domain))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS hosts_with_cookie,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_total) AS hosts_share
  GROUP BY client, cookie_details
|> PIVOT (
  ANY_VALUE(hosts_with_cookie) AS domain_count,
  ANY_VALUE(hosts_share) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_domains_desktop AS desktop, pct_domains_mobile AS mobile
|> ORDER BY COALESCE(domain_count_desktop, 0) + COALESCE(domain_count_mobile, 0) DESC
|> LIMIT 1000
19 changes: 19 additions & 0 deletions sql/2025/privacy/dnt_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Usage of the 'NavigatorDoNotTrack' Blink feature, per rank bucket.
-- Output: one row per rank with per-client page counts (pages_count_*)
-- and shares of URLs (desktop, mobile).

FROM `httparchive.blink_features.usage`
|> WHERE feature = 'NavigatorDoNotTrack' AND date = '2025-07-01' --AND rank <= 10000
|> SELECT DISTINCT client, rank, num_urls, pct_urls
|> PIVOT (
  ANY_VALUE(num_urls) AS pages_count,
  ANY_VALUE(pct_urls) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_desktop AS desktop, pct_mobile AS mobile
|> ORDER BY rank
31 changes: 31 additions & 0 deletions sql/2025/privacy/fingerprinting_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- noqa: disable=PRS
-- Percent of websites using a fingerprinting library based on wappalyzer category
-- Output: one row per technology with per-client website counts
-- (websites_count_*) and shares of all websites (desktop, mobile).

-- Denominator: distinct websites per client
WITH base_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS websites_total
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
  GROUP BY client
)

FROM `httparchive.crawl.pages`,
  UNNEST(technologies) AS technology,
  UNNEST(technology.categories) AS category
|> WHERE
  date = '2025-07-01' AND
  category = 'Browser fingerprinting'
|> AGGREGATE
  COUNT(DISTINCT root_page) AS websites_count
  GROUP BY client, technology.technology
|> JOIN base_totals USING (client)
|> EXTEND websites_count / websites_total AS websites_pct
|> DROP websites_total
|> PIVOT(
  ANY_VALUE(websites_count) AS websites_count,
  ANY_VALUE(websites_pct) AS websites_pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME websites_pct_mobile AS mobile, websites_pct_desktop AS desktop
-- A technology detected on only one client has a NULL count for the other;
-- without COALESCE the sum would be NULL and the row would sort last
-- regardless of how many websites use it.
|> ORDER BY COALESCE(websites_count_mobile, 0) + COALESCE(websites_count_desktop, 0) DESC
21 changes: 21 additions & 0 deletions sql/2025/privacy/iab_tcf_v2_cmps_top.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- noqa: disable=PRS
-- Number and share of websites per CMP id reported through the
-- IAB Transparency & Consent Framework v2 custom metric.
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata
-- CMP vendor list: https://iabeurope.eu/cmp-list/
-- Output: one row per cmpId with per-client website counts
-- (websites_count_*) and shares of all websites (desktop, mobile).

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' --AND rank = 1000
-- Denominator: all distinct websites of the client, whatever their cmpId
|> EXTEND COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS sites_total
|> EXTEND SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId
|> AGGREGATE
  COUNT(DISTINCT root_page) AS sites_with_cmp,
  COUNT(DISTINCT root_page) / ANY_VALUE(sites_total) AS sites_share
  GROUP BY client, cmpId
|> PIVOT (
  ANY_VALUE(sites_with_cmp) AS websites_count,
  ANY_VALUE(sites_share) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_desktop AS desktop, pct_mobile AS mobile
|> ORDER BY COALESCE(websites_count_mobile, 0) + COALESCE(websites_count_desktop, 0) DESC
Loading
Loading