Skip to content

Commit d0f3b7a

Browse files
Privacy 2025 queries (#4178)
* dates updated * query updates * sheet exporter update * ID update * formatting * lint * Refactor origin trial functions for improved readability and structure * lint * lint * make bq_to_sheets.ipynb runnable and add deps to requirements * Refactor privacy queries and utilities; make bq_to_sheets runnable * Potential fix for code scanning alert no. 640: Unused import Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * Remove unused json import * Add SQL scripts for tracking first-party and third-party cookies; remove outdated scripts and update requirements * lint * Refactor SQL scripts for IAB TCF v2 and client hints; streamline queries and improve clarity * Review and apply sql pivots * Remove deprecated SQL scripts, and add new scripts for tracker distribution and technologies analysis * updated metrics * formatting * 3p cookie domains * switch the columns for a chart * exclude android.clients.google.com * fix order by * lint * split requirements.txt * lint --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent 4cad612 commit d0f3b7a

28 files changed

+1203
-273
lines changed

.github/linters/.python-lint

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ persistent=yes
3131
# Specify a configuration file.
3232
#rcfile=
3333

34-
# When enabled, pylint would attempt to guess common misconfiguration and emit
35-
# user-friendly hints instead of false-positive error messages
36-
suggestion-mode=yes
37-
3834
# Allow loading of arbitrary C extensions. Extensions are imported into the
3935
# active Python interpreter and may run arbitrary code.
4036
unsafe-load-any-extension=no
@@ -470,5 +466,6 @@ valid-metaclass-classmethod-first-arg=mcs
470466
[EXCEPTIONS]
471467

472468
# Exceptions that will emit a warning when being caught. Defaults to
473-
# "Exception"
474-
overgeneral-exceptions=Exception
469+
# ["builtins.BaseException", "builtins.Exception"]
470+
overgeneral-exceptions = builtins.BaseException,
471+
builtins.Exception

sql/2024/privacy/number_of_websites_with_related_origin_trials.sql

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
# Pages that participate in the privacy-related origin trials
2-
-- Decodes an origin trial token and exposes the fields of its JSON payload.
--
-- The token is base64-decoded and everything from byte 70 onward is read as a
-- UTF-8 JSON document (presumably the preceding 69 bytes are the binary token
-- header; confirm against the origin trial token format).
--
-- Every step is NULL-safe: invalid base64, invalid UTF-8, a missing field, or a
-- non-integer / out-of-range `expiry` produce NULL fields instead of failing the
-- whole query.
--
-- Returns:
--   token          - the decoded JSON payload text
--   origin         - payload `$.origin`
--   feature        - payload `$.feature`
--   expiry         - payload `$.expiry` interpreted as Unix seconds
--   is_subdomain   - TRUE when payload `$.isSubdomain` is the JSON value true
--   is_third_party - TRUE when payload `$.isThirdParty` is the JSON value true
CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
  token STRING,
  origin STRING,
  feature STRING,
  expiry TIMESTAMP,
  is_subdomain BOOL,
  is_third_party BOOL
>
DETERMINISTIC AS (
  (
    -- Decode once and reuse the payload for every extracted field.
    WITH decoded_token AS (
      SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
    )

    SELECT
      STRUCT(
        decoded AS token,
        JSON_VALUE(decoded, '$.origin') AS origin,
        JSON_VALUE(decoded, '$.feature') AS feature,
        -- SAFE_CAST / SAFE. keep a malformed expiry from aborting the query.
        SAFE.TIMESTAMP_SECONDS(SAFE_CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
        JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
        JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
      )
    FROM decoded_token
  )
);
2428

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
-- noqa: disable=PRS
-- Top 100 bounce domains: domains that a website's first request redirects to
-- and that then redirect onward to yet another domain.
-- Output: bounce_domain, desktop_count, desktop, mobile_count, mobile
-- (counts of websites and their share of all crawled websites per client).
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md

-- 3xx responses among the first two requests of a page load, together with the
-- registrable domain of their Location header (NULL for relative locations).
WITH redirect_requests AS (
  FROM `httparchive.crawl.requests`
  |> WHERE
    date = '2025-07-01' AND
    --rank = 1000 AND
    SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND
    index <= 2
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'location'
  |> SELECT
    client,
    page,
    url,
    index,
    NET.REG_DOMAIN(header.value) AS location_domain,
    root_page
),

-- Find the first navigation redirect: a same-site request that points to
-- another registrable domain.
navigation_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 1 AND
    NET.REG_DOMAIN(root_page) = NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    page,
    root_page,
    location_domain AS bounce_domain
),

-- Find the second navigation redirect: an off-site request that redirects
-- again to a different registrable domain.
bounce_redirect AS (
  FROM redirect_requests
  |> WHERE
    index = 2 AND
    NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND
    NET.REG_DOMAIN(url) != location_domain
  |> SELECT
    client,
    page,
    url,
    root_page,
    location_domain AS bounce_redirect_location_domain
),

-- Combine the first and second navigation redirects.
-- Both hops must belong to the same page load, and the second hop must be
-- served by the domain the first hop redirected to; otherwise unrelated
-- redirects would be attributed to the bounce domain.
bounce_sequences AS (
  FROM navigation_redirect AS nav
  |> JOIN bounce_redirect AS bounce
    ON
      nav.client = bounce.client AND
      nav.root_page = bounce.root_page AND
      nav.page = bounce.page AND
      nav.bounce_domain = NET.REG_DOMAIN(bounce.url)
  |> AGGREGATE COUNT(DISTINCT nav.root_page) AS websites_count
    GROUP BY nav.client, bounce_domain
),

-- Denominator: distinct websites crawled per client.
websites_total AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
)

FROM bounce_sequences
|> JOIN websites_total USING (client)
|> EXTEND websites_count / total_websites AS websites_pct
|> DROP total_websites
|> PIVOT(
  ANY_VALUE(websites_count) AS cnt,
  ANY_VALUE(websites_pct) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop, cnt_mobile AS mobile_count, cnt_desktop AS desktop_count
-- COALESCE keeps domains seen on only one client from sorting last.
|> ORDER BY COALESCE(mobile_count, 0) + COALESCE(desktop_count, 0) DESC
|> LIMIT 100
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
-- noqa: disable=PRS
-- Accept-CH values ranked by the number of websites (root pages) that declare
-- them, either in an Accept-CH response header or in a <meta http-equiv> tag.
-- Output: value, websites_count_desktop, desktop, websites_count_mobile, mobile.

/* Denominator: number of root pages per client */
WITH totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> AGGREGATE COUNT(*) AS total_websites GROUP BY client
),

/* Get Accept-CH Headers: one row per comma-separated header value */
headers AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_root_page AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS header
  |> WHERE LOWER(header.name) = 'accept-ch'
  |> LEFT JOIN UNNEST(SPLIT(LOWER(header.value), ',')) AS header_value
  |> SELECT client, root_page, TRIM(header_value) AS value
),

/* Get Accept-CH Meta Tags: one row per comma-separated content value */
meta_tags AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> EXTEND LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name
  |> WHERE tag_name = 'accept-ch'
  |> LEFT JOIN UNNEST(SPLIT(LOWER(SAFE.STRING(meta_node.content)), ',')) AS tag_value
  |> SELECT client, root_page, TRIM(tag_value) AS value
)

/* Stack both sources. A join on (client, root_page) would pair every header
   value with every meta value of the same site, and a COALESCE over that pair
   would discard the meta values whenever a header is present. */
FROM headers
|> UNION ALL (SELECT client, root_page, value FROM meta_tags)
|> JOIN totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT root_page) AS number_of_websites,
  COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
  GROUP BY client, value
|> PIVOT(
  ANY_VALUE(number_of_websites) AS websites_count,
  ANY_VALUE(pct_websites) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
-- noqa: disable=PRS
-- Number and share of websites that declare Accept-CH, via a response header on
-- a main document or via a <meta http-equiv> tag.
-- Output: websites_count_desktop, desktop, websites_count_mobile, mobile.

-- Denominator: distinct root pages per client.
WITH site_totals AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
),

-- Websites with an Accept-CH response header on a main document.
header_sites AS (
  FROM `httparchive.crawl.requests`
  |> WHERE date = '2025-07-01' AND is_main_document --AND rank = 1000
  |> JOIN UNNEST(response_headers) AS response_header
  |> WHERE LOWER(response_header.name) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Websites with an Accept-CH <meta http-equiv> node.
meta_sites AS (
  FROM `httparchive.crawl.pages`
  |> WHERE date = '2025-07-01' --AND rank = 1000
  |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
  |> WHERE LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch'
  |> SELECT DISTINCT client, root_page
),

-- Combine both sources, counting each website once per client.
opted_in_sites AS (
  SELECT client, root_page FROM header_sites
  UNION DISTINCT
  SELECT client, root_page FROM meta_sites
)

FROM opted_in_sites
|> JOIN site_totals USING (client)
|> AGGREGATE
  COUNT(DISTINCT root_page) AS number_of_websites,
  COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
  GROUP BY client
|> PIVOT(
  ANY_VALUE(number_of_websites) AS websites_count,
  ANY_VALUE(pct_websites) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
-- Top 1000 third-party cookie domains, ranked by the number of first-party
-- hosts on which they set a cookie.
-- Output: cookie_domain, domain_count_desktop, desktop, domain_count_mobile, mobile.
FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is computed before the cookie join so hosts without cookies count too.
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_per_client
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> SELECT
  client,
  hosts_per_client,
  NET.HOST(root_page) AS site_host,
  NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain
-- Third-party: the site host is neither the cookie domain nor one of its
-- subdomains. The leading dots prevent partial-label matches.
|> WHERE NOT ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_domain))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS domain_count,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_per_client) AS pct_domains
  GROUP BY client, cookie_domain
|> PIVOT (
  ANY_VALUE(domain_count) AS domain_count,
  ANY_VALUE(pct_domains) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_domains_mobile AS mobile,
  pct_domains_desktop AS desktop
|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
|> LIMIT 1000
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/* Top 1000 first-party cookie names, ranked by the number of hosts that set them.
   A name shared by many unrelated sites hints at a common tracker that stores
   its identifier in first-party cookies.
   Output: cookie_name, domain_count_desktop, desktop, domain_count_mobile, mobile.
*/

FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is computed before the cookie join so hosts without cookies count too.
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_per_client
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> SELECT
  client,
  hosts_per_client,
  NET.HOST(root_page) AS site_host,
  NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
  SAFE.STRING(cookie.name) AS cookie_name
-- First-party: the site host equals the cookie domain or is one of its
-- subdomains. The leading dots prevent partial-label matches.
|> WHERE ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_host))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS domain_count,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_per_client) AS pct_domains
  GROUP BY client, cookie_name
|> PIVOT (
  ANY_VALUE(domain_count) AS domain_count,
  ANY_VALUE(pct_domains) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_domains_mobile AS mobile,
  pct_domains_desktop AS desktop
|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
|> LIMIT 1000
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
-- Top 1000 third-party cookies, identified as "<cookie domain> / <cookie name>",
-- ranked by the number of first-party hosts on which they are set.
-- Output: cookie_details, domain_count_desktop, desktop, domain_count_mobile, mobile.
FROM `httparchive.crawl.pages`
|> WHERE date = '2025-07-01' -- AND rank = 1000
-- Denominator is computed before the cookie join so hosts without cookies count too.
|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS hosts_per_client
|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
|> SELECT
  client,
  hosts_per_client,
  NET.HOST(root_page) AS site_host,
  NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
  CONCAT(NET.HOST(SAFE.STRING(cookie.domain)), ' / ', SAFE.STRING(cookie.name)) AS cookie_details
-- Third-party: the site host is neither the cookie domain nor one of its
-- subdomains. The leading dots prevent partial-label matches.
|> WHERE NOT ENDS_WITH(CONCAT('.', site_host), CONCAT('.', cookie_host))
|> AGGREGATE
  COUNT(DISTINCT site_host) AS domain_count,
  COUNT(DISTINCT site_host) / ANY_VALUE(hosts_per_client) AS pct_domains
  GROUP BY client, cookie_details
|> PIVOT (
  ANY_VALUE(domain_count) AS domain_count,
  ANY_VALUE(pct_domains) AS pct_domains
  FOR client IN ('desktop', 'mobile')
)
|> RENAME
  pct_domains_mobile AS mobile,
  pct_domains_desktop AS desktop
|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
|> LIMIT 1000

sql/2025/privacy/dnt_usage.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- Usage of the NavigatorDoNotTrack Blink feature (DNT), one row per rank bucket.
-- Output: rank, pages_count_desktop, desktop, pages_count_mobile, mobile.

FROM `httparchive.blink_features.usage`
|> WHERE feature = 'NavigatorDoNotTrack' AND date = '2025-07-01' --AND rank <= 10000
|> SELECT client, rank, num_urls, pct_urls
-- Collapse duplicate rows so each (client, rank) pair feeds the pivot once.
|> DISTINCT
|> PIVOT (
  ANY_VALUE(num_urls) AS pages_count,
  ANY_VALUE(pct_urls) AS pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
|> ORDER BY rank ASC
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
-- noqa: disable=PRS
-- Percent of websites using a fingerprinting library based on wappalyzer category
-- Output: technology, websites_count_desktop, desktop, websites_count_mobile, mobile.

-- Denominator: distinct root pages per client.
WITH base_totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS websites_total
  FROM httparchive.crawl.pages
  WHERE date = '2025-07-01'
  GROUP BY client
)

-- One row per (page, detected technology, category of that technology).
FROM httparchive.crawl.pages,
  UNNEST(technologies) AS technology,
  UNNEST(technology.categories) AS category
|> WHERE
  date = '2025-07-01' AND
  category = 'Browser fingerprinting'
|> AGGREGATE
  COUNT(DISTINCT root_page) AS websites_count
  GROUP BY client, technology.technology
|> JOIN base_totals USING (client)
|> EXTEND websites_count / websites_total AS websites_pct
|> DROP websites_total
|> PIVOT(
  ANY_VALUE(websites_count) AS websites_count,
  ANY_VALUE(websites_pct) AS websites_pct
  FOR client IN ('desktop', 'mobile')
)
|> RENAME websites_pct_mobile AS mobile, websites_pct_desktop AS desktop
-- A technology seen on only one client has a NULL count for the other; without
-- COALESCE the sum would be NULL and the row would sort last.
|> ORDER BY COALESCE(websites_count_mobile, 0) + COALESCE(websites_count_desktop, 0) DESC

0 commit comments

Comments
 (0)