diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint
index 6b46d7370db..c576a485feb 100644
--- a/.github/linters/.python-lint
+++ b/.github/linters/.python-lint
@@ -31,10 +31,6 @@ persistent=yes
# Specify a configuration file.
#rcfile=
-# When enabled, pylint would attempt to guess common misconfiguration and emit
-# user-friendly hints instead of false-positive error messages
-suggestion-mode=yes
-
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
@@ -470,5 +466,6 @@ valid-metaclass-classmethod-first-arg=mcs
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception
+# ["builtins.BaseException", "builtins.Exception"]
+overgeneral-exceptions = builtins.BaseException,
+ builtins.Exception
diff --git a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
index e55b352eadf..7a57ed673bd 100644
--- a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
+++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
@@ -1,24 +1,28 @@
# Pages that participate in the privacy-relayed origin trials
-CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS (
- SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70))
-);
-
-CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING)
-RETURNS STRUCT<
+CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
token STRING,
origin STRING,
feature STRING,
expiry TIMESTAMP,
is_subdomain BOOL,
is_third_party BOOL
-> AS (
- STRUCT(
- DECODE_ORIGIN_TRIAL(token) AS token,
- JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin,
- JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature,
- TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry,
- JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain,
- JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party
+>
+DETERMINISTIC AS (
+ (
+ WITH decoded_token AS (
+ SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
+ )
+
+ SELECT
+ STRUCT(
+ decoded AS token,
+ JSON_VALUE(decoded, '$.origin') AS origin,
+ JSON_VALUE(decoded, '$.feature') AS feature,
+ TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
+ JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
+ JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
+ )
+ FROM decoded_token
)
);
diff --git a/sql/2025/privacy/bounce_domains_top.sql b/sql/2025/privacy/bounce_domains_top.sql
new file mode 100644
index 00000000000..0d7fa802f0f
--- /dev/null
+++ b/sql/2025/privacy/bounce_domains_top.sql
@@ -0,0 +1,78 @@
+-- noqa: disable=PRS
+-- Detection logic explained:
+-- https://github.com/privacycg/proposals/issues/6
+-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md
+
+WITH redirect_requests AS (
+ FROM `httparchive.crawl.requests`
+ |> WHERE
+ date = '2025-07-01' AND
+ --rank = 1000 AND
+ SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND
+ index <= 2
+ |> JOIN UNNEST(response_headers) AS header
+ |> WHERE LOWER(header.name) = 'location'
+ |> SELECT
+ client,
+ url,
+ index,
+ NET.REG_DOMAIN(header.value) AS location_domain,
+ root_page
+),
+
+-- Find the first navigation redirect
+navigation_redirect AS (
+ FROM redirect_requests
+ |> WHERE
+ index = 1 AND
+ NET.REG_DOMAIN(root_page) = NET.REG_DOMAIN(url) AND
+ NET.REG_DOMAIN(url) != location_domain
+ |> SELECT
+ client,
+ root_page,
+ location_domain AS bounce_domain
+),
+
+-- Find the second navigation redirect
+bounce_redirect AS (
+ FROM redirect_requests
+ |> WHERE
+ index = 2 AND
+ NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND
+ NET.REG_DOMAIN(url) != location_domain
+ |> SELECT
+ client,
+ url,
+ root_page,
+ location_domain AS bounce_redirect_location_domain
+),
+
+-- Combine the first and second navigation redirects
+bounce_sequences AS (
+ FROM navigation_redirect AS nav
+ |> JOIN bounce_redirect AS bounce
+ ON
+ nav.client = bounce.client AND
+ nav.root_page = bounce.root_page
+ |> AGGREGATE COUNT(DISTINCT nav.root_page) AS websites_count
+ GROUP BY nav.client, bounce_domain
+),
+
+websites_total AS (
+ FROM `httparchive.crawl.pages`
+ |> WHERE date = '2025-07-01' --AND rank = 1000
+ |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client
+)
+
+FROM bounce_sequences
+|> JOIN websites_total USING (client)
+|> EXTEND websites_count / total_websites AS websites_pct
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(websites_count) AS cnt,
+ ANY_VALUE(websites_pct) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop, cnt_mobile AS mobile_count, cnt_desktop AS desktop_count
+|> ORDER BY COALESCE(mobile_count, 0) + COALESCE(desktop_count, 0) DESC
+|> LIMIT 100
diff --git a/sql/2025/privacy/client_hints_top.sql b/sql/2025/privacy/client_hints_top.sql
new file mode 100644
index 00000000000..9a3033b5061
--- /dev/null
+++ b/sql/2025/privacy/client_hints_top.sql
@@ -0,0 +1,44 @@
+-- noqa: disable=PRS
+WITH totals AS (
+ FROM `httparchive.crawl.pages`
+ |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
+ |> AGGREGATE COUNT(*) AS total_websites GROUP BY client
+),
+
+/* Get Accept-CH Headers */
+headers AS (
+ FROM `httparchive.crawl.requests`
+ |> WHERE date = '2025-07-01' AND is_root_page AND is_main_document --AND rank = 1000
+ |> JOIN UNNEST(response_headers) AS header
+ |> WHERE LOWER(header.name) = 'accept-ch'
+ |> LEFT JOIN UNNEST(SPLIT(LOWER(header.value), ',')) AS header_value
+ |> SELECT client, root_page, header_value
+
+),
+
+/* Get Accept-CH Meta Tags */
+meta_tags AS (
+ FROM `httparchive.crawl.pages`
+ |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000
+ |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
+ |> EXTEND LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name
+ |> WHERE tag_name = 'accept-ch'
+ |> LEFT JOIN UNNEST(SPLIT(LOWER(SAFE.STRING(meta_node.content)), ',')) AS tag_value
+ |> SELECT client, root_page, tag_value
+)
+
+FROM headers
+|> FULL OUTER JOIN meta_tags USING (client, root_page)
+|> JOIN totals USING (client)
+|> EXTEND TRIM(COALESCE(header_value, tag_value)) AS value
+|> AGGREGATE
+COUNT(DISTINCT root_page) AS number_of_websites,
+COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
+GROUP BY client, value
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/client_hints_usage.sql b/sql/2025/privacy/client_hints_usage.sql
new file mode 100644
index 00000000000..00c4d5efeb8
--- /dev/null
+++ b/sql/2025/privacy/client_hints_usage.sql
@@ -0,0 +1,54 @@
+-- noqa: disable=PRS
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ --AND rank = 1000
+ GROUP BY client
+),
+
+accept_ch_headers AS (
+ SELECT DISTINCT
+ client,
+ root_page
+ FROM `httparchive.crawl.requests`,
+ UNNEST(response_headers) response_header
+ WHERE
+ date = '2025-07-01' AND
+ is_main_document = TRUE AND
+ --rank = 1000 AND
+ LOWER(response_header.name) = 'accept-ch'
+),
+
+accept_ch_meta AS (
+ SELECT DISTINCT
+ client,
+ root_page
+ FROM `httparchive.crawl.pages`,
+ UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node
+ WHERE date = '2025-07-01'
+ --AND rank = 1000
+ AND LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch'
+),
+
+-- Combine both sources
+all_accept_ch AS (
+ SELECT client, root_page FROM accept_ch_headers
+ UNION DISTINCT
+ SELECT client, root_page FROM accept_ch_meta
+)
+
+FROM all_accept_ch
+|> JOIN base_totals USING (client)
+|> AGGREGATE
+ COUNT(DISTINCT all_accept_ch.root_page) AS number_of_websites,
+ COUNT(DISTINCT all_accept_ch.root_page) / ANY_VALUE(base_totals.total_websites) AS pct_websites
+GROUP BY all_accept_ch.client
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
diff --git a/sql/2025/privacy/cookie_domains_third_party_top.sql b/sql/2025/privacy/cookie_domains_third_party_top.sql
new file mode 100644
index 00000000000..c38269479b8
--- /dev/null
+++ b/sql/2025/privacy/cookie_domains_third_party_top.sql
@@ -0,0 +1,22 @@
+FROM `httparchive.crawl.pages`
+|> WHERE date = '2025-07-01' -- AND rank = 1000
+|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
+|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
+|> EXTEND
+NET.HOST(root_page) AS firstparty_domain,
+NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain
+|> WHERE NOT ENDS_WITH('.' || firstparty_domain, '.' || cookie_domain)
+|> AGGREGATE
+COUNT(DISTINCT firstparty_domain) AS domain_count,
+COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains
+GROUP BY client, cookie_domain
+|> PIVOT (
+ ANY_VALUE(domain_count) AS domain_count,
+ ANY_VALUE(pct_domains) AS pct_domains
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME
+pct_domains_mobile AS mobile,
+pct_domains_desktop AS desktop
+|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
+|> LIMIT 1000
diff --git a/sql/2025/privacy/cookies_first_party_top.sql b/sql/2025/privacy/cookies_first_party_top.sql
new file mode 100644
index 00000000000..62f6cb0d2c7
--- /dev/null
+++ b/sql/2025/privacy/cookies_first_party_top.sql
@@ -0,0 +1,27 @@
+/* Most common cookie names, by number of domains on which they appear.
+Goal is to identify common trackers that use first-party cookies across sites.
+*/
+
+FROM `httparchive.crawl.pages`
+|> WHERE date = '2025-07-01' -- AND rank = 1000
+|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
+|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
+|> EXTEND
+NET.HOST(root_page) AS firstparty_domain,
+NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain,
+SAFE.STRING(cookie.name) AS cookie_name
+|> WHERE ENDS_WITH('.' || firstparty_domain, '.' || cookie_domain)
+|> AGGREGATE
+COUNT(DISTINCT firstparty_domain) AS domain_count,
+COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains
+GROUP BY client, cookie_name
+|> PIVOT (
+ ANY_VALUE(domain_count) AS domain_count,
+ ANY_VALUE(pct_domains) AS pct_domains
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME
+pct_domains_mobile AS mobile,
+pct_domains_desktop AS desktop
+|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
+|> LIMIT 1000
diff --git a/sql/2025/privacy/cookies_third_party_top.sql b/sql/2025/privacy/cookies_third_party_top.sql
new file mode 100644
index 00000000000..dff37874242
--- /dev/null
+++ b/sql/2025/privacy/cookies_third_party_top.sql
@@ -0,0 +1,23 @@
+FROM `httparchive.crawl.pages`
+|> WHERE date = '2025-07-01' -- AND rank = 1000
+|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
+|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
+|> EXTEND
+NET.HOST(root_page) AS firstparty_domain,
+NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain,
+NET.HOST(SAFE.STRING(cookie.domain)) || ' / ' || SAFE.STRING(cookie.name) AS cookie_details
+|> WHERE NOT ENDS_WITH('.' || firstparty_domain, '.' || cookie_domain)
+|> AGGREGATE
+COUNT(DISTINCT firstparty_domain) AS domain_count,
+COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains
+GROUP BY client, cookie_details
+|> PIVOT (
+ ANY_VALUE(domain_count) AS domain_count,
+ ANY_VALUE(pct_domains) AS pct_domains
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME
+pct_domains_mobile AS mobile,
+pct_domains_desktop AS desktop
+|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC
+|> LIMIT 1000
diff --git a/sql/2025/privacy/dnt_usage.sql b/sql/2025/privacy/dnt_usage.sql
new file mode 100644
index 00000000000..c713043bf79
--- /dev/null
+++ b/sql/2025/privacy/dnt_usage.sql
@@ -0,0 +1,19 @@
+-- Pages that use DNT feature
+
+FROM `httparchive.blink_features.usage`
+|> WHERE
+ date = '2025-07-01' AND
+ --rank <= 10000 AND
+ feature = 'NavigatorDoNotTrack'
+|> SELECT DISTINCT
+ client,
+ rank,
+ num_urls,
+ pct_urls
+|> PIVOT (
+ ANY_VALUE(num_urls) AS pages_count,
+ ANY_VALUE(pct_urls) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY rank ASC
diff --git a/sql/2025/privacy/fingerprinting_top.sql b/sql/2025/privacy/fingerprinting_top.sql
new file mode 100644
index 00000000000..02dd77606d3
--- /dev/null
+++ b/sql/2025/privacy/fingerprinting_top.sql
@@ -0,0 +1,31 @@
+-- noqa: disable=PRS
+-- Percent of websites using a fingerprinting library based on wappalyzer category
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS websites_total
+ FROM httparchive.crawl.pages
+ WHERE date = '2025-07-01'
+ GROUP BY client
+)
+
+FROM httparchive.crawl.pages,
+ UNNEST(technologies) AS technology,
+ UNNEST(technology.categories) AS category
+|> WHERE
+ date = '2025-07-01' AND
+ category = 'Browser fingerprinting'
+|> AGGREGATE
+ COUNT(DISTINCT root_page) AS websites_count
+GROUP BY client, technology.technology
+|> JOIN base_totals USING (client)
+|> EXTEND websites_count / websites_total AS websites_pct
+|> DROP websites_total
+|> PIVOT(
+ ANY_VALUE(websites_count) AS websites_count,
+ ANY_VALUE(websites_pct) AS websites_pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME websites_pct_mobile AS mobile, websites_pct_desktop AS desktop
+|> ORDER BY websites_count_mobile + websites_count_desktop DESC
diff --git a/sql/2025/privacy/iab_tcf_v2_cmps_top.sql b/sql/2025/privacy/iab_tcf_v2_cmps_top.sql
new file mode 100644
index 00000000000..3768bc77078
--- /dev/null
+++ b/sql/2025/privacy/iab_tcf_v2_cmps_top.sql
@@ -0,0 +1,21 @@
+-- noqa: disable=PRS
+-- Counts of CMPs using IAB Transparency & Consent Framework
+-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata
+-- CMP vendor list: https://iabeurope.eu/cmp-list/
+
+FROM `httparchive.crawl.pages`
+|> WHERE date = '2025-07-01' --AND rank = 1000
+|> EXTEND
+SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId,
+COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS total_websites
+|> AGGREGATE
+COUNT(DISTINCT root_page) AS number_of_websites,
+COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites
+GROUP BY client, cmpId
+|> PIVOT (
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/iab_tcf_v2_countries_top.sql b/sql/2025/privacy/iab_tcf_v2_countries_top.sql
new file mode 100644
index 00000000000..0f2115b7254
--- /dev/null
+++ b/sql/2025/privacy/iab_tcf_v2_countries_top.sql
@@ -0,0 +1,39 @@
+-- noqa: disable=PRS
+-- Counts of countries for publishers using IAB Transparency & Consent Framework
+-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata
+-- "Country code of the country that determines the legislation of
+-- reference. Normally corresponds to the country code of the country
+-- in which the publisher's business entity is established."
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01' --AND rank = 1000
+ GROUP BY client
+),
+
+base_data AS (
+ SELECT
+ client,
+ root_page,
+ UPPER(SAFE.STRING(custom_metrics.privacy.iab_tcf_v2.data.publisherCC)) AS publisherCC
+ FROM `httparchive.crawl.pages`
+ WHERE
+ date = '2025-07-01' AND --rank = 1000 AND
+ JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object'
+)
+
+FROM base_data
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, publisherCC
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_of_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_of_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/iab_usage.sql b/sql/2025/privacy/iab_usage.sql
new file mode 100644
index 00000000000..272384e8d72
--- /dev/null
+++ b/sql/2025/privacy/iab_usage.sql
@@ -0,0 +1,57 @@
+-- noqa: disable=PRS
+-- Counts of websites with IAB Frameworks
+
+WITH base_data AS (
+ SELECT
+ client,
+ root_page,
+ SAFE.BOOL(custom_metrics.privacy.iab_tcf_v1.present) AS tcfv1,
+ SAFE.BOOL(custom_metrics.privacy.iab_tcf_v2.present) AS tcfv2,
+ SAFE.BOOL(custom_metrics.privacy.iab_gpp.present) AS gpp,
+ SAFE.BOOL(custom_metrics.privacy.iab_usp.present) AS usp,
+ SAFE.BOOL(custom_metrics.privacy.iab_tcf_v1.compliant_setup) AS tcfv1_compliant,
+ SAFE.BOOL(custom_metrics.privacy.iab_tcf_v2.compliant_setup) AS tcfv2_compliant,
+ custom_metrics.privacy.iab_gpp.data IS NOT NULL AS gpp_data
+ FROM `httparchive.crawl.pages`
+ WHERE
+ date = '2025-07-01'
+ --AND rank = 1000
+),
+
+aggregated AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites,
+ COUNT(DISTINCT IF(tcfv1, root_page, NULL)) AS websites_with_tcfv1,
+ COUNT(DISTINCT IF(tcfv2, root_page, NULL)) AS websites_with_tcfv2,
+ COUNT(DISTINCT IF(usp, root_page, NULL)) AS websites_with_usp,
+ COUNT(DISTINCT IF(tcfv1 OR tcfv2, root_page, NULL)) AS websites_with_tcf,
+ COUNT(DISTINCT IF(tcfv1 OR tcfv2 OR usp OR gpp, root_page, NULL)) AS websites_with_any,
+ COUNT(DISTINCT IF(tcfv1 AND tcfv1_compliant, root_page, NULL)) AS websites_with_tcfv1_compliant,
+ COUNT(DISTINCT IF(tcfv2 AND tcfv2_compliant, root_page, NULL)) AS websites_with_tcfv2_compliant,
+ COUNT(DISTINCT IF(gpp, root_page, NULL)) AS websites_with_gpp,
+ COUNT(DISTINCT IF(gpp_data, root_page, NULL)) AS websites_with_gpp_data
+ FROM base_data
+ GROUP BY client
+)
+
+FROM aggregated,
+ UNNEST([
+ STRUCT('tcfv1' AS metric, websites_with_tcfv1 / total_websites AS pct_websites, websites_with_tcfv1 AS number_of_websites),
+ STRUCT('tcfv2', websites_with_tcfv2 / total_websites, websites_with_tcfv2),
+ STRUCT('usp', websites_with_usp / total_websites, websites_with_usp),
+ STRUCT('tcf', websites_with_tcf / total_websites, websites_with_tcf),
+ STRUCT('any_framework', websites_with_any / total_websites, websites_with_any),
+ STRUCT('tcfv1_compliant', websites_with_tcfv1_compliant / total_websites, websites_with_tcfv1_compliant),
+ STRUCT('tcfv2_compliant', websites_with_tcfv2_compliant / total_websites, websites_with_tcfv2_compliant),
+ STRUCT('gpp', websites_with_gpp / total_websites, websites_with_gpp),
+ STRUCT('gpp_data_available', websites_with_gpp_data / total_websites, websites_with_gpp_data)
+ ]) AS metric
+|> SELECT client, metric.metric, metric.pct_websites, metric.number_of_websites
+|> PIVOT(
+ ANY_VALUE(pct_websites) AS pct,
+ ANY_VALUE(number_of_websites) AS websites_count
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY websites_count_desktop + websites_count_mobile DESC
diff --git a/sql/2025/privacy/iab_usp_strings_top.sql b/sql/2025/privacy/iab_usp_strings_top.sql
new file mode 100644
index 00000000000..c8c53b13d2e
--- /dev/null
+++ b/sql/2025/privacy/iab_usp_strings_top.sql
@@ -0,0 +1,28 @@
+-- noqa: disable=PRS
+-- Counts of US Privacy String values for websites using IAB US Privacy Framework
+-- cf. https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ GROUP BY client
+)
+
+FROM `httparchive.crawl.pages`
+|> WHERE date = '2025-07-01'
+|> EXTEND UPPER(SAFE.STRING(custom_metrics.privacy.iab_usp.privacy_string.uspString)) AS uspString
+|> WHERE uspString IS NOT NULL
+|> AGGREGATE COUNT(DISTINCT root_page) AS websites_count GROUP BY client, uspString
+|> JOIN base_totals USING (client)
+|> EXTEND websites_count / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(websites_count) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/referrer_policy_top.sql b/sql/2025/privacy/referrer_policy_top.sql
new file mode 100644
index 00000000000..0e6ff60b66f
--- /dev/null
+++ b/sql/2025/privacy/referrer_policy_top.sql
@@ -0,0 +1,60 @@
+-- noqa: disable=PRS
+-- Most common values for Referrer-Policy (at site level)
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01' --AND rank = 1000
+ GROUP BY client
+),
+
+referrer_policy_custom_metrics AS (
+ SELECT
+ client,
+ root_page,
+ LOWER(TRIM(policy_meta)) AS policy_meta
+ FROM `httparchive.crawl.pages`,
+ UNNEST(SPLIT(SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy), ',')) AS policy_meta
+ WHERE date = '2025-07-01' --AND rank = 1000
+),
+
+response_headers AS (
+ SELECT
+ client,
+ root_page,
+ LOWER(response_header.name) AS name,
+ LOWER(response_header.value) AS value
+ FROM `httparchive.crawl.requests`,
+ UNNEST(response_headers) AS response_header
+ WHERE
+ date = '2025-07-01' AND
+ is_main_document = TRUE
+ --AND rank = 1000
+),
+
+referrer_policy_headers AS (
+ SELECT
+ client,
+ root_page,
+ TRIM(policy_header) AS policy_header
+ FROM response_headers,
+ UNNEST(SPLIT(value, ',')) AS policy_header
+ WHERE name = 'referrer-policy'
+)
+
+FROM referrer_policy_custom_metrics
+|> FULL OUTER JOIN referrer_policy_headers USING (client, root_page)
+|> EXTEND COALESCE(policy_header, policy_meta) AS policy
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, policy
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/referrer_policy_usage.sql b/sql/2025/privacy/referrer_policy_usage.sql
new file mode 100644
index 00000000000..aa296a9217a
--- /dev/null
+++ b/sql/2025/privacy/referrer_policy_usage.sql
@@ -0,0 +1,58 @@
+-- noqa: disable=PRS
+
+WITH referrer_policy_custom_metrics AS (
+ SELECT
+ client,
+ root_page,
+ SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy) AS meta_policy,
+ ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics.privacy.referrerPolicy.individual_requests)) > 0 AS individual_requests,
+ SAFE.INT64(custom_metrics.privacy.referrerPolicy.link_relations.A) > 0 AS link_relations
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+),
+
+referrer_policy_headers AS (
+ SELECT
+ client,
+ root_page,
+ LOWER(response_header.value) AS header_policy
+ FROM `httparchive.crawl.requests`,
+ UNNEST(response_headers) AS response_header
+ WHERE
+ date = '2025-07-01' AND
+ is_main_document = TRUE AND
+ response_header.name = 'referrer-policy'
+),
+
+aggregated AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites,
+ COUNT(DISTINCT IF(meta_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy_meta,
+ COUNT(DISTINCT IF(header_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy_header,
+ COUNT(DISTINCT IF(meta_policy IS NOT NULL OR header_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy,
+ COUNT(DISTINCT IF(individual_requests, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_individual_requests,
+ COUNT(DISTINCT IF(link_relations, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_link_relations,
+ COUNT(DISTINCT IF(meta_policy IS NOT NULL OR header_policy IS NOT NULL OR individual_requests OR link_relations, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_referrer_policy
+ FROM referrer_policy_custom_metrics
+ FULL OUTER JOIN referrer_policy_headers
+ USING (client, root_page)
+ GROUP BY client
+)
+
+FROM aggregated,
+ UNNEST([
+ STRUCT('entire_document_policy_meta' AS metric, pct_entire_document_policy_meta AS pct),
+ STRUCT('entire_document_policy_header', pct_entire_document_policy_header),
+ STRUCT('entire_document_policy', pct_entire_document_policy),
+ STRUCT('any_individual_requests', pct_any_individual_requests),
+ STRUCT('any_link_relations', pct_any_link_relations),
+ STRUCT('any_referrer_policy', pct_any_referrer_policy)
+ ]) AS metric_data
+|> SELECT client, metric_data.metric, metric_data.pct
+|> PIVOT(
+ ANY_VALUE(pct) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY mobile + desktop DESC
diff --git a/sql/2025/privacy/related_origin_trials_top.sql b/sql/2025/privacy/related_origin_trials_top.sql
new file mode 100644
index 00000000000..a6542d1c904
--- /dev/null
+++ b/sql/2025/privacy/related_origin_trials_top.sql
@@ -0,0 +1,114 @@
+-- noqa: disable=PRS
+-- Pages that participate in the privacy-related origin trials
+
+CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT<
+ token STRING,
+ origin STRING,
+ feature STRING,
+ expiry TIMESTAMP,
+ is_subdomain BOOL,
+ is_third_party BOOL
+>
+DETERMINISTIC AS (
+ (
+ WITH decoded_token AS (
+ SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
+ )
+
+ SELECT
+ STRUCT(
+ decoded AS token,
+ JSON_VALUE(decoded, '$.origin') AS origin,
+ JSON_VALUE(decoded, '$.feature') AS feature,
+ TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
+ JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
+ JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
+ )
+ FROM decoded_token
+ )
+);
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE
+ date = '2025-07-01'
+ GROUP BY client
+),
+
+pages AS (
+ SELECT
+ client,
+ root_page,
+ custom_metrics.other.`origin-trials` AS ot_metrics,
+ custom_metrics.other.almanac AS almanac_metrics
+ FROM `httparchive.crawl.pages`
+ WHERE
+ date = '2025-07-01'
+),
+
+response_headers AS (
+ SELECT
+ client,
+ root_page,
+ PARSE_ORIGIN_TRIAL(response_header.value) AS ot
+ FROM `httparchive.crawl.requests`,
+ UNNEST(response_headers) response_header
+ WHERE
+ date = '2025-07-01' AND
+ is_main_document = TRUE AND
+ LOWER(response_header.name) = 'origin-trial'
+),
+
+meta_tags AS (
+ SELECT
+ client,
+ root_page,
+ PARSE_ORIGIN_TRIAL(SAFE.STRING(meta_node.content)) AS ot
+ FROM pages,
+ UNNEST(JSON_QUERY_ARRAY(almanac_metrics.`meta-nodes`.nodes)) meta_node
+ WHERE
+ LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'origin-trial'
+),
+
+ot_from_custom_metric AS (
+ SELECT
+ client,
+ root_page,
+ PARSE_ORIGIN_TRIAL(SAFE.STRING(metric.token)) AS ot
+ FROM pages,
+ UNNEST(JSON_QUERY_ARRAY(ot_metrics)) metric
+),
+
+aggregated AS (
+ SELECT
+ client,
+ ot.feature,
+ --ot.expiry >= CURRENT_TIMESTAMP() AS is_active,
+ COUNT(DISTINCT root_page) AS number_of_websites
+ FROM (
+ SELECT * FROM response_headers
+ UNION ALL
+ SELECT * FROM meta_tags
+ UNION ALL
+ SELECT * FROM ot_from_custom_metric
+ )
+ GROUP BY
+ client,
+ feature
+ --is_active
+)
+
+FROM aggregated
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/tracker_categories_top.sql b/sql/2025/privacy/tracker_categories_top.sql
new file mode 100644
index 00000000000..819106eb537
--- /dev/null
+++ b/sql/2025/privacy/tracker_categories_top.sql
@@ -0,0 +1,33 @@
+-- noqa: disable=PRS
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ GROUP BY client
+)
+
+FROM `httparchive.crawl.pages`,
+ UNNEST(technologies) AS tech,
+ UNNEST(tech.categories) AS category
+|> WHERE
+ date = '2025-07-01' AND
+ category IN (
+ 'Analytics', 'Browser fingerprinting', 'Customer data platform',
+ 'Geolocation',
+ 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation',
+ 'Cookie compliance'
+ )
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, category
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/tracker_distribution.sql b/sql/2025/privacy/tracker_distribution.sql
new file mode 100644
index 00000000000..1eeb7a5efa7
--- /dev/null
+++ b/sql/2025/privacy/tracker_distribution.sql
@@ -0,0 +1,54 @@
+-- noqa: disable=PRS
+-- Number of websites that deploy a certain number of trackers
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ --AND rank = 1000
+ GROUP BY client
+),
+
+whotracksme AS (
+ SELECT
+ NET.HOST(domain) AS domain,
+ tracker
+ FROM `httparchive.almanac.whotracksme`
+ WHERE date = '2025-07-01'
+ AND category IN ('advertising', 'pornvertising', 'site_analytics', 'social_media')
+),
+
+tracker_counts AS (
+ SELECT
+ client,
+ root_page,
+ COUNT(DISTINCT tracker) AS number_of_trackers
+ FROM `httparchive.crawl.requests`
+ LEFT JOIN whotracksme
+ ON
+ NET.HOST(url) = domain OR
+ ENDS_WITH(NET.HOST(url), CONCAT('.', domain))
+ WHERE
+ date = '2025-07-01'
+ --AND rank = 1000
+ AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3')
+ GROUP BY
+ client,
+ root_page
+)
+
+FROM tracker_counts
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, number_of_trackers
+|> EXTEND SUM(number_of_websites) OVER (PARTITION BY client ORDER BY number_of_trackers DESC) AS ccdf_websites
+|> JOIN base_totals USING (client)
+|> EXTEND ccdf_websites / total_websites AS ccdf
+|> DROP total_websites, number_of_websites
+|> PIVOT(
+ ANY_VALUE(ccdf_websites) AS websites_count,
+ ANY_VALUE(ccdf) AS ccdf
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME ccdf_mobile AS mobile, ccdf_desktop AS desktop
+|> ORDER BY number_of_trackers
diff --git a/sql/2025/privacy/tracker_technologies_top.sql b/sql/2025/privacy/tracker_technologies_top.sql
new file mode 100644
index 00000000000..876bb9c3ded
--- /dev/null
+++ b/sql/2025/privacy/tracker_technologies_top.sql
@@ -0,0 +1,33 @@
+-- noqa: disable=PRS
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ GROUP BY client
+)
+
+FROM `httparchive.crawl.pages`,
+ UNNEST(technologies) AS tech,
+ UNNEST(tech.categories) AS category
+|> WHERE
+ date = '2025-07-01' AND
+ category IN (
+ 'Analytics', 'Browser fingerprinting', 'Customer data platform',
+ 'Geolocation',
+ 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation',
+ 'Cookie compliance'
+ )
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, tech.technology
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/whotracksme_categories_top.sql b/sql/2025/privacy/whotracksme_categories_top.sql
new file mode 100644
index 00000000000..819c41eb535
--- /dev/null
+++ b/sql/2025/privacy/whotracksme_categories_top.sql
@@ -0,0 +1,66 @@
+-- noqa: disable=PRS
+-- Percent of websites that deploy at least one tracker from each tracker category
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM httparchive.crawl.pages
+ WHERE date = '2025-07-01'
+ GROUP BY client
+),
+
+whotracksme AS (
+ SELECT
+ NET.HOST(domain) AS domain,
+ category
+ FROM httparchive.almanac.whotracksme
+ WHERE date = '2025-07-01'
+),
+
+tracker_categories AS (
+ SELECT
+ client,
+ category,
+ root_page
+ FROM httparchive.crawl.requests
+ JOIN whotracksme
+ ON (
+ NET.HOST(url) = domain OR
+ ENDS_WITH(NET.HOST(url), CONCAT('.', domain))
+ )
+ WHERE
+ date = '2025-07-01'
+ AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3')
+),
+
+aggregated AS (
+ SELECT
+ client,
+ category,
+ COUNT(DISTINCT root_page) AS number_of_websites
+ FROM tracker_categories
+ GROUP BY
+ client,
+ category
+ UNION ALL
+ SELECT
+ client,
+ 'any' AS category,
+ COUNT(DISTINCT root_page) AS number_of_websites
+ FROM tracker_categories
+ GROUP BY
+ client
+)
+
+FROM aggregated
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/2025/privacy/whotracksme_trackers_top.sql b/sql/2025/privacy/whotracksme_trackers_top.sql
new file mode 100644
index 00000000000..01fffd14aaf
--- /dev/null
+++ b/sql/2025/privacy/whotracksme_trackers_top.sql
@@ -0,0 +1,37 @@
+-- noqa: disable=PRS
+
+WITH base_totals AS (
+ SELECT
+ client,
+ COUNT(DISTINCT root_page) AS total_websites
+ FROM `httparchive.crawl.pages`
+ WHERE date = '2025-07-01'
+ GROUP BY client
+),
+
+whotracksme AS (
+ SELECT
+ NET.HOST(domain) AS domain,
+ category || ' / ' || tracker AS tracker
+ FROM `httparchive.almanac.whotracksme`
+ WHERE date = '2025-07-01'
+)
+
+FROM `httparchive.crawl.requests`
+|> JOIN whotracksme
+ ON NET.HOST(url) = domain OR
+ ENDS_WITH(NET.HOST(url), '.' || domain)
+|> WHERE
+ date = '2025-07-01'
+ AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3')
+|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, tracker
+|> JOIN base_totals USING (client)
+|> EXTEND number_of_websites / total_websites AS pct_websites
+|> DROP total_websites
+|> PIVOT(
+ ANY_VALUE(number_of_websites) AS websites_count,
+ ANY_VALUE(pct_websites) AS pct
+ FOR client IN ('desktop', 'mobile')
+)
+|> RENAME pct_mobile AS mobile, pct_desktop AS desktop
+|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC
diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb
index e32047eecb3..e8d95822b24 100644
--- a/sql/util/bq_to_sheets.ipynb
+++ b/sql/util/bq_to_sheets.ipynb
@@ -1,216 +1,264 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cellView": "form",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "OVkCxlRQH6Yt",
- "outputId": "0e907d5e-3824-4b0c-935d-81e629702390"
- },
- "outputs": [],
- "source": [
- "# @title Download repo\n",
- "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cellView": "form",
- "id": "U37785Bxt5tE"
- },
- "outputs": [],
- "source": [
- "# @title Configure the chapter to process\n",
- "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n",
- "almanac_year = 2024 #@param {type: \"integer\"}\n",
- "chapter_name = 'privacy' #@param {type: \"string\"}\n",
- "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cellView": "form",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "UzhgG5xvbQ1E",
- "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619"
- },
- "outputs": [],
- "source": [
- "# @title Update chapter branch\n",
- "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n",
- "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cellView": "form",
- "id": "45dBifFPJAtO"
- },
- "outputs": [],
- "source": [
- "# @title Authenticate\n",
- "import google.auth\n",
- "import os\n",
- "from google.colab import auth\n",
- "from google.cloud import bigquery\n",
- "\n",
- "import gspread\n",
- "from gspread_dataframe import set_with_dataframe\n",
- "\n",
- "\n",
- "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n",
- "auth.authenticate_user()\n",
- "credentials, project = google.auth.default()\n",
- "client = bigquery.Client()\n",
- "gc = gspread.authorize(credentials)\n",
- "\n",
- "try:\n",
- " ss = gc.open_by_url(spreadsheet_url)\n",
- "except:\n",
- " print('Spreadsheet not found')"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "U37785Bxt5tE"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Configuration\n",
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n",
+ "almanac_year = 2025 #@param {type: \"integer\"}\n",
+ "chapter_name = 'privacy' #@param {type: \"string\"}\n",
+ "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}\n",
+ "\n",
+ "IN_COLAB = 'google.colab' in sys.modules\n",
+ "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cellView": "form",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "collapsed": true,
- "id": "nblNil985Tjt",
- "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2"
- },
- "outputs": [],
- "source": [
- "# @title Upload query results\n",
- "\n",
- "import glob\n",
- "import re\n",
- "from tabulate import tabulate\n",
- "from IPython.display import clear_output\n",
- "\n",
- "\n",
- "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n",
- "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n",
- "dry_run = True # @param {type: \"boolean\"}\n",
- "overwrite_sheets = True # @param {type: \"boolean\"}\n",
- "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n",
- "\n",
- "filename_include_regexp = r'{}'.format(filename_match)\n",
- "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n",
- "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n",
- " year=almanac_year,\n",
- " chapter=chapter_name.lower()\n",
- ")\n",
- "existing_sheets = [s.title for s in ss.worksheets()]\n",
- "\n",
- "# Print formatted logs\n",
- "queries_processed_log = []\n",
- "def print_logs_table(log=None, append=True):\n",
- " if log:\n",
- " queries_processed_log.append(log)\n",
- " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n",
- " if not append:\n",
- " del queries_processed_log[-1]\n",
- " clear_output(wait=True)\n",
- " print(table)\n",
- "\n",
- "# Find matching SQL queries and save results to Google Sheets.\n",
- "for filepath in sorted(glob.iglob(folder)):\n",
- " filename = filepath.split('/')[-1]\n",
- "\n",
- " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n",
- "\n",
- " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n",
- "\n",
- " with open(filepath) as f:\n",
- " query = f.read()\n",
- "\n",
- " try:\n",
- " response = client.query(\n",
- " query,\n",
- " job_config = bigquery.QueryJobConfig(dry_run = True)\n",
- " )\n",
- " except Exception as e:\n",
- " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n",
- " continue\n",
- "\n",
- " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n",
- " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n",
- "\n",
- " if sheet_title in existing_sheets:\n",
- " if overwrite_sheets:\n",
- " st = ss.worksheet(sheet_title)\n",
- " else:\n",
- " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n",
- " continue\n",
- "\n",
- " if dry_run:\n",
- " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n",
- " continue\n",
- "\n",
- " try:\n",
- " if maximum_tb_billed:\n",
- " response = client.query(\n",
- " query,\n",
- " job_config = bigquery.QueryJobConfig(\n",
- " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n",
- " )\n",
- " )\n",
- " else:\n",
- " response = client.query(query)\n",
- "\n",
- " df = response.to_dataframe()\n",
- " if ('st' not in locals() or st.title != sheet_title):\n",
- " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n",
- " set_with_dataframe(st, df, resize=False)\n",
- "\n",
- " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n",
- " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n",
- "\n",
- " except Exception as e:\n",
- " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n",
- " continue\n",
- "\n",
- " else:\n",
- " print_logs_table([filename, None, None, 'Filename mismatch'])"
- ]
- }
- ],
- "metadata": {
+ "id": "OVkCxlRQH6Yt",
+ "outputId": "0e907d5e-3824-4b0c-935d-81e629702390"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download repo\n",
+ "if IN_COLAB:\n",
+ " !git clone https://github.com/HTTPArchive/almanac.httparchive.org.git\n",
+ " !cd almanac.httparchive.org/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
"colab": {
- "provenance": []
+ "base_uri": "https://localhost:8080/"
},
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
+ "id": "UzhgG5xvbQ1E",
+ "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Update chapter branch\n",
+ "if IN_COLAB:\n",
+ " branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n",
+ " print(f\"Switching to branch: {branch_name}\")\n",
+ " !git checkout $branch_name && git pull"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# @title Authenticate in Google Colab\n",
+ "if IN_COLAB:\n",
+ " from google.colab import auth\n",
+ " auth.authenticate_user()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "45dBifFPJAtO"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Setup BigQuery and Google Sheets clients\n",
+ "import google.auth\n",
+ "from google.cloud import bigquery\n",
+ "import gspread\n",
+ "\n",
+ "\n",
+ "# Authenticate with required scopes for BigQuery and Google Sheets\n",
+ "SCOPES = [\n",
+ " 'https://www.googleapis.com/auth/spreadsheets',\n",
+ " 'https://www.googleapis.com/auth/drive',\n",
+ " 'https://www.googleapis.com/auth/bigquery'\n",
+ "]\n",
+ "\n",
+ "credentials, project = google.auth.default(scopes=SCOPES)\n",
+ "client = bigquery.Client(credentials=credentials)\n",
+ "gc = gspread.authorize(credentials)\n",
+ "\n",
+ "# Connect to spreadsheet\n",
+ "ss = gc.open_by_url(spreadsheet_url)\n",
+ "existing_sheets = [s.title for s in ss.worksheets()]\n",
+ "print(f\"ā Connected to spreadsheet with {len(existing_sheets)} existing sheets\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- "language_info": {
- "name": "python",
- "version": "3.12.4"
- }
+ "collapsed": true,
+ "id": "nblNil985Tjt",
+ "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Upload query results to Google Sheets\n",
+ "import glob\n",
+ "import re\n",
+ "from gspread_dataframe import set_with_dataframe\n",
+ "from IPython.display import clear_output\n",
+ "from tabulate import tabulate\n",
+ "\n",
+ "\n",
+ "# Query filters and options\n",
+ "filename_match = 'whotracksme.+' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n",
+ "filename_match_exclude = '' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n",
+ "dry_run = False # @param {type: \"boolean\"}\n",
+ "overwrite_sheets = True # @param {type: \"boolean\"}\n",
+ "maximum_tb_billed = 7 # @param {type: \"raw\", placeholder: \"Max TB to bill per query\"}\n",
+ "\n",
+ "# Setup file filters\n",
+ "filename_include_regexp = r'.*' if not filename_match or filename_match == '*' else filename_match\n",
+ "filename_exclude_regexp = r'^$' if not filename_match_exclude else filename_match_exclude\n",
+ "\n",
+ "# Build path to SQL files\n",
+ "sql_folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n",
+ "print(f\"Looking for SQL files in: {sql_folder}\")\n",
+ "sql_files = sorted(glob.glob(sql_folder))\n",
+ "\n",
+ "if not sql_files:\n",
+ " print(\"ā No SQL files found. Check the folder path.\")\n",
+ "else:\n",
+ " print(f\"Found {len(sql_files)} SQL files\\n\")\n",
+ "\n",
+ " # Progress tracking\n",
+ " queries_processed_log = []\n",
+ "\n",
+ " def log_result(filename, tb_processed=None, sheet_name=None, skip_reason=None, preview=False):\n",
+ " \"\"\"Add result to log and display table\"\"\"\n",
+ " log_entry = [filename, tb_processed, sheet_name, skip_reason]\n",
+ " if not preview:\n",
+ " queries_processed_log.append(log_entry)\n",
+ "\n",
+ " # Build table from current log plus preview entry if needed\n",
+ " display_log = queries_processed_log if not preview else queries_processed_log + [log_entry]\n",
+ " table = tabulate(display_log, headers=['Query', 'TB Billed', 'Sheet', 'Status/Skip Reason'], tablefmt=\"grid\")\n",
+ " clear_output(wait=True)\n",
+ " print(table)\n",
+ "\n",
+ " # Process each SQL file\n",
+ " for filepath in sql_files:\n",
+ " filename = os.path.basename(filepath)\n",
+ "\n",
+ " # Show processing status\n",
+ " log_result(filename, 'Processing...', 'Processing...', 'Processing...', preview=True)\n",
+ "\n",
+ " # Check if filename matches filters\n",
+ " if not re.search(filename_include_regexp, filename) or re.search(filename_exclude_regexp, filename):\n",
+ " log_result(filename, None, None, 'Filename filter mismatch')\n",
+ " continue\n",
+ "\n",
+ " # Read query\n",
+ " with open(filepath) as f:\n",
+ " query = f.read()\n",
+ "\n",
+ " # Estimate query cost (dry run)\n",
+ " try:\n",
+ " dry_run_response = client.query(query, job_config=bigquery.QueryJobConfig(dry_run=True))\n",
+ " tb_processed = dry_run_response.total_bytes_processed / 1024**4\n",
+ " except Exception as e:\n",
+ " log_result(filename, None, None, f'Dry run error: {str(e)[:100]}...')\n",
+ " continue\n",
+ "\n",
+ " # Generate sheet title from filename\n",
+ " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n",
+ "\n",
+ " # Skip execution if dry run mode\n",
+ " if dry_run:\n",
+ " log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Dry run mode')\n",
+ " continue\n",
+ "\n",
+ " # Check if sheet already exists\n",
+ " if sheet_title in existing_sheets and not overwrite_sheets:\n",
+ " log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Sheet exists (set overwrite_sheets=True)')\n",
+ " continue\n",
+ "\n",
+ " # Execute query and upload to Sheets\n",
+ " try:\n",
+ " # Run query with billing limit\n",
+ " job_config = bigquery.QueryJobConfig()\n",
+ " if maximum_tb_billed:\n",
+ " job_config.maximum_bytes_billed = int(maximum_tb_billed * 1024**4)\n",
+ "\n",
+ " query_response = client.query(query, job_config=job_config)\n",
+ " df = query_response.to_dataframe()\n",
+ "\n",
+ " # Get or create sheet\n",
+ " if sheet_title in existing_sheets:\n",
+ " sheet = ss.worksheet(sheet_title)\n",
+ " else:\n",
+ " sheet = ss.add_worksheet(sheet_title, rows=1, cols=1)\n",
+ " existing_sheets.append(sheet_title)\n",
+ "\n",
+ " # Upload data\n",
+ " set_with_dataframe(sheet, df, resize=False)\n",
+ "\n",
+ " tb_billed = query_response.total_bytes_billed / 1024**4\n",
+ " log_result(filename, f'{tb_billed:.3f}', sheet_title, 'ā Uploaded')\n",
+ "\n",
+ " except Exception as e:\n",
+ " log_result(filename, f'{tb_billed:.3f}', None, f'Query error: {str(e)[:100]}...')\n",
+ "\n",
+ " print(f\"\\nā Processed {len(queries_processed_log)} queries\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.14.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}
diff --git a/sql/util/bq_writer.py b/sql/util/bq_writer.py
index 3355dcfcb94..a72b6784ee0 100644
--- a/sql/util/bq_writer.py
+++ b/sql/util/bq_writer.py
@@ -18,7 +18,6 @@ def write_to_bq(df, table_id, schema, write_disposition="WRITE_APPEND"):
client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
- source_format=bigquery.SourceFormat.CSV,
write_disposition=write_disposition,
schema=schema,
)
diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py
index 1ea1bb7f75f..a89133467d9 100644
--- a/sql/util/haveibeenpwned.py
+++ b/sql/util/haveibeenpwned.py
@@ -1,50 +1,26 @@
"""
-This module retrieves data from the "haveibeenpwned" API and loads it into a BigQuery table.
+Retrieves breach data from the Have I Been Pwned API and loads it into BigQuery.
"""
-import json
-from datetime import datetime as DateTime
-
-import pandas
+import pandas as pd
import requests # pylint: disable=import-error
-from bq_writer import write_to_bq, bigquery
-
-
-# Retrieve data from the "haveibeenpwned" API
-breaches = json.loads(
- requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10).content
-)
-df = pandas.DataFrame(breaches)
-
-year = DateTime.now().year
-df["date"] = DateTime(year, 6, 1).date()
-df["Name"] = df["Name"].astype(str)
-df["Title"] = df["Title"].astype(str)
-df["Domain"] = df["Domain"].astype(str)
-df["BreachDate"] = pandas.to_datetime(
- df["BreachDate"], format="%Y-%m-%d", errors="coerce"
-).dt.date
-df["AddedDate"] = pandas.to_datetime(
- df["AddedDate"], format="%Y-%m-%d", errors="coerce"
-).dt.date
-df["ModifiedDate"] = pandas.to_datetime(
- df["ModifiedDate"], format="%Y-%m-%d", errors="coerce"
-).dt.date
-df["Description"] = df["Description"].astype(str)
-df["LogoPath"] = df["LogoPath"].astype(str)
-df["DataClasses"] = df["DataClasses"].apply(json.dumps)
+from bq_writer import bigquery, write_to_bq
-# Append to httparchive.almanac.breaches
+response = requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10)
+breaches = response.json()
+df = pd.DataFrame(breaches)
+df["BreachDate"] = pd.to_datetime(df["BreachDate"], errors="coerce")
+df["AddedDate"] = pd.to_datetime(df["AddedDate"], errors="coerce")
+df["ModifiedDate"] = pd.to_datetime(df["ModifiedDate"], errors="coerce")
schema = [
- bigquery.SchemaField("date", "DATE"),
bigquery.SchemaField("Name", "STRING"),
bigquery.SchemaField("Title", "STRING"),
bigquery.SchemaField("Domain", "STRING"),
bigquery.SchemaField("BreachDate", "DATE"),
- bigquery.SchemaField("AddedDate", "DATE"),
- bigquery.SchemaField("ModifiedDate", "DATE"),
+ bigquery.SchemaField("AddedDate", "TIMESTAMP"),
+ bigquery.SchemaField("ModifiedDate", "TIMESTAMP"),
bigquery.SchemaField("PwnCount", "INTEGER"),
bigquery.SchemaField("Description", "STRING"),
bigquery.SchemaField("LogoPath", "STRING"),
@@ -53,7 +29,12 @@
bigquery.SchemaField("IsSensitive", "BOOLEAN"),
bigquery.SchemaField("IsRetired", "BOOLEAN"),
bigquery.SchemaField("IsSpamList", "BOOLEAN"),
- bigquery.SchemaField("DataClasses", "STRING"),
+ bigquery.SchemaField("IsMalware", "BOOLEAN"),
+ bigquery.SchemaField("IsSubscriptionFree", "BOOLEAN"),
+ bigquery.SchemaField("IsStealerLog", "BOOLEAN"),
+ bigquery.SchemaField("DataClasses", "STRING", mode="REPEATED"),
+ bigquery.SchemaField("Attribution", "STRING"),
+ bigquery.SchemaField("DisclosureUrl", "STRING"),
]
-write_to_bq(df, "httparchive.almanac.breaches", schema)
+write_to_bq(df, "httparchive.almanac.breaches", schema, write_disposition="WRITE_TRUNCATE")
diff --git a/sql/util/requirements.txt b/sql/util/requirements.txt
new file mode 100644
index 00000000000..ac1288dc9f9
--- /dev/null
+++ b/sql/util/requirements.txt
@@ -0,0 +1,8 @@
+pandas==2.3.3
+google-cloud-bigquery==3.40.0
+requests==2.32.5
+tabulate==0.9.0
+gspread==6.2.1
+gspread-dataframe==4.0.0
+ipykernel==7.1.0
+db-dtypes==1.5.0
diff --git a/sql/util/whotracksme_trackers.py b/sql/util/whotracksme_trackers.py
index ec68f922e17..ad78ecf38ea 100644
--- a/sql/util/whotracksme_trackers.py
+++ b/sql/util/whotracksme_trackers.py
@@ -18,7 +18,7 @@
TRACKERS_QUERY = """
SELECT
- '2024-06-01' AS date,
+ '2025-07-01' AS date,
categories.name as category,
tracker,
domain
diff --git a/src/requirements.txt b/src/requirements.txt
index dd326df7dee..eb86638b690 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -5,8 +5,3 @@ pytest==9.0.2
pytest-watch==4.2.0
pytest-cov==7.0.0
sqlfluff==3.5.0
-
-# sql/util/* dependencies
-pandas==2.3.3
-google-cloud-bigquery==3.40.0
-requests==2.32.5