diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint index 6b46d7370db..c576a485feb 100644 --- a/.github/linters/.python-lint +++ b/.github/linters/.python-lint @@ -31,10 +31,6 @@ persistent=yes # Specify a configuration file. #rcfile= -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages -suggestion-mode=yes - # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no @@ -470,5 +466,6 @@ valid-metaclass-classmethod-first-arg=mcs [EXCEPTIONS] # Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception +# ["builtins.BaseException", "builtins.Exception"] +overgeneral-exceptions = builtins.BaseException, + builtins.Exception diff --git a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql index e55b352eadf..7a57ed673bd 100644 --- a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,24 +1,28 @@ # Pages that participate in the privacy-relayed origin trials -CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( - SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) -); - -CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) -RETURNS STRUCT< +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT< token STRING, origin STRING, feature STRING, expiry TIMESTAMP, is_subdomain BOOL, is_third_party BOOL -> AS ( - STRUCT( - DECODE_ORIGIN_TRIAL(token) AS token, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS 
INT64)) AS expiry, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party +> +DETERMINISTIC AS ( + ( + WITH decoded_token AS ( + SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded + ) + + SELECT + STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + ) + FROM decoded_token ) ); diff --git a/sql/2025/privacy/bounce_domains_top.sql b/sql/2025/privacy/bounce_domains_top.sql new file mode 100644 index 00000000000..0d7fa802f0f --- /dev/null +++ b/sql/2025/privacy/bounce_domains_top.sql @@ -0,0 +1,78 @@ +-- noqa: disable=PRS +-- Detection logic explained: +-- https://github.com/privacycg/proposals/issues/6 +-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md + +WITH redirect_requests AS ( + FROM `httparchive.crawl.requests` + |> WHERE + date = '2025-07-01' AND + --rank = 1000 AND + SAFE.INT64(summary.status) BETWEEN 300 AND 399 AND + index <= 2 + |> JOIN UNNEST(response_headers) AS header + |> WHERE LOWER(header.name) = 'location' + |> SELECT + client, + url, + index, + NET.REG_DOMAIN(header.value) AS location_domain, + root_page +), + +-- Find the first navigation redirect +navigation_redirect AS ( + FROM redirect_requests + |> WHERE + index = 1 AND + NET.REG_DOMAIN(root_page) = NET.REG_DOMAIN(url) AND + NET.REG_DOMAIN(url) != location_domain + |> SELECT + client, + root_page, + location_domain AS bounce_domain +), + +-- Find the second navigation redirect +bounce_redirect AS ( + FROM redirect_requests + |> WHERE + index = 2 AND + NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND + NET.REG_DOMAIN(url) != 
location_domain + |> SELECT + client, + url, + root_page, + location_domain AS bounce_redirect_location_domain +), + +-- Combine the first and second navigation redirects +bounce_sequences AS ( + FROM navigation_redirect AS nav + |> JOIN bounce_redirect AS bounce + ON + nav.client = bounce.client AND + nav.root_page = bounce.root_page + |> AGGREGATE COUNT(DISTINCT nav.root_page) AS websites_count + GROUP BY nav.client, bounce_domain +), + +websites_total AS ( + FROM `httparchive.crawl.pages` + |> WHERE date = '2025-07-01' --AND rank = 1000 + |> AGGREGATE COUNT(DISTINCT root_page) AS total_websites GROUP BY client +) + +FROM bounce_sequences +|> JOIN websites_total USING (client) +|> EXTEND websites_count / total_websites AS websites_pct +|> DROP total_websites +|> PIVOT( + ANY_VALUE(websites_count) AS cnt, + ANY_VALUE(websites_pct) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop, cnt_mobile AS mobile_count, cnt_desktop AS desktop_count +|> ORDER BY COALESCE(mobile_count, 0) + COALESCE(desktop_count, 0) DESC +|> LIMIT 100 diff --git a/sql/2025/privacy/client_hints_top.sql b/sql/2025/privacy/client_hints_top.sql new file mode 100644 index 00000000000..9a3033b5061 --- /dev/null +++ b/sql/2025/privacy/client_hints_top.sql @@ -0,0 +1,44 @@ +-- noqa: disable=PRS +WITH totals AS ( + FROM `httparchive.crawl.pages` + |> WHERE date = '2025-07-01' AND is_root_page --AND rank = 1000 + |> AGGREGATE COUNT(*) AS total_websites GROUP BY client +), + +/* Get Accept-CH Headers */ +headers AS ( + FROM `httparchive.crawl.requests` + |> WHERE date = '2025-07-01' AND is_root_page AND is_main_document --AND rank = 1000 + |> JOIN UNNEST(response_headers) AS header + |> WHERE LOWER(header.name) = 'accept-ch' + |> LEFT JOIN UNNEST(SPLIT(LOWER(header.value), ',')) AS header_value + |> SELECT client, root_page, header_value + +), + +/* Get Accept-CH Meta Tags */ +meta_tags AS ( + FROM `httparchive.crawl.pages` + |> WHERE date = 
'2025-07-01' AND is_root_page --AND rank = 1000 + |> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node + |> EXTEND LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name + |> WHERE tag_name = 'accept-ch' + |> LEFT JOIN UNNEST(SPLIT(LOWER(SAFE.STRING(meta_node.content)), ',')) AS tag_value + |> SELECT client, root_page, tag_value +) + +FROM headers +|> FULL OUTER JOIN meta_tags USING (client, root_page) +|> JOIN totals USING (client) +|> EXTEND TRIM(COALESCE(header_value, tag_value)) AS value +|> AGGREGATE +COUNT(DISTINCT root_page) AS number_of_websites, +COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites +GROUP BY client, value +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/client_hints_usage.sql b/sql/2025/privacy/client_hints_usage.sql new file mode 100644 index 00000000000..00c4d5efeb8 --- /dev/null +++ b/sql/2025/privacy/client_hints_usage.sql @@ -0,0 +1,54 @@ +-- noqa: disable=PRS +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + --AND rank = 1000 + GROUP BY client +), + +accept_ch_headers AS ( + SELECT DISTINCT + client, + root_page + FROM `httparchive.crawl.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2025-07-01' AND + is_main_document = TRUE AND + --rank = 1000 AND + LOWER(response_header.name) = 'accept-ch' +), + +accept_ch_meta AS ( + SELECT DISTINCT + client, + root_page + FROM `httparchive.crawl.pages`, + UNNEST(JSON_QUERY_ARRAY(custom_metrics.other.almanac.`meta-nodes`.nodes)) AS meta_node + WHERE date = '2025-07-01' + --AND rank = 1000 + AND LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'accept-ch' +), + 
+-- Combine both sources +all_accept_ch AS ( + SELECT client, root_page FROM accept_ch_headers + UNION DISTINCT + SELECT client, root_page FROM accept_ch_meta +) + +FROM all_accept_ch +|> JOIN base_totals USING (client) +|> AGGREGATE + COUNT(DISTINCT all_accept_ch.root_page) AS number_of_websites, + COUNT(DISTINCT all_accept_ch.root_page) / ANY_VALUE(base_totals.total_websites) AS pct_websites +GROUP BY all_accept_ch.client +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop diff --git a/sql/2025/privacy/cookie_domains_third_party_top.sql b/sql/2025/privacy/cookie_domains_third_party_top.sql new file mode 100644 index 00000000000..c38269479b8 --- /dev/null +++ b/sql/2025/privacy/cookie_domains_third_party_top.sql @@ -0,0 +1,22 @@ +FROM `httparchive.crawl.pages` +|> WHERE date = '2025-07-01' -- AND rank = 1000 +|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains +|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie +|> EXTEND +NET.HOST(root_page) AS firstparty_domain, +NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain +|> WHERE NOT ENDS_WITH('.' || firstparty_domain, '.' 
|| cookie_domain) +|> AGGREGATE +COUNT(DISTINCT firstparty_domain) AS domain_count, +COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains +GROUP BY client, cookie_domain +|> PIVOT ( + ANY_VALUE(domain_count) AS domain_count, + ANY_VALUE(pct_domains) AS pct_domains + FOR client IN ('desktop', 'mobile') +) +|> RENAME +pct_domains_mobile AS mobile, +pct_domains_desktop AS desktop +|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC +|> LIMIT 1000 diff --git a/sql/2025/privacy/cookies_first_party_top.sql b/sql/2025/privacy/cookies_first_party_top.sql new file mode 100644 index 00000000000..62f6cb0d2c7 --- /dev/null +++ b/sql/2025/privacy/cookies_first_party_top.sql @@ -0,0 +1,27 @@ +/* Most common cookie names, by number of domains on which they appear. +Goal is to identify common trackers that use first-party cookies across sites. +*/ + +FROM `httparchive.crawl.pages` +|> WHERE date = '2025-07-01' -- AND rank = 1000 +|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains +|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie +|> EXTEND +NET.HOST(root_page) AS firstparty_domain, +NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain, +SAFE.STRING(cookie.name) AS cookie_name +|> WHERE ENDS_WITH('.' || firstparty_domain, '.' 
|| cookie_domain) +|> AGGREGATE +COUNT(DISTINCT firstparty_domain) AS domain_count, +COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains +GROUP BY client, cookie_name +|> PIVOT ( + ANY_VALUE(domain_count) AS domain_count, + ANY_VALUE(pct_domains) AS pct_domains + FOR client IN ('desktop', 'mobile') +) +|> RENAME +pct_domains_mobile AS mobile, +pct_domains_desktop AS desktop +|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC +|> LIMIT 1000 diff --git a/sql/2025/privacy/cookies_third_party_top.sql b/sql/2025/privacy/cookies_third_party_top.sql new file mode 100644 index 00000000000..dff37874242 --- /dev/null +++ b/sql/2025/privacy/cookies_third_party_top.sql @@ -0,0 +1,23 @@ +FROM `httparchive.crawl.pages` +|> WHERE date = '2025-07-01' -- AND rank = 1000 +|> EXTEND COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains +|> JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie +|> EXTEND +NET.HOST(root_page) AS firstparty_domain, +NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_domain, +NET.HOST(SAFE.STRING(cookie.domain)) || ' / ' || SAFE.STRING(cookie.name) AS cookie_details +|> WHERE NOT ENDS_WITH('.' || firstparty_domain, '.' 
|| cookie_domain) +|> AGGREGATE +COUNT(DISTINCT firstparty_domain) AS domain_count, +COUNT(DISTINCT firstparty_domain) / ANY_VALUE(total_domains) AS pct_domains +GROUP BY client, cookie_details +|> PIVOT ( + ANY_VALUE(domain_count) AS domain_count, + ANY_VALUE(pct_domains) AS pct_domains + FOR client IN ('desktop', 'mobile') +) +|> RENAME +pct_domains_mobile AS mobile, +pct_domains_desktop AS desktop +|> ORDER BY COALESCE(domain_count_mobile, 0) + COALESCE(domain_count_desktop, 0) DESC +|> LIMIT 1000 diff --git a/sql/2025/privacy/dnt_usage.sql b/sql/2025/privacy/dnt_usage.sql new file mode 100644 index 00000000000..c713043bf79 --- /dev/null +++ b/sql/2025/privacy/dnt_usage.sql @@ -0,0 +1,19 @@ +-- Pages that use DNT feature + +FROM `httparchive.blink_features.usage` +|> WHERE + date = '2025-07-01' AND + --rank <= 10000 AND + feature = 'NavigatorDoNotTrack' +|> SELECT DISTINCT + client, + rank, + num_urls, + pct_urls +|> PIVOT ( + ANY_VALUE(num_urls) AS pages_count, + ANY_VALUE(pct_urls) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY rank ASC diff --git a/sql/2025/privacy/fingerprinting_top.sql b/sql/2025/privacy/fingerprinting_top.sql new file mode 100644 index 00000000000..02dd77606d3 --- /dev/null +++ b/sql/2025/privacy/fingerprinting_top.sql @@ -0,0 +1,31 @@ +-- noqa: disable=PRS +-- Percent of websites using a fingerprinting library based on wappalyzer category + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS websites_total + FROM httparchive.crawl.pages + WHERE date = '2025-07-01' + GROUP BY client +) + +FROM httparchive.crawl.pages, + UNNEST(technologies) AS technology, + UNNEST(technology.categories) AS category +|> WHERE + date = '2025-07-01' AND + category = 'Browser fingerprinting' +|> AGGREGATE + COUNT(DISTINCT root_page) AS websites_count +GROUP BY client, technology.technology +|> JOIN base_totals USING (client) +|> EXTEND websites_count / websites_total AS 
websites_pct +|> DROP websites_total +|> PIVOT( + ANY_VALUE(websites_count) AS websites_count, + ANY_VALUE(websites_pct) AS websites_pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME websites_pct_mobile AS mobile, websites_pct_desktop AS desktop +|> ORDER BY websites_count_mobile + websites_count_desktop DESC diff --git a/sql/2025/privacy/iab_tcf_v2_cmps_top.sql b/sql/2025/privacy/iab_tcf_v2_cmps_top.sql new file mode 100644 index 00000000000..3768bc77078 --- /dev/null +++ b/sql/2025/privacy/iab_tcf_v2_cmps_top.sql @@ -0,0 +1,21 @@ +-- noqa: disable=PRS +-- Counts of CMPs using IAB Transparency & Consent Framework +-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata +-- CMP vendor list: https://iabeurope.eu/cmp-list/ + +FROM `httparchive.crawl.pages` +|> WHERE date = '2025-07-01' --AND rank = 1000 +|> EXTEND +SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId, +COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS total_websites +|> AGGREGATE +COUNT(DISTINCT root_page) AS number_of_websites, +COUNT(DISTINCT root_page) / ANY_VALUE(total_websites) AS pct_websites +GROUP BY client, cmpId +|> PIVOT ( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/iab_tcf_v2_countries_top.sql b/sql/2025/privacy/iab_tcf_v2_countries_top.sql new file mode 100644 index 00000000000..0f2115b7254 --- /dev/null +++ b/sql/2025/privacy/iab_tcf_v2_countries_top.sql @@ -0,0 +1,39 @@ +-- noqa: disable=PRS +-- Counts of countries for publishers using IAB Transparency & Consent Framework +-- cf. 
https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata +-- "Country code of the country that determines the legislation of +-- reference. Normally corresponds to the country code of the country +-- in which the publisher's business entity is established." + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' --AND rank = 1000 + GROUP BY client +), + +base_data AS ( + SELECT + client, + root_page, + UPPER(SAFE.STRING(custom_metrics.privacy.iab_tcf_v2.data.publisherCC)) AS publisherCC + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND --rank = 1000 AND + JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object' +) + +FROM base_data +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, publisherCC +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_of_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_of_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/iab_usage.sql b/sql/2025/privacy/iab_usage.sql new file mode 100644 index 00000000000..272384e8d72 --- /dev/null +++ b/sql/2025/privacy/iab_usage.sql @@ -0,0 +1,57 @@ +-- noqa: disable=PRS +-- Counts of websites with IAB Frameworks + +WITH base_data AS ( + SELECT + client, + root_page, + SAFE.BOOL(custom_metrics.privacy.iab_tcf_v1.present) AS tcfv1, + SAFE.BOOL(custom_metrics.privacy.iab_tcf_v2.present) AS tcfv2, + SAFE.BOOL(custom_metrics.privacy.iab_gpp.present) AS gpp, + SAFE.BOOL(custom_metrics.privacy.iab_usp.present) AS usp, + SAFE.BOOL(custom_metrics.privacy.iab_tcf_v1.compliant_setup) AS 
tcfv1_compliant, + SAFE.BOOL(custom_metrics.privacy.iab_tcf_v2.compliant_setup) AS tcfv2_compliant, + custom_metrics.privacy.iab_gpp.data IS NOT NULL AS gpp_data + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' + --AND rank = 1000 +), + +aggregated AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites, + COUNT(DISTINCT IF(tcfv1, root_page, NULL)) AS websites_with_tcfv1, + COUNT(DISTINCT IF(tcfv2, root_page, NULL)) AS websites_with_tcfv2, + COUNT(DISTINCT IF(usp, root_page, NULL)) AS websites_with_usp, + COUNT(DISTINCT IF(tcfv1 OR tcfv2, root_page, NULL)) AS websites_with_tcf, + COUNT(DISTINCT IF(tcfv1 OR tcfv2 OR usp OR gpp, root_page, NULL)) AS websites_with_any, + COUNT(DISTINCT IF(tcfv1 AND tcfv1_compliant, root_page, NULL)) AS websites_with_tcfv1_compliant, + COUNT(DISTINCT IF(tcfv2 AND tcfv2_compliant, root_page, NULL)) AS websites_with_tcfv2_compliant, + COUNT(DISTINCT IF(gpp, root_page, NULL)) AS websites_with_gpp, + COUNT(DISTINCT IF(gpp_data, root_page, NULL)) AS websites_with_gpp_data + FROM base_data + GROUP BY client +) + +FROM aggregated, + UNNEST([ + STRUCT('tcfv1' AS metric, websites_with_tcfv1 / total_websites AS pct_websites, websites_with_tcfv1 AS number_of_websites), + STRUCT('tcfv2', websites_with_tcfv2 / total_websites, websites_with_tcfv2), + STRUCT('usp', websites_with_usp / total_websites, websites_with_usp), + STRUCT('tcf', websites_with_tcf / total_websites, websites_with_tcf), + STRUCT('any_framework', websites_with_any / total_websites, websites_with_any), + STRUCT('tcfv1_compliant', websites_with_tcfv1_compliant / total_websites, websites_with_tcfv1_compliant), + STRUCT('tcfv2_compliant', websites_with_tcfv2_compliant / total_websites, websites_with_tcfv2_compliant), + STRUCT('gpp', websites_with_gpp / total_websites, websites_with_gpp), + STRUCT('gpp_data_available', websites_with_gpp_data / total_websites, websites_with_gpp_data) + ]) AS metric +|> SELECT client, metric.metric, metric.pct_websites, 
metric.number_of_websites +|> PIVOT( + ANY_VALUE(pct_websites) AS pct, + ANY_VALUE(number_of_websites) AS websites_count + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY websites_count_desktop + websites_count_mobile DESC diff --git a/sql/2025/privacy/iab_usp_strings_top.sql b/sql/2025/privacy/iab_usp_strings_top.sql new file mode 100644 index 00000000000..c8c53b13d2e --- /dev/null +++ b/sql/2025/privacy/iab_usp_strings_top.sql @@ -0,0 +1,28 @@ +-- noqa: disable=PRS +-- Counts of US Privacy String values for websites using IAB US Privacy Framework +-- cf. https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + GROUP BY client +) + +FROM `httparchive.crawl.pages` +|> WHERE date = '2025-07-01' +|> EXTEND UPPER(SAFE.STRING(custom_metrics.privacy.iab_usp.privacy_string.uspString)) AS uspString +|> WHERE uspString IS NOT NULL +|> AGGREGATE COUNT(DISTINCT root_page) AS websites_count GROUP BY client, uspString +|> JOIN base_totals USING (client) +|> EXTEND websites_count / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(websites_count) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/referrer_policy_top.sql b/sql/2025/privacy/referrer_policy_top.sql new file mode 100644 index 00000000000..0e6ff60b66f --- /dev/null +++ b/sql/2025/privacy/referrer_policy_top.sql @@ -0,0 +1,60 @@ +-- noqa: disable=PRS +-- Most common values for Referrer-Policy (at site level) + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM 
`httparchive.crawl.pages` + WHERE date = '2025-07-01' --AND rank = 1000 + GROUP BY client +), + +referrer_policy_custom_metrics AS ( + SELECT + client, + root_page, + LOWER(TRIM(policy_meta)) AS policy_meta + FROM `httparchive.crawl.pages`, + UNNEST(SPLIT(SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy), ',')) AS policy_meta + WHERE date = '2025-07-01' --AND rank = 1000 +), + +response_headers AS ( + SELECT + client, + root_page, + LOWER(response_header.name) AS name, + LOWER(response_header.value) AS value + FROM `httparchive.crawl.requests`, + UNNEST(response_headers) AS response_header + WHERE + date = '2025-07-01' AND + is_main_document = TRUE + --AND rank = 1000 +), + +referrer_policy_headers AS ( + SELECT + client, + root_page, + TRIM(policy_header) AS policy_header + FROM response_headers, + UNNEST(SPLIT(value, ',')) AS policy_header + WHERE name = 'referrer-policy' +) + +FROM referrer_policy_custom_metrics +|> FULL OUTER JOIN referrer_policy_headers USING (client, root_page) +|> EXTEND COALESCE(policy_header, policy_meta) AS policy +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, policy +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/referrer_policy_usage.sql b/sql/2025/privacy/referrer_policy_usage.sql new file mode 100644 index 00000000000..aa296a9217a --- /dev/null +++ b/sql/2025/privacy/referrer_policy_usage.sql @@ -0,0 +1,58 @@ +-- noqa: disable=PRS + +WITH referrer_policy_custom_metrics AS ( + SELECT + client, + root_page, + SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy) AS meta_policy, + 
ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics.privacy.referrerPolicy.individual_requests)) > 0 AS individual_requests, + SAFE.INT64(custom_metrics.privacy.referrerPolicy.link_relations.A) > 0 AS link_relations + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' +), + +referrer_policy_headers AS ( + SELECT + client, + root_page, + LOWER(response_header.value) AS header_policy + FROM `httparchive.crawl.requests`, + UNNEST(response_headers) AS response_header + WHERE + date = '2025-07-01' AND + is_main_document = TRUE AND + response_header.name = 'referrer-policy' +), + +aggregated AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites, + COUNT(DISTINCT IF(meta_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy_meta, + COUNT(DISTINCT IF(header_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy_header, + COUNT(DISTINCT IF(meta_policy IS NOT NULL OR header_policy IS NOT NULL, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_entire_document_policy, + COUNT(DISTINCT IF(individual_requests, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_individual_requests, + COUNT(DISTINCT IF(link_relations, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_link_relations, + COUNT(DISTINCT IF(meta_policy IS NOT NULL OR header_policy IS NOT NULL OR individual_requests OR link_relations, root_page, NULL)) / COUNT(DISTINCT root_page) AS pct_any_referrer_policy + FROM referrer_policy_custom_metrics + FULL OUTER JOIN referrer_policy_headers + USING (client, root_page) + GROUP BY client +) + +FROM aggregated, + UNNEST([ + STRUCT('entire_document_policy_meta' AS metric, pct_entire_document_policy_meta AS pct), + STRUCT('entire_document_policy_header', pct_entire_document_policy_header), + STRUCT('entire_document_policy', pct_entire_document_policy), + STRUCT('any_individual_requests', pct_any_individual_requests), + STRUCT('any_link_relations', 
pct_any_link_relations), + STRUCT('any_referrer_policy', pct_any_referrer_policy) + ]) AS metric_data +|> SELECT client, metric_data.metric, metric_data.pct +|> PIVOT( + ANY_VALUE(pct) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY mobile + desktop DESC diff --git a/sql/2025/privacy/related_origin_trials_top.sql b/sql/2025/privacy/related_origin_trials_top.sql new file mode 100644 index 00000000000..a6542d1c904 --- /dev/null +++ b/sql/2025/privacy/related_origin_trials_top.sql @@ -0,0 +1,114 @@ +-- noqa: disable=PRS +-- Pages that participate in the privacy-related origin trials + +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT< + token STRING, + origin STRING, + feature STRING, + expiry TIMESTAMP, + is_subdomain BOOL, + is_third_party BOOL +> +DETERMINISTIC AS ( + ( + WITH decoded_token AS ( + SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded + ) + + SELECT + STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + ) + FROM decoded_token + ) +); + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' + GROUP BY client +), + +pages AS ( + SELECT + client, + root_page, + custom_metrics.other.`origin-trials` AS ot_metrics, + custom_metrics.other.almanac AS almanac_metrics + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' +), + +response_headers AS ( + SELECT + client, + root_page, + PARSE_ORIGIN_TRIAL(response_header.value) AS ot + FROM `httparchive.crawl.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2025-07-01' AND + is_main_document = 
TRUE AND + LOWER(response_header.name) = 'origin-trial' +), + +meta_tags AS ( + SELECT + client, + root_page, + PARSE_ORIGIN_TRIAL(SAFE.STRING(meta_node.content)) AS ot + FROM pages, + UNNEST(JSON_QUERY_ARRAY(almanac_metrics.`meta-nodes`.nodes)) meta_node + WHERE + LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'origin-trial' +), + +ot_from_custom_metric AS ( + SELECT + client, + root_page, + PARSE_ORIGIN_TRIAL(SAFE.STRING(metric.token)) AS ot + FROM pages, + UNNEST(JSON_QUERY_ARRAY(ot_metrics)) metric +), + +aggregated AS ( + SELECT + client, + ot.feature, + --ot.expiry >= CURRENT_TIMESTAMP() AS is_active, + COUNT(DISTINCT root_page) AS number_of_websites + FROM ( + SELECT * FROM response_headers + UNION ALL + SELECT * FROM meta_tags + UNION ALL + SELECT * FROM ot_from_custom_metric + ) + GROUP BY + client, + feature + --is_active +) + +FROM aggregated +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/tracker_categories_top.sql b/sql/2025/privacy/tracker_categories_top.sql new file mode 100644 index 00000000000..819106eb537 --- /dev/null +++ b/sql/2025/privacy/tracker_categories_top.sql @@ -0,0 +1,33 @@ +-- noqa: disable=PRS + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + GROUP BY client +) + +FROM `httparchive.crawl.pages`, + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category +|> WHERE + date = '2025-07-01' AND + category IN ( + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 
'Segmentation', + 'Cookie compliance' + ) +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, category +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/tracker_distribution.sql b/sql/2025/privacy/tracker_distribution.sql new file mode 100644 index 00000000000..1eeb7a5efa7 --- /dev/null +++ b/sql/2025/privacy/tracker_distribution.sql @@ -0,0 +1,54 @@ +-- noqa: disable=PRS +-- Number of websites that deploy a certain number of trackers + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + --AND rank = 1000 + GROUP BY client +), + +whotracksme AS ( + SELECT + NET.HOST(domain) AS domain, + tracker + FROM `httparchive.almanac.whotracksme` + WHERE date = '2025-07-01' + AND category IN ('advertising', 'pornvertising', 'site_analytics', 'social_media') +), + +tracker_counts AS ( + SELECT + client, + root_page, + COUNT(DISTINCT tracker) AS number_of_trackers + FROM `httparchive.crawl.requests` + LEFT JOIN whotracksme + ON + NET.HOST(url) = domain OR + ENDS_WITH(NET.HOST(url), CONCAT('.', domain)) + WHERE + date = '2025-07-01' + --AND rank = 1000 + AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3') + GROUP BY + client, + root_page +) + +FROM tracker_counts +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, number_of_trackers +|> EXTEND SUM(number_of_websites) OVER (PARTITION BY client ORDER BY number_of_trackers DESC) AS ccdf_websites +|> JOIN base_totals USING (client) +|> EXTEND 
ccdf_websites / total_websites AS ccdf +|> DROP total_websites, number_of_websites +|> PIVOT( + ANY_VALUE(ccdf_websites) AS websites_count, + ANY_VALUE(ccdf) AS ccdf + FOR client IN ('desktop', 'mobile') +) +|> RENAME ccdf_mobile AS mobile, ccdf_desktop AS desktop +|> ORDER BY number_of_trackers diff --git a/sql/2025/privacy/tracker_technologies_top.sql b/sql/2025/privacy/tracker_technologies_top.sql new file mode 100644 index 00000000000..876bb9c3ded --- /dev/null +++ b/sql/2025/privacy/tracker_technologies_top.sql @@ -0,0 +1,33 @@ +-- noqa: disable=PRS + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + GROUP BY client +) + +FROM `httparchive.crawl.pages`, + UNNEST(technologies) AS tech, + UNNEST(tech.categories) AS category +|> WHERE + date = '2025-07-01' AND + category IN ( + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation', + 'Cookie compliance' + ) +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, tech.technology +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/whotracksme_categories_top.sql b/sql/2025/privacy/whotracksme_categories_top.sql new file mode 100644 index 00000000000..819c41eb535 --- /dev/null +++ b/sql/2025/privacy/whotracksme_categories_top.sql @@ -0,0 +1,66 @@ +-- noqa: disable=PRS +-- Percent of websites that deploy at least one tracker from each tracker category + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS 
total_websites + FROM httparchive.crawl.pages + WHERE date = '2025-07-01' + GROUP BY client +), + +whotracksme AS ( + SELECT + NET.HOST(domain) AS domain, + category + FROM httparchive.almanac.whotracksme + WHERE date = '2025-07-01' +), + +tracker_categories AS ( + SELECT + client, + category, + root_page + FROM httparchive.crawl.requests + JOIN whotracksme + ON ( + NET.HOST(url) = domain OR + ENDS_WITH(NET.HOST(url), CONCAT('.', domain)) + ) + WHERE + date = '2025-07-01' + AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3') +), + +aggregated AS ( + SELECT + client, + category, + COUNT(DISTINCT root_page) AS number_of_websites + FROM tracker_categories + GROUP BY + client, + category + UNION ALL + SELECT + client, + 'any' AS category, + COUNT(DISTINCT root_page) AS number_of_websites + FROM tracker_categories + GROUP BY + client +) + +FROM aggregated +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/2025/privacy/whotracksme_trackers_top.sql b/sql/2025/privacy/whotracksme_trackers_top.sql new file mode 100644 index 00000000000..01fffd14aaf --- /dev/null +++ b/sql/2025/privacy/whotracksme_trackers_top.sql @@ -0,0 +1,37 @@ +-- noqa: disable=PRS + +WITH base_totals AS ( + SELECT + client, + COUNT(DISTINCT root_page) AS total_websites + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + GROUP BY client +), + +whotracksme AS ( + SELECT + NET.HOST(domain) AS domain, + category || ' / ' || tracker AS tracker + FROM `httparchive.almanac.whotracksme` + WHERE date = '2025-07-01' +) + +FROM `httparchive.crawl.requests` +|> JOIN whotracksme 
+ ON NET.HOST(url) = domain OR + ENDS_WITH(NET.HOST(url), '.' || domain) +|> WHERE + date = '2025-07-01' + AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3') +|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, tracker +|> JOIN base_totals USING (client) +|> EXTEND number_of_websites / total_websites AS pct_websites +|> DROP total_websites +|> PIVOT( + ANY_VALUE(number_of_websites) AS websites_count, + ANY_VALUE(pct_websites) AS pct + FOR client IN ('desktop', 'mobile') +) +|> RENAME pct_mobile AS mobile, pct_desktop AS desktop +|> ORDER BY COALESCE(websites_count_desktop, 0) + COALESCE(websites_count_mobile, 0) DESC diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index e32047eecb3..e8d95822b24 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -1,216 +1,264 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OVkCxlRQH6Yt", - "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" - }, - "outputs": [], - "source": [ - "# @title Download repo\n", - "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "U37785Bxt5tE" - }, - "outputs": [], - "source": [ - "# @title Configure the chapter to process\n", - "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", - "almanac_year = 2024 #@param {type: \"integer\"}\n", - "chapter_name = 'privacy' #@param {type: \"string\"}\n", - "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UzhgG5xvbQ1E", - "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" - }, - "outputs": [], - "source": [ - "# @title Update chapter branch\n", - "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", - "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "45dBifFPJAtO" - }, - "outputs": [], - "source": [ - "# @title Authenticate\n", - "import google.auth\n", - "import os\n", - "from google.colab import auth\n", - "from google.cloud import bigquery\n", - "\n", - "import gspread\n", - "from gspread_dataframe import set_with_dataframe\n", - "\n", - "\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", - "auth.authenticate_user()\n", - "credentials, project = google.auth.default()\n", - "client = bigquery.Client()\n", - "gc = gspread.authorize(credentials)\n", - "\n", - "try:\n", - " ss = gc.open_by_url(spreadsheet_url)\n", - "except:\n", - " print('Spreadsheet not found')" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "U37785Bxt5tE" + }, + "outputs": [], + "source": [ + "# @title Configuration\n", + "import os\n", + "import sys\n", + "\n", + "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", + "almanac_year = 2025 #@param {type: \"integer\"}\n", + "chapter_name = 'privacy' #@param {type: \"string\"}\n", + "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}\n", + "\n", + "IN_COLAB = 'google.colab' in sys.modules\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "nblNil985Tjt", - "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" - }, - "outputs": [], - "source": [ - "# @title Upload query results\n", - "\n", - "import glob\n", - "import re\n", - "from tabulate import tabulate\n", - "from IPython.display import clear_output\n", - "\n", - "\n", - "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "dry_run = True # @param {type: \"boolean\"}\n", - "overwrite_sheets = True # @param {type: \"boolean\"}\n", - "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", - "\n", - "filename_include_regexp = r'{}'.format(filename_match)\n", - "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", - "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", - " year=almanac_year,\n", - " chapter=chapter_name.lower()\n", - ")\n", - "existing_sheets = [s.title for s in ss.worksheets()]\n", - "\n", - "# Print formatted logs\n", - "queries_processed_log = []\n", - "def print_logs_table(log=None, append=True):\n", - " if log:\n", - " queries_processed_log.append(log)\n", - " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", - " if not append:\n", - " del queries_processed_log[-1]\n", - " clear_output(wait=True)\n", - " print(table)\n", - "\n", - "# Find matching SQL queries and save results to Google Sheets.\n", - "for filepath in 
sorted(glob.iglob(folder)):\n", - " filename = filepath.split('/')[-1]\n", - "\n", - " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", - "\n", - " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", - "\n", - " with open(filepath) as f:\n", - " query = f.read()\n", - "\n", - " try:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(dry_run = True)\n", - " )\n", - " except Exception as e:\n", - " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", - " continue\n", - "\n", - " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", - " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", - "\n", - " if sheet_title in existing_sheets:\n", - " if overwrite_sheets:\n", - " st = ss.worksheet(sheet_title)\n", - " else:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", - " continue\n", - "\n", - " if dry_run:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", - " continue\n", - "\n", - " try:\n", - " if maximum_tb_billed:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(\n", - " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", - " )\n", - " )\n", - " else:\n", - " response = client.query(query)\n", - "\n", - " df = response.to_dataframe()\n", - " if ('st' not in locals() or st.title != sheet_title):\n", - " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", - " set_with_dataframe(st, df, resize=False)\n", - "\n", - " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", - " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", - "\n", - " except Exception as e:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", - " continue\n", - "\n", - " 
else:\n", - " print_logs_table([filename, None, None, 'Filename mismatch'])" - ] - } - ], - "metadata": { + "id": "OVkCxlRQH6Yt", + "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" + }, + "outputs": [], + "source": [ + "# @title Download repo\n", + "if IN_COLAB:\n", + " !git clone https://github.com/HTTPArchive/almanac.httparchive.org.git\n", + " !cd almanac.httparchive.org/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "id": "UzhgG5xvbQ1E", + "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" + }, + "outputs": [], + "source": [ + "# @title Update chapter branch\n", + "if IN_COLAB:\n", + " branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", + " print(f\"Switching to branch: {branch_name}\")\n", + " !git checkout $branch_name && git pull" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Authenticate in Google Colab\n", + "if IN_COLAB:\n", + " from google.colab import auth\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "45dBifFPJAtO" + }, + "outputs": [], + "source": [ + "# @title Setup BigQuery and Google Sheets clients\n", + "import google.auth\n", + "from google.cloud import bigquery\n", + "import gspread\n", + "\n", + "\n", + "# Authenticate with required scopes for BigQuery and Google Sheets\n", + "SCOPES = [\n", + " 'https://www.googleapis.com/auth/spreadsheets',\n", + " 'https://www.googleapis.com/auth/drive',\n", + " 'https://www.googleapis.com/auth/bigquery'\n", + "]\n", + "\n", + "credentials, project = google.auth.default(scopes=SCOPES)\n", + "client = bigquery.Client(credentials=credentials)\n", + "gc = gspread.authorize(credentials)\n", + "\n", + "# Connect to spreadsheet\n", + 
"ss = gc.open_by_url(spreadsheet_url)\n", + "existing_sheets = [s.title for s in ss.worksheets()]\n", + "print(f\"āœ“ Connected to spreadsheet with {len(existing_sheets)} existing sheets\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python", - "version": "3.12.4" - } + "collapsed": true, + "id": "nblNil985Tjt", + "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" + }, + "outputs": [], + "source": [ + "# @title Upload query results to Google Sheets\n", + "import glob\n", + "import re\n", + "from gspread_dataframe import set_with_dataframe\n", + "from IPython.display import clear_output\n", + "from tabulate import tabulate\n", + "\n", + "\n", + "# Query filters and options\n", + "filename_match = 'whotracksme.+' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "filename_match_exclude = '' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "dry_run = False # @param {type: \"boolean\"}\n", + "overwrite_sheets = True # @param {type: \"boolean\"}\n", + "maximum_tb_billed = 7 # @param {type: \"raw\", placeholder: \"Max TB to bill per query\"}\n", + "\n", + "# Setup file filters\n", + "filename_include_regexp = r'.*' if not filename_match or filename_match == '*' else filename_match\n", + "filename_exclude_regexp = r'^$' if not filename_match_exclude else filename_match_exclude\n", + "\n", + "# Build path to SQL files\n", + "sql_folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n", + "print(f\"Looking for SQL files in: {sql_folder}\")\n", + "sql_files = sorted(glob.glob(sql_folder))\n", + "\n", + "if not sql_files:\n", + " print(\"āŒ No SQL files found. 
Check the folder path.\")\n", + "else:\n", + " print(f\"Found {len(sql_files)} SQL files\\n\")\n", + "\n", + " # Progress tracking\n", + " queries_processed_log = []\n", + "\n", + " def log_result(filename, tb_processed=None, sheet_name=None, skip_reason=None, preview=False):\n", + " \"\"\"Add result to log and display table\"\"\"\n", + " log_entry = [filename, tb_processed, sheet_name, skip_reason]\n", + " if not preview:\n", + " queries_processed_log.append(log_entry)\n", + "\n", + " # Build table from current log plus preview entry if needed\n", + " display_log = queries_processed_log if not preview else queries_processed_log + [log_entry]\n", + " table = tabulate(display_log, headers=['Query', 'TB Billed', 'Sheet', 'Status/Skip Reason'], tablefmt=\"grid\")\n", + " clear_output(wait=True)\n", + " print(table)\n", + "\n", + " # Process each SQL file\n", + " for filepath in sql_files:\n", + " filename = os.path.basename(filepath)\n", + "\n", + " # Show processing status\n", + " log_result(filename, 'Processing...', 'Processing...', 'Processing...', preview=True)\n", + "\n", + " # Check if filename matches filters\n", + " if not re.search(filename_include_regexp, filename) or re.search(filename_exclude_regexp, filename):\n", + " log_result(filename, None, None, 'Filename filter mismatch')\n", + " continue\n", + "\n", + " # Read query\n", + " with open(filepath) as f:\n", + " query = f.read()\n", + "\n", + " # Estimate query cost (dry run)\n", + " try:\n", + " dry_run_response = client.query(query, job_config=bigquery.QueryJobConfig(dry_run=True))\n", + " tb_processed = dry_run_response.total_bytes_processed / 1024**4\n", + " except Exception as e:\n", + " log_result(filename, None, None, f'Dry run error: {str(e)[:100]}...')\n", + " continue\n", + "\n", + " # Generate sheet title from filename\n", + " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", + "\n", + " # Skip execution if dry run mode\n", + " if dry_run:\n", + " 
log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Dry run mode')\n", + " continue\n", + "\n", + " # Check if sheet already exists\n", + " if sheet_title in existing_sheets and not overwrite_sheets:\n", + " log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Sheet exists (set overwrite_sheets=True)')\n", + " continue\n", + "\n", + " # Execute query and upload to Sheets\n", + " try:\n", + " # Run query with billing limit\n", + " job_config = bigquery.QueryJobConfig()\n", + " if maximum_tb_billed:\n", + " job_config.maximum_bytes_billed = int(maximum_tb_billed * 1024**4)\n", + "\n", + " query_response = client.query(query, job_config=job_config)\n", + " df = query_response.to_dataframe()\n", + "\n", + " # Get or create sheet\n", + " if sheet_title in existing_sheets:\n", + " sheet = ss.worksheet(sheet_title)\n", + " else:\n", + " sheet = ss.add_worksheet(sheet_title, rows=1, cols=1)\n", + " existing_sheets.append(sheet_title)\n", + "\n", + " # Upload data\n", + " set_with_dataframe(sheet, df, resize=False)\n", + "\n", + " tb_billed = query_response.total_bytes_billed / 1024**4\n", + " log_result(filename, f'{tb_billed:.3f}', sheet_title, 'āœ“ Uploaded')\n", + "\n", + " except Exception as e:\n", + " log_result(filename, f'{tb_billed:.3f}', None, f'Query error: {str(e)[:100]}...')\n", + "\n", + " print(f\"\\nāœ“ Processed {len(queries_processed_log)} queries\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/sql/util/bq_writer.py b/sql/util/bq_writer.py index 3355dcfcb94..a72b6784ee0 100644 --- 
a/sql/util/bq_writer.py +++ b/sql/util/bq_writer.py @@ -18,7 +18,6 @@ def write_to_bq(df, table_id, schema, write_disposition="WRITE_APPEND"): client = bigquery.Client() job_config = bigquery.LoadJobConfig( - source_format=bigquery.SourceFormat.CSV, write_disposition=write_disposition, schema=schema, ) diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py index 1ea1bb7f75f..a89133467d9 100644 --- a/sql/util/haveibeenpwned.py +++ b/sql/util/haveibeenpwned.py @@ -1,50 +1,26 @@ """ -This module retrieves data from the "haveibeenpwned" API and loads it into a BigQuery table. +Retrieves breach data from the Have I Been Pwned API and loads it into BigQuery. """ -import json -from datetime import datetime as DateTime - -import pandas +import pandas as pd import requests # pylint: disable=import-error -from bq_writer import write_to_bq, bigquery - - -# Retrieve data from the "haveibeenpwned" API -breaches = json.loads( - requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10).content -) -df = pandas.DataFrame(breaches) - -year = DateTime.now().year -df["date"] = DateTime(year, 6, 1).date() -df["Name"] = df["Name"].astype(str) -df["Title"] = df["Title"].astype(str) -df["Domain"] = df["Domain"].astype(str) -df["BreachDate"] = pandas.to_datetime( - df["BreachDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["AddedDate"] = pandas.to_datetime( - df["AddedDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["ModifiedDate"] = pandas.to_datetime( - df["ModifiedDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["Description"] = df["Description"].astype(str) -df["LogoPath"] = df["LogoPath"].astype(str) -df["DataClasses"] = df["DataClasses"].apply(json.dumps) +from bq_writer import bigquery, write_to_bq -# Append to httparchive.almanac.breaches +response = requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10) +breaches = response.json() +df = pd.DataFrame(breaches) +df["BreachDate"] = pd.to_datetime(df["BreachDate"], 
errors="coerce") +df["AddedDate"] = pd.to_datetime(df["AddedDate"], errors="coerce") +df["ModifiedDate"] = pd.to_datetime(df["ModifiedDate"], errors="coerce") schema = [ - bigquery.SchemaField("date", "DATE"), bigquery.SchemaField("Name", "STRING"), bigquery.SchemaField("Title", "STRING"), bigquery.SchemaField("Domain", "STRING"), bigquery.SchemaField("BreachDate", "DATE"), - bigquery.SchemaField("AddedDate", "DATE"), - bigquery.SchemaField("ModifiedDate", "DATE"), + bigquery.SchemaField("AddedDate", "TIMESTAMP"), + bigquery.SchemaField("ModifiedDate", "TIMESTAMP"), bigquery.SchemaField("PwnCount", "INTEGER"), bigquery.SchemaField("Description", "STRING"), bigquery.SchemaField("LogoPath", "STRING"), @@ -53,7 +29,12 @@ bigquery.SchemaField("IsSensitive", "BOOLEAN"), bigquery.SchemaField("IsRetired", "BOOLEAN"), bigquery.SchemaField("IsSpamList", "BOOLEAN"), - bigquery.SchemaField("DataClasses", "STRING"), + bigquery.SchemaField("IsMalware", "BOOLEAN"), + bigquery.SchemaField("IsSubscriptionFree", "BOOLEAN"), + bigquery.SchemaField("IsStealerLog", "BOOLEAN"), + bigquery.SchemaField("DataClasses", "STRING", mode="REPEATED"), + bigquery.SchemaField("Attribution", "STRING"), + bigquery.SchemaField("DisclosureUrl", "STRING"), ] -write_to_bq(df, "httparchive.almanac.breaches", schema) +write_to_bq(df, "httparchive.almanac.breaches", schema, write_disposition="WRITE_TRUNCATE") diff --git a/sql/util/requirements.txt b/sql/util/requirements.txt new file mode 100644 index 00000000000..ac1288dc9f9 --- /dev/null +++ b/sql/util/requirements.txt @@ -0,0 +1,8 @@ +pandas==2.3.3 +google-cloud-bigquery==3.40.0 +requests==2.32.5 +tabulate==0.9.0 +gspread==6.2.1 +gspread-dataframe==4.0.0 +ipykernel==7.1.0 +db-dtypes==1.5.0 diff --git a/sql/util/whotracksme_trackers.py b/sql/util/whotracksme_trackers.py index ec68f922e17..ad78ecf38ea 100644 --- a/sql/util/whotracksme_trackers.py +++ b/sql/util/whotracksme_trackers.py @@ -18,7 +18,7 @@ TRACKERS_QUERY = """ SELECT - '2024-06-01' AS date, 
+ '2025-07-01' AS date, categories.name as category, tracker, domain diff --git a/src/requirements.txt b/src/requirements.txt index dd326df7dee..eb86638b690 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -5,8 +5,3 @@ pytest==9.0.2 pytest-watch==4.2.0 pytest-cov==7.0.0 sqlfluff==3.5.0 - -# sql/util/* dependencies -pandas==2.3.3 -google-cloud-bigquery==3.40.0 -requests==2.32.5